Publish ESS-AIST-81M preview base release

Browse files

Files changed (11) hide show

.gitattributes +1 -34
ESS-AIST-81M.safetensors +3 -0
README.md +112 -0
ess_ait_86m_spec.yaml +192 -0
event_eval.json +266 -0
export_metadata.json +49 -0
manifest.json +10 -0
parameter_breakdown.json +9 -0
prefix_eval.json +48 -0
retrieval_512_gt1030.json +40 -0
subject_eval.json +119 -0

.gitattributes CHANGED Viewed

@@ -1,35 +1,2 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
 *.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text


























1	*.safetensors filter=lfs diff=lfs merge=lfs -text
2	+ *.gguf filter=lfs diff=lfs merge=lfs -text

ESS-AIST-81M.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3b7eb74bacea98e7122e723c164dfd912dd1ccb6902605972f905f95c337dfa2
+size 323643112

README.md ADDED Viewed

	@@ -0,0 +1,112 @@

+---
+language:
+- en
+license: apache-2.0
+tags:
+- multimodal
+- embedding
+- trimodal
+- retrieval
+- image-text-audio
+- feature-extraction
+library_name: pytorch
+pipeline_tag: feature-extraction
+datasets:
+- custom
+---
+# ESS-AIST-81M Preview
+`ESS-AIST-81M Preview` is the current Cortext trial checkpoint from the ESS line.
+- release checkpoint: `ess_aist_full_v7_librispeech360_l4i/checkpoint_epoch_11.pt`
+- text encoder: `MongoDB/mdbr-leaf-ir`
+- image encoder: `mobilenetv4_conv_medium.e180_r384_in12k`
+- audio encoder: native `mn20_as` EfficientAT LoRA audio backbone
+This is the base safetensors release. GGUF quantizations are published separately.
+## Embedding Layout
+Output embedding: `1536d`
+- `0:512` semantic
+- `512:1024` subject
+- `1024:1536` event
+Recommended normalized runtime views:
+- `semantic_key = l2norm(z[0:512])`
+- `subject_key = l2norm(z[512:1024])`
+- `event_key = l2norm(z[1024:1536])`
+- `full_key = l2norm(z[0:1536])`
+## Exact Release Metrics
+All numbers below are from the exact published checkpoint `checkpoint_epoch_11.pt`.
+### 512d Retrieval
+Source:
+- `retrieval_512_gt1030.json`
+Speech holdout (`speech_chatterbox-512` block):
+- `A->T_r1 = 0.4672`
+- `T->A_r1 = 0.4606`
+- `A->T_r5 = 0.7398`
+- `T->A_r5 = 0.7426`
+SALT:
+- `I->T_r1 = 0.4149`
+- `T->I_r1 = 0.4327`
+- `A->T_r1 = 0.2408`
+- `T->A_r1 = 0.2486`
+- `I->A_r1 = 0.4621`
+- `A->I_r1 = 0.4829`
+### Held-Out ESS Eval
+Subject:
+- `subject_key` same/different AUC: `0.5067`
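+- `subject_key` same-topic-different-subject rejection AUC: `0.5067` (identical to the same/different AUC: `subject_eval.json` scores both metrics on the same 160,248 positive / 6,805 negative pairs)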
+Event:
+- `event_key` same/different AUC: `0.8241`
+- `event_key` same-subject-different-event rejection AUC: `0.5535`
+- `event_key` topic-shift rejection AUC: `0.9770`
+## Architecture
+This preview is a frozen-encoder / trainable-projector stack:
+- text encoder params: `22,861,056`
+- image encoder params: `8,434,512`
+- audio encoder params: `20,639,974`
+- image projection params: `9,975,296`
+- audio projection params: `9,975,296`
+- text projection params: `8,926,720`
+- total exact loaded params: `80,812,854`
+## Files
+| File | Purpose |
+|---|---|
+| `ESS-AIST-81M.safetensors` | Base preview release artifact |
+| `export_metadata.json` | ESS export contract |
+| `manifest.json` | Release manifest |
+| `parameter_breakdown.json` | Exact parameter accounting |
+| `ess_ait_86m_spec.yaml` | Training config used for the release line |
+| `retrieval_512_gt1030.json` | Exact 512d retrieval eval for this checkpoint |
+| `subject_eval.json` | Exact held-out subject eval for this checkpoint |
+| `event_eval.json` | Exact held-out event eval for this checkpoint |
+| `prefix_eval.json` | Prefix-level AUC summary |
+## Caveats
+- This is the current preview checkpoint, not the finished ESS subject-memory model.
+- Subject performance is still the weakest domain on the current held-out eval: the `subject_key` same/different AUC of `0.5067` is effectively chance level.
+- Use this for internal Cortext trials, not as the final memory-model release.

ess_ait_86m_spec.yaml ADDED Viewed

	@@ -0,0 +1,192 @@

+# ESS-AIST-86M starter spec config.
+# This is a design-time config for the ESS trainer/export path built on AIT-86M.
+# It intentionally extends beyond the current trimodal trainer surface.
+dataset_dir: datasets
+dataset_name: ess_multimodal_core_v1
+cache_dir: cache
+encoder_name: mobilenetv4_conv_medium.e180_r384_in12k
+encoder_dim: 1280
+modality: trimodal
+audio_encoder_dim: 1280
+audio_finetune_last_n_stages: 0
+projection_hidden_dim: 2048
+projection_output_dim: 1536
+projection_dropout: 0.3
+# Teacher policy:
+# - AIST-95M is the primary semantic distillation teacher, especially for speech
+#   and audio-text retrieval retention.
+# - AIT-86M is a secondary compatibility teacher / drift regularizer.
+# Naming note:
+# - although the deployment/runtime surface remains AIT-compatible, the primary
+#   teacher lineage is AIST, so the model family name is ESS-AIST-86M.
+primary_semantic_teacher: AIST-95M
+secondary_compat_teacher: AIT-86M
+batch_size: 4096
+max_epochs: 50
+learning_rate: 0.0012
+weight_decay: 0.0001
+warmup_fraction: 0.05
+grad_clip_norm: 1.0
+gradient_accumulation_steps: 1
+# Prefix Matryoshka targets.
+matryoshka_dims: [1536, 1024, 512]
+matryoshka_weights: [1.0, 1.0, 1.0]
+loss_type: infonce
+temperature: 0.07
+temperature_min: 0.01
+learn_temperature: true
+hard_neg_k: 8
+hard_neg_weight: 2.0
+false_neg_threshold: 0.85
+feature_noise_std: 0.0
+feature_mask_ratio: 0.0
+mixup_alpha: 0.0
+num_workers: 4
+pin_memory: true
+prefetch_factor: 2
+persistent_workers: true
+mixed_precision: bf16
+checkpoint_dir: checkpoints
+save_every_n_epochs: 2
+early_stopping_patience: 8
+log_dir: runs
+benchmark_eval_every_epochs: 1
+# Next-run ESS corpus, adding LibriSpeech person-subject rows on top of the
+# v6 subject-media + WIT + speech/wavcaps semantic lane.
+ess_corpus_dir: checkpoints/ess_ait_86m_20260430T035907Z/ess_corpus_v7_subject_media_wit4096_speech100k_wavcaps100k_librispeech360
+ess_train_jsonl: checkpoints/ess_ait_86m_20260430T035907Z/ess_corpus_v7_subject_media_wit4096_speech100k_wavcaps100k_librispeech360/train.jsonl
+ess_val_jsonl: checkpoints/ess_ait_86m_20260430T035907Z/ess_corpus_v7_subject_media_wit4096_speech100k_wavcaps100k_librispeech360/val.jsonl
+ess_train_text_cache: checkpoints/ess_ait_86m_20260430T035907Z/ess_corpus_v7_subject_media_wit4096_speech100k_wavcaps100k_librispeech360/cache/ess_corpus_v7_subject_media_wit4096_speech100k_wavcaps100k_librispeech360_train_leaf_ir_text_features.npy
+ess_val_text_cache: checkpoints/ess_ait_86m_20260430T035907Z/ess_corpus_v7_subject_media_wit4096_speech100k_wavcaps100k_librispeech360/cache/ess_corpus_v7_subject_media_wit4096_speech100k_wavcaps100k_librispeech360_val_leaf_ir_text_features.npy
+# Multimodal subject-media attachment from the finalized v19 generated bundle.
+ess_subject_media_dataset_dir: checkpoints/ess_ait_86m_20260430T035907Z/ess_subject_media_pilot52_full_v19
+ess_subject_media_train_image_cache: checkpoints/ess_ait_86m_20260430T035907Z/ess_subject_media_pilot52_full_v19/cache/ess_subject_media_pilot52_full_v19_train_mobilenetv4_conv_medium_image_features.npy
+ess_subject_media_val_image_cache: checkpoints/ess_ait_86m_20260430T035907Z/ess_subject_media_pilot52_full_v19/cache/ess_subject_media_pilot52_full_v19_val_mobilenetv4_conv_medium_image_features.npy
+ess_subject_media_train_audio_cache: checkpoints/ess_ait_86m_20260430T035907Z/ess_subject_media_pilot52_full_v19/cache/ess_subject_media_pilot52_full_v19_train_mn20_audioheavy_lora1280_audio_features.npy
+ess_subject_media_val_audio_cache: checkpoints/ess_ait_86m_20260430T035907Z/ess_subject_media_pilot52_full_v19/cache/ess_subject_media_pilot52_full_v19_val_mn20_audioheavy_lora1280_audio_features.npy
+ess_wordnet_train_audio_cache: cache/wordnet_2024_openai_validaudio_train_mn20_audioheavy_lora1280_audio_features.npy
+ess_wordnet_val_audio_cache: cache/wordnet_2024_openai_validaudio_val_mn20_audioheavy_lora1280_audio_features.npy
+ess_speech_audio_cache: cache/speech_chatterbox_150k_train_mn20_audioheavy_lora1280_audio_features.npy
+ess_wavcaps_audio_cache: cache/wavcaps_fsd_train_mn20_audioheavy_lora1280_audio_features.npy
+ess_salt_audio_cache: cache/benchmark_salt_features/salt_audio_mn20_audioheavy_lora1280_4999.npy
+# Optional external entity-subject image caches, e.g. WIT-derived ESS records.
+# These are split-aligned caches built from the final train/val JSONL rows.
+ess_wit_train_image_cache: checkpoints/ess_ait_86m_20260430T035907Z/ess_wit_records_en_maincap_4096/cache/ess_wit_records_en_maincap_4096_train_mobilenetv4_conv_medium_image_features.npy
+ess_wit_val_image_cache: checkpoints/ess_ait_86m_20260430T035907Z/ess_wit_records_en_maincap_4096/cache/ess_wit_records_en_maincap_4096_val_mobilenetv4_conv_medium_image_features.npy
+# Optional external person-subject caches keyed by ESS record_id. The field
+# names still say "voxceleb" for compatibility, but they also carry staged
+# LibriSpeech speaker-subject audio caches.
+# ess_voxceleb_train_image_cache:
+# ess_voxceleb_val_image_cache:
+ess_voxceleb_train_audio_cache: checkpoints/ess_ait_86m_20260430T035907Z/ess_librispeech_subject_trainclean360/cache/ess_librispeech_subject_trainclean360_train_mn20_audioheavy_lora1280_audio_features.npy
+ess_voxceleb_val_audio_cache: checkpoints/ess_ait_86m_20260430T035907Z/ess_librispeech_subject_trainclean360/cache/ess_librispeech_subject_trainclean360_val_mn20_audioheavy_lora1280_audio_features.npy
+# ESS-specific fields for the forthcoming trainer.
+ess_semantic_slice: [0, 512]
+ess_subject_slice: [512, 1024]
+ess_event_slice: [1024, 1536]
+# Corpus composition at build time:
+# - train: 219957 semantic / 28026 event / 94446 subject
+# - val:   19954 semantic / 3074 event / 10967 subject
+#
+# Do not sample by raw row frequency. Subject supervision is too small and must be
+# explicitly oversampled to shape the subject block.
+ess_sampling:
+  strategy: weighted_family_with_replacement
+  unit: record
+  train_family_weights:
+    semantic: 0.50
+    subject: 0.15
+    event: 0.35
+  train_dataset_weights:
+    speech_chatterbox_150k: 5.0
+    wit_entity_subject: 0.25
+    librispeech_subject: 0.01
+  val_family_weights:
+    semantic: 0.50
+    subject: 0.15
+    event: 0.35
+  val_dataset_weights:
+    speech_chatterbox_150k: 5.0
+    wit_entity_subject: 0.25
+    librispeech_subject: 0.01
+  family_from_active_supervision:
+    semantic: semantic
+    subject: subject
+    event: event
+  max_records_per_step:
+    semantic: 2048
+    subject: 1024
+    event: 1024
+  notes:
+    - subject rows are intentionally oversampled relative to raw corpus count
+    - semantic remains dominant to protect 512d retrieval
+    - speech_chatterbox semantic rows are oversampled within semantic because only ~20k rows survive dedupe into v6
+    - librispeech_subject is heavily downweighted within subject so person voice identity helps without flooding the entire subject block
+    - event stays high enough to shape prefix_1536 without overwhelming semantic
+ess_loss_weights:
+  semantic_retrieval: 1.0
+  semantic_distillation: 1.0
+  subject_multimodal: 0.8
+  subject_contrastive: 0.8
+  subject_hard_negative: 0.8
+  event_contrastive: 0.8
+  event_rejection: 1.0
+  prefix_512: 1.0
+  prefix_1024: 0.75
+  prefix_1536: 0.75
+  block_decorrelation: 0.1
+  variance_regularization: 0.1
+ess_negative_buckets:
+  - same_topic_different_subject
+  - same_subject_different_event
+  - stale_same_source
+  - wrong_active
+  - topic_shift
+  - lookalike_or_soundalike
+ess_negative_buckets_by_family:
+  semantic:
+    - same_topic_different_subject
+    - same_subject_different_event
+    - stale_same_source
+    - wrong_active
+    - topic_shift
+    - lookalike_or_soundalike
+  subject:
+    - same_topic_different_subject
+    - stale_same_source
+    - wrong_active
+    - topic_shift
+    - lookalike_or_soundalike
+  event:
+    - same_topic_different_subject
+    - same_subject_different_event
+    - stale_same_source
+    - wrong_active
+    - topic_shift
+    - lookalike_or_soundalike
+ess_eval_views:
+  - semantic_key
+  - subject_key
+  - event_key
+  - full_key
+  - prefix_512
+  - prefix_1024
+  - prefix_1536

event_eval.json ADDED Viewed

	@@ -0,0 +1,266 @@

+{
+  "checkpoint": "/shared/augmem/triembed/checkpoints/ess_aist_full_v7_librispeech360_l4i/checkpoint_epoch_11.pt",
+  "split": "val",
+  "records_path": "/shared/augmem/triembed/checkpoints/ess_ait_86m_20260430T035907Z/ess_corpus_v7_subject_media_wit4096_speech100k_wavcaps100k_librispeech360/val.jsonl",
+  "views": {
+    "semantic_key": {
+      "event_same_different_auc": {
+        "auc": 0.829112461248993,
+        "positive_pairs": 7703,
+        "negative_pairs": 327075,
+        "positive_mean": 0.7533491437897564,
+        "negative_mean": 0.5995114385256625
+      },
+      "same_subject_different_event_rejection_auc": {
+        "auc": 0.5802306316888112,
+        "positive_pairs": 7703,
+        "negative_pairs": 118115,
+        "positive_mean": 0.7533491437897564,
+        "negative_mean": 0.7313196869598024
+      },
+      "stale_same_source_rejection_auc": {
+        "auc": null,
+        "positive_pairs": 7703,
+        "negative_pairs": 0,
+        "positive_mean": 0.7533491437897564,
+        "negative_mean": null
+      },
+      "wrong_active_rejection_auc": {
+        "auc": null,
+        "positive_pairs": 7703,
+        "negative_pairs": 0,
+        "positive_mean": 0.7533491437897564,
+        "negative_mean": null
+      },
+      "topic_shift_rejection_auc": {
+        "auc": 0.9697933441859231,
+        "positive_pairs": 7703,
+        "negative_pairs": 208960,
+        "positive_mean": 0.7533491437897564,
+        "negative_mean": 0.525006599016673
+      }
+    },
+    "subject_key": {
+      "event_same_different_auc": {
+        "auc": 0.6676734827239529,
+        "positive_pairs": 7703,
+        "negative_pairs": 327075,
+        "positive_mean": 0.668074605508422,
+        "negative_mean": 0.5629470272292642
+      },
+      "same_subject_different_event_rejection_auc": {
+        "auc": 0.1862661483021773,
+        "positive_pairs": 7703,
+        "negative_pairs": 118115,
+        "positive_mean": 0.668074605508422,
+        "negative_mean": 0.7863739344748515
+      },
+      "stale_same_source_rejection_auc": {
+        "auc": null,
+        "positive_pairs": 7703,
+        "negative_pairs": 0,
+        "positive_mean": 0.668074605508422,
+        "negative_mean": null
+      },
+      "wrong_active_rejection_auc": {
+        "auc": null,
+        "positive_pairs": 7703,
+        "negative_pairs": 0,
+        "positive_mean": 0.668074605508422,
+        "negative_mean": null
+      },
+      "topic_shift_rejection_auc": {
+        "auc": 0.9397898078829693,
+        "positive_pairs": 7703,
+        "negative_pairs": 208960,
+        "positive_mean": 0.668074605508422,
+        "negative_mean": 0.43665458298485105
+      }
+    },
+    "event_key": {
+      "event_same_different_auc": {
+        "auc": 0.8240710674869262,
+        "positive_pairs": 7703,
+        "negative_pairs": 327075,
+        "positive_mean": 0.6786398216704803,
+        "negative_mean": 0.4559661049066459
+      },
+      "same_subject_different_event_rejection_auc": {
+        "auc": 0.5534970574958717,
+        "positive_pairs": 7703,
+        "negative_pairs": 118115,
+        "positive_mean": 0.6786398216704803,
+        "negative_mean": 0.6430964619714105
+      },
+      "stale_same_source_rejection_auc": {
+        "auc": null,
+        "positive_pairs": 7703,
+        "negative_pairs": 0,
+        "positive_mean": 0.6786398216704803,
+        "negative_mean": null
+      },
+      "wrong_active_rejection_auc": {
+        "auc": null,
+        "positive_pairs": 7703,
+        "negative_pairs": 0,
+        "positive_mean": 0.6786398216704803,
+        "negative_mean": null
+      },
+      "topic_shift_rejection_auc": {
+        "auc": 0.9770134927840807,
+        "positive_pairs": 7703,
+        "negative_pairs": 208960,
+        "positive_mean": 0.6786398216704803,
+        "negative_mean": 0.35019034818428435
+      }
+    },
+    "full_key": {
+      "event_same_different_auc": {
+        "auc": 0.78280390304866,
+        "positive_pairs": 7703,
+        "negative_pairs": 327075,
+        "positive_mean": 0.7056915993491164,
+        "negative_mean": 0.5454881820918898
+      },
+      "same_subject_different_event_rejection_auc": {
+        "auc": 0.44637572176232837,
+        "positive_pairs": 7703,
+        "negative_pairs": 118115,
+        "positive_mean": 0.7056915993491164,
+        "negative_mean": 0.7219292155613277
+      },
+      "stale_same_source_rejection_auc": {
+        "auc": null,
+        "positive_pairs": 7703,
+        "negative_pairs": 0,
+        "positive_mean": 0.7056915993491164,
+        "negative_mean": null
+      },
+      "wrong_active_rejection_auc": {
+        "auc": null,
+        "positive_pairs": 7703,
+        "negative_pairs": 0,
+        "positive_mean": 0.7056915993491164,
+        "negative_mean": null
+      },
+      "topic_shift_rejection_auc": {
+        "auc": 0.9729705121252057,
+        "positive_pairs": 7703,
+        "negative_pairs": 208960,
+        "positive_mean": 0.7056915993491164,
+        "negative_mean": 0.4457545839475431
+      }
+    },
+    "prefix_512": {
+      "event_same_different_auc": {
+        "auc": 0.829112461248993,
+        "positive_pairs": 7703,
+        "negative_pairs": 327075,
+        "positive_mean": 0.7533491437897564,
+        "negative_mean": 0.5995114385256625
+      },
+      "same_subject_different_event_rejection_auc": {
+        "auc": 0.5802306316888112,
+        "positive_pairs": 7703,
+        "negative_pairs": 118115,
+        "positive_mean": 0.7533491437897564,
+        "negative_mean": 0.7313196869598024
+      },
+      "stale_same_source_rejection_auc": {
+        "auc": null,
+        "positive_pairs": 7703,
+        "negative_pairs": 0,
+        "positive_mean": 0.7533491437897564,
+        "negative_mean": null
+      },
+      "wrong_active_rejection_auc": {
+        "auc": null,
+        "positive_pairs": 7703,
+        "negative_pairs": 0,
+        "positive_mean": 0.7533491437897564,
+        "negative_mean": null
+      },
+      "topic_shift_rejection_auc": {
+        "auc": 0.9697933441859231,
+        "positive_pairs": 7703,
+        "negative_pairs": 208960,
+        "positive_mean": 0.7533491437897564,
+        "negative_mean": 0.525006599016673
+      }
+    },
+    "prefix_1024": {
+      "event_same_different_auc": {
+        "auc": 0.7453008223026156,
+        "positive_pairs": 7703,
+        "negative_pairs": 327075,
+        "positive_mean": 0.7183707235626442,
+        "negative_mean": 0.5870387001189491
+      },
+      "same_subject_different_event_rejection_auc": {
+        "auc": 0.3570483627258597,
+        "positive_pairs": 7703,
+        "negative_pairs": 118115,
+        "positive_mean": 0.7183707235626442,
+        "negative_mean": 0.7610074661872391
+      },
+      "stale_same_source_rejection_auc": {
+        "auc": null,
+        "positive_pairs": 7703,
+        "negative_pairs": 0,
+        "positive_mean": 0.7183707235626442,
+        "negative_mean": null
+      },
+      "wrong_active_rejection_auc": {
+        "auc": null,
+        "positive_pairs": 7703,
+        "negative_pairs": 0,
+        "positive_mean": 0.7183707235626442,
+        "negative_mean": null
+      },
+      "topic_shift_rejection_auc": {
+        "auc": 0.9647611939666115,
+        "positive_pairs": 7703,
+        "negative_pairs": 208960,
+        "positive_mean": 0.7183707235626442,
+        "negative_mean": 0.48870255538236756
+      }
+    },
+    "prefix_1536": {
+      "event_same_different_auc": {
+        "auc": 0.78280390304866,
+        "positive_pairs": 7703,
+        "negative_pairs": 327075,
+        "positive_mean": 0.7056915993491164,
+        "negative_mean": 0.5454881820918898
+      },
+      "same_subject_different_event_rejection_auc": {
+        "auc": 0.44637572176232837,
+        "positive_pairs": 7703,
+        "negative_pairs": 118115,
+        "positive_mean": 0.7056915993491164,
+        "negative_mean": 0.7219292155613277
+      },
+      "stale_same_source_rejection_auc": {
+        "auc": null,
+        "positive_pairs": 7703,
+        "negative_pairs": 0,
+        "positive_mean": 0.7056915993491164,
+        "negative_mean": null
+      },
+      "wrong_active_rejection_auc": {
+        "auc": null,
+        "positive_pairs": 7703,
+        "negative_pairs": 0,
+        "positive_mean": 0.7056915993491164,
+        "negative_mean": null
+      },
+      "topic_shift_rejection_auc": {
+        "auc": 0.9729705121252057,
+        "positive_pairs": 7703,
+        "negative_pairs": 208960,
+        "positive_mean": 0.7056915993491164,
+        "negative_mean": 0.4457545839475431
+      }
+    }
+  }
+}

export_metadata.json ADDED Viewed

	@@ -0,0 +1,49 @@

+{
+  "model_name": "ESS-AIST-81M",
+  "base_family": "AIST-86M-compatible",
+  "output_dimension": 1536,
+  "slices": {
+    "semantic": [
+      0,
+      512
+    ],
+    "subject": [
+      512,
+      1024
+    ],
+    "event": [
+      1024,
+      1536
+    ]
+  },
+  "prefixes": {
+    "semantic_prefix": 512,
+    "semantic_subject_prefix": 1024,
+    "full_prefix": 1536
+  },
+  "normalized_views": {
+    "semantic_key": "l2norm(z[0:512])",
+    "subject_key": "l2norm(z[512:1024])",
+    "event_key": "l2norm(z[1024:1536])",
+    "full_key": "l2norm(z[0:1536])"
+  },
+  "supported_modalities": [
+    "text",
+    "image",
+    "audio"
+  ],
+  "normalization_behavior": {
+    "raw_embedding": "un-normalized 1536d vector",
+    "recommended_runtime": "L2-normalize per-slice or full vector depending on task"
+  },
+  "matryoshka_behavior": {
+    "512": "semantic retrieval",
+    "1024": "semantic plus subject continuity",
+    "1536": "semantic plus subject plus event continuity"
+  },
+  "optional_probes": [
+    "salience_score",
+    "novelty_score",
+    "boundary_score"
+  ]
+}

manifest.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "model_id": "ESS-AIST-81M",
+  "trimodal_checkpoint": "/shared/augmem/triembed/checkpoints/ess_aist_full_v7_librispeech360_l4i/checkpoint_epoch_11.pt",
+  "audio_checkpoint": "/shared/augmem/triembed/checkpoints/mn20_native_lora_aistmix_audioheavy100k175k175k_continue_from_balanced_20260426T143137Z/latest_model.pt",
+  "safetensors": "/shared/augmem/triembed/dist/ESS-AIST-81M-preview/ESS-AIST-81M.safetensors",
+  "gguf": [
+    "/shared/augmem/triembed/dist/ESS-AIST-81M-preview/ESS-AIST-81M_q8_0.gguf",
+    "/shared/augmem/triembed/dist/ESS-AIST-81M-preview/ESS-AIST-81M_q5_1.gguf"
+  ]
+}

parameter_breakdown.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "text_encoder": 22861056,
+  "image_encoder": 8434512,
+  "audio_encoder": 20639974,
+  "image_projection": 9975296,
+  "audio_projection": 9975296,
+  "text_projection": 8926720,
+  "total_exact_loaded_params": 80812854
+}

prefix_eval.json ADDED Viewed

	@@ -0,0 +1,48 @@

+{
+  "checkpoint": "/shared/augmem/triembed/checkpoints/ess_aist_full_v7_librispeech360_l4i/checkpoint_epoch_11.pt",
+  "split": "val",
+  "views": {
+    "semantic_key": {
+      "subject_same_different_auc": 0.4265240470767738,
+      "event_same_different_auc": 0.829112461248993,
+      "same_topic_different_subject_rejection_auc": 0.4265240470767738,
+      "same_subject_different_event_rejection_auc": 0.5802306316888112
+    },
+    "subject_key": {
+      "subject_same_different_auc": 0.5066875746523821,
+      "event_same_different_auc": 0.6676734827239529,
+      "same_topic_different_subject_rejection_auc": 0.5066875746523821,
+      "same_subject_different_event_rejection_auc": 0.1862661483021773
+    },
+    "event_key": {
+      "subject_same_different_auc": 0.3832485276953712,
+      "event_same_different_auc": 0.8240710674869262,
+      "same_topic_different_subject_rejection_auc": 0.3832485276953712,
+      "same_subject_different_event_rejection_auc": 0.5534970574958717
+    },
+    "full_key": {
+      "subject_same_different_auc": 0.42067046032727157,
+      "event_same_different_auc": 0.78280390304866,
+      "same_topic_different_subject_rejection_auc": 0.42067046032727157,
+      "same_subject_different_event_rejection_auc": 0.44637572176232837
+    },
+    "prefix_512": {
+      "subject_same_different_auc": 0.4265240470767738,
+      "event_same_different_auc": 0.829112461248993,
+      "same_topic_different_subject_rejection_auc": 0.4265240470767738,
+      "same_subject_different_event_rejection_auc": 0.5802306316888112
+    },
+    "prefix_1024": {
+      "subject_same_different_auc": 0.4690923681629257,
+      "event_same_different_auc": 0.7453008223026156,
+      "same_topic_different_subject_rejection_auc": 0.4690923681629257,
+      "same_subject_different_event_rejection_auc": 0.3570483627258597
+    },
+    "prefix_1536": {
+      "subject_same_different_auc": 0.42067046032727157,
+      "event_same_different_auc": 0.78280390304866,
+      "same_topic_different_subject_rejection_auc": 0.42067046032727157,
+      "same_subject_different_event_rejection_auc": 0.44637572176232837
+    }
+  }
+}

retrieval_512_gt1030.json ADDED Viewed

	@@ -0,0 +1,40 @@

+{
+  "SALT-512": {
+    "A->I_r1": 0.4828965961933136,
+    "A->I_r10": 0.8761752843856812,
+    "A->I_r5": 0.7863572835922241,
+    "A->T_r1": 0.24084816873073578,
+    "A->T_r10": 0.5153030753135681,
+    "A->T_r5": 0.45209044218063354,
+    "I->A_r1": 0.46209242939949036,
+    "I->A_r10": 0.881176233291626,
+    "I->A_r5": 0.7905581593513489,
+    "I->T_r1": 0.41488298773765564,
+    "I->T_r10": 0.5707141757011414,
+    "I->T_r5": 0.5401080250740051,
+    "T->A_r1": 0.2486497312784195,
+    "T->A_r10": 0.5323064923286438,
+    "T->A_r5": 0.46209242939949036,
+    "T->I_r1": 0.43268653750419617,
+    "T->I_r10": 0.5763152837753296,
+    "T->I_r5": 0.550710141658783
+  },
+  "_meta": {
+    "audio_suffix": "mn20_audioheavy_lora1280_audio_features",
+    "checkpoint": "/shared/augmem/triembed/checkpoints/ess_aist_full_v7_librispeech360_l4i/checkpoint_epoch_11.pt",
+    "device": "NVIDIA GeForce GT 1030",
+    "dims": [
+      512
+    ],
+    "encoder_name": "mobilenetv4_conv_medium",
+    "image_suffix": "mobilenetv4_conv_medium_image_features"
+  },
+  "speech_chatterbox-512": {
+    "A->T_r1": 0.46719998121261597,
+    "A->T_r10": 0.824999988079071,
+    "A->T_r5": 0.739799976348877,
+    "T->A_r1": 0.46059998869895935,
+    "T->A_r10": 0.8277999758720398,
+    "T->A_r5": 0.7425999641418457
+  }
+}

subject_eval.json ADDED Viewed

	@@ -0,0 +1,119 @@

+{
+  "checkpoint": "/shared/augmem/triembed/checkpoints/ess_aist_full_v7_librispeech360_l4i/checkpoint_epoch_11.pt",
+  "split": "val",
+  "records_path": "/shared/augmem/triembed/checkpoints/ess_ait_86m_20260430T035907Z/ess_corpus_v7_subject_media_wit4096_speech100k_wavcaps100k_librispeech360/val.jsonl",
+  "views": {
+    "semantic_key": {
+      "subject_same_different_auc": {
+        "auc": 0.4265240470767738,
+        "positive_pairs": 160248,
+        "negative_pairs": 6805,
+        "positive_mean": 0.7436165443684017,
+        "negative_mean": 0.7611866422867968
+      },
+      "same_topic_different_subject_rejection_auc": {
+        "auc": 0.4265240470767738,
+        "positive_pairs": 160248,
+        "negative_pairs": 6805,
+        "positive_mean": 0.7436165443684017,
+        "negative_mean": 0.7611866422867968
+      }
+    },
+    "subject_key": {
+      "subject_same_different_auc": {
+        "auc": 0.5066875746523821,
+        "positive_pairs": 160248,
+        "negative_pairs": 6805,
+        "positive_mean": 0.7964573271532047,
+        "negative_mean": 0.7948588548339001
+      },
+      "same_topic_different_subject_rejection_auc": {
+        "auc": 0.5066875746523821,
+        "positive_pairs": 160248,
+        "negative_pairs": 6805,
+        "positive_mean": 0.7964573271532047,
+        "negative_mean": 0.7948588548339001
+      }
+    },
+    "event_key": {
+      "subject_same_different_auc": {
+        "auc": 0.3832485276953712,
+        "positive_pairs": 160248,
+        "negative_pairs": 6805,
+        "positive_mean": 0.6533856675037972,
+        "negative_mean": 0.7097943563909324
+      },
+      "same_topic_different_subject_rejection_auc": {
+        "auc": 0.3832485276953712,
+        "positive_pairs": 160248,
+        "negative_pairs": 6805,
+        "positive_mean": 0.6533856675037972,
+        "negative_mean": 0.7097943563909324
+      }
+    },
+    "full_key": {
+      "subject_same_different_auc": {
+        "auc": 0.42067046032727157,
+        "positive_pairs": 160248,
+        "negative_pairs": 6805,
+        "positive_mean": 0.7333961783599913,
+        "negative_mean": 0.754728921120172
+      },
+      "same_topic_different_subject_rejection_auc": {
+        "auc": 0.42067046032727157,
+        "positive_pairs": 160248,
+        "negative_pairs": 6805,
+        "positive_mean": 0.7333961783599913,
+        "negative_mean": 0.754728921120172
+      }
+    },
+    "prefix_512": {
+      "subject_same_different_auc": {
+        "auc": 0.4265240470767738,
+        "positive_pairs": 160248,
+        "negative_pairs": 6805,
+        "positive_mean": 0.7436165443684017,
+        "negative_mean": 0.7611866422867968
+      },
+      "same_topic_different_subject_rejection_auc": {
+        "auc": 0.4265240470767738,
+        "positive_pairs": 160248,
+        "negative_pairs": 6805,
+        "positive_mean": 0.7436165443684017,
+        "negative_mean": 0.7611866422867968
+      }
+    },
+    "prefix_1024": {
+      "subject_same_different_auc": {
+        "auc": 0.4690923681629257,
+        "positive_pairs": 160248,
+        "negative_pairs": 6805,
+        "positive_mean": 0.7721689081039962,
+        "negative_mean": 0.7791595536982812
+      },
+      "same_topic_different_subject_rejection_auc": {
+        "auc": 0.4690923681629257,
+        "positive_pairs": 160248,
+        "negative_pairs": 6805,
+        "positive_mean": 0.7721689081039962,
+        "negative_mean": 0.7791595536982812
+      }
+    },
+    "prefix_1536": {
+      "subject_same_different_auc": {
+        "auc": 0.42067046032727157,
+        "positive_pairs": 160248,
+        "negative_pairs": 6805,
+        "positive_mean": 0.7333961783599913,
+        "negative_mean": 0.754728921120172
+      },
+      "same_topic_different_subject_rejection_auc": {
+        "auc": 0.42067046032727157,
+        "positive_pairs": 160248,
+        "negative_pairs": 6805,
+        "positive_mean": 0.7333961783599913,
+        "negative_mean": 0.754728921120172
+      }
+    }
+  }
+}