File size: 5,889 Bytes
6835659
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
358d3bc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
"""
Centralized configuration for MultiModal Coherence AI.

All magic numbers, model names, paths, and thresholds live here.
Import from this module instead of hardcoding values in source files.
"""

from __future__ import annotations

import os
from pathlib import Path

# ---------------------------------------------------------------------------
# Paths
# ---------------------------------------------------------------------------

# Project root: three `.parent` hops up from this file's resolved location.
PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent

# Data tree, the embedding indexes stored inside it, and the coherence stats
# file kept under the project's artifacts folder.
DATA_DIR = PROJECT_ROOT.joinpath("data")
IMAGE_INDEX_PATH = DATA_DIR.joinpath("embeddings", "image_index.npz")
AUDIO_INDEX_PATH = DATA_DIR.joinpath("embeddings", "audio_index.npz")
COHERENCE_STATS_PATH = PROJECT_ROOT.joinpath("artifacts", "coherence_stats.json")

# Image / audio directories: the "processed" folder first, then the
# source-specific folder ("wikimedia" for images, "freesound" for audio).
IMAGE_DIRS = [DATA_DIR.joinpath(source, "images") for source in ("processed", "wikimedia")]
AUDIO_DIRS = [DATA_DIR.joinpath(source, "audio") for source in ("processed", "freesound")]

# Location of the embedding cache.
CACHE_DIR = PROJECT_ROOT.joinpath(".cache", "embeddings")

# Location of experiment output.
RUNS_DIR = PROJECT_ROOT.joinpath("runs")

# ---------------------------------------------------------------------------
# Model names
# ---------------------------------------------------------------------------

# Fixed model identifier strings.
CLIP_MODEL = "openai/clip-vit-base-patch32"
CLAP_MODEL = "laion/clap-htsat-unfused"
HF_FALLBACK_MODEL = "gpt2"

# Ollama settings are read once, at import time, from the environment
# variables of the same name; the second argument is the default used when
# the variable is unset.
OLLAMA_MODEL = os.environ.get("OLLAMA_MODEL", "qwen2:7b")
OLLAMA_HOST = os.environ.get("OLLAMA_HOST", "http://localhost:11434")

# ---------------------------------------------------------------------------
# Embedding dimensions
# ---------------------------------------------------------------------------

# Vector sizes for the CLIP and CLAP embeddings and for the target space.
# All three are currently 512.
CLIP_DIM: int = 512
CLAP_DIM: int = 512
TARGET_DIM: int = 512

# ---------------------------------------------------------------------------
# MSCI weights
# ---------------------------------------------------------------------------

# Hypothesized weights — they have not been derived empirically.
# The text-image and text-audio terms get equal weight. The image-audio term
# is down-weighted because CLIP and CLAP are different embedding spaces.
MSCI_WEIGHTS = dict(
    st_i=0.45,  # text-image (CLIP shared space)
    st_a=0.45,  # text-audio (CLAP shared space)
    si_a=0.10,  # image-audio (cross-space — usually omitted)
)

# ---------------------------------------------------------------------------
# Retrieval thresholds
# ---------------------------------------------------------------------------

# NOTE(review): presumably similarity scores, with candidates below the
# *_MIN_SIMILARITY value rejected and image matches below the WARN value
# only flagged — confirm against the retrieval code that reads these.
IMAGE_MIN_SIMILARITY: float = 0.20
AUDIO_MIN_SIMILARITY: float = 0.10
IMAGE_LOW_SIMILARITY_WARN: float = 0.25

# ---------------------------------------------------------------------------
# Text generation
# ---------------------------------------------------------------------------

# Token budget for generated text.
TEXT_MAX_TOKENS: int = 160

# Deterministic sampling preset: temperature 0.0 with top-p 1.0.
TEXT_TEMPERATURE_DETERMINISTIC: float = 0.0
TEXT_TOP_P_DETERMINISTIC: float = 1.0

# Stochastic sampling preset: temperature 0.7 with top-p 0.9.
TEXT_TEMPERATURE_STOCHASTIC: float = 0.7
TEXT_TOP_P_STOCHASTIC: float = 0.9

# ---------------------------------------------------------------------------
# Audio generation (fallback ambient)
# ---------------------------------------------------------------------------

# Clip length in seconds for the fallback ambient audio.
AUDIO_DURATION_SEC: float = 6.0
# Sample rate for the same audio (presumably in Hz — confirm with the generator).
AUDIO_SAMPLE_RATE: int = 48_000

# ---------------------------------------------------------------------------
# Drift detection
# ---------------------------------------------------------------------------

# Size of the |st_i - st_a| gap at which drift is flagged.
DRIFT_ASYMMETRY_THRESHOLD: float = 0.15

# ---------------------------------------------------------------------------
# Human evaluation
# ---------------------------------------------------------------------------

# Fraction used for re-rating (0.20, i.e. one fifth).
RERATING_FRACTION: float = 0.20

# Acceptance cut-offs for the agreement statistics.
# NOTE(review): presumably Cohen's kappa and Krippendorff's alpha — confirm
# against the evaluation code.
KAPPA_ACCEPTABLE_THRESHOLD: float = 0.70
ALPHA_ACCEPTABLE_THRESHOLD: float = 0.667

# ---------------------------------------------------------------------------
# cMSCI (Calibrated Multimodal Semantic Coherence Index)
# ---------------------------------------------------------------------------

# Calibration store, fitted from the RQ1 baseline data.
CMSCI_CALIBRATION_PATH = PROJECT_ROOT.joinpath("artifacts", "cmsci_calibration.json")

# Weights for the Ex-MCR cross-space alignment (projects CLAP → CLIP).
EXMCR_WEIGHTS_PATH = PROJECT_ROOT.joinpath("models", "exmcr", "ex_clap.pt")

# Weights for the Cross-Space Bridge, which maps CLIP image and CLAP audio
# embeddings into a shared 256-d bridge space.
BRIDGE_WEIGHTS_PATH = PROJECT_ROOT.joinpath("models", "bridge", "bridge_best.pt")

# Weights for the probabilistic adapters (ProbVLM-style uncertainty), one per encoder.
PROB_CLIP_ADAPTER_PATH = PROJECT_ROOT.joinpath("models", "prob_adapters", "clip_adapter.pt")
PROB_CLAP_ADAPTER_PATH = PROJECT_ROOT.joinpath("models", "prob_adapters", "clap_adapter.pt")

# Full-pipeline parameters, optimized via LOO-CV on the RQ3 human ratings.
# Reported figures: full-sample rho=0.608 (p=0.0004), LOO-CV rho=0.546 (p=0.0018),
# overfit gap=0.001. Selected in 87% of LOO folds (26/30) — highly stable.

# Margin scaling factor (amplifies the contrastive signal).
CMSCI_MARGIN_ALPHA: int = 16
# Text-image channel weight; text-audio receives 1 - w.
CMSCI_CHANNEL_WEIGHT_TI: float = 0.90
# Either "cosine" (z-norm cosine sims) or "gram" (z-norm gram coherences).
CMSCI_CALIBRATION_MODE: str = "gram"

# Variant E — ExMCR cross-modal complementarity (w_3d=0 recovers D exactly).
# ExMCR projects CLAP audio → CLIP space; complementarity = Gramian dispersion.
# High complementarity means image and audio contribute unique perspectives,
# which is rewarded.
# Weight applied to the z-normalized IA complementarity:
CMSCI_W_3D: float = 0.45

# Variant F — ProbVLM adaptive channel weighting (gamma=0 recovers E exactly).
# Mixing ratio: w_final = (1-gamma)*base_w + gamma*adaptive_w
CMSCI_GAMMA: float = 0.10

# Contrastive negative bank: number of hard negatives per modality, and the
# switch that enables/disables contrastive calibration.
CMSCI_NEGATIVE_K: int = 5
CMSCI_NEGATIVE_BANK_ENABLED: bool = True

# Number of Monte Carlo samples drawn for Variant F's uncertainty estimation.
CMSCI_MC_SAMPLES: int = 100

# Training settings for the probabilistic adapters.
PROB_ADAPTER_EPOCHS: int = 100
PROB_ADAPTER_LR: float = 1e-4
PROB_ADAPTER_BATCH_SIZE: int = 32
PROB_ADAPTER_PATIENCE: int = 15