"""
Centralized configuration for MultiModal Coherence AI.

All magic numbers, model names, paths, and thresholds live here.
Import from this module instead of hardcoding values in source files.
"""
from __future__ import annotations

import os
from pathlib import Path
# ---------------------------------------------------------------------------
# Paths
# ---------------------------------------------------------------------------

# Project root, found by walking three `.parent` steps up from this file's
# resolved absolute path. Every other path in this module hangs off it.
PROJECT_ROOT: Path = Path(__file__).resolve().parent.parent.parent

# Data directories
DATA_DIR: Path = PROJECT_ROOT / "data"
IMAGE_INDEX_PATH: Path = DATA_DIR / "embeddings" / "image_index.npz"
AUDIO_INDEX_PATH: Path = DATA_DIR / "embeddings" / "audio_index.npz"
COHERENCE_STATS_PATH: Path = PROJECT_ROOT / "artifacts" / "coherence_stats.json"

# Directories holding image assets (processed set first, then Wikimedia).
IMAGE_DIRS: list[Path] = [
    DATA_DIR / "processed" / "images",
    DATA_DIR / "wikimedia" / "images",
]

# Directories holding audio assets (processed set first, then Freesound).
AUDIO_DIRS: list[Path] = [
    DATA_DIR / "processed" / "audio",
    DATA_DIR / "freesound" / "audio",
]

# Embedding cache
CACHE_DIR: Path = PROJECT_ROOT / ".cache" / "embeddings"

# Experiment output
RUNS_DIR: Path = PROJECT_ROOT / "runs"
# ---------------------------------------------------------------------------
# Model names
# ---------------------------------------------------------------------------
CLIP_MODEL: str = "openai/clip-vit-base-patch32"
CLAP_MODEL: str = "laion/clap-htsat-unfused"

# Both Ollama settings can be overridden through the environment. `or` is used
# instead of a getenv default so that a variable which is set but empty
# (e.g. `OLLAMA_MODEL=` in a .env file) still falls back to the default
# rather than producing an empty string.
OLLAMA_MODEL: str = os.getenv("OLLAMA_MODEL") or "qwen2:7b"
OLLAMA_HOST: str = os.getenv("OLLAMA_HOST") or "http://localhost:11434"

# NOTE(review): presumably the Hugging Face model used when Ollama is not
# reachable — confirm against the text generator.
HF_FALLBACK_MODEL: str = "gpt2"
# ---------------------------------------------------------------------------
# Embedding dimensions
# ---------------------------------------------------------------------------
# NOTE(review): presumably the output widths of the CLIP and CLAP encoders
# named in this module, and the common width embeddings are brought to —
# confirm against the embedder code.
CLIP_DIM: int = 512
CLAP_DIM: int = 512
TARGET_DIM: int = 512
# ---------------------------------------------------------------------------
# MSCI weights
# ---------------------------------------------------------------------------
# These weights are hypothesized, not empirically derived.
# Text-image and text-audio are weighted equally; image-audio is down-weighted
# because CLIP and CLAP are different embedding spaces.
# The three weights sum to 1.0.
MSCI_WEIGHTS: dict[str, float] = {
    "st_i": 0.45,  # text-image (CLIP shared space)
    "st_a": 0.45,  # text-audio (CLAP shared space)
    "si_a": 0.10,  # image-audio (cross-space — usually omitted)
}
# ---------------------------------------------------------------------------
# Retrieval thresholds
# ---------------------------------------------------------------------------
# NOTE(review): presumably the lowest similarity at which a retrieved
# image/audio candidate is still accepted — confirm against the retrievers.
IMAGE_MIN_SIMILARITY: float = 0.20
AUDIO_MIN_SIMILARITY: float = 0.10

# Sits above IMAGE_MIN_SIMILARITY; presumably matches between the two values
# are accepted but reported as weak — TODO confirm.
IMAGE_LOW_SIMILARITY_WARN: float = 0.25
# ---------------------------------------------------------------------------
# Text generation
# ---------------------------------------------------------------------------
TEXT_MAX_TOKENS: int = 160

# Sampling settings come in two presets: a deterministic one
# (temperature 0.0, top-p 1.0) and a stochastic one (temperature 0.7, top-p 0.9).
TEXT_TEMPERATURE_DETERMINISTIC: float = 0.0
TEXT_TEMPERATURE_STOCHASTIC: float = 0.7
TEXT_TOP_P_DETERMINISTIC: float = 1.0
TEXT_TOP_P_STOCHASTIC: float = 0.9
# ---------------------------------------------------------------------------
# Audio generation (fallback ambient)
# ---------------------------------------------------------------------------
AUDIO_DURATION_SEC: float = 6.0  # clip length in seconds
AUDIO_SAMPLE_RATE: int = 48000  # presumably samples per second (Hz) — confirm
# ---------------------------------------------------------------------------
# Drift detection
# ---------------------------------------------------------------------------
# |st_i - st_a| gap to flag drift, i.e. how far the text-image and text-audio
# scores may diverge before the sample is flagged.
DRIFT_ASYMMETRY_THRESHOLD: float = 0.15
# ---------------------------------------------------------------------------
# Human evaluation
# ---------------------------------------------------------------------------
# NOTE(review): presumably the share of items shown to raters a second time
# for consistency checks — confirm against the evaluation scripts.
RERATING_FRACTION: float = 0.20

# Minimum agreement values treated as acceptable.
# NOTE(review): the names suggest a kappa statistic and Krippendorff's alpha —
# confirm which statistics are actually computed.
KAPPA_ACCEPTABLE_THRESHOLD: float = 0.70
ALPHA_ACCEPTABLE_THRESHOLD: float = 0.667
# ---------------------------------------------------------------------------
# cMSCI (Calibrated Multimodal Semantic Coherence Index)
# ---------------------------------------------------------------------------
# All artifact locations below are resolved relative to PROJECT_ROOT.

# Calibration store (fitted from RQ1 baseline data)
CMSCI_CALIBRATION_PATH: Path = PROJECT_ROOT / "artifacts" / "cmsci_calibration.json"

# Ex-MCR cross-space alignment (CLAP → CLIP projection)
EXMCR_WEIGHTS_PATH: Path = PROJECT_ROOT / "models" / "exmcr" / "ex_clap.pt"

# Cross-Space Bridge (CLIP image + CLAP audio → shared 256-d bridge space)
BRIDGE_WEIGHTS_PATH: Path = PROJECT_ROOT / "models" / "bridge" / "bridge_best.pt"

# Probabilistic adapters (ProbVLM-style uncertainty)
PROB_CLIP_ADAPTER_PATH: Path = PROJECT_ROOT / "models" / "prob_adapters" / "clip_adapter.pt"
PROB_CLAP_ADAPTER_PATH: Path = PROJECT_ROOT / "models" / "prob_adapters" / "clap_adapter.pt"
# Full pipeline optimized parameters (via LOO-CV on RQ3 human ratings)
# Full-sample rho=0.608 (p=0.0004), LOO-CV rho=0.546 (p=0.0018), overfit gap=0.001
# Selected in 87% of LOO folds (26/30) — highly stable
CMSCI_MARGIN_ALPHA: int = 16  # Margin scaling factor (amplifies contrastive signal)
CMSCI_CHANNEL_WEIGHT_TI: float = 0.90  # Text-image channel weight (1 - w for text-audio)
# "cosine" (z-norm cosine sims) or "gram" (z-norm gram coherences)
CMSCI_CALIBRATION_MODE: str = "gram"

# Variant E: ExMCR cross-modal complementarity (w_3d=0 recovers D exactly)
# ExMCR projects CLAP audio → CLIP space; complementarity = Gramian dispersion
# High complementarity = image and audio contribute unique perspectives (rewarded)
CMSCI_W_3D: float = 0.45  # Weight for z-normalized IA complementarity

# Variant F: ProbVLM adaptive channel weighting (gamma=0 recovers E exactly)
# Mixing ratio: w_final = (1-gamma)*base_w + gamma*adaptive_w
CMSCI_GAMMA: float = 0.10

# Contrastive negative bank
CMSCI_NEGATIVE_K: int = 5  # Number of hard negatives per modality
CMSCI_NEGATIVE_BANK_ENABLED: bool = True  # Enable/disable contrastive calibration

# MC sampling for uncertainty estimation
CMSCI_MC_SAMPLES: int = 100  # Number of Monte Carlo samples for Variant F

# Probabilistic adapter training
PROB_ADAPTER_EPOCHS: int = 100
PROB_ADAPTER_LR: float = 1e-4
PROB_ADAPTER_BATCH_SIZE: int = 32
PROB_ADAPTER_PATIENCE: int = 15