""" Centralized configuration for MultiModal Coherence AI. All magic numbers, model names, paths, and thresholds live here. Import from this module instead of hardcoding values in source files. """ from __future__ import annotations import os from pathlib import Path # --------------------------------------------------------------------------- # Paths # --------------------------------------------------------------------------- PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent # Data directories DATA_DIR = PROJECT_ROOT / "data" IMAGE_INDEX_PATH = DATA_DIR / "embeddings" / "image_index.npz" AUDIO_INDEX_PATH = DATA_DIR / "embeddings" / "audio_index.npz" COHERENCE_STATS_PATH = PROJECT_ROOT / "artifacts" / "coherence_stats.json" IMAGE_DIRS = [ DATA_DIR / "processed" / "images", DATA_DIR / "wikimedia" / "images", ] AUDIO_DIRS = [ DATA_DIR / "processed" / "audio", DATA_DIR / "freesound" / "audio", ] # Embedding cache CACHE_DIR = PROJECT_ROOT / ".cache" / "embeddings" # Experiment output RUNS_DIR = PROJECT_ROOT / "runs" # --------------------------------------------------------------------------- # Model names # --------------------------------------------------------------------------- CLIP_MODEL = "openai/clip-vit-base-patch32" CLAP_MODEL = "laion/clap-htsat-unfused" OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "qwen2:7b") OLLAMA_HOST = os.getenv("OLLAMA_HOST", "http://localhost:11434") HF_FALLBACK_MODEL = "gpt2" # --------------------------------------------------------------------------- # Embedding dimensions # --------------------------------------------------------------------------- CLIP_DIM = 512 CLAP_DIM = 512 TARGET_DIM = 512 # --------------------------------------------------------------------------- # MSCI weights # --------------------------------------------------------------------------- # These weights are hypothesized, not empirically derived. # Text-image and text-audio are weighted equally; image-audio is down-weighted # because CLIP and CLAP are different embedding spaces. MSCI_WEIGHTS = { "st_i": 0.45, # text-image (CLIP shared space) "st_a": 0.45, # text-audio (CLAP shared space) "si_a": 0.10, # image-audio (cross-space — usually omitted) } # --------------------------------------------------------------------------- # Retrieval thresholds # --------------------------------------------------------------------------- IMAGE_MIN_SIMILARITY = 0.20 AUDIO_MIN_SIMILARITY = 0.10 IMAGE_LOW_SIMILARITY_WARN = 0.25 # --------------------------------------------------------------------------- # Text generation # --------------------------------------------------------------------------- TEXT_MAX_TOKENS = 160 TEXT_TEMPERATURE_DETERMINISTIC = 0.0 TEXT_TEMPERATURE_STOCHASTIC = 0.7 TEXT_TOP_P_DETERMINISTIC = 1.0 TEXT_TOP_P_STOCHASTIC = 0.9 # --------------------------------------------------------------------------- # Audio generation (fallback ambient) # --------------------------------------------------------------------------- AUDIO_DURATION_SEC = 6.0 AUDIO_SAMPLE_RATE = 48000 # --------------------------------------------------------------------------- # Drift detection # --------------------------------------------------------------------------- DRIFT_ASYMMETRY_THRESHOLD = 0.15 # |st_i - st_a| gap to flag drift # --------------------------------------------------------------------------- # Human evaluation # --------------------------------------------------------------------------- RERATING_FRACTION = 0.20 KAPPA_ACCEPTABLE_THRESHOLD = 0.70 ALPHA_ACCEPTABLE_THRESHOLD = 0.667 # --------------------------------------------------------------------------- # cMSCI (Calibrated Multimodal Semantic Coherence Index) # --------------------------------------------------------------------------- # Calibration store (fitted from RQ1 baseline data) CMSCI_CALIBRATION_PATH = PROJECT_ROOT / "artifacts" / "cmsci_calibration.json" # Ex-MCR cross-space alignment (CLAP → CLIP projection) EXMCR_WEIGHTS_PATH = PROJECT_ROOT / "models" / "exmcr" / "ex_clap.pt" # Cross-Space Bridge (CLIP image + CLAP audio → shared 256-d bridge space) BRIDGE_WEIGHTS_PATH = PROJECT_ROOT / "models" / "bridge" / "bridge_best.pt" # Probabilistic adapters (ProbVLM-style uncertainty) PROB_CLIP_ADAPTER_PATH = PROJECT_ROOT / "models" / "prob_adapters" / "clip_adapter.pt" PROB_CLAP_ADAPTER_PATH = PROJECT_ROOT / "models" / "prob_adapters" / "clap_adapter.pt" # Full pipeline optimized parameters (via LOO-CV on RQ3 human ratings) # Full-sample rho=0.608 (p=0.0004), LOO-CV rho=0.546 (p=0.0018), overfit gap=0.001 # Selected in 87% of LOO folds (26/30) — highly stable CMSCI_MARGIN_ALPHA = 16 # Margin scaling factor (amplifies contrastive signal) CMSCI_CHANNEL_WEIGHT_TI = 0.90 # Text-image channel weight (1 - w for text-audio) CMSCI_CALIBRATION_MODE = "gram" # "cosine" (z-norm cosine sims) or "gram" (z-norm gram coherences) # Variant E: ExMCR cross-modal complementarity (w_3d=0 recovers D exactly) # ExMCR projects CLAP audio → CLIP space; complementarity = Gramian dispersion # High complementarity = image and audio contribute unique perspectives (rewarded) CMSCI_W_3D = 0.45 # Weight for z-normalized IA complementarity # Variant F: ProbVLM adaptive channel weighting (gamma=0 recovers E exactly) CMSCI_GAMMA = 0.10 # Mixing ratio: w_final = (1-gamma)*base_w + gamma*adaptive_w # Contrastive negative bank CMSCI_NEGATIVE_K = 5 # Number of hard negatives per modality CMSCI_NEGATIVE_BANK_ENABLED = True # Enable/disable contrastive calibration # MC sampling for uncertainty estimation CMSCI_MC_SAMPLES = 100 # Number of Monte Carlo samples for Variant F # Probabilistic adapter training PROB_ADAPTER_EPOCHS = 100 PROB_ADAPTER_LR = 1e-4 PROB_ADAPTER_BATCH_SIZE = 32 PROB_ADAPTER_PATIENCE = 15