# Spaces: pratik-250620 / MultiModal-Coherence-AI — Running — File size: 5,889 Bytes

"""
Centralized configuration for MultiModal Coherence AI.

All magic numbers, model names, paths, and thresholds live here.
Import from this module instead of hardcoding values in source files.
"""

from __future__ import annotations

import os
from pathlib import Path

# ---------------------------------------------------------------------------
# Paths
# ---------------------------------------------------------------------------

# Repository root: three levels up from this file's resolved location.
PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent

# Everything under <root>/data; both .npz indexes share data/embeddings.
DATA_DIR = PROJECT_ROOT.joinpath("data")
IMAGE_INDEX_PATH = DATA_DIR.joinpath("embeddings", "image_index.npz")
AUDIO_INDEX_PATH = DATA_DIR.joinpath("embeddings", "audio_index.npz")
COHERENCE_STATS_PATH = PROJECT_ROOT.joinpath("artifacts", "coherence_stats.json")

# Media folders, order preserved: the "processed" folder first, then the
# wikimedia (images) / freesound (audio) folder.
IMAGE_DIRS = [DATA_DIR.joinpath(source, "images") for source in ("processed", "wikimedia")]
AUDIO_DIRS = [DATA_DIR.joinpath(source, "audio") for source in ("processed", "freesound")]

# Embedding cache
CACHE_DIR = PROJECT_ROOT.joinpath(".cache", "embeddings")

# Experiment output
RUNS_DIR = PROJECT_ROOT.joinpath("runs")

# ---------------------------------------------------------------------------
# Model names
# ---------------------------------------------------------------------------

# Encoder checkpoints (the "org/name" form looks like Hugging Face Hub ids —
# confirm against the code that loads them).
CLIP_MODEL = "openai/clip-vit-base-patch32"
CLAP_MODEL = "laion/clap-htsat-unfused"

# Ollama settings can be overridden through the environment. `or` is used
# instead of a getenv default so that a variable which is set but empty
# still falls back to the built-in value rather than yielding "".
OLLAMA_MODEL = os.getenv("OLLAMA_MODEL") or "qwen2:7b"
OLLAMA_HOST = os.getenv("OLLAMA_HOST") or "http://localhost:11434"

# Model name used as the fallback (see the name; consumer not visible here).
HF_FALLBACK_MODEL = "gpt2"

# ---------------------------------------------------------------------------
# Embedding dimensions
# ---------------------------------------------------------------------------

# All three widths are currently the same value; split this assignment if
# any of them ever needs to differ.
CLIP_DIM = CLAP_DIM = TARGET_DIM = 512

# ---------------------------------------------------------------------------
# MSCI weights
# ---------------------------------------------------------------------------

# Hypothesized weights — not empirically derived. The two text-anchored
# pairs are weighted equally; the image-audio pair is down-weighted because
# CLIP and CLAP are different embedding spaces.
MSCI_WEIGHTS = dict(
    st_i=0.45,  # text-image (CLIP shared space)
    st_a=0.45,  # text-audio (CLAP shared space)
    si_a=0.1,   # image-audio (cross-space — usually omitted)
)

# ---------------------------------------------------------------------------
# Retrieval thresholds
# ---------------------------------------------------------------------------

# NOTE(review): presumably similarity floors applied to retrieved images and
# audio clips — confirm against the retrieval code.
IMAGE_MIN_SIMILARITY = 0.2
AUDIO_MIN_SIMILARITY = 0.1
IMAGE_LOW_SIMILARITY_WARN = 0.25  # set above IMAGE_MIN_SIMILARITY

# ---------------------------------------------------------------------------
# Text generation
# ---------------------------------------------------------------------------

TEXT_MAX_TOKENS = 160

# Sampling presets, one (temperature, top-p) pair per mode.
TEXT_TEMPERATURE_DETERMINISTIC, TEXT_TOP_P_DETERMINISTIC = 0.0, 1.0
TEXT_TEMPERATURE_STOCHASTIC, TEXT_TOP_P_STOCHASTIC = 0.7, 0.9

# ---------------------------------------------------------------------------
# Audio generation (fallback ambient)
# ---------------------------------------------------------------------------

AUDIO_DURATION_SEC = 6.0    # clip length in seconds
AUDIO_SAMPLE_RATE = 48_000  # presumably samples per second (Hz) — confirm

# ---------------------------------------------------------------------------
# Drift detection
# ---------------------------------------------------------------------------

# Size of the |st_i - st_a| gap used to flag drift.
DRIFT_ASYMMETRY_THRESHOLD = 0.15

# ---------------------------------------------------------------------------
# Human evaluation
# ---------------------------------------------------------------------------

RERATING_FRACTION = 0.2  # fraction of items to re-rate

# Agreement cut-offs.
# NOTE(review): by name these look like thresholds for a kappa statistic and
# for Krippendorff's alpha — confirm against the evaluation code.
KAPPA_ACCEPTABLE_THRESHOLD = 0.7
ALPHA_ACCEPTABLE_THRESHOLD = 0.667

# ---------------------------------------------------------------------------
# cMSCI (Calibrated Multimodal Semantic Coherence Index)
# ---------------------------------------------------------------------------

# Calibration store (fitted from RQ1 baseline data)
CMSCI_CALIBRATION_PATH = PROJECT_ROOT.joinpath("artifacts", "cmsci_calibration.json")

# Ex-MCR cross-space alignment weights (CLAP → CLIP projection)
EXMCR_WEIGHTS_PATH = PROJECT_ROOT.joinpath("models", "exmcr", "ex_clap.pt")

# Cross-Space Bridge weights (CLIP image + CLAP audio → shared 256-d bridge space)
BRIDGE_WEIGHTS_PATH = PROJECT_ROOT.joinpath("models", "bridge", "bridge_best.pt")

# Probabilistic adapters (ProbVLM-style uncertainty), one checkpoint per encoder
PROB_CLIP_ADAPTER_PATH = PROJECT_ROOT.joinpath("models", "prob_adapters", "clip_adapter.pt")
PROB_CLAP_ADAPTER_PATH = PROJECT_ROOT.joinpath("models", "prob_adapters", "clap_adapter.pt")

# Full-pipeline parameters, optimized via LOO-CV on the RQ3 human ratings.
# Reported fit: full-sample rho=0.608 (p=0.0004), LOO-CV rho=0.546 (p=0.0018),
# overfit gap=0.001. Selected in 26 of 30 LOO folds (87%) — highly stable.
# NOTE(review): 0.608 - 0.546 = 0.062, so "overfit gap=0.001" must be defined
# some other way — confirm where that figure comes from.
CMSCI_MARGIN_ALPHA = 16  # margin scaling factor (amplifies contrastive signal)
CMSCI_CHANNEL_WEIGHT_TI = 0.9  # text-image channel weight w; text-audio gets 1 - w
CMSCI_CALIBRATION_MODE = "gram"  # "cosine" (z-norm cosine sims) or "gram" (z-norm gram coherences)

# Variant E — ExMCR cross-modal complementarity (w_3d=0 recovers D exactly).
# ExMCR projects CLAP audio → CLIP space; complementarity = Gramian dispersion.
# High complementarity (image and audio contribute unique perspectives) is
# rewarded.
CMSCI_W_3D = 0.45  # weight for z-normalized IA complementarity

# Variant F — ProbVLM adaptive channel weighting (gamma=0 recovers E exactly).
CMSCI_GAMMA = 0.1  # mixing ratio: w_final = (1-gamma)*base_w + gamma*adaptive_w

# Contrastive negative bank
CMSCI_NEGATIVE_K = 5  # number of hard negatives per modality
CMSCI_NEGATIVE_BANK_ENABLED = True  # enable/disable contrastive calibration

# Monte Carlo sampling for uncertainty estimation
CMSCI_MC_SAMPLES = 100  # number of MC samples for Variant F

# Training hyperparameters for the probabilistic adapters.
PROB_ADAPTER_EPOCHS = 100
PROB_ADAPTER_LR = 0.0001
PROB_ADAPTER_BATCH_SIZE = 32
PROB_ADAPTER_PATIENCE = 15  # presumably early-stopping patience in epochs — confirm