# Provenance: uploaded by pratik-250620 via huggingface_hub (commit 358d3bc, verified).
"""
Centralized configuration for MultiModal Coherence AI.
All magic numbers, model names, paths, and thresholds live here.
Import from this module instead of hardcoding values in source files.
"""
from __future__ import annotations
import os
from pathlib import Path
# ---------------------------------------------------------------------------
# Filesystem layout
# ---------------------------------------------------------------------------
# Repository root: three `parent` hops up from this config file.
PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent

# Data tree and the precomputed artifacts that live under it.
DATA_DIR = PROJECT_ROOT / "data"
IMAGE_INDEX_PATH = DATA_DIR / "embeddings" / "image_index.npz"
AUDIO_INDEX_PATH = DATA_DIR / "embeddings" / "audio_index.npz"
COHERENCE_STATS_PATH = PROJECT_ROOT / "artifacts" / "coherence_stats.json"

# Candidate media directories (processed output plus downloaded corpora).
IMAGE_DIRS = [DATA_DIR / "processed" / "images", DATA_DIR / "wikimedia" / "images"]
AUDIO_DIRS = [DATA_DIR / "processed" / "audio", DATA_DIR / "freesound" / "audio"]

# On-disk embedding cache.
CACHE_DIR = PROJECT_ROOT / ".cache" / "embeddings"

# Per-experiment output directory.
RUNS_DIR = PROJECT_ROOT / "runs"
# ---------------------------------------------------------------------------
# Model identifiers
# ---------------------------------------------------------------------------
# Encoder checkpoints (Hugging Face hub ids).
CLIP_MODEL = "openai/clip-vit-base-patch32"
CLAP_MODEL = "laion/clap-htsat-unfused"

# Ollama-served LLM; model name and host are overridable via the environment.
OLLAMA_MODEL = os.environ.get("OLLAMA_MODEL", "qwen2:7b")
OLLAMA_HOST = os.environ.get("OLLAMA_HOST", "http://localhost:11434")

# Model used when falling back from Ollama to Hugging Face.
HF_FALLBACK_MODEL = "gpt2"
# ---------------------------------------------------------------------------
# Embedding dimensionality
# ---------------------------------------------------------------------------
# Output sizes of the two encoders and the common working dimension.
CLIP_DIM = 512
CLAP_DIM = 512
TARGET_DIM = 512
# ---------------------------------------------------------------------------
# MSCI channel weights
# ---------------------------------------------------------------------------
# Hypothesized, not empirically fitted: the two text-anchored channels carry
# equal weight, while the image-audio channel is discounted because CLIP and
# CLAP live in different embedding spaces.
MSCI_WEIGHTS = {
    "st_i": 0.45,  # text <-> image (CLIP shared space)
    "st_a": 0.45,  # text <-> audio (CLAP shared space)
    "si_a": 0.10,  # image <-> audio (cross-space -- usually omitted)
}
# ---------------------------------------------------------------------------
# Retrieval similarity cut-offs
# ---------------------------------------------------------------------------
IMAGE_MIN_SIMILARITY = 0.20       # minimum accepted image-match similarity
AUDIO_MIN_SIMILARITY = 0.10       # minimum accepted audio-match similarity
IMAGE_LOW_SIMILARITY_WARN = 0.25  # below this, an image match is considered weak
# ---------------------------------------------------------------------------
# Text generation
# ---------------------------------------------------------------------------
TEXT_MAX_TOKENS = 160

# Deterministic decoding: zero temperature, full nucleus.
TEXT_TEMPERATURE_DETERMINISTIC = 0.0
TEXT_TOP_P_DETERMINISTIC = 1.0

# Stochastic decoding: sampling temperature with nucleus truncation.
TEXT_TEMPERATURE_STOCHASTIC = 0.7
TEXT_TOP_P_STOCHASTIC = 0.9
# ---------------------------------------------------------------------------
# Audio generation (fallback ambient)
# ---------------------------------------------------------------------------
AUDIO_DURATION_SEC = 6.0    # clip length, seconds
AUDIO_SAMPLE_RATE = 48000   # samples per second

# ---------------------------------------------------------------------------
# Drift detection
# ---------------------------------------------------------------------------
# |st_i - st_a| gap large enough to flag text/media drift.
DRIFT_ASYMMETRY_THRESHOLD = 0.15
# ---------------------------------------------------------------------------
# Human evaluation
# ---------------------------------------------------------------------------
RERATING_FRACTION = 0.20           # fraction of items selected for re-rating
KAPPA_ACCEPTABLE_THRESHOLD = 0.70  # minimum acceptable kappa agreement
ALPHA_ACCEPTABLE_THRESHOLD = 0.667  # minimum acceptable alpha agreement
# ---------------------------------------------------------------------------
# cMSCI (Calibrated Multimodal Semantic Coherence Index)
# ---------------------------------------------------------------------------
# Calibration store fitted from the RQ1 baseline data.
CMSCI_CALIBRATION_PATH = PROJECT_ROOT / "artifacts" / "cmsci_calibration.json"

# Learned weight files:
#   - Ex-MCR cross-space alignment (CLAP -> CLIP projection)
#   - Cross-Space Bridge (CLIP image + CLAP audio -> shared 256-d bridge space)
#   - ProbVLM-style probabilistic adapters for CLIP and CLAP
EXMCR_WEIGHTS_PATH = PROJECT_ROOT / "models" / "exmcr" / "ex_clap.pt"
BRIDGE_WEIGHTS_PATH = PROJECT_ROOT / "models" / "bridge" / "bridge_best.pt"
PROB_CLIP_ADAPTER_PATH = PROJECT_ROOT / "models" / "prob_adapters" / "clip_adapter.pt"
PROB_CLAP_ADAPTER_PATH = PROJECT_ROOT / "models" / "prob_adapters" / "clap_adapter.pt"
# Tuned full-pipeline parameters (LOO-CV on the RQ3 human ratings).
# Full-sample rho=0.608 (p=0.0004), LOO-CV rho=0.546 (p=0.0018),
# overfit gap=0.001; selected in 26/30 (87%) of LOO folds -- highly stable.
CMSCI_MARGIN_ALPHA = 16          # margin scaling factor (amplifies contrastive signal)
CMSCI_CHANNEL_WEIGHT_TI = 0.90   # text-image channel weight; text-audio gets 1 - w
CMSCI_CALIBRATION_MODE = "gram"  # "cosine" (z-norm cosine sims) or "gram" (z-norm gram coherences)

# Variant E -- ExMCR cross-modal complementarity (w_3d = 0 recovers D exactly).
# ExMCR projects CLAP audio into CLIP space; complementarity = Gramian
# dispersion, so image/audio pairs contributing unique perspectives are rewarded.
CMSCI_W_3D = 0.45                # weight for z-normalized IA complementarity

# Variant F -- ProbVLM adaptive channel weighting (gamma = 0 recovers E exactly).
CMSCI_GAMMA = 0.10               # w_final = (1 - gamma) * base_w + gamma * adaptive_w

# Contrastive negative bank.
CMSCI_NEGATIVE_K = 5                # hard negatives per modality
CMSCI_NEGATIVE_BANK_ENABLED = True  # toggles contrastive calibration

# Monte Carlo sampling for Variant F uncertainty estimation.
CMSCI_MC_SAMPLES = 100

# Probabilistic adapter training hyper-parameters.
PROB_ADAPTER_EPOCHS = 100
PROB_ADAPTER_LR = 1e-4
PROB_ADAPTER_BATCH_SIZE = 32
PROB_ADAPTER_PATIENCE = 15