"""
Centralized configuration for MultiModal Coherence AI.

All magic numbers, model names, paths, and thresholds live here.
Import from this module instead of hardcoding values in source files.
"""

from __future__ import annotations

import os
from pathlib import Path

# ---------------------------------------------------------------------------
# Paths
# ---------------------------------------------------------------------------

# Repository root: three directory levels above this file's resolved location.
PROJECT_ROOT: Path = Path(__file__).resolve().parent.parent.parent

# Data directories. The two embedding indexes live under <root>/data/embeddings;
# the coherence statistics file lives under <root>/artifacts instead.
DATA_DIR: Path = PROJECT_ROOT.joinpath("data")
IMAGE_INDEX_PATH: Path = DATA_DIR.joinpath("embeddings", "image_index.npz")
AUDIO_INDEX_PATH: Path = DATA_DIR.joinpath("embeddings", "audio_index.npz")
COHERENCE_STATS_PATH: Path = PROJECT_ROOT.joinpath("artifacts", "coherence_stats.json")

# Media directories, one entry per source folder under <root>/data.
# List order is preserved: "processed" always comes first.
IMAGE_DIRS: list[Path] = [
    DATA_DIR.joinpath(source, "images") for source in ("processed", "wikimedia")
]
AUDIO_DIRS: list[Path] = [
    DATA_DIR.joinpath(source, "audio") for source in ("processed", "freesound")
]

# Embedding cache (hidden directory at the repository root)
CACHE_DIR: Path = PROJECT_ROOT.joinpath(".cache", "embeddings")

# Experiment output
RUNS_DIR: Path = PROJECT_ROOT.joinpath("runs")

# ---------------------------------------------------------------------------
# Model names
# ---------------------------------------------------------------------------

# Identifiers of the CLIP and CLAP checkpoints (these look like Hugging Face
# Hub ids — confirm against the loader code).
CLIP_MODEL: str = "openai/clip-vit-base-patch32"
CLAP_MODEL: str = "laion/clap-htsat-unfused"

# Ollama settings are read from the environment once, at import time; the
# second argument is the value used when the variable is not set.
OLLAMA_MODEL: str = os.environ.get("OLLAMA_MODEL", "qwen2:7b")
OLLAMA_HOST: str = os.environ.get("OLLAMA_HOST", "http://localhost:11434")

# Fallback text model (presumably used when Ollama is unavailable — verify
# against the text generator).
HF_FALLBACK_MODEL: str = "gpt2"

# ---------------------------------------------------------------------------
# Embedding dimensions
# ---------------------------------------------------------------------------

# Vector widths for each embedding family. All three are currently equal.
# NOTE(review): presumably these must match the output width of CLIP_MODEL /
# CLAP_MODEL — confirm before swapping checkpoints.
CLIP_DIM: int = 512
CLAP_DIM: int = 512
TARGET_DIM: int = 512

# ---------------------------------------------------------------------------
# MSCI weights
# ---------------------------------------------------------------------------

# NOTE: these weights are a hypothesis, not the result of an empirical fit.
# The two text-anchored pairs (text-image, text-audio) share the same weight.
# The image-audio pair gets a much smaller one because CLIP and CLAP are
# different embedding spaces.
MSCI_WEIGHTS: dict[str, float] = dict(
    st_i=0.45,  # text-image (CLIP shared space)
    st_a=0.45,  # text-audio (CLAP shared space)
    si_a=0.10,  # image-audio (cross-space — usually omitted)
)

# ---------------------------------------------------------------------------
# Retrieval thresholds
# ---------------------------------------------------------------------------

# Similarity cut-offs for retrieval. The image warning level (0.25) sits above
# the image minimum (0.20).
# NOTE(review): presumably results below the minimum are rejected and results
# below the warning level are only flagged — confirm in the retrievers.
IMAGE_MIN_SIMILARITY: float = 0.2
AUDIO_MIN_SIMILARITY: float = 0.1
IMAGE_LOW_SIMILARITY_WARN: float = 0.25

# ---------------------------------------------------------------------------
# Text generation
# ---------------------------------------------------------------------------

# Token budget for generated text.
TEXT_MAX_TOKENS: int = 160

# Sampling presets come in two flavours: a deterministic one
# (temperature 0.0, top-p 1.0) and a stochastic one (temperature 0.7, top-p 0.9).
TEXT_TEMPERATURE_DETERMINISTIC: float = 0.0
TEXT_TEMPERATURE_STOCHASTIC: float = 0.7
TEXT_TOP_P_DETERMINISTIC: float = 1.0
TEXT_TOP_P_STOCHASTIC: float = 0.9

# ---------------------------------------------------------------------------
# Audio generation (fallback ambient)
# ---------------------------------------------------------------------------

# Length of the fallback ambient clip, in seconds (unit taken from the name).
AUDIO_DURATION_SEC: float = 6.0
# Sample rate of the fallback ambient clip (presumably in Hz — confirm).
AUDIO_SAMPLE_RATE: int = 48_000

# ---------------------------------------------------------------------------
# Drift detection
# ---------------------------------------------------------------------------

# Size of the |st_i - st_a| gap (text-image vs. text-audio score) at which
# drift is flagged.
DRIFT_ASYMMETRY_THRESHOLD: float = 0.15

# ---------------------------------------------------------------------------
# Human evaluation
# ---------------------------------------------------------------------------

# Share of items that are rated a second time (0.2 = 20%).
RERATING_FRACTION: float = 0.2

# Minimum agreement levels considered acceptable.
# NOTE(review): presumably "kappa" is Cohen's/Fleiss' kappa and "alpha" is
# Krippendorff's alpha — confirm against the evaluation code.
KAPPA_ACCEPTABLE_THRESHOLD: float = 0.7
ALPHA_ACCEPTABLE_THRESHOLD: float = 0.667

# ---------------------------------------------------------------------------
# cMSCI (Calibrated Multimodal Semantic Coherence Index)
# ---------------------------------------------------------------------------

# Calibration store (fitted from RQ1 baseline data); kept under <root>/artifacts.
CMSCI_CALIBRATION_PATH: Path = PROJECT_ROOT.joinpath("artifacts", "cmsci_calibration.json")

# Ex-MCR cross-space alignment weights (CLAP → CLIP projection)
EXMCR_WEIGHTS_PATH: Path = PROJECT_ROOT.joinpath("models", "exmcr", "ex_clap.pt")

# Cross-Space Bridge weights (CLIP image + CLAP audio → shared 256-d bridge space)
BRIDGE_WEIGHTS_PATH: Path = PROJECT_ROOT.joinpath("models", "bridge", "bridge_best.pt")

# Probabilistic adapter weights (ProbVLM-style uncertainty), one file per encoder
PROB_CLIP_ADAPTER_PATH: Path = PROJECT_ROOT.joinpath("models", "prob_adapters", "clip_adapter.pt")
PROB_CLAP_ADAPTER_PATH: Path = PROJECT_ROOT.joinpath("models", "prob_adapters", "clap_adapter.pt")

# Optimized parameters for the full pipeline, chosen via leave-one-out
# cross-validation (LOO-CV) on the RQ3 human ratings. Reported figures:
#   full-sample rho = 0.608 (p = 0.0004)
#   LOO-CV rho      = 0.546 (p = 0.0018)
#   overfit gap     = 0.001
#   selected in 26 of 30 LOO folds (87%) — highly stable
# NOTE(review): the quoted overfit gap (0.001) is not the difference between
# the two rho values above (0.062) — confirm how it was computed.
CMSCI_MARGIN_ALPHA: int = 16  # Margin scaling factor (amplifies contrastive signal)
CMSCI_CHANNEL_WEIGHT_TI: float = 0.9  # Text-image channel weight w; text-audio gets 1 - w
# Calibration mode: "cosine" (z-norm cosine sims) or "gram" (z-norm gram coherences)
CMSCI_CALIBRATION_MODE: str = "gram"

# Variant E: ExMCR cross-modal complementarity (w_3d=0 recovers D exactly).
# ExMCR projects CLAP audio → CLIP space; complementarity = Gramian dispersion.
# High complementarity = image and audio contribute unique perspectives (rewarded).
CMSCI_W_3D: float = 0.45  # Weight for z-normalized IA complementarity

# Variant F: ProbVLM adaptive channel weighting (gamma=0 recovers E exactly).
# Mixing ratio: w_final = (1-gamma)*base_w + gamma*adaptive_w
CMSCI_GAMMA: float = 0.1

# Contrastive negative bank
CMSCI_NEGATIVE_K: int = 5  # Number of hard negatives per modality
CMSCI_NEGATIVE_BANK_ENABLED: bool = True  # Enable/disable contrastive calibration

# MC sampling for uncertainty estimation
CMSCI_MC_SAMPLES: int = 100  # Number of Monte Carlo samples for Variant F

# Training hyper-parameters for the probabilistic adapters.
PROB_ADAPTER_EPOCHS: int = 100
PROB_ADAPTER_LR: float = 0.0001  # learning rate (1e-4)
PROB_ADAPTER_BATCH_SIZE: int = 32
# NOTE(review): presumably the early-stopping patience, in epochs — confirm
# against the training loop.
PROB_ADAPTER_PATIENCE: int = 15