"""
Centralised settings for the arxiv recommender app.
All credentials live in .env locally; override with env vars in production.
"""
import os
from dotenv import load_dotenv

# Must run before the os.getenv() lookups below so that values from a local
# .env file are visible to them.
load_dotenv()  # reads .env file if present (won't override existing env vars)

# Conventions for every environment-backed setting below:
#   * values are whitespace-stripped, so a stray space/newline pasted into a
#     secrets UI or a .env line cannot corrupt a URL, token, name or path;
#   * a variable that is unset OR blank falls back to its default
#     (credentials default to the empty string).

# ── Qdrant (BGE-M3 dense, 1024-dim) ──────────────────────────────────────────
QDRANT_URL = os.getenv("QDRANT_URL", "").strip()
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY", "").strip()
QDRANT_COLLECTION = os.getenv("QDRANT_COLLECTION", "").strip() or "arxiv_bgem3_dense"

# ── SQLite ────────────────────────────────────────────────────────────────────
DB_PATH = os.getenv("DB_PATH", "").strip() or "interactions.db"

# ── arXiv API ─────────────────────────────────────────────────────────────────
ARXIV_API_URL = "https://export.arxiv.org/api/query"
ARXIV_MAX_RESULTS = 10          # results per search page
METADATA_CACHE_TTL_DAYS = 30    # re-fetch metadata after this many days

# ── Turso (libSQL) — arXiv metadata DB — Phase 3.5 ───────────────────────────
TURSO_URL = os.getenv("TURSO_URL", "").strip()
TURSO_DB_TOKEN = os.getenv("TURSO_DB_TOKEN", "").strip()


# ── Recommendation settings ───────────────────────────────────────────────────
REC_LIMIT = 10                  # how many recommendations to show
REC_POSITIVE_LIMIT = 20         # max positive examples sent to Qdrant
REC_MIN_POSITIVES = 1           # minimum saves needed before showing recs

# ── Zilliz Cloud (BGE-M3 sparse vectors) — Phase 3 ────────────────────────────
ZILLIZ_URI = os.getenv("ZILLIZ_URI", "").strip()
ZILLIZ_TOKEN = os.getenv("ZILLIZ_TOKEN", "").strip()
ZILLIZ_COLLECTION = os.getenv("ZILLIZ_COLLECTION", "").strip() or "arxiv_bgem3_sparse"

# Zilliz schema (confirmed from notebooks/01-bme-upload.ipynb):
#   id            INT64  (auto_id, primary key)
#   arxiv_id      VARCHAR
#   sparse_vector SPARSE_FLOAT_VECTOR  (BGE-M3 lexical weights, int token IDs)
#   Index: SPARSE_INVERTED_INDEX, metric_type="IP"

# ── Groq (LLM query rewriter) — Phase 3 ──────────────────────────────────────
GROQ_API_KEY = os.getenv("GROQ_API_KEY", "").strip()

# ── BGE-M3 (embedding model) — Phase 3 ───────────────────────────────────────
BGE_M3_MODEL = os.getenv("BGE_M3_MODEL", "").strip() or "BAAI/bge-m3"
BGE_M3_DEVICE = os.getenv("BGE_M3_DEVICE", "").strip() or "cpu"
ENCODE_CACHE_SIZE = 128  # LRU cache for encoded queries

# ── Cross-Encoder Reranker (search reranking) ─────────────────────────────────
RERANKER_MODEL = (
    os.getenv("RERANKER_MODEL", "").strip() or "cross-encoder/ms-marco-MiniLM-L-6-v2"
)
# Cap to preserve CPU latency. A blank value uses the default instead of
# crashing in int(""); a non-numeric value still raises ValueError at import.
SEARCH_RERANK_TOP_N = int(os.getenv("SEARCH_RERANK_TOP_N", "").strip() or "10")

# ── Hybrid search tuning — Phase 3 ───────────────────────────────────────────
SEARCH_RRF_K = 60                  # RRF denominator
SEARCH_FETCH_K_MULTIPLIER = 6     # candidates = top_k × 6 before rerank
# The two weights below sum to 1.0.
SEARCH_SEMANTIC_WEIGHT = 0.80     # RRF contribution to final score
SEARCH_RECENCY_WEIGHT = 0.20      # recency contribution to final score

# ── App ───────────────────────────────────────────────────────────────────────
APP_TITLE = "ResearchIT"
COOKIE_NAME = "arxiv_user_id"
COOKIE_MAX_AGE = 60 * 60 * 24 * 365  # 1 year, in seconds
# HF Spaces requires 7860. A blank PORT uses the default; a non-numeric PORT
# still raises ValueError at import.
APP_PORT = int(os.getenv("PORT", "").strip() or "7860")

# ── Phase 5: Onboarding category taxonomy ─────────────────────────────────────
# One row per onboarding group: (key, display name, icon, arXiv codes, blurb).
# The arXiv codes are real primary_topic codes. The table feeds the onboarding
# wizard and is also intended for pool filters / LightGBM features later.
# Row order is the iteration order of CATEGORY_GROUPS.
_GROUP_ROWS: tuple[tuple[str, str, str, tuple[str, ...], str], ...] = (
    (
        "nlp", "Natural Language Processing", "💬",
        ("cs.CL", "cs.IR"),
        "Language models, text generation, information retrieval",
    ),
    (
        "cv", "Computer Vision", "👁️",
        ("cs.CV",),
        "Image recognition, object detection, video understanding",
    ),
    (
        "ml", "Machine Learning", "🧠",
        ("cs.LG", "stat.ML"),
        "Learning theory, optimization, generalization",
    ),
    (
        "ai", "Artificial Intelligence", "🤖",
        ("cs.AI",),
        "Reasoning, planning, knowledge representation",
    ),
    (
        "robotics", "Robotics", "🦾",
        ("cs.RO",),
        "Control, manipulation, autonomous systems",
    ),
    (
        "hep", "High Energy Physics", "⚛️",
        ("hep-ph", "hep-th", "hep-ex", "hep-lat"),
        "Particle physics, quantum field theory, colliders",
    ),
    (
        "astro", "Astrophysics", "🔭",
        ("astro-ph.GA", "astro-ph.CO", "astro-ph.SR", "astro-ph.HE"),
        "Galaxies, cosmology, stellar physics",
    ),
    (
        "quant_ph", "Quantum Computing", "💠",
        ("quant-ph",),
        "Quantum algorithms, error correction, quantum info",
    ),
    (
        "math", "Mathematics", "📐",
        ("math.CO", "math.AG", "math.NT", "math.PR", "math.AP"),
        "Pure and applied mathematics",
    ),
    (
        "bio", "Computational Biology", "🧬",
        ("q-bio.BM", "q-bio.GN", "q-bio.QM"),
        "Bioinformatics, genomics, protein structure",
    ),
    (
        "neuro", "Neuroscience", "🧪",
        ("q-bio.NC",),
        "Computational neuroscience, brain modeling",
    ),
    (
        "econ", "Economics & Game Theory", "📊",
        ("econ.TH", "cs.GT"),
        "Mechanism design, auctions, market models",
    ),
    (
        "crypto", "Cryptography & Security", "🔐",
        ("cs.CR",),
        "Encryption, protocols, privacy",
    ),
    (
        "systems", "Systems & Networking", "🌐",
        ("cs.DC", "cs.NI"),
        "Distributed systems, networks, cloud",
    ),
    (
        "hci", "Human-Computer Interaction", "🖱️",
        ("cs.HC",),
        "Interface design, accessibility, user studies",
    ),
    (
        "audio", "Speech & Audio", "🎵",
        ("cs.SD", "eess.AS"),
        "Speech recognition, audio generation, music AI",
    ),
    (
        # NOTE(review): key "pde" does not match the "Physics — General" label;
        # presumably kept for compatibility with stored selections — confirm
        # before renaming.
        "pde", "Physics — General", "🌊",
        ("physics.flu-dyn", "physics.comp-ph", "physics.optics"),
        "Fluid dynamics, computational physics, optics",
    ),
    (
        "cond_mat", "Condensed Matter", "🧊",
        ("cond-mat.mes-hall", "cond-mat.mtrl-sci", "cond-mat.str-el"),
        "Materials science, superconductivity",
    ),
    (
        "se", "Software Engineering", "💻",
        ("cs.SE", "cs.PL"),
        "Testing, verification, compilers",
    ),
    (
        # NOTE(review): key "acl_acl" does not describe "Signal Processing";
        # presumably a legacy identifier — confirm before renaming.
        "acl_acl", "Signal Processing", "📡",
        ("eess.SP", "eess.IV"),
        "Image & signal processing, medical imaging",
    ),
)

# Public mapping: group key -> {"name", "icon", "arxiv", "desc"}.
# Each "arxiv" value is a fresh list, so entries never share a list object.
CATEGORY_GROUPS: dict[str, dict] = {
    group_key: {
        "name": label,
        "icon": emoji,
        "arxiv": list(codes),
        "desc": blurb,
    }
    for group_key, label, emoji, codes, blurb in _GROUP_ROWS
}


def expand_category_groups(group_keys: list[str]) -> set[str]:
    """Return the union of arXiv categories for the given group keys.

    Args:
        group_keys: Keys of ``CATEGORY_GROUPS`` (e.g. ``['nlp', 'cv']``).
            Keys with no (or an empty) entry are skipped silently.

    Returns:
        A flat set containing every ``"arxiv"`` code of the matched groups;
        an empty set when nothing matches.
    """
    looked_up = (CATEGORY_GROUPS.get(group_key) for group_key in group_keys)
    # set().union() with no arguments yields an empty set, covering the
    # "no key matched" case without a special branch.
    return set().union(*(entry["arxiv"] for entry in looked_up if entry))