ResearchIT / app /config.py
siddhm11
fix: BGE-M3 and cross-encoder failing to load on HF Spaces
077ba39
"""
Centralised settings for the arxiv recommender app.
All credentials live in .env locally; override with env vars in production.
"""
import os
from dotenv import load_dotenv
# Populate the process environment from a local .env file before any of the
# os.getenv() lookups below run; this call must stay ahead of them.
load_dotenv() # reads .env file if present (won't override existing env vars)
# ── Environment helpers ───────────────────────────────────────────────────────
def _env_str(name: str, default: str = "") -> str:
    """Return environment variable *name* with surrounding whitespace removed.

    Falls back to *default* when the variable is unset or blank, so a value
    saved with a trailing newline (or saved empty) cannot turn into an
    unusable collection name, file path or model id.
    """
    value = os.getenv(name, "").strip()
    return value or default


def _env_int(name: str, default: int) -> int:
    """Return environment variable *name* parsed as an int.

    Returns *default* when the variable is unset or blank.

    Raises:
        ValueError: if the variable holds something that is not an integer;
            the message names the offending variable.
    """
    raw = os.getenv(name, "").strip()
    if not raw:
        return default
    try:
        return int(raw)
    except ValueError:
        raise ValueError(f"{name} must be an integer, got {raw!r}") from None


# ── Qdrant (BGE-M3 dense, 1 024-dim) ─────────────────────────────────────────
QDRANT_URL = _env_str("QDRANT_URL")
QDRANT_API_KEY = _env_str("QDRANT_API_KEY")
QDRANT_COLLECTION = _env_str("QDRANT_COLLECTION", "arxiv_bgem3_dense")

# ── SQLite ────────────────────────────────────────────────────────────────────
DB_PATH = _env_str("DB_PATH", "interactions.db")

# ── arXiv API ─────────────────────────────────────────────────────────────────
ARXIV_API_URL = "https://export.arxiv.org/api/query"
ARXIV_MAX_RESULTS = 10  # results per search page
METADATA_CACHE_TTL_DAYS = 30  # re-fetch metadata after this many days

# ── Turso (libSQL) — arXiv metadata DB — Phase 3.5 ───────────────────────────
TURSO_URL = _env_str("TURSO_URL")
TURSO_DB_TOKEN = _env_str("TURSO_DB_TOKEN")

# ── Recommendation settings ───────────────────────────────────────────────────
REC_LIMIT = 10  # how many recommendations to show
REC_POSITIVE_LIMIT = 20  # max positive examples sent to Qdrant
REC_MIN_POSITIVES = 1  # minimum saves needed before showing recs

# ── Zilliz Cloud (BGE-M3 sparse vectors) — Phase 3 ────────────────────────────
ZILLIZ_URI = _env_str("ZILLIZ_URI")
ZILLIZ_TOKEN = _env_str("ZILLIZ_TOKEN")
ZILLIZ_COLLECTION = _env_str("ZILLIZ_COLLECTION", "arxiv_bgem3_sparse")
# Zilliz schema (confirmed from notebooks/01-bme-upload.ipynb):
#   id             INT64 (auto_id, primary key)
#   arxiv_id       VARCHAR
#   sparse_vector  SPARSE_FLOAT_VECTOR (BGE-M3 lexical weights, int token IDs)
#   Index: SPARSE_INVERTED_INDEX, metric_type="IP"

# ── Groq (LLM query rewriter) — Phase 3 ──────────────────────────────────────
GROQ_API_KEY = _env_str("GROQ_API_KEY")

# ── BGE-M3 (embedding model) — Phase 3 ───────────────────────────────────────
BGE_M3_MODEL = _env_str("BGE_M3_MODEL", "BAAI/bge-m3")
BGE_M3_DEVICE = _env_str("BGE_M3_DEVICE", "cpu")
ENCODE_CACHE_SIZE = 128  # LRU cache for encoded queries

# ── Cross-Encoder Reranker (search reranking) ─────────────────────────────────
RERANKER_MODEL = _env_str("RERANKER_MODEL", "cross-encoder/ms-marco-MiniLM-L-6-v2")
SEARCH_RERANK_TOP_N = _env_int("SEARCH_RERANK_TOP_N", 10)  # cap to preserve CPU latency

# ── Hybrid search tuning — Phase 3 ───────────────────────────────────────────
SEARCH_RRF_K = 60  # RRF denominator
SEARCH_FETCH_K_MULTIPLIER = 6  # candidates = top_k × 6 before rerank
SEARCH_SEMANTIC_WEIGHT = 0.80  # RRF contribution to final score
SEARCH_RECENCY_WEIGHT = 0.20  # recency contribution to final score

# ── App ───────────────────────────────────────────────────────────────────────
APP_TITLE = "ResearchIT"
COOKIE_NAME = "arxiv_user_id"
COOKIE_MAX_AGE = 60 * 60 * 24 * 365  # 1 year
APP_PORT = _env_int("PORT", 7860)  # HF Spaces requires 7860
# ── Phase 5: Onboarding category taxonomy ─────────────────────────────────────
# Each group maps a user-friendly label to real arXiv primary_topic codes.
# Used by the onboarding wizard AND as pool filters / LightGBM features later.
#
# Every entry has the same four fields:
#   name  - display label
#   icon  - emoji shown next to the label
#   arxiv - arXiv category codes covered by the group
#   desc  - one-line description
#
# NOTE(review): the icons and the em dash in "Physics — General" were restored
# from mis-encoded text. The icons for "math", "crypto" and "systems" had lost a
# byte and are best guesses — confirm against the intended design.
# NOTE(review): the keys "pde" and "acl_acl" do not match their labels; they are
# kept unchanged here because stored selections may reference them — confirm
# before renaming.
CATEGORY_GROUPS: dict[str, dict] = {
    "nlp": {
        "name": "Natural Language Processing",
        "icon": "💬",
        "arxiv": ["cs.CL", "cs.IR"],
        "desc": "Language models, text generation, information retrieval",
    },
    "cv": {
        "name": "Computer Vision",
        "icon": "👁️",
        "arxiv": ["cs.CV"],
        "desc": "Image recognition, object detection, video understanding",
    },
    "ml": {
        "name": "Machine Learning",
        "icon": "🧠",
        "arxiv": ["cs.LG", "stat.ML"],
        "desc": "Learning theory, optimization, generalization",
    },
    "ai": {
        "name": "Artificial Intelligence",
        "icon": "🤖",
        "arxiv": ["cs.AI"],
        "desc": "Reasoning, planning, knowledge representation",
    },
    "robotics": {
        "name": "Robotics",
        "icon": "🦾",
        "arxiv": ["cs.RO"],
        "desc": "Control, manipulation, autonomous systems",
    },
    "hep": {
        "name": "High Energy Physics",
        "icon": "⚛️",
        "arxiv": ["hep-ph", "hep-th", "hep-ex", "hep-lat"],
        "desc": "Particle physics, quantum field theory, colliders",
    },
    "astro": {
        "name": "Astrophysics",
        "icon": "🔭",
        "arxiv": ["astro-ph.GA", "astro-ph.CO", "astro-ph.SR", "astro-ph.HE"],
        "desc": "Galaxies, cosmology, stellar physics",
    },
    "quant_ph": {
        "name": "Quantum Computing",
        "icon": "💠",
        "arxiv": ["quant-ph"],
        "desc": "Quantum algorithms, error correction, quantum info",
    },
    "math": {
        "name": "Mathematics",
        "icon": "📐",
        "arxiv": ["math.CO", "math.AG", "math.NT", "math.PR", "math.AP"],
        "desc": "Pure and applied mathematics",
    },
    "bio": {
        "name": "Computational Biology",
        "icon": "🧬",
        "arxiv": ["q-bio.BM", "q-bio.GN", "q-bio.QM"],
        "desc": "Bioinformatics, genomics, protein structure",
    },
    "neuro": {
        "name": "Neuroscience",
        "icon": "🧪",
        "arxiv": ["q-bio.NC"],
        "desc": "Computational neuroscience, brain modeling",
    },
    "econ": {
        "name": "Economics & Game Theory",
        "icon": "📊",
        "arxiv": ["econ.TH", "cs.GT"],
        "desc": "Mechanism design, auctions, market models",
    },
    "crypto": {
        "name": "Cryptography & Security",
        "icon": "🔐",
        "arxiv": ["cs.CR"],
        "desc": "Encryption, protocols, privacy",
    },
    "systems": {
        "name": "Systems & Networking",
        "icon": "🌐",
        "arxiv": ["cs.DC", "cs.NI"],
        "desc": "Distributed systems, networks, cloud",
    },
    "hci": {
        "name": "Human-Computer Interaction",
        "icon": "🖱️",
        "arxiv": ["cs.HC"],
        "desc": "Interface design, accessibility, user studies",
    },
    "audio": {
        "name": "Speech & Audio",
        "icon": "🎵",
        "arxiv": ["cs.SD", "eess.AS"],
        "desc": "Speech recognition, audio generation, music AI",
    },
    "pde": {
        "name": "Physics — General",
        "icon": "🌊",
        "arxiv": ["physics.flu-dyn", "physics.comp-ph", "physics.optics"],
        "desc": "Fluid dynamics, computational physics, optics",
    },
    "cond_mat": {
        "name": "Condensed Matter",
        "icon": "🧊",
        "arxiv": ["cond-mat.mes-hall", "cond-mat.mtrl-sci", "cond-mat.str-el"],
        "desc": "Materials science, superconductivity",
    },
    "se": {
        "name": "Software Engineering",
        "icon": "💻",
        "arxiv": ["cs.SE", "cs.PL"],
        "desc": "Testing, verification, compilers",
    },
    "acl_acl": {
        "name": "Signal Processing",
        "icon": "📡",
        "arxiv": ["eess.SP", "eess.IV"],
        "desc": "Image & signal processing, medical imaging",
    },
}
def expand_category_groups(group_keys: list[str]) -> set[str]:
    """Flatten onboarding group keys into the arXiv category codes they cover.

    Each key is looked up in ``CATEGORY_GROUPS``. Keys with no entry (or an
    empty one) are skipped silently, and a code shared by several groups
    appears only once in the result.
    """
    # Lazy lookup: unknown keys come back as None and are filtered out below.
    resolved = (CATEGORY_GROUPS.get(group_key) for group_key in group_keys)
    return {code for entry in resolved if entry for code in entry["arxiv"]}