Spaces:

siddhm11
/

ResearchIT

Running

siddhm11

fix: BGE-M3 and cross-encoder failing to load on HF Spaces

077ba39 2 days ago

8.72 kB

	"""
	Centralised settings for the arxiv recommender app.
	All credentials live in .env locally; override with env vars in production.
	"""
	import os
	from dotenv import load_dotenv

	# Must run before the os.getenv() calls that follow: those are evaluated at
	# import time, so values from a local .env file have to be loaded first.
	load_dotenv() # reads .env file if present (won't override existing env vars)

	# NOTE: every environment-sourced string below is .strip()-ed so that stray
	# whitespace or a trailing newline in the configured value never ends up inside
	# a URL, token, collection name, file path or model id.

	# ── Qdrant (BGE-M3 dense, 1 024-dim) ─────────────────────────────────────────
	QDRANT_URL = os.getenv("QDRANT_URL", "").strip()
	QDRANT_API_KEY = os.getenv("QDRANT_API_KEY", "").strip()
	QDRANT_COLLECTION = os.getenv("QDRANT_COLLECTION", "arxiv_bgem3_dense").strip()

	# ── SQLite ────────────────────────────────────────────────────────────────────
	DB_PATH = os.getenv("DB_PATH", "interactions.db").strip()

	# ── arXiv API ─────────────────────────────────────────────────────────────────
	ARXIV_API_URL = "https://export.arxiv.org/api/query"
	ARXIV_MAX_RESULTS = 10  # results per search page
	METADATA_CACHE_TTL_DAYS = 30  # re-fetch metadata after this many days

	# ── Turso (libSQL) — arXiv metadata DB — Phase 3.5 ───────────────────────────
	TURSO_URL = os.getenv("TURSO_URL", "").strip()
	TURSO_DB_TOKEN = os.getenv("TURSO_DB_TOKEN", "").strip()

	# ── Recommendation settings ───────────────────────────────────────────────────
	REC_LIMIT = 10  # how many recommendations to show
	REC_POSITIVE_LIMIT = 20  # max positive examples sent to Qdrant
	REC_MIN_POSITIVES = 1  # minimum saves needed before showing recs

	# ── Zilliz Cloud (BGE-M3 sparse vectors) — Phase 3 ────────────────────────────
	ZILLIZ_URI = os.getenv("ZILLIZ_URI", "").strip()
	ZILLIZ_TOKEN = os.getenv("ZILLIZ_TOKEN", "").strip()
	ZILLIZ_COLLECTION = os.getenv("ZILLIZ_COLLECTION", "arxiv_bgem3_sparse").strip()

	# Zilliz schema (confirmed from notebooks/01-bme-upload.ipynb):
	# id INT64 (auto_id, primary key)
	# arxiv_id VARCHAR
	# sparse_vector SPARSE_FLOAT_VECTOR (BGE-M3 lexical weights, int token IDs)
	# Index: SPARSE_INVERTED_INDEX, metric_type="IP"

	# ── Groq (LLM query rewriter) — Phase 3 ──────────────────────────────────────
	GROQ_API_KEY = os.getenv("GROQ_API_KEY", "").strip()

	# ── BGE-M3 (embedding model) — Phase 3 ───────────────────────────────────────
	BGE_M3_MODEL = os.getenv("BGE_M3_MODEL", "BAAI/bge-m3").strip()
	BGE_M3_DEVICE = os.getenv("BGE_M3_DEVICE", "cpu").strip()
	ENCODE_CACHE_SIZE = 128  # LRU cache for encoded queries

	# ── Cross-Encoder Reranker (search reranking) ─────────────────────────────────
	RERANKER_MODEL = os.getenv(
	    "RERANKER_MODEL", "cross-encoder/ms-marco-MiniLM-L-6-v2"
	).strip()
	# Cap to preserve CPU latency. An unset *or blank* variable falls back to 10;
	# a non-numeric value still raises ValueError at import so the misconfiguration
	# is not silently ignored.
	SEARCH_RERANK_TOP_N = int(os.getenv("SEARCH_RERANK_TOP_N", "").strip() or "10")

	# ── Hybrid search tuning — Phase 3 ───────────────────────────────────────────
	SEARCH_RRF_K = 60  # RRF denominator
	SEARCH_FETCH_K_MULTIPLIER = 6  # candidates = top_k × 6 before rerank
	SEARCH_SEMANTIC_WEIGHT = 0.80  # RRF contribution to final score
	SEARCH_RECENCY_WEIGHT = 0.20  # recency contribution to final score

	# ── App ───────────────────────────────────────────────────────────────────────
	APP_TITLE = "ResearchIT"
	COOKIE_NAME = "arxiv_user_id"
	COOKIE_MAX_AGE = 60 * 60 * 24 * 365  # 1 year, in seconds
	# 7860 is the default application port on HF Spaces. An unset or blank PORT
	# falls back to it; a non-numeric PORT still raises ValueError at import.
	APP_PORT = int(os.getenv("PORT", "").strip() or "7860")

	# ── Phase 5: Onboarding category taxonomy ─────────────────────────────────────
	# Each group maps a user-friendly label to real arXiv primary_topic codes.
	# Used by the onboarding wizard AND as pool filters / LightGBM features later.
	CATEGORY_GROUPS: dict[str, dict] = {
	"nlp": {
	"name": "Natural Language Processing",
	"icon": "💬",
	"arxiv": ["cs.CL", "cs.IR"],
	"desc": "Language models, text generation, information retrieval",
	},
	"cv": {
	"name": "Computer Vision",
	"icon": "👁️",
	"arxiv": ["cs.CV"],
	"desc": "Image recognition, object detection, video understanding",
	},
	"ml": {
	"name": "Machine Learning",
	"icon": "🧠",
	"arxiv": ["cs.LG", "stat.ML"],
	"desc": "Learning theory, optimization, generalization",
	},
	"ai": {
	"name": "Artificial Intelligence",
	"icon": "🤖",
	"arxiv": ["cs.AI"],
	"desc": "Reasoning, planning, knowledge representation",
	},
	"robotics": {
	"name": "Robotics",
	"icon": "🦾",
	"arxiv": ["cs.RO"],
	"desc": "Control, manipulation, autonomous systems",
	},
	"hep": {
	"name": "High Energy Physics",
	"icon": "⚛️",
	"arxiv": ["hep-ph", "hep-th", "hep-ex", "hep-lat"],
	"desc": "Particle physics, quantum field theory, colliders",
	},
	"astro": {
	"name": "Astrophysics",
	"icon": "🔭",
	"arxiv": ["astro-ph.GA", "astro-ph.CO", "astro-ph.SR", "astro-ph.HE"],
	"desc": "Galaxies, cosmology, stellar physics",
	},
	"quant_ph": {
	"name": "Quantum Computing",
	"icon": "💠",
	"arxiv": ["quant-ph"],
	"desc": "Quantum algorithms, error correction, quantum info",
	},
	"math": {
	"name": "Mathematics",
	"icon": "📐",
	"arxiv": ["math.CO", "math.AG", "math.NT", "math.PR", "math.AP"],
	"desc": "Pure and applied mathematics",
	},
	"bio": {
	"name": "Computational Biology",
	"icon": "🧬",
	"arxiv": ["q-bio.BM", "q-bio.GN", "q-bio.QM"],
	"desc": "Bioinformatics, genomics, protein structure",
	},
	"neuro": {
	"name": "Neuroscience",
	"icon": "🧪",
	"arxiv": ["q-bio.NC"],
	"desc": "Computational neuroscience, brain modeling",
	},
	"econ": {
	"name": "Economics & Game Theory",
	"icon": "📊",
	"arxiv": ["econ.TH", "cs.GT"],
	"desc": "Mechanism design, auctions, market models",
	},
	"crypto": {
	"name": "Cryptography & Security",
	"icon": "🔐",
	"arxiv": ["cs.CR"],
	"desc": "Encryption, protocols, privacy",
	},
	"systems": {
	"name": "Systems & Networking",
	"icon": "🌐",
	"arxiv": ["cs.DC", "cs.NI"],
	"desc": "Distributed systems, networks, cloud",
	},
	"hci": {
	"name": "Human-Computer Interaction",
	"icon": "🖱️",
	"arxiv": ["cs.HC"],
	"desc": "Interface design, accessibility, user studies",
	},
	"audio": {
	"name": "Speech & Audio",
	"icon": "🎵",
	"arxiv": ["cs.SD", "eess.AS"],
	"desc": "Speech recognition, audio generation, music AI",
	},
	"pde": {
	"name": "Physics — General",
	"icon": "🌊",
	"arxiv": ["physics.flu-dyn", "physics.comp-ph", "physics.optics"],
	"desc": "Fluid dynamics, computational physics, optics",
	},
	"cond_mat": {
	"name": "Condensed Matter",
	"icon": "🧊",
	"arxiv": ["cond-mat.mes-hall", "cond-mat.mtrl-sci", "cond-mat.str-el"],
	"desc": "Materials science, superconductivity",
	},
	"se": {
	"name": "Software Engineering",
	"icon": "💻",
	"arxiv": ["cs.SE", "cs.PL"],
	"desc": "Testing, verification, compilers",
	},
	"acl_acl": {
	"name": "Signal Processing",
	"icon": "📡",
	"arxiv": ["eess.SP", "eess.IV"],
	"desc": "Image & signal processing, medical imaging",
	},
	}


	def expand_category_groups(group_keys: list[str]) -> set[str]:
	    """Flatten onboarding group keys into the arXiv categories they cover.

	    Args:
	        group_keys: Keys of CATEGORY_GROUPS, e.g. ``["nlp", "cv"]``.

	    Returns:
	        The union of the ``"arxiv"`` lists of every recognised group. Keys
	        that are absent from CATEGORY_GROUPS (or map to a falsy entry)
	        contribute nothing, so an empty or all-unknown input yields an
	        empty set.
	    """
	    # Lazy lookup: each key is resolved only as the comprehension reaches it.
	    looked_up = (CATEGORY_GROUPS.get(group_key) for group_key in group_keys)
	    return {
	        arxiv_code
	        for group in looked_up
	        if group
	        for arxiv_code in group["arxiv"]
	    }