Spaces:
Running
Running
| """ | |
| Centralised settings for the arxiv recommender app. | |
| All credentials live in .env locally; override with env vars in production. | |
| """ | |
| import os | |
| from dotenv import load_dotenv | |
# Pull settings from a local .env file when one is present. Variables that
# are already set in the process environment are not overridden.
load_dotenv()
def _env_str(name: str, default: str = "") -> str:
    """Return environment variable *name* with surrounding whitespace removed.

    A variable that is unset, empty, or whitespace-only yields *default*, so a
    blank entry (e.g. ``NAME=`` in .env) cannot mask the built-in default.
    """
    return os.getenv(name, "").strip() or default


def _env_int(name: str, default: int) -> int:
    """Return environment variable *name* parsed as an integer.

    Unset or blank values yield *default*.

    Raises:
        ValueError: if the variable holds text that is not an integer. The
            message names the offending variable so a bad deployment setting
            is easy to locate.
    """
    raw = os.getenv(name, "").strip()
    if not raw:
        return default
    try:
        return int(raw)
    except ValueError as exc:
        raise ValueError(
            f"Environment variable {name} must be an integer, got {raw!r}"
        ) from exc


# ── Qdrant (BGE-M3 dense, 1024-dim) ──────────────────────────────────────────
QDRANT_URL = _env_str("QDRANT_URL")
QDRANT_API_KEY = _env_str("QDRANT_API_KEY")
QDRANT_COLLECTION = _env_str("QDRANT_COLLECTION", "arxiv_bgem3_dense")

# ── SQLite ───────────────────────────────────────────────────────────────────
DB_PATH = _env_str("DB_PATH", "interactions.db")

# ── arXiv API ────────────────────────────────────────────────────────────────
ARXIV_API_URL = "https://export.arxiv.org/api/query"
ARXIV_MAX_RESULTS = 10  # results per search page
METADATA_CACHE_TTL_DAYS = 30  # re-fetch metadata after this many days

# ── Turso (libSQL) — arXiv metadata DB — Phase 3.5 ───────────────────────────
TURSO_URL = _env_str("TURSO_URL")
TURSO_DB_TOKEN = _env_str("TURSO_DB_TOKEN")

# ── Recommendation settings ──────────────────────────────────────────────────
REC_LIMIT = 10  # how many recommendations to show
REC_POSITIVE_LIMIT = 20  # max positive examples sent to Qdrant
REC_MIN_POSITIVES = 1  # minimum saves needed before showing recs

# ── Zilliz Cloud (BGE-M3 sparse vectors) — Phase 3 ───────────────────────────
ZILLIZ_URI = _env_str("ZILLIZ_URI")
ZILLIZ_TOKEN = _env_str("ZILLIZ_TOKEN")
ZILLIZ_COLLECTION = _env_str("ZILLIZ_COLLECTION", "arxiv_bgem3_sparse")
# Zilliz schema (confirmed from notebooks/01-bme-upload.ipynb):
#   id             INT64 (auto_id, primary key)
#   arxiv_id       VARCHAR
#   sparse_vector  SPARSE_FLOAT_VECTOR (BGE-M3 lexical weights, int token IDs)
#   Index: SPARSE_INVERTED_INDEX, metric_type="IP"

# ── Groq (LLM query rewriter) — Phase 3 ──────────────────────────────────────
GROQ_API_KEY = _env_str("GROQ_API_KEY")

# ── BGE-M3 (embedding model) — Phase 3 ───────────────────────────────────────
BGE_M3_MODEL = _env_str("BGE_M3_MODEL", "BAAI/bge-m3")
BGE_M3_DEVICE = _env_str("BGE_M3_DEVICE", "cpu")
ENCODE_CACHE_SIZE = 128  # LRU cache for encoded queries

# ── Cross-Encoder Reranker (search reranking) ────────────────────────────────
RERANKER_MODEL = _env_str("RERANKER_MODEL", "cross-encoder/ms-marco-MiniLM-L-6-v2")
SEARCH_RERANK_TOP_N = _env_int("SEARCH_RERANK_TOP_N", 10)  # cap to preserve CPU latency

# ── Hybrid search tuning — Phase 3 ───────────────────────────────────────────
SEARCH_RRF_K = 60  # RRF denominator
SEARCH_FETCH_K_MULTIPLIER = 6  # candidates = top_k × 6 before rerank
SEARCH_SEMANTIC_WEIGHT = 0.80  # RRF contribution to final score
SEARCH_RECENCY_WEIGHT = 0.20  # recency contribution to final score

# ── App ──────────────────────────────────────────────────────────────────────
APP_TITLE = "ResearchIT"
COOKIE_NAME = "arxiv_user_id"
COOKIE_MAX_AGE = 60 * 60 * 24 * 365  # 1 year
APP_PORT = _env_int("PORT", 7860)  # HF Spaces requires 7860
# ── Phase 5: Onboarding category taxonomy ────────────────────────────────────
# Maps a short group key to a display record:
#   name  - user-facing label
#   icon  - glyph shown next to the label
#   arxiv - arXiv primary_topic codes covered by the group
#   desc  - one-line description
# Used by the onboarding wizard AND as pool filters / LightGBM features later.
#
# NOTE(review): the "icon" values and the dash in the "pde" display name look
# like mis-decoded UTF-8 (mojibake). They are reproduced unchanged here -
# confirm against the original file and restore the intended glyphs.
# NOTE(review): the keys "pde" and "acl_acl" do not match their labels;
# presumably other code or stored selections rely on them - verify before
# renaming.
CATEGORY_GROUPS: dict[str, dict] = {
    # Computer science: AI / ML
    "nlp": {
        "name": "Natural Language Processing",
        "icon": "π¬",
        "arxiv": ["cs.CL", "cs.IR"],
        "desc": "Language models, text generation, information retrieval",
    },
    "cv": {
        "name": "Computer Vision",
        "icon": "ποΈ",
        "arxiv": ["cs.CV"],
        "desc": "Image recognition, object detection, video understanding",
    },
    "ml": {
        "name": "Machine Learning",
        "icon": "π§ ",
        "arxiv": ["cs.LG", "stat.ML"],
        "desc": "Learning theory, optimization, generalization",
    },
    "ai": {
        "name": "Artificial Intelligence",
        "icon": "π€",
        "arxiv": ["cs.AI"],
        "desc": "Reasoning, planning, knowledge representation",
    },
    "robotics": {
        "name": "Robotics",
        "icon": "π¦Ύ",
        "arxiv": ["cs.RO"],
        "desc": "Control, manipulation, autonomous systems",
    },
    # Physical sciences and mathematics
    "hep": {
        "name": "High Energy Physics",
        "icon": "βοΈ",
        "arxiv": ["hep-ph", "hep-th", "hep-ex", "hep-lat"],
        "desc": "Particle physics, quantum field theory, colliders",
    },
    "astro": {
        "name": "Astrophysics",
        "icon": "π",
        "arxiv": ["astro-ph.GA", "astro-ph.CO", "astro-ph.SR", "astro-ph.HE"],
        "desc": "Galaxies, cosmology, stellar physics",
    },
    "quant_ph": {
        "name": "Quantum Computing",
        "icon": "π ",
        "arxiv": ["quant-ph"],
        "desc": "Quantum algorithms, error correction, quantum info",
    },
    "math": {
        "name": "Mathematics",
        "icon": "π",
        "arxiv": ["math.CO", "math.AG", "math.NT", "math.PR", "math.AP"],
        "desc": "Pure and applied mathematics",
    },
    # Life sciences
    "bio": {
        "name": "Computational Biology",
        "icon": "π§¬",
        "arxiv": ["q-bio.BM", "q-bio.GN", "q-bio.QM"],
        "desc": "Bioinformatics, genomics, protein structure",
    },
    "neuro": {
        "name": "Neuroscience",
        "icon": "π§ͺ",
        "arxiv": ["q-bio.NC"],
        "desc": "Computational neuroscience, brain modeling",
    },
    # Economics, security, systems, interaction
    "econ": {
        "name": "Economics & Game Theory",
        "icon": "π",
        "arxiv": ["econ.TH", "cs.GT"],
        "desc": "Mechanism design, auctions, market models",
    },
    "crypto": {
        "name": "Cryptography & Security",
        "icon": "π",
        "arxiv": ["cs.CR"],
        "desc": "Encryption, protocols, privacy",
    },
    "systems": {
        "name": "Systems & Networking",
        "icon": "π",
        "arxiv": ["cs.DC", "cs.NI"],
        "desc": "Distributed systems, networks, cloud",
    },
    "hci": {
        "name": "Human-Computer Interaction",
        "icon": "π±οΈ",
        "arxiv": ["cs.HC"],
        "desc": "Interface design, accessibility, user studies",
    },
    "audio": {
        "name": "Speech & Audio",
        "icon": "π΅",
        "arxiv": ["cs.SD", "eess.AS"],
        "desc": "Speech recognition, audio generation, music AI",
    },
    # General physics and materials
    "pde": {
        "name": "Physics β General",
        "icon": "π",
        "arxiv": ["physics.flu-dyn", "physics.comp-ph", "physics.optics"],
        "desc": "Fluid dynamics, computational physics, optics",
    },
    "cond_mat": {
        "name": "Condensed Matter",
        "icon": "π§",
        "arxiv": ["cond-mat.mes-hall", "cond-mat.mtrl-sci", "cond-mat.str-el"],
        "desc": "Materials science, superconductivity",
    },
    # Software and signals
    "se": {
        "name": "Software Engineering",
        "icon": "π»",
        "arxiv": ["cs.SE", "cs.PL"],
        "desc": "Testing, verification, compilers",
    },
    "acl_acl": {
        "name": "Signal Processing",
        "icon": "π‘",
        "arxiv": ["eess.SP", "eess.IV"],
        "desc": "Image & signal processing, medical imaging",
    },
}
def expand_category_groups(group_keys: list[str]) -> set[str]:
    """Return the union of arXiv category codes for the given group keys.

    Each key is looked up in CATEGORY_GROUPS; keys with no entry there are
    skipped. For example, ['nlp', 'cv'] yields the codes listed under the
    "arxiv" field of those two groups.
    """
    # Lazy lookup keeps the per-key order of the lookups and the unions.
    selected = (CATEGORY_GROUPS.get(group_key) for group_key in group_keys)
    return {code for group in selected if group for code in group["arxiv"]}