Spaces:
Sleeping
Sleeping
| import os | |
| import torch | |
| from pathlib import Path | |
| from dotenv import load_dotenv | |
# Load variables from a .env file (if one is found) into the process
# environment before the os.getenv() lookups in this module run.
load_dotenv()

# Authenticate with HuggingFace Hub so gated models (e.g. EmbeddingGemma-300M)
# can be downloaded.
# On HF Spaces, set HF_TOKEN in Settings -> Variables and secrets.
_HF_TOKEN = os.getenv("HF_TOKEN", "")
if _HF_TOKEN:
    # Imported here so huggingface_hub is only loaded when a token is set.
    from huggingface_hub import login as _hf_login

    # add_to_git_credential=False: log in without touching git credentials.
    _hf_login(token=_HF_TOKEN, add_to_git_credential=False)
# Filesystem layout, anchored at the directory that contains this file.
PROJECT_ROOT: Path = Path(__file__).parent
DATA_DIR: Path = PROJECT_ROOT / "data" / "texts"
VECTORSTORE_DIR: Path = PROJECT_ROOT / "vectorstore"

# Provider API keys read from the environment; "" when a variable is unset.
GOOGLE_API_KEY: str = os.getenv("GOOGLE_API_KEY", "")
GROQ_API_KEY: str = os.getenv("GROQ_API_KEY", "")
OPENROUTER_API_KEY: str = os.getenv("OPENROUTER_API_KEY", "")

# Torch device string: "cuda" when a CUDA device is available, else "cpu".
DEVICE: str = "cuda" if torch.cuda.is_available() else "cpu"
# ---------------------------------------------------------------------------
# LLM options: label -> (provider, model_id)
# Providers: "google" | "groq" | "openrouter"
# ---------------------------------------------------------------------------
LLM_OPTIONS: dict[str, tuple[str, str]] = {
    # -- Google AI Studio (free tier) ---------------------------------------
    # Limits verified from aistudio.google.com/rate-limit (2026-05)
    "Gemma 4 MoE 26B [Google]": ("google", "gemma-4-26b-a4b-it"),  # 15 RPM | no TPM limit listed | 1500 RPD
    "Gemma 4 Dense 31B [Google]": ("google", "gemma-4-31b-it"),  # 15 RPM | no TPM limit listed | 1500 RPD
    "Gemini 3.1 Flash Lite [Google]": ("google", "gemini-3.1-flash-lite"),  # 15 RPM | 250K TPM | 500 RPD
    "Gemini 3.5 Flash [Google]": ("google", "gemini-3.5-flash"),  # 5 RPM | 250K TPM | 20 RPD
    "Gemini 2.5 Flash [Google]": ("google", "gemini-2.5-flash"),  # 5 RPM | 250K TPM | 20 RPD
    "Gemini 2.5 Flash Lite [Google]": ("google", "gemini-2.5-flash-lite"),  # 10 RPM | 250K TPM | 20 RPD
    # -- Groq (free tier, very fast LPU inference) --------------------------
    "Llama 3.3 70B [Groq]": ("groq", "llama-3.3-70b-versatile"),
    "Llama 4 Scout 17B [Groq]": ("groq", "meta-llama/llama-4-scout-17b-16e-instruct"),
    "Qwen3 32B [Groq]": ("groq", "qwen/qwen3-32b"),
    "Llama 3.1 8B [Groq]": ("groq", "llama-3.1-8b-instant"),
    # -- OpenRouter free models (:free = no cost, rate-limited) -------------
    "Nvidia Nemotron 120B [OpenRouter]": ("openrouter", "nvidia/nemotron-3-super-120b-a12b:free"),
    "OpenAI OSS 120B [OpenRouter]": ("openrouter", "openai/gpt-oss-120b:free"),
    "DeepSeek V4 Flash [OpenRouter]": ("openrouter", "deepseek/deepseek-v4-flash:free"),
    "Llama 3.3 70B [OpenRouter]": ("openrouter", "meta-llama/llama-3.3-70b-instruct:free"),
    "Qwen3 Next 80B [OpenRouter]": ("openrouter", "qwen/qwen3-next-80b-a3b-instruct:free"),
    "Gemma 4 MoE 26B [OpenRouter]": ("openrouter", "google/gemma-4-26b-a4b-it:free"),
}

# Label of the model selected by default; must be one of the LLM_OPTIONS keys.
DEFAULT_LLM = "Gemma 4 MoE 26B [Google]"

# Fail at import time if the default label drifts from the table above,
# instead of surfacing later as a missing-key error at first use.
if DEFAULT_LLM not in LLM_OPTIONS:
    raise KeyError(f"DEFAULT_LLM {DEFAULT_LLM!r} is not a key of LLM_OPTIONS")
# Provider name -> (environment variable holding its API key, provider site).
# NOTE(review): the second element looks like the site where a key can be
# obtained; confirm against the code that displays it.
PROVIDER_KEYS: dict[str, tuple[str, str]] = {
    "google": ("GOOGLE_API_KEY", "ai.google.dev"),
    "groq": ("GROQ_API_KEY", "console.groq.com"),
    "openrouter": ("OPENROUTER_API_KEY", "openrouter.ai"),
}
# ---------------------------------------------------------------------------
# Embedding
# ---------------------------------------------------------------------------
# Label -> HuggingFace model id.
EMBEDDING_OPTIONS: dict[str, str] = {
    "EmbeddingGemma 300M (active)": "google/embeddinggemma-300m",
    "BGE Large EN v1.5": "BAAI/bge-large-en-v1.5",
    "Multilingual E5 Large": "intfloat/multilingual-e5-large",
}

DEFAULT_EMBEDDING = "EmbeddingGemma 300M (active)"

# Model id of the default embedding; the lookup raises KeyError at import
# time if DEFAULT_EMBEDDING is not a key of EMBEDDING_OPTIONS.
EMBEDDING_MODEL: str = EMBEDDING_OPTIONS[DEFAULT_EMBEDDING]
# ---------------------------------------------------------------------------
# RAG
# ---------------------------------------------------------------------------
CHUNK_SIZE: int = 1000
CHUNK_OVERLAP: int = 150
RETRIEVAL_K: int = 6  # final number of chunks passed to the LLM
USE_HYBRID_SEARCH: bool = True  # BM25 + semantic ensemble (fused with RRF)
# ---------------------------------------------------------------------------
# Reranking (2-stage retrieval)
# Stage 1: hybrid (semantic + BM25) -> fetch RETRIEVAL_FETCH_K candidates,
#          merged with Reciprocal Rank Fusion (RRF).
# Stage 2: cross-encoder reranker scores each (query, chunk) pair jointly
#          and keeps the top RETRIEVAL_K. Highest-ROI precision boost.
# ---------------------------------------------------------------------------
USE_RERANKER: bool = True
RERANKER_MODEL: str = "BAAI/bge-reranker-v2-m3"  # multilingual (handles ID queries)
RETRIEVAL_FETCH_K: int = 20  # candidates retrieved before reranking
RRF_K: int = 60  # RRF damping constant (standard default)
# ---------------------------------------------------------------------------
# Query rewriting (multi-query expansion)
# The user's phrasing is rarely the best retrieval query. An LLM generates
# alternative phrasings; each is retrieved and the results are fused with RRF
# before reranking - lifts recall on differently-worded questions.
#
# Disabled by default: the RAGAS ablation (see notebooks/rag_evaluation.ipynb)
# showed multi-query rewriting slightly *hurt* recall on this small, focused
# corpus. Kept implemented + measured; flip to True for larger/noisier corpora.
# ---------------------------------------------------------------------------
USE_QUERY_REWRITE: bool = False
QUERY_REWRITE_MODEL: str = "gemini-3.1-flash-lite"  # fast/cheap, via OpenAI-compat endpoint
N_QUERY_VARIANTS: int = 3  # total queries incl. the original
# ---------------------------------------------------------------------------
# Corrective RAG + abstention
# If the top retrieval score is below CRAG_ABSTAIN_THRESHOLD, the retrieved
# context is too weak - the system abstains instead of answering from poor
# evidence (calibrated hallucination guard for out-of-corpus questions).
# ---------------------------------------------------------------------------
USE_CORRECTIVE_RAG: bool = True

# Gate on the top semantic cosine score, not the reranker score: off-corpus
# questions score ~0.0 while in-corpus ones score >=~0.2, so cosine separates
# them cleanly (the reranker's sigmoid sits near 0.5 for both and is
# unsuitable as an abstain signal).
CRAG_ABSTAIN_THRESHOLD: float = 0.12
# Max number of *turns* (1 turn = 1 user + 1 assistant message) to keep in
# LLM history. Each RAG turn adds ~7 000 tokens (6 chunks + Q + A), so 4 turns
# is ~28 K tokens - safely under the 32 K limit of Gemma/Qwen3 while leaving
# room for the system prompt and the new question+context.
MAX_HISTORY_TURNS: int = 4
# ---------------------------------------------------------------------------
# Knowledge base sources (Project Gutenberg)
# ---------------------------------------------------------------------------
# One entry per text: author label, title, and Project Gutenberg ebook id.
SOURCES: list[dict] = [
    {"philosopher": "Nietzsche", "title": "Thus Spoke Zarathustra", "gutenberg_id": 1998},
    {"philosopher": "Nietzsche", "title": "Beyond Good and Evil", "gutenberg_id": 4363},
    {"philosopher": "Nietzsche", "title": "On the Genealogy of Morality", "gutenberg_id": 52319},
    {"philosopher": "Nietzsche", "title": "The Birth of Tragedy", "gutenberg_id": 51356},
    {"philosopher": "Schopenhauer", "title": "Essays of Arthur Schopenhauer", "gutenberg_id": 11945},
    {"philosopher": "Hume", "title": "An Enquiry Concerning Human Understanding", "gutenberg_id": 9662},
    {"philosopher": "Russell", "title": "The Problems of Philosophy", "gutenberg_id": 5827},
    {"philosopher": "Marcus Aurelius", "title": "Meditations", "gutenberg_id": 2680},
    {"philosopher": "Plato", "title": "The Republic", "gutenberg_id": 1497},
    {"philosopher": "Mill", "title": "Utilitarianism", "gutenberg_id": 11224},
    {"philosopher": "Epictetus", "title": "The Enchiridion", "gutenberg_id": 45109},
    {"philosopher": "Kant", "title": "Fundamental Principles of the Metaphysic of Morals", "gutenberg_id": 5682},
]