philosopher-chat / config.py
fikri0o0's picture
Honest eval narrative + query rewrite off by default
a589ee3 verified
import os
import torch
from pathlib import Path
from dotenv import load_dotenv
# Load variables from a .env file (python-dotenv) into the process environment.
# This must run before the os.getenv() lookups further down in this module.
load_dotenv()
# Authenticate with the Hugging Face Hub so gated models (e.g. EmbeddingGemma-300M)
# can be downloaded. On HF Spaces, set HF_TOKEN in Settings -> Variables and secrets.
_HF_TOKEN = os.getenv("HF_TOKEN", "")
if _HF_TOKEN:
    # Imported only when a token is configured, so the module still loads
    # without touching huggingface_hub when HF_TOKEN is unset or empty.
    from huggingface_hub import login as _hf_login

    # add_to_git_credential=False: log in for this process without asking
    # huggingface_hub to store the token in the git credential helper.
    _hf_login(token=_HF_TOKEN, add_to_git_credential=False)
# Filesystem layout; every path is anchored at the directory containing this file.
PROJECT_ROOT = Path(__file__).parents[0]
DATA_DIR = PROJECT_ROOT.joinpath("data", "texts")
VECTORSTORE_DIR = PROJECT_ROOT.joinpath("vectorstore")
# Provider API keys, read once at import time. A variable that is not set
# yields the empty string rather than None.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "")
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY", "")
# Torch device string: "cuda" when a CUDA device is available, otherwise "cpu".
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
# ---------------------------------------------------------------------------
# LLM options: label -> (provider, model_id)
# Providers: "google" | "groq" | "openrouter"
# ---------------------------------------------------------------------------
LLM_OPTIONS: dict[str, tuple[str, str]] = {
    # -- Google AI Studio (free tier) ---------------------------------------
    # Limits verified from aistudio.google.com/rate-limit (2026-05).
    # Trailing notes: requests per minute | tokens per minute | requests per day.
    "Gemma 4 MoE 26B [Google]": ("google", "gemma-4-26b-a4b-it"),  # 15 RPM | unlimited TPM | 1500 RPD
    "Gemma 4 Dense 31B [Google]": ("google", "gemma-4-31b-it"),  # 15 RPM | unlimited TPM | 1500 RPD
    "Gemini 3.1 Flash Lite [Google]": ("google", "gemini-3.1-flash-lite"),  # 15 RPM | 250K TPM | 500 RPD
    "Gemini 3.5 Flash [Google]": ("google", "gemini-3.5-flash"),  # 5 RPM | 250K TPM | 20 RPD
    "Gemini 2.5 Flash [Google]": ("google", "gemini-2.5-flash"),  # 5 RPM | 250K TPM | 20 RPD
    "Gemini 2.5 Flash Lite [Google]": ("google", "gemini-2.5-flash-lite"),  # 10 RPM | 250K TPM | 20 RPD
    # -- Groq (free tier, very fast LPU inference) --------------------------
    "Llama 3.3 70B [Groq]": ("groq", "llama-3.3-70b-versatile"),
    "Llama 4 Scout 17B [Groq]": ("groq", "meta-llama/llama-4-scout-17b-16e-instruct"),
    "Qwen3 32B [Groq]": ("groq", "qwen/qwen3-32b"),
    "Llama 3.1 8B [Groq]": ("groq", "llama-3.1-8b-instant"),
    # -- OpenRouter free models (:free = no cost, rate-limited) -------------
    "Nvidia Nemotron 120B [OpenRouter]": ("openrouter", "nvidia/nemotron-3-super-120b-a12b:free"),
    "OpenAI OSS 120B [OpenRouter]": ("openrouter", "openai/gpt-oss-120b:free"),
    "DeepSeek V4 Flash [OpenRouter]": ("openrouter", "deepseek/deepseek-v4-flash:free"),
    "Llama 3.3 70B [OpenRouter]": ("openrouter", "meta-llama/llama-3.3-70b-instruct:free"),
    "Qwen3 Next 80B [OpenRouter]": ("openrouter", "qwen/qwen3-next-80b-a3b-instruct:free"),
    "Gemma 4 MoE 26B [OpenRouter]": ("openrouter", "google/gemma-4-26b-a4b-it:free"),
}

# Key into LLM_OPTIONS selected when nothing else is chosen.
DEFAULT_LLM = "Gemma 4 MoE 26B [Google]"
# Per-provider lookup: provider id -> (name of the environment variable that
# holds its API key, provider site). The site is presumably where a key can
# be obtained - confirm against the caller that renders it.
PROVIDER_KEYS: dict[str, tuple[str, str]] = {
    "google": ("GOOGLE_API_KEY", "ai.google.dev"),
    "groq": ("GROQ_API_KEY", "console.groq.com"),
    "openrouter": ("OPENROUTER_API_KEY", "openrouter.ai"),
}
# ---------------------------------------------------------------------------
# Embedding
# ---------------------------------------------------------------------------
# Label -> Hugging Face model id of each selectable embedding model.
EMBEDDING_OPTIONS: dict[str, str] = {
    "EmbeddingGemma 300M (active)": "google/embeddinggemma-300m",
    "BGE Large EN v1.5": "BAAI/bge-large-en-v1.5",
    "Multilingual E5 Large": "intfloat/multilingual-e5-large",
}

# EMBEDDING_MODEL is resolved from the default label, so the two cannot drift.
DEFAULT_EMBEDDING: str = "EmbeddingGemma 300M (active)"
EMBEDDING_MODEL: str = EMBEDDING_OPTIONS[DEFAULT_EMBEDDING]
# ---------------------------------------------------------------------------
# RAG
# ---------------------------------------------------------------------------
CHUNK_SIZE: int = 1000
CHUNK_OVERLAP: int = 150
# Final number of chunks passed to the LLM.
RETRIEVAL_K: int = 6
# BM25 + semantic ensemble (fused with RRF).
USE_HYBRID_SEARCH: bool = True
# ---------------------------------------------------------------------------
# Reranking (2-stage retrieval)
#   Stage 1: hybrid (semantic + BM25) retrieval fetches RETRIEVAL_FETCH_K
#            candidates, merged with Reciprocal Rank Fusion (RRF).
#   Stage 2: a cross-encoder reranker scores each (query, chunk) pair jointly
#            and keeps the top RETRIEVAL_K. Highest-ROI precision boost.
# ---------------------------------------------------------------------------
USE_RERANKER: bool = True
# Multilingual reranker (handles ID queries).
RERANKER_MODEL: str = "BAAI/bge-reranker-v2-m3"
# Number of candidates retrieved before reranking.
RETRIEVAL_FETCH_K: int = 20
# RRF damping constant (standard default).
RRF_K: int = 60
# ---------------------------------------------------------------------------
# Query rewriting (multi-query expansion)
# The user's phrasing is rarely the best retrieval query. An LLM generates
# alternative phrasings; each is retrieved and the results are fused with RRF
# before reranking - this lifts recall on differently-worded questions.
# ---------------------------------------------------------------------------
# Disabled by default: the RAGAS ablation (see notebooks/rag_evaluation.ipynb)
# showed multi-query rewriting slightly *hurt* recall on this small, focused
# corpus. Kept implemented + measured; flip to True for larger/noisier corpora.
USE_QUERY_REWRITE: bool = False
# Fast/cheap model, called via the OpenAI-compatible endpoint.
QUERY_REWRITE_MODEL: str = "gemini-3.1-flash-lite"
# Total number of queries, including the original one.
N_QUERY_VARIANTS: int = 3
# ---------------------------------------------------------------------------
# Corrective RAG + abstention
# If the top semantic (cosine) similarity of the retrieved chunks is below the
# threshold, the retrieved context is too weak - the system abstains instead of
# answering from poor evidence (calibrated hallucination guard for
# out-of-corpus questions).
# ---------------------------------------------------------------------------
USE_CORRECTIVE_RAG: bool = True
# The abstain gate compares the top semantic cosine score with this threshold:
# off-corpus questions score ~0.0 while in-corpus ones score >= ~0.2, so cosine
# separates them cleanly. The reranker's sigmoid sits near 0.5 for both and is
# therefore unsuitable as an abstain signal.
CRAG_ABSTAIN_THRESHOLD: float = 0.12
# Maximum number of *turns* (1 turn = 1 user + 1 assistant message) kept in the
# LLM history. Each RAG turn adds ~7,000 tokens (6 chunks + Q + A), so 4 turns
# come to roughly 28K tokens - under the 32K limit of Gemma/Qwen3, with the
# remainder left for the system prompt and the new question + context.
# NOTE(review): if the new turn's context is itself ~7K tokens, 28K + 7K would
# exceed 32K - confirm the per-turn estimate, or that stored history omits the
# retrieved chunks.
MAX_HISTORY_TURNS: int = 4
# ---------------------------------------------------------------------------
# Knowledge base sources (Project Gutenberg)
# ---------------------------------------------------------------------------
# One dict per book, with keys "philosopher", "title" and "gutenberg_id",
# expanded from (philosopher, title, gutenberg_id) rows in corpus order.
SOURCES = [
    {"philosopher": author, "title": work, "gutenberg_id": ebook_id}
    for author, work, ebook_id in (
        ("Nietzsche", "Thus Spoke Zarathustra", 1998),
        ("Nietzsche", "Beyond Good and Evil", 4363),
        ("Nietzsche", "On the Genealogy of Morality", 52319),
        ("Nietzsche", "The Birth of Tragedy", 51356),
        ("Schopenhauer", "Essays of Arthur Schopenhauer", 11945),
        ("Hume", "An Enquiry Concerning Human Understanding", 9662),
        ("Russell", "The Problems of Philosophy", 5827),
        ("Marcus Aurelius", "Meditations", 2680),
        ("Plato", "The Republic", 1497),
        ("Mill", "Utilitarianism", 11224),
        ("Epictetus", "The Enchiridion", 45109),
        ("Kant", "Fundamental Principles of the Metaphysic of Morals", 5682),
    )
]