Spaces:

fikri0o0
/

philosopher-chat

Sleeping

App Files Files Community

philosopher-chat / config.py

fikri0o0

Honest eval narrative + query rewrite off by default

a589ee3 verified 8 days ago

raw

history blame contribute delete

8.54 kB

	import os
	import torch
	from pathlib import Path
	from dotenv import load_dotenv

	load_dotenv()

	# Authenticate with the HuggingFace Hub so gated models (e.g. EmbeddingGemma-300M)
	# can be downloaded. On HF Spaces, set HF_TOKEN in Settings → Variables and secrets.
	# When HF_TOKEN is unset or empty, the login step is skipped entirely.
	_HF_TOKEN = os.getenv("HF_TOKEN", "")
	if _HF_TOKEN:
	    # Imported inside the branch, so huggingface_hub is only imported when a
	    # token is actually configured.
	    from huggingface_hub import login as _hf_login
	    # add_to_git_credential=False: do not register the token as a git credential.
	    _hf_login(token=_HF_TOKEN, add_to_git_credential=False)

	# Filesystem layout, anchored at the directory that contains this file.
	PROJECT_ROOT = Path(__file__).parent
	DATA_DIR = PROJECT_ROOT.joinpath("data", "texts")
	VECTORSTORE_DIR = PROJECT_ROOT.joinpath("vectorstore")

	# Provider API keys; each falls back to "" when its environment variable is unset.
	GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "")
	GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
	OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY", "")

	# "cuda" when torch reports CUDA as available, otherwise "cpu".
	DEVICE = "cpu" if not torch.cuda.is_available() else "cuda"

	# ---------------------------------------------------------------------------
	# LLM options — display name → (provider, model_id)
	# Providers: "google" | "groq" | "openrouter"
	# The table is grouped per provider; the comprehension flattens it into one
	# dict, preserving the listed order.
	# ---------------------------------------------------------------------------
	LLM_OPTIONS: dict[str, tuple[str, str]] = {
	    display_name: (provider, model_id)
	    for provider, catalogue in (
	        # ── Google AI Studio (free tier) ──────────────────────────────
	        # Limits verified from aistudio.google.com/rate-limit (2026-05)
	        ("google", {
	            "Gemma 4 MoE 26B [Google]": "gemma-4-26b-a4b-it",          # 15 RPM | ∞ TPM | 1500 RPD
	            "Gemma 4 Dense 31B [Google]": "gemma-4-31b-it",            # 15 RPM | ∞ TPM | 1500 RPD
	            "Gemini 3.1 Flash Lite [Google]": "gemini-3.1-flash-lite", # 15 RPM | 250K TPM | 500 RPD
	            "Gemini 3.5 Flash [Google]": "gemini-3.5-flash",           # 5 RPM | 250K TPM | 20 RPD
	            "Gemini 2.5 Flash [Google]": "gemini-2.5-flash",           # 5 RPM | 250K TPM | 20 RPD
	            "Gemini 2.5 Flash Lite [Google]": "gemini-2.5-flash-lite", # 10 RPM | 250K TPM | 20 RPD
	        }),
	        # ── Groq (free tier, very fast LPU inference) ─────────────────
	        ("groq", {
	            "Llama 3.3 70B [Groq]": "llama-3.3-70b-versatile",
	            "Llama 4 Scout 17B [Groq]": "meta-llama/llama-4-scout-17b-16e-instruct",
	            "Qwen3 32B [Groq]": "qwen/qwen3-32b",
	            "Llama 3.1 8B [Groq]": "llama-3.1-8b-instant",
	        }),
	        # ── OpenRouter free models (:free = no cost, rate-limited) ────
	        ("openrouter", {
	            "Nvidia Nemotron 120B [OpenRouter]": "nvidia/nemotron-3-super-120b-a12b:free",
	            "OpenAI OSS 120B [OpenRouter]": "openai/gpt-oss-120b:free",
	            "DeepSeek V4 Flash [OpenRouter]": "deepseek/deepseek-v4-flash:free",
	            "Llama 3.3 70B [OpenRouter]": "meta-llama/llama-3.3-70b-instruct:free",
	            "Qwen3 Next 80B [OpenRouter]": "qwen/qwen3-next-80b-a3b-instruct:free",
	            "Gemma 4 MoE 26B [OpenRouter]": "google/gemma-4-26b-a4b-it:free",
	        }),
	    )
	    for display_name, model_id in catalogue.items()
	}

	# Display name of the model used by default (written in the LLM_OPTIONS key format).
	DEFAULT_LLM = "Gemma 4 MoE 26B [Google]"

	# provider → (name of the environment variable holding its API key, provider hostname).
	# NOTE(review): the hostname is presumably where a key can be obtained — confirm against callers.
	PROVIDER_KEYS = dict(
	    google=("GOOGLE_API_KEY", "ai.google.dev"),
	    groq=("GROQ_API_KEY", "console.groq.com"),
	    openrouter=("OPENROUTER_API_KEY", "openrouter.ai"),
	)

	# ---------------------------------------------------------------------------
	# Embedding — display name → HuggingFace model id
	# ---------------------------------------------------------------------------
	# The default is declared first and reused as a dict key, so the default
	# entry and the EMBEDDING_MODEL lookup cannot drift apart.
	DEFAULT_EMBEDDING = "EmbeddingGemma 300M (active)"
	EMBEDDING_OPTIONS = {
	    DEFAULT_EMBEDDING: "google/embeddinggemma-300m",
	    "BGE Large EN v1.5": "BAAI/bge-large-en-v1.5",
	    "Multilingual E5 Large": "intfloat/multilingual-e5-large",
	}
	# Model id of the default embedding.
	EMBEDDING_MODEL = EMBEDDING_OPTIONS[DEFAULT_EMBEDDING]

	# ---------------------------------------------------------------------------
	# RAG — chunking and retrieval settings
	# ---------------------------------------------------------------------------
	# NOTE(review): the unit of CHUNK_SIZE / CHUNK_OVERLAP (characters vs tokens)
	# depends on the text splitter that consumes them — confirm against the caller.
	CHUNK_SIZE = 1000
	CHUNK_OVERLAP = 150
	RETRIEVAL_K = 6  # final number of chunks passed to the LLM
	USE_HYBRID_SEARCH = True  # BM25 + semantic ensemble (fused with RRF)

	# ---------------------------------------------------------------------------
	# Reranking (2-stage retrieval)
	# Stage 1: hybrid (semantic + BM25) → fetch RETRIEVAL_FETCH_K candidates,
	#          merged with Reciprocal Rank Fusion (RRF).
	# Stage 2: a cross-encoder reranker scores each (query, chunk) pair jointly
	#          and keeps the top RETRIEVAL_K. Highest-ROI precision boost.
	# ---------------------------------------------------------------------------
	USE_RERANKER = True
	RERANKER_MODEL = "BAAI/bge-reranker-v2-m3"  # multilingual (handles ID queries)
	RETRIEVAL_FETCH_K = 20  # candidates retrieved before reranking
	# RRF damping constant (standard default); presumably the k in 1 / (k + rank) — confirm in the fusion code.
	RRF_K = 60

	# ---------------------------------------------------------------------------
	# Query rewriting (multi-query expansion)
	# The user's phrasing is rarely the best retrieval query. An LLM generates
	# alternative phrasings; each is retrieved and the results are fused with RRF
	# before reranking — intended to lift recall on differently-worded questions.
	# ---------------------------------------------------------------------------
	# Disabled by default: the RAGAS ablation (see notebooks/rag_evaluation.ipynb)
	# showed multi-query rewriting slightly hurt recall on this small, focused
	# corpus. The feature stays implemented and measured; flip to True for
	# larger/noisier corpora.
	USE_QUERY_REWRITE = False
	QUERY_REWRITE_MODEL = "gemini-3.1-flash-lite"  # fast/cheap, via OpenAI-compat endpoint
	N_QUERY_VARIANTS = 3  # total queries incl. the original (i.e. 2 generated rewrites)

	# ---------------------------------------------------------------------------
	# Corrective RAG + abstention
	# If the top semantic (cosine) similarity score is below the threshold, the
	# retrieved context is too weak — the system abstains instead of answering from
	# poor evidence (calibrated hallucination guard for out-of-corpus questions).
	# ---------------------------------------------------------------------------
	USE_CORRECTIVE_RAG = True
	# Gate on the top semantic cosine score, not the reranker score: off-corpus
	# questions score ~0.0 while in-corpus ones score >=~0.2, so cosine separates
	# them cleanly (the reranker's sigmoid sits near 0.5 for both and is unsuitable
	# as an abstain signal). The threshold sits between those two cosine ranges.
	CRAG_ABSTAIN_THRESHOLD = 0.12

	# Max number of turns (1 turn = 1 user + 1 assistant message) to keep in
	# LLM history. Each RAG turn adds ~7 000 tokens (6 chunks + Q + A), so 4 turns
	# ≈ 28 K tokens — safely under the 32 K limit of Gemma/Qwen3 while leaving
	# room for the system prompt and the new question+context.
	# NOTE(review): by these estimates 28 K of history leaves only ~4 K of a 32 K
	# window, which is less than the ~7 K a new question+context would add —
	# confirm whether retrieved chunks are really kept in history, or lower this.
	MAX_HISTORY_TURNS = 4

	# ---------------------------------------------------------------------------
	# Knowledge base sources (Project Gutenberg)
	# ---------------------------------------------------------------------------
	# One dict per work with keys "philosopher", "title" and "gutenberg_id".
	# Rows below are (philosopher, title, gutenberg_id); list order is preserved.
	SOURCES = [
	    {"philosopher": thinker, "title": work, "gutenberg_id": ebook_no}
	    for thinker, work, ebook_no in (
	        ("Nietzsche", "Thus Spoke Zarathustra", 1998),
	        ("Nietzsche", "Beyond Good and Evil", 4363),
	        ("Nietzsche", "On the Genealogy of Morality", 52319),
	        ("Nietzsche", "The Birth of Tragedy", 51356),
	        ("Schopenhauer", "Essays of Arthur Schopenhauer", 11945),
	        ("Hume", "An Enquiry Concerning Human Understanding", 9662),
	        ("Russell", "The Problems of Philosophy", 5827),
	        ("Marcus Aurelius", "Meditations", 2680),
	        ("Plato", "The Republic", 1497),
	        ("Mill", "Utilitarianism", 11224),
	        ("Epictetus", "The Enchiridion", 45109),
	        ("Kant", "Fundamental Principles of the Metaphysic of Morals", 5682),
	    )
	]