Spaces:

spark-ds549
/

BPL-RAG-Spring-2026

Sleeping

App Files Files Community

BPL-RAG-Spring-2026 / config.py

han-na

human in the loop and other changes

6910834 about 1 month ago

raw

history blame contribute delete

4.1 kB

	"""
	Central configuration for the BPL RAG pipeline.
	All tuneable constants live here — change a value here and it takes effect everywhere.
	"""

	import os
	from urllib.parse import quote

	from dotenv import load_dotenv

	# Must run before the os.getenv() lookups below: python-dotenv loads variables
	# from a .env file (when one is found) into the process environment.
	load_dotenv()

	# ── OpenAI ────────────────────────────────────────────────────────────────────
	# Read from the environment; stays None when the variable is not set.
	OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
	# Name of the chat model.
	OPENAI_CHAT_MODEL: str = "gpt-4o"
	# Fallback embedding model if BGE is not used (kept disabled):
	# OPENAI_EMBED_MODEL = "text-embedding-3-small"

	# ── BGE Embedding ─────────────────────────────────────────────────────────────
	BGE_MODEL_NAME: str = "BAAI/bge-m3"
	# Pinned to CPU: the V100 is compute capability 7.0, which is incompatible
	# with the installed PyTorch (needs CC >= 7.5).
	BGE_DEVICE: str = "cpu"
	# Lower this value if embedding runs out of memory.
	BGE_BATCH_SIZE: int = 32

	# ── Neo4j / GraphRAG ──────────────────────────────────────────────────────────
	# Connection settings come from the environment; URI and password default to
	# the empty string, the user defaults to "neo4j".
	NEO4J_URI = os.environ.get("NEO4J_URI", "")
	NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
	NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "")

	# Only content_driven queries trigger GraphRAG.
	GRAPH_RAG_ENABLED: bool = True
	# Upper bound on the additional docs pulled in from the graph.
	GRAPH_TOP_K: int = 5_000
	# A doc must match at least this many query entities.
	GRAPH_MIN_ENTITY_MATCHES: int = 1

	# ── PostgreSQL / pgVector ─────────────────────────────────────────────────────
	# Connection settings are read from the environment, with the defaults shown.
	PG_HOST = os.getenv("PG_HOST", "localhost")
	# `or` instead of a getenv default: an empty `PG_PORT=` entry then falls back
	# to 5432 rather than crashing int(""). A non-numeric value still raises.
	PG_PORT = int(os.getenv("PG_PORT") or 5432)
	PG_DB = os.getenv("PG_DB", "bpl_rag")
	PG_USER = os.getenv("PG_USER", "postgres")
	PG_PASSWORD = os.getenv("PG_PASSWORD", "")
	# Connection URI assembled from the parts above. User, password and database
	# name are percent-encoded so reserved URI characters in them (@ : / ? #)
	# cannot corrupt the URI; values without such characters are unchanged.
	PG_DSN = (
	    f"postgresql://{quote(PG_USER, safe='')}:{quote(PG_PASSWORD, safe='')}"
	    f"@{PG_HOST}:{PG_PORT}/{quote(PG_DB, safe='')}"
	)

	# ── Chunking ──────────────────────────────────────────────────────────────────
	# Was 512; raised because BGE-M3 handles longer context well.
	CHUNK_SIZE: int = 1024
	# Was 100; raised to keep the overlap proportionally larger.
	CHUNK_OVERLAP: int = 150
	# Name of the tiktoken encoding.
	CHUNK_TOKENIZER: str = "cl100k_base"

	# ── Retrieval ─────────────────────────────────────────────────────────────────
	# Candidates taken from vector search before reranking.
	TOP_K_DENSE: int = 5_000
	# Candidates taken from BM25.
	TOP_K_BM25: int = 5_000
	# Number of results returned to the user.
	TOP_K_FINAL: int = 50
	# RRF constant; 60 is the standard value.
	RRF_K: int = 60

	# ── Metadata score blend weight ───────────────────────────────────────────────
	# The final score blends the two signals as:
	#   final_score = CONTENT_WEIGHT * content_rrf + METADATA_WEIGHT * metadata_sim
	CONTENT_WEIGHT: float = 0.8
	METADATA_WEIGHT: float = 0.2

	# ── Ingestion ─────────────────────────────────────────────────────────────────
	# Records whose raw_text has fewer characters than this are skipped.
	MIN_CHAR_COUNT: int = 100
	# Folder that contains the local JSON dumps.
	JSON_DUMP_DIR: str = "data/raw"

	# ── Generation ───────────────────────────────────────────────────────────────
	# Number of chunks passed to GPT-4o.
	MAX_CONTEXT_CHUNKS: int = 5
	GENERATION_MAX_TOKENS: int = 600

	# Documents scoring below this are considered irrelevant.
	MIN_RELEVANCE_SCORE: float = 0.1