Spaces:

aditya-joshi-05
/

Cortex

Running

App Files Files Community

aditya-joshi-05 committed on Apr 7

Commit

f0d100b

1 Parent(s): f6803e9

Add phase 3 & 4

Browse files

Files changed (15) hide show

.env.example +35 -6
api/main.py +95 -3
api/schemas.py +6 -0
config.py +37 -1
evaluation/__init__.py +0 -0
evaluation/ragas_eval.py +228 -0
evaluation/store.py +276 -0
generation/crag.py +402 -0
ingestion/pipeline.py +14 -1
retrieval/cache.py +241 -0
retrieval/graph_builder.py +278 -0
retrieval/graph_retriever.py +271 -0
retrieval/orchestrator.py +19 -1
retrieval/relation_extractors.py +602 -0
ui/app.py +166 -4

.env.example CHANGED Viewed

@@ -1,13 +1,16 @@
 # ── Copy this file to .env and fill in your values ─────────────
 # cp .env.example .env
-# ── Groq API ───────────────────────────────────────────────────
-GROQ_API_KEY=gsk_your_key_here
 # ── Milvus (defaults work with docker-compose) ─────────────────
-MILVUS_HOST=localhost
-MILVUS_PORT=19530
-MILVUS_COLLECTION=cortex_chunks
 # ── Embedding model ────────────────────────────────────────────
 EMBED_MODEL_NAME=BAAI/bge-small-en-v1.5
@@ -23,9 +26,35 @@ RETRIEVAL_TOP_K=15
 FINAL_TOP_K=5
 # ── LLM ────────────────────────────────────────────────────────
-GROQ_MODEL=llama-3.3-70b-versatile
 GROQ_TEMPERATURE=0.1
 GROQ_MAX_TOKENS=1024
 # ── Logging ────────────────────────────────────────────────────
 LOG_LEVEL=INFO

 # ── Copy this file to .env and fill in your values ─────────────
 # cp .env.example .env
+# ── API KEYS ───────────────────────────────────────────────────
+GROQ_API_KEY=
+NVIDIA_API_KEY=
+MISTRAL_API_KEY=
+TAVILY_API_KEY=
 # ── Milvus (defaults work with docker-compose) ─────────────────
+MILVUS_HOST=
+MILVUS_PORT=
+MILVUS_COLLECTION=
 # ── Embedding model ────────────────────────────────────────────
 EMBED_MODEL_NAME=BAAI/bge-small-en-v1.5
 FINAL_TOP_K=5
 # ── LLM ────────────────────────────────────────────────────────
+GROQ_MODEL=openai/gpt-oss-120b
 GROQ_TEMPERATURE=0.1
 GROQ_MAX_TOKENS=1024
+# ── Knowledge graph ────────────────────────────────────────────
+# rebel          → local REBEL model, no API calls (~1.6GB download on first run)
+# llm            → Groq LLM, free-form predicates (rate-limited)
+# rebel-filtered → REBEL + entity density pre-filter: skips ~70% of chunks
+# llm-filtered   → LLM   + entity density pre-filter: drastically fewer API calls
+GRAPH_EXTRACTOR=llm-filtered
+REBEL_BATCH_SIZE=4        # chunks per REBEL forward pass; lower if you hit OOM on CPU
+# Density filter settings (only used when GRAPH_EXTRACTOR ends with -filtered)
+DENSITY_TOP_FRACTION=0.30  # process top 30% most entity-rich chunks
+DENSITY_MIN_ENTITIES=2     # hard floor: always skip chunks with fewer than N entities
+# ── Relation-extraction LLM (served via Mistral or Ollama) ─────
+LLM_SERVER=mistral   # options: mistral, ollama
+MISTRAL_MODEL=devstral-latest
+OLLAMA_MODEL=llama3.2:3b
+OLLAMA_HOST=
+# ── Redis cache ────────────────────────────────────────────────
+REDIS_URL=
+CACHE_TTL_SECONDS=3600
+# ── Evaluation ─────────────────────────────────────────────────
+EVAL_DB_PATH=
+EVAL_ENABLED=true              # set false to skip post-query evaluation (RAGAS and custom metrics)
 # ── Logging ────────────────────────────────────────────────────
 LOG_LEVEL=INFO

api/main.py CHANGED Viewed

@@ -39,6 +39,10 @@ from api.schemas import (
 )
 from config import get_settings
 from generation.generator import Generator, GenerationRequest
 from ingestion.pipeline import IngestionPipeline
 from retrieval.dense import MilvusStore
 from retrieval.embedder import Embedder
@@ -46,6 +50,7 @@ from retrieval.bm25 import BM25Retriever
 from retrieval.orchestrator import MultiStrategyRetriever
 logger = logging.getLogger(__name__)
 # ── Shared singletons ──────────────────────────────────────────
 # Created once on startup, shared across requests
@@ -54,6 +59,9 @@ _embedder: Embedder = None
 _store: MilvusStore = None
 _bm25: BM25Retriever = None
 _retriever: MultiStrategyRetriever = None
 _generator: Generator = None
 _pipeline: IngestionPipeline = None
@@ -61,14 +69,19 @@ _pipeline: IngestionPipeline = None
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     """Initialise shared resources on startup, clean up on shutdown."""
-    global _embedder, _store, _bm25, _retriever, _generator, _pipeline
     logger.info("Cortex starting up...")
     _embedder  = Embedder()
     _store     = MilvusStore(embedder=_embedder)
     _bm25      = BM25Retriever()
     _retriever = MultiStrategyRetriever(embedder=_embedder, store=_store, bm25=_bm25)
-    _generator = Generator()
     _pipeline  = IngestionPipeline(embedder=_embedder, store=_store, bm25=_bm25)
     # Warm up: trigger model load immediately so first request is fast
@@ -126,11 +139,18 @@ async def health() -> HealthResponse:
     embedder_status = "loaded" if _embedder and _embedder._model else "not_loaded"
     return HealthResponse(
         status="ok" if milvus_status == "ok" else "degraded",
         milvus=milvus_status,
         embedder=embedder_status,
         collection_stats=collection_stats,
     )
@@ -159,6 +179,25 @@ async def ingest(req: IngestRequest) -> IngestResponse:
     return IngestResponse(**stats)
 @app.post("/query", response_model=QueryResponse, tags=["retrieval"])
 async def query(req: QueryRequest) -> QueryResponse:
@@ -169,6 +208,9 @@ async def query(req: QueryRequest) -> QueryResponse:
     cfg = get_settings()
     k = req.top_k or cfg.retrieval_top_k
     try:
         retrieval = _retriever.retrieve(req.query, top_k_candidates=k, final_top_k=cfg.final_top_k)
     except Exception as exc:
@@ -187,6 +229,14 @@ async def query(req: QueryRequest) -> QueryResponse:
     final_chunks = retrieval.chunks
     try:
         result = _generator.generate(
             GenerationRequest(query=req.query, chunks=final_chunks)
@@ -195,6 +245,30 @@ async def query(req: QueryRequest) -> QueryResponse:
         logger.exception("Generation error")
         raise HTTPException(status_code=500, detail=f"Generation failed: {exc}")
     return QueryResponse(
         query=req.query,
         answer=result.answer,
@@ -277,7 +351,25 @@ async def query_stream(req: QueryRequest):
                 yield _sse_event({"type": "done"})
                 return
-            # 3. Stream answer tokens
             gen_request = GenerationRequest(
                 query=req.query,
                 chunks=final_chunks,

 )
 from config import get_settings
 from generation.generator import Generator, GenerationRequest
+from generation.crag import CRAGGate
+from evaluation.store import EvalStore, QueryLogEntry
+from evaluation.ragas_eval import RAGASEvaluator, EvalInput
+from retrieval.cache import CachedRetriever
 from ingestion.pipeline import IngestionPipeline
 from retrieval.dense import MilvusStore
 from retrieval.embedder import Embedder
 from retrieval.orchestrator import MultiStrategyRetriever
 logger = logging.getLogger(__name__)
+cfg = get_settings()
 # ── Shared singletons ──────────────────────────────────────────
 # Created once on startup, shared across requests
 _store: MilvusStore = None
 _bm25: BM25Retriever = None
 _retriever: MultiStrategyRetriever = None
+_crag: CRAGGate = None
+_eval_store: EvalStore = None
+_evaluator: RAGASEvaluator = None
 _generator: Generator = None
 _pipeline: IngestionPipeline = None
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     """Initialise shared resources on startup, clean up on shutdown."""
+    global _embedder, _store, _bm25, _retriever, _crag, _generator, _pipeline, _eval_store, _evaluator
     logger.info("Cortex starting up...")
     _embedder  = Embedder()
     _store     = MilvusStore(embedder=_embedder)
     _bm25      = BM25Retriever()
     _retriever = MultiStrategyRetriever(embedder=_embedder, store=_store, bm25=_bm25)
+    _crag       = CRAGGate()
+    _eval_store = EvalStore(db_path=cfg.eval_db_path)
+    _evaluator  = RAGASEvaluator(store=_eval_store)
+    _generator  = Generator()
+    # Wrap retriever with Redis cache (degrades gracefully if Redis is absent)
+    _retriever  = CachedRetriever(_retriever)
     _pipeline  = IngestionPipeline(embedder=_embedder, store=_store, bm25=_bm25)
     # Warm up: trigger model load immediately so first request is fast
     embedder_status = "loaded" if _embedder and _embedder._model else "not_loaded"
+    graph_stats = {}
+    try:
+        graph_stats = _retriever.graph_builder.stats()
+    except Exception:
+        pass
     return HealthResponse(
         status="ok" if milvus_status == "ok" else "degraded",
         milvus=milvus_status,
         embedder=embedder_status,
         collection_stats=collection_stats,
+        graph_stats=graph_stats,
     )
     return IngestResponse(**stats)
+@app.get("/metrics", tags=["evaluation"])
+async def get_metrics(limit: int = 100, days: int = 7):
+    """
+    Query performance metrics and RAGAS scores for the dashboard.
+    Returns summary stats, recent query logs, and hourly timeseries.
+    """
+    return {
+        "summary":    _eval_store.get_summary_stats(),
+        "recent":     _eval_store.get_recent_queries(limit=limit),
+        "timeseries": _eval_store.get_metric_timeseries(days=days),
+        "cache":      _retriever.cache_stats(),
+    }
+@app.post("/cache/flush", tags=["system"])
+async def flush_cache():
+    """Flush all Redis retrieval cache entries."""
+    deleted = _retriever.flush_all()
+    return {"deleted": deleted}
 @app.post("/query", response_model=QueryResponse, tags=["retrieval"])
 async def query(req: QueryRequest) -> QueryResponse:
     cfg = get_settings()
     k = req.top_k or cfg.retrieval_top_k
+    import time as _time
+    _t0 = _time.perf_counter()
     try:
         retrieval = _retriever.retrieve(req.query, top_k_candidates=k, final_top_k=cfg.final_top_k)
     except Exception as exc:
     final_chunks = retrieval.chunks
+    # CRAG gate: grade, rewrite if POOR, web-search fallback if ABSENT
+    crag_result = _crag.evaluate(
+        query=req.query,
+        chunks=final_chunks,
+        retriever_fn=lambda q: _retriever.retrieve(q).chunks,
+    )
+    final_chunks = crag_result.final_chunks
     try:
         result = _generator.generate(
             GenerationRequest(query=req.query, chunks=final_chunks)
         logger.exception("Generation error")
         raise HTTPException(status_code=500, detail=f"Generation failed: {exc}")
+    latency_ms = (_time.perf_counter() - _t0) * 1000
+    log_id = _eval_store.log_query(QueryLogEntry(
+        query=req.query,
+        intent=retrieval.decision.intent.value,
+        strategies=retrieval.decision.strategies,
+        retriever_hits=retrieval.retriever_hits,
+        crag_grade=crag_result.grade.value,
+        crag_rewritten=bool(crag_result.rewritten_query),
+        web_search_used=crag_result.web_search_used,
+        num_chunks=len(final_chunks),
+        top_chunk_score=final_chunks[0].score if final_chunks else 0.0,
+        latency_ms=latency_ms,
+        model=result.model,
+    ))
+    if cfg.eval_enabled:
+        _evaluator.evaluate_async(EvalInput(
+            query_log_id=log_id,
+            query=req.query,
+            answer=result.answer,
+            chunks=final_chunks,
+        ))
     return QueryResponse(
         query=req.query,
         answer=result.answer,
                 yield _sse_event({"type": "done"})
                 return
+            # 3. CRAG gate — grade, optionally rewrite + re-retrieve
+            crag_result = _crag.evaluate(
+                query=req.query,
+                chunks=final_chunks,
+                retriever_fn=lambda q: _retriever.retrieve(q).chunks,
+            )
+            final_chunks = crag_result.final_chunks
+            # Emit CRAG event if something interesting happened
+            if crag_result.grade.value != "GOOD" or crag_result.web_search_used:
+                yield _sse_event({
+                    "type": "crag_update",
+                    "grade": crag_result.grade.value,
+                    "rewritten_query": crag_result.rewritten_query,
+                    "web_search_used": crag_result.web_search_used,
+                    "reasoning": crag_result.reasoning,
+                })
+            # 4. Stream answer tokens
             gen_request = GenerationRequest(
                 query=req.query,
                 chunks=final_chunks,

api/schemas.py CHANGED Viewed

@@ -45,6 +45,9 @@ class QueryResponse(BaseModel):
     citations: list[CitationResponse]
     retrieved_chunks: list[ChunkResponse]
     routing: Optional[RoutingResponse] = None
     model: str
     usage: dict
@@ -60,6 +63,8 @@ class IngestResponse(BaseModel):
     chunks_created: int
     chunks_stored: int
     bm25_indexed: int = 0
     errors: list[dict] = []
@@ -68,3 +73,4 @@ class HealthResponse(BaseModel):
     milvus: str
     embedder: str
     collection_stats: dict

     citations: list[CitationResponse]
     retrieved_chunks: list[ChunkResponse]
     routing: Optional[RoutingResponse] = None
+    crag_grade: Optional[str] = None
+    crag_rewritten_query: Optional[str] = None
+    web_search_used: bool = False
     model: str
     usage: dict
     chunks_created: int
     chunks_stored: int
     bm25_indexed: int = 0
+    graph_entities: int = 0
+    graph_triples: int = 0
     errors: list[dict] = []
     milvus: str
     embedder: str
     collection_stats: dict
+    graph_stats: dict = {}

config.py CHANGED Viewed

@@ -40,12 +40,15 @@ class Settings(BaseSettings):
     retrieval_top_k: int = 15            # candidates before reranking
     final_top_k: int = 5                 # chunks sent to LLM
-    # ── LLM / Groq ───────────────────────────────────────────
     groq_api_key: str = os.getenv("GROQ_API_KEY", "")  # must be set in .env for LLM classification to work
     groq_model: str = os.getenv("GROQ_MODEL", "llama-3.3-70b-versatile")
     groq_temperature: float = float(os.getenv("GROQ_TEMPERATURE", 0.1))
     groq_max_tokens: int = int(os.getenv("GROQ_MAX_TOKENS", 1024))
     groq_timeout: int = int(os.getenv("GROQ_TIMEOUT", 30))  # seconds before Groq client timeout
     # ── FastAPI ──────────────────────────────────────────────
     api_host: str = "0.0.0.0"
@@ -56,6 +59,39 @@ class Settings(BaseSettings):
     data_dir: str = "data/documents"
     log_level: str = "INFO"
 @lru_cache(maxsize=1)
 def get_settings() -> Settings:

     retrieval_top_k: int = 15            # candidates before reranking
     final_top_k: int = 5                 # chunks sent to LLM
+    # ── LLM providers (Groq, Mistral) / Tavily ─────────────────
     groq_api_key: str = os.getenv("GROQ_API_KEY", "")  # must be set in .env for LLM classification to work
     groq_model: str = os.getenv("GROQ_MODEL", "llama-3.3-70b-versatile")
     groq_temperature: float = float(os.getenv("GROQ_TEMPERATURE", 0.1))
     groq_max_tokens: int = int(os.getenv("GROQ_MAX_TOKENS", 1024))
     groq_timeout: int = int(os.getenv("GROQ_TIMEOUT", 30))  # seconds before Groq client timeout
+    tavily_api_key: str = os.getenv("TAVILY_API_KEY", "")
+    mistral_api_key: str = os.getenv("MISTRAL_API_KEY", "")
+    mistral_model: str = os.getenv("MISTRAL_MODEL", "devstral-latest")
     # ── FastAPI ──────────────────────────────────────────────
     api_host: str = "0.0.0.0"
     data_dir: str = "data/documents"
     log_level: str = "INFO"
+    # ── CRAG ─────────────────────────────────────────────────
+    crag_enabled: bool = True
+    crag_relevance_threshold: float = 0.5   # below this → POOR grade
+    # ── Graph ─────────────────────────────────────────────────
+    graph_enabled: bool = True
+    graph_path: str = "data/knowledge_graph.json"
+    graph_max_hops: int = 2
+    # "rebel"          → local REBEL model, no API calls
+    # "llm"            → Groq LLM, free-form predicates
+    # "rebel-filtered" → REBEL + entity density pre-filter (option 4)
+    # "llm-filtered"   → LLM   + entity density pre-filter (option 4; default)
+    graph_extractor: str = "llm-filtered"
+    rebel_batch_size: int = 4     # chunks per REBEL forward pass; lower if OOM
+    # ── Density filter (used when graph_extractor ends with "-filtered") ──
+    density_top_fraction: float = 0.30   # process top 30% most entity-dense chunks
+    density_min_entities: int   = 2      # hard floor: skip chunks with fewer entities
+    # ── Relation-extraction LLM (served via Mistral or Ollama) ─
+    llm_server: str = os.getenv("LLM_SERVER", "mistral")  # "mistral" or "ollama"
+    ollama_model: str = os.getenv("OLLAMA_MODEL", "llama3.2:3b")
+    ollama_host: str = os.getenv("OLLAMA_HOST", "")  # Ollama server URL
+    mistral_model: str = os.getenv("MISTRAL_MODEL", "devstral-latest")
+    # ── Redis cache ───────────────────────────────────────────
+    redis_url: str = "redis://localhost:6379"
+    cache_ttl_seconds: int = 3600    # 1 hour
+    # ── Evaluation ────────────────────────────────────────────
+    eval_db_path: str = "data/cortex_eval.db"
+    eval_enabled: bool = True        # set False to skip post-query evaluation entirely
 @lru_cache(maxsize=1)
 def get_settings() -> Settings:

evaluation/__init__.py ADDED Viewed

File without changes

evaluation/ragas_eval.py ADDED Viewed

	@@ -0,0 +1,228 @@

+"""
+Cortex RAG — RAGAS Evaluation Harness (Phase 4)
+Why reference-free metrics?
+────────────────────────────
+Classic RAG evaluation requires ground-truth answers (golden QA pairs).
+We don't have those at runtime. RAGAS provides three metrics that need
+only (question, answer, retrieved_contexts):
+  faithfulness        — Does the answer make claims supported by the context?
+                        Computed by asking an LLM to identify each claim in
+                        the answer, then checking each claim against the context.
+                        Score = supported_claims / total_claims.
+  answer_relevancy    — Does the answer actually address the question?
+                        Computed by generating N hypothetical questions from the
+                        answer and measuring cosine similarity to the original
+                        question. Low score = answer talks about something else.
+  context_precision   — Are the retrieved chunks actually relevant to the query?
+                        Computed by asking an LLM whether each chunk is useful
+                        for answering the query. Score = relevant_chunks / total.
+We also compute two lightweight custom metrics without any LLM calls:
+  context_utilisation — What fraction of the retrieved chunks are cited in the
+                        answer? (Count [1], [2]... citation markers.) A low score
+                        means the generator ignored most of what was retrieved.
+  mean_chunk_score    — Average retrieval score (post-reranking) of the final
+                        chunks. Tracks retrieval quality independently of answer
+                        quality. Useful for spotting when CRAG rewrites help.
+Running mode
+────────────
+Evaluation is asynchronous — it runs in a background daemon thread started
+once the answer has been generated, so the RAGAS calls never block the query path.
+Results are written to the EvalStore (SQLite) and appear in the dashboard.
+If RAGAS is not installed or the LLM call fails, only the two custom
+metrics (context_utilisation, mean_chunk_score) are computed and stored.
+This ensures the evaluation pipeline never blocks ingestion or queries.
+"""
+from __future__ import annotations
+import logging
+import re
+import threading
+from dataclasses import dataclass, field
+from typing import Optional
+from evaluation.store import EvalMetricEntry, EvalStore
+from retrieval.dense import RetrievedChunk
+logger = logging.getLogger(__name__)
+@dataclass
+class EvalInput:
+    """Everything needed to evaluate one query-response pair."""
+    query_log_id: int
+    query: str
+    answer: str
+    chunks: list[RetrievedChunk] = field(default_factory=list)
+@dataclass
+class EvalResult:
+    faithfulness:        Optional[float] = None
+    answer_relevancy:    Optional[float] = None
+    context_precision:   Optional[float] = None
+    context_utilisation: Optional[float] = None
+    mean_chunk_score:    Optional[float] = None
+    def as_store_entry(self, query_log_id: int) -> EvalMetricEntry:
+        return EvalMetricEntry(
+            query_log_id=query_log_id,
+            faithfulness=self.faithfulness,
+            answer_relevancy=self.answer_relevancy,
+            context_precision=self.context_precision,
+            context_utilisation=self.context_utilisation,
+            mean_chunk_score=self.mean_chunk_score,
+        )
+class RAGASEvaluator:
+    """
+    Computes RAGAS + custom metrics for a query-response pair.
+    Usage — fire-and-forget (non-blocking):
+        evaluator = RAGASEvaluator(store)
+        evaluator.evaluate_async(EvalInput(
+            query_log_id=log_id,
+            query="What is attention?",
+            answer="Attention is...",
+            chunks=final_chunks,
+        ))
+    Usage — blocking (for testing):
+        result = evaluator.evaluate(eval_input)
+    """
+    def __init__(self, store: Optional[EvalStore] = None) -> None:
+        self._store = store or EvalStore()
+        self._ragas_available = self._check_ragas()
+    # ── Public API ─────────────────────────────────────────────
+    def evaluate_async(self, inp: EvalInput) -> None:
+        """
+        Run evaluation in a daemon thread. Returns immediately.
+        Results are written to EvalStore when complete.
+        """
+        thread = threading.Thread(
+            target=self._run_and_store,
+            args=(inp,),
+            daemon=True,
+            name=f"ragas-eval-{inp.query_log_id}",
+        )
+        thread.start()
+    def evaluate(self, inp: EvalInput) -> EvalResult:
+        """Blocking evaluation. Returns EvalResult."""
+        result = EvalResult()
+        # ── Custom metrics (no LLM, always computed) ──────────
+        result.context_utilisation = self._context_utilisation(inp.answer, inp.chunks)
+        result.mean_chunk_score    = self._mean_chunk_score(inp.chunks)
+        # ── RAGAS metrics (LLM-based, may be skipped) ─────────
+        if self._ragas_available and inp.chunks:
+            ragas_scores = self._run_ragas(inp)
+            result.faithfulness      = ragas_scores.get("faithfulness")
+            result.answer_relevancy  = ragas_scores.get("answer_relevancy")
+            result.context_precision = ragas_scores.get("context_precision")
+        else:
+            if not self._ragas_available:
+                logger.debug("RAGAS not installed — only custom metrics computed.")
+        return result
+    # ── Private ────────────────────────────────────────────────
+    def _run_and_store(self, inp: EvalInput) -> None:
+        try:
+            result = self.evaluate(inp)
+            self._store.log_metrics(result.as_store_entry(inp.query_log_id))
+            logger.debug(
+                "Eval stored for query %d: faith=%.2f rel=%.2f prec=%.2f util=%.2f",
+                inp.query_log_id,
+                result.faithfulness      or 0,
+                result.answer_relevancy  or 0,
+                result.context_precision or 0,
+                result.context_utilisation or 0,
+            )
+        except Exception as exc:
+            logger.warning("Eval failed for query %d: %s", inp.query_log_id, exc)
+    def _run_ragas(self, inp: EvalInput) -> dict:
+        """
+        Call RAGAS library. Returns dict of metric_name → score.
+        Returns empty dict on any failure.
+        """
+        try:
+            from datasets import Dataset  # type: ignore
+            from ragas import evaluate as ragas_evaluate  # type: ignore
+            from ragas.metrics import (  # type: ignore
+                answer_relevancy,
+                context_precision,
+                faithfulness,
+            )
+            from config import get_settings
+            cfg = get_settings()
+            # RAGAS expects a HuggingFace Dataset
+            data = {
+                "question":  [inp.query],
+                "answer":    [inp.answer],
+                "contexts":  [[c.parent_text or c.text for c in inp.chunks]],
+                # reference not available at runtime — omit context_recall
+            }
+            dataset = Dataset.from_dict(data)
+            scores = ragas_evaluate(
+                dataset,
+                metrics=[faithfulness, answer_relevancy, context_precision],
+                raise_exceptions=False,
+            )
+            df = scores.to_pandas()
+            return {
+                "faithfulness":      float(df["faithfulness"].iloc[0])      if "faithfulness"      in df else None,
+                "answer_relevancy":  float(df["answer_relevancy"].iloc[0])  if "answer_relevancy"  in df else None,
+                "context_precision": float(df["context_precision"].iloc[0]) if "context_precision" in df else None,
+            }
+        except Exception as exc:
+            logger.warning("RAGAS evaluation failed: %s", exc)
+            return {}
+    # ── Custom metrics (no LLM required) ──────────────────────
+    @staticmethod
+    def _context_utilisation(answer: str, chunks: list[RetrievedChunk]) -> float:
+        """
+        Fraction of retrieved chunks cited in the answer.
+        Looks for inline [N] citation markers.
+        """
+        if not chunks:
+            return 0.0
+        cited_indices = set(int(n) for n in re.findall(r"\[(\d+)\]", answer))
+        cited = sum(1 for i in range(1, len(chunks) + 1) if i in cited_indices)
+        return round(cited / len(chunks), 3)
+    @staticmethod
+    def _mean_chunk_score(chunks: list[RetrievedChunk]) -> float:
+        """Average retrieval score of the final chunks."""
+        if not chunks:
+            return 0.0
+        return round(sum(c.score for c in chunks) / len(chunks), 3)
+    @staticmethod
+    def _check_ragas() -> bool:
+        try:
+            import ragas  # type: ignore  # noqa: F401
+            import datasets  # type: ignore  # noqa: F401
+            return True
+        except ImportError:
+            return False

evaluation/store.py ADDED Viewed

	@@ -0,0 +1,276 @@

+"""
+Cortex RAG — Evaluation Store (SQLite)
+Two tables:
+  query_logs   — one row per query: routing, CRAG grade, latency, chunk scores
+  eval_metrics — one row per query: RAGAS scores (written async after generation)
+SQLite is the right choice here: zero infrastructure, works on Railway/Render
+out of the box, and a dashboard corpus of ~10k queries fits in <50MB.
+Swap to Postgres trivially later by changing the connection string.
+The store is intentionally append-only. No deletes, no updates.
+This preserves the full history for trend analysis in the dashboard.
+"""
+from __future__ import annotations
+import json
+import logging
+import sqlite3
+import time
+from contextlib import contextmanager
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Optional
+logger = logging.getLogger(__name__)
+_DEFAULT_DB_PATH = Path("data/cortex_eval.db")
+# ── Schema ─────────────────────────────────────────────────────
+_DDL = """
+CREATE TABLE IF NOT EXISTS query_logs (
+    id              INTEGER PRIMARY KEY AUTOINCREMENT,
+    timestamp       REAL    NOT NULL,
+    query           TEXT    NOT NULL,
+    intent          TEXT,
+    strategies      TEXT,           -- JSON list
+    retriever_hits  TEXT,           -- JSON dict
+    crag_grade      TEXT,
+    crag_rewritten  INTEGER DEFAULT 0,   -- bool
+    web_search_used INTEGER DEFAULT 0,   -- bool
+    num_chunks      INTEGER DEFAULT 0,
+    top_chunk_score REAL    DEFAULT 0.0,
+    latency_ms      REAL    DEFAULT 0.0,
+    model           TEXT,
+    extractor       TEXT
+);
+CREATE TABLE IF NOT EXISTS eval_metrics (
+    id                  INTEGER PRIMARY KEY AUTOINCREMENT,
+    query_log_id        INTEGER NOT NULL REFERENCES query_logs(id),
+    timestamp           REAL    NOT NULL,
+    faithfulness        REAL,   -- 0-1: does answer contradict context?
+    answer_relevancy    REAL,   -- 0-1: does answer address the question?
+    context_precision   REAL,   -- 0-1: are retrieved chunks relevant?
+    context_utilisation REAL,   -- 0-1: fraction of chunks cited in answer
+    mean_chunk_score    REAL    -- average retrieval score of final chunks
+);
+CREATE INDEX IF NOT EXISTS idx_query_logs_ts   ON query_logs(timestamp);
+CREATE INDEX IF NOT EXISTS idx_eval_metrics_id ON eval_metrics(query_log_id);
+"""
+# ── Dataclasses ────────────────────────────────────────────────
+@dataclass
+class QueryLogEntry:
+    query: str
+    intent: str = ""
+    strategies: list[str] = None
+    retriever_hits: dict = None
+    crag_grade: str = ""
+    crag_rewritten: bool = False
+    web_search_used: bool = False
+    num_chunks: int = 0
+    top_chunk_score: float = 0.0
+    latency_ms: float = 0.0
+    model: str = ""
+    extractor: str = ""
+    def __post_init__(self):
+        if self.strategies is None:
+            self.strategies = []
+        if self.retriever_hits is None:
+            self.retriever_hits = {}
+@dataclass
+class EvalMetricEntry:
+    query_log_id: int
+    faithfulness: Optional[float] = None
+    answer_relevancy: Optional[float] = None
+    context_precision: Optional[float] = None
+    context_utilisation: Optional[float] = None
+    mean_chunk_score: Optional[float] = None
+# ── Store ──────────────────────────────────────────────────────
+class EvalStore:
+    """
+    Thread-safe SQLite-backed store for query logs and eval metrics.
+    Usage:
+        store = EvalStore()
+        log_id = store.log_query(entry)
+        store.log_metrics(EvalMetricEntry(query_log_id=log_id, faithfulness=0.92, ...))
+    """
+    def __init__(self, db_path: str | Path = _DEFAULT_DB_PATH) -> None:
+        self._path = Path(db_path)
+        self._path.parent.mkdir(parents=True, exist_ok=True)
+        self._init_db()
+    # ── Write ──────────────────────────────────────────────────
+    def log_query(self, entry: QueryLogEntry) -> int:
+        """Insert a query log row. Returns the new row id."""
+        with self._conn() as conn:
+            cur = conn.execute(
+                """INSERT INTO query_logs
+                   (timestamp, query, intent, strategies, retriever_hits,
+                    crag_grade, crag_rewritten, web_search_used,
+                    num_chunks, top_chunk_score, latency_ms, model, extractor)
+                   VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?)""",
+                (
+                    time.time(),
+                    entry.query,
+                    entry.intent,
+                    json.dumps(entry.strategies),
+                    json.dumps(entry.retriever_hits),
+                    entry.crag_grade,
+                    int(entry.crag_rewritten),
+                    int(entry.web_search_used),
+                    entry.num_chunks,
+                    entry.top_chunk_score,
+                    entry.latency_ms,
+                    entry.model,
+                    entry.extractor,
+                ),
+            )
+            return cur.lastrowid
+    def log_metrics(self, entry: EvalMetricEntry) -> None:
+        """Insert an eval_metrics row."""
+        with self._conn() as conn:
+            conn.execute(
+                """INSERT INTO eval_metrics
+                   (query_log_id, timestamp, faithfulness, answer_relevancy,
+                    context_precision, context_utilisation, mean_chunk_score)
+                   VALUES (?,?,?,?,?,?,?)""",
+                (
+                    entry.query_log_id,
+                    time.time(),
+                    entry.faithfulness,
+                    entry.answer_relevancy,
+                    entry.context_precision,
+                    entry.context_utilisation,
+                    entry.mean_chunk_score,
+                ),
+            )
+    # ── Read ───────────────────────────────────────────────────
+    def get_recent_queries(self, limit: int = 100) -> list[dict]:
+        """Last N query logs joined with their eval metrics (if available)."""
+        with self._conn() as conn:
+            rows = conn.execute(
+                """SELECT q.id, q.timestamp, q.query, q.intent, q.strategies,
+                          q.crag_grade, q.web_search_used, q.num_chunks,
+                          q.top_chunk_score, q.latency_ms,
+                          e.faithfulness, e.answer_relevancy,
+                          e.context_precision, e.context_utilisation,
+                          e.mean_chunk_score
+                   FROM query_logs q
+                   LEFT JOIN eval_metrics e ON e.query_log_id = q.id
+                   ORDER BY q.timestamp DESC
+                   LIMIT ?""",
+                (limit,),
+            ).fetchall()
+        return [self._row_to_dict(r) for r in rows]
+    def get_metric_timeseries(self, days: int = 7) -> list[dict]:
+        """
+        Hourly-bucketed metric averages over the last N days.
+        Used for the trend line chart in the dashboard.
+        """
+        since = time.time() - days * 86400
+        with self._conn() as conn:
+            rows = conn.execute(
+                """SELECT
+                       CAST((q.timestamp - ?) / 3600 AS INTEGER) AS hour_bucket,
+                       AVG(e.faithfulness)        AS faithfulness,
+                       AVG(e.answer_relevancy)    AS answer_relevancy,
+                       AVG(e.context_precision)   AS context_precision,
+                       AVG(e.mean_chunk_score)    AS mean_chunk_score,
+                       COUNT(*)                   AS query_count
+                   FROM query_logs q
+                   JOIN eval_metrics e ON e.query_log_id = q.id
+                   WHERE q.timestamp > ?
+                   GROUP BY hour_bucket
+                   ORDER BY hour_bucket""",
+                (since, since),
+            ).fetchall()
+        return [dict(zip(
+            ["hour_bucket", "faithfulness", "answer_relevancy",
+             "context_precision", "mean_chunk_score", "query_count"], r
+        )) for r in rows]
+    def get_summary_stats(self) -> dict:
+        """Aggregate stats for the dashboard header metrics."""
+        with self._conn() as conn:
+            total = conn.execute("SELECT COUNT(*) FROM query_logs").fetchone()[0]
+            with_metrics = conn.execute("SELECT COUNT(*) FROM eval_metrics").fetchone()[0]
+            avgs = conn.execute(
+                """SELECT AVG(faithfulness), AVG(answer_relevancy),
+                          AVG(context_precision), AVG(mean_chunk_score)
+                   FROM eval_metrics"""
+            ).fetchone()
+            grade_dist = conn.execute(
+                """SELECT crag_grade, COUNT(*) as cnt
+                   FROM query_logs WHERE crag_grade != ''
+                   GROUP BY crag_grade"""
+            ).fetchall()
+            strategy_dist = conn.execute(
+                """SELECT strategies, COUNT(*) as cnt
+                   FROM query_logs GROUP BY strategies"""
+            ).fetchall()
+            avg_latency = conn.execute(
+                "SELECT AVG(latency_ms) FROM query_logs WHERE latency_ms > 0"
+            ).fetchone()[0]
+        return {
+            "total_queries": total,
+            "evaluated_queries": with_metrics,
+            "avg_faithfulness":      round(avgs[0] or 0, 3),
+            "avg_answer_relevancy":  round(avgs[1] or 0, 3),
+            "avg_context_precision": round(avgs[2] or 0, 3),
+            "avg_chunk_score":       round(avgs[3] or 0, 3),
+            "avg_latency_ms":        round(avg_latency or 0, 1),
+            "crag_grade_dist":  {r[0]: r[1] for r in grade_dist},
+            "strategy_dist":    {r[0]: r[1] for r in strategy_dist},
+        }
+    # ── Init ───────────────────────────────────────────────────
+    def _init_db(self) -> None:
+        with self._conn() as conn:
+            conn.executescript(_DDL)
+        logger.info("EvalStore ready at %s", self._path)
+    @contextmanager
+    def _conn(self):
+        """
+        Yield a fresh SQLite connection to ``self._path`` for one unit of work.
+        The connection is committed when the ``with`` body finishes normally,
+        rolled back (and the exception re-raised) when it raises, and closed
+        in either case.
+        """
+        conn = sqlite3.connect(self._path, timeout=10, check_same_thread=False)
+        # sqlite3.Row lets callers read columns by name (e.g. dict(row)).
+        conn.row_factory = sqlite3.Row
+        try:
+            yield conn
+            # Kept inside the try block: a failing commit also takes the
+            # rollback path below.
+            conn.commit()
+        except Exception:
+            conn.rollback()
+            raise
+        finally:
+            conn.close()
+    @staticmethod
+    def _row_to_dict(row) -> dict:
+        d = dict(row)
+        for key in ("strategies",):
+            if d.get(key):
+                try:
+                    d[key] = json.loads(d[key])
+                except Exception:
+                    pass
+        return d

generation/crag.py ADDED Viewed

	@@ -0,0 +1,402 @@

+"""
+Cortex RAG — Corrective RAG (CRAG) Gate (Phase 3)
+The problem CRAG solves
+────────────────────────
+Standard RAG always passes retrieved chunks to the LLM, even when:
+  - The query is ambiguous and the retrieved chunks are off-topic
+  - The knowledge base simply doesn't contain the answer
+  - The retrieved chunks contradict each other
+In all three cases, the LLM will either hallucinate or produce a
+confused answer. CRAG adds a grading step BEFORE generation:
+                 ┌─── GOOD ────► Generator (proceed normally)
+Query → Retrieve ┤
+                 ├─── POOR ────► Rewrite query → Re-retrieve → Generator
+                 └─── ABSENT ──► Web search fallback → Generator
+Grading
+────────
+An LLM-as-judge evaluates (query, retrieved_chunks) and returns:
+  {
+    "grade": "GOOD" | "POOR" | "ABSENT",
+    "relevance_score": 0.0–1.0,
+    "has_sufficient_context": true | false,
+    "reasoning": "..."
+  }
+Grade definitions:
+  GOOD    — chunks are relevant and sufficient for the query
+  POOR    — chunks are partially relevant; try rewriting the query
+  ABSENT  — knowledge base clearly doesn't contain the answer;
+             fall back to web search
+Query rewriting
+────────────────
+When grade == POOR, we expand the query using chain-of-thought:
+the grader's `reasoning` field (why did retrieval fail?) is fed
+back as context for a rewrite prompt. This makes the rewrite
+semantically targeted, not just rephrased.
+Web search fallback
+────────────────────
+When grade == ABSENT, we call Tavily (preferred) or DuckDuckGo
+(no API key needed) and package the top-3 web results as synthetic
+RetrievedChunk objects with source="web_search". These flow into
+the same generator unchanged.
+"""
+from __future__ import annotations
+import json
+import logging
+import re
+from dataclasses import dataclass
+from enum import Enum
+from typing import Callable, Optional
+from config import get_settings
+from retrieval.dense import RetrievedChunk
+logger = logging.getLogger(__name__)
+# ── Grade enum ─────────────────────────────────────────────────
+class RetrievalGrade(str, Enum):
+    GOOD   = "GOOD"     # proceed to generation
+    POOR   = "POOR"     # rewrite query and re-retrieve
+    ABSENT = "ABSENT"   # fall back to web search
+# ── Result dataclass ───────────────────────────────────────────
+@dataclass
+class CRAGResult:
+    grade: RetrievalGrade
+    relevance_score: float
+    has_sufficient_context: bool
+    reasoning: str
+    final_chunks: list[RetrievedChunk]    # chunks to pass to generator
+    rewritten_query: Optional[str] = None # set if grade was POOR
+    web_search_used: bool = False
+# ── Prompt templates ───────────────────────────────────────────
+# Grader prompt, filled via str.format(query=..., passages=...). The doubled
+# braces render as literal { } so the JSON example survives formatting.
+_GRADER_PROMPT = """\
+You are a retrieval quality judge. Given a user query and retrieved passages,
+assess whether the passages contain sufficient information to answer the query.
+Return ONLY a JSON object in this exact format (no markdown, no preamble):
+{{
+  "grade": "<GOOD|POOR|ABSENT>",
+  "relevance_score": <float 0.0-1.0>,
+  "has_sufficient_context": <true|false>,
+  "reasoning": "<one sentence explaining your assessment>"
+}}
+Grades:
+  GOOD   — passages are clearly relevant and contain enough information to answer
+  POOR   — passages are partially relevant but incomplete or off-topic; retrieval should be retried
+  ABSENT — the knowledge base clearly does not contain information about this query
+User query: {query}
+Retrieved passages:
+{passages}
+"""
+# Rewrite prompt, filled via str.format(query=..., reasoning=...); the
+# grader's reasoning is passed back in as feedback for the rewrite.
+_REWRITE_PROMPT = """\
+A retrieval system failed to find good results for the following query.
+The grader's feedback explains why the results were poor.
+Original query: {query}
+Grader feedback: {reasoning}
+Rewrite the query to be more specific and likely to retrieve better results.
+Apply these strategies: expand acronyms, add domain context, use alternative terms.
+Return ONLY the rewritten query string, no explanation.
+"""
+# ── CRAG Gate ──────────────────────────────────────────────────
+class CRAGGate:
+    """
+    Corrective RAG gate that sits between retrieval and generation.
+    Usage (in orchestrator):
+        crag = CRAGGate()
+        result = crag.evaluate(
+            query=user_query,
+            chunks=retrieved_chunks,
+            retriever_fn=retriever.retrieve,   # callable for re-retrieval
+        )
+        # result.final_chunks → pass to generator
+        # result.grade → log for evaluation dashboard
+    """
+    def __init__(self) -> None:
+        self._llm = None
+    # ── Public API ─────────────────────────────────────────────
+    def evaluate(
+        self,
+        query: str,
+        chunks: list[RetrievedChunk],
+        retriever_fn: Optional[callable] = None,
+        max_retries: int = 1,
+    ) -> CRAGResult:
+        """
+        Grade retrieved chunks and apply corrective action if needed.
+        Args:
+            query:        the user's original query
+            chunks:       chunks returned by the retrieval pipeline
+            retriever_fn: callable(query: str) → list[RetrievedChunk]
+                          used for re-retrieval on POOR grade
+            max_retries:  max number of rewrite+re-retrieve cycles
+        """
+        # Grade the initial retrieval
+        grade_result = self._grade(query, chunks)
+        logger.info(
+            "CRAG grade: %s (score=%.2f, sufficient=%s) — %s",
+            grade_result["grade"],
+            grade_result["relevance_score"],
+            grade_result["has_sufficient_context"],
+            grade_result["reasoning"][:80],
+        )
+        grade = RetrievalGrade(grade_result["grade"])
+        # ── GOOD: pass through unchanged ──────────────────────
+        if grade == RetrievalGrade.GOOD:
+            return CRAGResult(
+                grade=grade,
+                relevance_score=grade_result["relevance_score"],
+                has_sufficient_context=True,
+                reasoning=grade_result["reasoning"],
+                final_chunks=chunks,
+            )
+        # ── POOR: rewrite query and re-retrieve ───────────────
+        if grade == RetrievalGrade.POOR and retriever_fn and max_retries > 0:
+            rewritten = self._rewrite_query(query, grade_result["reasoning"])
+            logger.info("CRAG rewrite: '%s' → '%s'", query[:50], rewritten[:50])
+            try:
+                new_chunks = retriever_fn(rewritten)
+                # Re-grade the new results (once — no infinite loop)
+                new_grade = self._grade(rewritten, new_chunks)
+                return CRAGResult(
+                    grade=RetrievalGrade(new_grade["grade"]),
+                    relevance_score=new_grade["relevance_score"],
+                    has_sufficient_context=new_grade["has_sufficient_context"],
+                    reasoning=new_grade["reasoning"],
+                    final_chunks=new_chunks or chunks,  # fall back if retry also empty
+                    rewritten_query=rewritten,
+                )
+            except Exception as exc:
+                logger.warning("Re-retrieval after rewrite failed: %s", exc)
+                # Fall through to returning original chunks with POOR grade
+                return CRAGResult(
+                    grade=grade,
+                    relevance_score=grade_result["relevance_score"],
+                    has_sufficient_context=False,
+                    reasoning=grade_result["reasoning"],
+                    final_chunks=chunks,
+                    rewritten_query=rewritten,
+                )
+        # ── ABSENT: web search fallback ────────────────────────
+        if grade == RetrievalGrade.ABSENT:
+            web_chunks = self._web_search_fallback(query)
+            if web_chunks:
+                return CRAGResult(
+                    grade=grade,
+                    relevance_score=0.0,
+                    has_sufficient_context=True,
+                    reasoning=grade_result["reasoning"],
+                    final_chunks=web_chunks,
+                    web_search_used=True,
+                )
+            # Web search also failed — return original chunks with warning
+            return CRAGResult(
+                grade=grade,
+                relevance_score=0.0,
+                has_sufficient_context=False,
+                reasoning=f"Knowledge base: {grade_result['reasoning']}. Web search also returned no results.",
+                final_chunks=chunks,
+            )
+        # Default: return original chunks unchanged
+        return CRAGResult(
+            grade=grade,
+            relevance_score=grade_result["relevance_score"],
+            has_sufficient_context=grade_result["has_sufficient_context"],
+            reasoning=grade_result["reasoning"],
+            final_chunks=chunks,
+        )
+    # ── LLM grader ────────────────────────────────────────────
+    def _grade(self, query: str, chunks: list[RetrievedChunk]) -> dict:
+        """Call LLM to grade retrieval quality. Returns parsed dict."""
+        if not chunks:
+            return {
+                "grade": "ABSENT",
+                "relevance_score": 0.0,
+                "has_sufficient_context": False,
+                "reasoning": "No chunks were retrieved.",
+            }
+        passages = "\n\n".join(
+            f"[{i}] {c.title}: {c.text[:400]}"
+            for i, c in enumerate(chunks[:5], 1)
+        )
+        try:
+            client = self._get_llm()
+            cfg = get_settings()
+            response = client.chat.completions.create(
+                model=cfg.groq_model,
+                messages=[{
+                    "role": "user",
+                    "content": _GRADER_PROMPT.format(query=query, passages=passages),
+                }],
+                temperature=0.0,
+                max_tokens=200,
+            )
+            raw = response.choices[0].message.content or "{}"
+            return self._parse_grade(raw)
+        except Exception as exc:
+            logger.warning("CRAG grader LLM call failed: %s", exc)
+            # Safe default: assume GOOD to avoid blocking the pipeline
+            return {
+                "grade": "GOOD",
+                "relevance_score": 0.5,
+                "has_sufficient_context": True,
+                "reasoning": f"Grader unavailable ({exc}); passing through.",
+            }
+    def _parse_grade(self, raw: str) -> dict:
+        raw = raw.strip()
+        if raw.startswith("```"):
+            raw = re.sub(r"^```[a-z]*\n?", "", raw)
+            raw = re.sub(r"\n?```$", "", raw)
+        try:
+            data = json.loads(raw)
+        except json.JSONDecodeError:
+            return {
+                "grade": "GOOD", "relevance_score": 0.5,
+                "has_sufficient_context": True, "reasoning": "Parse error.",
+            }
+        grade_str = data.get("grade", "GOOD").upper()
+        if grade_str not in {"GOOD", "POOR", "ABSENT"}:
+            grade_str = "GOOD"
+        return {
+            "grade": grade_str,
+            "relevance_score": float(data.get("relevance_score", 0.5)),
+            "has_sufficient_context": bool(data.get("has_sufficient_context", True)),
+            "reasoning": str(data.get("reasoning", "")),
+        }
+    # ── Query rewriter ────────────────────────────────────────
+    def _rewrite_query(self, original_query: str, reasoning: str) -> str:
+        try:
+            client = self._get_llm()
+            cfg = get_settings()
+            response = client.chat.completions.create(
+                model=cfg.groq_model,
+                messages=[{
+                    "role": "user",
+                    "content": _REWRITE_PROMPT.format(
+                        query=original_query, reasoning=reasoning
+                    ),
+                }],
+                temperature=0.3,
+                max_tokens=128,
+            )
+            rewritten = (response.choices[0].message.content or "").strip()
+            return rewritten if rewritten else original_query
+        except Exception as exc:
+            logger.warning("Query rewrite failed: %s", exc)
+            return original_query
+    # ── Web search fallback ───────────────────────────────────
+    def _web_search_fallback(self, query: str) -> list[RetrievedChunk]:
+        """
+        Try Tavily first (better quality), then DuckDuckGo (no API key).
+        Returns synthetic RetrievedChunk objects from web results.
+        """
+        chunks = self._tavily_search(query) or self._duckduckgo_search(query)
+        if chunks:
+            logger.info("CRAG web fallback: %d results for '%s'", len(chunks), query[:50])
+        return chunks
+    def _tavily_search(self, query: str) -> list[RetrievedChunk]:
+        try:
+            from tavily import TavilyClient  # type: ignore
+            cfg = get_settings()
+            api_key = cfg.tavily_api_key
+            if not api_key:
+                return []
+            client = TavilyClient(api_key=api_key)
+            results = client.search(query, max_results=3)
+            return [
+                self._web_result_to_chunk(r.get("content", ""), r.get("url", ""), r.get("title", "Web"))
+                for r in results.get("results", [])
+                if r.get("content")
+            ]
+        except Exception:
+            return []
+    def _duckduckgo_search(self, query: str) -> list[RetrievedChunk]:
+        try:
+            from duckduckgo_search import DDGS  # type: ignore
+            results = []
+            with DDGS() as ddgs:
+                for r in ddgs.text(query, max_results=3):
+                    results.append(
+                        self._web_result_to_chunk(
+                            r.get("body", ""), r.get("href", ""), r.get("title", "Web")
+                        )
+                    )
+            return results
+        except Exception:
+            return []
+    @staticmethod
+    def _web_result_to_chunk(text: str, url: str, title: str) -> RetrievedChunk:
+        import hashlib
+        cid = hashlib.sha256(url.encode()).hexdigest()[:16]
+        return RetrievedChunk(
+            chunk_id=cid,
+            doc_id="web",
+            source=url,
+            title=title,
+            text=text[:1500],
+            parent_text=text[:1500],
+            chunk_index=0,
+            score=0.6,       # neutral score for web results
+            retriever="web_search",
+        )
+    # ── Groq client ───────────────────────────────────────────
+    def _get_llm(self):
+        if self._llm is None:
+            cfg = get_settings()
+            if not cfg.groq_api_key:
+                raise RuntimeError("GROQ_API_KEY not set")
+            from groq import Groq  # type: ignore
+            self._llm = Groq(api_key=cfg.groq_api_key)
+        return self._llm

ingestion/pipeline.py CHANGED Viewed

@@ -22,6 +22,7 @@ from ingestion.document_loader import Document, DocumentLoader
 from retrieval.embedder import Embedder
 from retrieval.dense import MilvusStore
 from retrieval.bm25 import BM25Retriever
 logger = logging.getLogger(__name__)
@@ -43,12 +44,15 @@ class IngestionPipeline:
         embedder: Optional[Embedder] = None,
         store: Optional[MilvusStore] = None,
         bm25: Optional[BM25Retriever] = None,
     ) -> None:
         self._loader = loader or DocumentLoader()
         self._embedder = embedder or Embedder()
         self._chunker = chunker or SemanticChunker(embedder=self._embedder)
         self._store = store or MilvusStore(embedder=self._embedder)
         self._bm25  = bm25  or BM25Retriever()
     # ── Public ─────────────────────────────────────────────────
@@ -133,7 +137,16 @@ class IngestionPipeline:
         except Exception as exc:
             logger.error("BM25 indexing failed: %s", exc)
             stats["errors"].append({"source": "bm25_index", "error": str(exc)})
         elapsed = time.perf_counter() - t0
         logger.info(
             "Ingestion complete in %.1fs — %d docs, %d chunks stored.",

 from retrieval.embedder import Embedder
 from retrieval.dense import MilvusStore
 from retrieval.bm25 import BM25Retriever
+from retrieval.graph_builder import KnowledgeGraphBuilder
 logger = logging.getLogger(__name__)
         embedder: Optional[Embedder] = None,
         store: Optional[MilvusStore] = None,
         bm25: Optional[BM25Retriever] = None,
+        graph: Optional[KnowledgeGraphBuilder] = None,
     ) -> None:
         self._loader = loader or DocumentLoader()
         self._embedder = embedder or Embedder()
         self._chunker = chunker or SemanticChunker(embedder=self._embedder)
         self._store = store or MilvusStore(embedder=self._embedder)
         self._bm25  = bm25  or BM25Retriever()
+        self._graph = graph or KnowledgeGraphBuilder()
     # ── Public ─────────────────────────────────────────────────
         except Exception as exc:
             logger.error("BM25 indexing failed: %s", exc)
             stats["errors"].append({"source": "bm25_index", "error": str(exc)})
+        # ── Build knowledge graph (NER + relations) — Phase 3 ──
+        try:
+            graph_stats = self._graph.process_chunks(all_chunks)
+            stats["graph_entities"] = graph_stats.get("entities", 0)
+            stats["graph_triples"]  = graph_stats.get("triples", 0)
+        except Exception as exc:
+            logger.error("Graph extraction failed: %s", exc)
+            stats["errors"].append({"source": "graph_build", "error": str(exc)})
         elapsed = time.perf_counter() - t0
         logger.info(
             "Ingestion complete in %.1fs — %d docs, %d chunks stored.",

retrieval/cache.py ADDED Viewed

	@@ -0,0 +1,241 @@

+"""
+Cortex RAG — Retrieval Cache (Redis, Phase 4)
+What gets cached
+─────────────────
+The output of the full retrieval pipeline — after RRF fusion and
+cross-encoder reranking — is serialised and stored in Redis with a
+configurable TTL (default 1 hour).
+Cache key: "cortex:retrieval:" + first 24 hex chars of SHA-256("<query.lower().strip()>:<top_k>")
+This means the same query with different capitalisation or trailing
+spaces hits the same cache entry, which is almost always correct for RAG.
+What does NOT get cached
+─────────────────────────
+CRAG evaluation and generation are NOT cached. The CRAG grade depends
+on the current state of the knowledge base (which changes after ingestion),
+and generation is fast enough (streaming) that caching it adds complexity
+without meaningful latency savings.
+Graceful degradation
+─────────────────────
+If Redis is unreachable on startup, the cache silently disables itself
+and logs a warning. Every query falls through to the live retrieval
+pipeline unchanged. No exceptions surface to the user.
+This means you can develop without Redis running locally and only enable
+it in production (Railway, Render) where Redis add-ons are available.
+"""
+from __future__ import annotations
+import hashlib
+import json
+import logging
+from typing import Optional
+from retrieval.dense import RetrievedChunk
+from retrieval.orchestrator import MultiStrategyRetriever, RetrievalResult
+from retrieval.router import QueryIntent, RoutingDecision
+logger = logging.getLogger(__name__)
+def _make_cache_key(query: str, top_k: int) -> str:
+    raw = f"{query.lower().strip()}:{top_k}"
+    return "cortex:retrieval:" + hashlib.sha256(raw.encode()).hexdigest()[:24]
+def _serialise_result(result: RetrievalResult) -> str:
+    """JSON-serialise a RetrievalResult for Redis storage."""
+    return json.dumps({
+        "chunks": [
+            {
+                "chunk_id":    c.chunk_id,
+                "doc_id":      c.doc_id,
+                "source":      c.source,
+                "title":       c.title,
+                "text":        c.text,
+                "parent_text": c.parent_text,
+                "chunk_index": c.chunk_index,
+                "score":       c.score,
+                "retriever":   c.retriever,
+            }
+            for c in result.chunks
+        ],
+        "decision": {
+            "intent":        result.decision.intent.value,
+            "strategies":    result.decision.strategies,
+            "confidence":    result.decision.confidence,
+            "reasoning":     result.decision.reasoning,
+        },
+        "retriever_hits": result.retriever_hits,
+    })
+def _deserialise_result(raw: str) -> RetrievalResult:
+    """Reconstruct a RetrievalResult from its JSON representation."""
+    data = json.loads(raw)
+    chunks = [
+        RetrievedChunk(
+            chunk_id=c["chunk_id"],
+            doc_id=c["doc_id"],
+            source=c["source"],
+            title=c["title"],
+            text=c["text"],
+            parent_text=c["parent_text"],
+            chunk_index=c["chunk_index"],
+            score=c["score"],
+            retriever=c["retriever"],
+        )
+        for c in data["chunks"]
+    ]
+    d = data["decision"]
+    decision = RoutingDecision(
+        intent=QueryIntent(d["intent"]),
+        strategies=d["strategies"],
+        confidence=d["confidence"],
+        reasoning=d["reasoning"],
+    )
+    return RetrievalResult(
+        chunks=chunks,
+        decision=decision,
+        retriever_hits=data.get("retriever_hits", {}),
+    )
+class CachedRetriever:
+    """
+    Drop-in wrapper around MultiStrategyRetriever that adds Redis caching.
+    Usage (replaces MultiStrategyRetriever in api/main.py):
+        retriever = CachedRetriever(MultiStrategyRetriever(...))
+        result = retriever.retrieve(query)
+        print(retriever.cache_stats())  # {"hits": 3, "misses": 7, "enabled": True}
+    """
+    def __init__(
+        self,
+        inner: MultiStrategyRetriever,
+        ttl_seconds: Optional[int] = None,
+    ) -> None:
+        self._inner = inner
+        self._redis = self._connect_redis()
+        self._ttl   = ttl_seconds or self._default_ttl()
+        self._hits   = 0
+        self._misses = 0
+    # ── Public API (matches MultiStrategyRetriever interface) ──
+    def retrieve(
+        self,
+        query: str,
+        top_k_candidates: Optional[int] = None,
+        final_top_k: Optional[int] = None,
+    ) -> RetrievalResult:
+        """
+        Retrieve with cache. Falls through to live retrieval on miss or error.
+        """
+        from config import get_settings
+        cfg = get_settings()
+        k = final_top_k or cfg.final_top_k
+        key = _make_cache_key(query, k)
+        # ── Cache lookup ───────────────────────────────────────
+        if self._redis:
+            try:
+                cached = self._redis.get(key)
+                if cached:
+                    self._hits += 1
+                    logger.debug("Cache HIT for query: %s…", query[:40])
+                    result = _deserialise_result(cached)
+                    result.from_cache = True
+                    return result
+            except Exception as exc:
+                logger.warning("Redis GET failed: %s — falling through.", exc)
+        # ── Cache miss: live retrieval ─────────────────────────
+        self._misses += 1
+        logger.debug("Cache MISS for query: %s…", query[:40])
+        result = self._inner.retrieve(query, top_k_candidates, final_top_k)
+        result.from_cache = False
+        # ── Write to cache ─────────────────────────────────────
+        if self._redis and not result.empty:
+            try:
+                self._redis.setex(key, self._ttl, _serialise_result(result))
+            except Exception as exc:
+                logger.warning("Redis SET failed: %s", exc)
+        return result
+    def invalidate(self, query: str, top_k: int) -> bool:
+        """Manually invalidate a cache entry (e.g. after re-ingestion)."""
+        if not self._redis:
+            return False
+        try:
+            return bool(self._redis.delete(_make_cache_key(query, top_k)))
+        except Exception:
+            return False
+    def flush_all(self) -> int:
+        """Delete all Cortex cache keys. Returns count deleted."""
+        if not self._redis:
+            return 0
+        try:
+            keys = self._redis.keys("cortex:retrieval:*")
+            if keys:
+                return self._redis.delete(*keys)
+            return 0
+        except Exception:
+            return 0
+    def cache_stats(self) -> dict:
+        total = self._hits + self._misses
+        return {
+            "enabled":   self._redis is not None,
+            "hits":      self._hits,
+            "misses":    self._misses,
+            "hit_rate":  round(self._hits / total, 3) if total else 0.0,
+            "ttl_s":     self._ttl,
+        }
+    # ── Pass-through for orchestrator methods ──────────────────
+    def index_chunks(self, chunks: list) -> int:
+        return self._inner.index_chunks(chunks)
+    def build_graph(self, chunks: list) -> dict:
+        return self._inner.build_graph(chunks)
+    @property
+    def graph_builder(self):
+        return self._inner.graph_builder
+    # ── Redis connection ───────────────────────────────────────
+    @staticmethod
+    def _connect_redis():
+        from config import get_settings
+        cfg = get_settings()
+        url = getattr(cfg, "redis_url", "redis://localhost:6379")
+        try:
+            import redis  # type: ignore
+            client = redis.from_url(url, socket_connect_timeout=2, decode_responses=True)
+            client.ping()
+            logger.info("Redis cache connected at %s", url)
+            return client
+        except ImportError:
+            logger.info("redis-py not installed — cache disabled. pip install redis")
+            return None
+        except Exception as exc:
+            logger.warning("Redis unavailable (%s) — cache disabled.", exc)
+            return None
+    @staticmethod
+    def _default_ttl() -> int:
+        from config import get_settings
+        return getattr(get_settings(), "cache_ttl_seconds", 3600)

retrieval/graph_builder.py ADDED Viewed

	@@ -0,0 +1,278 @@

+"""
+Cortex RAG — Knowledge Graph Builder (Phase 3)
+What this does
+──────────────
+During ingestion, every chunk is processed to extract:
+  1. Named entities  (spaCy NER: PERSON, ORG, WORK_OF_ART, PRODUCT, …)
+  2. Relations       (few-shot LLM: subject → predicate → object triples)
+These are assembled into a NetworkX undirected graph where:
+  - Nodes  = entities (label + type + first-seen source)
+  - Edges  = relations (predicate label + list of source chunk_ids)
+Each node also carries a list of chunk_ids it appeared in, so the
+graph retriever can map entity → chunks without an extra lookup.
+The graph is persisted as a JSON file (graphs are small — a 100-doc
+corpus typically has <10k nodes). On reload the full graph is
+reconstructed in seconds from the JSON.
+──────────────
+(Phase 3, refactored)
+The builder is now responsible ONLY for:
+  - spaCy NER (entities are always extracted the same way)
+  - Assembling triples into a NetworkX graph
+  - Persisting / loading the graph
+Relation extraction is delegated to a RelationExtractor strategy:
+  - REBELExtractor  (default) — local model, no API calls
+  - LLMExtractor              — LLM API (Mistral), free-form predicates
+Switch via .env:
+  GRAPH_EXTRACTOR=rebel    # default, recommended
+  GRAPH_EXTRACTOR=llm      # original method
+Or pass explicitly:
+  builder = KnowledgeGraphBuilder(extractor=LLMExtractor())
+"""
+from __future__ import annotations
+import json
+import logging
+from pathlib import Path
+from typing import Optional
+import networkx as nx
+from ingestion.chunker import Chunk
+from retrieval.relation_extractors import (
+    RelationExtractor,
+    Triple,
+    build_extractor,
+)
+logger = logging.getLogger(__name__)
+_DEFAULT_GRAPH_PATH = Path("data/knowledge_graph.json")
+# spaCy entity types we care about for RAG
+_ENTITY_TYPES = {
+    "PERSON", "ORG", "GPE", "PRODUCT", "WORK_OF_ART",
+    "EVENT", "LAW", "NORP", "FAC", "LOC",
+}
+class KnowledgeGraphBuilder:
+    """
+    Builds and maintains the knowledge graph.
+    Usage (at ingestion time):
+        # REBEL (default — no API calls)
+        builder = KnowledgeGraphBuilder()
+        builder.process_chunks(chunks)
+        # LLM method (original)
+        from retrieval.relation_extractors import LLMExtractor
+        builder = KnowledgeGraphBuilder(extractor=LLMExtractor())
+        builder.process_chunks(chunks)
+    Usage (at query time):
+        builder = KnowledgeGraphBuilder()
+        G = builder.graph    # loaded from disk automatically
+    """
+    def __init__(
+        self,
+        graph_path: str | Path = _DEFAULT_GRAPH_PATH,
+        extractor: Optional[RelationExtractor] = None,
+    ) -> None:
+        """
+        Create a builder bound to one on-disk graph file.
+        Args:
+            graph_path: JSON file the graph is loaded from (if it exists) and saved to.
+            extractor:  relation-extraction strategy; when omitted (or falsy),
+                        build_extractor() picks one.
+        """
+        self._path = Path(graph_path)
+        self._graph: nx.Graph = nx.Graph()
+        # If no extractor is injected, build_extractor() reads GRAPH_EXTRACTOR from .env
+        self._extractor: RelationExtractor = extractor or build_extractor()
+        # spaCy pipeline is loaded lazily by _get_nlp() on first use
+        self._nlp = None
+        # Replace the empty graph with the persisted one when the file is readable
+        self._load_if_exists()
+        logger.info(
+            "KnowledgeGraphBuilder ready (extractor=%s)", self._extractor.name
+        )
+    # ── Public API ─────────────────────────────────────────────
+    @property
+    def graph(self) -> nx.Graph:
+        """The live NetworkX graph object (not a copy) held by this builder."""
+        return self._graph
+    @property
+    def extractor_name(self) -> str:
+        """Short identifier (`name`) of the configured relation extractor."""
+        return self._extractor.name
+    def process_chunks(self, chunks: list[Chunk]) -> dict:
+        """
+        Extract entities and relations from chunks; update and save graph.
+        Uses the configured extractor's extract_batch() for efficiency.
+        Returns stats dict.
+        """
+        if not chunks:
+            return {"chunks": 0, "entities": 0, "triples": 0, "errors": 0}
+        stats = {"chunks": len(chunks), "entities": 0, "triples": 0, "errors": 0}
+        # ── Batch relation extraction ──────────────────────────
+        # REBEL processes all chunks in one forward pass.
+        # LLM falls back to sequential (one API call per chunk).
+        try:
+            triple_map = self._extractor.extract_batch(chunks)
+        except Exception as exc:
+            logger.error("Batch extraction failed, falling back to sequential: %s", exc)
+            triple_map = {}
+            for chunk in chunks:
+                try:
+                    triple_map[chunk.chunk_id] = self._extractor.extract(chunk)
+                except Exception as e:
+                    logger.warning("Extraction failed for %s: %s", chunk.chunk_id, e)
+                    triple_map[chunk.chunk_id] = []
+                    stats["errors"] += 1
+        # ── Entity extraction + graph update ───────────────────
+        for chunk in chunks:
+            try:
+                entities = self._extract_entities(chunk.text)
+                triples  = triple_map.get(chunk.chunk_id, [])
+                self._add_entities_to_graph(entities, chunk)
+                self._add_triples_to_graph(triples)
+                stats["entities"] += len(entities)
+                stats["triples"]  += len(triples)
+            except Exception as exc:
+                logger.warning("Graph update failed for chunk %s: %s", chunk.chunk_id, exc)
+                stats["errors"] += 1
+        self.save()
+        logger.info(
+            "Graph updated via %s: +%d entities, +%d triples (nodes=%d, edges=%d)",
+            self._extractor.name,
+            stats["entities"], stats["triples"],
+            self._graph.number_of_nodes(), self._graph.number_of_edges(),
+        )
+        return stats
+    def save(self) -> None:
+        self._path.parent.mkdir(parents=True, exist_ok=True)
+        data = nx.node_link_data(self._graph)
+        with open(self._path, "w") as fh:
+            json.dump(data, fh, indent=2)
+        logger.debug("Graph saved to %s", self._path)
+    def stats(self) -> dict:
+        """Return current node/edge counts, the extractor name and the graph file path."""
+        return {
+            "nodes":      self._graph.number_of_nodes(),
+            "edges":      self._graph.number_of_edges(),
+            "extractor":  self._extractor.name,
+            "graph_path": str(self._path),
+        }
+    # ── Entity extraction (always spaCy — same for both methods) ─
+    def _extract_entities(self, text: str) -> list[tuple[str, str]]:
+        nlp = self._get_nlp()
+        doc = nlp(text[:10_000])
+        seen: set[str] = set()
+        entities: list[tuple[str, str]] = []
+        for ent in doc.ents:
+            if ent.label_ not in _ENTITY_TYPES:
+                continue
+            normalised = ent.text.strip().title()
+            if normalised in seen or len(normalised) < 2:
+                continue
+            seen.add(normalised)
+            entities.append((normalised, ent.label_))
+        return entities
+    # ── Graph construction (shared by both methods) ────────────
+    def _add_entities_to_graph(
+        self, entities: list[tuple[str, str]], chunk: Chunk
+    ) -> None:
+        for label, etype in entities:
+            if self._graph.has_node(label):
+                existing = self._graph.nodes[label].get("chunk_ids", [])
+                if chunk.chunk_id not in existing:
+                    existing.append(chunk.chunk_id)
+                self._graph.nodes[label]["chunk_ids"] = existing
+            else:
+                self._graph.add_node(
+                    label,
+                    entity_type=etype,
+                    chunk_ids=[chunk.chunk_id],
+                    source=chunk.source,
+                )
+    def _add_triples_to_graph(self, triples: list[Triple]) -> None:
+        for triple in triples:
+            for node in (triple.subject, triple.object):
+                if not self._graph.has_node(node):
+                    self._graph.add_node(
+                        node,
+                        entity_type="UNKNOWN",
+                        chunk_ids=[],
+                        source=triple.source,
+                        extractor=triple.extractor,
+                    )
+            if self._graph.has_edge(triple.subject, triple.object):
+                edge = self._graph[triple.subject][triple.object]
+                predicates = edge.get("predicates", [])
+                chunk_ids  = edge.get("chunk_ids", [])
+                if triple.predicate not in predicates:
+                    predicates.append(triple.predicate)
+                if triple.chunk_id not in chunk_ids:
+                    chunk_ids.append(triple.chunk_id)
+                edge["predicates"] = predicates
+                edge["chunk_ids"]  = chunk_ids
+            else:
+                self._graph.add_edge(
+                    triple.subject, triple.object,
+                    predicates=[triple.predicate],
+                    chunk_ids=[triple.chunk_id],
+                    source=triple.source,
+                    extractor=triple.extractor,
+                )
+    # ── Persistence ───────────────────────────────────────────
+    def _load_if_exists(self) -> None:
+        if not self._path.exists():
+            return
+        try:
+            with open(self._path) as fh:
+                data = json.load(fh)
+            self._graph = nx.node_link_graph(data)
+            logger.info(
+                "Knowledge graph loaded: %d nodes, %d edges",
+                self._graph.number_of_nodes(),
+                self._graph.number_of_edges(),
+            )
+        except Exception as exc:
+            logger.warning("Failed to load graph (%s) — starting fresh.", exc)
+    # ── spaCy ─────────────────────────────────────────────────
+    def _get_nlp(self):
+        if self._nlp is None:
+            try:
+                import spacy  # type: ignore
+            except ImportError as exc:
+                raise RuntimeError("Install spacy: pip install spacy") from exc
+            try:
+                self._nlp = spacy.load("en_core_web_sm")
+            except OSError:
+                raise RuntimeError(
+                    "Run: python -m spacy download en_core_web_sm"
+                )
+        return self._nlp

retrieval/graph_retriever.py ADDED Viewed

	@@ -0,0 +1,271 @@

+"""
+Cortex RAG — Graph Retriever (Phase 3)
+How multi-hop retrieval works
+──────────────────────────────
+Standard dense retrieval can answer: "What is attention?"
+It cannot answer: "Who wrote the attention paper, and what did they later
+build that addresses memory bottlenecks in inference?"
+That question requires:
+  Step 1: Find entity "Attention Is All You Need" in the graph
+  Step 2: Follow "authored_by" edges → Vaswani, Shazeer, Parmar, …
+  Step 3: Follow those author nodes' other edges →
+          Shazeer: "introduced" → "Multi-Query Attention"
+          Leviathan: "developed" → "Speculative Decoding"
+  Step 4: Collect all chunk_ids linked to visited nodes
+  Step 5: Fetch those chunks from Milvus → return to RRF pool
+The BFS depth (default: 2 hops) is the key parameter. 1 hop = only
+direct neighbours; 2 hops = neighbours of neighbours. 3+ hops tends to
+explode in scope and include irrelevant context.
+Entity matching
+───────────────
+The query "Who developed PagedAttention?" must match graph nodes like
+"Paged Attention" or "PagedAttention". We do:
+  1. Exact match (case-insensitive)
+  2. Partial match (query entity substring of node label)
+  3. spaCy NER on the query to extract candidate entity strings first
+"""
+from __future__ import annotations
+import logging
+from typing import Optional
+from retrieval.dense import MilvusStore, RetrievedChunk
+from retrieval.graph_builder import KnowledgeGraphBuilder
+logger = logging.getLogger(__name__)
+class GraphRetriever:
+    """
+    Retrieves chunks via knowledge graph traversal.
+    Returns RetrievedChunk objects fetched from Milvus, so they carry
+    the same structure as dense/BM25 results and can flow into RRF.
+    """
+    def __init__(
+        self,
+        graph_builder: Optional[KnowledgeGraphBuilder] = None,
+        store: Optional[MilvusStore] = None,
+        max_hops: int = 2,
+    ) -> None:
+        """
+        Args:
+            graph_builder: source of the knowledge graph; a default
+                           KnowledgeGraphBuilder() is created when omitted.
+            store:         Milvus store the chunks are fetched from; a default
+                           MilvusStore() is created when omitted.
+            max_hops:      BFS depth limit used by search().
+        """
+        self._builder = graph_builder or KnowledgeGraphBuilder()
+        self._store   = store or MilvusStore()
+        self._max_hops = max_hops
+        # spaCy pipeline is loaded lazily by _get_nlp() on first use
+        self._nlp = None
+    # ── Public API ─────────────────────────────────────────────
+    def search(self, query: str, top_k: int = 15) -> list[RetrievedChunk]:
+        """
+        Graph traversal retrieval for a given query.
+        Pipeline:
+          1. Extract named entities from query (spaCy)
+          2. Anchor each entity to matching graph nodes (exact or substring match)
+          3. BFS up to max_hops from anchors
+          4. Collect chunk_ids from all visited nodes + traversed edges
+          5. Keep the top_k highest-scoring chunk_ids and fetch them from Milvus
+          6. Return the chunks sorted by hop-distance score (closer to an anchor = higher)
+        Returns [] when the graph is empty, the query yields no entities,
+        no anchor node matches, or the traversal collects no chunk ids.
+        """
+        G = self._builder.graph
+        if G.number_of_nodes() == 0:
+            logger.debug("Graph is empty — skipping graph retrieval.")
+            return []
+        # 1. Extract query entities
+        query_entities = self._extract_query_entities(query)
+        if not query_entities:
+            logger.debug("No named entities in query — skipping graph retrieval.")
+            return []
+        logger.debug("Graph query entities: %s", query_entities)
+        # 2. Find anchor nodes
+        anchor_nodes = self._find_anchor_nodes(query_entities, G)
+        if not anchor_nodes:
+            logger.debug("No anchor nodes found in graph.")
+            return []
+        logger.debug("Anchor nodes: %s", anchor_nodes)
+        # 3 + 4. BFS traversal → collect chunk_ids
+        # Both containers are filled in place by _bfs_collect across all anchors.
+        chunk_id_scores: dict[str, float] = {}
+        visited_nodes: set[str] = set()
+        for anchor in anchor_nodes:
+            self._bfs_collect(
+                G, anchor, self._max_hops,
+                chunk_id_scores, visited_nodes
+            )
+        if not chunk_id_scores:
+            return []
+        # 5. Sort chunk_ids by score and fetch from Milvus
+        sorted_ids = sorted(
+            chunk_id_scores, key=lambda cid: chunk_id_scores[cid], reverse=True
+        )[:top_k]
+        chunks = self._fetch_chunks_from_milvus(sorted_ids, chunk_id_scores)
+        logger.info(
+            "Graph retriever: %d anchors, %d nodes visited, %d chunks returned",
+            len(anchor_nodes), len(visited_nodes), len(chunks)
+        )
+        return chunks
+    # ── BFS traversal ─────────────────────────────────────────
+    def _bfs_collect(
+        self,
+        G,
+        start_node: str,
+        max_hops: int,
+        chunk_scores: dict[str, float],
+        visited: set[str],
+    ) -> None:
+        """
+        BFS from start_node up to max_hops.
+        Scores chunks by hop distance: 1.0 at hop 0, 0.5 at hop 1, 0.25 at hop 2.
+        """
+        queue: list[tuple[str, int]] = [(start_node, 0)]
+        local_visited: set[str] = set()
+        while queue:
+            node, depth = queue.pop(0)
+            if node in local_visited or depth > max_hops:
+                continue
+            local_visited.add(node)
+            visited.add(node)
+            # Score = 1 / 2^depth (1.0 at anchor, 0.5 one hop away, etc.)
+            hop_score = 1.0 / (2 ** depth)
+            # Collect chunk_ids from this node
+            node_data = G.nodes[node]
+            for cid in node_data.get("chunk_ids", []):
+                chunk_scores[cid] = max(chunk_scores.get(cid, 0.0), hop_score)
+            # Collect chunk_ids from edges (relations)
+            for neighbour in G.neighbors(node):
+                edge_data = G[node][neighbour]
+                for cid in edge_data.get("chunk_ids", []):
+                    chunk_scores[cid] = max(chunk_scores.get(cid, 0.0), hop_score * 0.8)
+                if depth < max_hops:
+                    queue.append((neighbour, depth + 1))
+    # ── Entity extraction ──────────────────────────────────────
+    def _extract_query_entities(self, query: str) -> list[str]:
+        """
+        Extract named entities from the query using spaCy NER.
+        Falls back to noun chunks if NER finds nothing.
+        """
+        try:
+            nlp = self._get_nlp()
+            doc = nlp(query)
+            entities = [ent.text.strip().title() for ent in doc.ents if len(ent.text.strip()) > 1]
+            if not entities:
+                # Fallback: try noun chunks (catches "attention mechanism", etc.)
+                entities = [
+                    chunk.text.strip().title()
+                    for chunk in doc.noun_chunks
+                    if len(chunk.text.strip()) > 3
+                ]
+            return entities
+        except Exception as exc:
+            logger.debug("Entity extraction failed: %s", exc)
+            return []
+    # ── Node matching ─────────────────────────────────────────
+    @staticmethod
+    def _find_anchor_nodes(query_entities: list[str], G) -> list[str]:
+        """
+        Find graph nodes that match query entities.
+        Priority: exact match → partial match.
+        """
+        all_nodes = list(G.nodes())
+        lower_nodes = {n.lower(): n for n in all_nodes}
+        anchors: list[str] = []
+        for qe in query_entities:
+            qe_lower = qe.lower()
+            # Exact match (case-insensitive)
+            if qe_lower in lower_nodes:
+                anchors.append(lower_nodes[qe_lower])
+                continue
+            # Partial match: query entity is substring of a node label
+            for node_lower, node in lower_nodes.items():
+                if qe_lower in node_lower or node_lower in qe_lower:
+                    if node not in anchors:
+                        anchors.append(node)
+        return anchors[:10]   # cap to avoid explosion on generic queries
+    # ── Milvus fetch ──────────────────────────────────────────
+    def _fetch_chunks_from_milvus(
+        self,
+        chunk_ids: list[str],
+        scores: dict[str, float],
+    ) -> list[RetrievedChunk]:
+        """
+        Fetch specific chunks from Milvus by chunk_id.
+        Tags each chunk with retriever="graph".
+        Args:
+            chunk_ids: ids to fetch; an empty list returns [] without querying.
+            scores:    chunk_id → score; ids missing from it get 0.1.
+        Returns:
+            RetrievedChunk objects sorted by score, highest first. Any
+            exception during the query is logged as a warning and yields [].
+        """
+        if not chunk_ids:
+            return []
+        try:
+            # Milvus IN query
+            # NOTE(review): ids are spliced into the expression unescaped; an id
+            # containing a double quote would break the filter — confirm the id format.
+            id_list = '", "'.join(chunk_ids)
+            expr = f'chunk_id in ["{id_list}"]'
+            # NOTE(review): reaches into the store's private _ensure_collection()
+            coll = self._store._ensure_collection()
+            results = coll.query(
+                expr=expr,
+                output_fields=["chunk_id", "doc_id", "source", "title",
+                                "text", "parent_text", "chunk_index"],
+                limit=len(chunk_ids),
+            )
+            chunks: list[RetrievedChunk] = []
+            for row in results:
+                cid = row["chunk_id"]
+                chunks.append(RetrievedChunk(
+                    chunk_id=cid,
+                    doc_id=row["doc_id"],
+                    source=row["source"],
+                    title=row["title"],
+                    text=row["text"],
+                    parent_text=row["parent_text"],
+                    chunk_index=row["chunk_index"],
+                    score=scores.get(cid, 0.1),
+                    retriever="graph",
+                ))
+            return sorted(chunks, key=lambda c: c.score, reverse=True)
+        except Exception as exc:
+            logger.warning("Milvus fetch for graph chunks failed: %s", exc)
+            return []
+    # ── spaCy ─────────────────────────────────────────────────
+    def _get_nlp(self):
+        if self._nlp is None:
+            import spacy  # type: ignore
+            try:
+                self._nlp = spacy.load("en_core_web_sm")
+            except OSError:
+                raise RuntimeError(
+                    "Download spaCy model: python -m spacy download en_core_web_sm"
+                )
+        return self._nlp

retrieval/orchestrator.py CHANGED Viewed

@@ -30,6 +30,8 @@ from retrieval.bm25 import BM25Retriever
 from retrieval.dense import MilvusStore, RetrievedChunk
 from retrieval.embedder import Embedder
 from retrieval.fusion import CrossEncoderReranker, RRFFusion
 from retrieval.router import QueryRouter, RoutingDecision
 logger = logging.getLogger(__name__)
@@ -51,6 +53,7 @@ class MultiStrategyRetriever:
         embedder: Optional[Embedder] = None,
         store: Optional[MilvusStore] = None,
         bm25: Optional[BM25Retriever] = None,
         router: Optional[QueryRouter] = None,
         fuser: Optional[RRFFusion] = None,
         reranker: Optional[CrossEncoderReranker] = None,
@@ -58,6 +61,10 @@ class MultiStrategyRetriever:
         self._embedder = embedder or Embedder()
         self._dense    = store    or MilvusStore(embedder=self._embedder)
         self._bm25     = bm25     or BM25Retriever()
         self._router   = router   or QueryRouter()
         self._fuser    = fuser    or RRFFusion()
         self._reranker = reranker or CrossEncoderReranker()
@@ -112,6 +119,17 @@ class MultiStrategyRetriever:
         Dense indexing is handled separately by MilvusStore.
         """
         return self._bm25.add_chunks(chunks)
     # ── Private: parallel retrieval ───────────────────────────
@@ -128,7 +146,7 @@ class MultiStrategyRetriever:
         retriever_map = {
             "dense": lambda q, k: self._dense.search(q, top_k=k),
             "bm25":  lambda q, k: self._bm25.search(q, top_k=k),
-            # "graph" will be registered here in Phase 3
         }
         results: dict[str, list[RetrievedChunk]] = {}

 from retrieval.dense import MilvusStore, RetrievedChunk
 from retrieval.embedder import Embedder
 from retrieval.fusion import CrossEncoderReranker, RRFFusion
+from retrieval.graph_builder import KnowledgeGraphBuilder
+from retrieval.graph_retriever import GraphRetriever
 from retrieval.router import QueryRouter, RoutingDecision
 logger = logging.getLogger(__name__)
         embedder: Optional[Embedder] = None,
         store: Optional[MilvusStore] = None,
         bm25: Optional[BM25Retriever] = None,
+        graph_builder: Optional[KnowledgeGraphBuilder] = None,
         router: Optional[QueryRouter] = None,
         fuser: Optional[RRFFusion] = None,
         reranker: Optional[CrossEncoderReranker] = None,
         self._embedder = embedder or Embedder()
         self._dense    = store    or MilvusStore(embedder=self._embedder)
         self._bm25     = bm25     or BM25Retriever()
+        self._graph_builder = graph_builder or KnowledgeGraphBuilder()
+        self._graph    = GraphRetriever(
+            graph_builder=self._graph_builder, store=self._dense
+        )
         self._router   = router   or QueryRouter()
         self._fuser    = fuser    or RRFFusion()
         self._reranker = reranker or CrossEncoderReranker()
         Dense indexing is handled separately by MilvusStore.
         """
         return self._bm25.add_chunks(chunks)
+    def build_graph(self, chunks: list) -> dict:
+        """
+        Extract entities + relations from chunks and update the knowledge graph.
+        Call from ingestion pipeline after dense + BM25 indexing.
+        """
+        return self._graph_builder.process_chunks(chunks)
+    @property
+    def graph_builder(self) -> KnowledgeGraphBuilder:
+        return self._graph_builder
     # ── Private: parallel retrieval ───────────────────────────
         retriever_map = {
             "dense": lambda q, k: self._dense.search(q, top_k=k),
             "bm25":  lambda q, k: self._bm25.search(q, top_k=k),
+            "graph": lambda q, k: self._graph.search(q, top_k=k),
         }
         results: dict[str, list[RetrievedChunk]] = {}

retrieval/relation_extractors.py ADDED Viewed

	@@ -0,0 +1,602 @@

+"""
+Cortex RAG — Relation Extractors
+Strategy pattern: both extractors share the same interface.
+Switch between them with GRAPH_EXTRACTOR=rebel|llm in .env.
+  RelationExtractor (abstract)
+    ├── REBELExtractor   — local model, no API calls, Wikidata predicates
+    └── LLMExtractor     — Mistral/LLM, free-form predicates, rate-limited
+KnowledgeGraphBuilder accepts either via dependency injection, or
+auto-selects based on config.get_settings().graph_extractor.
+Adding a new extractor in the future:
+  1. Subclass RelationExtractor
+  2. Implement extract(chunk) → list[Triple] and the `name` property (both are abstract)
+  3. Register the name in _EXTRACTOR_REGISTRY at the bottom of this file
+"""
+from __future__ import annotations
+import json
+import logging
+import re
+import time
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import Optional
+from config import get_settings
+from ingestion.chunker import Chunk
+logger = logging.getLogger(__name__)
+# ── Shared dataclass ───────────────────────────────────────────
+@dataclass
+class Triple:
+    """One (subject, predicate, object) relation extracted from a chunk."""
+    subject: str      # head entity label
+    predicate: str    # relation label linking subject to object
+    object: str       # tail entity label
+    chunk_id: str     # id of the chunk the triple was extracted from
+    source: str       # `source` of that chunk
+    extractor: str = "unknown"   # tracks which extractor produced this triple
+# ── Abstract base ──────────────────────────────────────────────
+class RelationExtractor(ABC):
+    """
+    Common interface for all relation extraction strategies.
+    Subclasses must implement extract() only.
+    """
+    @abstractmethod
+    def extract(self, chunk: Chunk) -> list[Triple]:
+        """
+        Extract (subject, predicate, object) triples from a single chunk.
+        Must never raise — return [] on any failure.
+        """
+        ...
+    @property
+    @abstractmethod
+    def name(self) -> str:
+        """Short identifier used in logging and triple.extractor field."""
+        ...
+    def extract_batch(self, chunks: list[Chunk]) -> dict[str, list[Triple]]:
+        """
+        Extract triples from a list of chunks.
+        Default: calls extract() sequentially.
+        Subclasses can override for true batching (e.g. REBEL).
+        Returns: {chunk_id: [Triple, ...]}
+        """
+        return {chunk.chunk_id: self.extract(chunk) for chunk in chunks}
+# ── REBEL extractor ────────────────────────────────────────────
+# REBEL relation types that map cleanly to RAG-useful edges.
+# The full Wikidata set has 220 types; we keep the ~40 most useful.
+_REBEL_KEEP_RELATIONS = {
+    "author", "developer", "creator", "founded by", "owned by",
+    "instance of", "subclass of", "part of", "has part",
+    "country", "country of origin", "located in", "headquarters location",
+    "employer", "member of", "affiliation", "educated at",
+    "award received", "occupation", "field of work", "notable work",
+    "based on", "followed by", "follows", "influenced by", "has edition",
+    "product or material produced", "used by", "manufacturer",
+    "publication date", "academic degree", "applies to jurisdiction",
+    "published in", "platform", "programming language", "license",
+}
+class REBELExtractor(RelationExtractor):
+    """
+    Local relation extraction using REBEL (Babelscape/rebel-large).
+    Model facts:
+      - 406M params (BART-large fine-tuned on Wikipedia + Wikidata)
+      - Input: raw text sentence(s)
+      - Output: decoded triplet string → parsed into (head, type, tail)
+      - CPU inference: ~80–150ms per chunk on modern hardware
+      - No API calls, no rate limits, fully offline after first download
+    Batching:
+      REBEL's tokeniser handles variable-length batches natively.
+      extract_batch() sends all chunks in one forward pass, which is
+      significantly faster than calling extract() in a loop.
+      Max batch size is controlled by REBEL_BATCH_SIZE in config
+      (default 8 — safe for 8GB RAM; raise to 16–32 with more RAM).
+    Predicate normalisation:
+      REBEL outputs Wikidata relation labels (e.g. "country of origin").
+      We keep only relations in _REBEL_KEEP_RELATIONS (40 types) and
+      discard the rest — this prevents graph noise from obscure predicates
+      like "Wikimedia disambiguation page" or "image" polluting the graph.
+    Download:
+      First run downloads ~1.6GB to ~/.cache/huggingface/hub/.
+      Subsequent runs load from cache in ~3s.
+    """
+    _REBEL_MODEL = "Babelscape/rebel-large"
+    _MAX_INPUT_TOKENS = 256     # REBEL was trained on short passages
+    _MAX_OUTPUT_TOKENS = 512
+    def __init__(self) -> None:
+        """Create the extractor without loading anything; _load() fills both fields on first use."""
+        self._tokenizer = None
+        self._model = None
+    @property
+    def name(self) -> str:
+        """Identifier of this strategy; also written to Triple.extractor."""
+        return "rebel"
+    # ── Public ──────────────────────────────────────────────────
+    def extract(self, chunk: Chunk) -> list[Triple]:
+        """Single-chunk extraction (sequential). Prefer extract_batch for speed."""
+        results = self.extract_batch([chunk])
+        return results.get(chunk.chunk_id, [])
+    def extract_batch(self, chunks: list[Chunk]) -> dict[str, list[Triple]]:
+        """
+        True batched extraction. All chunks processed in a single model call.
+        Falls back to sequential on memory errors.
+        """
+        if not chunks:
+            return {}
+        tok, model = self._load()
+        cfg = get_settings()
+        batch_size = getattr(cfg, "rebel_batch_size", 8)
+        # Chunk text is truncated to avoid exceeding REBEL's context window
+        texts = [c.text[:1200] for c in chunks]
+        all_triples: dict[str, list[Triple]] = {c.chunk_id: [] for c in chunks}
+        for batch_start in range(0, len(chunks), batch_size):
+            batch_chunks = chunks[batch_start : batch_start + batch_size]
+            batch_texts  = texts[batch_start : batch_start + batch_size]
+            try:
+                inputs = tok(
+                    batch_texts,
+                    max_length=self._MAX_INPUT_TOKENS,
+                    padding=True,
+                    truncation=True,
+                    return_tensors="pt",
+                )
+                generated = model.generate(
+                    **inputs,
+                    max_length=self._MAX_OUTPUT_TOKENS,
+                    num_beams=3,
+                    early_stopping=True,
+                )
+                decoded = tok.batch_decode(generated, skip_special_tokens=False)
+                for chunk, raw_output in zip(batch_chunks, decoded):
+                    triples = self._parse_rebel_output(raw_output, chunk)
+                    all_triples[chunk.chunk_id] = triples
+            except Exception as exc:
+                logger.warning("REBEL batch %d failed: %s", batch_start, exc)
+                # Mark as empty rather than crashing the whole ingestion
+                for chunk in batch_chunks:
+                    all_triples[chunk.chunk_id] = []
+        return all_triples
+    # ── REBEL output parser ────────────────────────────────────
+    def _parse_rebel_output(self, decoded: str, chunk: Chunk) -> list[Triple]:
+        """
+        Parse REBEL's special-token output format.
+        REBEL outputs a string like:
+          <triplet> Vaswani <subj> Attention Is All You Need <obj> author
+          <triplet> Transformer <subj> NLP <obj> field of work
+        We extract each triplet, filter to keep relations, normalise,
+        and return Triple dataclasses.
+        """
+        triples: list[Triple] = []
+        # Split on <triplet> delimiter
+        raw_triplets = decoded.split("<triplet>")
+        for raw in raw_triplets:
+            raw = raw.strip()
+            if not raw or "<subj>" not in raw or "<obj>" not in raw:
+                continue
+            try:
+                # Format: "SUBJECT <subj> OBJECT <obj> RELATION"
+                subj_split = raw.split("<subj>")
+                subject = subj_split[0].strip()
+                obj_rel = subj_split[1].split("<obj>")
+                obj    = obj_rel[0].strip()
+                relation = obj_rel[1].strip()
+                # Clean up any residual special tokens
+                for tok_str in ["</s>", "<s>", "<pad>"]:
+                    relation = relation.replace(tok_str, "").strip()
+                    subject  = subject.replace(tok_str, "").strip()
+                    obj      = obj.replace(tok_str, "").strip()
+                if not subject or not obj or not relation:
+                    continue
+                # Filter to useful relation types only
+                if relation.lower() not in _REBEL_KEEP_RELATIONS:
+                    continue
+                triples.append(Triple(
+                    subject=subject.title(),
+                    predicate=relation.lower(),
+                    object=obj.title(),
+                    chunk_id=chunk.chunk_id,
+                    source=chunk.source,
+                    extractor=self.name,
+                ))
+            except (IndexError, AttributeError):
+                continue
+        return triples[:8]   # cap per chunk
+    # ── Model loading ──────────────────────────────────────────
+    def _load(self):
+        if self._tokenizer is None or self._model is None:
+            try:
+                from transformers import AutoModelForSeq2SeqLM, AutoTokenizer  # type: ignore
+            except ImportError as exc:
+                raise RuntimeError(
+                    "Install transformers: pip install transformers"
+                ) from exc
+            logger.info("Loading REBEL model '%s' (first run downloads ~1.6GB)…", self._REBEL_MODEL)
+            t0 = time.perf_counter()
+            self._tokenizer = AutoTokenizer.from_pretrained(self._REBEL_MODEL)
+            self._model = AutoModelForSeq2SeqLM.from_pretrained(self._REBEL_MODEL)
+            self._model.eval()   # inference mode — disables dropout
+            logger.info("REBEL loaded in %.1fs", time.perf_counter() - t0)
+        return self._tokenizer, self._model
+# ── LLM extractor (original method, preserved) ─────────────────
+_LLM_PROMPT = """\
+Extract factual relationships from the passage below.
+Return ONLY a JSON array of triples. Each triple is:
+  {{"subject": "...", "predicate": "...", "object": "..."}}
+Rules:
+- subject and object must be named entities (people, orgs, systems, concepts)
+- predicate is a short verb phrase ("developed", "is based on", "introduced", "authored")
+- Extract 0–5 triples maximum. If there are none, return []
+- Return ONLY the JSON array, no explanation, no markdown
+Passage:
+{text}
+"""
+class LLMExtractor(RelationExtractor):
+    """
+    Relation extraction via Mistral LLM (the original Phase 3 method).
+    Produces free-form, human-readable predicates ("introduced the concept of",
+    "co-authored with") rather than the fixed Wikidata vocabulary that REBEL uses.
+    Use this when:
+      - You want rich, domain-specific predicate labels
+      - Your corpus is small enough that rate limits aren't a problem
+      - You want to fine-tune the extraction prompt for your specific domain
+    Rate limiting:
+      Set MISTRAL_RELATION_RPM in .env to cap requests-per-minute.
+      Default is 0 (no cap). Mistral free tier allows ~30 RPM.
+    """
+    def __init__(self) -> None:
+        self._llm = None
+    @property
+    def name(self) -> str:
+        return "llm"
+    def extract(self, chunk: Chunk) -> list[Triple]:
+        try:
+            client = self._get_llm()
+            cfg = get_settings()
+            if cfg.llm_server == "ollama":
+                response = client.chat.complete(
+                    model=cfg.ollama_model,
+                    messages=[{
+                    "role": "user",
+                    "content": _LLM_PROMPT.format(text=chunk.text[:2000]),
+                }],
+                )
+            else:
+                response = client.chat.complete(
+                    model=cfg.mistral_model,
+                    messages=[{
+                        "role": "user",
+                        "content": _LLM_PROMPT.format(text=chunk.text[:2000]),
+                    }],
+                    temperature=0.0,
+                    max_tokens=512,
+                )
+            raw = response.choices[0].message.content or "[]"
+            return self._parse(raw, chunk)
+        except Exception as exc:
+            logger.debug("LLM extractor failed for chunk %s: %s", chunk.chunk_id, exc)
+            return []
+    def _parse(self, raw: str, chunk: Chunk) -> list[Triple]:
+        raw = raw.strip()
+        if raw.startswith("```"):
+            raw = re.sub(r"^```[a-z]*\n?", "", raw)
+            raw = re.sub(r"\n?```$", "", raw)
+        try:
+            items = json.loads(raw)
+        except json.JSONDecodeError:
+            return []
+        triples: list[Triple] = []
+        for item in items[:5]:
+            if not isinstance(item, dict):
+                continue
+            s = str(item.get("subject", "")).strip()
+            p = str(item.get("predicate", "")).strip()
+            o = str(item.get("object", "")).strip()
+            if s and p and o:
+                triples.append(Triple(
+                    subject=s.title(),
+                    predicate=p.lower(),
+                    object=o.title(),
+                    chunk_id=chunk.chunk_id,
+                    source=chunk.source,
+                    extractor=self.name,
+                ))
+        return triples
+    def _get_llm(self):
+        if self._llm is None:
+            cfg = get_settings()
+            llm_server = cfg.llm_server
+            if llm_server == "ollama":
+                try:
+                    from ollama import Client as ollama_client  # type: ignore
+                except ImportError as exc:
+                    raise RuntimeError(
+                        "Install ollama client: pip install ollama"
+                    ) from exc
+                self._llm = ollama_client(host=cfg.ollama_host)
+            else:
+                if not cfg.mistral_api_key:
+                    raise RuntimeError("MISTRAL_API_KEY not set")
+                from mistralai.client import Mistral  # type: ignore
+                self._llm = Mistral(api_key=cfg.mistral_api_key)
+        return self._llm
+# ── Entity density filter (Option 4) ──────────────────────────
+class EntityDensityFilter(RelationExtractor):
+    """
+    Decorator that wraps any extractor and skips low-entity-density chunks.
+    Rationale
+    ─────────
+    Chunks with 0–1 named entities rarely yield useful triples — a
+    paragraph of methodology boilerplate has no entities to link.
+    Scoring by entity density (entities per 100 tokens) and processing
+    only the top N% of chunks cuts extraction time by ~70% with
+    negligible graph quality loss.
+    How density is computed
+    ───────────────────────
+    density = (spaCy NER entity count) / (token count / 100)
+    This normalises for chunk length — a 50-token chunk with 3 entities
+    scores higher than a 500-token chunk with the same 3 entities.
+    Usage
+    ─────
+    # Wrap REBEL, keep top 30% of chunks (default):
+    extractor = EntityDensityFilter(REBELExtractor())
+    # Wrap LLM, keep top 20%, only chunks with ≥2 entities:
+    extractor = EntityDensityFilter(
+        LLMExtractor(),
+        top_fraction=0.20,
+        min_entity_count=2,
+    )
+    # Via config (wraps whatever GRAPH_EXTRACTOR is set to):
+    GRAPH_EXTRACTOR=rebel-filtered   # rebel + density filter
+    GRAPH_EXTRACTOR=llm-filtered     # llm   + density filter
+    """
+    def __init__(
+        self,
+        inner: RelationExtractor,
+        top_fraction: Optional[float] = None,
+        min_entity_count: Optional[int] = None,
+    ) -> None:
+        cfg = get_settings()
+        self._inner = inner
+        # top_fraction: process only the top X% most entity-dense chunks
+        self._top_fraction   = top_fraction   or getattr(cfg, "density_top_fraction", 0.30)
+        # min_entity_count: hard floor — never extract from chunks below this
+        self._min_entity_count = min_entity_count or getattr(cfg, "density_min_entities", 2)
+        self._nlp = None
+    @property
+    def name(self) -> str:
+        return f"{self._inner.name}-filtered"
+    # ── Public ──────────────────────────────────────────────────
+    def extract(self, chunk: Chunk) -> list[Triple]:
+        """Single-chunk extraction with density pre-check."""
+        if not self._passes_density_check([chunk]):
+            logger.debug("Chunk %s skipped (low entity density)", chunk.chunk_id)
+            return []
+        return self._inner.extract(chunk)
+    def extract_batch(self, chunks: list[Chunk]) -> dict[str, list[Triple]]:
+        """
+        Filter chunks by density score, then delegate only the qualifying
+        subset to the inner extractor's batch method.
+        Steps:
+          1. Score every chunk by entity density (fast — pure spaCy)
+          2. Apply min_entity_count hard floor
+          3. Keep top_fraction of remaining chunks by density score
+          4. Pass filtered set to inner.extract_batch()
+          5. Return merged result (skipped chunks → empty list)
+        """
+        if not chunks:
+            return {}
+        # Score all chunks
+        scored = self._score_chunks(chunks)   # list of (chunk, density, entity_count)
+        # Hard floor: drop chunks below minimum entity count
+        above_floor = [(c, d, n) for c, d, n in scored if n >= self._min_entity_count]
+        # Top-fraction cut: sort by density desc, keep top N%
+        above_floor.sort(key=lambda x: x[1], reverse=True)
+        cutoff = max(1, int(len(above_floor) * self._top_fraction))
+        selected = [c for c, _, _ in above_floor[:cutoff]]
+        skipped  = len(chunks) - len(selected)
+        if skipped:
+            logger.info(
+                "Density filter: %d/%d chunks selected (top %.0f%%, min_entities=%d)",
+                len(selected), len(chunks),
+                self._top_fraction * 100, self._min_entity_count,
+            )
+        # Delegate to inner extractor
+        if not selected:
+            return {c.chunk_id: [] for c in chunks}
+        inner_results = self._inner.extract_batch(selected)
+        # Merge: unselected chunks get empty lists
+        selected_ids = {c.chunk_id for c in selected}
+        return {
+            c.chunk_id: inner_results.get(c.chunk_id, []) if c.chunk_id in selected_ids else []
+            for c in chunks
+        }
+    # ── Density scoring ────────────────────────────────────────
+    def _score_chunks(
+        self, chunks: list[Chunk]
+    ) -> list[tuple[Chunk, float, int]]:
+        """
+        Returns list of (chunk, density_score, entity_count).
+        density_score = entities per 100 tokens (approx).
+        """
+        nlp = self._get_nlp()
+        results = []
+        for chunk in chunks:
+            doc = nlp(chunk.text[:5000])
+            entity_count = len([e for e in doc.ents if len(e.text.strip()) > 1])
+            token_count  = max(len(doc), 1)
+            density      = (entity_count / token_count) * 100
+            results.append((chunk, density, entity_count))
+        return results
+    def _passes_density_check(self, chunks: list[Chunk]) -> bool:
+        """Quick single-chunk density check for extract()."""
+        if not chunks:
+            return False
+        _, _, entity_count = self._score_chunks(chunks)[0]
+        return entity_count >= self._min_entity_count
+    # ── spaCy ──────────────────────────────────────────────────
+    def _get_nlp(self):
+        if self._nlp is None:
+            import spacy  # type: ignore
+            try:
+                self._nlp = spacy.load("en_core_web_sm")
+            except OSError:
+                raise RuntimeError("Run: python -m spacy download en_core_web_sm")
+        return self._nlp
+# ── Registry + factory ─────────────────────────────────────────
+_EXTRACTOR_REGISTRY: dict[str, type[RelationExtractor]] = {
+    "rebel":         REBELExtractor,
+    "llm":           LLMExtractor,
+    # Density-filtered variants are constructed specially — see build_extractor()
+}
+# Names that trigger density-filter wrapping
+_FILTERED_VARIANTS = {
+    "rebel-filtered": "rebel",
+    "llm-filtered":   "llm",
+}
+def build_extractor(name: Optional[str] = None) -> RelationExtractor:
+    """
+    Available values for GRAPH_EXTRACTOR:
+    "rebel"          — REBEL local model, no API calls (default)
+    "llm"            — Groq LLM, free-form predicates
+    "rebel-filtered" — REBEL + entity density pre-filter (option 4)
+    "llm-filtered"   — LLM   + entity density pre-filter (option 4)
+    Explicit usage in code:
+    extractor = build_extractor("rebel-filtered")
+    # Or compose manually for full control:
+    extractor = EntityDensityFilter(
+        REBELExtractor(),
+        top_fraction=0.25,
+        min_entity_count=3,
+    )
+    """
+    cfg = get_settings()
+    extractor_name = (name or getattr(cfg, "graph_extractor", "rebel")).lower()
+    # Density-filtered variant: build inner extractor then wrap it
+    if extractor_name in _FILTERED_VARIANTS:
+        inner_name = _FILTERED_VARIANTS[extractor_name]
+        inner_cls  = _EXTRACTOR_REGISTRY[inner_name]
+        inner      = inner_cls()
+        logger.info(
+            "Using relation extractor: %s (inner=%s, top_fraction=%.0f%%, min_entities=%d)",
+            extractor_name, inner_name,
+            getattr(cfg, "density_top_fraction", 0.30) * 100,
+            getattr(cfg, "density_min_entities", 2),
+        )
+        return EntityDensityFilter(inner)
+    # Plain extractor
+    cls = _EXTRACTOR_REGISTRY.get(extractor_name)
+    if cls is None:
+        available = list(_EXTRACTOR_REGISTRY.keys()) + list(_FILTERED_VARIANTS.keys())
+        raise ValueError(
+            f"Unknown extractor '{extractor_name}'. "
+            f"Available: {available}. "
+            f"Set GRAPH_EXTRACTOR in .env to one of these."
+        )
+    logger.info("Using relation extractor: %s", extractor_name)
+    return cls()

ui/app.py CHANGED Viewed

@@ -12,12 +12,18 @@ import json
 import time
 from pathlib import Path
 from typing import Optional
 import requests
 import streamlit as st
 # ── Config ────────────────────────────────────────────────────
-API_BASE = "http://localhost:8000"
 st.set_page_config(
     page_title="Cortex RAG",
@@ -66,11 +72,24 @@ def _render_source_cards_raw(chunks: list[dict]):
             title = chunk.get("title", "Unknown")
             source = Path(chunk.get("source", "")).name
             snippet = chunk.get("text_snippet", "")[:160]
             st.markdown(f"""
 <div class="source-card">
   <strong>[{i+1}] {title}</strong>
   <span class="score-badge" style="float:right">{score_pct}%</span><br/>
-  <small style="color:#6b7280">{source}</small>
   <div class="chunk-snippet">{snippet}…</div>
 </div>""", unsafe_allow_html=True)
@@ -101,7 +120,7 @@ st.markdown(
 )
 st.divider()
-tab_ask, tab_ingest, tab_system = st.tabs(["🔍 Ask", "📥 Ingest", "🩺 System"])
 # ─────────────────────────────────────────────────────────────
@@ -173,6 +192,19 @@ with tab_ask:
                             sources_placeholder.markdown(payload.get("text", ""))
                             status_placeholder.empty()
                         elif event_type == "done":
                             answer_placeholder.markdown(answer_text)
                             status_placeholder.empty()
@@ -262,7 +294,129 @@ with tab_ingest:
 # ─────────────────────────────────────────────────────────────
-# TAB 3 — SYSTEM HEALTH
 # ─────────────────────────────────────────────────────────────
 with tab_system:
     st.subheader("System health")
@@ -299,5 +453,13 @@ with tab_system:
                 st.metric("Chunks indexed", stats.get("entity_count", "—"))
             st.divider()
             st.markdown("**Raw health response**")
             st.json(health)

 import time
 from pathlib import Path
 from typing import Optional
+import sys
+sys.path.append(str(Path(__file__).resolve().parent.parent))
+from config import get_settings
 import requests
 import streamlit as st
 # ── Config ────────────────────────────────────────────────────
+cfg = get_settings()
+API_BASE = f"http://{cfg.api_host}:{cfg.api_port}"
+REDIS_URL = cfg.redis_url
 st.set_page_config(
     page_title="Cortex RAG",
             title = chunk.get("title", "Unknown")
             source = Path(chunk.get("source", "")).name
             snippet = chunk.get("text_snippet", "")[:160]
+            retriever = chunk.get("retriever", "dense")
+            retriever_colors = {
+                "dense": "#dbeafe:#1e40af",
+                "bm25": "#dcfce7:#166534",
+                "dense+bm25": "#f3e8ff:#6b21a8",
+                "bm25+dense": "#f3e8ff:#6b21a8",
+                "graph": "#fef9c3:#854d0e",
+                "web_search": "#fee2e2:#991b1b",
+            }
+            ret_style = retriever_colors.get(retriever, "#f3f4f6:#374151")
+            ret_bg, ret_fg = ret_style.split(":")
             st.markdown(f"""
 <div class="source-card">
   <strong>[{i+1}] {title}</strong>
   <span class="score-badge" style="float:right">{score_pct}%</span><br/>
+  <small style="color:#6b7280">{source}</small> &nbsp;
+  <span style="background:{ret_bg};color:{ret_fg};border-radius:4px;padding:1px 6px;font-size:0.72rem;font-weight:600">{retriever}</span>
   <div class="chunk-snippet">{snippet}…</div>
 </div>""", unsafe_allow_html=True)
 )
 st.divider()
+tab_ask, tab_ingest, tab_eval, tab_system = st.tabs(["🔍 Ask", "📥 Ingest", "📊 Evaluation", "🩺 System"])
 # ─────────────────────────────────────────────────────────────
                             sources_placeholder.markdown(payload.get("text", ""))
                             status_placeholder.empty()
+                        elif event_type == "crag_update":
+                            grade = payload.get("grade", "")
+                            rewritten = payload.get("rewritten_query")
+                            web_used = payload.get("web_search_used", False)
+                            reasoning = payload.get("reasoning", "")
+                            icon = {"POOR": "🔄", "ABSENT": "🌐"}.get(grade, "ℹ️")
+                            msg = f"{icon} **CRAG {grade}**: {reasoning[:100]}"
+                            if rewritten:
+                                msg += "  \n\u21a9 Rewritten: *" + rewritten + "*"
+                            if web_used:
+                                msg += "  \n\U0001f310 Web search fallback used"
+                            status_placeholder.info(msg)
                         elif event_type == "done":
                             answer_placeholder.markdown(answer_text)
                             status_placeholder.empty()
 # ─────────────────────────────────────────────────────────────
+# TAB 3 — EVALUATION DASHBOARD
+# ─────────────────────────────────────────────────────────────
+with tab_eval:
+    st.subheader("RAG evaluation dashboard")
+    st.caption("Metrics update automatically after each query. RAGAS scores compute in the background (~5s after response).")
+    if st.button("🔄 Refresh metrics"):
+        st.session_state.pop("metrics_data", None)
+    if "metrics_data" not in st.session_state:
+        try:
+            resp = requests.get(f"{API_BASE}/metrics?limit=200&days=14", timeout=5)
+            resp.raise_for_status()
+            st.session_state.metrics_data = resp.json()
+        except Exception as exc:
+            st.session_state.metrics_data = {"error": str(exc)}
+    mdata = st.session_state.get("metrics_data", {})
+    if "error" in mdata:
+        st.error(f"Cannot reach API: {mdata['error']}")
+    else:
+        summary = mdata.get("summary", {})
+        cache   = mdata.get("cache", {})
+        # ── Header KPI row ─────────────────────────────────────
+        k1, k2, k3, k4, k5, k6 = st.columns(6)
+        k1.metric("Total queries",    summary.get("total_queries", 0))
+        k2.metric("Faithfulness",     f"{summary.get('avg_faithfulness', 0):.2f}")
+        k3.metric("Answer relevancy", f"{summary.get('avg_answer_relevancy', 0):.2f}")
+        k4.metric("Context precision",f"{summary.get('avg_context_precision', 0):.2f}")
+        k5.metric("Avg latency",      f"{summary.get('avg_latency_ms', 0):.0f} ms")
+        k6.metric("Cache hit rate",   f"{cache.get('hit_rate', 0):.0%}" if cache.get('enabled') else "off")
+        st.divider()
+        # ── Metric timeseries ──────────────────────────────────
+        ts = mdata.get("timeseries", [])
+        if ts:
+            import pandas as pd
+            df_ts = pd.DataFrame(ts)
+            df_ts["hour"] = df_ts["hour_bucket"]
+            st.markdown("#### RAGAS metrics over time")
+            st.line_chart(
+                df_ts.set_index("hour")[["faithfulness", "answer_relevancy", "context_precision"]],
+                height=220,
+            )
+        else:
+            st.info("No evaluation data yet. Run some queries to populate the dashboard.")
+        st.divider()
+        col_left, col_right = st.columns(2, gap="large")
+        with col_left:
+            # ── CRAG grade distribution ────────────────────────
+            grade_dist = summary.get("crag_grade_dist", {})
+            if grade_dist:
+                import pandas as pd
+                st.markdown("#### CRAG grade distribution")
+                df_grades = pd.DataFrame(
+                    list(grade_dist.items()), columns=["Grade", "Count"]
+                )
+                st.bar_chart(df_grades.set_index("Grade"), height=180)
+            # ── Strategy distribution ──────────────────────────
+            strat_dist = summary.get("strategy_dist", {})
+            if strat_dist:
+                import pandas as pd
+                st.markdown("#### Retrieval strategy mix")
+                rows = []
+                for strat_json, cnt in strat_dist.items():
+                    try:
+                        import json as _json
+                        label = "+".join(_json.loads(strat_json)).upper()
+                    except Exception:
+                        label = strat_json
+                    rows.append({"Strategy": label, "Count": cnt})
+                df_strat = pd.DataFrame(rows)
+                st.bar_chart(df_strat.set_index("Strategy"), height=180)
+        with col_right:
+            # ── Cache stats ────────────────────────────────────
+            st.markdown("#### Cache")
+            if cache.get("enabled"):
+                c1, c2 = st.columns(2)
+                c1.metric("Hits",   cache.get("hits", 0))
+                c2.metric("Misses", cache.get("misses", 0))
+                st.caption(f"TTL: {cache.get('ttl_s', 0)//60} min")
+                if st.button("🗑️ Flush cache"):
+                    try:
+                        r = requests.post(f"{REDIS_URL}/cache/flush", timeout=5)
+                        st.success(f"Flushed {r.json().get('deleted', 0)} entries.")
+                        st.session_state.pop("metrics_data", None)
+                    except Exception as e:
+                        st.error(str(e))
+            else:
+                st.caption("Redis not connected. Start Redis to enable caching.")
+                st.code("docker run -d -p 6379:6379 redis:7-alpine", language="bash")
+        st.divider()
+        # ── Recent query log table ─────────────────────────────
+        recent = mdata.get("recent", [])
+        if recent:
+            import pandas as pd
+            st.markdown("#### Recent queries")
+            rows = []
+            for r in recent[:50]:
+                rows.append({
+                    "Query":       r.get("query", "")[:60],
+                    "Intent":      r.get("intent", ""),
+                    "CRAG":        r.get("crag_grade", ""),
+                    "Faithful":    f"{r['faithfulness']:.2f}"      if r.get("faithfulness")      else "—",
+                    "Relevancy":   f"{r['answer_relevancy']:.2f}"  if r.get("answer_relevancy")  else "—",
+                    "Precision":   f"{r['context_precision']:.2f}" if r.get("context_precision") else "—",
+                    "Latency ms":  f"{r.get('latency_ms', 0):.0f}",
+                })
+            st.dataframe(pd.DataFrame(rows), use_container_width=True, hide_index=True)
+# ─────────────────────────────────────────────────────────────
+# TAB 4 — SYSTEM HEALTH
 # ─────────────────────────────────────────────────────────────
 with tab_system:
     st.subheader("System health")
                 st.metric("Chunks indexed", stats.get("entity_count", "—"))
             st.divider()
+            graph_stats = health.get("graph_stats", {})
+            if graph_stats:
+                col_d, col_e = st.columns(2)
+                with col_d:
+                    st.metric("Graph nodes", graph_stats.get("nodes", "—"))
+                with col_e:
+                    st.metric("Graph edges", graph_stats.get("edges", "—"))
+            st.divider()
             st.markdown("**Raw health response**")
             st.json(health)