Spaces:

joytheslothh
/

MediRAG-API

Running

App Files Files Community

joytheslothh committed on 1 day ago

Commit

b6f9fa8

0 Parent(s):

deploy: clean build

Browse files

Files changed (48) hide show

.dockerignore +9 -0
.gitattributes +9 -0
.gitignore +39 -0
Dockerfile +40 -0
README.md +56 -0
app.py +98 -0
app_demo.py +172 -0
config.yaml +71 -0
conftest.py +12 -0
demo/.gitkeep +1 -0
pytest.ini +6 -0
requirements.txt +35 -0
requirements_hf.txt +46 -0
requirements_minimal.txt +43 -0
scripts/build_rxnorm_cache.py +347 -0
scripts/debug_pmc.py +54 -0
scripts/download_dailymed.py +259 -0
scripts/download_guidelines.py +399 -0
scripts/fix_fda_chunk_text.py +120 -0
scripts/ingest_incremental.py +192 -0
scripts/warmup.py +58 -0
setup.py +15 -0
src/__init__.py +44 -0
src/api/__init__.py +1 -0
src/api/main.py +933 -0
src/api/schemas.py +276 -0
src/cli.py +70 -0
src/dashboard/__init__.py +1 -0
src/evaluate.py +289 -0
src/evaluation/__init__.py +1 -0
src/evaluation/aggregator.py +173 -0
src/evaluation/ragas_eval.py +177 -0
src/modules/__init__.py +127 -0
src/modules/base.py +4 -0
src/modules/contradiction.py +259 -0
src/modules/entity_verifier.py +334 -0
src/modules/faithfulness.py +302 -0
src/modules/source_credibility.py +204 -0
src/pipeline/__init__.py +1 -0
src/pipeline/chunker.py +82 -0
src/pipeline/consensus.py +111 -0
src/pipeline/embedder.py +163 -0
src/pipeline/generator.py +584 -0
src/pipeline/ingest.py +250 -0
src/pipeline/privacy.py +65 -0
src/pipeline/retriever.py +463 -0
tests/test_api.py +51 -0
tests/test_modules.py +66 -0

.dockerignore ADDED Viewed

	@@ -0,0 +1,9 @@

+# Docker build-context rules (Docker reads this file, not .gitignore)
+# Include necessary data files
+!data/index/
+!data/index/*
+# Exclude everything else from data
+data/raw/*
+data/processed/*
+logs/*

.gitattributes ADDED Viewed

	@@ -0,0 +1,9 @@

+# Hugging Face Spaces - Git Attributes
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.index filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,39 @@

+# Ignore all large database files and directories
+data/
+# Python
+__pycache__/
+*.pyc
+*.pyo
+*.pyd
+*.egg-info/
+dist/
+build/
+.eggs/
+# Environments
+venv/
+.venv/
+env/
+.env
+# Logs (generated at runtime)
+logs/
+# IDE
+.vscode/
+.idea/
+*.suo
+*.user
+# OS
+.DS_Store
+Thumbs.db
+# Notebooks checkpoints
+.ipynb_checkpoints/
+# Temporary files
+*.tmp
+*.bak
+.env

Dockerfile ADDED Viewed

	@@ -0,0 +1,40 @@

+# MediRAG Backend - Hugging Face Spaces Docker Deployment
+# Optimized for faster builds
+FROM python:3.10-slim
+WORKDIR /app
+# Install system dependencies (libmupdf deps bundled in pymupdf wheel, no extra needed)
+RUN apt-get update && apt-get install -y \
+    git \
+    curl \
+    build-essential \
+    && rm -rf /var/lib/apt/lists/*
+# Set environment variables
+ENV PYTHONUNBUFFERED=1
+ENV TRANSFORMERS_CACHE=/tmp/transformers_cache
+ENV HF_HOME=/tmp/hf_home
+ENV TORCH_HOME=/tmp/torch_cache
+ENV PIP_NO_CACHE_DIR=1
+ENV PIP_DISABLE_PIP_VERSION_CHECK=1
+# Copy requirements first for better caching
+COPY requirements_minimal.txt .
+# Force pip re-run by busting the cache (update this date to force full reinstall)
+ARG CACHE_BUST=2026-04-12-v3
+RUN pip install --no-cache-dir -r requirements_minimal.txt
+# Copy the rest of the application
+COPY . .
+# Create necessary directories
+RUN mkdir -p data/processed data/raw logs
+# Expose port (Hugging Face Spaces uses 7860)
+EXPOSE 7860
+# Run FastAPI backend directly
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

README.md ADDED Viewed

	@@ -0,0 +1,56 @@

+---
+title: MediRAG API
+emoji: 🏥
+colorFrom: blue
+colorTo: green
+sdk: docker
+pinned: false
+---
+# MediRAG Backend - Hugging Face Spaces (Docker)
+🏥 **Medical RAG System with Hallucination Detection**
+This is the **backend API** for MediRAG 2.0, designed to work with a **React frontend**.
+## 🐳 Docker Deployment
+This Space provides the backend API. The React frontend connects to this backend.
+### Backend Features
+- 🔍 **Hybrid Retrieval**: FAISS (BioBERT) + BM25 keyword search
+- 🧠 **LLM Generation**: Mistral/Gemini for medical answer generation
+- 🛡️ **4-Layer Audit**: Faithfulness, Entity Verification, Source Credibility, Contradiction Detection
+- ⚠️ **Safety Interventions**: Auto-blocks high-risk responses
+- 📊 **Health Risk Score (HRS)**: 0-100 composite safety metric
+- 🔌 **REST API**: Full FastAPI endpoints for React frontend
+## 🚀 Usage
+### For React Frontend
+Connect your React app to this backend:
+```javascript
+const API_URL = "https://joytheslothh-medirag-api.hf.space";
+```
+### API Endpoints
+- `GET /health` - Health check
+- `POST /query` - Full RAG pipeline
+- `POST /evaluate` - Evaluate answer
+- `GET /docs` - Swagger API documentation
+### Environment Variables
+Set in Hugging Face Space settings:
+- `MISTRAL_API_KEY` - For Mistral LLM
+- `GOOGLE_API_KEY` - For Gemini LLM
+## 🏗️ Architecture
+```
+React Frontend → FastAPI Backend → RAG Pipeline → Response
+```
+## ⚠️ Disclaimer
+**This system is for research purposes only. Always consult qualified medical professionals for health decisions.**
+## 📄 License
+MIT License - See repository for details.

app.py ADDED Viewed

	@@ -0,0 +1,98 @@

+"""
+MediRAG Backend - FastAPI only (No Gradio)
+React frontend on Vercel, this is just the API backend
+"""
+import os
+import sys
+import subprocess
+import logging
+import requests
+# Configure logging
+# Root logger is set to INFO; `logger` is this module's own named logger.
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+# Set cache directories for Hugging Face
+# (the transformers, HF hub and torch cache variables are all pointed at /tmp)
+os.environ["TRANSFORMERS_CACHE"] = "/tmp/transformers_cache"
+os.environ["HF_HOME"] = "/tmp/hf_home"
+os.environ["TORCH_HOME"] = "/tmp/torch_cache"
+# Add src to path
+# (puts <this file's directory>/src at the front of sys.path)
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "src"))
+# Install spaCy model if not present (optional — server starts without it)
+# Flow: import spacy -> try to load en_core_sci_lg -> on OSError run a pip
+# install of the model archive (300 s timeout) and try loading again.
+# Any failure of the install/reload step is logged as a warning and start-up
+# continues.
+# NOTE: the outer `except ImportError` also handles an ImportError raised by
+# the first spacy.load() call, not only a missing spacy package — keep the
+# nesting as it is if this block is ever refactored.
+try:
+    import spacy
+    try:
+        spacy.load("en_core_sci_lg")
+        logger.info("spaCy model en_core_sci_lg loaded.")
+    except OSError:
+        # Try installing the model at runtime
+        try:
+            logger.info("Attempting to install scispacy model en_core_sci_lg...")
+            # check=True turns a non-zero pip exit status into an exception,
+            # which the handler below downgrades to a warning.
+            subprocess.run([
+                sys.executable, "-m", "pip", "install", "--quiet",
+                "https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_lg-0.5.4.tar.gz"
+            ], check=True, timeout=300)
+            spacy.load("en_core_sci_lg")
+            logger.info("spaCy model installed and loaded.")
+        except Exception as model_err:
+            logger.warning(f"Could not install spaCy model: {model_err}. NER features will be limited.")
+except ImportError:
+    logger.warning("spacy/scispacy not installed. NER features will be limited but server will still start.")
+# Download datasets using huggingface_hub
+from huggingface_hub import hf_hub_download
+# Check and download index and data files
+data_dir = os.path.join(os.path.dirname(__file__), "data")
+index_dir = os.path.join(data_dir, "index")
+os.makedirs(index_dir, exist_ok=True)
+faiss_path = os.path.join(index_dir, "faiss.index")
+metadata_path = os.path.join(index_dir, "metadata_store.pkl")
+bm25_path = os.path.join(index_dir, "bm25_cache.pkl")
+vocab_path = os.path.join(data_dir, "drugbank vocabulary.csv")
+rxnorm_path = os.path.join(data_dir, "rxnorm_cache.csv")
+def download_dataset_files():
+    """Download FAISS index and other core data from Hugging Face Dataset"""
+    repo_id = "joytheslothh/MediRAG-Index-Data"
+    token = os.environ.get("HF_TOKEN")
+    if not token:
+        logger.warning("HF_TOKEN environment variable is not set. Dataset download might fail if repo is private.")
+    try:
+        if not os.path.exists(faiss_path):
+            logger.info("Downloading faiss.index from HF dataset...")
+            hf_hub_download(repo_id=repo_id, filename="index/faiss.index", local_dir=data_dir, repo_type="dataset", token=token)
+        if not os.path.exists(metadata_path):
+            logger.info("Downloading metadata_store.pkl from HF dataset...")
+            hf_hub_download(repo_id=repo_id, filename="index/metadata_store.pkl", local_dir=data_dir, repo_type="dataset", token=token)
+        if not os.path.exists(bm25_path):
+            logger.info("Downloading bm25_cache.pkl from HF dataset...")
+            hf_hub_download(repo_id=repo_id, filename="index/bm25_cache.pkl", local_dir=data_dir, repo_type="dataset", token=token)
+        if not os.path.exists(vocab_path):
+            logger.info("Downloading drugbank vocabulary.csv from HF dataset...")
+            hf_hub_download(repo_id=repo_id, filename="drugbank vocabulary.csv", local_dir=data_dir, repo_type="dataset", token=token)
+        if not os.path.exists(rxnorm_path):
+            logger.info("Downloading rxnorm_cache.csv from HF dataset...")
+            hf_hub_download(repo_id=repo_id, filename="rxnorm_cache.csv", local_dir=data_dir, repo_type="dataset", token=token)
+    except Exception as e:
+        logger.error(f"Failed to download dataset files: {e}")
+        logger.warning("Backend may not start correctly or queries may fail.")
+# Trigger download at startup
+download_dataset_files()
+# Import FastAPI app - this is the main backend for React frontend
+from src.api.main import app
+if __name__ == "__main__":
+    import uvicorn
+    # Get port from environment (Hugging Face uses 7860)
+    port = int(os.environ.get("PORT", 7860))
+    logger.info("Starting FastAPI backend on port {}".format(port))
+    uvicorn.run(app, host="0.0.0.0", port=port)

app_demo.py ADDED Viewed

	@@ -0,0 +1,172 @@

+"""
+MediRAG Backend - Local Demo Version
+Simplified version for local testing without heavy models
+"""
+import os
+import gradio as gr
+# Mock functions for demo
+def health_check():
+    return {"status": "ok", "demo_mode": True}
+def query_medical(question: str, top_k: int = 5, mistral_api_key: str = "", google_api_key: str = "") -> str:
+    """Demo version - returns mock response
+    Builds a canned Markdown report that echoes ``question``. No retrieval,
+    LLM call or audit is performed here; every score, source and timing in
+    the report is a hard-coded placeholder.
+    Args:
+        question: text echoed into the demo answer.
+        top_k: accepted but not used by this demo body.
+        mistral_api_key: accepted but not used by this demo body.
+        google_api_key: accepted but not used by this demo body.
+    Returns:
+        The report text with leading and trailing whitespace stripped.
+    """
+    # Simulate processing
+    # (only the question is interpolated; the rest of the text is static)
+    demo_answer = f"""
+This is a DEMO response for: "{question}"
+In the full version, this would:
+1. Retrieve relevant medical documents from FAISS index
+2. Generate answer using Mistral/Gemini LLM
+3. Evaluate with 4-layer audit system
+4. Return Health Risk Score (HRS)
+**To run full version:**
+- Deploy to Hugging Face Spaces (Docker)
+- Or install all dependencies locally
+"""
+    # Wrap the answer in the fixed report layout; .strip() trims the outer whitespace.
+    demo_output = f"""
+🏥 **MEDICAL ANSWER (DEMO MODE)**
+{demo_answer}
+---
+📊 **RISK ASSESSMENT**
+• Health Risk Score (HRS): 25/100 (DEMO)
+• Risk Band: LOW
+• Confidence: MEDIUM
+---
+🧪 **MODULE SCORES (DEMO)**
+✓ Faithfulness: 0.85
+✓ Entity Accuracy: 0.90
+✓ Source Credibility: 0.88
+✓ Contradiction Risk: 0.95
+---
+📚 **TOP SOURCES (DEMO)**
+📄 Source 1: PubMed - Clinical Study (Score: 0.923)
+This is a placeholder for retrieved medical literature...
+📄 Source 2: PMC - Systematic Review (Score: 0.891)
+Another placeholder for medical evidence...
+---
+⏱️ Total Time: 1250ms (DEMO)
+---
+⚠️ **NOTE**: This is running in DEMO mode without the full ML models.
+For full functionality, deploy to Hugging Face Spaces or install all dependencies.
+    """.strip()
+    return demo_output
+# Create Gradio interface
+with gr.Blocks(title="MediRAG - Medical AI Demo") as demo:
+    gr.Markdown("""
+    # 🏥 MediRAG 2.0 - DEMO MODE
+    ## Medical Question Answering with Hallucination Detection
+    **⚠️ This is a DEMO version for local testing.**
+    The full version includes:
+    - 107,425+ medical documents in FAISS index
+    - BioBERT embeddings for retrieval
+    - Mistral/Gemini LLM for generation
+    - 4-layer audit system (DeBERTa-v3, SciSpaCy)
+    - Health Risk Score calculation
+    **Deploy to Hugging Face Spaces for full functionality:**
+    https://huggingface.co/spaces/joytheslothh/MediRAG-API
+    """)
+    with gr.Accordion("⚙️ API Configuration (Optional)", open=False):
+        gr.Markdown("""
+        In the full version, provide your API keys for LLM generation:
+        - **Mistral API Key**: https://console.mistral.ai/
+        - **Google API Key**: https://makersuite.google.com/app/apikey
+        """)
+        with gr.Row():
+            mistral_key_input = gr.Textbox(
+                label="Mistral API Key",
+                placeholder="Enter your Mistral API key (full version only)",
+                type="password",
+                value=""
+            )
+            google_key_input = gr.Textbox(
+                label="Google API Key (Gemini)",
+                placeholder="Enter your Google API key (full version only)",
+                type="password",
+                value=""
+            )
+    with gr.Row():
+        with gr.Column():
+            question_input = gr.Textbox(
+                label="Your Medical Question",
+                placeholder="e.g., What are the side effects of metformin?",
+                lines=3
+            )
+            top_k_slider = gr.Slider(
+                minimum=1,
+                maximum=10,
+                value=5,
+                step=1,
+                label="Number of Sources to Retrieve"
+            )
+            submit_btn = gr.Button("🔍 Ask MediRAG (Demo)", variant="primary")
+        with gr.Column():
+            output_text = gr.Markdown(label="Response")
+    submit_btn.click(
+        fn=query_medical,
+        inputs=[question_input, top_k_slider, mistral_key_input, google_key_input],
+        outputs=output_text
+    )
+    gr.Markdown("""
+    ---
+    ### 🚀 How to Run Full Version
+    **Option 1: Hugging Face Spaces (Recommended)**
+    ```
+    1. Visit: https://huggingface.co/spaces/joytheslothh/MediRAG-API
+    2. The full app is already deployed there!
+    ```
+    **Option 2: Local with Docker**
+    ```bash
+    cd Backend
+    docker build -t medirag .
+    docker run -p 7860:7860 medirag
+    ```
+    **Option 3: Local with Virtual Environment**
+    ```bash
+    cd Backend
+    python -m venv venv
+    venv\Scripts\activate
+    pip install -r requirements_hf.txt
+    python -m spacy download en_core_sci_lg
+    python app.py
+    ```
+    ### 🔬 Full System Features
+    - **Faithfulness**: DeBERTa-v3 NLI model checks claim support
+    - **Entity Verification**: SciSpaCy + DrugBank for drug/dosage validation
+    - **Source Credibility**: Ranks evidence by publication tier
+    - **Contradiction Detection**: Internal NLI cross-check for self-contradictions
+    """)
+if __name__ == "__main__":
+    port = int(os.environ.get("PORT", 7860))
+    demo.launch(
+        server_name="0.0.0.0",
+        server_port=port,
+        share=False,
+        show_error=True
+    )

config.yaml ADDED Viewed

	@@ -0,0 +1,71 @@

+retrieval:
+  top_k: 5
+  chunk_size: 512
+  chunk_overlap: 50
+  embedding_model: dmis-lab/biobert-v1.1
+  index_path: data/index/faiss.index
+  metadata_path: data/index/metadata_store.pkl
+modules:
+  faithfulness:
+    nli_model: cnut1648/biolinkbert-mednli
+    entailment_threshold: 0.75
+    max_nli_tokens: 510
+    truncate_side: left          # keep END of context (clinical values appear last)
+    deberta_batch_size: 4        # Colab T4: safe at 8 | CPU 16GB: use 2 | OOM: system retries at 1
+  entity_verifier:
+    spacy_model: en_ner_bc5cdr_md
+    critical_entity_types: [DRUG, DOSAGE]
+    dosage_tolerance_pct: 10     # >10% numerical difference → CRITICAL
+    rxnorm_api_url: https://rxnav.nlm.nih.gov/REST/approximateTerm.json
+    rxnorm_api_timeout_s: 3
+    rxnorm_cache_path: data/rxnorm_cache.csv
+  source_credibility:
+    method: keyword              # "keyword" = demo (FR-11a) | "metadata" = May (FR-11b)
+    # tier weights are defined by name in src/modules/source_credibility.py TIER_WEIGHTS dict
+    # clinical_guideline=1.0, drug_label=0.90, systematic_review=0.85,
+    # research_abstract=0.70, review_article=0.60, clinical_case=0.50, unknown=0.30
+  contradiction:
+    nli_model: cnut1648/biolinkbert-mednli   # same model as faithfulness — load once
+    confidence_threshold: 0.75
+    max_sentence_pairs: 45       # skip if N > 10 sentences, check adjacent + (first,last)
+    deberta_batch_size: 4
+aggregator:
+  weights:
+    faithfulness: 0.35
+    entity_accuracy: 0.20
+    source_credibility: 0.20
+    contradiction_risk: 0.15
+    ragas_composite: 0.10
+  risk_bands:
+    low: [0, 30]
+    moderate: [31, 60]
+    high: [61, 85]
+    critical: [86, 100]
+llm:
+  provider: mistral
+  gemini_api_key: ${GEMINI_API_KEY}
+  mistral_api_key: ${MISTRAL_API_KEY}
+  groq_api_key: ${GROQ_API_KEY}
+  model: mistral-large-latest
+  gemini_model: gemini-2.0-flash
+  groq_model: llama-3.3-70b-versatile
+  base_url: http://localhost:11434
+  timeout_seconds: 120
+  judge_temperature: 0.0
+  generation_temperature: 0.7
+api:
+  host: 0.0.0.0
+  port: 8000
+  max_query_length: 500
+  max_answer_length: 2000
+  max_chunks: 10
+  max_chunk_length: 2000
+logging:
+  level: INFO                    # set to WARNING on demo day
+  file: logs/medirag.log
+  format: "%(asctime)s [%(levelname)s] %(name)s: %(message)s"

conftest.py ADDED Viewed

	@@ -0,0 +1,12 @@

+"""
+conftest.py — project root
+Ensures src/ is on the Python path so all test files can import from src.*
+without needing PYTHONPATH to be set manually. (SRS Section 17)
+"""
+import sys
+import os
+# Add the src/ directory to path so `from modules.faithfulness import ...` works
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "src"))
+# Also add project root so `import src` works
+sys.path.insert(0, os.path.dirname(__file__))

demo/.gitkeep ADDED Viewed

	@@ -0,0 +1 @@


1	+ # placeholder — demo_fallback.json generated by scripts/warmup.py

pytest.ini ADDED Viewed

	@@ -0,0 +1,6 @@

+[pytest]
+testpaths = tests
+python_files = test_*.py
+python_classes = Test*
+python_functions = test_*
+addopts = -v --tb=short

requirements.txt ADDED Viewed

	@@ -0,0 +1,35 @@

+langchain==0.1.20
+langchain-community==0.0.38
+# FIX 1: faiss-cpu 1.7.4 doesn't exist on PyPI — 1.9.0+ has a compatible API
+faiss-cpu>=1.9.0
+# FIX 2: torch 2.2.0 has no Python 3.13 wheels — 2.5.0+ supports Python 3.13
+torch>=2.5.0
+# FIX 3: transformers 4.40.0 may have issues on Python 3.13 — use 4.44+
+transformers>=4.44.0
+sentence-transformers>=2.7.0
+# scispacy + en_core_sci_lg: installed via conda, NOT here (see setup commands below)
+# scispacy 0.5.4 pins scipy<1.11 which has no Python 3.12 pip wheels.
+# Conda has pre-built scipy binaries — use: conda install -c conda-forge scispacy
+ragas==0.1.9
+fastapi==0.110.0
+uvicorn==0.27.0
+# streamlit>=1.35.0  # Removed - using React frontend instead
+pyyaml==6.0.1
+pydantic>=2.9.0        # 2.6.0 has broken pydantic.v1 on Python 3.12 (ForwardRef bug); fixed in 2.9+
+datasets==2.18.0
+pytest==8.1.0
+httpx>=0.27.0,<0.28.0              # starlette 0.36.3 TestClient breaks with httpx 0.28+ (removed app= kwarg)
+pandas>=2.2.0          # 2.2.0 has Python 3.12 wheels (no longer need 2.2.3+)
+numpy>=1.26.4,<2       # langchain 0.1.20 requires numpy<2; use conda env for Python 3.12 (conda pre-builds numpy 1.x)
+requests==2.31.0
+google-genai>=1.0.0                # New Google GenAI SDK (replaces deprecated google-generativeai)
+pysbd>=0.3.4                       # sentence boundary detection (faithfulness module)
+pymupdf>=1.24.0                    # fitz: extract text from PDF
+python-docx>=1.1.0                 # extract text from DOCX
+rank-bm25>=0.2.2                   # keyword search for retriever
+python-multipart>=0.0.12           # handle form data in FastAPI

requirements_hf.txt ADDED Viewed

	@@ -0,0 +1,46 @@

+# MediRAG Backend - Hugging Face Spaces Requirements
+# Optimized for faster builds - relaxed version constraints
+# Core dependencies
+langchain>=0.1.0
+langchain-community>=0.0.30
+# Vector search
+faiss-cpu>=1.9.0
+# ML/DL frameworks
+torch>=2.0.0
+transformers>=4.40.0
+sentence-transformers>=2.5.0
+# Medical NLP - installed in Dockerfile instead
+# scispacy>=0.5.4
+# https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_lg-0.5.4.tar.gz
+# https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_ner_bc5cdr_md-0.5.4.tar.gz
+# Evaluation
+ragas>=0.1.0
+# API framework
+fastapi>=0.110.0
+uvicorn>=0.27.0
+# Hugging Face Spaces - Gradio for API wrapper
+gradio>=4.0.0,<5.0.0
+# Utilities
+pyyaml>=6.0.0
+pydantic>=2.0.0
+datasets>=2.18.0
+pandas>=2.0.0
+numpy>=1.26.0,<2
+requests>=2.30.0
+google-genai>=0.5.0
+pysbd>=0.3.0
+pymupdf>=1.24.0
+python-docx>=1.1.0
+rank-bm25>=0.2.0
+python-multipart>=0.0.12
+# Additional for Hugging Face
+huggingface-hub>=0.20.0

requirements_minimal.txt ADDED Viewed

	@@ -0,0 +1,43 @@

+# MediRAG Backend - FastAPI only (no Gradio)
+# React frontend on Vercel, this is just the API backend
+# Core API
+fastapi>=0.110.0
+uvicorn>=0.27.0
+python-multipart>=0.0.12
+# Data handling
+pydantic>=2.0.0
+pyyaml>=6.0.0
+numpy>=1.26.0,<2
+pandas>=2.0.0
+requests>=2.30.0
+# Essential ML only
+torch --index-url https://download.pytorch.org/whl/cpu
+transformers>=4.40.0
+sentence-transformers>=2.5.0
+faiss-cpu>=1.9.0
+# LLM integrations
+langchain>=0.1.0
+langchain-community>=0.0.30
+google-genai>=0.5.0
+ragas>=0.1.0
+# Hugging Face Hub (for fetching FAISS index at runtime)
+huggingface-hub>=0.20.0
+datasets>=2.18.0
+# File parsing (PDF, DOCX)
+pymupdf>=1.24.0
+python-docx>=1.1.0
+# Medical NLP
+spacy>=3.7.0
+scispacy>=0.5.4
+https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_lg-0.5.4.tar.gz
+https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_ner_bc5cdr_md-0.5.4.tar.gz
+pysbd>=0.3.0
+rank-bm25>=0.2.0

scripts/build_rxnorm_cache.py ADDED Viewed

	@@ -0,0 +1,347 @@

+"""
+FR-20: build_rxnorm_cache.py — Offline Drug Name Normalisation Cache Builder
+=============================================================================
+Accepts EITHER:
+  A) DrugBank vocabulary CSV  (--drugbank-csv)  ← recommended, immediate
+  B) DrugBank Open Data XML   (--drugbank-xml)  ← requires registration at drugbank.com
+DrugBank vocabulary CSV is freely downloadable (no account needed) from:
+  https://go.drugbank.com/releases/latest#open-data  →  "DrugBank Vocabulary"
+Queries RxNorm REST API (single approximateTerm call per drug) and saves
+results to data/rxnorm_cache.csv.
+Runtime:
+    ~14,000 names × 0.1s delay × 1 API call ≈ 24 minutes
+Usage:
+    python scripts/build_rxnorm_cache.py --drugbank-csv "data/drugbank vocabulary.csv"
+    python scripts/build_rxnorm_cache.py --drugbank-csv "data/drugbank vocabulary.csv" --dry-run 50
+    python scripts/build_rxnorm_cache.py --drugbank-xml data/raw/drugbank_open_data.xml
+"""
+from __future__ import annotations
+import argparse
+import csv
+import logging
+import sys
+import time
+import xml.etree.ElementTree as ET
+from pathlib import Path
+import requests
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s [%(levelname)s] %(message)s",
+)
+logger = logging.getLogger("build_rxnorm_cache")
+# RxNorm approximateTerm endpoint — returns rxcui + name in ONE call (v1.4 fix)
+RXNORM_APPROX_URL = "https://rxnav.nlm.nih.gov/REST/approximateTerm.json"
+# DrugBank Open Data XML namespace (XML path only)
+NS = {"db": "http://www.drugbank.ca"}
+# ---------------------------------------------------------------------------
+# Step 1A: Extract drug names from DrugBank Vocabulary CSV  ← preferred
+# ---------------------------------------------------------------------------
+def extract_drug_names_from_csv(csv_path: str) -> list[str]:
+    """
+    Parse the DrugBank vocabulary CSV and return all drug name strings.
+    CSV columns: DrugBank ID | Accession Numbers | Common name | CAS | UNII
+                 | Synonyms | Standard InChI Key
+    Synonyms column is pipe-separated (e.g. "Drug A | Alias B | Trade Name C").
+    Args:
+        csv_path : path to the DrugBank vocabulary CSV file
+    Returns:
+        Sorted deduplicated list of drug name strings.
+    """
+    path = Path(csv_path)
+    if not path.exists():
+        logger.error(
+            "DrugBank vocabulary CSV not found at '%s'. "
+            "Download it from https://go.drugbank.com/releases/latest#open-data "
+            "(look for 'DrugBank Vocabulary' — no account needed).",
+            csv_path,
+        )
+        sys.exit(1)
+    logger.info("Parsing DrugBank vocabulary CSV: %s", path)
+    names: set[str] = set()
+    with open(path, "r", encoding="utf-8") as f:
+        reader = csv.DictReader(f)
+        for row in reader:
+            # Common name
+            common = row.get("Common name", "").strip()
+            if common:
+                names.add(common)
+            # Pipe-separated synonyms
+            synonyms_raw = row.get("Synonyms", "")
+            if synonyms_raw:
+                for syn in synonyms_raw.split("|"):
+                    syn = syn.strip()
+                    if syn:
+                        names.add(syn)
+    result = sorted(names)
+    logger.info("Extracted %d unique drug names/synonyms from CSV", len(result))
+    return result
+# ---------------------------------------------------------------------------
+# Step 1B: Extract drug names from DrugBank Open Data XML  ← needs account
+# ---------------------------------------------------------------------------
+def extract_drug_names_from_xml(xml_path: str) -> list[str]:
+    """
+    Parse DrugBank Open Data XML and extract all drug names + synonyms.
+    Args:
+        xml_path : Path to drugbank_open_data.xml
+    Returns:
+        Sorted deduplicated list of drug name strings.
+    """
+    logger.info("Parsing DrugBank XML: %s", xml_path)
+    try:
+        tree = ET.parse(xml_path)
+    except FileNotFoundError:
+        logger.error(
+            "DrugBank XML not found at '%s'. "
+            "Download it from https://go.drugbank.com/releases/latest#open-data "
+            "(free academic registration required), or use --drugbank-csv instead.",
+            xml_path,
+        )
+        sys.exit(1)
+    except ET.ParseError as exc:
+        logger.error("Failed to parse DrugBank XML: %s", exc)
+        sys.exit(1)
+    root = tree.getroot()
+    names: set[str] = set()
+    for drug in root.findall("db:drug", NS):
+        name_el = drug.find("db:name", NS)
+        if name_el is not None and name_el.text:
+            names.add(name_el.text.strip())
+        for syn in drug.findall("db:synonyms/db:synonym", NS):
+            if syn.text:
+                names.add(syn.text.strip())
+        for brand in drug.findall(
+            "db:international-brands/db:international-brand/db:name", NS
+        ):
+            if brand.text:
+                names.add(brand.text.strip())
+    result = sorted(names)
+    logger.info("Extracted %d unique drug names/synonyms from XML", len(result))
+    return result
+# ---------------------------------------------------------------------------
+# Step 2: Query RxNorm (single API call per drug — v1.4)
+# ---------------------------------------------------------------------------
+def query_rxnorm(drug_name: str, timeout: int = 5) -> tuple[str, str]:
+    """
+    Look up a drug name in RxNorm using approximateTerm endpoint.
+    Returns (rxcui, canonical_name).  Returns ("", "") on any failure.
+    Uses /approximateTerm — single HTTP call returning both rxcui and name.
+    (Previous 2-call approach was replaced in v1.4, cutting runtime by ~50%.)
+    """
+    try:
+        resp = requests.get(
+            RXNORM_APPROX_URL,
+            params={"term": drug_name, "maxEntries": "1", "option": "1"},
+            timeout=timeout,
+        )
+        if resp.status_code != 200:
+            return "", ""
+        candidates: list[dict] = (
+            resp.json()
+            .get("approximateGroup", {})
+            .get("candidate", [])
+        )
+        if not candidates:
+            return "", ""
+        rxcui = candidates[0].get("rxcui", "")
+        name  = candidates[0].get("name", drug_name)   # fallback to input
+        return rxcui, name
+    except Exception:
+        return "", ""
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description="Build offline RxNorm cache from DrugBank data (FR-20)"
+    )
+    source = parser.add_mutually_exclusive_group()
+    source.add_argument(
+        "--drugbank-csv",
+        metavar="PATH",
+        default=None,
+        help=(
+            "Path to DrugBank vocabulary CSV  [RECOMMENDED — no account needed]. "
+            "Download from https://go.drugbank.com/releases/latest#open-data"
+        ),
+    )
+    source.add_argument(
+        "--drugbank-xml",
+        metavar="PATH",
+        default=None,
+        help="Path to DrugBank Open Data XML (requires free academic registration).",
+    )
+    parser.add_argument(
+        "--output-csv",
+        default="data/rxnorm_cache.csv",
+        help="Path for output CSV",
+    )
+    parser.add_argument(
+        "--delay",
+        type=float,
+        default=0.1,
+        help="Seconds to wait between API calls (default 0.1 — ~24 min total)",
+    )
+    parser.add_argument(
+        "--dry-run",
+        type=int,
+        default=0,
+        metavar="N",
+        help="Only process first N drug names (for testing)",
+    )
+    parser.add_argument(
+        "--resume",
+        action="store_true",
+        help=(
+            "Resume a previously interrupted run. Reads already-completed entries "
+            "from --output-csv and skips them, appending only the missing ones."
+        ),
+    )
+    args = parser.parse_args()
+    # ------------------------------------------------------------------
+    # Auto-detect source if neither flag was given
+    # ------------------------------------------------------------------
+    csv_default = "data/drugbank vocabulary.csv"
+    xml_default = "data/raw/drugbank_open_data.xml"
+    if args.drugbank_csv:
+        drug_names = extract_drug_names_from_csv(args.drugbank_csv)
+    elif args.drugbank_xml:
+        drug_names = extract_drug_names_from_xml(args.drugbank_xml)
+    elif Path(csv_default).exists():
+        logger.info("Auto-detected DrugBank vocabulary CSV at '%s'", csv_default)
+        drug_names = extract_drug_names_from_csv(csv_default)
+    elif Path(xml_default).exists():
+        logger.info("Auto-detected DrugBank XML at '%s'", xml_default)
+        drug_names = extract_drug_names_from_xml(xml_default)
+    else:
+        logger.error(
+            "No DrugBank source found. Pass --drugbank-csv or --drugbank-xml. "
+            "See script docstring for download links."
+        )
+        sys.exit(1)
+    if args.dry_run > 0:
+        drug_names = drug_names[: args.dry_run]
+        logger.info("Dry-run mode: processing %d names only", len(drug_names))
+    # ------------------------------------------------------------------
+    # Resume: skip names already in the output CSV
+    # ------------------------------------------------------------------
+    out_path = Path(args.output_csv)
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    already_done: set[str] = set()
+    if args.resume and out_path.exists():
+        try:
+            with open(out_path, "r", encoding="utf-8") as f:
+                reader = csv.DictReader(f)
+                for row in reader:
+                    name = row.get("drug_name", "").strip()
+                    if name:
+                        already_done.add(name)
+            logger.info(
+                "Resume mode: %d entries already in cache — skipping these.",
+                len(already_done),
+            )
+        except Exception as exc:
+            logger.warning("Could not read existing cache for resume: %s", exc)
+            already_done = set()
+    remaining = [n for n in drug_names if n not in already_done]
+    skipped = len(drug_names) - len(remaining)
+    if skipped:
+        logger.info("Skipping %d already-resolved names. %d remaining.", skipped, len(remaining))
+    total = len(remaining)
+    if total == 0:
+        logger.info("Nothing to do — cache is already complete.")
+        sys.exit(0)
+    est_minutes = total * (args.delay + 0.05) / 60
+    logger.info(
+        "Starting cache build: %d names to process, delay=%.2fs, estimated %.0f minutes",
+        total, args.delay, est_minutes,
+    )
+    # ------------------------------------------------------------------
+    # Write CSV — append if resuming, overwrite otherwise
+    # ------------------------------------------------------------------
+    file_mode = "a" if args.resume and out_path.exists() and already_done else "w"
+    write_header = file_mode == "w"
+    found = len(already_done)  # count previously resolved entries too
+    new_found = 0
+    with open(out_path, file_mode, newline="", encoding="utf-8") as f:
+        writer = csv.writer(f)
+        if write_header:
+            writer.writerow(["drug_name", "rxcui", "canonical_name"])
+        for i, name in enumerate(remaining):
+            rxcui, canonical = query_rxnorm(name)
+            writer.writerow([name, rxcui, canonical])
+            if rxcui:
+                new_found += 1
+                found += 1
+            if i % 25 == 0 or i == total - 1:
+                pct = 100 * (i + 1) / total
+                logger.info(
+                    "Progress: %d/%d (%.1f%%) — %d resolved this run (%d total)",
+                    i + 1, total, pct, new_found, found,
+                )
+            time.sleep(args.delay)
+    logger.info(
+        "Cache saved to %s — %d/%d names resolved to RxNorm IDs (this run: +%d)",
+        out_path, found, len(drug_names), new_found,
+    )
+    logger.info(
+        "Commit this file to the repo: git add %s && git commit -m 'Add RxNorm cache'",
+        out_path,
+    )
+if __name__ == "__main__":
+    main()

scripts/debug_pmc.py ADDED Viewed

	@@ -0,0 +1,54 @@

+import requests, re
+from lxml import html
+r = requests.get(
+    'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC10725812/',
+    headers={'User-Agent': 'Mozilla/5.0'},
+    timeout=15
+)
+tree = html.fromstring(r.content)
+# Find main article body — skip nav/header
+article = tree.xpath('//article') or tree.xpath('//*[@role="main"]') or tree.xpath('//div[@class="article"]')
+root = article[0] if article else tree
+print('Using root:', root.tag, root.get('class','')[:40])
+# Find all sections with their h2/h3 and paragraphs
+sections = root.xpath('.//section')
+print(f'\nTotal sections: {len(sections)}')
+# Show first Recommendations section content
+for sec in sections:
+    h3 = sec.xpath('.//h3')
+    if h3 and 'Recommendation' in h3[0].text_content():
+        print('\n--- RECOMMENDATIONS SECTION ---')
+        print('H3:', h3[0].text_content().strip())
+        # Get all list items and paragraphs in this section
+        items = sec.xpath('.//li | .//p')
+        for item in items[:8]:
+            t = item.text_content().strip()
+            if t and len(t) > 20:
+                print(' TEXT:', t[:200])
+        break
+# Check how rec numbers look — find paragraphs starting with N.N pattern
+all_p = root.xpath('.//p')
+print('\n--- PARAGRAPHS WITH REC NUMBERS ---')
+rec_re = re.compile(r'^\s*\d+\.\d+[a-z]?\s+\w')
+count = 0
+for p in all_p:
+    t = p.text_content().strip()
+    if rec_re.match(t):
+        print(' REC:', t[:200])
+        count += 1
+        if count >= 5:
+            break
+# Show structure of first H2 section
+print('\n--- FIRST H2 SECTION STRUCTURE ---')
+h2_secs = root.xpath('.//section[.//h2]')
+if h2_secs:
+    sec = h2_secs[0]
+    print('H2:', sec.xpath('.//h2')[0].text_content().strip()[:60])
+    children = list(sec)
+    print('Direct children tags:', [c.tag for c in children[:10]])

scripts/download_dailymed.py ADDED Viewed

	@@ -0,0 +1,259 @@

+"""
+scripts/download_dailymed.py
+============================
+Downloads FDA DailyMed drug labels for common clinical drugs via the
+DailyMed API and saves them as chunks.jsonl ready for ingestion into
+the MediRAG FAISS index.
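+Sections extracted per drug (see SECTION_CODES):
+  - DOSAGE AND ADMINISTRATION
+  - CONTRAINDICATIONS
+  - WARNINGS AND PRECAUTIONS
+  - INDICATIONS AND USAGE
+  - DRUG INTERACTIONS
+  - WARNINGS
+  - ADVERSE REACTIONS
+  - OVERDOSAGE
+  - USE IN SPECIFIC POPULATIONS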
+Usage:
+    python scripts/download_dailymed.py
+    python scripts/download_dailymed.py --drugs metformin aspirin warfarin
+    python scripts/download_dailymed.py --output data/dailymed_chunks.jsonl
+"""
+from __future__ import annotations
+import argparse
+import json
+import logging
+import time
+import xml.etree.ElementTree as ET
+from pathlib import Path
+import requests
+logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
+logger = logging.getLogger(__name__)
+# ---------------------------------------------------------------------------
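+# Roughly 200 common clinical drugs (priority list). A few names repeat across
+# therapeutic groups; the entry point de-duplicates the list before downloading.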
+# ---------------------------------------------------------------------------
+TOP_DRUGS = [
+    "metformin", "atorvastatin", "lisinopril", "levothyroxine", "amlodipine",
+    "omeprazole", "metoprolol", "albuterol", "losartan", "gabapentin",
+    "sertraline", "simvastatin", "montelukast", "pantoprazole", "alprazolam",
+    "furosemide", "escitalopram", "rosuvastatin", "acetaminophen", "ibuprofen",
+    "amoxicillin", "azithromycin", "doxycycline", "prednisone", "warfarin",
+    "clopidogrel", "aspirin", "tamsulosin", "insulin glargine", "glipizide",
+    "hydrochlorothiazide", "amlodipine", "venlafaxine", "bupropion", "duloxetine",
+    "clonazepam", "lorazepam", "zolpidem", "quetiapine", "aripiprazole",
+    "olanzapine", "risperidone", "fluoxetine", "paroxetine", "citalopram",
+    "tramadol", "oxycodone", "morphine", "fentanyl", "naloxone",
+    "ciprofloxacin", "levofloxacin", "clindamycin", "metronidazole", "trimethoprim",
+    "enalapril", "ramipril", "carvedilol", "bisoprolol", "digoxin",
+    "spironolactone", "diltiazem", "verapamil", "nifedipine", "hydralazine",
+    "nitroglycerin", "isosorbide", "clopidogrel", "apixaban", "rivaroxaban",
+    "dabigatran", "heparin", "enoxaparin", "atorvastatin", "pravastatin",
+    "ezetimibe", "fenofibrate", "niacin", "gemfibrozil", "cholestyramine",
+    "allopurinol", "colchicine", "indomethacin", "naproxen", "celecoxib",
+    "hydroxychloroquine", "methotrexate", "leflunomide", "sulfasalazine",
+    "prednisolone", "dexamethasone", "budesonide", "fluticasone", "beclomethasone",
+    "ipratropium", "tiotropium", "salmeterol", "formoterol", "theophylline",
+    "insulin aspart", "insulin lispro", "sitagliptin", "saxagliptin", "empagliflozin",
+    "canagliflozin", "dapagliflozin", "liraglutide", "exenatide", "pioglitazone",
+    "acarbose", "repaglinide", "nateglinide", "glimepiride", "glyburide",
+    "levothyroxine", "methimazole", "propylthiouracil", "calcitonin", "alendronate",
+    "risedronate", "ibandronate", "denosumab", "teriparatide", "raloxifene",
+    "tamoxifen", "letrozole", "anastrozole", "exemestane", "fulvestrant",
+    "rituximab", "trastuzumab", "bevacizumab", "imatinib", "erlotinib",
+    "ondansetron", "metoclopramide", "promethazine", "prochlorperazine",
+    "loperamide", "bismuth subsalicylate", "lactulose", "polyethylene glycol",
+    "docusate", "senna", "mesalamine", "sulfasalazine", "infliximab",
+    "adalimumab", "etanercept", "ustekinumab", "secukinumab",
+    "acyclovir", "valacyclovir", "oseltamivir", "ribavirin", "sofosbuvir",
+    "fluconazole", "itraconazole", "voriconazole", "amphotericin b",
+    "vancomycin", "linezolid", "daptomycin", "meropenem", "piperacillin",
+    "phenytoin", "valproic acid", "carbamazepine", "levetiracetam", "lamotrigine",
+    "topiramate", "oxcarbazepine", "lacosamide", "brivaracetam",
+    "donepezil", "memantine", "rivastigmine", "galantamine",
+    "carbidopa levodopa", "pramipexole", "ropinirole", "rasagiline", "selegiline",
+    "baclofen", "tizanidine", "cyclobenzaprine", "methocarbamol",
+    "sildenafil", "tadalafil", "vardenafil", "finasteride", "dutasteride",
+    "testosterone", "estradiol", "progesterone", "medroxyprogesterone",
+    "methylphenidate", "amphetamine", "atomoxetine", "guanfacine", "clonidine",
+]
+# DailyMed sections we care about (LOINC codes)
+SECTION_CODES = {
+    "34068-7": "DOSAGE AND ADMINISTRATION",
+    "34070-3": "CONTRAINDICATIONS",
+    "43685-7": "WARNINGS AND PRECAUTIONS",
+    "34067-9": "INDICATIONS AND USAGE",
+    "34073-7": "DRUG INTERACTIONS",
+    "34071-1": "WARNINGS",
+    "34084-4": "ADVERSE REACTIONS",
+    "34088-5": "OVERDOSAGE",
+    "34080-2": "USE IN SPECIFIC POPULATIONS",
+}
+DAILYMED_API = "https://dailymed.nlm.nih.gov/dailymed/services/v2"
+def search_drug(drug_name: str) -> str | None:
+    """Return the SPL set_id for the first matching drug label."""
+    try:
+        r = requests.get(
+            f"{DAILYMED_API}/spls.json",
+            params={"drug_name": drug_name, "pagesize": 1},
+            timeout=10,
+        )
+        r.raise_for_status()
+        data = r.json()
+        results = data.get("data", [])
+        if results:
+            return results[0].get("setid")
+    except Exception as e:
+        logger.warning("Search failed for '%s': %s", drug_name, e)
+    return None
+def fetch_label_xml(set_id: str) -> str | None:
+    """Download the full SPL XML for a given set_id."""
+    try:
+        r = requests.get(
+            f"{DAILYMED_API}/spls/{set_id}.xml",
+            timeout=15,
+        )
+        r.raise_for_status()
+        return r.text
+    except Exception as e:
+        logger.warning("XML fetch failed for set_id '%s': %s", set_id, e)
+    return None
+def extract_sections(xml_text: str, drug_name: str, set_id: str = "unknown") -> list[dict]:
+    """Parse SPL XML and extract clinical sections as chunk dicts."""
+    chunks = []
+    try:
+        root = ET.fromstring(xml_text)
+        ns = {"hl7": "urn:hl7-org:v3"}
+        # Get brand/generic name from XML
+        title_el = root.find(".//hl7:title", ns)
+        label_title = title_el.text.strip() if title_el is not None and title_el.text else drug_name.title()
+        for section in root.findall(".//hl7:section", ns):
+            code_el = section.find("hl7:code", ns)
+            if code_el is None:
+                continue
+            code = code_el.get("code", "")
+            section_name = SECTION_CODES.get(code)
+            if not section_name:
+                continue
+            # Extract text — handle tables specially so row data isn't lost
+            texts = []
+            for el in section.iter("{urn:hl7-org:v3}text"):
+                # Extract tables as readable rows before falling back to itertext
+                for table in el.findall(".//{urn:hl7-org:v3}table"):
+                    rows = []
+                    for tr in table.iter("{urn:hl7-org:v3}tr"):
+                        cells = [" ".join(td.itertext()).strip()
+                                 for td in tr.iter("{urn:hl7-org:v3}td")]
+                        if not cells:
+                            cells = [" ".join(th.itertext()).strip()
+                                     for th in tr.iter("{urn:hl7-org:v3}th")]
+                        row = " | ".join(c for c in cells if c)
+                        if row:
+                            rows.append(row)
+                    if rows:
+                        texts.append(" ; ".join(rows))
+                    # Remove table from tree to avoid double-counting via itertext
+                    el.remove(table) if table in list(el) else None
+                # Non-table text
+                text = " ".join(el.itertext()).strip()
+                if text:
+                    texts.append(text)
+            full_text = " ".join(texts).strip()
+            if len(full_text) < 50:
+                continue
+            # Truncate to 1500 chars per chunk (BioBERT max ~512 tokens)
+            for i in range(0, min(len(full_text), 6000), 1500):
+                segment = full_text[i:i+1500].strip()
+                if len(segment) < 50:
+                    continue
+                chunk_id = f"fda_{drug_name.replace(' ', '_')}_{set_id}_{code}_{i}"
+                chunks.append({
+                    "chunk_id":     chunk_id,
+                    "doc_id":       f"fda_{drug_name.replace(' ', '_')}_{set_id}",
+                    "chunk_text":   f"[FDA DailyMed | {drug_name.title()} | {section_name}] {drug_name.title()} {section_name}: {segment}",
+                    "chunk_index":  i // 1500,
+                    "total_chunks": max(1, min(4, len(full_text) // 1500 + 1)),
+                    "pub_type":     "drug_label",
+                    "source":       "FDA DailyMed",
+                    "title":        f"{label_title} — {section_name}",
+                    "pub_year":     2024,
+                    "journal":      "FDA DailyMed",
+                    "drug_name":    drug_name,
+                    "section":      section_name,
+                })
+    except ET.ParseError as e:
+        logger.warning("XML parse error for '%s': %s", drug_name, e)
+    return chunks
+def download_dailymed(drug_list: list[str], output_path: str) -> None:
+    out = Path(output_path)
+    out.parent.mkdir(parents=True, exist_ok=True)
+    total_chunks = 0
+    failed = []
+    with open(out, "w", encoding="utf-8") as f:
+        for i, drug in enumerate(drug_list):
+            logger.info("[%d/%d] Processing: %s", i + 1, len(drug_list), drug)
+            set_id = search_drug(drug)
+            if not set_id:
+                logger.warning("  No DailyMed entry found for '%s'", drug)
+                failed.append(drug)
+                time.sleep(0.3)
+                continue
+            xml_text = fetch_label_xml(set_id)
+            if not xml_text:
+                failed.append(drug)
+                time.sleep(0.3)
+                continue
+            chunks = extract_sections(xml_text, drug, set_id=set_id)
+            for chunk in chunks:
+                f.write(json.dumps(chunk) + "\n")
+            total_chunks += len(chunks)
+            logger.info("  → %d chunks extracted (set_id: %s)", len(chunks), set_id)
+            time.sleep(0.4)  # Be polite to the API
+    logger.info("Done. %d total chunks written to %s", total_chunks, out)
+    if failed:
+        logger.warning("Failed drugs (%d): %s", len(failed), ", ".join(failed))
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--drugs", nargs="*", default=None,
+                        help="Specific drug names (default: full TOP_DRUGS list)")
+    parser.add_argument("--output", default="data/dailymed_chunks.jsonl",
+                        help="Output JSONL path")
+    parser.add_argument("--limit", type=int, default=None,
+                        help="Limit number of drugs to download")
+    args = parser.parse_args()
+    drug_list = args.drugs or TOP_DRUGS
+    # Deduplicate while preserving order
+    seen: set[str] = set()
+    drug_list = [d for d in drug_list if not (d in seen or seen.add(d))]
+    if args.limit:
+        drug_list = drug_list[:args.limit]
+    logger.info("Downloading DailyMed labels for %d drugs...", len(drug_list))
+    download_dailymed(drug_list, args.output)

scripts/download_guidelines.py ADDED Viewed

	@@ -0,0 +1,399 @@

+"""
+scripts/download_guidelines.py
+================================
+Downloads clinical guidelines from PubMed Central (PMC) open-access API
+and chunks them for ingestion into the MediRAG FAISS index.
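+Sources (see GUIDELINE_SOURCES below):
+  - ADA Standards of Medical Care in Diabetes 2024 (12 sections configured, fetched from PMC)
+  - 2018 ACC/AHA Guideline on Management of Blood Cholesterol
+  - 2019 ACC/AHA Guideline on Primary Prevention of Cardiovascular Disease
+  - More guidelines can be added to GUIDELINE_SOURCES below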
+Chunking strategy (based on structural analysis):
+  - Primary boundary: H2 clinical topic + Recommendations block + evidence narrative
+  - Never split a Recommendations block
+  - Store evidence grades (A/B/C/E) and recommendation numbers as metadata
+Usage:
+    python scripts/download_guidelines.py
+    python scripts/download_guidelines.py --source ada_diabetes
+    python scripts/download_guidelines.py --dry-run
+"""
+from __future__ import annotations
+import argparse
+import json
+import logging
+import re
+import time
+import uuid
+from pathlib import Path
+import requests
+logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
+logger = logging.getLogger(__name__)
+# ---------------------------------------------------------------------------
+# Guideline sources — PMC IDs for the ADA 2024 Standards of Care and the ACC/AHA guidelines
+# ---------------------------------------------------------------------------
+GUIDELINE_SOURCES = {
+    "ada_diabetes": {
+        "name": "ADA Standards of Medical Care in Diabetes 2024",
+        "key": "ada",
+        "pub_type": "clinical_guideline",
+        "source": "American Diabetes Association",
+        "pub_year": 2024,
+        "journal": "Diabetes Care",
+        "sections": [
+            {"pmcid": "PMC10725812", "section": "2", "title": "Diagnosis and Classification of Diabetes"},
+            {"pmcid": "PMC10725809", "section": "4", "title": "Comprehensive Medical Evaluation and Assessment of Comorbidities"},
+            {"pmcid": "PMC10725816", "section": "5", "title": "Facilitating Positive Health Behaviors and Well-being"},
+            {"pmcid": "PMC10725808", "section": "6", "title": "Glycemic Goals and Hypoglycemia"},
+            {"pmcid": "PMC10725813", "section": "7", "title": "Diabetes Technology"},
+            {"pmcid": "PMC10725806", "section": "8", "title": "Obesity and Weight Management for the Prevention and Treatment of Type 2 Diabetes"},
+            {"pmcid": "PMC10725810", "section": "9", "title": "Pharmacologic Approaches to Glycemic Treatment"},
+            {"pmcid": "PMC10725804", "section": "13", "title": "Older Adults"},
+            {"pmcid": "PMC10725814", "section": "14", "title": "Children and Adolescents"},
+            {"pmcid": "PMC10725801", "section": "15", "title": "Management of Diabetes in Pregnancy"},
+            {"pmcid": "PMC10725815", "section": "16", "title": "Diabetes Care in the Hospital"},
+            {"pmcid": "PMC10725798", "section": "1", "title": "Improving Care and Promoting Health in Populations"},
+        ],
+    },
+    "acc_aha_cholesterol": {
+        "name": "2018 ACC/AHA Guideline on Management of Blood Cholesterol",
+        "key": "acc_aha_chol",
+        "pub_type": "clinical_guideline",
+        "source": "American College of Cardiology/American Heart Association",
+        "pub_year": 2018,
+        "journal": "Circulation",
+        "sections": [
+            # PMC7403606: Grundy et al. 2018 executive summary, freely accessible full text
+            {"pmcid": "PMC7403606", "section": "1", "title": "Management of Blood Cholesterol — Statin Therapy and LDL Targets"},
+        ],
+    },
+    "acc_aha_prevention": {
+        "name": "2019 ACC/AHA Guideline on Primary Prevention of Cardiovascular Disease",
+        "key": "acc_aha_prev",
+        "pub_type": "clinical_guideline",
+        "source": "American College of Cardiology/American Heart Association",
+        "pub_year": 2019,
+        "journal": "Journal of the American College of Cardiology",
+        "sections": [
+            # PMC7685565: Arnett et al. 2019, full guideline open access
+            {"pmcid": "PMC7685565", "section": "1", "title": "Primary Prevention — Blood Pressure, Cholesterol, Aspirin, Lifestyle"},
+        ],
+    },
+}
+PMC_API = "https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_json/{pmcid}/unicode"
+PMC_EFETCH = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
+# Evidence grade pattern: single letter A/B/C/E at end of recommendation
+_GRADE_RE = re.compile(r'\b([ABCE])\s*$')
+# Recommendation number pattern: e.g. "9.18", "2.1a", "6.5b"
+_REC_NUM_RE = re.compile(r'^(\d+\.\d+[a-z]?)\s+')
+PMC_HTML_URL = "https://www.ncbi.nlm.nih.gov/pmc/articles/{pmcid}/"
+def fetch_pmc_xml(pmcid: str) -> str | None:
+    """Fetch PMC article HTML page and extract clean structured text."""
+    try:
+        from lxml import html as lxml_html
+        url = PMC_HTML_URL.format(pmcid=pmcid)
+        r = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=30)
+        r.raise_for_status()
+        return _extract_pmc_html_text(lxml_html.fromstring(r.content))
+    except Exception as e:
+        logger.warning("PMC HTML fetch failed for %s: %s", pmcid, e)
+        return None
+def _extract_pmc_html_text(tree) -> str:
+    """
+    Extract clean structured text from PMC article HTML.
+    Uses lxml XPath to navigate the <article> element.
+    Deduplicates recommendation paragraphs (PMC renders them twice).
+    """
+    # Get main article element
+    articles = tree.xpath('//article')
+    root = articles[0] if articles else tree
+    lines = []
+    seen_texts: set[str] = set()  # Deduplication for repeated elements
+    def clean(el) -> str:
+        return " ".join(el.text_content().split()).strip()
+    def add_line(text: str) -> None:
+        if text and len(text) > 10 and text not in seen_texts:
+            seen_texts.add(text)
+            lines.append(text)
+    def extract_table(table_el):
+        """Extract a table element as readable pipe-separated rows."""
+        caption = table_el.xpath('.//caption')
+        if caption:
+            add_line(f"[Table: {clean(caption[0])}]")
+        for tr in table_el.xpath('.//tr'):
+            cells = [" ".join(td.text_content().split()).strip()
+                     for td in tr.xpath('.//td | .//th')]
+            row = " | ".join(c for c in cells if c)
+            if row:
+                add_line(row)
+    def process_section(sec, depth=0):
+        # Deep-search for tables first (they may be nested inside divs/figures)
+        for table in sec.xpath('.//table'):
+            # Only process tables whose nearest section ancestor is this sec
+            ancestors = table.xpath('ancestor::section')
+            if not ancestors or ancestors[-1] == sec:
+                extract_table(table)
+        for child in sec:
+            tag = child.tag.lower() if isinstance(child.tag, str) else ""
+            if tag in ("h1", "h2", "h3", "h4"):
+                text = clean(child)
+                if text and text not in ("Abstract", "References", "Footnotes"):
+                    lines.append(f"\n{'#' * (depth + 2)} {text}")
+            elif tag == "p":
+                text = clean(child)
+                add_line(text)
+            elif tag in ("ul", "ol"):
+                for li in child.xpath('.//li'):
+                    text = clean(li)
+                    add_line(f"• {text}")
+            elif tag == "section":
+                process_section(child, depth + 1)
+            elif tag == "table":
+                pass  # Already handled above via deep-search
+            elif tag == "div":
+                # Recurse into divs that might contain content
+                cls = child.get("class", "")
+                if any(k in cls for k in ("content", "body", "text", "article")):
+                    process_section(child, depth)
+    for sec in root.xpath('.//section'):
+        # Only process top-level sections (not deeply nested)
+        parent = sec.getparent()
+        if parent is not None and parent.tag.lower() not in ("section",):
+            process_section(sec)
+    # If no sections found, fall back to all paragraphs
+    if len(lines) < 5:
+        for p in root.xpath('.//article//p | .//p[@class]'):
+            add_line(clean(p))
+    return "\n\n".join(l for l in lines if l.strip())
+def extract_recommendations(text: str) -> list[dict]:
+    """Extract individual recommendations with their numbers and grades."""
+    recs = []
+    for line in text.split('\n'):
+        line = line.strip()
+        m = _REC_NUM_RE.match(line)
+        if m:
+            rec_num = m.group(1)
+            rec_text = line[m.end():].strip()
+            grade_m = _GRADE_RE.search(rec_text)
+            grade = grade_m.group(1) if grade_m else "E"
+            recs.append({"number": rec_num, "text": rec_text, "grade": grade})
+    return recs
+def chunk_guideline_text(
+    text: str,
+    section_meta: dict,
+    guideline_meta: dict,
+    max_chunk_chars: int = 2000,
+) -> list[dict]:
+    """
+    Chunk guideline text at ## heading boundaries produced by _extract_pmc_html_text.
+    Each chunk = H2/H3 topic + its paragraphs/recommendations.
+    """
+    chunks = []
+    section_num = section_meta["section"]
+    section_title = section_meta["title"]
+    guideline_name = guideline_meta["name"]
+    source = guideline_meta["source"]
+    pub_year = guideline_meta["pub_year"]
+    pub_type = guideline_meta["pub_type"]
+    source_key = guideline_meta.get("key", "ada")
+    journal = guideline_meta.get("journal", "Diabetes Care")
+    # Split text into blocks at any ## heading
+    # Each block starts with a heading line and contains the following paragraphs
+    _HEADING_RE = re.compile(r'^(#{1,4})\s+(.+)$', re.MULTILINE)
+    # Find all heading positions
+    heading_matches = list(_HEADING_RE.finditer(text))
+    if not heading_matches:
+        # No headings found — chunk by size
+        blocks = [(section_title, text)]
+    else:
+        blocks = []
+        for i, m in enumerate(heading_matches):
+            heading_text = m.group(2).strip()
+            # Skip metadata headings
+            if heading_text in ("Abstract", "References", "Footnotes", "Author notes",
+                                "Conflicts of interest", "Acknowledgments"):
+                continue
+            start = m.end()
+            end = heading_matches[i + 1].start() if i + 1 < len(heading_matches) else len(text)
+            content = text[start:end].strip()
+            if content:
+                blocks.append((heading_text, content))
+    def make_chunk(heading: str, content: str, part_idx: int = 0) -> dict:
+        recs = extract_recommendations(content)
+        rec_nums = [r["number"] for r in recs]
+        grades = {r["number"]: r["grade"] for r in recs}
+        grade_summary = "/".join(sorted(set(r["grade"] for r in recs))) if recs else ""
+        prefix = f"[{guideline_name} | Section {section_num}: {section_title} | {heading}]"
+        if grade_summary:
+            prefix += f" [Evidence: {grade_summary}]"
+        return {
+            "chunk_id":           f"guideline_{source_key}_{section_num}_{uuid.uuid4().hex[:8]}_{part_idx}",
+            "doc_id":             f"guideline_{source_key}_section_{section_num}",
+            "chunk_text":         f"{prefix}\n{content}",
+            "chunk_index":        len(chunks),
+            "total_chunks":       0,
+            "pub_type":           pub_type,
+            "source":             source,
+            "title":              f"{guideline_name} — Section {section_num}: {heading}",
+            "pub_year":           pub_year,
+            "journal":            journal,
+            "section_number":     section_num,
+            "section_title":      section_title,
+            "h2_heading":         heading,
+            "recommendation_numbers": rec_nums,
+            "evidence_grades":    grades,
+        }
+    for heading, content in blocks:
+        if len(content) <= max_chunk_chars:
+            chunks.append(make_chunk(heading, content))
+        else:
+            # Split long blocks at paragraph boundaries
+            paras = [p.strip() for p in re.split(r'\n{2,}', content) if p.strip()]
+            current: list[str] = []
+            part = 0
+            for para in paras:
+                current.append(para)
+                if len("\n\n".join(current)) >= max_chunk_chars:
+                    chunks.append(make_chunk(heading, "\n\n".join(current[:-1]), part))
+                    current = [para]
+                    part += 1
+            if current:
+                chunks.append(make_chunk(heading, "\n\n".join(current), part))
+    for chunk in chunks:
+        chunk["total_chunks"] = len(chunks)
+    return chunks
+def download_guidelines(source_key: str, output_path: str, dry_run: bool = False) -> None:
+    source = GUIDELINE_SOURCES[source_key]
+    out = Path(output_path)
+    out.parent.mkdir(parents=True, exist_ok=True)
+    total_chunks = 0
+    failed_sections = []
+    with open(out, "w", encoding="utf-8") as f:
+        for section in source["sections"]:
+            pmcid = section["pmcid"]
+            logger.info("Fetching %s — Section %s: %s", pmcid, section["section"], section["title"])
+            text = fetch_pmc_xml(pmcid)
+            if not text or len(text) < 200:
+                logger.warning("No text retrieved for %s — skipping", pmcid)
+                failed_sections.append(section["title"])
+                time.sleep(0.5)
+                continue
+            logger.info("  Retrieved %d chars", len(text))
+            chunks = chunk_guideline_text(text, section, source)
+            logger.info("  → %d chunks extracted", len(chunks))
+            if dry_run:
+                if chunks:
+                    logger.info("  Sample chunk:\n%s\n...", chunks[0]["chunk_text"][:300])
+                continue
+            for chunk in chunks:
+                f.write(json.dumps(chunk) + "\n")
+            total_chunks += len(chunks)
+            time.sleep(0.5)  # Be polite to NCBI API
+    if not dry_run:
+        logger.info("Done. %d total chunks written to %s", total_chunks, out)
+    if failed_sections:
+        logger.warning("Failed sections: %s", failed_sections)
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--source", default=None,
+                        choices=list(GUIDELINE_SOURCES.keys()),
+                        help="Guideline source to download (default: all sources)")
+    parser.add_argument("--all", action="store_true",
+                        help="Download all guideline sources")
+    parser.add_argument("--output", default="data/guidelines_chunks.jsonl")
+    parser.add_argument("--dry-run", action="store_true",
+                        help="Fetch and parse but don't write output")
+    args = parser.parse_args()
+    sources_to_run = list(GUIDELINE_SOURCES.keys()) if (args.all or args.source is None) else [args.source]
+    for source_key in sources_to_run:
+        logger.info("Downloading: %s", GUIDELINE_SOURCES[source_key]["name"])
+        # For multi-source runs, append non-ada sources to the same output file
+        if source_key == sources_to_run[0]:
+            download_guidelines(source_key, args.output, dry_run=args.dry_run)
+        else:
+            # Append to existing file by re-opening in append mode
+            out = Path(args.output)
+            source = GUIDELINE_SOURCES[source_key]
+            total_chunks = 0
+            failed_sections = []
+            with open(out, "a", encoding="utf-8") as f:
+                for section in source["sections"]:
+                    pmcid = section["pmcid"]
+                    logger.info("Fetching %s — Section %s: %s", pmcid, section["section"], section["title"])
+                    text = fetch_pmc_xml(pmcid)
+                    if not text or len(text) < 200:
+                        logger.warning("No text retrieved for %s — skipping", pmcid)
+                        failed_sections.append(section["title"])
+                        time.sleep(0.5)
+                        continue
+                    logger.info("  Retrieved %d chars", len(text))
+                    chunks = chunk_guideline_text(text, section, source)
+                    logger.info("  → %d chunks extracted", len(chunks))
+                    if args.dry_run:
+                        if chunks:
+                            logger.info("  Sample chunk:\n%s\n...", chunks[0]["chunk_text"][:300])
+                        continue
+                    for chunk in chunks:
+                        f.write(json.dumps(chunk) + "\n")
+                    total_chunks += len(chunks)
+                    time.sleep(0.5)
+            if not args.dry_run:
+                logger.info("Done. %d total chunks written for %s", total_chunks, source_key)
+            if failed_sections:
+                logger.warning("Failed sections: %s", failed_sections)

scripts/fix_fda_chunk_text.py ADDED Viewed

	@@ -0,0 +1,120 @@

+"""
+scripts/fix_fda_chunk_text.py
+==============================
+One-time fix: replaces the verbose FDA boilerplate prefix in all FDA DailyMed
+chunk_text entries in the metadata store with a clean, BM25-friendly prefix.
+Before: [FDA DRUG LABEL — These highlights do not include all the information
+         needed to use WARFARIN SODIUM TABLETS safely and effectively...]
+         CONTRAINDICATIONS: actual content...
+After:  [FDA DailyMed | Warfarin | CONTRAINDICATIONS] actual content...
+Usage:
+    python scripts/fix_fda_chunk_text.py
+    python scripts/fix_fda_chunk_text.py --dry-run
+"""
+from __future__ import annotations
+import argparse
+import logging
+import pickle
+import re
+import sys
+from pathlib import Path
+sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
+import yaml
+logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
+logger = logging.getLogger(__name__)
+SECTION_CODES = {
+    "34068-7": "DOSAGE AND ADMINISTRATION",
+    "34070-3": "CONTRAINDICATIONS",
+    "43685-7": "WARNINGS AND PRECAUTIONS",
+    "34067-9": "INDICATIONS AND USAGE",
+    "34073-7": "DRUG INTERACTIONS",
+    "34071-1": "WARNINGS",
+}
+# Matches both old boilerplate and previously-fixed format.
+# Group 1 captures the optional label that sits between the bracketed prefix and
+# the first colon; it is None when the text carries no such label.
+_BOILERPLATE_RE = re.compile(r"^\[FDA[^\]]*\]\s*(?:([A-Za-z][^:]*):\s*)?", re.DOTALL)
+# set_id fragments start with 8 hex digits and a hyphen; compiled once, not per call.
+_UUID_RE = re.compile(r'^[0-9a-f]{8}-', re.I)
+def fix_chunk_text(chunk_id: str, old_text: str) -> str:
+    """Return cleaned chunk_text with a compact, keyword-rich prefix.
+
+    Args:
+        chunk_id: Identifier of the form ``fda_{drug_name}_{set_id}_{code}_{offset}``.
+            The drug name is every ``_``-separated part between ``fda`` and the
+            first part found in ``SECTION_CODES`` (UUID-looking parts removed).
+        old_text: Current chunk text, in either the verbose boilerplate format or
+            the already-fixed ``[FDA DailyMed | drug | section]`` format.
+
+    Returns:
+        ``[FDA DailyMed | {drug} | {section}] {drug} {section}: {content}``.
+        ``drug`` falls back to ``"Unknown"`` and ``section`` to the label found in
+        the text (or ``"DRUG INFORMATION"``) when chunk_id has no known code.
+    """
+    # Extract drug name from chunk_id: fda_{drug_name}_{set_id}_{code}_{offset}
+    parts = chunk_id.split("_")
+    section_name = None
+    drug_name_parts = []
+    for i, part in enumerate(parts[1:], 1):
+        if part in SECTION_CODES:
+            section_name = SECTION_CODES[part]
+            drug_name_parts = parts[1:i]
+            break
+    # Filter out UUID parts (set_id format: 8hex-4hex-...) from drug name
+    drug_name_parts = [p for p in drug_name_parts if not _UUID_RE.match(p)]
+    drug_name = " ".join(drug_name_parts).title() if drug_name_parts else "Unknown"
+    m = _BOILERPLATE_RE.match(old_text)
+    if not section_name:
+        # No known section code in the id: fall back to the label found in the text.
+        # The original read m.group(1) from a pattern without any capturing group,
+        # which raised IndexError for every such chunk.
+        label = (m.group(1) or "").strip() if m else ""
+        # Previously-fixed text repeats the drug name before the section
+        # ("Unknown DRUG INFORMATION: ..."); drop it so re-runs are idempotent.
+        if label.startswith(drug_name + " "):
+            label = label[len(drug_name) + 1:].strip()
+        section_name = label or "DRUG INFORMATION"
+    # Strip the old boilerplate prefix and get just the content
+    content = old_text[m.end():].strip() if m else old_text.strip()
+    # Prepend drug name into content so BM25 finds it even in continuation chunks
+    # e.g. chunk starting "Bleeding tendencies..." now reads "Warfarin CONTRAINDICATIONS: Bleeding..."
+    return f"[FDA DailyMed | {drug_name} | {section_name}] {drug_name} {section_name}: {content}"
+def main() -> None:
+    """Rewrite the chunk_text prefix of every FDA DailyMed entry in the metadata store.
+
+    Reads the metadata path from ``config.yaml`` (current working directory),
+    applies ``fix_chunk_text`` to entries whose ``source`` is ``"FDA DailyMed"``
+    and whose text starts with a recognised prefix, then pickles the store back
+    in place. With ``--dry-run`` nothing is written and up to three
+    before/after samples are logged.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--dry-run", action="store_true")
+    args = parser.parse_args()
+    # Explicit encoding, matching how ingest_incremental.py reads the same file.
+    with open("config.yaml", encoding="utf-8") as f:
+        cfg = yaml.safe_load(f)
+    meta_path = cfg["retrieval"]["metadata_path"]
+    logger.info("Loading metadata store from %s ...", meta_path)
+    # NOTE(review): pickle.load can execute arbitrary code; only point this at a
+    # metadata store produced by this project.
+    with open(meta_path, "rb") as f:
+        store: dict = pickle.load(f)
+    fda_keys = [k for k, v in store.items() if v.get("source") == "FDA DailyMed"]
+    logger.info("Found %d FDA DailyMed entries to fix", len(fda_keys))
+    fixed = 0
+    for key in fda_keys:
+        entry = store[key]
+        old_text = entry.get("chunk_text", "")
+        # Re-run on both old boilerplate AND previously-fixed entries (to fix UUID + add drug name to content)
+        if not old_text.startswith(("[FDA DRUG LABEL", "[FDA DailyMed |")):
+            continue
+        new_text = fix_chunk_text(entry.get("chunk_id", ""), old_text)
+        if args.dry_run:
+            if fixed < 3:
+                logger.info("BEFORE: %s", old_text[:120])
+                logger.info("AFTER:  %s", new_text[:120])
+                logger.info("---")
+        else:
+            entry["chunk_text"] = new_text
+        fixed += 1
+    logger.info("%d entries %s", fixed,
+                "would be fixed (dry run)" if args.dry_run else "fixed")
+    if not args.dry_run:
+        with open(meta_path, "wb") as f:
+            pickle.dump(store, f, protocol=pickle.HIGHEST_PROTOCOL)
+        # chunk_text changed, so a cached BM25 index built from the old text is
+        # stale. ingest_incremental.py removes the same cache file after it
+        # rewrites chunk_text; do the same here so the restart really rebuilds.
+        bm25_cache = Path(meta_path).parent / "bm25_cache.pkl"
+        if bm25_cache.exists():
+            bm25_cache.unlink()
+            logger.info("BM25 cache invalidated — will rebuild on next startup.")
+        logger.info("Metadata store saved. Restart backend to rebuild BM25 index.")
+if __name__ == "__main__":
+    main()

scripts/ingest_incremental.py ADDED Viewed

	@@ -0,0 +1,192 @@

+"""
+scripts/ingest_incremental.py
+==============================
+Adds new chunks to an EXISTING FAISS index without rebuilding from scratch.
+Only the new chunks are embedded — existing vectors are untouched.
+Usage:
+    python scripts/ingest_incremental.py --input data/dailymed_chunks.jsonl
+    python scripts/ingest_incremental.py --input data/dailymed_chunks.jsonl --dry-run
+"""
+from __future__ import annotations
+import argparse
+import json
+import logging
+import pickle
+import sys
+from pathlib import Path
+sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
+import faiss
+import numpy as np
+import yaml
+logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
+logger = logging.getLogger(__name__)
+def load_config() -> dict:
+    """Parse ``config.yaml`` from the current working directory and return the result."""
+    with open("config.yaml", "r", encoding="utf-8") as config_file:
+        parsed = yaml.safe_load(config_file)
+    return parsed
+def load_new_chunks(path: str) -> list[dict]:
+    """Read a JSONL file and return one parsed object per non-blank line.
+
+    Lines are stripped of surrounding whitespace before parsing; lines that are
+    empty after stripping are ignored. The number of loaded chunks is logged.
+    """
+    with open(path, "r", encoding="utf-8") as handle:
+        stripped_lines = (raw_line.strip() for raw_line in handle)
+        records = [json.loads(text) for text in stripped_lines if text]
+    logger.info("Loaded %d new chunks from %s", len(records), path)
+    return records
+def embed_chunks(chunks: list[dict], model_name: str) -> np.ndarray:
+    """Encode each chunk's ``chunk_text`` with the named SentenceTransformer model.
+
+    The encoder is called with ``normalize_embeddings=True`` and
+    ``convert_to_numpy=True``; the result is returned cast to float32.
+    """
+    # Imported lazily so the heavy dependency is only loaded when embedding runs.
+    from sentence_transformers import SentenceTransformer
+    encoder = SentenceTransformer(model_name)
+    texts = [record["chunk_text"] for record in chunks]
+    logger.info("Embedding %d new chunks with %s...", len(texts), model_name)
+    encode_options = {
+        "batch_size": 32,
+        "show_progress_bar": True,
+        "normalize_embeddings": True,
+        "convert_to_numpy": True,
+    }
+    vectors = encoder.encode(texts, **encode_options)
+    return vectors.astype(np.float32)
+def main() -> None:
+    """Add the chunks from ``--input`` to the existing FAISS index and metadata store.
+
+    Modes:
+      * default: skip chunks already present (by chunk_id, or by
+        (doc_id, chunk_index)), embed the rest, append them to the index, extend
+        the metadata store, and append them to ``data/processed/chunks.jsonl``.
+      * ``--force-update-section KW``: add no vectors; overwrite ``chunk_text`` of
+        existing entries whose matching input chunk contains ``KW``
+        (case-insensitive), then remove the BM25 cache file.
+      * ``--dry-run``: report what would happen without writing to disk.
+
+    Exits with status 1 when the FAISS index file does not exist.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--input", required=True, help="JSONL file of new chunks")
+    parser.add_argument("--dry-run", action="store_true",
+                        help="Show what would be added without writing to disk")
+    parser.add_argument("--force-update-section", default=None,
+                        help="Force-update chunk_text for existing chunks matching this section keyword (e.g. 'ADVERSE REACTIONS')")
+    args = parser.parse_args()
+    cfg = load_config()
+    idx_path   = cfg["retrieval"]["index_path"]
+    meta_path  = cfg["retrieval"]["metadata_path"]
+    model_name = cfg["retrieval"]["embedding_model"]
+    if not Path(idx_path).exists():
+        logger.error("FAISS index not found at %s. Run embedder.py first.", idx_path)
+        sys.exit(1)
+    # Load existing index + metadata
+    logger.info("Loading existing FAISS index from %s ...", idx_path)
+    index = faiss.read_index(idx_path)
+    existing_count = index.ntotal
+    logger.info("Existing index: %d vectors", existing_count)
+    # NOTE(review): pickle.load can execute arbitrary code; only point this at a
+    # metadata store produced by this project.
+    with open(meta_path, "rb") as f:
+        metadata_store: dict[int, dict] = pickle.load(f)
+    def _docidx_key(record: dict):
+        """Return (doc_id, chunk_index) when both are usable, else None."""
+        doc_id = record.get("doc_id")
+        chunk_index = record.get("chunk_index")
+        if doc_id and isinstance(chunk_index, int) and chunk_index >= 0:
+            return (doc_id, chunk_index)
+        return None
+    # Force-update existing chunk_text for a specific section (no new FAISS vectors needed)
+    all_input_chunks = load_new_chunks(args.input)
+    if args.force_update_section:
+        section_kw = args.force_update_section.upper()
+        # Primary lookup: chunk_id → FAISS key (works for FDA with deterministic IDs).
+        # Entries without a chunk_id are left out so a chunk that also lacks one
+        # cannot match them by accident.
+        id_to_meta = {v["chunk_id"]: k for k, v in metadata_store.items() if v.get("chunk_id")}
+        # Secondary lookup: (doc_id, chunk_index) → FAISS key (works for guidelines with random UUID IDs).
+        # Same validity rule as the dedupe path below: records missing doc_id or
+        # chunk_index get no key, instead of all collapsing onto ("", 0).
+        docidx_to_meta = {}
+        for k, v in metadata_store.items():
+            existing_key = _docidx_key(v)
+            if existing_key is not None:
+                docidx_to_meta[existing_key] = k
+        updated = 0
+        for chunk in all_input_chunks:
+            if section_kw not in chunk.get("chunk_text", "").upper():
+                continue
+            # Try primary match first
+            chunk_id = chunk.get("chunk_id")
+            faiss_key = id_to_meta.get(chunk_id) if chunk_id else None
+            # Fallback to (doc_id, chunk_index) match
+            if faiss_key is None:
+                lookup_key = _docidx_key(chunk)
+                if lookup_key is not None:
+                    faiss_key = docidx_to_meta.get(lookup_key)
+            if faiss_key is not None:
+                metadata_store[faiss_key]["chunk_text"] = chunk["chunk_text"]
+                updated += 1
+        logger.info("Force-updated chunk_text for %d '%s' entries", updated, section_kw)
+        if not args.dry_run:
+            with open(meta_path, "wb") as f:
+                pickle.dump(metadata_store, f, protocol=pickle.HIGHEST_PROTOCOL)
+            logger.info("Metadata store saved.")
+            # Invalidate BM25 cache
+            bm25_cache = Path(meta_path).parent / "bm25_cache.pkl"
+            if bm25_cache.exists():
+                bm25_cache.unlink()
+                logger.info("BM25 cache invalidated — will rebuild on next startup.")
+        return
+    # Deduplicate — skip chunks already in the index.
+    # Primary key: chunk_id. Secondary key: (doc_id, chunk_index) handles
+    # re-ingestion of the same document with new UUIDs (e.g. FDA label updates).
+    existing_ids = {v.get("chunk_id", "") for v in metadata_store.values()}
+    existing_docidx = set()
+    for v in metadata_store.values():
+        existing_key = _docidx_key(v)
+        if existing_key is not None:
+            existing_docidx.add(existing_key)
+    def _is_duplicate(c: dict) -> bool:
+        if c.get("chunk_id") in existing_ids:
+            return True
+        key = _docidx_key(c)
+        return key is not None and key in existing_docidx
+    new_chunks = [c for c in all_input_chunks if not _is_duplicate(c)]
+    if not new_chunks:
+        logger.info("All chunks already in index. Nothing to add.")
+        return
+    logger.info("%d new chunks to add (%d duplicates skipped)",
+                len(new_chunks), len(all_input_chunks) - len(new_chunks))
+    if args.dry_run:
+        logger.info("DRY RUN — no changes written.")
+        for c in new_chunks[:5]:
+            logger.info("  Would add: %s | %s", c.get("chunk_id"), c.get("title", "")[:60])
+        return
+    # Embed new chunks only
+    embeddings = embed_chunks(new_chunks, model_name)
+    # Add to existing FAISS index
+    index.add(embeddings)
+    logger.info("Index now has %d vectors (+%d)", index.ntotal, len(new_chunks))
+    # Extend metadata store (new keys start from existing_count)
+    for i, chunk in enumerate(new_chunks):
+        metadata_store[existing_count + i] = {
+            "chunk_id":     chunk.get("chunk_id", f"chunk_{existing_count + i}"),
+            "doc_id":       chunk.get("doc_id", ""),
+            "source":       chunk.get("source", ""),
+            "title":        chunk.get("title", ""),
+            "pub_type":     chunk.get("pub_type", "unknown"),
+            "pub_year":     chunk.get("pub_year"),
+            "journal":      chunk.get("journal", ""),
+            "chunk_index":  chunk.get("chunk_index", 0),
+            "total_chunks": chunk.get("total_chunks", 1),
+            "chunk_text":   chunk.get("chunk_text", ""),
+        }
+    # Save updated artifacts
+    faiss.write_index(index, idx_path)
+    logger.info("FAISS index saved to %s", idx_path)
+    with open(meta_path, "wb") as f:
+        pickle.dump(metadata_store, f, protocol=pickle.HIGHEST_PROTOCOL)
+    logger.info("Metadata store saved (%d total entries)", len(metadata_store))
+    # NOTE(review): the force-update path above deletes bm25_cache.pkl; this path
+    # does not. Confirm whether the retriever detects a grown metadata store.
+    # Also append to chunks.jsonl for future full rebuilds
+    chunks_jsonl = Path("data/processed/chunks.jsonl")
+    # The index and metadata are already saved at this point; make sure a missing
+    # directory cannot abort the run before the rebuild log is updated.
+    chunks_jsonl.parent.mkdir(parents=True, exist_ok=True)
+    with open(chunks_jsonl, "a", encoding="utf-8") as f:
+        for chunk in new_chunks:
+            f.write(json.dumps(chunk) + "\n")
+    logger.info("Appended %d chunks to %s", len(new_chunks), chunks_jsonl)
+    logger.info("Done. Restart the backend to reload the updated index.")
+if __name__ == "__main__":
+    main()

scripts/warmup.py ADDED Viewed

	@@ -0,0 +1,58 @@

+"""
+scripts/warmup.py
+=====================
+Pre-loads heavy ML models (FAISS, DeBERTa, SciSpaCy) into memory
+and guarantees instantaneous responses for the first API request during the live demo.
+Usage:
+    python scripts/warmup.py
+"""
+import sys
+from pathlib import Path
+sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
+import logging
+import time
+import requests
+logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(name)s: %(message)s")
+logger = logging.getLogger("warmup")
+def main():
+    """Check that the local API is up, then send one query so the models load.
+
+    Exits with status 1 when the health check fails or when the warmup request
+    itself fails, so callers can tell a failed warmup from a successful one.
+    """
+    api_url = "http://localhost:8000"
+    logger.info("Verifying API is running...")
+    try:
+        health = requests.get(f"{api_url}/health", timeout=5)
+        health.raise_for_status()
+        logger.info("API Health: %s", health.json())
+    except requests.exceptions.RequestException:
+        logger.error("API is not running at %s. Please start it with 'uvicorn src.api.main:app' first.", api_url)
+        sys.exit(1)
+    logger.info("Sending WARMUP query to load DeBERTa, SciSpaCy, and FAISS into RAM... (This may take 15-25s)")
+    t0 = time.time()
+    # We send a basic query to force all models to initialize
+    payload = {
+        "question": "What is the recommended dosage of Metformin for elderly Type 2 Diabetes patients?",
+        "top_k": 1,
+        "run_ragas": False
+    }
+    try:
+        resp = requests.post(f"{api_url}/query", json=payload, timeout=60)
+        resp.raise_for_status()
+    except requests.exceptions.RequestException as exc:
+        logger.error("Warmup failed: %s", exc)
+        # HTTP errors carry the server's response; connection errors do not.
+        failed_response = getattr(exc, "response", None)
+        if failed_response is not None:
+            logger.error("Response: %s", failed_response.text)
+        # Previously this path returned normally, so a failed warmup exited 0.
+        sys.exit(1)
+    elapsed = time.time() - t0
+    logger.info("Warmup successful in %.1fs!", elapsed)
+    logger.info("All machine learning models are now cached in RAM.")
+    logger.info("The next API requests will be completely instantaneous.")
+if __name__ == "__main__":
+    main()

setup.py ADDED Viewed

	@@ -0,0 +1,15 @@

+from setuptools import setup, find_packages
+# Packaging metadata for the MediRAG command-line tool.
+setup(
+    name="medirag-cli",
+    version="0.1.0",
+    # Package discovery starts from the directory that contains this setup.py.
+    packages=find_packages(),
+    install_requires=[
+        "typer>=0.9.0",
+    ],
+    entry_points={
+        "console_scripts": [
+            # Installs a `medirag` command that calls the `app` object in src/cli.py.
+            "medirag=src.cli:app",
+        ],
+    },
+)

src/__init__.py ADDED Viewed

	@@ -0,0 +1,44 @@

+"""
+src/__init__.py — Package initializer and logging setup.
+Runs once on first `import src`. Sets up logging from config.yaml.
+(SRS Section 13)
+"""
+import logging
+import os
+def _setup_logging() -> None:
+    """Configure the root logger from config.yaml, falling back to defaults.
+
+    Level, format and log-file path come from the ``logging`` section of
+    ``config.yaml`` when it can be read; otherwise INFO, a timestamped format and
+    ``logs/medirag.log`` are used. If the root logger already has handlers, no
+    handlers are added. File logging is best-effort: any failure to create the
+    log directory or open the file leaves console logging only.
+    """
+    try:
+        os.makedirs("logs", exist_ok=True)
+    except OSError:
+        pass  # Read-only or restricted filesystem — must not break `import src`
+    log_level = logging.INFO
+    log_format = "%(asctime)s [%(levelname)s] %(name)s: %(message)s"
+    log_file = "logs/medirag.log"
+    # Try to load level from config.yaml
+    try:
+        import yaml
+        with open("config.yaml", "r") as f:
+            cfg = yaml.safe_load(f) or {}  # an empty file parses to None
+        log_cfg = cfg.get("logging") or {}
+        level_str = log_cfg.get("level", "INFO")
+        log_level = getattr(logging, level_str.upper(), logging.INFO)
+        log_file = log_cfg.get("file", log_file)
+        log_format = log_cfg.get("format", log_format)
+    except Exception:
+        pass  # Use defaults if config not found (e.g., during tests)
+    root = logging.getLogger()
+    if root.handlers:
+        return  # Already configured — don't add duplicate handlers
+    handlers: list[logging.Handler] = [logging.StreamHandler()]
+    try:
+        # A bare filename has an empty dirname, and os.makedirs("") raises; that
+        # used to be swallowed below and silently disabled file logging.
+        log_dir = os.path.dirname(log_file)
+        if log_dir:
+            os.makedirs(log_dir, exist_ok=True)
+        handlers.append(logging.FileHandler(log_file, encoding="utf-8"))
+    except Exception:
+        pass  # File logging optional — don't fail on permission errors
+    logging.basicConfig(level=log_level, format=log_format, handlers=handlers)
+_setup_logging()

src/api/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ # src/api/__init__.py

src/api/main.py ADDED Viewed

	@@ -0,0 +1,933 @@

+"""
+src/api/main.py — MediRAG FastAPI Application
+=============================================
+FR-18 core endpoints (further routes, e.g. POST /query and POST /project-guide, are defined below):
+  GET  /health   → liveness check + Ollama status
+  POST /evaluate → calls run_evaluation(), returns FR-17 JSON
+Design decisions:
+  - DeBERTa model is loaded once at app startup (not per-request)
+  - If any module raises an exception, partial results are returned (no HTTP 500)
+  - HTTP 422 Pydantic validation errors are automatic
+  - RAGAS is disabled by default (run_ragas=False) — set to True only if
+    Ollama/OpenAI is available; the RAGAS module already fails gracefully.
+To run:
+    uvicorn src.api.main:app --reload --host 0.0.0.0 --port 8000
+"""
+from __future__ import annotations
+import os
+import logging
+import time
+from contextlib import asynccontextmanager
+from pathlib import Path
+from typing import Optional
+import requests
+import json
+import sqlite3
+import yaml
+from datetime import datetime
+from fastapi import FastAPI, HTTPException, File, UploadFile
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import RedirectResponse
+import threading
+from src.api.schemas import (
+    HealthResponse,
+    EvaluateRequest,
+    EvaluateResponse,
+    QueryRequest,
+    QueryResponse,
+    RetrievedChunk,
+    IngestRequest,
+    ChatRequest,
+    ModuleScore,
+    ModuleResults,
+)
+from src.evaluate import run_evaluation
+from src.pipeline.generator import generate_answer
+from src.pipeline.retriever import Retriever
+# ---------------------------------------------------------------------------
+# Logging
+# ---------------------------------------------------------------------------
+# Load config.yaml once at import time. Every setting below has a default so the
+# API can still start when the file is missing, empty, or malformed.
+try:
+    _cfg = yaml.safe_load(Path("config.yaml").read_text(encoding="utf-8"))
+except Exception:
+    _cfg = None
+if not isinstance(_cfg, dict):
+    # _cfg is passed to Retriever(...), run_evaluation(...) and generate_answer(...)
+    # further down; it was left unbound when loading failed, which surfaced later
+    # as NameError instead of a clean fallback.
+    _cfg = {}
+def _cfg_section(name: str) -> dict:
+    """Return the named top-level config section, or {} when absent or not a mapping."""
+    section = _cfg.get(name)
+    return section if isinstance(section, dict) else {}
+_log_level = _cfg_section("logging").get("level", "INFO")
+_ollama_base = _cfg_section("llm").get("base_url", "http://localhost:11434")
+_api_cfg = _cfg_section("api")
+# Upper-case before the lookup: "info" would otherwise resolve to the function
+# logging.info, and basicConfig rejects a non-integer level.
+_resolved_level = getattr(logging, str(_log_level).upper(), logging.INFO)
+logging.basicConfig(
+    level=_resolved_level if isinstance(_resolved_level, int) else logging.INFO,
+    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
+)
+logger = logging.getLogger(__name__)
+# ---------------------------------------------------------------------------
+# Database settings
+# ---------------------------------------------------------------------------
+def init_db():
+    """Create ``data/logs.db`` and its ``audit_logs`` table if they do not exist.
+
+    Safe to call repeatedly. The connection is closed even when table creation
+    fails, so an error here does not leak an open database handle.
+    """
+    Path("data").mkdir(exist_ok=True)
+    conn = sqlite3.connect("data/logs.db")
+    try:
+        c = conn.cursor()
+        c.execute("""
+        CREATE TABLE IF NOT EXISTS audit_logs (
+            id INTEGER PRIMARY KEY AUTOINCREMENT,
+            timestamp TEXT,
+            endpoint TEXT,
+            question TEXT,
+            answer TEXT,
+            hrs INTEGER,
+            risk_band TEXT,
+            composite_score REAL,
+            latency_ms INTEGER,
+            intervention_applied BOOLEAN,
+            details TEXT
+        )
+    """)
+        conn.commit()
+    finally:
+        conn.close()
+def log_audit(endpoint: str, question: str, answer: str, hrs: int, risk_band: str, composite: float, latency: int, intervention: bool, details: dict):
+    """Insert one row into ``audit_logs`` in ``data/logs.db``.
+
+    Best-effort: any failure (database error, ``details`` not JSON-serialisable)
+    is logged and swallowed so auditing never fails the request being served.
+    The timestamp is the current UTC time formatted as ``YYYY-MM-DDTHH:MM:SSZ``.
+    """
+    # Local import: the module only imports the `datetime` class at file level.
+    from datetime import timezone
+    try:
+        conn = sqlite3.connect("data/logs.db")
+        try:
+            c = conn.cursor()
+            c.execute("""
+            INSERT INTO audit_logs (timestamp, endpoint, question, answer, hrs, risk_band, composite_score, latency_ms, intervention_applied, details)
+            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+        """, (
+                # Timezone-aware replacement for the deprecated datetime.utcnow();
+                # the formatted string is unchanged.
+                datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
+                endpoint,
+                question,
+                answer,
+                hrs,
+                risk_band,
+                composite,
+                latency,
+                intervention,
+                json.dumps(details)
+            ))
+            conn.commit()
+        finally:
+            # Close on the failure path too; previously an error before close()
+            # left the connection open.
+            conn.close()
+    except Exception as e:
+        logger.error(f"Failed to save audit log to DB: {e}")
+# ---------------------------------------------------------------------------
+# Lifespan: warm DeBERTa once at startup so the first request isn't slow
+# ---------------------------------------------------------------------------
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    """Startup/shutdown hook: create the audit DB, then warm the models.
+
+    Both warm-up steps are best-effort. A failure is logged as a warning and the
+    app still starts; when the retriever cannot be prepared,
+    ``app.state.retriever`` is set to None.
+    """
+    init_db()
+    logger.info("MediRAG API starting — pre-warming models...")
+    try:
+        from src.modules.faithfulness import _get_model
+        _get_model()
+    except Exception as exc:
+        logger.warning("DeBERTa pre-warm skipped: %s", exc)
+    else:
+        logger.info("DeBERTa pre-warm complete.")
+    # Build and load the retriever up front so the first /query request isn't slow.
+    try:
+        warmed_retriever = Retriever(_cfg)
+        warmed_retriever._load_model()
+        warmed_retriever._load_index()
+    except Exception as exc:
+        logger.warning("Retriever pre-warm skipped: %s", exc)
+        app.state.retriever = None
+    else:
+        app.state.retriever = warmed_retriever
+        logger.info("Retriever pre-warm complete.")
+    yield
+    logger.info("MediRAG API shutting down.")
+# ---------------------------------------------------------------------------
+# App
+# ---------------------------------------------------------------------------
+# The ASGI application object; `lifespan` (defined above) runs around startup/shutdown.
+app = FastAPI(
+    title="MediRAG Evaluation API",
+    description=(
+        "Evaluate LLM-generated medical answers against retrieved evidence. "
+        "Returns faithfulness, entity accuracy, source credibility, "
+        "contradiction risk, and a composite Health Risk Score (HRS)."
+    ),
+    version="0.1.0",
+    lifespan=lifespan,
+)
+# Allow all origins for local dev / React frontend on same machine.
+# NOTE(review): "*" accepts any origin; consider narrowing this for a public deployment.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    # Only GET and POST are allowed cross-origin.
+    allow_methods=["GET", "POST"],
+    allow_headers=["*"],
+)
+# ---------------------------------------------------------------------------
+# Helper: check Ollama
+# ---------------------------------------------------------------------------
+def _check_ollama() -> bool:
+    """Report whether ``GET {_ollama_base}/api/tags`` answers with HTTP 200.
+
+    Any exception (connection refused, timeout after 2 s, ...) counts as unreachable.
+    """
+    try:
+        probe = requests.get(f"{_ollama_base}/api/tags", timeout=2)
+        reachable = probe.status_code == 200
+    except Exception:
+        reachable = False
+    return reachable
+# ---------------------------------------------------------------------------
+# Helper: convert EvalResult details → ModuleScore
+# ---------------------------------------------------------------------------
+def _module_score(module_results: dict, key: str) -> Optional[ModuleScore]:
+    """Build a ModuleScore from ``module_results[key]``, or return None if absent.
+
+    Missing fields default to score 0.0, empty details, and None for error and
+    latency_ms.
+    """
+    entry = module_results.get(key)
+    if entry is not None:
+        return ModuleScore(
+            score=entry.get("score", 0.0),
+            details=entry.get("details", {}),
+            error=entry.get("error"),
+            latency_ms=entry.get("latency_ms"),
+        )
+    return None
+# ---------------------------------------------------------------------------
+# POST /project-guide → Groq chat-completions proxy
+# ---------------------------------------------------------------------------
+@app.post("/project-guide")
+def project_guide(req: ChatRequest):
+    """
+    Proxy endpoint for the Project Guide chatbot.
+    Routes requests to Groq API using the local GROQ_API_KEY.
+    """
+    groq_url = "https://api.groq.com/openai/v1/chat/completions"
+    api_key = os.getenv("GROQ_API_KEY")
+    if not api_key:
+        raise HTTPException(status_code=500, detail="GROQ_API_KEY not found in server environment")
+    auth_headers = {
+        "Authorization": f"Bearer {api_key}",
+        "Content-Type": "application/json"
+    }
+    # Optional system prompt first, then the conversation turns in request order.
+    chat_messages = [{"role": "system", "content": req.system_prompt}] if req.system_prompt else []
+    chat_messages.extend({"role": turn.role, "content": turn.content} for turn in req.messages)
+    request_body = {
+        "model": "mixtral-8x7b-32768",
+        "messages": chat_messages,
+        "temperature": 0.5,
+        "max_tokens": 1024
+    }
+    try:
+        upstream = requests.post(groq_url, headers=auth_headers, json=request_body, timeout=30)
+        upstream.raise_for_status()
+        return upstream.json()
+    except Exception as e:
+        # Every upstream failure is reported to the client as HTTP 500.
+        logger.error(f"Groq Proxy Error: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+# GET / → redirect to /docs (route is hidden from the OpenAPI schema)
+@app.get("/", include_in_schema=False)
+def root():
+    """Redirect the bare root URL to the interactive API docs at /docs."""
+    return RedirectResponse(url="/docs")
+# ---------------------------------------------------------------------------
+# GET /health
+# ---------------------------------------------------------------------------
+@app.get("/health", response_model=HealthResponse, tags=["system"])
+def health() -> HealthResponse:
+    """
+    Liveness check.
+    Returns {"status": "ok", "ollama_available": true/false}.
+    Always returns 200 — the caller decides what to do with `ollama_available`.
+    """
+    # Probe Ollama on every call; the result is reported, never raised.
+    ollama_up = _check_ollama()
+    return HealthResponse(status="ok", ollama_available=ollama_up)
+# ---------------------------------------------------------------------------
+# POST /evaluate
+# ---------------------------------------------------------------------------
+@app.post("/evaluate", response_model=EvaluateResponse, tags=["evaluation"])
+def evaluate(req: EvaluateRequest) -> EvaluateResponse:
+    """
+    Run the full MediRAG evaluation pipeline on a question + answer + context.
+    - Validates inputs (FR-18: length limits, chunk count)
+    - Runs Faithfulness, Entity Verification, Source Credibility, Contradiction
+    - Optionally runs RAGAS (set `run_ragas=true` if Ollama/OpenAI is available)
+    - Returns composite Health Risk Score (HRS) + per-module breakdown
+    **Note on `run_ragas`**: RAGAS requires a running LLM backend (Ollama or
+    OpenAI). If unavailable, RAGAS will gracefully return score=0.5 as a
+    neutral fallback — it will NOT crash the request.
+    """
+    logger.info(
+        "POST /evaluate — question=%r, chunks=%d, run_ragas=%s",
+        req.question[:80],
+        len(req.context_chunks),
+        req.run_ragas,
+    )
+    # The pipeline works on plain dicts, not Pydantic models.
+    context_dicts: list[dict] = []
+    for ctx_chunk in req.context_chunks:
+        context_dicts.append(ctx_chunk.model_dump(exclude_none=True))
+    eval_kwargs = {
+        "question": req.question,
+        "answer": req.answer,
+        "context_chunks": context_dicts,
+        "rxnorm_cache_path": req.rxnorm_cache_path,
+        "run_ragas": req.run_ragas,
+        "config": _cfg,
+    }
+    started = time.perf_counter()
+    try:
+        result = run_evaluation(**eval_kwargs)
+    except Exception as exc:
+        logger.exception("run_evaluation raised an unhandled exception: %s", exc)
+        raise HTTPException(
+            status_code=500,
+            detail=f"Evaluation pipeline error: {type(exc).__name__}: {exc}",
+        ) from exc
+    wall_clock_ms = int((time.perf_counter() - started) * 1000)
+    # Composite score plus whatever the aggregator put into details.
+    composite = float(result.score)
+    details = result.details or {}
+    # When the pipeline reports no HRS, derive it from the composite score.
+    derived_hrs = int(round(100 * (1.0 - composite)))
+    hrs = max(0, min(100, details.get("hrs", derived_hrs)))
+    confidence_level = details.get("confidence_level", "UNKNOWN")
+    risk_band = details.get("risk_band", "UNKNOWN")
+    # Prefer the pipeline's own timing; fall back to the wall clock measured here.
+    pipeline_ms = details.get("total_pipeline_ms", wall_clock_ms)
+    mod_results: dict = details.get("module_results", {})
+    module_names = ("faithfulness", "entity_verifier", "source_credibility", "contradiction", "ragas")
+    module_scores = ModuleResults(
+        **{name: _module_score(mod_results, name) for name in module_names}
+    )
+    logger.info(
+        "POST /evaluate → HRS=%d (%s) in %d ms",
+        hrs, risk_band, pipeline_ms,
+    )
+    audit_details = {
+        "module_results": mod_results,
+        "confidence_level": confidence_level
+    }
+    log_audit("evaluate", req.question, req.answer, hrs, risk_band, composite, pipeline_ms, False, audit_details)
+    return EvaluateResponse(
+        composite_score=composite,
+        hrs=hrs,
+        confidence_level=confidence_level,
+        risk_band=risk_band,
+        module_results=module_scores,
+        total_pipeline_ms=pipeline_ms,
+    )
+# ---------------------------------------------------------------------------
+# POST /query  — end-to-end: question → retrieve → generate → evaluate
+# ---------------------------------------------------------------------------
+@app.post("/query", response_model=QueryResponse, tags=["query"])
+def query(req: QueryRequest) -> QueryResponse:
+    """
+    Full end-to-end MediRAG pipeline.
+    1. Retrieves top-k context chunks from FAISS (BioBERT)
+    2. Generates a grounded answer using Mistral (Ollama)
+    3. Evaluates the answer with all 4 modules + aggregator
+    4. Returns the answer, retrieved chunks, HRS score, and full breakdown
+    **Requires Ollama running locally with Mistral pulled.**
+    No fallback — returns 503 if Ollama is unavailable.
+    """
+    import time as _time
+    t_total = _time.perf_counter()
+    logger.info("POST /query — question=%r, top_k=%d", req.question[:80], req.top_k)
+    # Step 1: Retrieve
+    retriever: Optional[Retriever] = getattr(app.state, "retriever", None)
+    if retriever is None:
+        # Fallback: instantiate now (slower first call)
+        try:
+            retriever = Retriever(_cfg)
+        except Exception as exc:
+            raise HTTPException(status_code=503,
+                detail=f"Retriever unavailable: {exc}") from exc
+    try:
+        raw_results = retriever.search(req.question, top_k=req.top_k)
+    except FileNotFoundError as exc:
+        raise HTTPException(status_code=503,
+            detail=f"FAISS index not found: {exc}") from exc
+    except Exception as exc:
+        raise HTTPException(status_code=500,
+            detail=f"Retrieval error: {exc}") from exc
+    if not raw_results:
+        raise HTTPException(status_code=404,
+            detail="No relevant documents found for this question.")
+    # Convert retriever output → chunk dicts for generator + evaluate
+    context_chunks: list[dict] = []
+    retrieved_chunks_out: list[RetrievedChunk] = []
+    for chunk_text, meta, score in raw_results:
+        d = {
+            "text":       chunk_text,
+            "chunk_id":   meta.get("chunk_id"),
+            "source":     meta.get("source", ""),
+            "pub_type":   meta.get("pub_type", ""),
+            "pub_year":   meta.get("pub_year"),
+            "title":      meta.get("title", ""),
+        }
+        context_chunks.append(d)
+        retrieved_chunks_out.append(RetrievedChunk(
+            chunk_id=meta.get("chunk_id"),
+            text=chunk_text[:500],   # truncate for response readability
+            source=meta.get("source", ""),
+            pub_type=meta.get("pub_type", ""),
+            pub_year=meta.get("pub_year"),
+            title=meta.get("title", ""),
+            similarity_score=round(score, 4),
+        ))
+    logger.info("Retrieved %d chunks (top score=%.4f)", len(context_chunks),
+                raw_results[0][2] if raw_results else 0.0)
+    # Raw FAISS cosine similarity for coverage gap gate.
+    # IndexFlatIP + L2-norm = cosine in [-1, 1]. < 0.60 means no close semantic match in DB.
+    top_faiss_cosine = (
+        raw_results[0][1].get("_top_faiss_cosine", 0.0) if raw_results else 0.0
+    )
+    # Convert request overrides into a dict for generator
+    llm_overrides = {}
+    if req.llm_provider:
+        llm_overrides["provider"] = req.llm_provider
+    if req.llm_api_key:
+        llm_overrides["api_key"] = req.llm_api_key
+    if req.llm_model:
+        llm_overrides["model"] = req.llm_model
+    if req.ollama_url:
+        llm_overrides["ollama_url"] = req.ollama_url
+    if req.system_prompt:
+        llm_overrides["system_prompt"] = req.system_prompt
+    if req.persona:
+        llm_overrides["persona"] = req.persona
+    # =========================================================================
+    # Step 2a: PRIVACY SHIELD — MediRAG redacts PHI (Option 1)
+    # =========================================================================
+    p_mapping = {}
+    privacy_applied = False
+    question_to_gen = req.question
+    if req.use_privacy_shield:
+        from src.pipeline.privacy import shield
+        question_to_gen, p_mapping = shield.redact(req.question)
+        if p_mapping:
+            privacy_applied = True
+            logger.info("PRIVACY INTERVENTION: Redacted %d items from question.", len(p_mapping))
+    # Step 2: Generate answer via LLM (Gemini or Ollama)
+    try:
+        # Use the potentially redacted question for generation
+        answer = generate_answer(question_to_gen, context_chunks, _cfg, overrides=llm_overrides)
+    except RuntimeError as exc:
+        raise HTTPException(status_code=503,
+            detail=f"LLM generation failed: {exc}") from exc
+    # Restore the PHI for the final display so the user sees the actual names
+    if privacy_applied:
+        from src.pipeline.privacy import shield
+        answer = shield.restore(answer, p_mapping)
+    # =========================================================================
+    # =========================================================================
+    # Step 2b: CONSENSUS CHECK — MediRAG compares multiple models (Option 2)
+    # =========================================================================
+    consensus_results = None
+    if req.use_consensus:
+        from src.pipeline.consensus import run_consensus_check
+        # Determine which providers to use based on available config/overrides
+        providers = ["gemini"]
+        if os.environ.get("GROQ_API_KEY"):
+            providers.append("groq")
+        elif os.environ.get("MISTRAL_API_KEY"):
+            providers.append("mistral")
+        else:
+            providers.append("ollama") # fallback to local if no second key
+        logger.info("Running Consensus Layer with %s", providers)
+        consensus_results = run_consensus_check(req.question, context_chunks, _cfg, providers=providers)
+        # If consensus finds a safer merged answer, we promote it
+        # and update the primary answer for the evaluation loop
+        answer = consensus_results.get("consensus_answer", answer)
+    # =========================================================================
+    # [DEMO MODE] Inject a false claim to demonstrate the intervention system
+    if req.inject_hallucination:
+        logger.warning("DEMO MODE: Injecting hallucinated claim into answer: '%s'",
+                       req.inject_hallucination)
+        answer = answer + " " + req.inject_hallucination.strip()
+    # Step 3: Evaluate
+    try:
+        eval_result = run_evaluation(
+            question=req.question,
+            answer=answer,
+            context_chunks=context_chunks,
+            run_ragas=req.run_ragas,
+            config=_cfg,
+        )
+    except Exception as exc:
+        logger.exception("Evaluation failed: %s", exc)
+        try:
+            log_audit("query", req.question, answer, 100, "EVAL_ERROR", 0.0,
+                      int((_time.perf_counter() - t_total) * 1000),
+                      False, {"error": str(exc), "error_type": "evaluation_failure"})
+        except Exception:
+            pass
+        raise HTTPException(status_code=500,
+            detail=f"Evaluation error: {exc}") from exc
+    # =========================================================================
+    # Step 3b: INTERVENTION LOOP — MediRAG acts on evaluation results
+    # =========================================================================
+    from src.pipeline.generator import generate_strict_answer
+    details = eval_result.details or {}
+    composite = float(eval_result.score)
+    hrs = int(round(100 * (1.0 - composite)))
+    hrs = max(0, min(100, hrs))
+    mod_results: dict = details.get("module_results", {})
+    intervention_applied = False
+    intervention_reason = None
+    original_answer = None
+    intervention_details = None
+    faith_score = (mod_results.get("faithfulness") or {}).get("score", 1.0)
+    # Source-credibility-aware faith threshold: high-credibility sources get more tolerance
+    source_cred = float(details.get("component_scores", {}).get("source_credibility", 0.5))
+    faith_threshold = max(0.3, 0.7 - (source_cred * 0.4))  # 0.30 for cred=1.0, 0.66 for cred=0.3
+    # ── Coverage Gap Gate ────────────────────────────────────────────────────
+    # Two signals combined:
+    #   1. Refusal answer — LLM says "not in context / insufficient evidence"
+    #      → LLM itself confirms the DB doesn't cover this topic.
+    #   2. FAISS cosine — genuinely poor semantic match vs. the query.
+    #      BioBERT clusters medical dosing texts, so threshold must be high (0.75).
+    _REFUSAL_PATTERNS = (
+        "not mentioned in the provided context",
+        "not provided in the retrieved context",
+        "insufficient evidence in retrieved context",
+        "no information about",
+        "not in the provided context",
+        "cannot find information",
+        "the retrieved context does not contain",
+        "the context does not contain",
+        "not mentioned in the context",
+        "is not provided in the context",
+    )
+    _answer_lower = answer.lower()
+    is_refusal_answer = any(p in _answer_lower for p in _REFUSAL_PATTERNS)
+    is_low_faiss = top_faiss_cosine < 0.75
+    # If a verified drug with rxcui appears in the question, the intervention's
+    # FDA direct lookup can still retrieve the right data even when initial FAISS
+    # retrieval missed it. Don't label those as coverage gaps — let intervention run.
+    _ev_entities = (mod_results.get("entity_verifier") or {}).get("details", {}).get("entities", [])
+    _q_lower_cg = req.question.lower()
+    _drug_in_question = any(
+        e.get("rxcui") and e.get("entity", "").lower() in _q_lower_cg
+        for e in _ev_entities
+    )
+    # Refusal is a standalone COVERAGE_GAP signal — faith_score is unreliable here
+    # because NLI scores refusal sentences as NEUTRAL (0.5), not low.
+    # Exception: if a drug is named in the question, FDA lookup can still help.
+    # HALLUCINATION: specific claims made but not grounded in available context.
+    if is_refusal_answer and not _drug_in_question:
+        gap_type = "COVERAGE_GAP"
+    elif faith_score < faith_threshold and is_low_faiss and not _drug_in_question:
+        gap_type = "COVERAGE_GAP"  # poor retrieval + low faith = DB lacks this topic
+    elif faith_score < faith_threshold:
+        gap_type = "HALLUCINATION"  # relevant context exists but answer ignores it
+    else:
+        gap_type = None
+    coverage_gap = gap_type == "COVERAGE_GAP"
+    coverage_gap_details: dict | None = {
+        "gap_type": gap_type,
+        "top_faiss_cosine": round(top_faiss_cosine, 4),
+        "is_refusal_answer": is_refusal_answer,
+        "note": (
+            "Database coverage may be insufficient for this topic. "
+            "The answer could not be verified against retrieved evidence. "
+            "Consult primary medical literature or a specialist."
+        ) if coverage_gap else None,
+    } if gap_type else None
+    if coverage_gap:
+        logger.warning(
+            "COVERAGE_GAP detected — refusal=%s, faiss=%.4f, faith=%.2f",
+            is_refusal_answer, top_faiss_cosine, faith_score,
+        )
+    # Tier 1: CRITICAL BLOCK (HRS ≥ 86) — response is too dangerous to show
+    # Coverage gap: skip both tiers — regenerating from an empty DB won't help
+    if coverage_gap:
+        logger.info("COVERAGE_GAP — skipping intervention (regeneration cannot add missing data).")
+    elif hrs >= 86:
+        original_answer = answer
+        answer = (
+            "⛔ UNSAFE RESPONSE BLOCKED by MediRAG Safety Gate.\n\n"
+            "The generated answer was flagged as CRITICAL risk "
+            f"(Health Risk Score: {hrs}/100). "
+            "It showed signs of hallucination or contradiction with the retrieved evidence. "
+            "Please consult a qualified medical professional or rephrase your question."
+        )
+        intervention_applied = True
+        intervention_reason = "CRITICAL_BLOCKED"
+        intervention_details = {
+            "hrs_original": hrs,
+            "faithfulness": faith_score,
+            "message": "Response blocked: HRS ≥ 86 (CRITICAL risk band).",
+        }
+        logger.warning("INTERVENTION: CRITICAL_BLOCKED — HRS=%d", hrs)
+    # Tier 2: HIGH RISK REGENERATION
+    elif hrs >= 61 or faith_score < faith_threshold:
+        original_answer = answer
+        original_hrs = hrs
+        logger.warning(
+            "INTERVENTION: HIGH_RISK_REGENERATED — HRS=%d, faith=%.2f. Regenerating with strict prompt.",
+            hrs, faith_score
+        )
+        try:
+            # Re-retrieve from shared index — find better chunks than the ones that failed
+            try:
+                # Direct FDA lookup — only for drugs named in the question itself.
+                # Drugs found in the answer but NOT in the question (e.g. metformin
+                # mentioned incidentally in a general "first-line treatment" answer)
+                # should not trigger FDA lookup; that would replace relevant context
+                # with the wrong label sections (contraindications instead of treatment).
+                fda_direct: list[dict] = []
+                try:
+                    ev_details = eval_result.details.get("module_results", {}).get("entity_verifier", {}).get("details", {})
+                    verified_drugs = [
+                        e["entity"] for e in ev_details.get("entities", [])
+                        if e.get("status") == "VERIFIED" and e.get("rxcui")
+                    ]
+                    q_lower = req.question.lower()
+                    for drug in verified_drugs:
+                        if drug.lower() in q_lower:
+                            fda_direct += app.state.retriever.get_fda_chunks(drug)
+                    if fda_direct:
+                        logger.info("Direct FDA lookup found %d chunks for drugs: %s",
+                                    len(fda_direct), [d for d in verified_drugs if d.lower() in q_lower])
+                except Exception as fda_exc:
+                    logger.debug("Direct FDA lookup skipped: %s", fda_exc)
+                # Direct guideline lookup — only when original retrieval was poor.
+                # If FAISS cosine ≥ 0.85 the original chunks were already relevant;
+                # adding guideline sections here can pull in wrong topic areas
+                # (e.g., ADA Section 2 Diagnosis instead of Section 9 Treatment).
+                guideline_direct: list[dict] = []
+                if top_faiss_cosine < 0.85:
+                    try:
+                        guideline_direct = app.state.retriever.get_guideline_chunks(req.question)
+                        if guideline_direct:
+                            logger.info("Direct guideline lookup found %d chunks", len(guideline_direct))
+                    except Exception as gl_exc:
+                        logger.debug("Direct guideline lookup skipped: %s", gl_exc)
+                else:
+                    logger.debug("Skipping guideline direct lookup (FAISS cosine=%.4f ≥ 0.85, original retrieval was high-quality)", top_faiss_cosine)
+                # Merge: guideline chunks + FDA chunks + fresh retrieval
+                fda_direct = guideline_direct + fda_direct
+                # For drug/clinical questions, expand query toward authoritative sources
+                _drug_terms = ("contraindication", "dosage", "dose", "interaction",
+                               "warning", "adverse", "side effect", "mechanism")
+                _q_lower = req.question.lower()
+                retry_query = (
+                    f"FDA drug label clinical guideline {req.question}"
+                    if any(t in _q_lower for t in _drug_terms)
+                    else req.question
+                )
+                fresh_results = app.state.retriever.search(retry_query, top_k=req.top_k)
+                fresh_chunks: list[dict] = []
+                for chunk_text, meta, score in fresh_results:
+                    fresh_chunks.append({
+                        "text": chunk_text, "chunk_id": meta.get("chunk_id"),
+                        "source": meta.get("source", ""), "pub_type": meta.get("pub_type", ""),
+                        "pub_year": meta.get("pub_year"), "title": meta.get("title", ""),
+                    })
+                # Merge: direct lookups first (FDA/guidelines), then fresh retrieval
+                base_chunks = fresh_chunks if fresh_chunks else context_chunks
+                retry_chunks = (fda_direct + base_chunks)[:req.top_k] if fda_direct else base_chunks
+                logger.info("Re-retrieval for intervention: %d fresh chunks (top source: %s)",
+                            len(retry_chunks),
+                            retry_chunks[0].get("pub_type", "?") if retry_chunks else "none")
+            except Exception:
+                retry_chunks = context_chunks
+            answer = generate_strict_answer(req.question, retry_chunks, _cfg, overrides=llm_overrides)
+            # Re-evaluate the corrected answer
+            eval_result = run_evaluation(
+                question=req.question,
+                answer=answer,
+                context_chunks=retry_chunks,
+                run_ragas=False,  # skip RAGAS on retry to reduce latency
+                config=_cfg,
+            )
+            details = eval_result.details or {}
+            composite = float(eval_result.score)
+            hrs = int(round(100 * (1.0 - composite)))
+            hrs = max(0, min(100, hrs))
+            mod_results = details.get("module_results", {})
+        except Exception as exc:
+            logger.error("Strict regeneration failed: %s — keeping original answer", exc)
+            answer = original_answer  # fall back gracefully
+            original_answer = None
+        intervention_applied = True
+        intervention_reason = "HIGH_RISK_REGENERATED"
+        intervention_details = {
+            "hrs_original": original_hrs,
+            "hrs_corrected": hrs,
+            "faithfulness_original": faith_score,
+            "faithfulness_corrected": (mod_results.get("faithfulness") or {}).get("score", 0),
+            "message": "Response regenerated with strict context-only prompt due to high risk score.",
+        }
+    # =========================================================================
+    # Step 4: Build response
+    total_ms = int((_time.perf_counter() - t_total) * 1000)
+    logger.info("POST /query → HRS=%d (%s) intervention=%s in %d ms total",
+                hrs, details.get("risk_band", "?"), intervention_reason or "none", total_ms)
+    log_audit("query", req.question, answer, hrs, details.get("risk_band", "UNKNOWN"), composite, total_ms, intervention_applied, {
+        "module_results": mod_results,
+        "confidence_level": details.get("confidence_level", "UNKNOWN"),
+        "intervention_reason": intervention_reason,
+        "original_answer": original_answer,
+    })
+    return QueryResponse(
+        question=req.question,
+        generated_answer=answer,
+        retrieved_chunks=retrieved_chunks_out,
+        composite_score=composite,
+        hrs=hrs,
+        confidence_level=details.get("confidence_level", "UNKNOWN"),
+        risk_band=details.get("risk_band", "UNKNOWN"),
+        module_results=ModuleResults(
+            faithfulness=_module_score(mod_results, "faithfulness"),
+            entity_verifier=_module_score(mod_results, "entity_verifier"),
+            source_credibility=_module_score(mod_results, "source_credibility"),
+            contradiction=_module_score(mod_results, "contradiction"),
+            ragas=_module_score(mod_results, "ragas"),
+        ),
+        total_pipeline_ms=total_ms,
+        intervention_applied=intervention_applied,
+        intervention_reason=intervention_reason,
+        original_answer=original_answer,
+        intervention_details=intervention_details,
+        consensus_results=consensus_results,
+        privacy_applied=privacy_applied,
+        privacy_details={"redacted_count": len(p_mapping)} if privacy_applied else None,
+        coverage_gap=coverage_gap,
+        coverage_gap_details=coverage_gap_details,
+    )
+# ---------------------------------------------------------------------------
+# POST /ingest — dynamically append new documents to the FAISS index
+# ---------------------------------------------------------------------------
+_faiss_lock = threading.Lock()
+@app.post("/ingest", tags=["ingestion"])
+def ingest_document(req: IngestRequest):
+    """
+    Dynamically ingest a new document into the running FAISS index.
+    Thread-safe implementation uses a lock to prevent concurrent write corruption.
+    """
+    import pickle
+    import faiss
+    from src.pipeline.chunker import chunk_documents
+    retriever = getattr(app.state, "retriever", None)
+    if retriever is None or retriever._index is None:
+        raise HTTPException(status_code=503, detail="Retriever not pre-warmed. Cannot ingest.")
+    logger.info("POST /ingest — title='%s', size=%d chars", req.title, len(req.text))
+    # 1. Chunk the document
+    doc = {
+        "text": req.text,
+        "doc_id": "custom_" + req.title[:10],
+        "title": req.title,
+        "source": req.source,
+        "pub_type": req.pub_type,
+        "pub_year": 2026,
+    }
+    chunks = chunk_documents([doc], _cfg)
+    if not chunks:
+        raise HTTPException(status_code=400, detail="Document produced 0 chunks.")
+    # 2. Embed the chunks using the same BioBERT model as the retriever
+    from src.pipeline.embedder import encode_texts
+    import numpy as np
+    # Reuse already-loaded SentenceTransformer from the retriever to avoid double RAM load
+    if retriever._model is None:
+        retriever._load_model()
+    st_model = retriever._model
+    texts = [c["chunk_text"] for c in chunks]
+    embeddings = np.array(st_model.encode(texts, show_progress_bar=False), dtype=np.float32)
+    faiss.normalize_L2(embeddings)  # Required: index is IndexFlatIP = cosine sim
+    # 3. Thread-safe Index Update with atomic disk writes
+    with _faiss_lock:
+        import os
+        idx_path = Path(_cfg["retrieval"]["index_path"])
+        meta_path = Path(_cfg["retrieval"]["metadata_path"])
+        index = retriever._index
+        metadata_store = retriever._metadata
+        start_id = len(metadata_store)
+        # Add to in-memory structures
+        for i, chunk in enumerate(chunks):
+            metadata_store[start_id + i] = chunk
+        # Add to FAISS in memory
+        index.add(embeddings)
+        # Atomic FAISS write: write to temp → rename (never leaves a half-written file)
+        idx_tmp = str(idx_path) + ".tmp"
+        faiss.write_index(index, idx_tmp)
+        os.replace(idx_tmp, str(idx_path))
+        # Atomic metadata write
+        meta_tmp = str(meta_path) + ".tmp"
+        with open(meta_tmp, "wb") as f:
+            pickle.dump(metadata_store, f)
+        os.replace(meta_tmp, str(meta_path))
+        # 4. Rebuild BM25 for the running instance
+        retriever.rebuild_bm25()
+    logger.info("Successfully injected %d chunks for '%s' into FAISS and BM25.", len(chunks), req.title)
+    return {"status": "success", "chunks_added": len(chunks), "title": req.title}
+# ---------------------------------------------------------------------------
+# GET /logs and /stats — fetch history for dashboard
+# ---------------------------------------------------------------------------
+@app.get("/logs", tags=["dashboard"])
+def get_logs(limit: int = 50):
+    try:
+        conn = sqlite3.connect("data/logs.db")
+        conn.row_factory = sqlite3.Row
+        c = conn.cursor()
+        c.execute("SELECT * FROM audit_logs ORDER BY id DESC LIMIT ?", (limit,))
+        rows = c.fetchall()
+        conn.close()
+        return [dict(ix) for ix in rows]
+    except Exception as e:
+        return []
+@app.get("/stats", tags=["dashboard"])
+def get_stats():
+    try:
+        conn = sqlite3.connect("data/logs.db")
+        c = conn.cursor()
+        c.execute("SELECT COUNT(*), AVG(hrs), SUM(CASE WHEN risk_band='CRITICAL' THEN 1 ELSE 0 END) FROM audit_logs")
+        total_evals, avg_hrs, crit_alerts = c.fetchone()
+        c.execute("SELECT SUM(CASE WHEN intervention_applied=1 THEN 1 ELSE 0 END) FROM audit_logs")
+        interventions = c.fetchone()[0]
+        # Monthly data example
+        monthly_query = "SELECT SUBSTR(timestamp, 1, 7) as month, AVG(hrs) FROM audit_logs GROUP BY month ORDER BY month LIMIT 12"
+        c.execute(monthly_query)
+        monthly_data = [{"month": row[0], "avg_hrs": row[1]} for row in c.fetchall()]
+        conn.close()
+        return {
+            "totalEvals": total_evals or 0,
+            "avgHrs": round(avg_hrs or 0, 1),
+            "critAlerts": crit_alerts or 0,
+            "interventions": interventions or 0,
+            "monthly": monthly_data
+        }
+    except Exception as e:
+        return {
+            "totalEvals": 0, "avgHrs": 0, "critAlerts": 0, "interventions": 0, "monthly": []
+        }
+# ---------------------------------------------------------------------------
+# POST /parse_file — helper for frontend to extract PDF/DOCX text
+# ---------------------------------------------------------------------------
+@app.post("/parse_file", tags=["ingestion"])
+async def parse_file(file: UploadFile = File(...)):
+    """Extract text from uploaded txt, md, pdf, or docx files."""
+    content = await file.read()
+    filename = file.filename.lower()
+    text = ""
+    try:
+        if filename.endswith(".pdf"):
+            import fitz
+            doc = fitz.open(stream=content, filetype="pdf")
+            msgs = []
+            for page in doc:
+                msgs.append(page.get_text())
+            text = "\n".join(msgs)
+        elif filename.endswith(".docx"):
+            import docx
+            from io import BytesIO
+            doc = docx.Document(BytesIO(content))
+            text = "\n".join([p.text for p in doc.paragraphs])
+        else:
+            text = content.decode("utf-8", errors="replace")
+        return {"status": "success", "text": text}
+    except Exception as e:
+        raise HTTPException(status_code=400, detail=f"Failed to parse file: {e}")

src/api/schemas.py ADDED Viewed

	@@ -0,0 +1,276 @@

+"""
+src/api/schemas.py — Pydantic request/response models for MediRAG FastAPI
+=========================================================================
+FR-18: Input validation limits from config.yaml → api:
+  - max_query_length:  500
+  - max_answer_length: 2000
+  - max_chunks:        10
+  - max_chunk_length:  2000
+"""
+from __future__ import annotations
+from typing import Any, Dict, List, Optional
+from pydantic import BaseModel, Field, field_validator
+class IngestRequest(BaseModel):
+    """POST /ingest — append a custom document to the FAISS index."""
+    title: str = Field(..., description="Document title")
+    text: str = Field(..., min_length=10, description="Raw text of the document to ingest")
+    pub_type: str = Field(default="clinical_guideline", description="Document type")
+    source: str = Field(default="custom_upload", description="Source of the document")
+# ---------------------------------------------------------------------------
+# Request schemas
+# ---------------------------------------------------------------------------
+class ContextChunk(BaseModel):
+    """A single retrieved context chunk passed to the evaluation pipeline."""
+    # The only required field: non-empty chunk text, capped at 2000 characters
+    text: str = Field(..., min_length=1, max_length=2000,
+                      description="Chunk text (max 2000 chars)")
+    # Optional metadata fields — all pass-through to the pipeline modules.
+    # Each one defaults to None when the caller omits it.
+    chunk_id: Optional[str] = None
+    pub_type: Optional[str] = None
+    pub_year: Optional[int] = None
+    source: Optional[str] = None
+    title: Optional[str] = None
+    tier_type: Optional[str] = None       # pre-labelled evidence tier (optional)
+    score: Optional[float] = None         # retrieval similarity score
+class EvaluateRequest(BaseModel):
+    """POST /evaluate — request body."""
+    question: str = Field(
+        ...,
+        min_length=5,
+        max_length=500,
+        description="User question (5–500 chars)",
+        examples=["What is the recommended dosage of Metformin for Type 2 Diabetes in elderly patients?"],
+    )
+    answer: str = Field(
+        ...,
+        min_length=1,
+        max_length=2000,
+        description="LLM-generated answer to evaluate (1–2000 chars)",
+        examples=["Metformin is typically started at 500 mg twice daily with meals..."],
+    )
+    context_chunks: List[ContextChunk] = Field(
+        ...,
+        min_length=1,
+        max_length=10,
+        description="Retrieved context chunks (1–10 items)",
+    )
+    run_ragas: bool = Field(
+        default=False,
+        description="Run RAGAS evaluation (requires Ollama or OpenAI backend; slower)",
+    )
+    llm_provider: Optional[str] = Field(
+        default=None,
+        description="LLM provider override: 'gemini' or 'ollama'"
+    )
+    llm_api_key: Optional[str] = Field(
+        default=None,
+        description="API Key if accessing Gemini"
+    )
+    llm_model: Optional[str] = Field(
+        default=None,
+        description="Specific model string if overriding defaults"
+    )
+    rxnorm_cache_path: str = Field(
+        default="data/rxnorm_cache.csv",
+        description="Path to RxNorm cache CSV",
+    )
+    @field_validator("context_chunks")
+    @classmethod
+    def at_least_one_chunk(cls, v: list) -> list:
+        if len(v) == 0:
+            raise ValueError("At least one context chunk is required")
+        return v
+# ---------------------------------------------------------------------------
+# Response schemas
+# ---------------------------------------------------------------------------
+class ModuleScore(BaseModel):
+    """Score + details dict for a single evaluation module."""
+    score: float = Field(..., ge=0.0, le=1.0, description="Module score in [0, 1]")
+    details: Dict[str, Any] = Field(default_factory=dict)
+    error: Optional[str] = Field(None, description="Error message if module failed")
+    latency_ms: Optional[int] = None
+class ModuleResults(BaseModel):
+    """All per-module scores bundled together."""
+    # Every slot is optional and defaults to None, so a response can omit
+    # any module for which no score is available.
+    faithfulness: Optional[ModuleScore] = None
+    entity_verifier: Optional[ModuleScore] = None
+    source_credibility: Optional[ModuleScore] = None
+    contradiction: Optional[ModuleScore] = None
+    ragas: Optional[ModuleScore] = None
+class EvaluateResponse(BaseModel):
+    """POST /evaluate — response body (FR-17 format)."""
+    composite_score: float = Field(
+        ..., ge=0.0, le=1.0,
+        description="Weighted composite score in [0, 1]"
+    )
+    hrs: int = Field(
+        ..., ge=0, le=100,
+        description="Health Risk Score = round(100 × (1 - composite_score))"
+    )
+    confidence_level: str = Field(
+        ...,
+        description="HIGH / MODERATE / LOW",
+    )
+    risk_band: str = Field(
+        ...,
+        description="LOW / MODERATE / HIGH / CRITICAL",
+    )
+    module_results: ModuleResults
+    total_pipeline_ms: int = Field(..., description="Total wall-clock time in ms")
+class ChatMessage(BaseModel):
+    """One chat message: a role label plus its text content (both required)."""
+    role: str
+    content: str
+class ChatRequest(BaseModel):
+    """Chat request body: the message list plus optional prompt/persona settings."""
+    # Required list of chat messages
+    messages: List[ChatMessage]
+    # Optional custom system prompt; None when the caller does not send one
+    system_prompt: Optional[str] = None
+    # Target persona; defaults to "physician" when omitted
+    persona: Optional[str] = "physician"
+class HealthResponse(BaseModel):
+    """GET /health — liveness and dependency status."""
+    # Defaults to "ok" unless the handler sets another value
+    status: str = Field(default="ok")
+    # Required: no default, so the handler must always report it explicitly
+    ollama_available: bool
+    # Version string reported by the endpoint; defaults to "0.1.0"
+    version: str = Field(default="0.1.0")
+# ---------------------------------------------------------------------------
+# End-to-end query schemas (POST /query)
+# ---------------------------------------------------------------------------
+class QueryRequest(BaseModel):
+    """POST /query — only a question needed; retrieval + generation happen server-side."""
+    question: str = Field(
+        ...,
+        min_length=5,
+        max_length=8000,
+        description="Medical question (5–8000 chars; may include doc context)",
+        examples=["What is the recommended dosage of Metformin for elderly Type 2 Diabetes patients?"],
+    )
+    top_k: int = Field(
+        default=5,
+        ge=1,
+        le=10,
+        description="Number of context chunks to retrieve (1–10)",
+    )
+    run_ragas: bool = Field(
+        default=False,
+        description="Run RAGAS evaluation (requires LLM backend)",
+    )
+    # Per-request LLM overrides — if not set, server config.yaml values are used
+    # This makes the eval engine portable: callers bring their own key + model
+    llm_provider: Optional[str] = Field(
+        default=None,
+        description="LLM provider override: 'gemini' or 'ollama'"
+    )
+    llm_api_key: Optional[str] = Field(
+        default=None,
+        description="API key override (e.g. Gemini key). Not logged or stored."
+    )
+    llm_model: Optional[str] = Field(
+        default=None,
+        description="Model name override (e.g. 'gemini-2.5-flash-lite')"
+    )
+    ollama_url: Optional[str] = Field(
+        default=None,
+        description="Ollama base URL override (e.g. 'http://localhost:11434')"
+    )
+    # Demo/test only — injects a false claim into the LLM answer before evaluation
+    # to demonstrate the intervention system catching hallucinations.
+    inject_hallucination: Optional[str] = Field(
+        default=None,
+        description="[DEMO ONLY] Appends a false medical claim to the answer before evaluation."
+    )
+    # Consensus Engine (Option 2)
+    use_consensus: bool = Field(
+        default=False,
+        description="Run multiple models and compare for clinical agreement."
+    )
+    # Privacy Shield (Option 1)
+    use_privacy_shield: bool = Field(
+        default=False,
+        description="Automatically redact PHI/PII (names, IDs) before external API calls.",
+    )
+    system_prompt: Optional[str] = Field(
+        default=None,
+        description="Custom system prompt to override the default clinical persona."
+    )
+    persona: Optional[str] = Field(
+        default="physician",
+        description="The target audience for the response: 'physician' or 'patient'."
+    )
+class RetrievedChunk(BaseModel):
+    """A single chunk returned alongside the query response for transparency."""
+    chunk_id: Optional[str] = None
+    # The only required field; the /query handler fills it with the first
+    # 500 characters of the chunk text.
+    text: str
+    # Optional provenance metadata; each defaults to None when omitted
+    source: Optional[str] = None
+    pub_type: Optional[str] = None
+    pub_year: Optional[int] = None
+    title: Optional[str] = None
+    # Retrieval score; the /query handler rounds it to 4 decimal places
+    similarity_score: Optional[float] = None
+class QueryResponse(BaseModel):
+    """POST /query — full end-to-end response."""
+    # Required core payload: the question, the generated answer, and the chunks shown for transparency.
+    question: str
+    generated_answer: str
+    retrieved_chunks: List[RetrievedChunk]
+    # Evaluation fields (same as EvaluateResponse)
+    # composite_score is validated to lie in [0.0, 1.0]; hrs is validated to lie in [0, 100].
+    composite_score: float = Field(..., ge=0.0, le=1.0)
+    hrs: int = Field(..., ge=0, le=100)
+    # NOTE(review): confidence_level / risk_band are free-form strings here; presumably the
+    # aggregator's labels (e.g. LOW / MODERATE / HIGH / CRITICAL) — confirm against the caller.
+    confidence_level: str
+    risk_band: str
+    module_results: ModuleResults
+    total_pipeline_ms: int
+    # Intervention fields (active safety gate)
+    # All intervention fields are optional and default to "no intervention".
+    intervention_applied: bool = Field(
+        default=False,
+        description="True if the system modified or blocked the response for safety.",
+    )
+    intervention_reason: Optional[str] = Field(
+        default=None,
+        description="CRITICAL_BLOCKED | HIGH_RISK_REGENERATED | null",
+    )
+    original_answer: Optional[str] = Field(
+        default=None,
+        description="The original (unsafe) LLM answer before intervention, for transparency.",
+    )
+    intervention_details: Optional[Dict[str, Any]] = Field(
+        default=None,
+        description="Which modules triggered the intervention and their scores.",
+    )
+    # Consensus fields
+    consensus_results: Optional[Dict[str, Any]] = Field(
+        default=None,
+        description="Results from the multi-model agreement check."
+    )
+    # Privacy Shield fields
+    # Both default to "not applied"; the dict schema is not fixed by this model.
+    privacy_applied: bool = Field(default=False)
+    privacy_details: Optional[Dict[str, Any]] = Field(default=None)
+    # Coverage gap gate — distinguishes missing DB coverage from hallucination
+    coverage_gap: bool = Field(
+        default=False,
+        description="True when retrieval quality is low — the database may lack coverage for this topic.",
+    )
+    coverage_gap_details: Optional[Dict[str, Any]] = Field(
+        default=None,
+        description="gap_type (COVERAGE_GAP | HALLUCINATION), retrieval_confidence, threshold.",
+    )

src/cli.py ADDED Viewed

	@@ -0,0 +1,70 @@

+import typer
+import subprocess
+import webbrowser
+import time
+import socket
+import os
+import sys
+app = typer.Typer(help="MediRAG Command Line Interface")
+def is_port_in_use(port: int) -> bool:
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+        return s.connect_ex(('localhost', port)) == 0
+@app.command()
+def start():
+    """Start the full MediRAG experience (Backend + Full Frontend)"""
+    typer.echo("Starting full MediRAG experience...")
+    run_servers(practical_mode=False)
+@app.command()
+def api():
+    """Start the streamlined 'practical' UI"""
+    typer.echo("Starting streamlined MediRAG practical UI...")
+    run_servers(practical_mode=True)
+def run_servers(practical_mode: bool):
+    # Check ports
+    if is_port_in_use(8000):
+        typer.echo("Warning: Port 8000 (Backend) might already be in use.")
+    if is_port_in_use(5173):
+        typer.echo("Warning: Port 5173 (Frontend) might already be in use.")
+    backend_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+    frontend_dir = os.path.join(os.path.dirname(backend_dir), "Frontend")
+    # Start Backend
+    typer.echo("Starting Backend server...")
+    backend_process = subprocess.Popen(
+        [sys.executable, "-m", "uvicorn", "src.api.main:app", "--host", "0.0.0.0", "--port", "8000"],
+        cwd=backend_dir
+    )
+    # Start Frontend
+    typer.echo("Starting Frontend server...")
+    # On Windows, npm run dev needs shell=True or using cmd /c
+    frontend_process = subprocess.Popen(
+        ["cmd", "/c", "npm", "run", "dev"] if os.name == 'nt' else ["npm", "run", "dev"],
+        cwd=frontend_dir
+    )
+    typer.echo("Waiting for servers to start...")
+    time.sleep(5)  # Basic wait for frontend to spin up
+    url = "http://localhost:5173/cli-view" if practical_mode else "http://localhost:5173/"
+    typer.echo(f"Opening browser at {url}...")
+    webbrowser.open(url)
+    try:
+        # Keep process alive
+        backend_process.wait()
+        frontend_process.wait()
+    except KeyboardInterrupt:
+        typer.echo("\nShutting down servers...")
+        backend_process.terminate()
+        frontend_process.terminate()
+        typer.echo("Servers stopped.")
+# Run the Typer application when this file is executed as a script.
+if __name__ == "__main__":
+    app()

src/dashboard/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ # src/dashboard/__init__.py

src/evaluate.py ADDED Viewed

	@@ -0,0 +1,289 @@

+"""
+FR-22: src/evaluate.py — MediRAG Evaluation Orchestrator
+=========================================================
+Top-level entry point for the evaluation pipeline.
+Runs all 4 evaluation modules + RAGAS + aggregator for a given
+(question, answer, context_docs) triple, returning a fully structured
+composite EvalResult.
+Usage as a module:
+    from src.evaluate import run_evaluation
+    result = run_evaluation(question, answer, context_docs)
+    print(f"Score: {result.score:.3f} ({result.details['confidence_level']})")
+Usage from CLI:
+    python -m src.evaluate \\
+        --question "What is the recommended dosage of Metformin for Type 2 Diabetes?" \\
+        --answer "Metformin is typically started at 500mg twice daily..." \\
+        --context-file data/processed/chunks.jsonl \\
+        --top-k 5
+SRS reference: FR-22, Section 7 (Evaluation Pipeline Overview)
+"""
+from __future__ import annotations
+import argparse
+import json
+import logging
+import sys
+import time
+from pathlib import Path
+from typing import Optional
+from src.modules.base import EvalResult
+from src.modules.faithfulness import score_faithfulness
+from src.modules.entity_verifier import verify_entities
+from src.modules.source_credibility import score_source_credibility
+from src.modules.contradiction import score_contradiction
+from src.evaluation.ragas_eval import score_ragas
+from src.evaluation.aggregator import aggregate
+logger = logging.getLogger(__name__)
+# ---------------------------------------------------------------------------
+# Main evaluation function
+# ---------------------------------------------------------------------------
+def run_evaluation(
+    question: str,
+    answer: str,
+    context_chunks: list[dict],
+    rxnorm_cache_path: str = "data/rxnorm_cache.csv",
+    run_ragas: bool = True,
+    weights: Optional[dict[str, float]] = None,
+    config: Optional[dict] = None,
+) -> EvalResult:
+    """
+    Run the full MediRAG evaluation pipeline for a single QA pair.
+    Args:
+        question          : Original user question.
+        answer            : LLM-generated answer to evaluate.
+        context_chunks    : List of retrieved chunk dicts (from retriever.retrieve()).
+                           Each chunk must have at minimum {'text': str}.
+        rxnorm_cache_path : Path to rxnorm_cache.csv for entity verification.
+        run_ragas         : Whether to run the RAGAS module (requires LLM backend).
+        weights           : Override default aggregation weights (optional).
+    Returns:
+        EvalResult for the "aggregator" module containing:
+            .score          → composite score in [0, 1]
+            .details        → full breakdown per module
+            .latency_ms     → total wall-clock time
+    """
+    t_start = time.perf_counter()
+    logger.info("=== MediRAG Evaluation START ===")
+    logger.info("Question: %s", question[:120])
+    logger.info("Answer  : %s", answer[:120])
+    logger.info("Chunks  : %d context documents", len(context_chunks))
+    # Extract text and metadata for modules that need it
+    context_texts: list[str] = [c.get("text", "") for c in context_chunks]
+    chunk_ids: list[str] = [
+        c.get("chunk_id") or c.get("metadata", {}).get("chunk_id") or f"chunk_{i}"
+        for i, c in enumerate(context_chunks)
+    ]
+    # -------------------------------------------------------------------------
+    # Retrieval Quality Gate
+    # If the retriever's absolute RRF score is too low, the chunks are likely
+    # unrelated to the question — evaluation against them produces false HRS spikes.
+    # Threshold: max raw RRF for top-1 in both sources = 2/(60+1) ≈ 0.0328
+    # We flag as insufficient if max_rrf < 0.008 (only very weak BM25 or FAISS match)
+    # -------------------------------------------------------------------------
+    RETRIEVAL_CONFIDENCE_THRESHOLD = 0.008
+    retrieval_confidence = context_chunks[0].get("_retrieval_confidence", 1.0) if context_chunks else 0.0
+    if context_chunks and retrieval_confidence < RETRIEVAL_CONFIDENCE_THRESHOLD:
+        logger.warning(
+            "Retrieval confidence %.6f below threshold %.3f — context likely irrelevant to question.",
+            retrieval_confidence, RETRIEVAL_CONFIDENCE_THRESHOLD,
+        )
+        total_ms = int((time.perf_counter() - t_start) * 1000)
+        return EvalResult(
+            module_name="aggregator",
+            score=0.5,
+            details={
+                "retrieval_insufficient": True,
+                "retrieval_confidence": retrieval_confidence,
+                "hrs": 50,
+                "risk_band": "MODERATE",
+                "confidence_level": "LOW",
+                "total_pipeline_ms": total_ms,
+                "module_results": {},
+                "warning": (
+                    "Retrieved context has very low relevance to the question "
+                    f"(retrieval_confidence={retrieval_confidence:.4f}). "
+                    "Evaluation scores would be meaningless. "
+                    "Consider rephrasing the question or expanding the index."
+                ),
+            },
+            latency_ms=total_ms,
+        )
+    # -------------------------------------------------------------------------
+    # Module 1: Faithfulness (DeBERTa NLI)
+    # -------------------------------------------------------------------------
+    logger.info("--- Module 1: Faithfulness ---")
+    faith_result = score_faithfulness(
+        answer=answer,
+        context_docs=context_texts,
+        chunk_ids=chunk_ids,
+        config=config,
+    )
+    # -------------------------------------------------------------------------
+    # Module 2: Entity Verification (SciSpaCy + RxNorm)
+    # -------------------------------------------------------------------------
+    logger.info("--- Module 2: Entity Verification ---")
+    entity_result = verify_entities(
+        answer=answer,
+        question=question,
+        context_docs=context_texts,
+        rxnorm_cache_path=rxnorm_cache_path,
+    )
+    # -------------------------------------------------------------------------
+    # Module 3: Source Credibility (Evidence Tier)
+    # -------------------------------------------------------------------------
+    logger.info("--- Module 3: Source Credibility ---")
+    source_result = score_source_credibility(retrieved_chunks=context_chunks)
+    # -------------------------------------------------------------------------
+    # Module 4: Contradiction Detection (DeBERTa NLI cross-check)
+    # -------------------------------------------------------------------------
+    logger.info("--- Module 4: Contradiction Detection ---")
+    contra_result = score_contradiction(
+        answer=answer,
+        context_docs=context_texts,
+    )
+    # -------------------------------------------------------------------------
+    # RAGAS (optional — requires LLM backend)
+    # -------------------------------------------------------------------------
+    ragas_result: Optional[EvalResult] = None
+    if run_ragas:
+        logger.info("--- RAGAS Evaluation ---")
+        ragas_result = score_ragas(
+            question=question,
+            answer=answer,
+            context_docs=context_texts,
+        )
+    # -------------------------------------------------------------------------
+    # Aggregator: weighted composite
+    # -------------------------------------------------------------------------
+    logger.info("--- Aggregator ---")
+    agg_result = aggregate(
+        faithfulness_result=faith_result,
+        entity_result=entity_result,
+        source_result=source_result,
+        contradiction_result=contra_result,
+        ragas_result=ragas_result,
+        weights=weights,
+    )
+    total_ms = int((time.perf_counter() - t_start) * 1000)
+    agg_result.details["total_pipeline_ms"] = total_ms
+    # Attach per-module results for API/dashboard access
+    agg_result.details["module_results"] = {
+        "faithfulness":       {"score": faith_result.score,  "details": faith_result.details},
+        "entity_verifier":    {"score": entity_result.score, "details": entity_result.details},
+        "source_credibility": {"score": source_result.score, "details": source_result.details},
+        "contradiction":      {"score": contra_result.score, "details": contra_result.details},
+        "ragas":              {"score": ragas_result.score,  "details": ragas_result.details} if ragas_result else None,
+    }
+    logger.info(
+        "=== MediRAG Evaluation DONE: score=%.3f (%s) in %d ms ===",
+        agg_result.score,
+        agg_result.details.get("confidence_level", "?"),
+        total_ms,
+    )
+    return agg_result
+# ---------------------------------------------------------------------------
+# CLI entry point
+# ---------------------------------------------------------------------------
+def _build_parser() -> argparse.ArgumentParser:
+    p = argparse.ArgumentParser(
+        description="MediRAG evaluation pipeline (FR-22)"
+    )
+    p.add_argument("--question",      required=True, help="User question")
+    p.add_argument("--answer",        required=True, help="LLM answer to evaluate")
+    p.add_argument("--context-file",  default="data/processed/chunks.jsonl",
+                   help="JSONL file of chunks (output of ingest.py)")
+    p.add_argument("--top-k",         type=int, default=5,
+                   help="Number of context chunks to use")
+    p.add_argument("--rxnorm-cache",  default="data/rxnorm_cache.csv",
+                   help="Path to rxnorm_cache.csv")
+    p.add_argument("--no-ragas",      action="store_true",
+                   help="Skip RAGAS evaluation (no LLM backend needed)")
+    p.add_argument("--json",          action="store_true",
+                   help="Output result as JSON")
+    return p
+def _load_context_from_file(path: str, top_k: int) -> list[dict]:
+    """Load top-k chunks from a JSONL file as simple dicts."""
+    chunks = []
+    try:
+        with open(path, "r", encoding="utf-8") as f:
+            for line in f:
+                line = line.strip()
+                if line:
+                    chunks.append(json.loads(line))
+                if len(chunks) >= top_k:
+                    break
+    except FileNotFoundError:
+        logger.error("Context file not found: %s", path)
+        sys.exit(1)
+    return chunks
+if __name__ == "__main__":
+    import yaml
+    # Load config.yaml for logging setup
+    try:
+        cfg = yaml.safe_load(Path("config.yaml").read_text())
+        log_level = cfg.get("logging", {}).get("level", "INFO")
+    except Exception:
+        log_level = "INFO"
+    logging.basicConfig(
+        level=getattr(logging, log_level, logging.INFO),
+        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
+    )
+    args = _build_parser().parse_args()
+    chunks = _load_context_from_file(args.context_file, args.top_k)
+    result = run_evaluation(
+        question=args.question,
+        answer=args.answer,
+        context_chunks=chunks,
+        rxnorm_cache_path=args.rxnorm_cache,
+        run_ragas=not args.no_ragas,
+    )
+    if args.json:
+        import dataclasses
+        print(json.dumps(dataclasses.asdict(result), indent=2))
+    else:
+        print(f"\n{'='*60}")
+        print(f"  MediRAG Evaluation Result")
+        print(f"{'='*60}")
+        print(f"  Score          : {result.score:.3f}")
+        print(f"  Confidence     : {result.details.get('confidence_level', 'N/A')}")
+        print(f"  Pipeline time  : {result.details.get('total_pipeline_ms', 0)} ms")
+        print(f"\n  Module Breakdown:")
+        for mod, res in (result.details.get("module_results") or {}).items():
+            if res:
+                print(f"    {mod:22s}: {res['score']:.3f}")
+        print(f"{'='*60}\n")

src/evaluation/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ # src/evaluation/__init__.py

src/evaluation/aggregator.py ADDED Viewed

	@@ -0,0 +1,173 @@

+"""
+FR-19: src/evaluation/aggregator.py — Weighted Score Aggregation
+================================================================
+Combines scores from all evaluation modules into a single composite score
+using the fixed weights defined in SRS Section 8.2.
+Weights (must sum to 1.0):
+    faithfulness       : 0.35  (primary signal — DeBERTa NLI)
+    entity_accuracy    : 0.20  (SciSpaCy NER + RxNorm)
+    source_credibility : 0.20  (evidence tier)
+    contradiction_risk : 0.15  (1.0 - contradiction_score)
+    ragas_composite    : 0.10  (optional — 0.5 neutral if unavailable)
+Output:
+    EvalResult with:
+        module_name = "aggregator"
+        score       = weighted composite in [0, 1]
+        details     = {weights_used, component_scores, weighted_composite, hrs, risk_band,
+                       component_contributions, confidence_level, module_latencies_ms}
+Usage:
+    from src.evaluation.aggregator import aggregate
+    agg_result = aggregate(faith_res, entity_res, source_res, contra_res, ragas_res)
+"""
+from __future__ import annotations
+import logging
+import time
+from typing import Optional
+from src.modules.base import EvalResult
+logger = logging.getLogger(__name__)
+# ---------------------------------------------------------------------------
+# Default weights (SRS Section 8.2)
+# ---------------------------------------------------------------------------
+# One weight per component score combined by aggregate(); the five values sum to 1.0.
+DEFAULT_WEIGHTS: dict[str, float] = {
+    "faithfulness":        0.35,
+    "entity_accuracy":     0.20,
+    "source_credibility":  0.20,
+    "contradiction_risk":  0.15,
+    "ragas_composite":     0.10,
+}
+def aggregate(
+    faithfulness_result: EvalResult,
+    entity_result: EvalResult,
+    source_result: EvalResult,
+    contradiction_result: EvalResult,
+    ragas_result: Optional[EvalResult] = None,
+    weights: Optional[dict[str, float]] = None,
+) -> EvalResult:
+    """
+    Aggregate all module scores into a single composite evaluation result.
+    Args:
+        faithfulness_result    : Output from faithfulness.score_faithfulness()
+        entity_result          : Output from entity_verifier.verify_entities()
+        source_result          : Output from source_credibility.score_source_credibility()
+        contradiction_result   : Output from contradiction.score_contradiction()
+        ragas_result           : Output from ragas_eval.score_ragas() (optional)
+        weights                : Override default weights (must sum to 1.0)
+    Returns:
+        EvalResult with module_name="aggregator" and composite score.
+    """
+    t0 = time.perf_counter()
+    w = weights or DEFAULT_WEIGHTS
+    # Validate weights sum to 1.0 (tolerance 0.01)
+    weight_sum = sum(w.values())
+    if abs(weight_sum - 1.0) > 0.01:
+        logger.warning(
+            "Weights sum to %.4f (expected 1.0) — normalising.", weight_sum
+        )
+        w = {k: v / weight_sum for k, v in w.items()}
+    # Extract scores — use 0.5 neutral for any unavailable module
+    faith_score = faithfulness_result.score if not faithfulness_result.error else 0.5
+    entity_score = entity_result.score if not entity_result.error else 0.5
+    source_score = source_result.score if not source_result.error else 0.5
+    contra_score = contradiction_result.score if not contradiction_result.error else 1.0
+    ragas_score = (ragas_result.score if ragas_result and not ragas_result.error else 0.5)
+    # Compute base weighted contributions
+    contributions = {
+        "faithfulness_contribution":   round(faith_score  * w["faithfulness"], 4),
+        "entity_contribution":         round(entity_score * w["entity_accuracy"], 4),
+        "source_contribution":         round(source_score * w["source_credibility"], 4),
+        "contradiction_contribution":  round(contra_score * w["contradiction_risk"], 4),
+        "ragas_contribution":          round(ragas_score  * w["ragas_composite"], 4),
+    }
+    base_composite = sum(contributions.values())
+    # --- Non-linear Safety Penalties ---
+    # Faithfulness penalty: applies when answer is not grounded in context.
+    # Contradiction penalty: only applies when actual contradictions are detected
+    #   (score < 0.3). Score = 0.5 means "neutral/cannot verify" (refusal answers,
+    #   no keyword overlap) — these should NOT be double-penalized.
+    penalty_multiplier = 1.0
+    if faith_score <= 0.6:
+        penalty_multiplier *= 0.6  # 40% penalty for ungrounded claims
+    if contra_score < 0.3:
+        penalty_multiplier *= 0.6  # 40% penalty only for confirmed contradictions
+    composite = base_composite * penalty_multiplier
+    # HRS = round(100 × (1 - composite)), then map to risk band
+    # Thresholds must match config.yaml aggregator.risk_bands
+    _HRS_LOW      = 30
+    _HRS_MODERATE = 60
+    _HRS_HIGH     = 85
+    hrs = int(round(100 * (1.0 - composite)))
+    hrs = max(0, min(100, hrs))
+    if hrs <= _HRS_LOW:
+        risk_band = "LOW"
+    elif hrs <= _HRS_MODERATE:
+        risk_band = "MODERATE"
+    elif hrs <= _HRS_HIGH:
+        risk_band = "HIGH"
+    else:
+        risk_band = "CRITICAL"
+    # Confidence level (based on composite, not HRS)
+    if composite >= 0.80:
+        confidence = "HIGH"
+    elif composite >= 0.55:
+        confidence = "MODERATE"
+    else:
+        confidence = "LOW"
+    details = {
+        "weights_used": {k: round(v, 4) for k, v in w.items()},
+        "component_scores": {
+            "faithfulness":       round(faith_score, 4),
+            "entity_accuracy":    round(entity_score, 4),
+            "source_credibility": round(source_score, 4),
+            "contradiction_risk": round(contra_score, 4),
+            "ragas_composite":    round(ragas_score, 4),
+        },
+        "weighted_composite": round(composite, 4),
+        "hrs": hrs,
+        "risk_band": risk_band,
+        "component_contributions": contributions,
+        "confidence_level": confidence,
+        "module_latencies_ms": {
+            "faithfulness":       faithfulness_result.latency_ms,
+            "entity_verifier":    entity_result.latency_ms,
+            "source_credibility": source_result.latency_ms,
+            "contradiction":      contradiction_result.latency_ms,
+            "ragas":              ragas_result.latency_ms if ragas_result else 0,
+        },
+    }
+    latency_ms = int((time.perf_counter() - t0) * 1000)
+    logger.info(
+        "Aggregated score: %.3f (%s confidence) — "
+        "faith=%.2f entity=%.2f source=%.2f contra=%.2f ragas=%.2f",
+        composite, confidence,
+        faith_score, entity_score, source_score, contra_score, ragas_score,
+    )
+    return EvalResult(
+        module_name="aggregator",
+        score=composite,
+        details=details,
+        latency_ms=latency_ms,
+    )

src/evaluation/ragas_eval.py ADDED Viewed

	@@ -0,0 +1,177 @@

+"""
+FR-06: src/evaluation/ragas_eval.py — RAGAS Faithfulness + Answer Relevancy
+=============================================================================
+Wraps the ragas library to compute:
+    - faithfulness      : context-grounded claim verification
+    - answer_relevancy  : semantic similarity of answer to question
+Requires an LLM backend. Backends are selected in this order:
+    1. OpenAI API            — used whenever the OPENAI_API_KEY env var is set
+    2. Ollama (local, free)  — set OLLAMA_HOST env var or use default localhost:11434
+    3. Graceful degradation  — returns a neutral score=0.5 with an explanatory note if no LLM is available
+Usage:
+    from src.evaluation.ragas_eval import score_ragas
+    result = score_ragas(question, answer, context_docs)
+SRS reference: FR-06, Section 7 (Evaluation Pipeline)
+"""
+from __future__ import annotations
+import logging
+import os
+import time
+from typing import Optional
+from src.modules.base import EvalResult
+logger = logging.getLogger(__name__)
+# ---------------------------------------------------------------------------
+# Backend detection
+# ---------------------------------------------------------------------------
+def _detect_llm_backend() -> Optional[str]:
+    """Return 'ollama', 'openai', or None."""
+    if os.getenv("OPENAI_API_KEY"):
+        return "openai"
+    # Check if Ollama is running locally
+    try:
+        import requests
+        host = os.getenv("OLLAMA_HOST", "http://localhost:11434")
+        resp = requests.get(f"{host}/api/tags", timeout=2)
+        if resp.status_code == 200:
+            return "ollama"
+    except Exception:
+        pass
+    return None
+def _build_ragas_llm(backend: str):
+    """Build a ragas-compatible LLM wrapper."""
+    if backend == "openai":
+        from langchain_openai import ChatOpenAI
+        return ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
+    elif backend == "ollama":
+        from langchain_community.chat_models import ChatOllama
+        host = os.getenv("OLLAMA_HOST", "http://localhost:11434")
+        model = os.getenv("OLLAMA_MODEL", "mistral")
+        return ChatOllama(base_url=host, model=model)
+    raise ValueError(f"Unknown backend: {backend}")
+def _build_ragas_embeddings(backend: str):
+    """Build a ragas-compatible embeddings wrapper."""
+    if backend == "openai":
+        from langchain_openai import OpenAIEmbeddings
+        return OpenAIEmbeddings()
+    elif backend == "ollama":
+        from langchain_community.embeddings import OllamaEmbeddings
+        host = os.getenv("OLLAMA_HOST", "http://localhost:11434")
+        model = os.getenv("OLLAMA_EMBED_MODEL", "nomic-embed-text")
+        return OllamaEmbeddings(base_url=host, model=model)
+    raise ValueError(f"Unknown backend: {backend}")
+# ---------------------------------------------------------------------------
+# Public API
+# ---------------------------------------------------------------------------
+def score_ragas(
+    question: str,
+    answer: str,
+    context_docs: list[str],
+    max_contexts: int = 3,
+) -> EvalResult:
+    """
+    Compute RAGAS faithfulness and answer_relevancy scores.
+    Args:
+        question     : Original user question.
+        answer       : LLM-generated answer.
+        context_docs : Retrieved context passages.
+        max_contexts : Max context chunks to pass to RAGAS (to limit token cost).
+    Returns:
+        EvalResult with module_name="ragas", score in [0,1].
+        score = mean(faithfulness, answer_relevancy).
+        Returns score=0.5 (neutral) if no LLM backend is available.
+    """
+    t0 = time.perf_counter()
+    backend = _detect_llm_backend()
+    if backend is None:
+        logger.warning(
+            "No LLM backend available for RAGAS. "
+            "Set OPENAI_API_KEY or start Ollama (ollama serve). "
+            "Returning neutral score (0.5)."
+        )
+        return EvalResult(
+            module_name="ragas",
+            score=0.5,
+            details={
+                "backend": None,
+                "faithfulness": None,
+                "answer_relevancy": None,
+                "note": "No LLM backend — set OPENAI_API_KEY or start Ollama",
+            },
+            latency_ms=int((time.perf_counter() - t0) * 1000),
+        )
+    try:
+        from datasets import Dataset
+        from ragas import evaluate
+        from ragas.metrics import faithfulness, answer_relevancy
+        llm = _build_ragas_llm(backend)
+        embeddings = _build_ragas_embeddings(backend)
+        # Configure metrics to use our chosen backend
+        faithfulness.llm = llm
+        faithfulness.embeddings = embeddings
+        answer_relevancy.llm = llm
+        answer_relevancy.embeddings = embeddings
+        contexts = context_docs[:max_contexts]
+        dataset = Dataset.from_dict(
+            {
+                "question": [question],
+                "answer": [answer],
+                "contexts": [contexts],
+            }
+        )
+        result = evaluate(dataset, metrics=[faithfulness, answer_relevancy])
+        faith_score = float(result["faithfulness"])
+        relevancy_score = float(result["answer_relevancy"])
+        composite = (faith_score + relevancy_score) / 2.0
+        details = {
+            "backend": backend,
+            "faithfulness": round(faith_score, 4),
+            "answer_relevancy": round(relevancy_score, 4),
+        }
+        latency_ms = int((time.perf_counter() - t0) * 1000)
+        logger.info(
+            "RAGAS: faith=%.3f, relevancy=%.3f → composite=%.3f in %d ms",
+            faith_score, relevancy_score, composite, latency_ms,
+        )
+        return EvalResult(
+            module_name="ragas",
+            score=composite,
+            details=details,
+            latency_ms=latency_ms,
+        )
+    except Exception as exc:
+        logger.error("RAGAS evaluation failed: %s", exc)
+        return EvalResult(
+            module_name="ragas",
+            score=0.5,
+            details={"backend": backend, "error": str(exc)},
+            error=str(exc),
+            latency_ms=int((time.perf_counter() - t0) * 1000),
+        )

src/modules/__init__.py ADDED Viewed

	@@ -0,0 +1,127 @@

+"""
+src/modules/__init__.py — Shared EvalResult dataclass (re-exported by src/modules/base.py).
+Used as the standard output schema by all 4 evaluation modules.
+Details shape per module is fully specified here (SRS Section 5).
+"""
+from __future__ import annotations
+import logging
+from dataclasses import dataclass, field
+from typing import Any, Optional
+logger = logging.getLogger(__name__)
+@dataclass
+class EvalResult:
+    """
+    Shared output schema for all evaluation modules.
+    Attributes:
+        module_name : Identifier string, e.g. "faithfulness"
+        score       : Module score in [0.0, 1.0] — clipped automatically
+        details     : Module-specific dict (see DETAILS SHAPES below)
+        error       : None if successful; error message string if module failed
+        latency_ms  : Wall-clock milliseconds for this module's execution
+    """
+    module_name: str
+    score: float
+    details: dict[str, Any] = field(default_factory=dict)
+    error: Optional[str] = None
+    latency_ms: int = 0
+    def __post_init__(self) -> None:
+        """Clip score to [0.0, 1.0] as required by SRS 4.2."""
+        if not (0.0 <= self.score <= 1.0):
+            logger.warning(
+                "%s: score %.4f out of [0,1], clipping.",
+                self.module_name,
+                self.score,
+            )
+            self.score = max(0.0, min(1.0, self.score))
+    # -------------------------------------------------------------------------
+    # DETAILS SHAPE REFERENCE (SRS Section 5)
+    # -------------------------------------------------------------------------
+    #
+    # faithfulness.details:
+    # {
+    #   "total_claims": int,
+    #   "entailed_count": int,
+    #   "neutral_count": int,
+    #   "contradicted_count": int,
+    #   "claims": [
+    #     {
+    #       "claim": str,
+    #       "status": "ENTAILED" | "NEUTRAL" | "CONTRADICTED",
+    #       "best_chunk_id": str,      # chunk with highest NLI score
+    #       "nli_score": float
+    #     }
+    #   ]
+    # }
+    #
+    # entity_verifier.details:
+    # {
+    #   "total_entities": int,
+    #   "verified_count": int,
+    #   "flagged_count": int,
+    #   "entities": [
+    #     {
+    #       "entity": str,
+    #       "type": "DRUG" | "DOSAGE" | "CONDITION" | "PROCEDURE",
+    #       "status": "VERIFIED" | "FLAGGED" | "NOT_FOUND",
+    #       "severity": "CRITICAL" | "MODERATE" | "MINOR" | null,
+    #       "answer_value": str,
+    #       "context_value": str | null,
+    #       "rxcui": str | null
+    #     }
+    #   ]
+    # }
+    #
+    # source_credibility.details:
+    # {
+    #   "method_used": "keyword" | "metadata",
+    #   "chunks": [
+    #     {
+    #       "chunk_id": str,
+    #       "tier": int,             # 1–5
+    #       "tier_weight": float,
+    #       "pub_type": str,
+    #       "title": str,
+    #       "matched_keyword": str | null
+    #     }
+    #   ]
+    # }
+    #
+    # contradiction.details:
+    # {
+    #   "total_sentences": int,
+    #   "checked_pairs": int,
+    #   "contradicted_pairs": int,
+    #   "pairs": [
+    #     {
+    #       "sentence_a": str,
+    #       "sentence_b": str,
+    #       "contradiction_score": float,
+    #       "flagged": bool
+    #     }
+    #   ]
+    # }
+    #
+    # aggregator.details:
+    # {
+    #   "weights_used": {
+    #     "faithfulness": float,
+    #     "entity_accuracy": float,
+    #     "source_credibility": float,
+    #     "contradiction_risk": float
+    #   },
+    #   "weighted_composite": float,
+    #   "component_contributions": {
+    #     "faithfulness_contribution": float,
+    #     "entity_contribution": float,
+    #     "source_contribution": float,
+    #     "contradiction_contribution": float
+    #   }
+    # }

src/modules/base.py ADDED Viewed

	@@ -0,0 +1,4 @@

+"""src/modules/base.py — see __init__.py of this package for EvalResult."""
+from src.modules import EvalResult  # re-export for convenience
+__all__ = ["EvalResult"]

src/modules/contradiction.py ADDED Viewed

	@@ -0,0 +1,259 @@

+"""
+FR-17: src/modules/contradiction.py — Module 4: Cross-Document Contradiction Detection
+========================================================================================
+Uses the NLI cross-encoder loaded by faithfulness.py (falling back to
+cross-encoder/nli-deberta-v3-small when that module cannot be imported) to
+detect contradictions between the LLM answer and retrieved context passages.
+Algorithm (SRS Section 6.4):
+    1. Split answer into sentences  (claims)
+    2. Split each context chunk into sentences
+    3. For each (answer_sentence, context_sentence) pair that shares at least
+       MIN_KEYWORD_OVERLAP content words:
+        - Run NLI → get contradiction score
+        - If contradiction_score ≥ CONTRADICTION_THRESHOLD → flag pair
+    4. score = 1.0 - (flagged_pairs / total_pairs)
+This module shares the NLI model instance with faithfulness.py when both
+run in the same process (the model is cached at the faithfulness module level).
+Design note:
+    To keep latency manageable, context sentences are limited to
+    MAX_CONTEXT_SENTS per chunk and total pairs are capped at MAX_PAIRS.
+"""
+from __future__ import annotations
+import logging
+import time
+import pysbd
+from src.modules.base import EvalResult
+logger = logging.getLogger(__name__)
+# ---------------------------------------------------------------------------
+# Constants
+# ---------------------------------------------------------------------------
+# NLI contradiction probability at or above which a sentence pair is flagged.
+CONTRADICTION_THRESHOLD = 0.50   # Balanced: catches real contradictions without over-flagging
+MIN_KEYWORD_OVERLAP = 3          # At least 3 meaningful words in common before running NLI
+MAX_CONTEXT_SENTS = 4   # first N sentences kept from each context chunk
+MAX_PAIRS = 200          # hard cap to keep latency bounded (~2-3s)
+# Filled in lazily by _get_segmenter(): None until first use, afterwards either
+# a pysbd.Segmenter or the string "stub" when pysbd could not be set up.
+_segmenter = None
+# Common stopwords to ignore in overlap check. Besides function words the set
+# also lists short clinical tokens ("mg", "iv", "od", "1", "2", "3");
+# _keyword_overlap additionally drops every word of two characters or fewer.
+_STOPWORDS = {
+    "the", "a", "an", "is", "in", "of", "to", "for", "and", "or", "are",
+    "be", "at", "by", "if", "it", "as", "on", "with", "this", "that",
+    "was", "were", "not", "no", "have", "has", "had", "but", "so", "from",
+    "should", "may", "can", "will", "than", "more", "when", "which", "who",
+    "what", "all", "each", "after", "before", "been", "do", "does", "1",
+    "2", "3", "mg", "iv", "od", "per", "day", "based", "using", "include",
+}
+def _get_segmenter():
+    """Lazily load and return the pysbd segmenter."""
+    global _segmenter
+    if _segmenter is None:
+        try:
+            import pysbd
+            _segmenter = pysbd.Segmenter(language="en", clean=False)
+        except ImportError:
+            logger.warning("pysbd not installed, falling back to naive sentence splitting.")
+            _segmenter = "stub"  # Use a string to indicate stub mode
+        except Exception as e:
+            logger.error("Failed to initialize pysbd segmenter: %s", e)
+            _segmenter = "stub"
+    return _segmenter
+def _keyword_overlap(sent_a: str, sent_b: str) -> int:
+    """Count shared content words between two sentences."""
+    tokens_a = {w.lower() for w in sent_a.split() if w.lower() not in _STOPWORDS and len(w) > 2}
+    tokens_b = {w.lower() for w in sent_b.split() if w.lower() not in _STOPWORDS and len(w) > 2}
+    return len(tokens_a & tokens_b)
+def _segment(text: str) -> list[str]:
+    """Segment text into sentences using pysbd or a fallback."""
+    seg = _get_segmenter()
+    try:
+        if seg == "stub":
+            return [s.strip() for s in text.split(".") if s.strip()]
+        else:
+            return [s.strip() for s in seg.segment(text) if s.strip()]
+    except Exception:
+        return [s.strip() for s in text.split(".") if s.strip()]
+# ---------------------------------------------------------------------------
+# Public API
+# ---------------------------------------------------------------------------
+def score_contradiction(
+    answer: str,
+    context_docs: list[str],
+    max_chunks: int = 5,
+) -> EvalResult:
+    """
+    Detect contradictions between the LLM answer and retrieved context.
+    Args:
+        answer       : LLM-generated answer text.
+        context_docs : List of retrieved context passage strings.
+        max_chunks   : Max number of context chunks to evaluate.
+    Returns:
+        EvalResult with module_name="contradiction", score in [0,1] where
+        1.0 = no contradictions detected, 0.0 = all pairs contradicted.
+    """
+    t0 = time.perf_counter()
+    if not answer or not context_docs:
+        return EvalResult(
+            module_name="contradiction",
+            score=0.5,  # neutral — cannot verify with missing input
+            details={"total_sentences": 0, "checked_pairs": 0, "contradicted_pairs": 0, "pairs": []},
+            latency_ms=0,
+        )
+    # Import model via faithfulness module (shared cache)
+    try:
+        from src.modules.faithfulness import _get_model, LABEL_CONTRADICTION
+    except ImportError:
+        # (Lazy imports to prevent startup crashes when libraries aren't installed yet)
+        try:
+            from sentence_transformers import CrossEncoder
+            _model = CrossEncoder("cross-encoder/nli-deberta-v3-small")
+            _get_model = lambda: _model  # noqa: E731
+            LABEL_CONTRADICTION = 0
+        except ImportError:
+            logger.error("sentence-transformers not installed. Cannot run NLI model.")
+            return EvalResult(
+                module_name="contradiction",
+                score=1.0,
+                details={},
+                error="NLI model (sentence-transformers) not installed.",
+                latency_ms=int((time.perf_counter() - t0) * 1000),
+            )
+        except Exception as exc:
+            logger.error("Failed to load NLI model: %s", exc)
+            return EvalResult(
+                module_name="contradiction",
+                score=1.0,
+                details={},
+                error=f"Failed to load NLI model: {exc}",
+                latency_ms=int((time.perf_counter() - t0) * 1000),
+            )
+    model = _get_model()
+    # Strip markdown/citations from answer before NLI (same reason as faithfulness.py)
+    import re as _re
+    _MD = _re.compile(
+        r'\[Source:[^\]]*\]|\[[^\]]{0,120}\]'  # citations
+        r'|\*\*([^*]+)\*\*|\*([^*]+)\*'        # bold/italic → keep text
+        r'|`[^`]+`'                             # code
+    )
+    answer = _MD.sub(lambda m: (m.group(1) or m.group(2) or ''), answer).strip()
+    # Segment answer into claims
+    answer_sents = _segment(answer)
+    if not answer_sents:
+        return EvalResult(
+            module_name="contradiction",
+            score=0.5,  # neutral — cannot verify with no sentences
+            details={"total_sentences": 0, "checked_pairs": 0, "contradicted_pairs": 0, "pairs": []},
+            latency_ms=0,
+        )
+    # Segment context chunks
+    docs = context_docs[:max_chunks]
+    context_sents: list[str] = []
+    for doc in docs:
+        sents = _segment(doc)[:MAX_CONTEXT_SENTS]
+        context_sents.extend(sents)
+    if not context_sents:
+        return EvalResult(
+            module_name="contradiction",
+            score=0.5,  # neutral — cannot verify with no context sentences
+            details={"total_sentences": len(answer_sents), "checked_pairs": 0, "contradicted_pairs": 0, "pairs": []},
+            latency_ms=0,
+        )
+    # Build pairs WITH topical pre-filter (skip unrelated sentence pairs entirely)
+    all_pairs: list[tuple[str, str]] = []
+    for a_sent in answer_sents:
+        for c_sent in context_sents:
+            if _keyword_overlap(a_sent, c_sent) >= MIN_KEYWORD_OVERLAP:
+                all_pairs.append((a_sent, c_sent))
+            if len(all_pairs) >= MAX_PAIRS:
+                break
+        if len(all_pairs) >= MAX_PAIRS:
+            break
+    if not all_pairs:
+        # Topically unrelated — cannot check for contradictions
+        return EvalResult(
+            module_name="contradiction",
+            score=0.5,  # neutral — no overlapping pairs to evaluate
+            details={"total_sentences": len(answer_sents), "checked_pairs": 0, "contradicted_pairs": 0, "pairs": []},
+            latency_ms=int((time.perf_counter() - t0) * 1000),
+        )
+    # Batch NLI inference
+    try:
+        scores_raw = model.predict(all_pairs, apply_softmax=True)
+    except Exception as exc:
+        logger.error("Contradiction NLI inference failed: %s", exc)
+        return EvalResult(
+            module_name="contradiction",
+            score=1.0,
+            details={},
+            error=f"Model inference error: {exc}",
+            latency_ms=int((time.perf_counter() - t0) * 1000),
+        )
+    # Collect flagged pairs
+    pair_details: list[dict] = []
+    contradicted = 0
+    total = len(all_pairs)
+    for i, (a_sent, c_sent) in enumerate(all_pairs):
+        con_score = float(scores_raw[i][LABEL_CONTRADICTION])
+        flagged = con_score >= CONTRADICTION_THRESHOLD
+        if flagged:
+            contradicted += 1
+            # Only log the most severe contradictions to keep details manageable
+            pair_details.append(
+                {
+                    "sentence_a": a_sent[:120],
+                    "sentence_b": c_sent[:120],
+                    "contradiction_score": round(con_score, 4),
+                    "flagged": True,
+                }
+            )
+    # Score: 1.0 = clean, lower = more contradictions found
+    score = 1.0 - (contradicted / total) if total > 0 else 1.0
+    details = {
+        "total_sentences": len(answer_sents),
+        "checked_pairs": total,
+        "contradicted_pairs": contradicted,
+        "pairs": pair_details[:20],  # cap output to top 20 flagged pairs
+    }
+    latency_ms = int((time.perf_counter() - t0) * 1000)
+    logger.info(
+        "Contradiction: %.3f (%d/%d pairs flagged) in %d ms",
+        score, contradicted, total, latency_ms,
+    )
+    return EvalResult(
+        module_name="contradiction",
+        score=score,
+        details=details,
+        latency_ms=latency_ms,
+    )

src/modules/entity_verifier.py ADDED Viewed

	@@ -0,0 +1,334 @@

+"""
+FR-09: src/modules/entity_verifier.py — Module 2: Medical Entity Verification
+==============================================================================
+Uses SciSpaCy NER (the model named by NER_MODEL, currently en_ner_bc5cdr_md) to
+extract medical entities from the answer, then verifies drug entities against
+the RxNorm cache and/or REST API.
+Verification pipeline (SRS Section 6.2):
+    1. NER: extract DRUG, DOSAGE, CONDITION, PROCEDURE entities from answer
+    2. For each DRUG entity:
+        a. Look up in local rxnorm_cache.csv (fast, offline)
+        b. If not found, query RxNorm REST API /approximateTerm (live fallback)
+        c. If still not found, mark as NOT_FOUND
+    3. Cross-check entity presence in context docs (optional validation)
+    4. Score = verified_drug_count / total_drug_count  (non-drug entities have no score impact)
+Entity status values:
+    VERIFIED  — drug found in RxNorm cache or API with rxcui
+    FLAGGED   — entity found but has a known dangerous synonym conflict
+    NOT_FOUND — drug name not resolvable via any layer
+Severity mapping (for FLAGGED):
+    brand ↔ generic mismatch → CRITICAL
+    dosage discrepancy       → MODERATE
+    minor synonym variant    → MINOR
+"""
+from __future__ import annotations
+import logging
+import re
+import time
+from functools import lru_cache
+from pathlib import Path
+from typing import Optional
+import pandas as pd
+import requests
+from src.modules.base import EvalResult
+logger = logging.getLogger(__name__)
+# ---------------------------------------------------------------------------
+# Constants
+# ---------------------------------------------------------------------------
+# Endpoint queried by _lookup_rxnorm_api when a drug is missing from the local cache.
+RXNORM_APPROX_URL = "https://rxnav.nlm.nih.gov/REST/approximateTerm.json"
+# CSV read by _load_rxnorm_cache; it is looked up by the columns drug_name and rxcui.
+DEFAULT_CACHE_PATH = "data/rxnorm_cache.csv"
+# Pipeline name handed to spacy.load() in _get_spacy_model.
+NER_MODEL = "en_ner_bc5cdr_md"
+DOSAGE_TOLERANCE_PCT = 10  # flag if answer dose differs from context dose by > 10%
+# Matches clinical dose values: "500 mg", "2.5 mcg/kg", "10 IU", etc.
+# Only the number is captured (group 1); the unit is matched but not captured.
+_DOSE_RE = re.compile(
+    r'(\d+(?:\.\d+)?)\s*(?:mg|mcg|g\b|ml|iu|units?|mg/kg|mg/dl)',
+    re.IGNORECASE,
+)
+# Map spacy entity labels to our schema types
+# Labels absent from this map are treated as CONDITION by verify_entities.
+_ENTITY_TYPE_MAP = {
+    # en_core_sci_lg (CRAFT corpus) labels
+    "CHEBI":       "DRUG",       # Chemical Entities of Biological Interest — covers drugs
+    "GGP":         "CONDITION",  # Gene or Gene Product
+    "SO":          "CONDITION",  # Sequence Ontology
+    "TAXON":       "CONDITION",
+    "GO":          "CONDITION",  # Gene Ontology
+    "CL":          "CONDITION",  # Cell Line
+    "DNA":         "CONDITION",
+    "RNA":         "CONDITION",
+    "CELL_TYPE":   "CONDITION",
+    "CELL_LINE":   "CONDITION",
+    "PROTEIN":     "CONDITION",
+    # BC5CDR labels (used by some scispacy models)
+    "Chemical":    "DRUG",
+    "Disease":     "CONDITION",
+    # Generic / fallback labels
+    "CHEMICAL":    "DRUG",
+    "DRUG":        "DRUG",
+    "COMPOUND":    "DRUG",
+    "DISEASE":     "CONDITION",
+    "SYMPTOM":     "CONDITION",
+    "PROCEDURE":   "PROCEDURE",
+    "DOSAGE":      "DOSAGE",
+}
+DRUG_TYPES = {"DRUG"}  # only these get verified against RxNorm
+# ---------------------------------------------------------------------------
+# Module-level caches
+# ---------------------------------------------------------------------------
+# Loaded spaCy pipeline; None until _get_spacy_model() succeeds once.
+_spacy_model = None
+_rxnorm_cache: dict[str, str] | None = None  # drug_name -> rxcui
+# Path the cached dict was loaded from; _get_rxnorm_cache reloads when a
+# different path is requested.
+_rxnorm_cache_path: str = DEFAULT_CACHE_PATH
+def _get_spacy_model():
+    global _spacy_model
+    if _spacy_model is None:
+        import spacy
+        logger.info("Loading SciSpaCy NER model: %s (first call only)", NER_MODEL)
+        try:
+            _spacy_model = spacy.load(NER_MODEL)
+            logger.info("SciSpaCy model loaded.")
+        except OSError as exc:
+            logger.error(
+                "Failed to load '%s'. Install with: "
+                "pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/"
+                "releases/v0.5.4/en_ner_bc5cdr_md-0.5.4.tar.gz\nError: %s",
+                NER_MODEL, exc,
+            )
+            raise
+    return _spacy_model
+def _load_rxnorm_cache(cache_path: str) -> dict[str, str]:
+    """Load the RxNorm cache CSV into a lowercase drug_name → rxcui dict."""
+    path = Path(cache_path)
+    if not path.exists():
+        logger.warning(
+            "RxNorm cache not found at '%s'. Live API only will be used.", cache_path
+        )
+        return {}
+    try:
+        df = pd.read_csv(path, dtype=str)
+        cache = {
+            str(row["drug_name"]).strip().lower(): str(row["rxcui"]).strip()
+            for _, row in df.iterrows()
+            if pd.notna(row.get("drug_name")) and pd.notna(row.get("rxcui"))
+            and str(row.get("rxcui", "")).strip()
+        }
+        logger.info("RxNorm cache loaded: %d entries from %s", len(cache), cache_path)
+        return cache
+    except Exception as exc:
+        logger.warning("Failed to load RxNorm cache: %s", exc)
+        return {}
+def _get_rxnorm_cache(cache_path: str) -> dict[str, str]:
+    global _rxnorm_cache, _rxnorm_cache_path
+    if _rxnorm_cache is None or cache_path != _rxnorm_cache_path:
+        _rxnorm_cache_path = cache_path
+        _rxnorm_cache = _load_rxnorm_cache(cache_path)
+    return _rxnorm_cache
+def _extract_doses_near(text: str, drug_name: str, window: int = 180) -> list[float]:
+    """Return numeric dose values found within `window` chars of `drug_name` in `text`."""
+    idx = text.lower().find(drug_name.lower())
+    if idx == -1:
+        return []
+    vicinity = text[max(0, idx - window // 2): idx + len(drug_name) + window]
+    return [float(m.group(1)) for m in _DOSE_RE.finditer(vicinity)]
+def _lookup_rxnorm_api(drug_name: str, timeout: int = 4) -> Optional[str]:
+    """Query RxNorm REST API. Returns rxcui string or None."""
+    try:
+        resp = requests.get(
+            RXNORM_APPROX_URL,
+            params={"term": drug_name, "maxEntries": "1", "option": "1"},
+            timeout=timeout,
+        )
+        if resp.status_code != 200:
+            return None
+        candidates = (
+            resp.json()
+            .get("approximateGroup", {})
+            .get("candidate", [])
+        )
+        if candidates:
+            return str(candidates[0].get("rxcui", "")).strip() or None
+    except Exception:
+        pass
+    return None
+# ---------------------------------------------------------------------------
+# Public API
+# ---------------------------------------------------------------------------
+def verify_entities(
+    answer: str,
+    question: str = "",
+    context_docs: list[str] | None = None,
+    rxnorm_cache_path: str = DEFAULT_CACHE_PATH,
+    use_api_fallback: bool = True,
+) -> EvalResult:
+    """
+    Extract and verify medical entities from the LLM answer.
+    Entities are extracted from the question and answer together, so a drug
+    that appears only in the question still counts toward the score. DRUG
+    entities are resolved against the local RxNorm cache and then, optionally,
+    the REST API. A resolved drug whose dose in the answer differs from the
+    closest dose in the context by more than DOSAGE_TOLERANCE_PCT percent is
+    FLAGGED instead of VERIFIED. CONDITION and PROCEDURE entities are only
+    checked for presence in the context and do not affect the score.
+    Args:
+        answer            : LLM-generated answer text.
+        question          : Original question (NER'd alongside answer for richer entity set).
+        context_docs      : Retrieved context passages (used for cross-checking).
+        rxnorm_cache_path : Path to rxnorm_cache.csv.
+        use_api_fallback  : Whether to call RxNorm REST API for cache misses.
+    Returns:
+        EvalResult with module_name="entity_verifier", score in [0,1], and
+        details matching the shape from src/modules/__init__.py.
+        score is verified drugs / total drugs, or a neutral 0.5 when the NER
+        model is unavailable, no entities are found, or none of them is a drug.
+    """
+    t0 = time.perf_counter()
+    # --- NER -----------------------------------------------------------------
+    try:
+        nlp = _get_spacy_model()
+    except Exception as exc:
+        return EvalResult(
+            module_name="entity_verifier",
+            score=0.5,  # neutral fallback — don't penalise if model not available
+            details={"error": str(exc), "entities": []},
+            error=f"NER model unavailable: {exc}",
+            latency_ms=int((time.perf_counter() - t0) * 1000),
+        )
+    # Combine question + answer for richer entity extraction
+    combined_text = f"{question} {answer}" if question else answer
+    doc = nlp(combined_text)
+    # Collect entities with deduplication
+    # (case-insensitive; the first spelling encountered is the one kept)
+    seen: set[str] = set()
+    raw_entities: list[tuple[str, str]] = []  # (text, type)
+    for ent in doc.ents:
+        key = ent.text.strip().lower()
+        if not key or key in seen:
+            continue
+        seen.add(key)
+        # Labels missing from the map are treated as CONDITION.
+        entity_type = _ENTITY_TYPE_MAP.get(ent.label_, "CONDITION")
+        raw_entities.append((ent.text.strip(), entity_type))
+    if not raw_entities:
+        return EvalResult(
+            module_name="entity_verifier",
+            score=0.5,  # neutral — cannot verify what isn't there
+            details={"total_entities": 0, "verified_count": 0, "flagged_count": 0, "entities": []},
+            latency_ms=int((time.perf_counter() - t0) * 1000),
+        )
+    # --- RxNorm verification for DRUG entities -------------------------------
+    cache = _get_rxnorm_cache(rxnorm_cache_path)
+    # Lowercased once so the substring checks below are case-insensitive.
+    context_text = " ".join(context_docs or []).lower()
+    entity_results: list[dict] = []
+    drug_total = 0
+    drug_verified = 0
+    drug_flagged = 0
+    for entity_text, entity_type in raw_entities:
+        # Every entity starts as NOT_FOUND; the branches below upgrade it.
+        result = {
+            "entity": entity_text,
+            "type": entity_type,
+            "status": "NOT_FOUND",
+            "severity": None,
+            "answer_value": entity_text,
+            "context_value": None,
+            "rxcui": None,
+        }
+        if entity_type in DRUG_TYPES:
+            drug_total += 1
+            key = entity_text.lower()
+            # Layer 1: Local cache lookup
+            rxcui = cache.get(key)
+            # Layer 2: API fallback
+            if not rxcui and use_api_fallback:
+                rxcui = _lookup_rxnorm_api(entity_text)
+            if rxcui:
+                result["rxcui"] = rxcui
+                # Check for dosage discrepancy before marking VERIFIED
+                answer_doses = _extract_doses_near(answer, entity_text)
+                context_doses = _extract_doses_near(context_text, entity_text)
+                flagged_dose = False
+                if answer_doses and context_doses:
+                    # Compare the first dose near the drug in the answer with the
+                    # numerically closest context dose. Only the numbers are
+                    # compared; units are not taken into account here.
+                    a_dose = answer_doses[0]
+                    c_dose = min(context_doses, key=lambda d: abs(d - a_dose))
+                    # max(..., 1e-9) guards against division by a zero context dose.
+                    pct_diff = abs(a_dose - c_dose) / max(c_dose, 1e-9) * 100
+                    if pct_diff > DOSAGE_TOLERANCE_PCT:
+                        result["status"] = "FLAGGED"
+                        result["severity"] = "MODERATE"
+                        result["answer_value"] = f"{a_dose} (answer)"
+                        result["context_value"] = f"{c_dose} (context, Δ{pct_diff:.0f}%)"
+                        drug_flagged += 1
+                        flagged_dose = True
+                        logger.warning(
+                            "Dosage discrepancy for '%s': answer=%.1f context=%.1f (%.0f%%)",
+                            entity_text, a_dose, c_dose, pct_diff,
+                        )
+                if not flagged_dose:
+                    result["status"] = "VERIFIED"
+                    drug_verified += 1
+                    if key in context_text:
+                        result["context_value"] = entity_text
+            else:
+                result["status"] = "NOT_FOUND"
+        elif entity_type in ("CONDITION", "PROCEDURE"):
+            # Non-drug entities: check presence in context only
+            if entity_text.lower() in context_text:
+                result["status"] = "VERIFIED"
+                result["context_value"] = entity_text
+            else:
+                result["status"] = "NOT_FOUND"
+        # Entities of any other type (e.g. DOSAGE) match neither branch and
+        # keep the initial NOT_FOUND status.
+        entity_results.append(result)
+    # --- Score ---------------------------------------------------------------
+    # Score is based on drug entities only (per SRS Section 6.2)
+    # FLAGGED drugs are counted in drug_total but not in drug_verified.
+    if drug_total == 0:
+        score = 0.5  # neutral — no drug entities to verify
+    else:
+        score = drug_verified / drug_total
+    details = {
+        "total_entities": len(raw_entities),
+        "drug_total": drug_total,
+        "verified_count": drug_verified,
+        "flagged_count": drug_flagged,
+        "entities": entity_results,
+    }
+    latency_ms = int((time.perf_counter() - t0) * 1000)
+    logger.info(
+        "Entity verification: %.3f (%d/%d drugs verified) in %d ms",
+        score, drug_verified, drug_total, latency_ms,
+    )
+    return EvalResult(
+        module_name="entity_verifier",
+        score=score,
+        details=details,
+        latency_ms=latency_ms,
+    )

src/modules/faithfulness.py ADDED Viewed

	@@ -0,0 +1,302 @@

+"""
+FR-05: src/modules/faithfulness.py — Module 1: Faithfulness Scoring
+=====================================================================
+Uses the NLI cross-encoder named by MODEL_NAME (currently
+cnut1648/biolinkbert-mednli) to score how well the LLM answer
+is entailed by the retrieved context chunks.
+Architecture:
+    1. Split answer into individual claims (sentences via pysbd)
+    2. For each claim: compute NLI score against every context chunk
+    3. Assign claim status: ENTAILED / NEUTRAL / CONTRADICTED
+    4. score = entailed_count / total_claims
+Thresholds (SRS Section 6.1):
+    entailment  ≥ 0.50  → ENTAILED
+    contradiction ≥ 0.30 → CONTRADICTED
+    otherwise           → NEUTRAL
+Model loaded lazily and cached at module level (avoids double-loading
+when called multiple times in same process).
+"""
+from __future__ import annotations
+import logging
+import time
+from functools import lru_cache
+from typing import TYPE_CHECKING
+from src.modules.base import EvalResult
+if TYPE_CHECKING:
+    pass
+logger = logging.getLogger(__name__)
+# ---------------------------------------------------------------------------
+# Constants
+# ---------------------------------------------------------------------------
+# BioLinkBERT fine-tuned on MedNLI (clinical notes, MIMIC-III)
+# Paper 15 (Chen et al. SemEval-2023): best single model for biomedical NLI (F1=0.765)
+# Faster on CPU than DeBERTa-large (BERT-base architecture)
+MODEL_NAME = "cnut1648/biolinkbert-mednli"
+# MedNLI label order (verified): {0: entailment, 1: neutral, 2: contradiction}
+# These are used as indices into each score vector returned by model.predict().
+LABEL_ENTAILMENT = 0
+LABEL_NEUTRAL = 1
+LABEL_CONTRADICTION = 2
+# Default entailment cut-off; score_faithfulness lets the config override it via
+# modules.faithfulness.entailment_threshold.
+ENTAILMENT_THRESHOLD = 0.50
+# Always taken from this constant in score_faithfulness (no config override).
+# NOTE: contradiction.py defines its own CONTRADICTION_THRESHOLD of 0.50.
+CONTRADICTION_THRESHOLD = 0.30
+# ---------------------------------------------------------------------------
+# Lazy model loader
+# ---------------------------------------------------------------------------
+# Both caches start as None and are filled on first use; each may instead hold
+# the string "stub" when its library cannot be imported.
+_model = None
+_segmenter = None
+def _get_model():
+    global _model
+    if _model is None:
+        try:
+            from sentence_transformers import CrossEncoder
+            logger.info("Loading NLI model: %s (first call only)", MODEL_NAME)
+            _model = CrossEncoder(MODEL_NAME)
+            logger.info("NLI model loaded.")
+        except ImportError:
+            logger.error("sentence_transformers not installed. Faithfulness will be stubbed.")
+            _model = "stub"
+    return _model
+def _get_segmenter():
+    global _segmenter
+    if _segmenter is None:
+        try:
+            import pysbd
+            _segmenter = pysbd.Segmenter(language="en", clean=False)
+        except ImportError:
+            _segmenter = "stub"
+    return _segmenter
+# ---------------------------------------------------------------------------
+# Public API
+# ---------------------------------------------------------------------------
+def score_faithfulness(
+    answer: str,
+    context_docs: list[str],
+    chunk_ids: list[str] | None = None,
+    max_chunks: int = 3,
+    config: dict | None = None,
+) -> EvalResult:
+    """
+    Score the faithfulness of an answer against retrieved context documents.
+    Args:
+        answer       : The LLM-generated answer text.
+        context_docs : List of context passage strings (top-k retrieved chunks).
+        chunk_ids    : Optional IDs matching context_docs for traceability.
+        max_chunks   : Maximum context chunks to consider (to limit API calls).
+    Returns:
+        EvalResult with module_name="faithfulness", score in [0,1], and details
+        dict matching the shape defined in src/modules/__init__.py.
+    """
+    t0 = time.perf_counter()
+    _faith_cfg = (config or {}).get("modules", {}).get("faithfulness", {})
+    entailment_threshold = _faith_cfg.get("entailment_threshold", ENTAILMENT_THRESHOLD)
+    contradiction_threshold = CONTRADICTION_THRESHOLD
+    if not answer or not context_docs:
+        return EvalResult(
+            module_name="faithfulness",
+            score=0.0,
+            details={"error": "Empty answer or no context provided"},
+            error="Empty answer or no context",
+            latency_ms=0,
+        )
+    # Limit context size
+    docs = context_docs[:max_chunks]
+    ids = (chunk_ids or [f"chunk_{i}" for i in range(len(docs))])[:max_chunks]
+    # Strip markdown formatting from guideline/structured chunks before NLI
+    # DeBERTa NLI was trained on clean prose — markdown confuses it
+    import re as _re
+    _MD_CLEAN = _re.compile(r'\[([^\]]+)\]\n|#{1,6}\s+|•\s+|\*\*([^*]+)\*\*|\*([^*]+)\*|`[^`]+`')
+    docs = [_MD_CLEAN.sub(lambda m: m.group(2) or m.group(3) or '', d) for d in docs]
+    # Strip inline citations and markdown from the answer before claim splitting.
+    # LLM answers often include [Source: *title*] citations and **bold** text that
+    # confuse BioLinkBERT NLI — the model was trained on clean prose.
+    _CITE_RE = _re.compile(
+        r'\[Source:[^\]]*\]'           # [Source: title] or [Source: *italic title*]
+        r'|\[[^\]]{0,120}\]'           # other short bracket constructs
+        r'|\*\*([^*]+)\*\*'            # **bold** → keep inner text
+        r'|\*([^*]+)\*'                # *italic* → keep inner text
+        r'|`[^`]+`'                    # `code`
+        r'|^\s*[*•]\s+'               # bullet points at line start
+    )
+    answer_clean = _CITE_RE.sub(lambda m: (m.group(1) or m.group(2) or ''), answer).strip()
+    # Split answer into claims
+    seg = _get_segmenter()
+    try:
+        if seg == "stub":
+            claims = [s.strip() for s in answer_clean.split(".") if s.strip()]
+        else:
+            claims = [s.strip() for s in seg.segment(answer_clean) if s.strip()]
+    except Exception:
+        claims = [s.strip() for s in answer_clean.split(".") if s.strip()]
+    if not claims:
+        return EvalResult(
+            module_name="faithfulness",
+            score=0.5,
+            details={"error": "Could not extract claims from answer"},
+            error="No claims extracted",
+            latency_ms=0,
+        )
+    model = _get_model()
+    # Limit claims to avoid O(claims×chunks) explosion with the large model
+    claims = claims[:12]
+    # ---------------------------------------------------------------------------
+    # Numerical Bypass (Paper 14: non-optional for clinical NLI)
+    # NLI models structurally cannot verify numerical comparisons (≥6.5%, 126 mg/dL).
+    # Use direct string/lexical matching for claims containing clinical measurements.
+    # ---------------------------------------------------------------------------
+    import re as _re2
+    _NUM_PATTERN = _re2.compile(
+        r'[\d]+[\s]*(mg|mcg|%|mL|mmol|IU|units?|g|kg|≥|≤|>|<|±|mg/dL|mmol/L|mg/kg)',
+        _re2.IGNORECASE,
+    )
+    def _numerical_match(claim: str, context_chunks: list[str]) -> str:
+        """
+        For claims with numerical clinical values, check if the key numbers
+        appear in any context chunk. Returns ENTAILED or NEUTRAL.
+        """
+        nums = _re2.findall(r'[\d]+\.?[\d]*', claim)
+        if not nums:
+            return "NEUTRAL"
+        combined = " ".join(context_chunks).lower()
+        matched = sum(1 for n in nums if n in combined)
+        return "ENTAILED" if matched >= len(nums) * 0.6 else "NEUTRAL"
+    # Separate numerical claims (bypass NLI) from textual claims (use NLI)
+    numerical_results: dict[int, str] = {}  # claim_idx → status
+    nli_claim_indices: list[int] = []
+    for ci, claim in enumerate(claims):
+        if _NUM_PATTERN.search(claim):
+            numerical_results[ci] = _numerical_match(claim, docs)
+        else:
+            nli_claim_indices.append(ci)
+    # Build NLI pairs only for non-numerical claims
+    nli_claims = [claims[ci] for ci in nli_claim_indices]
+    all_pairs = []
+    pair_map: list[tuple[int, int]] = []  # (nli_claim_idx, doc_idx)
+    for nci, claim in enumerate(nli_claims):
+        for di, doc in enumerate(docs):
+            all_pairs.append((doc, claim))
+            pair_map.append((nci, di))
+    # Batch NLI inference
+    try:
+        if model == "stub":
+            # Provide dummy scores if model is unavailable
+            scores_raw = [[0.1, 0.1, 0.8] for _ in all_pairs]
+        else:
+            scores_raw = model.predict(all_pairs, apply_softmax=True)
+    except Exception as exc:
+        logger.error("NLI model inference failed: %s", exc)
+        return EvalResult(
+            module_name="faithfulness",
+            score=0.0,
+            details={},
+            error=f"Model inference error: {exc}",
+            latency_ms=int((time.perf_counter() - t0) * 1000),
+        )
+    # Aggregate: for each claim find the context with the highest entailment
+    claim_results: list[dict] = []
+    entailed = 0
+    neutral = 0
+    contradicted = 0
+    # Build per-NLI-claim best scores from batch results
+    nli_best: dict[int, tuple[float, float, int]] = {}  # nci → (best_ent, best_con, best_doc)
+    for idx, (nci, d_i) in enumerate(pair_map):
+        score_vec = scores_raw[idx]
+        ent_score = float(score_vec[LABEL_ENTAILMENT])
+        con_score = float(score_vec[LABEL_CONTRADICTION])
+        if nci not in nli_best or ent_score > nli_best[nci][0]:
+            nli_best[nci] = (ent_score, con_score, d_i)
+    for ci, claim in enumerate(claims):
+        if ci in numerical_results:
+            # Numerical bypass — lexical match result
+            status = numerical_results[ci]
+            nli_score = 1.0 if status == "ENTAILED" else 0.0
+            best_doc_idx = 0
+            method = "numerical_bypass"
+        else:
+            # NLI result
+            nci = nli_claim_indices.index(ci) if ci in nli_claim_indices else -1
+            best_entailment, best_contradiction, best_doc_idx = nli_best.get(nci, (0.0, 0.0, 0))
+            if best_entailment >= entailment_threshold:
+                status = "ENTAILED"
+                nli_score = best_entailment
+            elif best_contradiction >= contradiction_threshold:
+                status = "CONTRADICTED"
+                nli_score = best_contradiction
+            else:
+                status = "NEUTRAL"
+                nli_score = best_entailment
+            method = "nli"
+        if status == "ENTAILED":
+            entailed += 1
+        elif status == "CONTRADICTED":
+            contradicted += 1
+        else:
+            neutral += 1
+        claim_results.append({
+            "claim": claim,
+            "status": status,
+            "best_chunk_id": ids[best_doc_idx],
+            "nli_score": round(nli_score, 4),
+            "method": method,
+        })
+    total = len(claims)
+    score = max(0.0, (entailed - contradicted) / total) if total > 0 else 0.0
+    details = {
+        "total_claims": total,
+        "entailed_count": entailed,
+        "neutral_count": neutral,
+        "contradicted_count": contradicted,
+        "claims": claim_results,
+    }
+    latency_ms = int((time.perf_counter() - t0) * 1000)
+    logger.info(
+        "Faithfulness: %.3f (%d/%d entailed) in %d ms",
+        score, entailed, total, latency_ms,
+    )
+    return EvalResult(
+        module_name="faithfulness",
+        score=score,
+        details=details,
+        latency_ms=latency_ms,
+    )

src/modules/source_credibility.py ADDED Viewed

	@@ -0,0 +1,204 @@

+"""
+FR-14: src/modules/source_credibility.py — Module 3: Source Credibility Scoring
+=================================================================================
+Scores the credibility of retrieved source documents based on their publication
+type / evidence tier.
+Tier weights (SRS Section 6.3):
+    clinical_guideline   → 1.00  (Tier 1 — highest authority)
+    drug_label           → 0.90  (FDA drug labels; no tier number assigned)
+    systematic_review    → 0.85  (Tier 2)
+    research_abstract    → 0.70  (Tier 3 — PubMedQA default)
+    review_article       → 0.60  (Tier 4)
+    clinical_case        → 0.50  (Tier 5)
+    unknown / other      → 0.30  (fallback)
+Detection:
+    1. Use 'tier_type' metadata field if present (set by embedder.py)
+    2. Map the 'pub_type' field directly onto a known tier name
+    3. Fall back to keyword matching in pub_type / title text
+Score = arithmetic mean of the tier weights across all retrieved chunks.
+Each chunk must be a dict with at minimum:
+    {"text": str, "metadata": {"tier_type": str, "pub_type": str, "title": str}}
+or the simpler form accepted by the retriever:
+    {"text": str, "source": str, "tier_type": str, "title": str}
+"""
+from __future__ import annotations
+import logging
+import re
+import time
+from src.modules.base import EvalResult
+logger = logging.getLogger(__name__)
+# ---------------------------------------------------------------------------
+# Evidence tier weights
+# ---------------------------------------------------------------------------
+TIER_WEIGHTS: dict[str, float] = {
+    "clinical_guideline":  1.00,
+    "systematic_review":   0.85,
+    "drug_label":          0.90,  # FDA-approved drug labels — authoritative regulatory source
+    "research_abstract":   0.70,
+    "review_article":      0.60,
+    "clinical_case":       0.50,
+    "unknown":             0.30,
+}
+# Keyword → tier_type mapping for fallback text matching
+_KEYWORD_MAP: list[tuple[re.Pattern, str]] = [
+    (re.compile(r"\b(guideline|clinical practice|recommendation|consensus)\b", re.I), "clinical_guideline"),
+    (re.compile(r"\b(systematic review|meta.?analysis)\b", re.I),                    "systematic_review"),
+    # RCT / controlled trial → highest single-study evidence tier
+    (re.compile(r"\b(randomized|randomised|controlled trial|rct|clinical trial)\b", re.I), "clinical_guideline"),
+    # FDA drug labels
+    (re.compile(r"\b(fda|drug label|prescribing information|package insert|dailymed)\b", re.I), "drug_label"),
+    (re.compile(r"\b(review|overview)\b", re.I),                                     "review_article"),
+    (re.compile(r"\b(case report|case study|clinical case)\b", re.I),                "clinical_case"),
+    (re.compile(r"\b(abstract|research article|original article|journal)\b", re.I),  "research_abstract"),
+]
+def _classify_tier(chunk: dict) -> tuple[str, str | None]:
+    """
+    Return (tier_type, matched_keyword) for a single retrieved chunk dict.
+    Priority 1: explicit tier_type field (set by embedder.py)
+    Priority 2: pub_type field directly maps to a known tier name
+    Priority 3: keyword regex on pub_type + title text
+    """
+    # Priority 1: explicit tier_type already set (e.g., by embedder.py)
+    tier = (
+        chunk.get("tier_type")
+        or chunk.get("metadata", {}).get("tier_type")
+    )
+    if tier and tier in TIER_WEIGHTS:
+        return tier, None
+    # Priority 2: direct pub_type value lookup
+    # Handles underscore-separated values like "research_abstract" which
+    # won't match word-boundary regex patterns
+    pub_type_raw = str(
+        chunk.get("pub_type") or chunk.get("metadata", {}).get("pub_type") or ""
+    ).strip().lower()
+    _PUB_TYPE_DIRECT: dict[str, str] = {
+        "research_abstract":  "research_abstract",
+        "abstract":           "research_abstract",
+        "systematic_review":  "systematic_review",
+        "systematic review":  "systematic_review",
+        "meta_analysis":      "systematic_review",
+        "meta-analysis":      "systematic_review",
+        "drug_label":         "drug_label",
+        "drug label":         "drug_label",
+        "clinical_guideline": "clinical_guideline",
+        "clinical guideline": "clinical_guideline",
+        "guideline":          "clinical_guideline",
+        "review_article":     "review_article",
+        "review article":     "review_article",
+        "review":             "review_article",
+        "clinical_case":      "clinical_case",
+        "case_report":        "clinical_case",
+        "case report":        "clinical_case",
+    }
+    if pub_type_raw in _PUB_TYPE_DIRECT:
+        return _PUB_TYPE_DIRECT[pub_type_raw], None
+    # Priority 3: keyword regex on pub_type + title text
+    title = str(chunk.get("title") or chunk.get("metadata", {}).get("title") or "")
+    text_to_search = f"{pub_type_raw} {title}"
+    for pattern, matched_tier in _KEYWORD_MAP:
+        m = pattern.search(text_to_search)
+        if m:
+            return matched_tier, m.group(0)
+    return "unknown", None
+# ---------------------------------------------------------------------------
+# Public API
+# ---------------------------------------------------------------------------
def score_source_credibility(
    retrieved_chunks: list[dict],
) -> EvalResult:
    """
    Score the credibility of a set of retrieved source documents.

    Each chunk is classified with _classify_tier(), mapped to its weight in
    TIER_WEIGHTS, and the final score is the arithmetic mean of those weights.

    Args:
        retrieved_chunks : List of chunk dicts. Fields 'tier_type', 'pub_type',
                          'title' and 'chunk_id' are read from the top level
                          or, failing that, from a nested 'metadata' dict.
    Returns:
        EvalResult with module_name="source_credibility". For an empty input
        the score is 0.0 and error="No chunks provided"; otherwise details
        holds 'method_used', 'chunk_count', 'avg_tier_weight' and a per-chunk
        'chunks' list.
    """
    t0 = time.perf_counter()
    if not retrieved_chunks:
        return EvalResult(
            module_name="source_credibility",
            score=0.0,
            details={"chunks": [], "method_used": "none"},
            error="No chunks provided",
            latency_ms=0,
        )

    def _field(chunk: dict, key: str):
        # Top-level value first, then chunk["metadata"][key]. A missing, None
        # or non-dict "metadata" is treated as empty instead of raising.
        value = chunk.get(key)
        if value:
            return value
        metadata = chunk.get("metadata")
        return metadata.get(key) if isinstance(metadata, dict) else None

    # Display tier number (1-5) per tier_type; built once, not per chunk.
    # NOTE(review): "drug_label" has a weight in TIER_WEIGHTS but no entry
    # here, so drug labels are reported as tier 6 — confirm this is intended.
    tier_numbers = {
        "clinical_guideline": 1,
        "systematic_review":  2,
        "research_abstract":  3,
        "review_article":     4,
        "clinical_case":      5,
    }
    unclassified_tier = 6  # 6 = unknown/unclassified

    chunk_details: list[dict] = []
    weights: list[float] = []
    method_used = "metadata"  # assume metadata-first; may switch to keyword
    for i, chunk in enumerate(retrieved_chunks):
        tier_type, matched_kw = _classify_tier(chunk)
        weight = TIER_WEIGHTS.get(tier_type, TIER_WEIGHTS["unknown"])
        weights.append(weight)
        if matched_kw:
            # At least one chunk needed the keyword fallback.
            method_used = "keyword"
        chunk_details.append(
            {
                "chunk_id": _field(chunk, "chunk_id") or f"chunk_{i}",
                "tier": tier_numbers.get(tier_type, unclassified_tier),
                "tier_type": tier_type,
                "tier_weight": round(weight, 2),
                "pub_type": _field(chunk, "pub_type") or "",
                # str() so a non-string title cannot break the slice
                "title": str(_field(chunk, "title") or "")[:80],
                "matched_keyword": matched_kw,
            }
        )
    score = sum(weights) / len(weights) if weights else 0.0
    details = {
        "method_used": method_used,
        "chunk_count": len(retrieved_chunks),
        "avg_tier_weight": round(score, 4),
        "chunks": chunk_details,
    }
    latency_ms = int((time.perf_counter() - t0) * 1000)
    logger.info(
        "Source credibility: %.3f (avg tier weight over %d chunks) in %d ms",
        score, len(retrieved_chunks), latency_ms,
    )
    return EvalResult(
        module_name="source_credibility",
        score=score,
        details=details,
        latency_ms=latency_ms,
    )

src/pipeline/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ # src/pipeline/__init__.py

src/pipeline/chunker.py ADDED Viewed

	@@ -0,0 +1,82 @@

+"""
+FR-02: Document Chunking
+========================
+LangChain RecursiveCharacterTextSplitter
+  chunk_size = 512 chars  (config: retrieval.chunk_size)
+  overlap    = 50  chars  (config: retrieval.chunk_overlap)
+Each chunk carries the full FR-03b metadata schema required by Module 3
+(source credibility) and the FAISS metadata store.
+"""
+from __future__ import annotations
+import logging
+import uuid
+from typing import Any
+logger = logging.getLogger(__name__)
def chunk_documents(
    documents: list[dict[str, Any]],
    config: dict,
) -> list[dict[str, Any]]:
    """
    Split a list of raw documents into overlapping text chunks.

    Args:
        documents : List of dicts with keys:
                    text, doc_id, source, title, pub_type, pub_year, journal.
                    Documents whose text is missing, None, or blank are
                    skipped. pub_year and journal are optional.
        config    : Loaded config.yaml dict; reads retrieval.chunk_size and
                    retrieval.chunk_overlap.
    Returns:
        List of chunk dicts (FR-03b metadata schema):
            chunk_id, chunk_text, doc_id, source, title,
            pub_type, pub_year, journal, chunk_index, total_chunks
    Raises:
        KeyError if a non-empty document lacks doc_id, source, title or
        pub_type, or if the retrieval settings are missing from config.
    """
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    chunk_size    = config["retrieval"]["chunk_size"]
    chunk_overlap = config["retrieval"]["chunk_overlap"]
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", ". ", " ", ""],
    )
    all_chunks: list[dict] = []
    for doc in documents:
        # `or ""` covers an explicit None value, which .get()'s default does not.
        text = (doc.get("text") or "").strip()
        if not text:
            logger.debug("Skipping empty document: doc_id=%s", doc.get("doc_id"))
            continue
        # Drop blank pieces BEFORE numbering so chunk_index is contiguous and
        # total_chunks equals the number of chunks actually emitted.
        stripped = (piece.strip() for piece in splitter.split_text(text))
        pieces = [piece for piece in stripped if piece]
        total = len(pieces)
        for idx, chunk_text in enumerate(pieces):
            all_chunks.append({
                # FR-03b schema
                "chunk_id":     str(uuid.uuid4()),
                "chunk_text":   chunk_text,
                "doc_id":       doc["doc_id"],
                "source":       doc["source"],
                "title":        doc["title"],
                "pub_type":     doc["pub_type"],
                "pub_year":     doc.get("pub_year", 0),
                "journal":      doc.get("journal", ""),
                "chunk_index":  idx,
                "total_chunks": total,
            })
    logger.info(
        "Chunked %d documents → %d chunks (size=%d, overlap=%d)",
        len(documents), len(all_chunks), chunk_size, chunk_overlap,
    )
    return all_chunks

src/pipeline/consensus.py ADDED Viewed

	@@ -0,0 +1,111 @@

+"""
+src/pipeline/consensus.py — Multi-Model Consensus Engine
+=========================================================
+Implements the "Ensemble Judge" middleware feature.
+Calls multiple LLMs and compares their answers for medical contradictions.
+"""
+from __future__ import annotations
+import logging
+import concurrent.futures
+from typing import List, Dict, Any, Optional
+from src.pipeline.generator import generate_answer
+logger = logging.getLogger(__name__)
def run_consensus_check(
    question: str,
    context_chunks: List[Dict[str, Any]],
    config: Dict[str, Any],
    providers: Optional[List[str]] = None,
) -> Dict[str, Any]:
    """
    Calls multiple providers in parallel and compares outcomes.

    Args:
        question       : The user question sent to every provider.
        context_chunks : Retrieved chunks passed to generate_answer() and
                         summarised for the judge.
        config         : Config dict forwarded to generate_answer().
        providers      : Provider names to query; defaults to
                         ["gemini", "groq"] when None. Duplicates are queried
                         once.
    Returns: {
        "answers": { provider: answer or "ERROR: ..." },
        "agreement_score": float [0-1],
        "conflicts": List[str],
        "summary": str (only when the judge succeeded),
        "consensus_answer": str
    }
    Only answers from providers that did not raise are compared or returned
    as the consensus answer; failures stay visible in "answers" only.
    """
    # None default avoids sharing one mutable list across calls;
    # dict.fromkeys de-duplicates while keeping the caller's order.
    if providers is None:
        providers = ["gemini", "groq"]
    provider_order = list(dict.fromkeys(providers))
    logger.info("Starting Consensus Check with providers: %s", provider_order)

    # 1. Generate answers in parallel
    raw_answers: Dict[str, Any] = {}
    failed = set()
    with concurrent.futures.ThreadPoolExecutor() as executor:
        future_to_provider = {
            executor.submit(generate_answer, question, context_chunks, config, {"provider": p}): p
            for p in provider_order
        }
        for future in concurrent.futures.as_completed(future_to_provider):
            provider = future_to_provider[future]
            try:
                raw_answers[provider] = future.result()
            except Exception as exc:
                logger.error("Provider %s failed during consensus: %s", provider, exc)
                raw_answers[provider] = f"ERROR: {exc}"
                failed.add(provider)

    # Re-key in caller order so "Answer A"/"Answer B" do not depend on which
    # provider happened to finish first.
    answers = {p: raw_answers[p] for p in provider_order if p in raw_answers}
    # An "ERROR: ..." placeholder is not a medical answer: keep it out of the
    # comparison and never return it as the consensus.
    usable = [answers[p] for p in provider_order if p in answers and p not in failed]

    if len(usable) < 2:
        # 1.0 is kept for the case where fewer than two providers were
        # requested. When providers were requested but failed, agreement is
        # unknown, so report the same neutral 0.5 used when the judge fails.
        return {
            "answers": answers,
            "agreement_score": 0.5 if len(provider_order) >= 2 else 1.0,
            "conflicts": ["Insufficient providers responded for a full consensus check."],
            "consensus_answer": usable[0] if usable else "Safety failure: No providers responded.",
        }
    answer_a, answer_b = usable[0], usable[1]

    # Compile context text to feed the Judge (accept either chunk text key)
    context_text = "\n\n".join(
        f"Source {i+1}:\n{c.get('text') or c.get('chunk_text', '')}"
        for i, c in enumerate(context_chunks)
    )
    # 2. Compare answers using a "Judge" Agent
    comparison_prompt = f"""
You are a Medical Consensus Judge. Compare the following two medical answers provided by different AI models to the same question.
CRITICAL INSTRUCTION: Your primary duty is to ensure the final answer is explicitly grounded in the provided MEDICAL CONTEXT.
 Identify any CLINICAL CONTRADICTIONS or significant discrepancies in drug names, dosages, or recommendations.
If one model hallucinates outside the context, you must side with the model that stuck to the context.
QUESTION: {question}
MEDICAL CONTEXT FROM DATASET:
{context_text}
ANSWER A:
{answer_a}
ANSWER B:
{answer_b}
OUTPUT FORMAT (JSON ONLY):
{{
  "agreement_score": 0.0 to 1.0 (1.0 means perfect alignment, 0.0 means complete contradiction),
  "conflicts": ["list of specific medical discrepancies found"],
  "summary": "brief summary of how they differ and which one aligns better with the Medical Context",
  "recommended_consensus": "the most conservative and safe unified answer that strictly adheres to the Medical Context"
}}
"""
    try:
        # Use the generator's default provider to run the judge
        judge_raw = generate_answer("Medical Consensus Judge Task", [{"text": comparison_prompt}], config)
        import json
        import re
        # Strip markdown code fences before parsing the judge's JSON reply
        clean_json = re.sub(r'```json\n?|\n?```', '', judge_raw).strip()
        judge_data = json.loads(clean_json)
        return {
            "answers": answers,
            "agreement_score": judge_data.get("agreement_score", 0.5),
            "conflicts": judge_data.get("conflicts", []),
            "summary": judge_data.get("summary", ""),
            "consensus_answer": judge_data.get("recommended_consensus", answer_a),
        }
    except Exception as e:
        # Best-effort: any judge/parse failure degrades to a neutral score.
        logger.error("Consensus Judge failed: %s", e)
        return {
            "answers": answers,
            "agreement_score": 0.5,
            "conflicts": [f"Judge failed: {e}"],
            "consensus_answer": answer_a,
        }

src/pipeline/embedder.py ADDED Viewed

	@@ -0,0 +1,163 @@

+"""
+FR-03 + FR-03b: Embedding Generation & FAISS Index Construction
+===============================================================
+Model  : dmis-lab/biobert-v1.1  (768-dim dense vectors, SentenceTransformer)
+Index  : FAISS IndexFlatIP with L2-normalized vectors  (= cosine similarity)
+Metadata: Parallel dict[int → dict] saved as pickle alongside index
+Usage:
+    python src/pipeline/embedder.py
+"""
+from __future__ import annotations
+import sys
+import os
+from pathlib import Path
+sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent))
+import json
+import logging
+import pickle
+import faiss
+import numpy as np
+import yaml
+import src  # noqa: F401 — logging setup
+logger = logging.getLogger(__name__)
def _load_config() -> dict:
    """Parse ``config.yaml`` (resolved against the working directory) and return it."""
    with Path("config.yaml").open("r", encoding="utf-8") as handle:
        settings = yaml.safe_load(handle)
    return settings
def load_chunks(chunks_path: str = "data/processed/chunks.jsonl") -> list[dict]:
    """Read a JSONL chunks file and return one parsed object per non-blank line.

    Raises FileNotFoundError when ``chunks_path`` does not exist.
    """
    source = Path(chunks_path)
    if not source.exists():
        raise FileNotFoundError(
            f"Chunks file not found: '{chunks_path}'. "
            "Run python src/pipeline/ingest.py first."
        )
    with open(source, "r", encoding="utf-8") as handle:
        # Whitespace-only lines strip down to "" and are skipped.
        records = [json.loads(row) for row in map(str.strip, handle) if row]
    logger.info("Loaded %d chunks from %s", len(records), chunks_path)
    return records
def encode_texts(
    texts: list[str],
    model_name: str,
    batch_size: int = 32,
) -> np.ndarray:
    """
    Embed ``texts`` with the SentenceTransformer model named ``model_name``.

    Embeddings are requested L2-normalised and returned as a float32 array.
    """
    from sentence_transformers import SentenceTransformer
    logger.info("Loading embedding model: %s", model_name)
    encoder = SentenceTransformer(model_name)
    logger.info("Encoding %d texts (batch_size=%d)...", len(texts), batch_size)
    encode_options = {
        "batch_size": batch_size,
        "show_progress_bar": True,
        # L2-normalise so inner product in IndexFlatIP equals cosine similarity
        "normalize_embeddings": True,
        "convert_to_numpy": True,
    }
    vectors: np.ndarray = encoder.encode(texts, **encode_options)
    logger.info("Encoded shape: %s", vectors.shape)
    return vectors.astype(np.float32)
def build_faiss_index(embeddings: np.ndarray) -> faiss.IndexFlatIP:
    """
    Create an inner-product FAISS index and load ``embeddings`` into it.

    The index dimension is taken from the second axis of ``embeddings``.
    With L2-normalised vectors, inner product equals cosine similarity.
    """
    n_dims = embeddings.shape[1]
    flat_index = faiss.IndexFlatIP(n_dims)
    flat_index.add(embeddings)
    logger.info(
        "FAISS IndexFlatIP built: %d vectors, dim=%d", flat_index.ntotal, n_dims
    )
    return flat_index
def build_metadata_store(chunks: list[dict]) -> dict[int, dict]:
    """
    Map each chunk's list position (0-based) to a copy of its stored fields.

    Only the fields listed below are kept, in this order; a chunk missing any
    of them raises KeyError. chunk_text is included so it is available at
    retrieval time.
    """
    stored_fields = (
        "chunk_id",
        "doc_id",
        "source",
        "title",
        "pub_type",
        "pub_year",
        "journal",
        "chunk_index",
        "total_chunks",
        "chunk_text",
    )
    return {
        position: {name: record[name] for name in stored_fields}
        for position, record in enumerate(chunks)
    }
def save_artifacts(
    index: faiss.IndexFlatIP,
    metadata_store: dict,
    config: dict,
) -> None:
    """Write the FAISS index and the pickled metadata store to the configured paths.

    Paths come from config["retrieval"]["index_path"] and
    config["retrieval"]["metadata_path"]; parent directories are created.
    """
    index_path = Path(config["retrieval"]["index_path"])
    meta_path = Path(config["retrieval"]["metadata_path"])
    for target in (index_path, meta_path):
        target.parent.mkdir(parents=True, exist_ok=True)

    faiss.write_index(index, str(index_path))
    logger.info("FAISS index written to %s", index_path)

    with meta_path.open("wb") as sink:
        pickle.dump(metadata_store, sink, protocol=pickle.HIGHEST_PROTOCOL)
    logger.info(
        "Metadata store written to %s (%d entries)", meta_path, len(metadata_store)
    )
def main() -> None:
    """Embed the ingested chunks, build the FAISS index, and persist both artifacts.

    Exits with status 1 when the chunks file contains no chunks.
    """
    config = _load_config()
    chunks = load_chunks("data/processed/chunks.jsonl")
    if not chunks:
        logger.error("No chunks to embed. Run python src/pipeline/ingest.py first.")
        sys.exit(1)

    embeddings = encode_texts(
        [record["chunk_text"] for record in chunks],
        config["retrieval"]["embedding_model"],
        batch_size=32,
    )
    index = build_faiss_index(embeddings)
    save_artifacts(index, build_metadata_store(chunks), config)
    logger.info(
        "Embedding complete. Index has %d vectors. "
        "Next: python scripts/warmup.py && streamlit run src/dashboard/app.py",
        index.ntotal,
    )
+if __name__ == "__main__":
+    main()

src/pipeline/generator.py ADDED Viewed

	@@ -0,0 +1,584 @@

+"""
+src/pipeline/generator.py — LLM Answer Generation
+===================================================
+Supports multiple providers based on config.yaml → llm.provider:
+  - "gemini"  : Google Gemini API (recommended)
+  - "mistral" : Mistral AI API (api.mistral.ai)
+  - "groq"    : Groq Cloud API (fast inference)
+  - "ollama"  : Local Ollama/Mistral (requires Ollama running locally)
+API Key setup:
+  Set env variables in Backend/.env:
+    GEMINI_API_KEY=your_key
+    MISTRAL_API_KEY=your_key
+    GROQ_API_KEY=your_key
+"""
+from __future__ import annotations
+import json
+import logging
+import os
+import time
+from pathlib import Path
+from typing import Optional
+import yaml
+logger = logging.getLogger(__name__)
+# Load .env file at module import time
+def _load_env():
+    env_path = Path(".env")
+    if not env_path.exists():
+        # Try one level up
+        env_path = Path("../Backend/.env")
+    if env_path.exists():
+        for line in env_path.read_text().splitlines():
+            line = line.strip()
+            if line and not line.startswith("#") and "=" in line:
+                key, val = line.split("=", 1)
+                key = key.strip()
+                val = val.strip().strip('"').strip("'")
+                if key and val and key not in os.environ:
+                    os.environ[key] = val
+_load_env()
+# ---------------------------------------------------------------------------
+# Config loader
+# ---------------------------------------------------------------------------
+def _load_config() -> dict:
+    try:
+        return yaml.safe_load(Path("config.yaml").read_text())
+    except Exception:
+        return {}
+# ---------------------------------------------------------------------------
+# Prompt builder (shared by both providers)
+# ---------------------------------------------------------------------------
+_PHYSICIAN_PROMPT = (
+    "You are MediRAG, a medical AI assistant tailored for clinicians and researchers. "
+    "You MUST answer ONLY using information explicitly stated in the CONTEXT provided below. "
+    "Use professional medical terminology, be concise, and cite specific details. "
+    "After each claim, cite it inline as [Source: <document title>]. "
+    "If the context does NOT contain sufficient information to answer safely, you MUST respond EXACTLY with: "
+    "'⚠️ The retrieved context does not contain enough information to answer this safely. "
+    "Please consult authoritative clinical guidelines or a specialist.' "
+    "NEVER use general knowledge, training data, or information outside the provided context."
+)
+_PATIENT_PROMPT = (
+    "You are MediRAG, a medical AI assistant tailored for patients and non-experts. "
+    "You MUST answer ONLY using information explicitly stated in the CONTEXT provided below. "
+    "Explain medical information in a clear, accessible, and empathetic way. "
+    "After each claim, cite it inline as [Source: <document title>]. "
+    "If the context does NOT contain sufficient information to answer safely, you MUST respond EXACTLY with: "
+    "'⚠️ The retrieved context does not contain enough information to answer this safely. "
+    "Please consult your doctor or a medical specialist.' "
+    "NEVER use general knowledge, training data, or information outside the provided context."
+)
+_SYSTEM_PROMPT = _PHYSICIAN_PROMPT # Default fallback
+def _build_prompt(question: str, context_chunks: list[dict], system_prompt: Optional[str] = None, persona: str = "physician") -> str:
+    """Build the RAG prompt from the question + retrieved chunks.
+    Explicitly surfaces title and source for each chunk in the header so the LLM
+    can cite [Source: <title>] inline in its answer.
+    """
+    context_parts = []
+    for i, chunk in enumerate(context_chunks, 1):
+        text = chunk.get("text") or chunk.get("chunk_text", "")
+        title = chunk.get("title", "")
+        source = chunk.get("source", "")
+        pub_type = chunk.get("pub_type", "")
+        # Include title as the primary citation label
+        header_parts = [f"Source {i}"]
+        if title:
+            header_parts.append(f"Title: {title}")
+        if pub_type:
+            header_parts.append(pub_type)
+        if source and source != title:
+            header_parts.append(source)
+        header = "[" + " | ".join(header_parts) + "]"
+        context_parts.append(f"{header}\n{text.strip()}")
+    context_block = "\n\n".join(context_parts)
+    # Determine effective system prompt based on persona if no manual override
+    if system_prompt:
+        effective_system = system_prompt
+    else:
+        effective_system = _PATIENT_PROMPT if persona == "patient" else _PHYSICIAN_PROMPT
+    return (
+        f"{effective_system}\n\n"
+        f"CONTEXT:\n{context_block}\n\n"
+        f"QUESTION: {question}\n\n"
+        f"ANSWER (cite sources inline as [Source: document title]):"
+    )
+# Strict prompt — used when first answer fails evaluation (HRS ≥ 60)
+_STRICT_SYSTEM_PROMPT = (
+    "You are MediRAG, a clinical safety assistant under strict mode. "
+    "A previous response was flagged as potentially unsafe or inaccurate. "
+    "You MUST answer ONLY using the information explicitly stated in the CONTEXT below. "
+    "Do NOT use any general medical knowledge, training data, or outside information. "
+    "If the context is insufficient, you MUST say EXACTLY: "
+    "'⚠️ Insufficient evidence in retrieved context to answer safely. Please consult a clinical specialist.' "
+    "NEVER hallucinate drug names, dosages, or clinical recommendations."
+)
+def _build_strict_prompt(question: str, context_chunks: list[dict]) -> str:
+    """Strict prompt: context-only, used on regeneration after failed evaluation."""
+    context_parts = []
+    for i, chunk in enumerate(context_chunks, 1):
+        text = chunk.get("text") or chunk.get("chunk_text", "")
+        title = chunk.get("title", "")
+        source = chunk.get("source", "")
+        pub_type = chunk.get("pub_type", "")
+        header_parts = [f"Source {i}"]
+        if title:
+            header_parts.append(f"Title: {title}")
+        if pub_type:
+            header_parts.append(pub_type)
+        if source and source != title:
+            header_parts.append(source)
+        header = "[" + " | ".join(header_parts) + "]"
+        context_parts.append(f"{header}\n{text.strip()}")
+    context_block = "\n\n".join(context_parts)
+    return (
+        f"{_STRICT_SYSTEM_PROMPT}\n\n"
+        f"CONTEXT:\n{context_block}\n\n"
+        f"QUESTION: {question}\n\n"
+        f"SAFE ANSWER (context-only, cite [Source: title] for every claim):"
+    )
+# ---------------------------------------------------------------------------
+# OpenAI provider
+# ---------------------------------------------------------------------------
+def _generate_openai(prompt: str, config: dict) -> str:
+    llm_cfg = config.get("llm", {})
+    # Override from frontend/config takes priority over system ENV
+    api_key = llm_cfg.get("openai_api_key") or os.environ.get("OPENAI_API_KEY")
+    if not api_key:
+        env_file = Path(".env")
+        if env_file.exists():
+            for line in env_file.read_text().splitlines():
+                if line.startswith("OPENAI_API_KEY="):
+                    api_key = line.split("=", 1)[1].strip().strip('"').strip("'")
+                    break
+    if not api_key:
+        raise RuntimeError("OpenAI API key not found. Set OPENAI_API_KEY env var or in .env.")
+    try:
+        from openai import OpenAI
+    except ImportError:
+        raise RuntimeError("openai not installed. Run: pip install openai")
+    model_name = llm_cfg.get("openai_model") or llm_cfg.get("model") or "gpt-4o"
+    client = OpenAI(api_key=api_key)
+    logger.info("Calling OpenAI API (model=%s)...", model_name)
+    t0 = time.perf_counter()
+    try:
+        response = client.chat.completions.create(
+            model=model_name,
+            messages=[{"role": "user", "content": prompt}],
+            temperature=float(llm_cfg.get("generation_temperature", 0.7)),
+            max_tokens=1024,
+        )
+    except Exception as exc:
+        raise RuntimeError(f"OpenAI API error: {exc}") from exc
+    elapsed = int((time.perf_counter() - t0) * 1000)
+    answer = response.choices[0].message.content.strip()
+    if not answer:
+        raise RuntimeError("OpenAI returned an empty response.")
+    logger.info("OpenAI generated answer in %d ms (%d chars)", elapsed, len(answer))
+    return answer
+def _generate_gemini(prompt: str, config: dict) -> str:
+    llm_cfg = config.get("llm", {})
+    # Override from frontend/config takes priority over system ENV
+    api_key = llm_cfg.get("gemini_api_key") or os.environ.get("GEMINI_API_KEY")
+    if not api_key:
+        # Try loading from .env file if present
+        env_file = Path(".env")
+        if env_file.exists():
+            for line in env_file.read_text().splitlines():
+                if line.startswith("GEMINI_API_KEY="):
+                    api_key = line.split("=", 1)[1].strip().strip('"').strip("'")
+                    break
+    if not api_key:
+        raise RuntimeError(
+            "Gemini API key not found. "
+            "Either: (1) set GEMINI_API_KEY=your_key in the same terminal as uvicorn, "
+            "or (2) create a .env file with GEMINI_API_KEY=your_key in the project root."
+        )
+    try:
+        from google import genai
+        from google.genai import types
+    except ImportError:
+        raise RuntimeError(
+            "google-genai not installed. Run: pip install google-genai"
+        )
+    model_name = llm_cfg.get("gemini_model", "gemini-2.0-flash")
+    client = genai.Client(api_key=api_key)
+    logger.info("Calling Gemini API (model=%s)...", model_name)
+    t0 = time.perf_counter()
+    try:
+        response = client.models.generate_content(
+            model=model_name,
+            contents=prompt,
+            config=types.GenerateContentConfig(
+                temperature=float(llm_cfg.get("generation_temperature", 0.7)),
+                max_output_tokens=1024,
+            ),
+        )
+    except Exception as exc:
+        raise RuntimeError(f"Gemini API error: {exc}") from exc
+    elapsed = int((time.perf_counter() - t0) * 1000)
+    answer = response.text.strip() if response.text else ""
+    if not answer:
+        raise RuntimeError("Gemini returned an empty response.")
+    logger.info("Gemini generated answer in %d ms (%d chars)", elapsed, len(answer))
+    return answer
+# ---------------------------------------------------------------------------
+# Ollama provider (kept as fallback)
+# ---------------------------------------------------------------------------
+def _generate_ollama(prompt: str, config: dict) -> str:
+    import requests as _requests
+    llm_cfg = config.get("llm", {})
+    base_url = llm_cfg.get("base_url", "http://localhost:11434")
+    model = llm_cfg.get("model", "mistral")
+    timeout = llm_cfg.get("timeout_seconds", 120)
+    temperature = llm_cfg.get("generation_temperature", 0.7)
+    payload = {
+        "model": model,
+        "prompt": prompt,
+        "stream": False,
+        "options": {"temperature": temperature, "num_predict": 512},
+    }
+    url = f"{base_url}/api/generate"
+    logger.info("Calling Ollama (%s @ %s)...", model, base_url)
+    t0 = time.perf_counter()
+    try:
+        resp = _requests.post(url, json=payload, timeout=timeout)
+    except _requests.exceptions.ConnectionError as exc:
+        raise RuntimeError(
+            f"Ollama is not running at {base_url}. Start with: ollama serve"
+        ) from exc
+    except _requests.exceptions.Timeout as exc:
+        raise RuntimeError(
+            f"Ollama timed out after {timeout}s. Increase llm.timeout_seconds in config.yaml."
+        ) from exc
+    if resp.status_code != 200:
+        raise RuntimeError(f"Ollama HTTP {resp.status_code}: {resp.text[:300]}")
+    try:
+        data = resp.json()
+        answer = data.get("response", "").strip()
+    except (json.JSONDecodeError, KeyError) as exc:
+        raise RuntimeError(f"Unexpected Ollama response: {exc}") from exc
+    if not answer:
+        raise RuntimeError("Ollama returned an empty response.")
+    elapsed = int((time.perf_counter() - t0) * 1000)
+    logger.info("Ollama generated answer in %d ms (%d chars)", elapsed, len(answer))
+    return answer
+# ---------------------------------------------------------------------------
+# Mistral provider
+# ---------------------------------------------------------------------------
+def _generate_mistral(prompt: str, config: dict) -> str:
+    import requests as _requests
+    llm_cfg = config.get("llm", {})
+    # Resolve placeholder or direct value
+    _raw_key = llm_cfg.get("mistral_api_key", "")
+    api_key = os.environ.get("MISTRAL_API_KEY") if (not _raw_key or _raw_key.startswith("${")) else _raw_key
+    if not api_key:
+        raise RuntimeError(
+            "Mistral API key not found. Set MISTRAL_API_KEY in Backend/.env"
+        )
+    model = llm_cfg.get("model", "mistral-large-latest")
+    timeout = llm_cfg.get("timeout_seconds", 120)
+    temperature = llm_cfg.get("generation_temperature", 0.7)
+    payload = {
+        "model": model,
+        "messages": [{"role": "user", "content": prompt}],
+        "temperature": temperature,
+        "max_tokens": 1024,
+    }
+    headers = {
+        "Authorization": f"Bearer {api_key}",
+        "Content-Type": "application/json"
+    }
+    url = "https://api.mistral.ai/v1/chat/completions"
+    logger.info("Calling Mistral API (model=%s, key=...***)", model)
+    t0 = time.perf_counter()
+    try:
+        resp = _requests.post(url, json=payload, headers=headers, timeout=timeout)
+    except Exception as exc:
+        raise RuntimeError(f"Mistral API network error: {exc}") from exc
+    if resp.status_code != 200:
+        raise RuntimeError(f"Mistral HTTP {resp.status_code}: {resp.text[:300]}")
+    try:
+        data = resp.json()
+        answer = data["choices"][0]["message"]["content"].strip()
+    except Exception as exc:
+        raise RuntimeError(f"Unexpected Mistral response: {exc}") from exc
+    if not answer:
+        raise RuntimeError("Mistral returned an empty response.")
+    elapsed = int((time.perf_counter() - t0) * 1000)
+    logger.info("Mistral generated answer in %d ms (%d chars)", elapsed, len(answer))
+    return answer
+# ---------------------------------------------------------------------------
+# Groq provider
+# ---------------------------------------------------------------------------
+def _generate_groq(prompt: str, config: dict) -> str:
+    import requests as _requests
+    llm_cfg = config.get("llm", {})
+    _raw_key = llm_cfg.get("groq_api_key", "")
+    api_key = os.environ.get("GROQ_API_KEY") if (not _raw_key or _raw_key.startswith("${")) else _raw_key
+    if not api_key:
+        raise RuntimeError(
+            "Groq API key not found. Set GROQ_API_KEY in Backend/.env"
+        )
+    model = llm_cfg.get("groq_model") or llm_cfg.get("model", "llama-3.3-70b-versatile")
+    timeout = llm_cfg.get("timeout_seconds", 120)
+    temperature = llm_cfg.get("generation_temperature", 0.7)
+    payload = {
+        "model": model,
+        "messages": [{"role": "user", "content": prompt}],
+        "temperature": temperature,
+        "max_tokens": 1024,
+    }
+    headers = {
+        "Authorization": f"Bearer {api_key}",
+        "Content-Type": "application/json"
+    }
+    url = "https://api.groq.com/openai/v1/chat/completions"
+    logger.info("Calling Groq API (model=%s, key=...***)", model)
+    t0 = time.perf_counter()
+    try:
+        resp = _requests.post(url, json=payload, headers=headers, timeout=timeout)
+    except Exception as exc:
+        raise RuntimeError(f"Groq API network error: {exc}") from exc
+    if resp.status_code != 200:
+        raise RuntimeError(f"Groq HTTP {resp.status_code}: {resp.text[:300]}")
+    try:
+        data = resp.json()
+        answer = data["choices"][0]["message"]["content"].strip()
+    except Exception as exc:
+        raise RuntimeError(f"Unexpected Groq response: {exc}") from exc
+    if not answer:
+        raise RuntimeError("Groq returned an empty response.")
+    elapsed = int((time.perf_counter() - t0) * 1000)
+    logger.info("Groq generated answer in %d ms (%d chars)", elapsed, len(answer))
+    return answer
+# ---------------------------------------------------------------------------
+# Public API
+# ---------------------------------------------------------------------------
+def generate_answer(
+    question: str,
+    context_chunks: list[dict],
+    config: Optional[dict] = None,
+    overrides: Optional[dict] = None,
+) -> str:
+    """
+    Generate a grounded medical answer.
+    Provider is selected from config.yaml → llm.provider, but can be
+    overridden per-request via the `overrides` dict. This makes the eval
+    engine portable — callers bring their own API key and model.
+    Args:
+        question       : User's medical question.
+        context_chunks : Retrieved context chunks (dicts with 'text' key).
+        config         : Config dict (loaded from config.yaml if None).
+        overrides      : Per-request overrides. Supported keys:
+                           provider   → "gemini" or "ollama"
+                           api_key    → Gemini API key
+                           model      → model name (e.g. "gemini-2.5-flash-lite")
+                           ollama_url → Ollama base URL
+    Returns:
+        Generated answer string.
+    Raises:
+        RuntimeError   : If the provider is unreachable or returns an error.
+    """
+    if config is None:
+        config = _load_config()
+    # Build effective config: server config as base, overrides win
+    effective_llm = dict(config.get("llm", {}))
+    if overrides:
+        if overrides.get("provider"):
+            effective_llm["provider"] = overrides["provider"]
+        if overrides.get("api_key"):
+            pk = (overrides.get("provider") or "gemini").lower()
+            key_map = {
+                "gemini": "gemini_api_key",
+                "openai": "openai_api_key",
+                "mistral": "mistral_api_key",
+                "groq": "groq_api_key",
+            }
+            effective_llm[key_map.get(pk, "gemini_api_key")] = overrides["api_key"]
+        if overrides.get("model"):
+            pk = (overrides.get("provider") or "gemini").lower()
+            model_map = {
+                "gemini": "gemini_model",
+                "openai": "openai_model",
+                "mistral": "model",
+                "groq": "groq_model",
+            }
+            effective_llm[model_map.get(pk, "gemini_model")] = overrides["model"]
+        if overrides.get("ollama_url"):
+            effective_llm["base_url"] = overrides["ollama_url"]
+    effective_config = {**config, "llm": effective_llm}
+    provider = effective_llm.get("provider", "gemini").lower()
+    system_prompt_override = overrides.get("system_prompt") if overrides else None
+    persona = overrides.get("persona", "physician") if overrides else "physician"
+    prompt = _build_prompt(
+        question,
+        context_chunks,
+        system_prompt=system_prompt_override,
+        persona=persona
+    )
+    if provider == "gemini":
+        return _generate_gemini(prompt, effective_config)
+    elif provider == "openai":
+        return _generate_openai(prompt, effective_config)
+    elif provider == "ollama":
+        return _generate_ollama(prompt, effective_config)
+    elif provider == "mistral":
+        return _generate_mistral(prompt, effective_config)
+    elif provider == "groq":
+        return _generate_groq(prompt, effective_config)
+    else:
+        raise RuntimeError(
+            f"Unknown LLM provider '{provider}'. "
+            "Set llm.provider to 'gemini', 'mistral', 'groq', or 'ollama'."
+        )
+def generate_strict_answer(
+    question: str,
+    context_chunks: list[dict],
+    config: Optional[dict] = None,
+    overrides: Optional[dict] = None,
+) -> str:
+    """
+    Generate a STRICT context-only answer.
+    Called when initial answer fails evaluation (HRS >= 60).
+    The LLM is forbidden from using any training knowledge.
+    """
+    if config is None:
+        config = _load_config()
+    effective_llm = dict(config.get("llm", {}))
+    if overrides:
+        if overrides.get("provider"):
+            effective_llm["provider"] = overrides["provider"]
+        if overrides.get("api_key"):
+            pk = (overrides.get("provider") or "gemini").lower()
+            key_map = {
+                "gemini": "gemini_api_key",
+                "openai": "openai_api_key",
+                "mistral": "mistral_api_key",
+                "groq": "groq_api_key",
+            }
+            effective_llm[key_map.get(pk, "gemini_api_key")] = overrides["api_key"]
+        if overrides.get("model"):
+            pk = (overrides.get("provider") or "gemini").lower()
+            model_map = {
+                "gemini": "gemini_model",
+                "openai": "openai_model",
+                "mistral": "model",
+                "groq": "groq_model",
+            }
+            effective_llm[model_map.get(pk, "gemini_model")] = overrides["model"]
+        if overrides.get("ollama_url"):
+            effective_llm["base_url"] = overrides["ollama_url"]
+    effective_config = {**config, "llm": effective_llm}
+    provider = effective_llm.get("provider", "gemini").lower()
+    prompt = _build_strict_prompt(question, context_chunks)
+    if provider == "gemini":
+        return _generate_gemini(prompt, effective_config)
+    elif provider == "openai":
+        return _generate_openai(prompt, effective_config)
+    elif provider == "ollama":
+        return _generate_ollama(prompt, effective_config)
+    elif provider == "mistral":
+        return _generate_mistral(prompt, effective_config)
+    elif provider == "groq":
+        return _generate_groq(prompt, effective_config)
+    else:
+        raise RuntimeError(f"Unknown LLM provider '{provider}'.")

src/pipeline/ingest.py ADDED Viewed

	@@ -0,0 +1,250 @@

+"""
+FR-01: Document Ingestion
+=========================
+Loads documents from:
+  - PubMedQA   (HuggingFace: pubmed_qa, pqa_labeled) — up to 500 samples
+  - MedQA-USMLE (local JSONL from jind11/MedQA)       — up to 200 samples
+Then calls chunker.py to split and saves chunks to data/processed/chunks.jsonl.
+Usage:
+    python src/pipeline/ingest.py
+    python src/pipeline/ingest.py --pubmedqa 500 --medqa 200
+"""
+from __future__ import annotations
+import sys
+import os
+from pathlib import Path
+# Make project root importable when running as a script
+sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent))
+import argparse
+import json
+import logging
+import uuid
+import yaml
+from typing import Any
+import src  # noqa: F401 — triggers logging setup
+logger = logging.getLogger(__name__)
+# ---------------------------------------------------------------------------
+# Config
+# ---------------------------------------------------------------------------
+def _load_config() -> dict:
+    with open("config.yaml", "r", encoding="utf-8") as f:
+        return yaml.safe_load(f)
+# ---------------------------------------------------------------------------
+# PubMedQA Ingestion (FR-01)
+# ---------------------------------------------------------------------------
+def ingest_pubmedqa(max_samples: int = 500) -> list[dict[str, Any]]:
+    """
+    Load PubMedQA from HuggingFace datasets.
+    Each QA item contributes its context passages (abstracts) as documents,
+    plus its long_answer if available.
+    pub_type = "research_abstract" → Tier 3 (SRS FR-03b)
+    """
+    # Use 'pqa_artificial' (211k rows) if asking for more than 1000,
+    # as 'pqa_labeled' only has 1000 rows.
+    split_name = "pqa_artificial" if max_samples > 1000 else "pqa_labeled"
+    logger.info("Loading PubMedQA split='%s' (max %d QA pairs)...", split_name, max_samples)
+    try:
+        from datasets import load_dataset
+        dataset = load_dataset(
+            "pubmed_qa", split_name, split="train", trust_remote_code=True
+        )
+    except Exception as exc:
+        logger.error("Failed to load PubMedQA from HuggingFace: %s", exc)
+        logger.error("Ensure you have an internet connection and datasets>=2.18.0")
+        return []
+    documents: list[dict] = []
+    for i, item in enumerate(dataset):
+        if i >= max_samples:
+            break
+        pub_id = str(item.get("pubid", uuid.uuid4().hex[:8]))
+        question = item.get("question", "")[:200]
+        # Index each context passage as a separate document
+        contexts: list[str] = item.get("context", {}).get("contexts", [])
+        for ctx in contexts:
+            if ctx and ctx.strip():
+                documents.append({
+                    "text":     ctx.strip(),
+                    "title":    question,
+                    "doc_id":   f"pubmedqa_{pub_id}",
+                    "source":   "pubmedqa",
+                    "pub_type": "research_abstract",
+                    "pub_year": 0,
+                    "journal":  "",
+                })
+        # Also index the long_answer (gold-standard explanation)
+        long_ans: str = item.get("long_answer", "").strip()
+        if long_ans:
+            documents.append({
+                "text":     long_ans,
+                "title":    question,
+                "doc_id":   f"pubmedqa_{pub_id}_ans",
+                "source":   "pubmedqa",
+                "pub_type": "research_abstract",
+                "pub_year": 0,
+                "journal":  "",
+            })
+    logger.info(
+        "PubMedQA: %d documents loaded from %d QA items",
+        len(documents),
+        min(max_samples, len(dataset)),
+    )
+    return documents
+# ---------------------------------------------------------------------------
+# MedQA-USMLE Ingestion (FR-01)
+# ---------------------------------------------------------------------------
+def ingest_medqa(
+    data_dir: str = "data/raw/medqa",
+    max_samples: int = 200,
+) -> list[dict[str, Any]]:
+    """
+    Load MedQA-USMLE from local JSONL files.
+    To obtain the data:
+        git clone https://github.com/jind11/MedQA
+        Copy the JSONL files from data_clean/questions/US/ to data/raw/medqa/
+    pub_type = "exam_question" → Tier 5 (SRS FR-03b)
+    """
+    data_path = Path(data_dir)
+    jsonl_files = sorted(list(data_path.glob("*.jsonl")) + list(data_path.glob("**/*.jsonl")))
+    if not jsonl_files:
+        logger.warning(
+            "MedQA data not found at '%s'. "
+            "To get it: git clone https://github.com/jind11/MedQA "
+            "and copy JSONL files to %s/",
+            data_dir, data_dir,
+        )
+        return []
+    logger.info("Loading MedQA from '%s' (%d files)...", data_dir, len(jsonl_files))
+    documents: list[dict] = []
+    for jsonl_file in jsonl_files:
+        if len(documents) >= max_samples:
+            break
+        with open(jsonl_file, "r", encoding="utf-8") as f:
+            for raw_line in f:
+                if len(documents) >= max_samples:
+                    break
+                raw_line = raw_line.strip()
+                if not raw_line:
+                    continue
+                try:
+                    item = json.loads(raw_line)
+                except json.JSONDecodeError as exc:
+                    logger.warning("Skipping malformed JSON in %s: %s", jsonl_file.name, exc)
+                    continue
+                question:    str  = item.get("question", "")
+                options:     dict = item.get("options", {})
+                answer_key:  str  = item.get("answer", "")
+                answer_text: str  = options.get(answer_key, "")
+                # Combine question + all options + correct answer as document text
+                opts_text = "  ".join(f"{k}: {v}" for k, v in options.items())
+                text = f"Question: {question}\nOptions: {opts_text}"
+                if answer_text:
+                    text += f"\nAnswer ({answer_key}): {answer_text}"
+                documents.append({
+                    "text":     text,
+                    "title":    question[:200],
+                    "doc_id":   f"medqa_{uuid.uuid4().hex[:10]}",
+                    "source":   "medqa",
+                    "pub_type": "exam_question",
+                    "pub_year": 0,
+                    "journal":  "",
+                })
+    logger.info("MedQA: %d documents loaded", len(documents))
+    return documents
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+def _save_raw_documents(documents: list[dict], output_path: str) -> None:
+    out = Path(output_path)
+    out.parent.mkdir(parents=True, exist_ok=True)
+    with open(out, "w", encoding="utf-8") as f:
+        for doc in documents:
+            f.write(json.dumps(doc, ensure_ascii=False) + "\n")
+    logger.info("Saved %d raw documents to %s", len(documents), output_path)
+def _save_chunks(chunks: list[dict], output_path: str) -> None:
+    out = Path(output_path)
+    out.parent.mkdir(parents=True, exist_ok=True)
+    with open(out, "w", encoding="utf-8") as f:
+        for chunk in chunks:
+            f.write(json.dumps(chunk, ensure_ascii=False) + "\n")
+    logger.info("Saved %d chunks to %s", len(chunks), output_path)
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+def main() -> None:
+    parser = argparse.ArgumentParser(description="MediRAG-Eval Document Ingestion (FR-01)")
+    parser.add_argument("--pubmedqa", type=int, default=500, help="Max PubMedQA samples")
+    parser.add_argument("--medqa",    type=int, default=200, help="Max MedQA-USMLE samples")
+    parser.add_argument(
+        "--medqa-dir", default="data/raw/medqa",
+        help="Directory containing MedQA JSONL files",
+    )
+    args = parser.parse_args()
+    config = _load_config()
+    # --- Ingest ---
+    pubmedqa_docs = ingest_pubmedqa(max_samples=args.pubmedqa)
+    medqa_docs    = ingest_medqa(data_dir=args.medqa_dir, max_samples=args.medqa)
+    all_docs      = pubmedqa_docs + medqa_docs
+    logger.info("Total documents ingested: %d", len(all_docs))
+    if not all_docs:
+        logger.error("No documents loaded. Check internet for PubMedQA and/or data/raw/medqa/ for MedQA.")
+        sys.exit(1)
+    # --- Save raw documents (for inspection) ---
+    _save_raw_documents(all_docs, "data/raw/documents.jsonl")
+    # --- Chunk ---
+    from src.pipeline.chunker import chunk_documents
+    chunks = chunk_documents(all_docs, config)
+    logger.info("Total chunks produced: %d", len(chunks))
+    # --- Save chunks for embedder ---
+    _save_chunks(chunks, "data/processed/chunks.jsonl")
+    logger.info("Ingestion complete. Now run: python src/pipeline/embedder.py")
+if __name__ == "__main__":
+    main()

src/pipeline/privacy.py ADDED Viewed

	@@ -0,0 +1,65 @@

+"""
+src/pipeline/privacy.py — PHI/PII Privacy Shield (The Sanitizer)
+==============================================================
+Detects and redacts sensitive patient information before external API calls.
+Supports names, dates, contact info, and generic medical IDs.
+"""
+from __future__ import annotations
+import re
+import logging
+from typing import Dict, Tuple
+logger = logging.getLogger(__name__)
+class PrivacyShield:
+    def __init__(self):
+        # Basic patterns for common PII
+        self.patterns = {
+            "EMAIL": r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+',
+            "PHONE": r'\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}',
+            "SSN": r'\b\d{3}-\d{2}-\d{4}\b',
+            "DOB": r'\b\d{2}/\d{2}/\d{4}\b|\b\d{4}-\d{2}-\d{2}\b',
+            "ID": r'\bPT-\d{4,8}\b|\bID:\s?\d{4,8}\b'
+        }
+        # Names are harder without heavy NER, so we start with common indicators or capital patterns
+        # In a production app, we would use a dedicated medical NER model.
+        self.name_pattern = r'\b(?:Mr\.|Ms\.|Mrs\.|Dr\.)\s[A-Z][a-z]+(?:\s[A-Z][a-z]+)?\b'
+    def redact(self, text: str) -> Tuple[str, Dict[str, str]]:
+        """
+        Redacts PHI in text and returns (redacted_text, placeholder_map).
+        """
+        mapping = {}
+        redacted = text
+        # 1. Redact specific patterns
+        for label, pattern in self.patterns.items():
+            matches = re.findall(pattern, redacted)
+            for i, match in enumerate(set(matches)):
+                placeholder = f"[{label}_{i+1}]"
+                mapping[placeholder] = match
+                redacted = redacted.replace(match, placeholder)
+        # 2. Redact potential names
+        name_matches = re.findall(self.name_pattern, redacted)
+        for i, match in enumerate(set(name_matches)):
+            placeholder = f"[PATIENT_NAME_{i+1}]"
+            mapping[placeholder] = match
+            redacted = redacted.replace(match, placeholder)
+        if mapping:
+            logger.info("Privacy Shield: Redacted %d sensitive items.", len(mapping))
+        return redacted, mapping
+    def restore(self, text: str, mapping: Dict[str, str]) -> str:
+        """
+        Replaces placeholders in the AI response with original values.
+        """
+        restored = text
+        for placeholder, original in mapping.items():
+            restored = restored.replace(placeholder, original)
+        return restored
+# Singleton instance
+shield = PrivacyShield()

src/pipeline/retriever.py ADDED Viewed

	@@ -0,0 +1,463 @@

+"""
+FR-04: Vector Retrieval
+=======================
+FAISS IndexFlatIP with L2-normalised vectors (inner product = cosine similarity).
+Returns top-k chunks as (chunk_text, metadata_dict, similarity_score) tuples.
+Usage (as a module):
+    from src.pipeline.retriever import Retriever
+    r = Retriever(config)
+    results = r.search("What is the treatment for Type 2 Diabetes?")
+    for text, meta, score in results:
+        print(score, meta["pub_type"], text[:80])
+Usage (smoke test):
+    python src/pipeline/retriever.py
+"""
+from __future__ import annotations
+import sys
+from pathlib import Path
+sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent))
+import logging
+import pickle
+from typing import Any
+try:
+    import faiss
+except ImportError:
+    faiss = None
+import numpy as np
+import yaml
+logger = logging.getLogger(__name__)
+class Retriever:
+    """
+    Hybrid FAISS + BM25 document retriever.
+    On first search, lazily builds a BM25 index over all chunk texts.
+    Each search runs both FAISS (semantic) and BM25 (keyword) then merges
+    results using Reciprocal Rank Fusion (RRF) for best-of-both precision
+    and recall.
+    """
+    RERANKER_MODEL = "cross-encoder/ms-marco-MiniLM-L-6-v2"
+    RERANK_CANDIDATES = 60   # retrieve this many via RRF, then re-rank to top_k
+    def __init__(self, config: dict) -> None:
+        self.config        = config
+        self.top_k: int    = config["retrieval"]["top_k"]
+        self.model_name: str = config["retrieval"]["embedding_model"]
+        self.index_path: str = config["retrieval"]["index_path"]
+        self.meta_path: str  = config["retrieval"]["metadata_path"]
+        self._model     = None
+        self._reranker  = None   # cross-encoder re-ranker, loaded lazily
+        self._index     = None
+        self._metadata: dict[int, dict] | None = None
+        self._bm25      = None          # built lazily on first search
+        self._bm25_ids: list[int] = []  # maps bm25 row → faiss_idx
+    # ------------------------------------------------------------------
+    # Private loaders (lazy)
+    # ------------------------------------------------------------------
+    def _load_model(self) -> None:
+        if self._model is None:
+            try:
+                from sentence_transformers import SentenceTransformer
+                logger.info("Loading BioBERT: %s", self.model_name)
+                self._model = SentenceTransformer(self.model_name)
+                logger.info("BioBERT model loaded successfully.")
+            except ImportError as e:
+                logger.error("sentence_transformers not installed: %s", e)
+                self._model = None
+            except Exception as e:
+                logger.error("Failed to load embedding model '%s': %s — FAISS search will be skipped, falling back to BM25.", self.model_name, e)
+                self._model = None
+    def _load_reranker(self) -> None:
+        if self._reranker is None:
+            try:
+                from sentence_transformers import CrossEncoder
+                logger.info("Loading re-ranker: %s", self.RERANKER_MODEL)
+                self._reranker = CrossEncoder(self.RERANKER_MODEL)
+                logger.info("Re-ranker loaded.")
+            except Exception as e:
+                logger.warning("Re-ranker unavailable (%s) — falling back to RRF ranking.", e)
+                self._reranker = "unavailable"
+    def _load_index(self) -> None:
+        if self._index is not None:
+            return
+        idx_path  = Path(self.index_path)
+        meta_path = Path(self.meta_path)
+        if not idx_path.exists():
+            raise FileNotFoundError(
+                f"FAISS index not found at '{idx_path}'. "
+                "Run python src/pipeline/ingest.py && python src/pipeline/embedder.py first."
+            )
+        try:
+            logger.info("Loading FAISS index from %s", idx_path)
+            if faiss is not None:
+                self._index = faiss.read_index(str(idx_path))
+            else:
+                self._index = None
+                logger.warning("FAISS not installed — FAISS search disabled.")
+            logger.info("Loading metadata store from %s", meta_path)
+            with open(meta_path, "rb") as f:
+                self._metadata = pickle.load(f)
+            logger.info(
+                "Retriever ready: %d vectors, %d metadata entries",
+                self._index.ntotal if self._index is not None else 0, len(self._metadata),
+            )
+            # Build drug→FDA chunks lookup (O(1) at query time)
+            self._fda_index: dict[str, list[int]] = {}
+            for idx, meta in self._metadata.items():
+                if meta.get("source") == "FDA DailyMed":
+                    doc_id = meta.get("doc_id", "")
+                    # doc_id format: fda_{drug_name}_{set_id}
+                    parts = doc_id.split("_")
+                    drug_key = parts[1].lower() if len(parts) >= 2 else ""
+                    if drug_key:
+                        self._fda_index.setdefault(drug_key, []).append(idx)
+            logger.info("FDA drug index built: %d unique drugs", len(self._fda_index))
+            # Build keyword→guideline chunks lookup for clinical guidelines
+            self._guideline_index: dict[str, list[int]] = {}
+            for idx, meta in self._metadata.items():
+                if meta.get("pub_type") == "clinical_guideline":
+                    text = (meta.get("chunk_text", "") + " " + meta.get("title", "")).lower()
+                    for keyword in [
+                        # Diabetes / ADA
+                        "diagnosis", "diagnostic", "treatment", "pharmacologic",
+                        "glycemic", "insulin", "obesity", "hypoglycemia",
+                        "screening", "complication", "pregnancy",
+                        "children", "adolescent", "older adult", "hospital",
+                        # Cardiovascular / ACC-AHA
+                        "hypertension", "blood pressure", "antihypertensive",
+                        "statin", "cholesterol", "ldl", "lipid", "triglyceride",
+                        "cardiovascular", "coronary", "heart disease", "stroke",
+                        "aspirin", "antiplatelet", "anticoagulant",
+                        "prevention", "risk reduction", "atherosclerosis",
+                        "heart failure", "ejection fraction",
+                        "smoking", "exercise", "diet", "lifestyle",
+                    ]:
+                        if keyword in text:
+                            self._guideline_index.setdefault(keyword, []).append(idx)
+            logger.info("Guideline index built: %d keyword entries", len(self._guideline_index))
+        except Exception as e:
+            logger.error("Failed to load FAISS index or metadata: %s", e)
+            self._index = None
+            if self._metadata is None:
+                self._metadata = {}
+    def _build_bm25(self) -> None:
+        """Build BM25 index from the loaded metadata store (called once)."""
+        if self._bm25 is not None:
+            return
+        self.rebuild_bm25()
+    def rebuild_bm25(self) -> None:
+        """Build BM25 index — loads from cache if available, otherwise builds and saves."""
+        try:
+            from rank_bm25 import BM25Okapi
+        except ImportError:
+            logger.warning("rank-bm25 not installed — falling back to FAISS-only.")
+            return
+        if self._metadata is None:
+            self._load_index()
+        # Cache path: alongside the metadata store
+        bm25_cache = Path(self.meta_path).parent / "bm25_cache.pkl"
+        meta_mtime = Path(self.meta_path).stat().st_mtime if Path(self.meta_path).exists() else 0
+        # Load from cache if it exists and is newer than the metadata store
+        if bm25_cache.exists() and bm25_cache.stat().st_mtime >= meta_mtime:
+            try:
+                logger.info("Loading BM25 index from cache %s …", bm25_cache)
+                with open(bm25_cache, "rb") as f:
+                    cached = pickle.load(f)
+                self._bm25 = cached["bm25"]
+                self._bm25_ids = cached["ids"]
+                logger.info("BM25 cache loaded (%d docs).", len(self._bm25_ids))
+                return
+            except Exception as e:
+                logger.warning("BM25 cache load failed (%s) — rebuilding.", e)
+        logger.info("Rebuilding BM25 index over %d chunks…", len(self._metadata))
+        corpus_ids: list[int] = []
+        corpus_tokens: list[list[str]] = []
+        for faiss_idx, meta in self._metadata.items():
+            text = meta.get("chunk_text", "")
+            if text:
+                corpus_ids.append(faiss_idx)
+                corpus_tokens.append(text.lower().split())
+        self._bm25 = BM25Okapi(corpus_tokens)
+        self._bm25_ids = corpus_ids
+        logger.info("BM25 index built (%d docs). Saving cache…", len(corpus_ids))
+        try:
+            with open(bm25_cache, "wb") as f:
+                pickle.dump({"bm25": self._bm25, "ids": self._bm25_ids}, f,
+                            protocol=pickle.HIGHEST_PROTOCOL)
+            logger.info("BM25 cache saved to %s", bm25_cache)
+        except Exception as e:
+            logger.warning("BM25 cache save failed: %s", e)
+    def get_fda_chunks(self, drug_name: str, section_priority: list[str] | None = None) -> list[dict]:
+        """
+        Directly return FDA DailyMed chunks for a specific drug by name.
+        Bypasses FAISS/BM25 ranking — O(1) lookup, always finds the drug's label.
+        Used during intervention re-retrieval when entity_verifier identifies a drug.
+        """
+        self._load_index()
+        key = drug_name.lower().strip()
+        indices = getattr(self, "_fda_index", {}).get(key, [])
+        if not indices:
+            # Try partial match (e.g. "warfarin sodium" → "warfarin")
+            indices = next(
+                (v for k, v in getattr(self, "_fda_index", {}).items() if key in k or k in key),
+                []
+            )
+        chunks = []
+        priority = section_priority or ["CONTRAINDICATIONS", "ADVERSE REACTIONS",
+                                        "DOSAGE AND ADMINISTRATION", "WARNINGS AND PRECAUTIONS",
+                                        "DRUG INTERACTIONS", "INDICATIONS AND USAGE",
+                                        "USE IN SPECIFIC POPULATIONS"]
+        for idx in indices:
+            meta = self._metadata.get(idx, {})
+            chunk_text = meta.get("chunk_text", "")
+            section = next((s for s in priority if s in chunk_text.upper()), "OTHER")
+            chunks.append({
+                "text": chunk_text, "chunk_id": meta.get("chunk_id"),
+                "source": meta.get("source", ""), "pub_type": meta.get("pub_type", ""),
+                "pub_year": meta.get("pub_year"), "title": meta.get("title", ""),
+                "_section": section, "_priority": priority.index(section) if section in priority else 99,
+            })
+        chunks.sort(key=lambda c: c["_priority"])
+        return chunks[:5]
+    def get_guideline_chunks(self, query: str, top_n: int = 5) -> list[dict]:
+        """
+        Return clinical guideline chunks relevant to the query via keyword matching.
+        Bypasses FAISS/BM25 ranking — used during intervention when retrieval fails.
+        """
+        self._load_index()
+        query_lower = query.lower()
+        guideline_idx = getattr(self, "_guideline_index", {})
+        if not guideline_idx:
+            return []
+        # Find matching indices — union of all matching keyword lists
+        matched: dict[int, int] = {}  # idx → match count
+        for keyword, indices in guideline_idx.items():
+            if keyword in query_lower:
+                for idx in indices:
+                    matched[idx] = matched.get(idx, 0) + 1
+        if not matched:
+            return []
+        # Sort by match count (most keyword hits first), take top_n
+        top_indices = sorted(matched, key=lambda i: matched[i], reverse=True)[:top_n]
+        chunks = []
+        for idx in top_indices:
+            meta = self._metadata.get(idx, {})
+            chunks.append({
+                "text": meta.get("chunk_text", ""),
+                "chunk_id": meta.get("chunk_id"),
+                "source": meta.get("source", ""),
+                "pub_type": meta.get("pub_type", "clinical_guideline"),
+                "pub_year": meta.get("pub_year"),
+                "title": meta.get("title", ""),
+            })
+        return chunks
+    # ------------------------------------------------------------------
+    # Public API
+    # ------------------------------------------------------------------
+    def search(
+        self,
+        query: str,
+        top_k: int | None = None,
+    ) -> list[tuple[str, dict[str, Any], float]]:
+        """
+        Hybrid semantic + keyword search using Reciprocal Rank Fusion.
+        Args:
+            query : Natural language query
+            top_k : Override config top_k if provided
+        Returns:
+            List of (chunk_text, metadata_dict, rrf_score),
+            sorted by descending combined score.
+        """
+        if not query or not query.strip():
+            logger.warning("Retriever.search called with empty query — returning []")
+            return []
+        k = top_k or self.top_k
+        # Fetch RERANK_CANDIDATES via RRF, then re-rank to top-k
+        fetch_k = max(self.RERANK_CANDIDATES, k * 3)
+        RRF_K = 60  # standard RRF constant (higher = smoother rank blending)
+        self._load_model()
+        self._load_reranker()
+        self._load_index()
+        self._build_bm25()
+        # ── 1. FAISS semantic search ──────────────────────────────────
+        faiss_ranks: dict[int, int] = {}
+        if self._model is not None and self._index is not None and faiss is not None:
+            try:
+                q_vec: np.ndarray = self._model.encode(
+                    [query.strip()],
+                    normalize_embeddings=True,
+                    convert_to_numpy=True,
+                ).astype(np.float32)
+                scores_arr, idx_arr = self._index.search(q_vec, fetch_k)
+                faiss_scores = scores_arr[0]
+                faiss_indices = idx_arr[0]
+                # Map faiss_idx → rank (1-indexed)
+                for rank, (faiss_idx, score) in enumerate(zip(faiss_indices, faiss_scores), 1):
+                    if faiss_idx != -1:
+                        faiss_ranks[int(faiss_idx)] = rank
+                # Raw top-1 cosine similarity (IndexFlatIP + L2-norm = cosine).
+                # Used by main.py for coverage-gap detection — a poor match here
+                # means the topic is genuinely absent from the database.
+                _top_faiss_cosine = float(faiss_scores[0]) if len(faiss_scores) > 0 else 0.0
+            except Exception as e:
+                logger.error("FAISS search failed: %s", e)
+        # If FAISS failed but BM25 is available, continue with BM25-only (no stub)
+        if not faiss_ranks and self._bm25 is not None:
+            _top_faiss_cosine = 0.0  # no FAISS score available
+            logger.warning("FAISS model unavailable — using BM25-only search for this query.")
+        # Only return empty if BOTH are completely unavailable
+        if not faiss_ranks and self._bm25 is None:
+            logger.error("Both FAISS and BM25 are unavailable. Cannot retrieve. Check that the index exists and dependencies are installed.")
+            return []
+        # ── 2. BM25 keyword search ────────────────────────────────────
+        bm25_ranks: dict[int, int] = {}
+        if self._bm25 is not None:
+            query_tokens = query.lower().split()
+            bm25_scores_arr = self._bm25.get_scores(query_tokens)
+            # Get top fetch_k indices by BM25 score
+            top_bm25 = np.argsort(bm25_scores_arr)[::-1][:fetch_k]
+            for rank, corpus_pos in enumerate(top_bm25, 1):
+                if bm25_scores_arr[corpus_pos] > 0:
+                    faiss_idx = self._bm25_ids[corpus_pos]
+                    bm25_ranks[faiss_idx] = rank
+        # ── 3. Reciprocal Rank Fusion ─────────────────────────────────
+        # Score = 1/(k+rank_faiss) + 1/(k+rank_bm25)
+        # A chunk only in FAISS gets 1/(60+rank); only in BM25 gets 1/(60+rank)
+        # A chunk in BOTH gets the sum — it floats to the top
+        all_ids = set(faiss_ranks.keys()) | set(bm25_ranks.keys())
+        rrf_scores: dict[int, float] = {}
+        for faiss_idx in all_ids:
+            score = 0.0
+            if faiss_idx in faiss_ranks:
+                score += 1.0 / (RRF_K + faiss_ranks[faiss_idx])
+            if faiss_idx in bm25_ranks:
+                score += 1.0 / (RRF_K + bm25_ranks[faiss_idx])
+            rrf_scores[faiss_idx] = score
+        # Capture absolute quality BEFORE normalising (used for retrieval confidence gate)
+        max_rrf_absolute = max(rrf_scores.values()) if rrf_scores else 0.0
+        # Normalise RRF scores to [0, 1] for display
+        if rrf_scores and max_rrf_absolute > 0:
+            rrf_scores = {k: v / max_rrf_absolute for k, v in rrf_scores.items()}
+        # Sort by RRF score descending — take RERANK_CANDIDATES (not just top-k)
+        candidate_ids = sorted(rrf_scores.keys(), key=lambda i: rrf_scores[i], reverse=True)[:self.RERANK_CANDIDATES]
+        candidates: list[tuple[str, dict, float]] = []
+        for faiss_idx in candidate_ids:
+            meta = self._metadata.get(faiss_idx, {})
+            text = meta.get("chunk_text", "")
+            meta["_retrieval_confidence"] = round(max_rrf_absolute, 6)
+            meta["_top_faiss_cosine"] = round(_top_faiss_cosine, 4)
+            candidates.append((text, meta, rrf_scores[faiss_idx]))
+        # ── Re-ranking ────────────────────────────────────────────────────
+        # Cross-encoder scores every (query, chunk) pair directly.
+        # No volume bias — the right chunk wins on relevance regardless of source.
+        if self._reranker and self._reranker != "unavailable" and len(candidates) > k:
+            pairs = [(query, text) for text, _, _ in candidates]
+            rerank_scores = self._reranker.predict(pairs)
+            ranked = sorted(
+                zip(rerank_scores, candidates),
+                key=lambda x: x[0],
+                reverse=True,
+            )
+            results = [item for _, item in ranked[:k]]
+            logger.debug("Re-ranked %d candidates → top-%d", len(candidates), k)
+        else:
+            results = candidates[:k]
+        logger.debug(
+            "Hybrid query '%s...' → %d results (top RRF=%.4f) "
+            "[FAISS candidates: %d, BM25 candidates: %d]",
+            query[:40], len(results),
+            results[0][2] if results else 0.0,
+            len(faiss_ranks), len(bm25_ranks),
+        )
+        return results
+# ---------------------------------------------------------------------------
+# CLI smoke test
+# ---------------------------------------------------------------------------
+def _load_config() -> dict:
+    with open("config.yaml", "r", encoding="utf-8") as f:
+        return yaml.safe_load(f)
+if __name__ == "__main__":
+    import src  # noqa: F401 — logging
+    config = _load_config()
+    retriever = Retriever(config)
+    test_queries = [
+        "What is the recommended dosage of Metformin for Type 2 Diabetes in elderly patients?",
+        "Contraindications of ibuprofen for patients with chronic kidney disease",
+        "First-line treatment for hypertension according to clinical guidelines",
+    ]
+    for query in test_queries:
+        print(f"\n{'='*70}")
+        print(f"QUERY: {query}")
+        print("=" * 70)
+        results = retriever.search(query, top_k=3)
+        if not results:
+            print("  No results — is the FAISS index built?")
+            continue
+        for rank, (text, meta, score) in enumerate(results, 1):
+            print(f"\n  Rank {rank} | score={score:.4f} | source={meta.get('source')} | "
+                  f"tier_type={meta.get('pub_type')}")
+            print(f"  Title: {meta.get('title', '')[:80]}")
+            print(f"  Text : {text[:200]}...")

tests/test_api.py ADDED Viewed

	@@ -0,0 +1,51 @@

+import pytest
+from fastapi.testclient import TestClient
+from src.api.main import app
+# Module-level test client wrapping the FastAPI app; the tests below send their requests through it.
+client = TestClient(app)
+def test_health_endpoint():
+    """Test that the /health endpoint correctly reports system status."""
+    response = client.get("/health")
+    assert response.status_code == 200
+    data = response.json()
+    assert data["status"] == "ok"
+    assert "ollama_available" in data
+def test_evaluate_endpoint():
+    """Test the /evaluate endpoint with mock claims."""
+    payload = {
+        "question": "Is Metformin safe?",
+        "answer": "Metformin is a safe and effective drug. It is recommended.",
+        "context_chunks": [
+            {
+                "chunk_id": "mock-1",
+                "text": "Metformin is a first-line medication for the treatment of type 2 diabetes. It is safe.",
+                "source": "mock_db",
+                "pub_type": "research_abstract",
+                "pub_year": 2024,
+                "title": "Study on Metformin safety"
+            }
+        ],
+        "run_ragas": False
+    }
+    # Since the evaluation modules load heavy ML models,
+    # the first test call might take 10-15s to run.
+    response = client.post("/evaluate", json=payload)
+    assert response.status_code == 200
+    data = response.json()
+    assert "composite_score" in data
+    assert "hrs" in data
+    assert data["risk_band"] in ["LOW", "MODERATE", "HIGH", "CRITICAL"]
+    assert "faithfulness" in data["module_results"]
+def test_query_invalid_params():
+    """Test the /query validation rules."""
+    payload = {
+        "question": "Hi",  # 2 chars — below min_length=5, triggers 422
+        "top_k": 5
+    }
+    response = client.post("/query", json=payload)
+    assert response.status_code == 422  # Unprocessable Entity (Pydantic validation error)

tests/test_modules.py ADDED Viewed

	@@ -0,0 +1,66 @@

+import pytest
+from src.modules.faithfulness import score_faithfulness
+from src.modules.source_credibility import score_source_credibility
+from src.modules.contradiction import score_contradiction
+from src.evaluation.aggregator import aggregate
+def test_source_credibility():
+    chunks = [
+        {"chunk_id": "c1", "pub_type": "research_abstract", "title": "Mock Paper"},
+        {"chunk_id": "c2", "pub_type": "exam_question", "title": "Mock Exam Q"}
+    ]
+    results = score_source_credibility(chunks)
+    assert results.score > 0.0
+    assert 0.3 <= results.score <= 0.5
+    assert results.details["chunk_count"] == 2
+def test_faithfulness_nli():
+    res_entail = score_faithfulness(
+        answer="The sky is blue.",
+        context_docs=["The sky is colored blue today."]
+    )
+    assert res_entail.score >= 0.8
+    res_contra = score_faithfulness(
+        answer="The sky is red.",
+        context_docs=["The sky is completely blue and not red."]
+    )
+    assert res_contra.score <= 0.2
+def test_aggregator_logic():
+    # Mock config
+    test_cfg = {
+        "evaluation": {
+            "weights": {
+                "faithfulness": 0.4,
+                "entity_accuracy": 0.2,
+                "source_credibility": 0.2,
+                "contradiction_risk": 0.2,
+                "ragas_composite": 0.0
+            }
+        }
+    }
+    module_results = {
+        "faithfulness": {"score": 1.0},
+        "entity_verifier": {"score": 1.0},
+        "source_credibility": {"score": 0.5},
+        "contradiction": {"score": 1.0},
+    }
+    class MockResult:
+        def __init__(self, score, error=None):
+            self.score = score
+            self.error = error
+            self.latency_ms = 10
+    res = aggregate(
+        faithfulness_result=MockResult(1.0),
+        entity_result=MockResult(1.0),
+        source_result=MockResult(0.5),
+        contradiction_result=MockResult(1.0),
+        weights=test_cfg["evaluation"]["weights"]
+    )
+    assert abs(res.score - 0.9) < 0.01
+    assert res.details["hrs"] == 10
+    assert res.details["risk_band"] == "LOW"