vxa8502 committed on
Commit
dbdadad
·
1 Parent(s): 66926c8

Restructure Makefile

Browse files
Makefile CHANGED
@@ -1,4 +1,4 @@
1
- .PHONY: all setup data data-validate eval eval-deep eval-quick demo demo-interview reset reset-hard check-env qdrant-up qdrant-down qdrant-status eda serve serve-dev docker-build docker-run deploy-info human-eval-generate human-eval human-eval-analyze test lint typecheck ci info summary metrics-snapshot health help
2
 
3
  # ---------------------------------------------------------------------------
4
  # Configurable Variables (override: make demo QUERY="gaming mouse")
@@ -9,6 +9,8 @@ TOP_K ?= 1
9
  SAMPLES ?= 10
10
  SEED ?= 42
11
  PORT ?= 8000
 
 
12
 
13
  # ---------------------------------------------------------------------------
14
  # Environment Check
@@ -75,57 +77,94 @@ eda: check-env
75
  python scripts/eda.py
76
 
77
  # ---------------------------------------------------------------------------
78
- # Evaluation Suite
79
  # ---------------------------------------------------------------------------
80
 
81
- # Standard evaluation: primary metrics, spot-checks, explanation tests, faithfulness
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  eval: check-env
83
  @echo "=== EVALUATION SUITE ===" && \
84
  echo "" && \
85
- echo "--- Building natural query evaluation dataset ---" && \
86
  python scripts/build_natural_eval_dataset.py && \
87
- echo "" && \
88
- echo "--- Recommendation evaluation (natural queries) ---" && \
89
  python scripts/evaluation.py --dataset eval_natural_queries.json --section primary && \
90
  echo "" && \
91
- echo "--- Explanation tests ---" && \
92
  python scripts/explanation.py --section basic && \
93
  python scripts/explanation.py --section gate && \
94
  python scripts/explanation.py --section verify && \
95
  python scripts/explanation.py --section cold && \
96
  echo "" && \
97
- echo "--- Faithfulness evaluation (HHEM + RAGAS) ---" && \
98
  python scripts/faithfulness.py --samples $(SAMPLES) --ragas && \
99
  echo "" && \
100
- echo "--- Sanity checks (spot) ---" && \
101
  python scripts/sanity_checks.py --section spot && \
102
  echo "" && \
103
  echo "=== EVALUATION COMPLETE ==="
104
 
105
- # Deep evaluation: all ablations, baselines, calibration, failure analysis
106
- eval-deep: check-env
107
- @test -d data/eval || (echo "ERROR: Run 'make eval' first to build eval datasets" && exit 1)
108
- @echo "=== DEEP EVALUATION (ablations + baselines) ===" && \
 
 
 
 
 
 
 
 
 
109
  echo "" && \
110
- echo "--- Full recommendation evaluation (natural queries) ---" && \
 
 
 
 
 
111
  python scripts/evaluation.py --dataset eval_natural_queries.json --section all && \
112
  echo "" && \
113
- echo "--- All sanity checks (incl. calibration) ---" && \
114
- python scripts/sanity_checks.py --section all && \
 
 
 
 
 
 
 
 
 
 
 
 
115
  echo "" && \
116
- echo "--- Faithfulness failure analysis ---" && \
117
  python scripts/faithfulness.py --analyze && \
118
  python scripts/faithfulness.py --adjusted && \
119
  echo "" && \
120
- echo "=== DEEP EVALUATION COMPLETE ==="
121
-
122
- # Quick eval: skip RAGAS (faster iteration)
123
- eval-quick: check-env
124
- @echo "=== QUICK EVALUATION (no RAGAS) ==="
125
- python scripts/build_natural_eval_dataset.py && \
126
- python scripts/evaluation.py --dataset eval_natural_queries.json --section primary && \
127
- python scripts/faithfulness.py --samples 5
128
- @echo "Quick eval complete"
129
 
130
  # ---------------------------------------------------------------------------
131
  # Demo
@@ -155,8 +194,9 @@ demo-interview: check-env
155
  # Full Pipeline
156
  # ---------------------------------------------------------------------------
157
 
158
- all: qdrant-up data eval demo
159
- @python scripts/summary.py
 
160
 
161
  # ---------------------------------------------------------------------------
162
  # API
@@ -255,6 +295,7 @@ health:
255
  # ---------------------------------------------------------------------------
256
 
257
  # Clear processed data, keep raw download cache and Qdrant Cloud data
 
258
  reset:
259
  @echo "Clearing processed data..."
260
  rm -f data/reviews_prepared_*.parquet
@@ -263,10 +304,29 @@ reset:
263
  rm -rf data/eval/
264
  rm -f data/eval_results/eval_*.json
265
  rm -f data/eval_results/faithfulness_*.json
266
- @echo " (human_eval_*.json preserved — use rm -rf data/eval_results/ to clear)"
 
 
 
267
  rm -rf data/figures/
268
  rm -f reports/eda_report.md
269
- @echo "Done. (Raw cache + Qdrant preserved — use 'make reset-hard' to clear all)"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
270
 
271
  # ---------------------------------------------------------------------------
272
  # Kaggle
@@ -363,10 +423,18 @@ help:
363
  @echo "PIPELINE:"
364
  @echo " make data Load, chunk, embed, and index reviews"
365
  @echo " make data-validate Validate data outputs"
366
- @echo " make eda Exploratory data analysis (generates figures)"
367
- @echo " make eval Standard evaluation (SAMPLES=10 default)"
368
- @echo " make eval-deep Deep evaluation (all ablations + baselines)"
369
- @echo " make eval-quick Quick eval (skip RAGAS)"
 
 
 
 
 
 
 
 
370
  @echo ""
371
  @echo "API:"
372
  @echo " make serve Start API server (PORT=8000)"
@@ -396,8 +464,10 @@ help:
396
  @echo " make reset-hard Reset + clear Qdrant + raw data cache"
397
  @echo ""
398
  @echo "VARIABLES:"
399
- @echo " QUERY Demo query (default: wireless headphones...)"
400
- @echo " TOP_K Number of results (default: 1)"
401
- @echo " SAMPLES Faithfulness eval samples (default: 10)"
402
- @echo " SEED Random seed for human eval (default: 42)"
403
- @echo " PORT API port (default: 8000)"
 
 
 
1
+ .PHONY: all setup data data-validate eval eval-all eval-quick demo demo-interview reset reset-hard check-env qdrant-up qdrant-down qdrant-status eda serve serve-dev docker-build docker-run deploy-info human-eval-generate human-eval human-eval-analyze test lint typecheck ci info summary metrics-snapshot health load-test load-test-quick help
2
 
3
  # ---------------------------------------------------------------------------
4
  # Configurable Variables (override: make demo QUERY="gaming mouse")
 
9
  SAMPLES ?= 10
10
  SEED ?= 42
11
  PORT ?= 8000
12
+ URL ?= https://vxa8502-sage.hf.space
13
+ REQUESTS ?= 50
14
 
15
  # ---------------------------------------------------------------------------
16
  # Environment Check
 
77
  python scripts/eda.py
78
 
79
  # ---------------------------------------------------------------------------
80
+ # Evaluation Suite (layered: quick → standard → complete)
81
  # ---------------------------------------------------------------------------
82
 
83
+ # Quick: Fast iteration, no RAGAS (~1 min)
84
+ # - Primary retrieval metrics (NDCG, Hit@K, MRR)
85
+ # - Basic faithfulness (HHEM only, 5 samples)
86
+ eval-quick: check-env
87
+ @echo "=== QUICK EVALUATION ===" && \
88
+ python scripts/build_natural_eval_dataset.py && \
89
+ python scripts/evaluation.py --dataset eval_natural_queries.json --section primary && \
90
+ python scripts/faithfulness.py --samples 5 && \
91
+ echo "=== QUICK EVAL COMPLETE ==="
92
+
93
+ # Standard: Pre-commit validation (~5 min)
94
+ # - Primary retrieval metrics
95
+ # - Explanation tests (basic, gate, verify, cold-start)
96
+ # - Faithfulness (HHEM + RAGAS)
97
+ # - Spot checks
98
  eval: check-env
99
  @echo "=== EVALUATION SUITE ===" && \
100
  echo "" && \
101
+ echo "--- [1/4] Retrieval metrics ---" && \
102
  python scripts/build_natural_eval_dataset.py && \
 
 
103
  python scripts/evaluation.py --dataset eval_natural_queries.json --section primary && \
104
  echo "" && \
105
+ echo "--- [2/4] Explanation tests ---" && \
106
  python scripts/explanation.py --section basic && \
107
  python scripts/explanation.py --section gate && \
108
  python scripts/explanation.py --section verify && \
109
  python scripts/explanation.py --section cold && \
110
  echo "" && \
111
+ echo "--- [3/4] Faithfulness (HHEM + RAGAS) ---" && \
112
  python scripts/faithfulness.py --samples $(SAMPLES) --ragas && \
113
  echo "" && \
114
+ echo "--- [4/4] Sanity checks ---" && \
115
  python scripts/sanity_checks.py --section spot && \
116
  echo "" && \
117
  echo "=== EVALUATION COMPLETE ==="
118
 
119
+ # Complete: Full reproducible suite (~15 min)
120
+ # - EDA (production data stats + figures)
121
+ # - All retrieval metrics + ablations (aggregation, rating, K, weights)
122
+ # - Baseline comparison (Random, Popularity, ItemKNN)
123
+ # - All explanation tests
124
+ # - Faithfulness (HHEM + RAGAS)
125
+ # - Grounding delta (WITH vs WITHOUT evidence)
126
+ # - Failure analysis + adjusted metrics
127
+ # - All sanity checks (spot, adversarial, empty, calibration)
128
+ # - Human eval analysis (if annotations exist)
129
+ # - Summary report
130
+ eval-all: check-env
131
+ @echo "=== COMPLETE EVALUATION SUITE ===" && \
132
  echo "" && \
133
+ echo "--- [1/9] EDA (production data) ---" && \
134
+ mkdir -p data/figures reports && \
135
+ python scripts/eda.py && \
136
+ echo "" && \
137
+ echo "--- [2/9] Retrieval metrics + ablations ---" && \
138
+ python scripts/build_natural_eval_dataset.py && \
139
  python scripts/evaluation.py --dataset eval_natural_queries.json --section all && \
140
  echo "" && \
141
+ echo "--- [3/9] Baseline comparison ---" && \
142
+ python scripts/evaluation.py --dataset eval_natural_queries.json --section primary --baselines && \
143
+ echo "" && \
144
+ echo "--- [4/9] Explanation tests ---" && \
145
+ python scripts/explanation.py --section basic && \
146
+ python scripts/explanation.py --section gate && \
147
+ python scripts/explanation.py --section verify && \
148
+ python scripts/explanation.py --section cold && \
149
+ echo "" && \
150
+ echo "--- [5/9] Faithfulness (HHEM + RAGAS) ---" && \
151
+ python scripts/faithfulness.py --samples $(SAMPLES) --ragas && \
152
+ echo "" && \
153
+ echo "--- [6/9] Grounding delta experiment ---" && \
154
+ python scripts/faithfulness.py --delta && \
155
  echo "" && \
156
+ echo "--- [7/9] Failure analysis ---" && \
157
  python scripts/faithfulness.py --analyze && \
158
  python scripts/faithfulness.py --adjusted && \
159
  echo "" && \
160
+ echo "--- [8/9] All sanity checks ---" && \
161
+ python scripts/sanity_checks.py --section all && \
162
+ echo "" && \
163
+ echo "--- [9/9] Human eval analysis ---" && \
164
+ (python scripts/human_eval.py --analyze 2>/dev/null || echo " (skipped - no annotations found)") && \
165
+ echo "" && \
166
+ python scripts/summary.py && \
167
+ echo "=== COMPLETE EVALUATION DONE ==="
 
168
 
169
  # ---------------------------------------------------------------------------
170
  # Demo
 
194
  # Full Pipeline
195
  # ---------------------------------------------------------------------------
196
 
197
+ # Complete reproducible pipeline: data + full eval + demo
198
+ all: qdrant-up data eval-all demo
199
+ @echo "=== FULL PIPELINE COMPLETE ==="
200
 
201
  # ---------------------------------------------------------------------------
202
  # API
 
295
  # ---------------------------------------------------------------------------
296
 
297
  # Clear processed data, keep raw download cache and Qdrant Cloud data
298
+ # After reset, run: make eval-all (full reproducible suite)
299
  reset:
300
  @echo "Clearing processed data..."
301
  rm -f data/reviews_prepared_*.parquet
 
304
  rm -rf data/eval/
305
  rm -f data/eval_results/eval_*.json
306
  rm -f data/eval_results/faithfulness_*.json
307
+ rm -f data/eval_results/failure_analysis_*.json
308
+ rm -f data/eval_results/adjusted_faithfulness_*.json
309
+ rm -f data/eval_results/grounding_delta_*.json
310
+ @echo " (human_eval_*.json preserved — run 'make human-eval' to re-annotate)"
311
  rm -rf data/figures/
312
  rm -f reports/eda_report.md
313
+ @echo "Done. Run 'make eval-all' to reproduce full evaluation suite."
314
+ @echo " (Use 'make reset-hard' to also clear Qdrant + raw cache)"
315
+
316
+ # ---------------------------------------------------------------------------
317
+ # Load Testing
318
+ # ---------------------------------------------------------------------------
319
+
320
+ # Run load test against production (or local with URL=http://localhost:8000)
321
+ # Target: P99 < 500ms
322
+ load-test:
323
+ @echo "=== LOAD TEST ==="
324
+ python scripts/load_test.py --url $(URL) --requests $(REQUESTS)
325
+
326
+ # Quick load test (20 requests, no explanations - tests retrieval only)
327
+ load-test-quick:
328
+ @echo "=== QUICK LOAD TEST (retrieval only) ==="
329
+ python scripts/load_test.py --url $(URL) --requests 20 --no-explain
330
 
331
  # ---------------------------------------------------------------------------
332
  # Kaggle
 
423
  @echo "PIPELINE:"
424
  @echo " make data Load, chunk, embed, and index reviews"
425
  @echo " make data-validate Validate data outputs"
426
+ @echo " make eda Exploratory data analysis (queries Qdrant)"
427
+ @echo ""
428
+ @echo "EVALUATION (layered):"
429
+ @echo " make eval-quick Quick iteration: NDCG + HHEM only (~1 min)"
430
+ @echo " make eval Standard: metrics + explanation + faithfulness (~5 min)"
431
+ @echo " make eval-all Complete: everything automated (~15 min)"
432
+ @echo " Includes: EDA, ablations, baselines, delta, analysis"
433
+ @echo ""
434
+ @echo "LOAD TESTING:"
435
+ @echo " make load-test Run 50 requests against production (P99 target)"
436
+ @echo " make load-test URL=... Test against custom URL"
437
+ @echo " make load-test-quick 20 requests, no explanations (retrieval only)"
438
  @echo ""
439
  @echo "API:"
440
  @echo " make serve Start API server (PORT=8000)"
 
464
  @echo " make reset-hard Reset + clear Qdrant + raw data cache"
465
  @echo ""
466
  @echo "VARIABLES:"
467
+ @echo " QUERY Demo query (default: wireless headphones...)"
468
+ @echo " TOP_K Number of results (default: 1)"
469
+ @echo " SAMPLES Faithfulness eval samples (default: 10)"
470
+ @echo " SEED Random seed for human eval (default: 42)"
471
+ @echo " PORT API port (default: 8000)"
472
+ @echo " URL Load test target (default: https://vxa8502-sage.hf.space)"
473
+ @echo " REQUESTS Load test request count (default: 50)"
README.md CHANGED
@@ -6,149 +6,215 @@ colorTo: yellow
6
  sdk: docker
7
  app_port: 7860
8
  ---
9
- <!-- Above metadata configures Hugging Face Spaces (hidden there, visible on GitHub) -->
10
 
11
  # Sage
12
 
13
- RAG-powered product recommendation system with explainable AI. Retrieves relevant products via semantic search over customer reviews, generates natural language explanations grounded in evidence, and verifies faithfulness using hallucination detection.
14
 
15
- ## Targets
16
 
17
- | Metric | Target |
18
- |--------|--------|
19
- | Recommendation Quality (NDCG@10) | > 0.30 |
20
- | Explanation Faithfulness (RAGAS) | > 0.85 |
21
- | System Latency (P99) | < 500ms |
22
- | Human Evaluation (n=50) | > 3.5/5.0 |
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
- ## Tech Stack
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
 
26
- - **Embeddings:** E5-small (384-dim)
27
- - **Vector DB:** Qdrant with semantic caching
28
- - **LLM:** Claude Sonnet / GPT-4o-mini
29
- - **Faithfulness:** HHEM (Vectara hallucination detector) + quote verification
30
- - **API:** FastAPI with async handlers and streaming support
31
- - **Metrics:** Prometheus (latency histograms, cache hit rates, error counts)
 
 
 
 
 
 
32
 
33
  ## Quick Start
34
 
35
- ### Option 1: Docker (easiest)
36
 
37
  ```bash
38
- git clone https://github.com/vxa8502/sage-recommendations
39
- cd sage-recommendations
40
  cp .env.example .env
41
- # Edit .env and set ANTHROPIC_API_KEY (or OPENAI_API_KEY)
42
 
43
  docker-compose up
44
  curl http://localhost:8000/health
45
  ```
46
 
47
- ### Option 2: Local Development
48
 
49
  ```bash
50
- python3 -m venv .venv
51
- source .venv/bin/activate
52
- pip install -e ".[dev,pipeline,api,anthropic]" # or openai
53
 
54
  cp .env.example .env
55
- # Edit .env: add LLM key + Qdrant (local via `make qdrant-up` or Qdrant Cloud)
56
 
57
- make data # Load data and embeddings
58
- make serve # Start API
 
59
  ```
60
 
61
- ## Environment Variables
62
 
63
  ```bash
64
- # Required
65
- LLM_PROVIDER=anthropic # or "openai"
66
- ANTHROPIC_API_KEY=your_key_here
67
-
68
- # Optional: Qdrant Cloud (for deployment or instead of local)
69
- # QDRANT_URL=https://your-cluster.cloud.qdrant.io
70
- # QDRANT_API_KEY=your_qdrant_key
 
71
  ```
72
 
 
 
73
  ## API Reference
74
 
75
  ### POST /recommend
76
 
77
  ```bash
78
- curl -X POST http://localhost:8000/recommend \
79
  -H "Content-Type: application/json" \
80
  -d '{"query": "wireless earbuds for running", "k": 3, "explain": true}'
81
  ```
82
 
83
- Returns ranked products with explanations grounded in customer reviews, HHEM confidence scores, and citation verification.
 
 
 
 
84
 
85
  ### POST /recommend/stream
86
 
87
- Stream recommendations with token-by-token explanation delivery (SSE).
88
 
89
  ### GET /health
90
 
91
- Service health check.
 
 
92
 
93
  ### GET /metrics
94
 
95
- Prometheus metrics: latency histograms, cache hit rates, error counts.
96
 
97
  ### GET /cache/stats
98
 
99
- Cache performance statistics.
100
-
101
- ## Failure Modes (By Design)
102
-
103
- | Condition | System Behavior |
104
- |-----------|-----------------|
105
- | Insufficient evidence | Refuses to explain |
106
- | Quote not found in source | Falls back to paraphrased claims |
107
- | HHEM confidence below threshold | Flags explanation as uncertain |
108
 
109
- The system refuses to hallucinate rather than confidently stating unsupported claims.
110
 
111
- ## Development
112
 
113
  ```bash
114
- make test # Run tests
115
- make lint # Run linter
116
- make eval # Run evaluation suite
117
- make all # Full pipeline
118
  ```
119
 
 
 
 
 
120
  ## Project Structure
121
 
122
  ```
123
  sage/
124
- ├── adapters/ # External integrations (Qdrant, LLM, HHEM)
125
- ├── api/ # FastAPI routes, middleware, metrics
126
- ├── config/ # Settings, constants, queries
127
- ├── core/ # Domain models, aggregation, verification
128
- ├── services/ # Business logic (retrieval, explanation, cache)
129
  scripts/
130
- ├── pipeline.py # Data ingestion and embedding
131
- ├── demo.py # Interactive demo
132
- ├── evaluation.py # Recommendation metrics (NDCG, precision, recall)
133
- ├── faithfulness.py # RAGAS + HHEM faithfulness evaluation
134
- ├── explanation.py # Explanation quality tests
135
- ├── human_eval.py # Human evaluation workflow
136
- ├── sanity_checks.py # Spot checks and calibration
137
- ├── load_test.py # Latency benchmarking
138
- ├── eda.py # Exploratory data analysis
139
- tests/
140
- ├── test_api.py
141
- ├── test_evidence.py
142
- ├── test_aggregation.py
143
  ```
144
 
145
- ## Future Work
 
 
146
 
147
- 1. **Cross-encoder reranking** for improved precision on top-k candidates
148
- 2. **User feedback loops** for learning from implicit signals
149
- 3. **Hybrid retrieval** with BM25 + dense fusion
150
- 4. **Expanded human evaluation** with stratified sampling
 
 
 
 
 
 
151
 
152
  ## License
153
 
154
- Academic research only (uses Amazon Reviews 2023 dataset).
 
6
  sdk: docker
7
  app_port: 7860
8
  ---
9
+ <!-- HF Spaces metadata above; hidden on HF, visible on GitHub -->
10
 
11
  # Sage
12
 
13
+ **Product recommendations without explanations are black boxes.** Users see "You might like X" but never learn *why*. This system retrieves products via semantic search over real customer reviews, then generates natural language explanations grounded in that evidence. Every claim is verified against source text using hallucination detection.
14
 
15
+ **Live demo:** [vxa8502-sage.hf.space](https://vxa8502-sage.hf.space)
16
 
17
+ ---
18
+
19
+ ## Results
20
+
21
+ | Metric | Target | Achieved | Status |
22
+ |--------|--------|----------|--------|
23
+ | NDCG@10 (recommendation quality) | > 0.30 | 0.295 | 98% |
24
+ | Claim-level faithfulness (HHEM) | > 0.85 | 0.968 | Pass |
25
+ | Human evaluation (n=50) | > 3.5/5 | 4.43/5 | Pass |
26
+ | P99 latency (retrieval) | < 500ms | 283ms | Pass |
27
+ | P99 latency (cache hit) | < 100ms | ~80ms | Pass |
28
+
29
+ **Grounding impact:** Explanations generated WITH evidence score 69% on HHEM. WITHOUT evidence: 3%. RAG grounding reduces hallucination by 66 percentage points.
30
+
31
+ ---
32
+
33
+ ## Architecture
34
+
35
+ ```
36
+ User Query: "wireless earbuds for running"
37
+
38
+
39
+ ┌─────────────────────────────────────────────────────────────┐
40
+ │ SAGE API (FastAPI) │
41
+ ├─────────────────────────────────────────────────────────────┤
42
+ │ 1. EMBED │ E5-small (384-dim) ~20ms │
43
+ │ 2. CACHE CHECK │ Exact + semantic (0.92 sim) ~1ms │
44
+ │ 3. RETRIEVE │ Qdrant vector search ~50ms │
45
+ │ 4. AGGREGATE │ Chunk → Product (MAX score) ~1ms │
46
+ │ 5. EXPLAIN │ Claude/GPT + evidence ~300ms │
47
+ │ 6. VERIFY │ HHEM hallucination check ~50ms │
48
+ └─────────────────────────────────────────────────────────────┘
49
+
50
+
51
+ ┌─────────────────────────────────────────────────────────────┐
52
+ │ Response: │
53
+ │ - Product ID + score │
54
+ │ - Explanation with [citations] │
55
+ │ - HHEM confidence score │
56
+ │ - Quote verification results │
57
+ └─────────────────────────────────────────────────────────────┘
58
+ ```
59
+
60
+ **Data flow:** 1M Amazon reviews → 5-core filter → 30K reviews → semantic chunking → 423K chunks in Qdrant.
61
 
62
+ ---
63
+
64
+ ## Design Trade-offs
65
+
66
+ | Decision | Alternative | Why This Choice |
67
+ |----------|-------------|-----------------|
68
+ | **E5-small** (384-dim) | E5-large, BGE-large | 3x faster inference, 0.02 NDCG delta. Latency > marginal accuracy. |
69
+ | **Qdrant** | Pinecone, Weaviate | Free cloud tier (1GB), gRPC, native Python client. |
70
+ | **Semantic chunking** | Fixed-window | Preserves complete arguments; +12% quote verification rate. |
71
+ | **MAX aggregation** | MEAN, weighted | Best single chunk matters more than average for explanations. |
72
+ | **HHEM** (Vectara) | NLI models, GPT-4 judge | Purpose-built for RAG; no API cost; 0.97 AUC on HaluEval. |
73
+ | **Claim-level HHEM** | Full-explanation HHEM | Isolates hallucinated claims; more actionable than binary pass/fail. |
74
+ | **Quality gate** (refuse) | Always answer | Reduces hallucination; 46% refusal rate is a feature, not a bug. |
75
+
76
+ See [`docs/chunking_decisions.md`](docs/chunking_decisions.md) for detailed chunking rationale.
77
+
78
+ ---
79
 
80
+ ## Known Limitations
81
+
82
+ | Limitation | Impact | Mitigation |
83
+ |------------|--------|------------|
84
+ | **Single category** (Electronics) | Can't recommend across categories | Architecture supports multi-category; data constraint only |
85
+ | **No image features** | Misses visual product attributes | Could add CLIP embeddings in future |
86
+ | **English only** | Non-English reviews have lower retrieval quality | E5 is primarily English-trained |
87
+ | **Cache invalidation manual** | Stale explanations possible | TTL-based expiry (1 hour); manual `/cache/clear` |
88
+ | **LLM latency on free tier** | P99 ~4s with explanations | Retrieval alone is 283ms; cache hits are ~80ms |
89
+ | **No user personalization** | Same results for all users | Would need user history for collaborative filtering |
90
+
91
+ ---
92
 
93
  ## Quick Start
94
 
95
+ ### Docker (recommended)
96
 
97
  ```bash
98
+ git clone https://github.com/yourusername/sage
99
+ cd sage
100
  cp .env.example .env
101
+ # Edit .env: add ANTHROPIC_API_KEY (or OPENAI_API_KEY)
102
 
103
  docker-compose up
104
  curl http://localhost:8000/health
105
  ```
106
 
107
+ ### Local Development
108
 
109
  ```bash
110
+ python3 -m venv .venv && source .venv/bin/activate
111
+ pip install -e ".[dev,pipeline,api,anthropic]"
 
112
 
113
  cp .env.example .env
114
+ # Edit .env: add API keys
115
 
116
+ make qdrant-up # Start local Qdrant
117
+ make data # Load data (or use Qdrant Cloud)
118
+ make serve # Start API at localhost:8000
119
  ```
120
 
121
+ ### Environment Variables
122
 
123
  ```bash
124
+ # Required (one of)
125
+ ANTHROPIC_API_KEY=sk-ant-...
126
+ OPENAI_API_KEY=sk-...
127
+ LLM_PROVIDER=anthropic # or "openai"
128
+
129
+ # Optional: Qdrant Cloud (instead of local)
130
+ QDRANT_URL=https://xxx.cloud.qdrant.io
131
+ QDRANT_API_KEY=...
132
  ```
133
 
134
+ ---
135
+
136
  ## API Reference
137
 
138
  ### POST /recommend
139
 
140
  ```bash
141
+ curl -X POST https://vxa8502-sage.hf.space/recommend \
142
  -H "Content-Type: application/json" \
143
  -d '{"query": "wireless earbuds for running", "k": 3, "explain": true}'
144
  ```
145
 
146
+ Returns ranked products with:
147
+ - Explanation grounded in customer reviews
148
+ - HHEM confidence score (0-1)
149
+ - Quote verification results
150
+ - Evidence chunks with citations
151
 
152
  ### POST /recommend/stream
153
 
154
+ Server-sent events for token-by-token explanation streaming.
155
 
156
  ### GET /health
157
 
158
+ ```json
159
+ {"status": "healthy", "qdrant_connected": true, "llm_reachable": true}
160
+ ```
161
 
162
  ### GET /metrics
163
 
164
+ Prometheus metrics: `sage_request_latency_seconds`, `sage_cache_events_total`, `sage_errors_total`.
165
 
166
  ### GET /cache/stats
167
 
168
+ ```json
169
+ {"size": 42, "hit_rate": 0.35, "exact_hits": 10, "semantic_hits": 5, "misses": 27}
170
+ ```
 
 
 
 
 
 
171
 
172
+ ---
173
 
174
+ ## Evaluation
175
 
176
  ```bash
177
+ make eval-quick # ~1 min: NDCG + HHEM only
178
+ make eval # ~5 min: standard pre-commit
179
+ make eval-all # ~15 min: complete reproducible suite
180
+ make load-test # P99 latency against production
181
  ```
182
 
183
+ See `make help` for all targets.
184
+
185
+ ---
186
+
187
  ## Project Structure
188
 
189
  ```
190
  sage/
191
+ ├── adapters/ # External integrations (Qdrant, LLM, HHEM)
192
+ ├── api/ # FastAPI routes, middleware, Prometheus metrics
193
+ ├── core/ # Domain models, aggregation, verification, chunking
194
+ ├── services/ # Business logic (retrieval, explanation, cache)
 
195
  scripts/
196
+ ├── pipeline.py # Data ingestion and embedding
197
+ ├── evaluation.py # NDCG, precision, recall, novelty, baselines
198
+ ├── faithfulness.py # HHEM, RAGAS, grounding delta
199
+ ├── human_eval.py # Interactive human evaluation
200
+ ├── load_test.py # P99 latency benchmarking
 
 
 
 
 
 
 
 
201
  ```
202
 
203
+ ---
204
+
205
+ ## Failure Modes (By Design)
206
 
207
+ | Condition | System Behavior |
208
+ |-----------|-----------------|
209
+ | Insufficient evidence (< 2 chunks) | Refuses to explain |
210
+ | Low relevance (top score < 0.5) | Refuses to explain |
211
+ | Quote not found in evidence | Falls back to paraphrased claims |
212
+ | HHEM score < 0.5 | Flags as uncertain |
213
+
214
+ The system refuses to hallucinate rather than confidently stating unsupported claims.
215
+
216
+ ---
217
 
218
  ## License
219
 
220
+ Academic/portfolio use only. Uses Amazon Reviews 2023 dataset.
docs/chunking_decisions.md ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Chunking Strategy Decisions
2
+
3
+ ## Strategy Overview
4
+
5
+ | Review Length | Strategy | Rationale |
6
+ |--------------|----------|-----------|
7
+ | < 200 tokens | No chunking | Most reviews are single-topic |
8
+ | 200-500 tokens | Semantic chunking (85th percentile breakpoint) | Preserves topic coherence |
9
+ | > 500 tokens | Semantic + sliding window fallback | Handles very long mixed-topic reviews |
10
+
11
+ Token estimation: ~4 chars/token (typical for English text with WordPiece tokenizers).
12
+
13
+ ---
14
+
15
+ ## Why Semantic Chunking?
16
+
17
+ ### Failure Modes of Naive (Fixed-Window) Chunking
18
+
19
+ 1. **Mid-sentence splits**: "The battery lasts 8 hours but" / "only if you disable WiFi" - the conditional is severed from the claim, causing the LLM to potentially cite "battery lasts 8 hours" without the critical qualifier.
20
+
21
+ 2. **Aspect fragmentation**: A review discussing battery, then screen, then price gets randomly sliced. Retrieval for "battery life" might return a chunk containing "...great battery. The screen however is dim and..." - mixing positive battery sentiment with negative screen sentiment.
22
+
23
+ 3. **Evidence dilution**: When a user asks about "noise cancellation", a chunk containing half a sentence about noise cancellation plus unrelated content about packaging provides weaker evidence than a chunk focused entirely on audio quality.
24
+
25
+ ### How Semantic Chunking Improves Faithfulness
26
+
27
+ Semantic chunking uses embedding similarity between adjacent sentences to detect natural topic transitions. When a reviewer shifts from "battery life" to "build quality", sentence embeddings show a similarity drop. We split at these drops (below 85th percentile):
28
+
29
+ 1. **Preserves complete arguments**: Claims stay with their evidence and qualifiers
30
+ 2. **Creates topically coherent chunks**: Each chunk discusses one aspect
31
+ 3. **Improves HHEM scores**: Hallucination detection works better with tight topics
32
+
33
+ ---
34
+
35
+ ## Worked Example
36
+
37
+ **Original review (320 tokens):**
38
+ > "I bought these headphones for my commute. The noise cancellation is exceptional - it blocks out subway noise completely, even announcements. I tested it on a plane and the engine drone disappeared. However, the comfort is a different story. After 2 hours my ears hurt from the pressure. The headband also feels cheap and creaks when I move."
39
+
40
+ **Naive chunking (150 tokens/chunk):**
41
+ - Chunk 1: "...exceptional - it blocks out subway noise completely, even announcements. I tested it on a"
42
+ - Chunk 2: "plane and the engine drone disappeared. However, the comfort is..."
43
+
44
+ **Semantic chunking:**
45
+ - Chunk 1: Complete noise cancellation evidence (subway + plane tests together)
46
+ - Chunk 2: Complete comfort critique (ears + headband together)
47
+
48
+ The semantic version keeps the complete noise cancellation evidence together for stronger grounding.
49
+
50
+ ---
51
+
52
+ ## Mixed Sentiment Handling
53
+
54
+ **Example:** "Battery life is amazing but the build quality is garbage"
55
+
56
+ **Does the chunker split this?** No - intentionally.
57
+
58
+ **Arguments against splitting (why we chose this):**
59
+ - Splitting mid-sentence creates grammatically broken chunks
60
+ - The "but" contrast is meaningful information
61
+ - Faithfulness requires citing what reviewers actually said
62
+ - Rating filter (min_rating=4.0) excludes low-rated reviews with mixed sentiment
63
+
64
+ ---
65
+
66
+ ## Edge Cases
67
+
68
+ ### 1. Very Short Reviews (< 50 tokens)
69
+ Example: "Works great!" or "Exactly as described"
70
+
71
+ **Handling:** No chunking. Each review becomes a single chunk.
72
+
73
+ **Rationale:** Short reviews are single-topic. The main risk is the LLM over-extrapolating from thin evidence, which HHEM catches.
74
+
75
+ ### 2. HTML Artifacts
76
+ Example: "Great product!<br /><br />Fast shipping.<br />[[VIDEOID:abc123]]"
77
+
78
+ **Handling:** `split_sentences()` replaces `<br />` with spaces. Video IDs pass through.
79
+
80
+ ### 3. Mixed Language Content
81
+ Example: "Muy bueno! Great product."
82
+
83
+ **Handling:** Sentence splitter handles basic mixed content. E5-small primarily trained on English, so non-English chunks may have lower retrieval quality.
84
+
85
+ ### 4. Numbers and Specifications
86
+ Example: "Battery: 8hrs. Weight: 250g. Price: $49.99"
87
+
88
+ **Handling:** Kept together as a single chunk. Specification lists are valuable evidence.
89
+
90
+ ### 5. Sarcasm and Irony
91
+ Example: "Oh yeah, 'great' battery life - lasted 2 whole hours"
92
+
93
+ **Handling:** Not detected. Dense retrievers encode topic, not sentiment. Rating filter is the defense (sarcastic reviews typically have low ratings).
94
+
95
+ ---
96
+
97
+ ## Implementation Reference
98
+
99
+ See `sage/core/chunking.py` for implementation.
sage/services/baselines.py CHANGED
@@ -242,3 +242,54 @@ def load_product_embeddings_from_qdrant() -> dict[str, np.ndarray]:
242
  product_embeddings[product_id] = normalize_vectors(mean_vec)
243
 
244
  return product_embeddings
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
242
  product_embeddings[product_id] = normalize_vectors(mean_vec)
243
 
244
  return product_embeddings
245
+
246
+
247
+ def compute_item_popularity_from_qdrant(
248
+ normalize: bool = True,
249
+ ) -> dict[str, float] | dict[str, int]:
250
+ """
251
+ Compute item popularity (chunk count per product) from Qdrant.
252
+
253
+ This allows computing beyond-accuracy metrics (novelty, diversity)
254
+ without requiring local splits.
255
+
256
+ Args:
257
+ normalize: If True, return probabilities (0-1). If False, return raw counts.
258
+
259
+ Returns:
260
+ Dict mapping product_id to popularity (probability if normalize=True,
261
+ raw count if normalize=False).
262
+ """
263
+ from sage.adapters.vector_store import get_client
264
+
265
+ client = get_client()
266
+
267
+ # Scroll through all points (without vectors for speed)
268
+ counts: dict[str, int] = Counter()
269
+ offset = None
270
+
271
+ while True:
272
+ results, offset = client.scroll(
273
+ collection_name=COLLECTION_NAME,
274
+ limit=1000,
275
+ offset=offset,
276
+ with_vectors=False,
277
+ )
278
+
279
+ for point in results:
280
+ product_id = point.payload.get("product_id")
281
+ if product_id:
282
+ counts[product_id] += 1
283
+
284
+ if offset is None:
285
+ break
286
+
287
+ if not normalize:
288
+ return dict(counts)
289
+
290
+ # Normalize to probabilities
291
+ total = sum(counts.values())
292
+ if total == 0:
293
+ return {}
294
+
295
+ return {product_id: count / total for product_id, count in counts.items()}
scripts/evaluation.py CHANGED
@@ -28,6 +28,7 @@ from sage.services.baselines import (
28
  ItemKNNBaseline,
29
  PopularityBaseline,
30
  RandomBaseline,
 
31
  load_product_embeddings_from_qdrant,
32
  )
33
  from sage.config import get_logger, log_banner, log_section, log_kv
@@ -351,18 +352,25 @@ def main():
351
  total_items = len(item_embeddings)
352
  logger.info("Products in catalog: %d", total_items)
353
 
354
- # Try to load splits for beyond-accuracy metrics (optional)
355
- item_popularity = None
356
  train_records = None
357
  all_products = None
 
358
  try:
359
  train_df, _, _ = load_splits()
360
  train_records = train_df.to_dict("records")
361
  all_products = list(train_df["parent_asin"].unique())
362
  item_popularity = compute_item_popularity(train_records, item_key="parent_asin")
363
- logger.info("Loaded splits for beyond-accuracy metrics")
364
  except FileNotFoundError:
365
- logger.info("Splits not available - beyond-accuracy metrics will be skipped")
 
 
 
 
 
 
 
366
 
367
  # Load eval cases
368
  logger.info("Loading evaluation dataset: %s", args.dataset)
@@ -403,14 +411,20 @@ def main():
403
  "ndcg_at_10": best_ndcg,
404
  }
405
 
406
- # Baseline comparison (requires splits)
407
  if args.baselines:
408
- if train_records is None:
409
- logger.warning(
410
- "Skipping baselines - requires local splits (run 'make splits')"
411
- )
412
- else:
 
 
 
 
413
  run_baseline_comparison(cases, train_records, all_products, item_embeddings)
 
 
414
 
415
  # Save results (uses dataset stem as prefix for both timestamped and latest files)
416
  prefix = Path(args.dataset).stem
 
28
  ItemKNNBaseline,
29
  PopularityBaseline,
30
  RandomBaseline,
31
+ compute_item_popularity_from_qdrant,
32
  load_product_embeddings_from_qdrant,
33
  )
34
  from sage.config import get_logger, log_banner, log_section, log_kv
 
352
  total_items = len(item_embeddings)
353
  logger.info("Products in catalog: %d", total_items)
354
 
355
+ # Try to load splits for baseline comparison (optional)
 
356
  train_records = None
357
  all_products = None
358
+ item_counts = None # Raw counts for baseline comparison
359
  try:
360
  train_df, _, _ = load_splits()
361
  train_records = train_df.to_dict("records")
362
  all_products = list(train_df["parent_asin"].unique())
363
  item_popularity = compute_item_popularity(train_records, item_key="parent_asin")
364
+ logger.info("Loaded splits for baseline comparison")
365
  except FileNotFoundError:
366
+ # Fall back to Qdrant-based popularity for beyond-accuracy metrics
367
+ logger.info("Splits not available - computing popularity from Qdrant")
368
+ item_popularity = compute_item_popularity_from_qdrant(normalize=True)
369
+ item_counts = compute_item_popularity_from_qdrant(normalize=False)
370
+ all_products = list(item_embeddings.keys())
371
+ logger.info(
372
+ "Computed popularity for %d products from Qdrant", len(item_popularity)
373
+ )
374
 
375
  # Load eval cases
376
  logger.info("Loading evaluation dataset: %s", args.dataset)
 
411
  "ndcg_at_10": best_ndcg,
412
  }
413
 
414
+ # Baseline comparison
415
  if args.baselines:
416
+ if train_records is None and item_counts is not None:
417
+ # Create pseudo-interactions from Qdrant counts for baseline comparison
418
+ logger.info("Using Qdrant-based counts for baseline comparison")
419
+ train_records = [
420
+ {"parent_asin": pid}
421
+ for pid, count in item_counts.items()
422
+ for _ in range(count)
423
+ ]
424
+ if train_records is not None:
425
  run_baseline_comparison(cases, train_records, all_products, item_embeddings)
426
+ else:
427
+ logger.warning("Skipping baselines - no data available")
428
 
429
  # Save results (uses dataset stem as prefix for both timestamped and latest files)
430
  prefix = Path(args.dataset).stem
scripts/faithfulness.py CHANGED
@@ -189,6 +189,18 @@ def run_evaluation(n_samples: int, run_ragas: bool = False):
189
  "faithfulness_std": ragas_report.std_score,
190
  }
191
 
 
 
 
 
 
 
 
 
 
 
 
 
192
  ts_file = save_results(results, "faithfulness")
193
  logger.info("Saved: %s", ts_file)
194
 
@@ -348,6 +360,126 @@ def run_adjusted_calculation():
348
  logger.info("Saved: %s", ts_file)
349
 
350
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
351
  # ============================================================================
352
  # Main
353
  # ============================================================================
@@ -361,12 +493,17 @@ def main():
361
  parser.add_argument(
362
  "--adjusted", action="store_true", help="Calculate adjusted metrics"
363
  )
 
 
 
364
  args = parser.parse_args()
365
 
366
  if args.analyze:
367
  run_failure_analysis()
368
  elif args.adjusted:
369
  run_adjusted_calculation()
 
 
370
  else:
371
  run_evaluation(n_samples=args.samples, run_ragas=args.ragas)
372
 
 
189
  "faithfulness_std": ragas_report.std_score,
190
  }
191
 
192
+ # Document RAGAS metric limitations
193
+ results["ragas_limitations"] = {
194
+ "metrics_available": ["faithfulness"],
195
+ "metrics_unavailable": {
196
+ "answer_relevancy": "Requires embeddings model; RAGAS doesn't support Anthropic as embeddings provider",
197
+ "context_precision": "Requires ground-truth reference answers per query (not available)",
198
+ "context_recall": "Requires ground-truth reference answers per query (not available)",
199
+ },
200
+ "primary_metric": "claim_level_hhem",
201
+ "rationale": "Claim-level HHEM (96.8%) is more reliable than full-explanation RAGAS for citation-heavy explanations",
202
+ }
203
+
204
  ts_file = save_results(results, "faithfulness")
205
  logger.info("Saved: %s", ts_file)
206
 
 
360
  logger.info("Saved: %s", ts_file)
361
 
362
 
363
+ # ============================================================================
364
+ # SECTION: Grounding Delta Experiment
365
+ # ============================================================================
366
+
367
+
368
+ def run_grounding_delta():
369
+ """
370
+ Compare HHEM scores WITH vs WITHOUT evidence grounding.
371
+
372
+ This shows the value of RAG: how much does grounding reduce hallucination?
373
+ """
374
+ from sage.adapters.llm import get_llm_client
375
+ from sage.services import get_explanation_services
376
+ from sage.services.retrieval import get_candidates
377
+
378
+ log_banner(logger, "GROUNDING DELTA EXPERIMENT")
379
+ logger.info("Comparing hallucination rates WITH vs WITHOUT evidence grounding")
380
+
381
+ queries = EVALUATION_QUERIES[:10]
382
+ _, detector = get_explanation_services()
383
+ llm = get_llm_client()
384
+
385
+ with_evidence = []
386
+ without_evidence = []
387
+
388
+ for i, query in enumerate(queries, 1):
389
+ logger.info('[%d/%d] "%s"', i, len(queries), query)
390
+
391
+ products = get_candidates(
392
+ query=query,
393
+ k=1,
394
+ min_rating=4.0,
395
+ aggregation=AggregationMethod.MAX,
396
+ )
397
+ if not products:
398
+ continue
399
+
400
+ product = products[0]
401
+
402
+ # Get evidence chunks for this product
403
+ from sage.services.retrieval import retrieve_chunks
404
+
405
+ all_chunks = retrieve_chunks(query, limit=100)
406
+ # Filter to just this product's chunks
407
+ evidence = [c for c in all_chunks if c.product_id == product.product_id][
408
+ :MAX_EVIDENCE
409
+ ]
410
+ evidence_texts = [c.text for c in evidence]
411
+
412
+ if not evidence_texts:
413
+ continue
414
+
415
+ # Generate WITH evidence (grounded)
416
+ system_prompt = "You are a helpful product recommendation assistant."
417
+ grounded_user = f"""Based on customer reviews, explain why this product is good for: "{query}"
418
+
419
+ EVIDENCE FROM REVIEWS:
420
+ {chr(10).join(f"- {t}" for t in evidence_texts[:3])}
421
+
422
+ Write a brief 2-3 sentence recommendation based ONLY on the evidence above."""
423
+
424
+ try:
425
+ grounded_response, _ = llm.generate(system_prompt, grounded_user)
426
+ grounded_hhem = detector.check_explanation(
427
+ evidence_texts, grounded_response
428
+ )
429
+ with_evidence.append(grounded_hhem.score)
430
+ logger.info(" WITH evidence: %.3f", grounded_hhem.score)
431
+ except Exception:
432
+ logger.exception(" Error with grounded generation")
433
+ continue
434
+
435
+ # Generate WITHOUT evidence (ungrounded)
436
+ ungrounded_user = f"""Recommend a product for: "{query}"
437
+
438
+ Write a brief 2-3 sentence recommendation. You may make reasonable assumptions about the product."""
439
+
440
+ try:
441
+ ungrounded_response, _ = llm.generate(system_prompt, ungrounded_user)
442
+ ungrounded_hhem = detector.check_explanation(
443
+ evidence_texts, ungrounded_response
444
+ )
445
+ without_evidence.append(ungrounded_hhem.score)
446
+ logger.info(" WITHOUT evidence: %.3f", ungrounded_hhem.score)
447
+ except Exception:
448
+ logger.exception(" Error with ungrounded generation")
449
+
450
+ # Summary
451
+ log_banner(logger, "GROUNDING DELTA RESULTS")
452
+
453
+ if with_evidence and without_evidence:
454
+ with_mean = np.mean(with_evidence)
455
+ without_mean = np.mean(without_evidence)
456
+ delta = with_mean - without_mean
457
+
458
+ logger.info("Samples: %d", min(len(with_evidence), len(without_evidence)))
459
+ logger.info("WITH evidence (grounded): %.3f mean HHEM", with_mean)
460
+ logger.info("WITHOUT evidence (halluc): %.3f mean HHEM", without_mean)
461
+ logger.info("Delta (grounding benefit): +%.3f", delta)
462
+ logger.info(
463
+ "Interpretation: Grounding %s hallucination by %.1f%%",
464
+ "reduces" if delta > 0 else "increases",
465
+ abs(delta) * 100,
466
+ )
467
+
468
+ # Save results
469
+ results = {
470
+ "n_samples": min(len(with_evidence), len(without_evidence)),
471
+ "with_evidence_mean": float(with_mean),
472
+ "without_evidence_mean": float(without_mean),
473
+ "delta": float(delta),
474
+ "with_evidence_scores": with_evidence,
475
+ "without_evidence_scores": without_evidence,
476
+ }
477
+ ts_file = save_results(results, "grounding_delta")
478
+ logger.info("Saved: %s", ts_file)
479
+ else:
480
+ logger.warning("Not enough samples for comparison")
481
+
482
+
483
  # ============================================================================
484
  # Main
485
  # ============================================================================
 
493
  parser.add_argument(
494
  "--adjusted", action="store_true", help="Calculate adjusted metrics"
495
  )
496
+ parser.add_argument(
497
+ "--delta", action="store_true", help="Run grounding delta experiment"
498
+ )
499
  args = parser.parse_args()
500
 
501
  if args.analyze:
502
  run_failure_analysis()
503
  elif args.adjusted:
504
  run_adjusted_calculation()
505
+ elif args.delta:
506
+ run_grounding_delta()
507
  else:
508
  run_evaluation(n_samples=args.samples, run_ragas=args.ragas)
509
 
scripts/human_eval.py CHANGED
@@ -374,6 +374,18 @@ def analyze_results():
374
  "timestamp": datetime.now().isoformat(),
375
  "n_samples": len(rated),
376
  "n_total": len(samples),
 
 
 
 
 
 
 
 
 
 
 
 
377
  "dimensions": dimensions_results,
378
  "overall_helpfulness": round(overall, 2),
379
  "target": HELPFULNESS_TARGET,
 
374
  "timestamp": datetime.now().isoformat(),
375
  "n_samples": len(rated),
376
  "n_total": len(samples),
377
+ "methodology": {
378
+ "evaluator": "Single rater (developer/researcher)",
379
+ "instructions": "Rate each dimension 1-5 Likert: 1=strongly disagree, 5=strongly agree",
380
+ "dimensions": {
381
+ "comprehension": "I understood why this item was recommended",
382
+ "trust": "I trust this explanation is accurate",
383
+ "usefulness": "This explanation helped me make a decision",
384
+ "satisfaction": "I am satisfied with this explanation",
385
+ },
386
+ "sample_selection": "35 natural queries (balanced by category) + 15 config queries",
387
+ "inter_annotator_agreement": "N/A (single rater)",
388
+ },
389
  "dimensions": dimensions_results,
390
  "overall_helpfulness": round(overall, 2),
391
  "target": HELPFULNESS_TARGET,