vxa8502 committed on
Commit
66926c8
·
1 Parent(s): f9c51d8

Replace EDA with production Qdrant queries

Browse files
Makefile CHANGED
@@ -67,14 +67,12 @@ data-validate:
67
  assert emb is not None and emb.shape[1] == 384, 'Embedding dimension mismatch'; \
68
  print('Validation passed')"
69
 
70
- # Exploratory data analysis (generates figures + report)
71
- eda:
72
- @echo "=== EDA ANALYSIS ==="
73
  @mkdir -p data/figures
74
  @mkdir -p reports
75
  python scripts/eda.py
76
- @echo "Figures saved to data/figures/"
77
- @echo "Report generated: reports/eda_report.md"
78
 
79
  # ---------------------------------------------------------------------------
80
  # Evaluation Suite
@@ -82,16 +80,11 @@ eda:
82
 
83
  # Standard evaluation: primary metrics, spot-checks, explanation tests, faithfulness
84
  eval: check-env
85
- @test -d data/splits || (echo "ERROR: Run 'make data' first" && exit 1)
86
  @echo "=== EVALUATION SUITE ===" && \
87
  echo "" && \
88
- echo "--- Building evaluation datasets ---" && \
89
- python scripts/build_eval_dataset.py && \
90
  python scripts/build_natural_eval_dataset.py && \
91
  echo "" && \
92
- echo "--- Recommendation evaluation (LOO history) ---" && \
93
- python scripts/evaluation.py --dataset eval_loo_history.json --section primary && \
94
- echo "" && \
95
  echo "--- Recommendation evaluation (natural queries) ---" && \
96
  python scripts/evaluation.py --dataset eval_natural_queries.json --section primary && \
97
  echo "" && \
@@ -114,9 +107,6 @@ eval-deep: check-env
114
  @test -d data/eval || (echo "ERROR: Run 'make eval' first to build eval datasets" && exit 1)
115
  @echo "=== DEEP EVALUATION (ablations + baselines) ===" && \
116
  echo "" && \
117
- echo "--- Full recommendation evaluation (LOO history) ---" && \
118
- python scripts/evaluation.py --dataset eval_loo_history.json --section all --baselines && \
119
- echo "" && \
120
  echo "--- Full recommendation evaluation (natural queries) ---" && \
121
  python scripts/evaluation.py --dataset eval_natural_queries.json --section all && \
122
  echo "" && \
@@ -131,11 +121,9 @@ eval-deep: check-env
131
 
132
  # Quick eval: skip RAGAS (faster iteration)
133
  eval-quick: check-env
134
- @test -d data/splits || (echo "ERROR: Run 'make data' first" && exit 1)
135
  @echo "=== QUICK EVALUATION (no RAGAS) ==="
136
- python scripts/build_eval_dataset.py && \
137
  python scripts/build_natural_eval_dataset.py && \
138
- python scripts/evaluation.py --dataset eval_loo_history.json --section primary && \
139
  python scripts/faithfulness.py --samples 5
140
  @echo "Quick eval complete"
141
 
@@ -248,10 +236,10 @@ metrics-snapshot:
248
  @python -c "\
249
  import json; from pathlib import Path; \
250
  r = Path('data/eval_results'); \
251
- loo = json.load(open(r/'eval_loo_history_latest.json', encoding='utf-8')) if (r/'eval_loo_history_latest.json').exists() else {}; \
252
  faith = json.load(open(r/'faithfulness_latest.json', encoding='utf-8')) if (r/'faithfulness_latest.json').exists() else {}; \
253
  human = json.load(open(r/'human_eval_latest.json', encoding='utf-8')) if (r/'human_eval_latest.json').exists() else {}; \
254
- pm = loo.get('primary_metrics', {}); mm = faith.get('multi_metric', {}); \
255
  print('=== SAGE METRICS ==='); \
256
  print(f'NDCG@10: {pm.get(\"ndcg_at_10\", \"n/a\")}'); \
257
  print(f'Claim HHEM: {mm.get(\"claim_level_avg_score\", \"n/a\")}'); \
 
67
  assert emb is not None and emb.shape[1] == 384, 'Embedding dimension mismatch'; \
68
  print('Validation passed')"
69
 
70
+ # Exploratory data analysis (queries production Qdrant)
71
+ eda: check-env
72
+ @echo "=== PRODUCTION EDA ==="
73
  @mkdir -p data/figures
74
  @mkdir -p reports
75
  python scripts/eda.py
 
 
76
 
77
  # ---------------------------------------------------------------------------
78
  # Evaluation Suite
 
80
 
81
  # Standard evaluation: primary metrics, spot-checks, explanation tests, faithfulness
82
  eval: check-env
 
83
  @echo "=== EVALUATION SUITE ===" && \
84
  echo "" && \
85
+ echo "--- Building natural query evaluation dataset ---" && \
 
86
  python scripts/build_natural_eval_dataset.py && \
87
  echo "" && \
 
 
 
88
  echo "--- Recommendation evaluation (natural queries) ---" && \
89
  python scripts/evaluation.py --dataset eval_natural_queries.json --section primary && \
90
  echo "" && \
 
107
  @test -d data/eval || (echo "ERROR: Run 'make eval' first to build eval datasets" && exit 1)
108
  @echo "=== DEEP EVALUATION (ablations + baselines) ===" && \
109
  echo "" && \
 
 
 
110
  echo "--- Full recommendation evaluation (natural queries) ---" && \
111
  python scripts/evaluation.py --dataset eval_natural_queries.json --section all && \
112
  echo "" && \
 
121
 
122
  # Quick eval: skip RAGAS (faster iteration)
123
  eval-quick: check-env
 
124
  @echo "=== QUICK EVALUATION (no RAGAS) ==="
 
125
  python scripts/build_natural_eval_dataset.py && \
126
+ python scripts/evaluation.py --dataset eval_natural_queries.json --section primary && \
127
  python scripts/faithfulness.py --samples 5
128
  @echo "Quick eval complete"
129
 
 
236
  @python -c "\
237
  import json; from pathlib import Path; \
238
  r = Path('data/eval_results'); \
239
+ nq = json.load(open(r/'eval_natural_queries_latest.json', encoding='utf-8')) if (r/'eval_natural_queries_latest.json').exists() else {}; \
240
  faith = json.load(open(r/'faithfulness_latest.json', encoding='utf-8')) if (r/'faithfulness_latest.json').exists() else {}; \
241
  human = json.load(open(r/'human_eval_latest.json', encoding='utf-8')) if (r/'human_eval_latest.json').exists() else {}; \
242
+ pm = nq.get('primary_metrics', {}); mm = faith.get('multi_metric', {}); \
243
  print('=== SAGE METRICS ==='); \
244
  print(f'NDCG@10: {pm.get(\"ndcg_at_10\", \"n/a\")}'); \
245
  print(f'Claim HHEM: {mm.get(\"claim_level_avg_score\", \"n/a\")}'); \
reports/eda_report.md ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Exploratory Data Analysis: Production Data
2
+
3
+ **Source:** Qdrant Cloud (Collection: `sage_reviews`)
4
+ **Status:** green
5
+ **Generated from live production data**
6
+
7
+ ---
8
+
9
+ ## Dataset Overview
10
+
11
+ This report analyzes the actual data deployed in production, ensuring all statistics match what the recommendation system uses.
12
+
13
+ | Metric | Value |
14
+ |--------|-------|
15
+ | Total Chunks | 423,165 |
16
+ | Unique Reviews | 334,282 |
17
+ | Unique Products | 21,827 |
18
+ | Expansion Ratio | 1.27x |
19
+
20
+ ---
21
+
22
+ ## Rating Distribution
23
+
24
+ Amazon reviews exhibit a characteristic J-shaped distribution, heavily skewed toward 5-star ratings.
25
+
26
+ ![Rating Distribution](../data/figures/rating_distribution.png)
27
+
28
+ | Rating | Count | Percentage |
29
+ |--------|-------|------------|
30
+ | 1 | 31,924 | 7.5% |
31
+ | 2 | 21,301 | 5.0% |
32
+ | 3 | 34,078 | 8.1% |
33
+ | 4 | 71,153 | 16.8% |
34
+ | 5 | 264,709 | 62.6% |
35
+
36
+ **Key Observations:**
37
+ - 5-star ratings: 62.6% of chunks
38
+ - 1-star ratings: 7.5% of chunks
39
+ - This polarization is typical for e-commerce review data
40
+
41
+ ---
42
+
43
+ ## Chunk Length Analysis
44
+
45
+ Chunk lengths affect retrieval quality and context window usage.
46
+
47
+ ![Chunk Lengths](../data/figures/chunk_lengths.png)
48
+
49
+ **Statistics:**
50
+ - Median chunk length: 169 characters (~42 tokens)
51
+ - Mean chunk length: 258 characters
52
+ - Most chunks fit comfortably within embedding model context
53
+
54
+ ---
55
+
56
+ ## Chunking Distribution
57
+
58
+ Reviews are chunked based on length: short reviews stay whole, longer reviews are split semantically.
59
+
60
+ ![Chunks per Review](../data/figures/chunks_per_review.png)
61
+
62
+ | Metric | Value |
63
+ |--------|-------|
64
+ | Single-chunk reviews | 303,550 |
65
+ | Multi-chunk reviews | 30,732 |
66
+ | Expansion ratio | 1.27x |
67
+
68
+ **Chunking Strategy:**
69
+ - Reviews < 200 tokens: No chunking (embedded whole)
70
+ - Reviews 200-500 tokens: Semantic chunking
71
+ - Reviews > 500 tokens: Semantic + sliding window
72
+
73
+ ---
74
+
75
+ ## Temporal Distribution
76
+
77
+ Review timestamps enable chronological analysis and temporal evaluation splits.
78
+
79
+ ![Temporal Distribution](../data/figures/temporal_distribution.png)
80
+
81
+ ---
82
+
83
+ ## Data Quality
84
+
85
+ The production dataset has been through 5-core filtering (users and items with 5+ interactions) and quality checks:
86
+
87
+ - All chunks have valid text content
88
+ - All ratings are in [1, 5] range
89
+ - All product identifiers present
90
+ - Deterministic chunk IDs (MD5 hash of review_id + chunk_index)
91
+
92
+ ---
93
+
94
+ ## Summary
95
+
96
+ This production EDA confirms the deployed data characteristics:
97
+
98
+ 1. **Scale:** 423,165 chunks across 21,827 products
99
+ 2. **Quality:** 5-core filtered, validated payloads
100
+ 3. **Distribution:** J-shaped ratings, typical e-commerce pattern
101
+ 4. **Chunking:** 1.27x expansion from reviews to chunks
102
+
103
+ The data matches what the recommendation API queries in real-time.
104
+
105
+ ---
106
+
107
+ *Report generated from Qdrant Cloud. Run `make eda` to regenerate.*
sage/services/__init__.py CHANGED
@@ -31,6 +31,21 @@ from sage.services.cold_start import (
31
  recommend_cold_start_user,
32
  )
33
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  # Evaluation and faithfulness services are loaded lazily to avoid
35
  # pulling in ragas/langchain when only retrieval is needed.
36
  # Import from sage.services.evaluation or sage.services.faithfulness directly.
@@ -75,6 +90,7 @@ __all__ = [
75
  # Explanation
76
  "Explainer",
77
  "explain_recommendations",
 
78
  # Cold-start
79
  "ColdStartService",
80
  "recommend_cold_start_user",
 
31
  recommend_cold_start_user,
32
  )
33
 
34
+
35
+ def get_explanation_services():
36
+ """Initialize Explainer and HallucinationDetector.
37
+
38
+ Centralizes the common pattern of creating both services together.
39
+ Import is deferred to avoid loading heavy models until needed.
40
+
41
+ Returns:
42
+ Tuple of (Explainer, HallucinationDetector) instances.
43
+ """
44
+ from sage.adapters.hhem import HallucinationDetector
45
+
46
+ return Explainer(), HallucinationDetector()
47
+
48
+
49
  # Evaluation and faithfulness services are loaded lazily to avoid
50
  # pulling in ragas/langchain when only retrieval is needed.
51
  # Import from sage.services.evaluation or sage.services.faithfulness directly.
 
90
  # Explanation
91
  "Explainer",
92
  "explain_recommendations",
93
+ "get_explanation_services",
94
  # Cold-start
95
  "ColdStartService",
96
  "recommend_cold_start_user",
scripts/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # Scripts package marker for relative imports.
scripts/build_eval_dataset.py DELETED
@@ -1,660 +0,0 @@
1
- """
2
- Build evaluation dataset from test split using leave-one-out protocol.
3
-
4
- For each user with 2+ reviews in the test set:
5
- 1. Hold out their most recent review (the "target" item)
6
- 2. Generate a query from:
7
- - Keywords extracted from held-out review (simulates search)
8
- - OR user's historical reviews (profile-based)
9
- 3. Create EvalCase with target item as relevant
10
-
11
- Run from project root:
12
- python scripts/build_eval_dataset.py
13
- """
14
-
15
- import re
16
- import json
17
- from collections import Counter
18
- from pathlib import Path
19
-
20
- import pandas as pd
21
- import numpy as np
22
-
23
- from sage.core import EvalCase
24
- from sage.config import DATA_DIR, get_logger, log_banner, log_section
25
- from sage.services.evaluation import rating_to_relevance
26
-
27
- logger = get_logger(__name__)
28
-
29
- EVAL_DIR = DATA_DIR / "eval"
30
-
31
-
32
- # ---------------------------------------------------------------------------
33
- # Query Generation Strategies
34
- # ---------------------------------------------------------------------------
35
-
36
- # Common stopwords to filter out
37
- STOPWORDS = {
38
- "i",
39
- "me",
40
- "my",
41
- "myself",
42
- "we",
43
- "our",
44
- "ours",
45
- "ourselves",
46
- "you",
47
- "your",
48
- "yours",
49
- "yourself",
50
- "yourselves",
51
- "he",
52
- "him",
53
- "his",
54
- "himself",
55
- "she",
56
- "her",
57
- "hers",
58
- "herself",
59
- "it",
60
- "its",
61
- "itself",
62
- "they",
63
- "them",
64
- "their",
65
- "theirs",
66
- "themselves",
67
- "what",
68
- "which",
69
- "who",
70
- "whom",
71
- "this",
72
- "that",
73
- "these",
74
- "those",
75
- "am",
76
- "is",
77
- "are",
78
- "was",
79
- "were",
80
- "be",
81
- "been",
82
- "being",
83
- "have",
84
- "has",
85
- "had",
86
- "having",
87
- "do",
88
- "does",
89
- "did",
90
- "doing",
91
- "a",
92
- "an",
93
- "the",
94
- "and",
95
- "but",
96
- "if",
97
- "or",
98
- "because",
99
- "as",
100
- "until",
101
- "while",
102
- "of",
103
- "at",
104
- "by",
105
- "for",
106
- "with",
107
- "about",
108
- "against",
109
- "between",
110
- "into",
111
- "through",
112
- "during",
113
- "before",
114
- "after",
115
- "above",
116
- "below",
117
- "to",
118
- "from",
119
- "up",
120
- "down",
121
- "in",
122
- "out",
123
- "on",
124
- "off",
125
- "over",
126
- "under",
127
- "again",
128
- "further",
129
- "then",
130
- "once",
131
- "here",
132
- "there",
133
- "when",
134
- "where",
135
- "why",
136
- "how",
137
- "all",
138
- "each",
139
- "few",
140
- "more",
141
- "most",
142
- "other",
143
- "some",
144
- "such",
145
- "no",
146
- "nor",
147
- "not",
148
- "only",
149
- "own",
150
- "same",
151
- "so",
152
- "than",
153
- "too",
154
- "very",
155
- "s",
156
- "t",
157
- "can",
158
- "will",
159
- "just",
160
- "don",
161
- "should",
162
- "now",
163
- "d",
164
- "ll",
165
- "m",
166
- "o",
167
- "re",
168
- "ve",
169
- "y",
170
- "ain",
171
- "aren",
172
- "couldn",
173
- "didn",
174
- "doesn",
175
- "hadn",
176
- "hasn",
177
- "haven",
178
- "isn",
179
- "ma",
180
- "mightn",
181
- "mustn",
182
- "needn",
183
- "shan",
184
- "shouldn",
185
- "wasn",
186
- "weren",
187
- "won",
188
- "wouldn",
189
- "also",
190
- "would",
191
- "could",
192
- "get",
193
- "got",
194
- "one",
195
- "two",
196
- "really",
197
- "like",
198
- "just",
199
- "even",
200
- "well",
201
- "much",
202
- "still",
203
- "back",
204
- "way",
205
- "thing",
206
- "things",
207
- "make",
208
- "made",
209
- "work",
210
- "works",
211
- "worked",
212
- "use",
213
- "used",
214
- "using",
215
- "good",
216
- "great",
217
- "nice",
218
- "product",
219
- "item",
220
- "bought",
221
- "buy",
222
- "amazon",
223
- "review",
224
- "ordered",
225
- "order",
226
- "received",
227
- "came",
228
- "arrived",
229
- "shipping",
230
- "shipped",
231
- }
232
-
233
-
234
- def extract_keywords(text: str, max_keywords: int = 8) -> list[str]:
235
- """
236
- Extract keywords from review text using simple frequency analysis.
237
-
238
- Focuses on nouns and adjectives that describe product attributes.
239
-
240
- Args:
241
- text: Review text.
242
- max_keywords: Maximum keywords to extract.
243
-
244
- Returns:
245
- List of keyword strings.
246
- """
247
- # Clean text
248
- text = text.lower()
249
- text = re.sub(r"<br\s*/?>", " ", text) # Remove HTML breaks
250
- text = re.sub(r"[^a-z\s]", " ", text) # Keep only letters
251
- text = re.sub(r"\s+", " ", text).strip()
252
-
253
- # Tokenize and filter
254
- words = text.split()
255
- words = [w for w in words if len(w) > 2 and w not in STOPWORDS]
256
-
257
- # Count frequencies
258
- counts = Counter(words)
259
-
260
- # Get top keywords
261
- keywords = [word for word, _ in counts.most_common(max_keywords)]
262
-
263
- return keywords
264
-
265
-
266
- def generate_query_from_review(
267
- title: str,
268
- text: str,
269
- max_words: int = 10,
270
- ) -> str:
271
- """
272
- Generate a search query from a review's title and text.
273
-
274
- Combines title keywords with text keywords to create a realistic
275
- query that a user might type to find this product.
276
-
277
- Args:
278
- title: Review title.
279
- text: Review text.
280
- max_words: Maximum words in generated query.
281
-
282
- Returns:
283
- Query string.
284
- """
285
- # Extract from title (usually more specific)
286
- title_keywords = extract_keywords(title or "", max_keywords=4)
287
-
288
- # Extract from text
289
- text_keywords = extract_keywords(text or "", max_keywords=8)
290
-
291
- # Combine, prioritizing title
292
- all_keywords = []
293
- seen = set()
294
-
295
- for kw in title_keywords + text_keywords:
296
- if kw not in seen:
297
- all_keywords.append(kw)
298
- seen.add(kw)
299
-
300
- # Limit length
301
- query_words = all_keywords[:max_words]
302
-
303
- return " ".join(query_words) if query_words else "electronics product"
304
-
305
-
306
- def generate_query_from_history(
307
- reviews: list[dict],
308
- max_words: int = 15,
309
- ) -> str:
310
- """
311
- Generate a query from user's review history (profile-based).
312
-
313
- Concatenates positive review texts and extracts common themes.
314
-
315
- Args:
316
- reviews: List of review dicts with 'text' and 'rating' keys.
317
- max_words: Maximum words in generated query.
318
-
319
- Returns:
320
- Query string.
321
- """
322
- # Filter to positive reviews
323
- positive = [r for r in reviews if r.get("rating", 0) >= 4]
324
- if not positive:
325
- positive = reviews
326
-
327
- # Combine texts
328
- combined_text = " ".join(r.get("text", "")[:500] for r in positive[:5])
329
-
330
- # Extract keywords
331
- keywords = extract_keywords(combined_text, max_keywords=max_words)
332
-
333
- return " ".join(keywords) if keywords else "electronics product"
334
-
335
-
336
- # ---------------------------------------------------------------------------
337
- # Evaluation Dataset Construction
338
- # ---------------------------------------------------------------------------
339
-
340
-
341
- def build_leave_one_out_cases(
342
- df: pd.DataFrame,
343
- min_reviews: int = 2,
344
- query_strategy: str = "keyword",
345
- verbose: bool = True,
346
- ) -> list[EvalCase]:
347
- """
348
- Build evaluation cases using leave-one-out protocol.
349
-
350
- For each user with enough reviews:
351
- 1. Sort reviews by timestamp
352
- 2. Hold out the most recent review as target
353
- 3. Generate query based on strategy
354
- 4. Create EvalCase with graded relevance
355
-
356
- Args:
357
- df: DataFrame with review data.
358
- min_reviews: Minimum reviews per user to include.
359
- query_strategy: "keyword" (from target) or "history" (from past reviews).
360
- verbose: Print progress.
361
-
362
- Returns:
363
- List of EvalCase objects.
364
- """
365
- if verbose:
366
- logger.info("Building eval cases with strategy: %s", query_strategy)
367
- logger.info("Minimum reviews per user: %d", min_reviews)
368
-
369
- # Group by user
370
- user_groups = df.groupby("user_id")
371
-
372
- eval_cases = []
373
- skipped_users = 0
374
-
375
- for user_id, group in user_groups:
376
- if len(group) < min_reviews:
377
- skipped_users += 1
378
- continue
379
-
380
- # Sort by timestamp (ascending)
381
- group = group.sort_values("timestamp")
382
- reviews = group.to_dict("records")
383
-
384
- # Hold out the most recent review
385
- target_review = reviews[-1]
386
- history_reviews = reviews[:-1]
387
-
388
- # Generate query
389
- if query_strategy == "keyword":
390
- query = generate_query_from_review(
391
- title=target_review.get("title", ""),
392
- text=target_review.get("text", ""),
393
- )
394
- elif query_strategy == "history":
395
- query = generate_query_from_history(history_reviews)
396
- else:
397
- raise ValueError(f"Unknown query strategy: {query_strategy}")
398
-
399
- # Build relevance dict
400
- # Target item gets relevance based on rating
401
- target_product = target_review.get("parent_asin")
402
- target_rating = target_review.get("rating", 3)
403
- relevance = rating_to_relevance(target_rating)
404
-
405
- # Only include if target has positive relevance
406
- if relevance > 0:
407
- eval_cases.append(
408
- EvalCase(
409
- query=query,
410
- relevant_items={target_product: relevance},
411
- user_id=user_id,
412
- )
413
- )
414
-
415
- if verbose:
416
- logger.info("Users with enough reviews: %d", len(user_groups) - skipped_users)
417
- logger.info("Eval cases created: %d", len(eval_cases))
418
- logger.info(
419
- "Skipped (low relevance): %d",
420
- len(user_groups) - skipped_users - len(eval_cases),
421
- )
422
-
423
- return eval_cases
424
-
425
-
426
- def build_multi_relevant_cases(
427
- df: pd.DataFrame,
428
- train_df: pd.DataFrame,
429
- min_test_reviews: int = 1,
430
- verbose: bool = True,
431
- ) -> list[EvalCase]:
432
- """
433
- Build cases where ALL user's test reviews are relevant.
434
-
435
- Uses user's training history to generate query, and ALL their
436
- test reviews as relevant items. Better for users with multiple
437
- test items.
438
-
439
- Args:
440
- df: Test split DataFrame.
441
- train_df: Training split DataFrame.
442
- min_test_reviews: Minimum test reviews to include user.
443
- verbose: Print progress.
444
-
445
- Returns:
446
- List of EvalCase objects.
447
- """
448
- if verbose:
449
- logger.info("Building multi-relevant eval cases...")
450
-
451
- # Get users with training history
452
- train_users = set(train_df["user_id"].unique())
453
-
454
- # Group test reviews by user
455
- test_groups = df.groupby("user_id")
456
-
457
- eval_cases = []
458
-
459
- for user_id, group in test_groups:
460
- if len(group) < min_test_reviews:
461
- continue
462
-
463
- # Skip if no training history
464
- if user_id not in train_users:
465
- continue
466
-
467
- # Get training reviews for query generation
468
- user_train = train_df[train_df["user_id"] == user_id]
469
- train_reviews = user_train.to_dict("records")
470
-
471
- if not train_reviews:
472
- continue
473
-
474
- # Generate query from training history
475
- query = generate_query_from_history(train_reviews)
476
-
477
- # All test reviews are relevant
478
- relevant_items = {}
479
- for row in group.to_dict("records"):
480
- product_id = row["parent_asin"]
481
- rating = row["rating"]
482
- relevance = rating_to_relevance(rating)
483
- if relevance > 0:
484
- # Take max relevance if product appears multiple times
485
- relevant_items[product_id] = max(
486
- relevant_items.get(product_id, 0),
487
- relevance,
488
- )
489
-
490
- if relevant_items:
491
- eval_cases.append(
492
- EvalCase(
493
- query=query,
494
- relevant_items=relevant_items,
495
- user_id=user_id,
496
- )
497
- )
498
-
499
- if verbose:
500
- logger.info("Users with train history: %d", len(train_users))
501
- logger.info("Eval cases created: %d", len(eval_cases))
502
- avg_relevant = (
503
- np.mean([len(c.relevant_items) for c in eval_cases]) if eval_cases else 0
504
- )
505
- logger.info("Avg relevant items per case: %.1f", avg_relevant)
506
-
507
- return eval_cases
508
-
509
-
510
- def save_eval_cases(
511
- cases: list[EvalCase],
512
- filename: str,
513
- verbose: bool = True,
514
- ) -> Path:
515
- """
516
- Save evaluation cases to JSON file.
517
-
518
- Args:
519
- cases: List of EvalCase objects.
520
- filename: Output filename (without directory).
521
- verbose: Print confirmation.
522
-
523
- Returns:
524
- Path to saved file.
525
- """
526
- EVAL_DIR.mkdir(exist_ok=True)
527
- filepath = EVAL_DIR / filename
528
-
529
- # Convert to serializable format
530
- data = [
531
- {
532
- "query": c.query,
533
- "relevant_items": c.relevant_items,
534
- "user_id": c.user_id,
535
- }
536
- for c in cases
537
- ]
538
-
539
- with open(filepath, "w", encoding="utf-8") as f:
540
- json.dump(data, f, indent=2)
541
-
542
- if verbose:
543
- logger.info("Saved %d eval cases to: %s", len(cases), filepath)
544
-
545
- return filepath
546
-
547
-
548
- def load_eval_cases(filename: str) -> list[EvalCase]:
549
- """
550
- Load evaluation cases from JSON file.
551
-
552
- Args:
553
- filename: Filename in eval directory.
554
-
555
- Returns:
556
- List of EvalCase objects.
557
- """
558
- filepath = EVAL_DIR / filename
559
-
560
- with open(filepath, encoding="utf-8") as f:
561
- data = json.load(f)
562
-
563
- return [
564
- EvalCase(
565
- query=d["query"],
566
- relevant_items=d["relevant_items"],
567
- user_id=d.get("user_id"),
568
- )
569
- for d in data
570
- ]
571
-
572
-
573
- # ---------------------------------------------------------------------------
574
- # Main
575
- # ---------------------------------------------------------------------------
576
-
577
- if __name__ == "__main__":
578
- from sage.data import load_splits
579
-
580
- log_banner(logger, "BUILD EVALUATION DATASET")
581
-
582
- # Load splits
583
- log_section(logger, "Loading data splits")
584
- train_df, val_df, test_df = load_splits()
585
- logger.info(
586
- "Train: %s | Val: %s | Test: %s",
587
- f"{len(train_df):,}",
588
- f"{len(val_df):,}",
589
- f"{len(test_df):,}",
590
- )
591
-
592
- # Strategy 1: Leave-one-out with keyword queries
593
- # WARNING: This strategy has TARGET LEAKAGE - queries are generated from
594
- # the held-out review itself. Only use as a retrieval sanity check,
595
- # NOT for measuring recommendation quality.
596
- log_section(logger, "Strategy 1: Leave-One-Out (Keyword Queries)")
597
- logger.warning("Target leakage - use for sanity check only!")
598
-
599
- loo_keyword_cases = build_leave_one_out_cases(
600
- test_df,
601
- min_reviews=2,
602
- query_strategy="keyword",
603
- )
604
-
605
- # Show examples
606
- logger.info("Sample queries:")
607
- for case in loo_keyword_cases[:5]:
608
- logger.info(' Query: "%s"', case.query)
609
- logger.info(
610
- " Target: %s (rel=%s)",
611
- list(case.relevant_items.keys())[0],
612
- list(case.relevant_items.values())[0],
613
- )
614
-
615
- save_eval_cases(loo_keyword_cases, "eval_loo_keyword.json")
616
-
617
- # Strategy 2: Leave-one-out with history queries
618
- log_section(logger, "Strategy 2: Leave-One-Out (History Queries)")
619
-
620
- loo_history_cases = build_leave_one_out_cases(
621
- test_df,
622
- min_reviews=2,
623
- query_strategy="history",
624
- )
625
-
626
- # Show examples
627
- logger.info("Sample queries:")
628
- for case in loo_history_cases[:5]:
629
- logger.info(' Query: "%s"', case.query)
630
- logger.info(
631
- " Target: %s (rel=%s)",
632
- list(case.relevant_items.keys())[0],
633
- list(case.relevant_items.values())[0],
634
- )
635
-
636
- save_eval_cases(loo_history_cases, "eval_loo_history.json")
637
-
638
- # Strategy 3: Multi-relevant (all test items)
639
- log_section(logger, "Strategy 3: Multi-Relevant (Train->Test)")
640
-
641
- multi_cases = build_multi_relevant_cases(
642
- test_df,
643
- train_df,
644
- min_test_reviews=1,
645
- )
646
-
647
- if multi_cases:
648
- logger.info("Sample queries:")
649
- for case in multi_cases[:3]:
650
- logger.info(' Query: "%s..."', case.query[:60])
651
- logger.info(" Relevant: %d items", len(case.relevant_items))
652
-
653
- save_eval_cases(multi_cases, "eval_multi_relevant.json")
654
-
655
- # Summary
656
- log_banner(logger, "EVALUATION DATASETS CREATED")
657
- logger.info(" eval_loo_keyword.json: %d cases", len(loo_keyword_cases))
658
- logger.info(" eval_loo_history.json: %d cases", len(loo_history_cases))
659
- logger.info(" eval_multi_relevant.json: %d cases", len(multi_cases))
660
- logger.info(" Location: %s", EVAL_DIR)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scripts/demo.py CHANGED
@@ -44,7 +44,7 @@ def demo_recommendation(query: str, top_k: int = 3, max_evidence: int = 3):
44
  return None
45
 
46
  # Initialize services
47
- from scripts.lib.services import get_explanation_services
48
 
49
  explainer, detector = get_explanation_services()
50
 
 
44
  return None
45
 
46
  # Initialize services
47
+ from sage.services import get_explanation_services
48
 
49
  explainer, detector = get_explanation_services()
50
 
scripts/e2e_success_rate.py CHANGED
@@ -104,7 +104,7 @@ class E2EReport:
104
 
105
  def run_e2e_evaluation(n_samples: int = 20) -> E2EReport:
106
  """Run end-to-end success rate evaluation."""
107
- from scripts.lib.services import get_explanation_services
108
  from sage.services.faithfulness import (
109
  is_refusal,
110
  is_mismatch_warning,
 
104
 
105
  def run_e2e_evaluation(n_samples: int = 20) -> E2EReport:
106
  """Run end-to-end success rate evaluation."""
107
+ from sage.services import get_explanation_services
108
  from sage.services.faithfulness import (
109
  is_refusal,
110
  is_mismatch_warning,
scripts/eda.py CHANGED
@@ -1,21 +1,46 @@
1
- # %% [markdown]
2
- # # Exploratory Data Analysis
3
- #
4
- # Analyze the Amazon Electronics reviews dataset to understand
5
- # data distributions, quality issues, and inform modeling decisions.
 
 
 
 
 
 
 
 
 
6
 
7
- # %% Imports
 
 
 
 
8
  from pathlib import Path
9
 
10
- import pandas as pd
 
 
 
 
 
 
 
 
 
11
  import matplotlib.pyplot as plt
 
12
 
13
- from sage.config import CHARS_PER_TOKEN, DEV_SUBSET_SIZE, DATA_DIR
14
- from sage.data import load_reviews, get_review_stats, prepare_data
15
 
16
- # Output directory for figures
17
  FIGURES_DIR = DATA_DIR / "figures"
18
- FIGURES_DIR.mkdir(exist_ok=True)
 
 
 
19
 
20
  # Plot configuration
21
  plt.style.use("seaborn-v0_8-whitegrid")
@@ -23,7 +48,7 @@ plt.rcParams.update(
23
  {
24
  "figure.figsize": (10, 5),
25
  "figure.dpi": 100,
26
- "savefig.dpi": 300, # High-res for markdown reports
27
  "savefig.bbox": "tight",
28
  "savefig.pad_inches": 0.1,
29
  "font.size": 11,
@@ -33,481 +58,412 @@ plt.rcParams.update(
33
  }
34
  )
35
 
36
- # Enable retina display for Jupyter notebooks
37
- try:
38
- from IPython import get_ipython
39
-
40
- if get_ipython() is not None:
41
- get_ipython().run_line_magic("matplotlib", "inline")
42
- get_ipython().run_line_magic("config", "InlineBackend.figure_format='retina'")
43
- except (ImportError, AttributeError):
44
- pass
45
-
46
  PRIMARY_COLOR = "#05A0D1"
47
  SECONDARY_COLOR = "#FF9900"
48
  FIGURE_SIZE_WIDE = (12, 5)
49
 
50
- # %% Load data
51
- df = load_reviews(subset_size=DEV_SUBSET_SIZE)
52
- print(f"Loaded {len(df):,} reviews")
53
-
54
- # %% Basic statistics
55
- stats = get_review_stats(df)
56
- print("\n=== Dataset Overview ===")
57
- for key, value in stats.items():
58
- if isinstance(value, float):
59
- print(f"{key}: {value:.2f}")
60
- else:
61
- print(f"{key}: {value}")
62
-
63
- # %% Rating distribution
64
- fig, ax = plt.subplots()
65
- rating_counts = pd.Series(stats["rating_dist"])
66
- bars = ax.bar(
67
- rating_counts.index, rating_counts.values, color=PRIMARY_COLOR, edgecolor="black"
68
- )
69
- ax.set_xlabel("Rating")
70
- ax.set_ylabel("Count")
71
- ax.set_title("Rating Distribution")
72
- ax.set_xticks(rating_counts.index)
73
-
74
- for bar, count in zip(bars, rating_counts.values, strict=True):
75
- ax.text(
76
- bar.get_x() + bar.get_width() / 2,
77
- bar.get_height() + 50,
78
- f"{count:,}",
79
- ha="center",
80
- va="bottom",
81
- fontsize=10,
82
- )
83
-
84
- plt.savefig(FIGURES_DIR / "rating_distribution.png")
85
-
86
- print("\nRating breakdown:")
87
- for rating, count in rating_counts.items():
88
- pct = count / len(df) * 100
89
- print(f" {int(rating)} stars: {count:,} ({pct:.1f}%)")
90
-
91
- # %% Review length analysis
92
- df["text_length"] = df["text"].str.len()
93
- df["word_count"] = df["text"].str.split().str.len()
94
- df["estimated_tokens"] = df["text_length"] // CHARS_PER_TOKEN
95
-
96
- fig, axes = plt.subplots(1, 2, figsize=FIGURE_SIZE_WIDE)
97
-
98
- # Character length histogram
99
- ax1 = axes[0]
100
- df["text_length"].clip(upper=2000).hist(
101
- bins=50, ax=ax1, color=PRIMARY_COLOR, edgecolor="white"
102
- )
103
- ax1.set_xlabel("Character Length (clipped at 2000)")
104
- ax1.set_ylabel("Count")
105
- ax1.set_title("Review Length Distribution")
106
- ax1.axvline(
107
- df["text_length"].median(),
108
- color="red",
109
- linestyle="--",
110
- label=f"Median: {df['text_length'].median():.0f}",
111
- )
112
- ax1.legend()
113
-
114
- # Token estimate histogram
115
- ax2 = axes[1]
116
- df["estimated_tokens"].clip(upper=500).hist(
117
- bins=50, ax=ax2, color=SECONDARY_COLOR, edgecolor="white"
118
- )
119
- ax2.set_xlabel("Estimated Tokens (clipped at 500)")
120
- ax2.set_ylabel("Count")
121
- ax2.set_title("Estimated Token Distribution")
122
- ax2.axvline(200, color="red", linestyle="--", label="Chunking threshold (200)")
123
- ax2.legend()
124
-
125
- plt.savefig(FIGURES_DIR / "review_lengths.png")
126
-
127
- needs_chunking = (df["estimated_tokens"] > 200).sum()
128
- print("\nReview length stats:")
129
- print(f" Median characters: {df['text_length'].median():.0f}")
130
- print(f" Median tokens (est): {df['estimated_tokens'].median():.0f}")
131
- print(
132
- f" Reviews > 200 tokens: {needs_chunking:,} ({needs_chunking / len(df) * 100:.1f}%)"
133
- )
134
-
135
- # %% Review length by rating
136
- fig, ax = plt.subplots()
137
- length_by_rating = df.groupby("rating")["text_length"].median()
138
- bars = ax.bar(
139
- length_by_rating.index,
140
- length_by_rating.values,
141
- color=PRIMARY_COLOR,
142
- edgecolor="white",
143
- )
144
- ax.set_xlabel("Rating")
145
- ax.set_ylabel("Median Review Length (chars)")
146
- ax.set_title("Review Length by Rating")
147
- ax.set_xticks([1, 2, 3, 4, 5])
148
-
149
- plt.savefig(FIGURES_DIR / "length_by_rating.png")
150
-
151
- print("\nMedian review length by rating:")
152
- for rating, length in length_by_rating.items():
153
- print(f" {int(rating)} stars: {length:.0f} chars")
154
-
155
- # %% Temporal analysis
156
- df["datetime"] = pd.to_datetime(df["timestamp"], unit="ms")
157
- df["year_month"] = df["datetime"].dt.to_period("M")
158
-
159
- reviews_over_time = df.groupby("year_month").size()
160
-
161
- fig, ax = plt.subplots(figsize=FIGURE_SIZE_WIDE)
162
- reviews_over_time.plot(
163
- kind="line", ax=ax, marker="o", markersize=3, linewidth=1, color=PRIMARY_COLOR
164
- )
165
- ax.set_xlabel("Month")
166
- ax.set_ylabel("Number of Reviews")
167
- ax.set_title("Reviews Over Time")
168
- plt.xticks(rotation=45)
169
-
170
- plt.savefig(FIGURES_DIR / "reviews_over_time.png")
171
-
172
- print("\nTemporal range:")
173
- print(f" Earliest: {df['datetime'].min()}")
174
- print(f" Latest: {df['datetime'].max()}")
175
-
176
- # %% Data quality checks
177
- print("\n=== Data Quality Checks ===")
178
-
179
- # Missing values
180
- missing = df.isnull().sum()
181
- print("\nMissing values:")
182
- for col, count in missing.items():
183
- if count > 0:
184
- print(f" {col}: {count:,} ({count / len(df) * 100:.2f}%)")
185
- if missing.sum() == 0:
186
- print(" None!")
187
-
188
- # Empty reviews
189
- empty_reviews = (df["text"].str.strip() == "").sum()
190
- print(f"\nEmpty reviews: {empty_reviews:,}")
191
-
192
- # Very short reviews (< 10 chars)
193
- very_short = (df["text_length"] < 10).sum()
194
- print(f"Very short reviews (<10 chars): {very_short:,}")
195
-
196
- # Duplicate reviews
197
- duplicate_texts = df["text"].duplicated().sum()
198
- print(f"Duplicate review texts: {duplicate_texts:,}")
199
-
200
- # Verified vs unverified
201
- if "verified_purchase" in df.columns:
202
- verified_pct = df["verified_purchase"].mean() * 100
203
- print(f"\nVerified purchases: {verified_pct:.1f}%")
204
-
205
- # %% User and item coverage
206
- user_counts = df["user_id"].value_counts()
207
- item_counts = df["parent_asin"].value_counts()
208
-
209
- fig, axes = plt.subplots(1, 2, figsize=FIGURE_SIZE_WIDE)
210
-
211
- # Reviews per user
212
- ax1 = axes[0]
213
- user_counts.clip(upper=20).value_counts().sort_index().plot(
214
- kind="bar", ax=ax1, color=PRIMARY_COLOR
215
- )
216
- ax1.set_xlabel("Reviews per User")
217
- ax1.set_ylabel("Number of Users")
218
- ax1.set_title("User Activity Distribution")
219
-
220
- # Reviews per item
221
- ax2 = axes[1]
222
- item_counts.clip(upper=20).value_counts().sort_index().plot(
223
- kind="bar", ax=ax2, color=SECONDARY_COLOR
224
- )
225
- ax2.set_xlabel("Reviews per Item")
226
- ax2.set_ylabel("Number of Items")
227
- ax2.set_title("Item Popularity Distribution")
228
-
229
- plt.savefig(FIGURES_DIR / "user_item_distribution.png")
230
-
231
- print("\nUser activity:")
232
- print(
233
- f" Users with 1 review: {(user_counts == 1).sum():,} ({(user_counts == 1).sum() / len(user_counts) * 100:.1f}%)"
234
- )
235
- print(f" Users with 5+ reviews: {(user_counts >= 5).sum():,}")
236
- print(f" Max reviews by one user: {user_counts.max()}")
237
 
238
- print("\nItem popularity:")
239
- print(
240
- f" Items with 1 review: {(item_counts == 1).sum():,} ({(item_counts == 1).sum() / len(item_counts) * 100:.1f}%)"
241
- )
242
- print(f" Items with 5+ reviews: {(item_counts >= 5).sum():,}")
243
- print(f" Max reviews for one item: {item_counts.max()}")
244
-
245
- # %% 5-core eligibility
246
- users_5plus = set(user_counts[user_counts >= 5].index)
247
- items_5plus = set(item_counts[item_counts >= 5].index)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
248
 
249
- eligible_mask = df["user_id"].isin(users_5plus) & df["parent_asin"].isin(items_5plus)
250
- print("\n5-core filtering preview:")
251
- print(
252
- f" Reviews eligible (first pass): {eligible_mask.sum():,} ({eligible_mask.sum() / len(df) * 100:.1f}%)"
253
- )
254
 
255
- # %% Sample reviews across length buckets
256
- print("\n=== Sample Reviews by Length Bucket ===")
257
- print("(Understanding content patterns before chunking)\n")
258
-
259
- length_buckets = [
260
- (0, 50, "Very short (0-50 tokens)"),
261
- (50, 100, "Short (50-100 tokens)"),
262
- (100, 200, "Medium (100-200 tokens)"),
263
- (200, 400, "Long (200-400 tokens)"),
264
- (400, float("inf"), "Very long (400+ tokens)"),
265
- ]
266
-
267
- for min_tok, max_tok, label in length_buckets:
268
- bucket_mask = (df["estimated_tokens"] >= min_tok) & (
269
- df["estimated_tokens"] < max_tok
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
270
  )
271
- bucket_df = df[bucket_mask]
272
-
273
- if len(bucket_df) == 0:
274
- print(f"{label}: No reviews")
275
- continue
276
-
277
- print(
278
- f"{label}: {len(bucket_df):,} reviews ({len(bucket_df) / len(df) * 100:.1f}%)"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
279
  )
280
 
281
- samples = bucket_df.sample(min(3, len(bucket_df)), random_state=42)
282
- for _, row in samples.iterrows():
283
- rating = int(row["rating"])
284
- tokens = row["estimated_tokens"]
285
- text = row["text"][:200] + "..." if len(row["text"]) > 200 else row["text"]
286
- text = text.replace("\n", " ")
287
- print(f" [{rating}*] ({tokens} tok) {text}")
288
- print()
289
-
290
- # %% Prepared data comparison
291
- print("\n=== Prepared Data (what the model sees) ===")
292
- df_prepared = prepare_data(subset_size=DEV_SUBSET_SIZE, verbose=False)
293
- prepared_stats = get_review_stats(df_prepared)
294
-
295
- print(f"Raw reviews: {len(df):,}")
296
- print(
297
- f"Prepared reviews: {len(df_prepared):,} ({len(df_prepared) / len(df) * 100:.1f}% retained)"
298
- )
299
- print(f"Unique users: {prepared_stats['unique_users']:,}")
300
- print(f"Unique items: {prepared_stats['unique_items']:,}")
301
- print(
302
- f"Avg rating: {prepared_stats['avg_rating']:.2f} (raw: {stats['avg_rating']:.2f})"
303
- )
304
 
305
- # %% Summary
306
- print("\n" + "=" * 50)
307
- print("EDA SUMMARY")
308
- print("=" * 50)
309
- print(f"Total reviews: {len(df):,}")
310
- print(f"Unique users: {df['user_id'].nunique():,}")
311
- print(f"Unique items: {df['parent_asin'].nunique():,}")
312
- print(f"Average rating: {df['rating'].mean():.2f}")
313
- print(
314
- f"Reviews needing chunking: {needs_chunking:,} ({needs_chunking / len(df) * 100:.1f}%)"
315
- )
316
- print(f"Data quality issues: {empty_reviews + very_short + duplicate_texts}")
317
- print(f"\nPlots saved to: {FIGURES_DIR}")
318
 
319
- # %% Generate markdown report
320
- REPORTS_DIR = Path("reports")
321
- REPORTS_DIR.mkdir(exist_ok=True)
 
 
 
 
 
322
 
323
- # Compute all stats for report
324
- raw_total = len(df)
325
- prepared_total = len(df_prepared)
326
- unique_users_raw = df["user_id"].nunique()
327
- unique_items_raw = df["parent_asin"].nunique()
328
- unique_users_prepared = prepared_stats["unique_users"]
329
- unique_items_prepared = prepared_stats["unique_items"]
330
- avg_rating_raw = stats["avg_rating"]
331
- avg_rating_prepared = prepared_stats["avg_rating"]
332
- retention_pct = prepared_total / raw_total * 100
333
-
334
- median_chars = df["text_length"].median()
335
- mean_chars = df["text_length"].mean()
336
- median_tokens = df["estimated_tokens"].median()
337
- chunking_pct = needs_chunking / len(df) * 100
338
-
339
- five_star_pct = rating_counts.get(5, 0) / len(df) * 100
340
- one_star_pct = rating_counts.get(1, 0) / len(df) * 100
341
- middle_pct = 100 - five_star_pct - one_star_pct
342
-
343
- users_one_review = (user_counts == 1).sum()
344
- users_one_review_pct = users_one_review / len(user_counts) * 100
345
- users_5plus = (user_counts >= 5).sum()
346
- max_user_reviews = user_counts.max()
347
-
348
- items_one_review = (item_counts == 1).sum()
349
- items_one_review_pct = items_one_review / len(item_counts) * 100
350
- items_5plus = (item_counts >= 5).sum()
351
- max_item_reviews = item_counts.max()
352
-
353
- length_1star = length_by_rating.get(1, 0)
354
- length_2star = length_by_rating.get(2, 0)
355
- length_3star = length_by_rating.get(3, 0)
356
- length_4star = length_by_rating.get(4, 0)
357
- length_5star = length_by_rating.get(5, 0)
358
-
359
- report_content = f"""# Exploratory Data Analysis: Amazon Electronics Reviews
360
-
361
- **Dataset:** McAuley-Lab/Amazon-Reviews-2023 (Electronics category)
362
- **Subset:** {raw_total:,} raw reviews -> {prepared_total:,} after 5-core filtering
363
 
364
  ---
365
 
366
  ## Dataset Overview
367
 
368
- The Amazon Electronics reviews dataset provides rich user feedback data for building recommendation systems. After standard preprocessing and 5-core filtering (requiring users and items to have at least 5 interactions), the dataset exhibits the characteristic sparsity of real-world recommendation scenarios.
369
 
370
- | Metric | Raw | After 5-Core |
371
- |--------|-----|--------------|
372
- | Total Reviews | {raw_total:,} | {prepared_total:,} |
373
- | Unique Users | {unique_users_raw:,} | {unique_users_prepared:,} |
374
- | Unique Items | {unique_items_raw:,} | {unique_items_prepared:,} |
375
- | Avg Rating | {avg_rating_raw:.2f} | {avg_rating_prepared:.2f} |
376
- | Retention | - | {retention_pct:.1f}% |
377
 
378
  ---
379
 
380
  ## Rating Distribution
381
 
382
- Amazon reviews exhibit a well-known J-shaped distribution, heavily skewed toward 5-star ratings. This reflects both genuine satisfaction and selection bias (dissatisfied customers often don't leave reviews).
383
 
384
  ![Rating Distribution](../data/figures/rating_distribution.png)
385
 
 
 
 
 
386
  **Key Observations:**
387
- - 5-star ratings dominate ({five_star_pct:.1f}% of reviews)
388
- - 1-star reviews form the second largest group ({one_star_pct:.1f}%)
389
- - Middle ratings (2-4 stars) are relatively rare ({middle_pct:.1f}% combined)
390
  - This polarization is typical for e-commerce review data
391
 
392
- **Implications for Modeling:**
393
- - Binary classification (positive/negative) may be more robust than regression
394
- - Rating-weighted aggregation should account for the skewed distribution
395
- - Evidence from 4-5 star reviews carries stronger positive signal
396
-
397
  ---
398
 
399
- ## Review Length Analysis
400
-
401
- Review length varies significantly and correlates with the chunking strategy for the RAG pipeline. Most reviews are short enough to embed directly without chunking.
402
 
403
- ![Review Length Distribution](../data/figures/review_lengths.png)
404
 
405
- **Length Statistics:**
406
- - Median: {median_chars:.0f} characters (~{median_tokens:.0f} tokens)
407
- - Mean: {mean_chars:.0f} characters (~{mean_chars / 4:.0f} tokens)
408
- - Reviews exceeding 200 tokens: {chunking_pct:.1f}% (require chunking)
409
 
410
- **Chunking Strategy Validation:**
411
- The tiered chunking approach is well-suited to this distribution:
412
- - **Short (<200 tokens):** No chunking needed - majority of reviews
413
- - **Medium (200-500 tokens):** Semantic chunking at topic boundaries
414
- - **Long (>500 tokens):** Semantic + sliding window fallback
415
 
416
  ---
417
 
418
- ## Review Length by Rating
419
 
420
- Negative reviews tend to be longer than positive ones. Users who are dissatisfied often provide detailed explanations of issues, while satisfied users may simply express approval.
421
 
422
- ![Review Length by Rating](../data/figures/length_by_rating.png)
423
 
424
- **Pattern:**
425
- - 1-star reviews: {length_1star:.0f} chars median
426
- - 2-3 star reviews: {length_2star:.0f}-{length_3star:.0f} chars median (users explain nuance)
427
- - 4-star reviews: {length_4star:.0f} chars median
428
- - 5-star reviews: {length_5star:.0f} chars median
429
 
430
- **Implications:**
431
- - Negative reviews provide richer evidence for issue identification
432
- - Positive reviews may require multiple chunks for substantive explanations
433
- - Rating filters (min_rating=4) naturally bias toward shorter evidence
434
 
435
  ---
436
 
437
  ## Temporal Distribution
438
 
439
- The dataset spans multiple years of reviews, enabling proper temporal train/validation/test splits that prevent data leakage.
440
 
441
- ![Reviews Over Time](../data/figures/reviews_over_time.png)
442
 
443
- **Temporal Split Strategy:**
444
- - **Train (70%):** Oldest reviews - model learns from historical patterns
445
- - **Validation (10%):** Middle period - hyperparameter tuning
446
- - **Test (20%):** Most recent - simulates production deployment
447
 
448
- This chronological ordering ensures the model never sees "future" data during training.
449
 
450
- ---
451
 
452
- ## User and Item Activity
 
 
 
453
 
454
- The long-tail distribution is pronounced: most users write few reviews, and most items receive few reviews. This sparsity is the fundamental challenge recommendation systems address.
455
 
456
- ![User and Item Distribution](../data/figures/user_item_distribution.png)
457
 
458
- **User Activity:**
459
- - Users with only 1 review: {users_one_review_pct:.1f}%
460
- - Users with 5+ reviews: {users_5plus:,}
461
- - Power user max: {max_user_reviews} reviews
462
 
463
- **Item Popularity:**
464
- - Items with only 1 review: {items_one_review_pct:.1f}%
465
- - Items with 5+ reviews: {items_5plus:,}
466
- - Most reviewed item: {max_item_reviews} reviews
467
 
468
- **Cold-Start Implications:**
469
- - Many items have sparse evidence - content-based features are critical
470
- - User cold-start is common - onboarding preferences help
471
- - 5-core filtering ensures minimum evidence density for evaluation
472
 
473
  ---
474
 
475
- ## Data Quality Assessment
 
476
 
477
- The raw dataset contains several quality issues addressed during preprocessing.
 
 
478
 
479
- | Issue | Count | Resolution |
480
- |-------|-------|------------|
481
- | Missing text | 0 | - |
482
- | Empty reviews | {empty_reviews} | Removed |
483
- | Very short (<10 chars) | {very_short:,} | Removed |
484
- | Duplicate texts | {duplicate_texts:,} | Kept (valid re-purchases) |
485
- | Invalid ratings | 0 | - |
486
 
487
- **Post-Cleaning:**
488
- - All reviews have valid text content
489
- - All ratings are in [1, 5] range
490
- - All user/product identifiers present
491
 
492
- ---
493
 
494
- ## Summary
 
 
 
 
 
 
 
 
 
495
 
496
- The Amazon Electronics dataset, after 5-core filtering and cleaning, provides a solid foundation for building and evaluating a RAG-based recommendation system:
 
 
497
 
498
- 1. **Scale:** {prepared_total:,} reviews across {unique_users_prepared:,} users and {unique_items_prepared:,} items
499
- 2. **Sparsity:** {100 - retention_pct:.1f}% filtered - realistic for recommendation evaluation
500
- 3. **Quality:** Clean text, valid ratings, proper identifiers
501
- 4. **Temporal:** Supports chronological train/val/test splits
502
- 5. **Content:** Review lengths suit the tiered chunking strategy
503
 
504
- The J-shaped rating distribution and long-tail user/item activity are characteristic of real e-commerce data, making this an appropriate benchmark for portfolio demonstration.
 
 
505
 
506
- ---
 
 
 
 
 
 
507
 
508
- *Report auto-generated by `scripts/eda.py`. Run `make eda` to regenerate.*
509
- """
510
 
511
- report_path = REPORTS_DIR / "eda_report.md"
512
- report_path.write_text(report_content)
513
- print(f"\nReport generated: {report_path}")
 
1
+ # ruff: noqa: E402
2
+ """
3
+ Production EDA: Analyze data directly from Qdrant Cloud.
4
+
5
+ Queries the production vector store to generate accurate statistics
6
+ and visualizations. This ensures EDA reports match deployed data.
7
+
8
+ Usage:
9
+ python scripts/eda.py
10
+ make eda
11
+
12
+ Requires:
13
+ QDRANT_URL and QDRANT_API_KEY environment variables.
14
+ """
15
 
16
+ from __future__ import annotations
17
+
18
+ import os
19
+ import sys
20
+ from collections import Counter
21
  from pathlib import Path
22
 
23
+ from dotenv import load_dotenv
24
+
25
+ load_dotenv()
26
+
27
+ # Validate environment before imports
28
+ if not os.getenv("QDRANT_URL"):
29
+ print("ERROR: QDRANT_URL not set. Cannot run production EDA.")
30
+ print("Set QDRANT_URL and QDRANT_API_KEY in .env or environment.")
31
+ sys.exit(1)
32
+
33
  import matplotlib.pyplot as plt
34
+ import numpy as np
35
 
36
+ from sage.adapters.vector_store import get_client, get_collection_info
37
+ from sage.config import COLLECTION_NAME, CHARS_PER_TOKEN, DATA_DIR
38
 
 
39
  FIGURES_DIR = DATA_DIR / "figures"
40
+ FIGURES_DIR.mkdir(parents=True, exist_ok=True)
41
+
42
+ REPORTS_DIR = Path("reports")
43
+ REPORTS_DIR.mkdir(exist_ok=True)
44
 
45
  # Plot configuration
46
  plt.style.use("seaborn-v0_8-whitegrid")
 
48
  {
49
  "figure.figsize": (10, 5),
50
  "figure.dpi": 100,
51
+ "savefig.dpi": 300,
52
  "savefig.bbox": "tight",
53
  "savefig.pad_inches": 0.1,
54
  "font.size": 11,
 
58
  }
59
  )
60
 
 
 
 
 
 
 
 
 
 
 
61
  PRIMARY_COLOR = "#05A0D1"
62
  SECONDARY_COLOR = "#FF9900"
63
  FIGURE_SIZE_WIDE = (12, 5)
64
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
 
66
+ def scroll_all_payloads(client, batch_size: int = 1000, limit: int | None = None):
67
+ """
68
+ Scroll through all points in the collection and yield payloads.
69
+
70
+ Args:
71
+ client: Qdrant client.
72
+ batch_size: Points per scroll request.
73
+ limit: Optional max points to retrieve (None = all).
74
+
75
+ Yields:
76
+ Payload dicts from each point.
77
+ """
78
+ offset = None
79
+ total = 0
80
+
81
+ while True:
82
+ results = client.scroll(
83
+ collection_name=COLLECTION_NAME,
84
+ limit=batch_size,
85
+ offset=offset,
86
+ with_payload=True,
87
+ with_vectors=False,
88
+ )
89
+
90
+ points, next_offset = results
91
+
92
+ if not points:
93
+ break
94
+
95
+ for point in points:
96
+ yield point.payload
97
+ total += 1
98
+ if limit and total >= limit:
99
+ return
100
+
101
+ offset = next_offset
102
+ if offset is None:
103
+ break
104
+
105
+
106
+ def compute_stats(client, sample_size: int | None = None) -> dict:
107
+ """
108
+ Compute statistics from production Qdrant data.
109
+
110
+ Args:
111
+ client: Qdrant client.
112
+ sample_size: Optional limit for faster iteration.
113
+
114
+ Returns:
115
+ Dict with computed statistics.
116
+ """
117
+ print("Scanning Qdrant collection...")
118
+
119
+ ratings = []
120
+ text_lengths = []
121
+ timestamps = []
122
+ product_ids = set()
123
+ review_ids = set()
124
+ chunks_per_review = {}
125
+
126
+ for i, payload in enumerate(scroll_all_payloads(client, limit=sample_size)):
127
+ if i % 10000 == 0 and i > 0:
128
+ print(f" Processed {i:,} chunks...")
129
+
130
+ ratings.append(payload.get("rating", 0))
131
+ text_lengths.append(len(payload.get("text", "")))
132
+ timestamps.append(payload.get("timestamp", 0))
133
+ product_ids.add(payload.get("product_id"))
134
+ review_ids.add(payload.get("review_id"))
135
+
136
+ # Track chunks per review
137
+ review_id = payload.get("review_id")
138
+ total_chunks = payload.get("total_chunks", 1)
139
+ if review_id:
140
+ chunks_per_review[review_id] = total_chunks
141
+
142
+ print(f" Scanned {len(ratings):,} total chunks")
143
+
144
+ # Compute distributions
145
+ rating_dist = Counter(ratings)
146
+ chunk_dist = Counter(chunks_per_review.values())
147
+
148
+ # Estimate tokens from text length
149
+ token_lengths = [length // CHARS_PER_TOKEN for length in text_lengths]
150
+
151
+ return {
152
+ "total_chunks": len(ratings),
153
+ "unique_reviews": len(review_ids),
154
+ "unique_products": len(product_ids),
155
+ "ratings": ratings,
156
+ "rating_dist": dict(sorted(rating_dist.items())),
157
+ "text_lengths": text_lengths,
158
+ "token_lengths": token_lengths,
159
+ "timestamps": timestamps,
160
+ "chunks_per_review": list(chunks_per_review.values()),
161
+ "chunk_dist": dict(sorted(chunk_dist.items())),
162
+ }
163
 
 
 
 
 
 
164
 
165
+ def generate_figures(stats: dict) -> None:
166
+ """Generate EDA figures from computed stats."""
167
+
168
+ # 1. Rating distribution
169
+ fig, ax = plt.subplots()
170
+ rating_counts = stats["rating_dist"]
171
+ ratings = list(rating_counts.keys())
172
+ counts = list(rating_counts.values())
173
+
174
+ bars = ax.bar(ratings, counts, color=PRIMARY_COLOR, edgecolor="black")
175
+ ax.set_xlabel("Rating")
176
+ ax.set_ylabel("Chunk Count")
177
+ ax.set_title("Rating Distribution (Production Data)")
178
+ ax.set_xticks(ratings)
179
+
180
+ for bar, count in zip(bars, counts, strict=True):
181
+ ax.text(
182
+ bar.get_x() + bar.get_width() / 2,
183
+ bar.get_height() + max(counts) * 0.01,
184
+ f"{count:,}",
185
+ ha="center",
186
+ va="bottom",
187
+ fontsize=9,
188
+ )
189
+
190
+ plt.savefig(FIGURES_DIR / "rating_distribution.png")
191
+ plt.close()
192
+ print(f" Saved: {FIGURES_DIR / 'rating_distribution.png'}")
193
+
194
+ # 2. Chunk text length distribution
195
+ fig, axes = plt.subplots(1, 2, figsize=FIGURE_SIZE_WIDE)
196
+
197
+ ax1 = axes[0]
198
+ lengths = np.array(stats["text_lengths"])
199
+ ax1.hist(lengths.clip(max=2000), bins=50, color=PRIMARY_COLOR, edgecolor="black")
200
+ ax1.set_xlabel("Characters")
201
+ ax1.set_ylabel("Chunk Count")
202
+ ax1.set_title("Chunk Length Distribution")
203
+ ax1.axvline(
204
+ np.median(lengths),
205
+ color=SECONDARY_COLOR,
206
+ linestyle="--",
207
+ label=f"Median: {np.median(lengths):.0f}",
208
  )
209
+ ax1.legend()
210
+
211
+ ax2 = axes[1]
212
+ tokens = np.array(stats["token_lengths"])
213
+ ax2.hist(tokens.clip(max=500), bins=50, color=SECONDARY_COLOR, edgecolor="black")
214
+ ax2.set_xlabel("Estimated Tokens")
215
+ ax2.set_ylabel("Chunk Count")
216
+ ax2.set_title("Chunk Token Distribution")
217
+ ax2.axvline(
218
+ np.median(tokens),
219
+ color=PRIMARY_COLOR,
220
+ linestyle="--",
221
+ label=f"Median: {np.median(tokens):.0f}",
222
+ )
223
+ ax2.legend()
224
+
225
+ plt.savefig(FIGURES_DIR / "chunk_lengths.png")
226
+ plt.close()
227
+ print(f" Saved: {FIGURES_DIR / 'chunk_lengths.png'}")
228
+
229
+ # 3. Chunks per review distribution
230
+ fig, ax = plt.subplots()
231
+ chunk_counts = stats["chunk_dist"]
232
+ x = list(chunk_counts.keys())
233
+ y = list(chunk_counts.values())
234
+
235
+ ax.bar(x, y, color=PRIMARY_COLOR, edgecolor="black")
236
+ ax.set_xlabel("Chunks per Review")
237
+ ax.set_ylabel("Number of Reviews")
238
+ ax.set_title("Review Chunking Distribution")
239
+
240
+ plt.savefig(FIGURES_DIR / "chunks_per_review.png")
241
+ plt.close()
242
+ print(f" Saved: {FIGURES_DIR / 'chunks_per_review.png'}")
243
+
244
+ # 4. Temporal distribution (if timestamps exist)
245
+ timestamps = [t for t in stats["timestamps"] if t and t > 0]
246
+ if timestamps:
247
+ from datetime import datetime
248
+
249
+ fig, ax = plt.subplots()
250
+
251
+ # Convert to dates and count by month
252
+ dates = [datetime.fromtimestamp(t / 1000) for t in timestamps]
253
+ months = [d.strftime("%Y-%m") for d in dates]
254
+ month_counts = Counter(months)
255
+ sorted_months = sorted(month_counts.items())
256
+
257
+ if len(sorted_months) > 24:
258
+ # Show only last 24 months if too many
259
+ sorted_months = sorted_months[-24:]
260
+
261
+ x = [m[0] for m in sorted_months]
262
+ y = [m[1] for m in sorted_months]
263
+
264
+ ax.bar(range(len(x)), y, color=PRIMARY_COLOR)
265
+ ax.set_xlabel("Month")
266
+ ax.set_ylabel("Chunk Count")
267
+ ax.set_title("Temporal Distribution")
268
+ ax.set_xticks(range(0, len(x), max(1, len(x) // 6)))
269
+ ax.set_xticklabels(
270
+ [x[i] for i in range(0, len(x), max(1, len(x) // 6))], rotation=45
271
+ )
272
+
273
+ plt.savefig(FIGURES_DIR / "temporal_distribution.png")
274
+ plt.close()
275
+ print(f" Saved: {FIGURES_DIR / 'temporal_distribution.png'}")
276
+
277
+
278
+ def generate_report(stats: dict, collection_info: dict) -> None:
279
+ """Generate markdown EDA report."""
280
+
281
+ total_chunks = stats["total_chunks"]
282
+ unique_reviews = stats["unique_reviews"]
283
+ unique_products = stats["unique_products"]
284
+
285
+ # Rating stats
286
+ rating_dist = stats["rating_dist"]
287
+ total_ratings = sum(rating_dist.values())
288
+ five_star_pct = (
289
+ rating_dist.get(5.0, rating_dist.get(5, 0)) / total_ratings * 100
290
+ if total_ratings
291
+ else 0
292
+ )
293
+ one_star_pct = (
294
+ rating_dist.get(1.0, rating_dist.get(1, 0)) / total_ratings * 100
295
+ if total_ratings
296
+ else 0
297
  )
298
 
299
+ # Length stats
300
+ lengths = stats["text_lengths"]
301
+ tokens = stats["token_lengths"]
302
+ median_chars = int(np.median(lengths)) if lengths else 0
303
+ median_tokens = int(np.median(tokens)) if tokens else 0
304
+ mean_chars = int(np.mean(lengths)) if lengths else 0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
305
 
306
+ # Chunk distribution
307
+ chunk_dist = stats["chunk_dist"]
308
+ single_chunk_reviews = chunk_dist.get(1, 0)
309
+ multi_chunk_reviews = unique_reviews - single_chunk_reviews
310
+ expansion_ratio = total_chunks / unique_reviews if unique_reviews else 0
 
 
 
 
 
 
 
 
311
 
312
+ # Rating breakdown
313
+ rating_lines = []
314
+ for rating in sorted(rating_dist.keys()):
315
+ count = rating_dist[rating]
316
+ pct = count / total_ratings * 100 if total_ratings else 0
317
+ rating_lines.append(f"| {int(rating)} | {count:,} | {pct:.1f}% |")
318
+
319
+ report_content = f"""# Exploratory Data Analysis: Production Data
320
 
321
+ **Source:** Qdrant Cloud (Collection: `{collection_info.get("name", COLLECTION_NAME)}`)
322
+ **Status:** {collection_info.get("status", "unknown")}
323
+ **Generated from live production data**
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
324
 
325
  ---
326
 
327
  ## Dataset Overview
328
 
329
+ This report analyzes the actual data deployed in production, ensuring all statistics match what the recommendation system uses.
330
 
331
+ | Metric | Value |
332
+ |--------|-------|
333
+ | Total Chunks | {total_chunks:,} |
334
+ | Unique Reviews | {unique_reviews:,} |
335
+ | Unique Products | {unique_products:,} |
336
+ | Expansion Ratio | {expansion_ratio:.2f}x |
 
337
 
338
  ---
339
 
340
  ## Rating Distribution
341
 
342
+ Amazon reviews exhibit a characteristic J-shaped distribution, heavily skewed toward 5-star ratings.
343
 
344
  ![Rating Distribution](../data/figures/rating_distribution.png)
345
 
346
+ | Rating | Count | Percentage |
347
+ |--------|-------|------------|
348
+ {chr(10).join(rating_lines)}
349
+
350
  **Key Observations:**
351
+ - 5-star ratings: {five_star_pct:.1f}% of chunks
352
+ - 1-star ratings: {one_star_pct:.1f}% of chunks
 
353
  - This polarization is typical for e-commerce review data
354
 
 
 
 
 
 
355
  ---
356
 
357
+ ## Chunk Length Analysis
 
 
358
 
359
+ Chunk lengths affect retrieval quality and context window usage.
360
 
361
+ ![Chunk Lengths](../data/figures/chunk_lengths.png)
 
 
 
362
 
363
+ **Statistics:**
364
+ - Median chunk length: {median_chars:,} characters (~{median_tokens} tokens)
365
+ - Mean chunk length: {mean_chars:,} characters
366
+ - Most chunks fit comfortably within embedding model context
 
367
 
368
  ---
369
 
370
+ ## Chunking Distribution
371
 
372
+ Reviews are chunked based on length: short reviews stay whole, longer reviews are split semantically.
373
 
374
+ ![Chunks per Review](../data/figures/chunks_per_review.png)
375
 
376
+ | Metric | Value |
377
+ |--------|-------|
378
+ | Single-chunk reviews | {single_chunk_reviews:,} |
379
+ | Multi-chunk reviews | {multi_chunk_reviews:,} |
380
+ | Expansion ratio | {expansion_ratio:.2f}x |
381
 
382
+ **Chunking Strategy:**
383
+ - Reviews < 200 tokens: No chunking (embedded whole)
384
+ - Reviews 200-500 tokens: Semantic chunking
385
+ - Reviews > 500 tokens: Semantic + sliding window
386
 
387
  ---
388
 
389
  ## Temporal Distribution
390
 
391
+ Review timestamps enable chronological analysis and temporal evaluation splits.
392
 
393
+ ![Temporal Distribution](../data/figures/temporal_distribution.png)
394
 
395
+ ---
 
 
 
396
 
397
+ ## Data Quality
398
 
399
+ The production dataset has been through 5-core filtering (users and items with 5+ interactions) and quality checks:
400
 
401
+ - All chunks have valid text content
402
+ - All ratings are in [1, 5] range
403
+ - All product identifiers present
404
+ - Deterministic chunk IDs (MD5 hash of review_id + chunk_index)
405
 
406
+ ---
407
 
408
+ ## Summary
409
 
410
+ This production EDA confirms the deployed data characteristics:
 
 
 
411
 
412
+ 1. **Scale:** {total_chunks:,} chunks across {unique_products:,} products
413
+ 2. **Quality:** 5-core filtered, validated payloads
414
+ 3. **Distribution:** J-shaped ratings, typical e-commerce pattern
415
+ 4. **Chunking:** {expansion_ratio:.2f}x expansion from reviews to chunks
416
 
417
+ The data matches what the recommendation API queries in real-time.
 
 
 
418
 
419
  ---
420
 
421
+ *Report generated from Qdrant Cloud. Run `make eda` to regenerate.*
422
+ """
423
 
424
+ report_path = REPORTS_DIR / "eda_report.md"
425
+ report_path.write_text(report_content)
426
+ print(f" Report: {report_path}")
427
 
 
 
 
 
 
 
 
428
 
429
+ def main():
430
+ print("=" * 60)
431
+ print("PRODUCTION EDA: Querying Qdrant Cloud")
432
+ print("=" * 60)
433
 
434
+ client = get_client()
435
 
436
+ # Get collection info
437
+ try:
438
+ info = get_collection_info(client)
439
+ print(f"\nCollection: {info['name']}")
440
+ print(f"Points: {info['points_count']:,}")
441
+ print(f"Status: {info['status']}")
442
+ except Exception as e:
443
+ print(f"ERROR: Cannot access collection: {e}")
444
+ print("Ensure QDRANT_URL and QDRANT_API_KEY are correct.")
445
+ sys.exit(1)
446
 
447
+ # Compute stats
448
+ print("\n--- Computing Statistics ---")
449
+ stats = compute_stats(client)
450
 
451
+ # Generate figures
452
+ print("\n--- Generating Figures ---")
453
+ generate_figures(stats)
 
 
454
 
455
+ # Generate report
456
+ print("\n--- Generating Report ---")
457
+ generate_report(stats, info)
458
 
459
+ print("\n" + "=" * 60)
460
+ print("EDA COMPLETE")
461
+ print("=" * 60)
462
+ print(f"Figures: {FIGURES_DIR}/")
463
+ print(f"Report: {REPORTS_DIR / 'eda_report.md'}")
464
+
465
+ client.close()
466
 
 
 
467
 
468
+ if __name__ == "__main__":
469
+ main()
 
scripts/evaluation.py CHANGED
@@ -338,27 +338,32 @@ def main():
338
  parser.add_argument(
339
  "--dataset",
340
  "-d",
341
- default="eval_loo_history.json",
342
- help="Evaluation dataset file (default: eval_loo_history.json)",
343
  )
344
  args = parser.parse_args()
345
 
346
  log_banner(logger, "OFFLINE EVALUATION")
347
 
348
- # Load data
349
- logger.info("Loading data...")
350
- train_df, _, test_df = load_splits()
351
- train_records = train_df.to_dict("records")
352
- all_products = list(train_df["parent_asin"].unique())
353
-
354
- item_popularity = compute_item_popularity(train_records, item_key="parent_asin")
355
-
356
  logger.info("Loading product embeddings from Qdrant...")
357
  item_embeddings = load_product_embeddings_from_qdrant()
358
  total_items = len(item_embeddings)
359
-
360
  logger.info("Products in catalog: %d", total_items)
361
 
 
 
 
 
 
 
 
 
 
 
 
 
 
362
  # Load eval cases
363
  logger.info("Loading evaluation dataset: %s", args.dataset)
364
  cases = load_eval_cases(args.dataset)
@@ -398,9 +403,14 @@ def main():
398
  "ndcg_at_10": best_ndcg,
399
  }
400
 
401
- # Baseline comparison
402
  if args.baselines:
403
- run_baseline_comparison(cases, train_records, all_products, item_embeddings)
 
 
 
 
 
404
 
405
  # Save results (uses dataset stem as prefix for both timestamped and latest files)
406
  prefix = Path(args.dataset).stem
 
338
  parser.add_argument(
339
  "--dataset",
340
  "-d",
341
+ default="eval_natural_queries.json",
342
+ help="Evaluation dataset file (default: eval_natural_queries.json)",
343
  )
344
  args = parser.parse_args()
345
 
346
  log_banner(logger, "OFFLINE EVALUATION")
347
 
348
+ # Load product embeddings from Qdrant (always available)
 
 
 
 
 
 
 
349
  logger.info("Loading product embeddings from Qdrant...")
350
  item_embeddings = load_product_embeddings_from_qdrant()
351
  total_items = len(item_embeddings)
 
352
  logger.info("Products in catalog: %d", total_items)
353
 
354
+ # Try to load splits for beyond-accuracy metrics (optional)
355
+ item_popularity = None
356
+ train_records = None
357
+ all_products = None
358
+ try:
359
+ train_df, _, _ = load_splits()
360
+ train_records = train_df.to_dict("records")
361
+ all_products = list(train_df["parent_asin"].unique())
362
+ item_popularity = compute_item_popularity(train_records, item_key="parent_asin")
363
+ logger.info("Loaded splits for beyond-accuracy metrics")
364
+ except FileNotFoundError:
365
+ logger.info("Splits not available - beyond-accuracy metrics will be skipped")
366
+
367
  # Load eval cases
368
  logger.info("Loading evaluation dataset: %s", args.dataset)
369
  cases = load_eval_cases(args.dataset)
 
403
  "ndcg_at_10": best_ndcg,
404
  }
405
 
406
+ # Baseline comparison (requires splits)
407
  if args.baselines:
408
+ if train_records is None:
409
+ logger.warning(
410
+ "Skipping baselines - requires local splits (run 'make splits')"
411
+ )
412
+ else:
413
+ run_baseline_comparison(cases, train_records, all_products, item_embeddings)
414
 
415
  # Save results (uses dataset stem as prefix for both timestamped and latest files)
416
  prefix = Path(args.dataset).stem
scripts/explanation.py CHANGED
@@ -43,7 +43,7 @@ PRODUCTS_PER_QUERY = 2
43
 
44
  def run_basic_tests():
45
  """Test basic explanation generation and HHEM detection."""
46
- from scripts.lib.services import get_explanation_services
47
 
48
  log_banner(logger, "BASIC EXPLANATION TESTS")
49
  logger.info("Using LLM provider: %s", LLM_PROVIDER)
@@ -105,15 +105,18 @@ def run_basic_tests():
105
  logger.info('Query: "%s"', test_query)
106
  logger.info("Streaming: ")
107
 
108
- stream = explainer.generate_explanation_stream(test_query, test_product)
109
- chunks = list(stream)
110
- logger.info("".join(chunks))
 
111
 
112
- streamed_result = stream.get_complete_result()
113
- hhem = detector.check_explanation(
114
- streamed_result.evidence_texts, streamed_result.explanation
115
- )
116
- logger.info("HHEM Score: %.3f", hhem.score)
 
 
117
 
118
  log_banner(logger, "BASIC TESTS COMPLETE")
119
 
@@ -273,17 +276,20 @@ def run_cold_start_tests():
273
  )
274
  from sage.core import UserPreferences
275
  from sage.services.cold_start import preferences_to_query
276
- from sage.data import load_splits
277
 
278
  log_banner(logger, "COLD-START HANDLING TESTS")
279
 
280
- # Load data
281
- logger.info("Loading data...")
282
- train_df, val_df, test_df = load_splits()
283
-
284
- user_counts = train_df.groupby("user_id").size().to_dict()
285
 
286
- logger.info("Training users: %d", len(user_counts))
 
 
 
 
287
 
288
  # Test warmup levels
289
  log_section(logger, "1. WARMUP LEVEL DETECTION")
@@ -347,20 +353,23 @@ def run_cold_start_tests():
347
  for r in recs:
348
  logger.info(" %s: score=%.3f", r.product_id, r.score)
349
 
350
- # Find a warm user
351
- warm_users = [u for u, c in user_counts.items() if c >= 5]
352
- if warm_users:
353
- warm_user = warm_users[0]
354
- user_history = train_df[train_df["user_id"] == warm_user].to_dict("records")
355
-
356
- logger.info("Warm user (%d interactions):", len(user_history))
357
- recs = hybrid_recommend(
358
- query="similar products",
359
- user_history=user_history,
360
- top_k=3,
361
- )
362
- for r in recs:
363
- logger.info(" %s: score=%.3f", r.product_id, r.score)
 
 
 
364
 
365
  log_banner(logger, "COLD-START TESTS COMPLETE")
366
 
 
43
 
44
  def run_basic_tests():
45
  """Test basic explanation generation and HHEM detection."""
46
+ from sage.services import get_explanation_services
47
 
48
  log_banner(logger, "BASIC EXPLANATION TESTS")
49
  logger.info("Using LLM provider: %s", LLM_PROVIDER)
 
105
  logger.info('Query: "%s"', test_query)
106
  logger.info("Streaming: ")
107
 
108
+ try:
109
+ stream = explainer.generate_explanation_stream(test_query, test_product)
110
+ chunks = list(stream)
111
+ logger.info("".join(chunks))
112
 
113
+ streamed_result = stream.get_complete_result()
114
+ hhem = detector.check_explanation(
115
+ streamed_result.evidence_texts, streamed_result.explanation
116
+ )
117
+ logger.info("HHEM Score: %.3f", hhem.score)
118
+ except ValueError as e:
119
+ logger.info("Quality gate refused streaming: %s", e)
120
 
121
  log_banner(logger, "BASIC TESTS COMPLETE")
122
 
 
276
  )
277
  from sage.core import UserPreferences
278
  from sage.services.cold_start import preferences_to_query
 
279
 
280
  log_banner(logger, "COLD-START HANDLING TESTS")
281
 
282
+ # Try to load splits for warm user tests (optional)
283
+ train_df = None
284
+ user_counts = {}
285
+ try:
286
+ from sage.data import load_splits
287
 
288
+ train_df, _, _ = load_splits()
289
+ user_counts = train_df.groupby("user_id").size().to_dict()
290
+ logger.info("Loaded splits: %d training users", len(user_counts))
291
+ except FileNotFoundError:
292
+ logger.info("Splits not available - warm user tests will be skipped")
293
 
294
  # Test warmup levels
295
  log_section(logger, "1. WARMUP LEVEL DETECTION")
 
353
  for r in recs:
354
  logger.info(" %s: score=%.3f", r.product_id, r.score)
355
 
356
+ # Find a warm user (only if splits available)
357
+ if train_df is not None:
358
+ warm_users = [u for u, c in user_counts.items() if c >= 5]
359
+ if warm_users:
360
+ warm_user = warm_users[0]
361
+ user_history = train_df[train_df["user_id"] == warm_user].to_dict("records")
362
+
363
+ logger.info("Warm user (%d interactions):", len(user_history))
364
+ recs = hybrid_recommend(
365
+ query="similar products",
366
+ user_history=user_history,
367
+ top_k=3,
368
+ )
369
+ for r in recs:
370
+ logger.info(" %s: score=%.3f", r.product_id, r.score)
371
+ else:
372
+ logger.info("Skipping warm user test (no splits)")
373
 
374
  log_banner(logger, "COLD-START TESTS COMPLETE")
375
 
scripts/faithfulness.py CHANGED
@@ -51,7 +51,7 @@ TOP_K_PRODUCTS = 3
51
 
52
  def run_evaluation(n_samples: int, run_ragas: bool = False):
53
  """Run faithfulness evaluation on sample queries."""
54
- from scripts.lib.services import get_explanation_services
55
 
56
  queries = EVALUATION_QUERIES[:n_samples]
57
 
@@ -202,7 +202,7 @@ def run_evaluation(n_samples: int, run_ragas: bool = False):
202
 
203
  def run_failure_analysis():
204
  """Analyze failure cases to identify root causes."""
205
- from scripts.lib.services import get_explanation_services
206
 
207
  log_banner(logger, "FAILURE CASE ANALYSIS")
208
 
 
51
 
52
  def run_evaluation(n_samples: int, run_ragas: bool = False):
53
  """Run faithfulness evaluation on sample queries."""
54
+ from sage.services import get_explanation_services
55
 
56
  queries = EVALUATION_QUERIES[:n_samples]
57
 
 
202
 
203
  def run_failure_analysis():
204
  """Analyze failure cases to identify root causes."""
205
+ from sage.services import get_explanation_services
206
 
207
  log_banner(logger, "FAILURE CASE ANALYSIS")
208
 
scripts/human_eval.py CHANGED
@@ -105,7 +105,7 @@ def generate_samples(force: bool = False, seed: int = 42):
105
  import random
106
 
107
  from sage.services.retrieval import get_candidates
108
- from scripts.lib.services import get_explanation_services
109
 
110
  # Protect existing rated samples from accidental overwrite
111
  if SAMPLES_FILE.exists() and not force:
 
105
  import random
106
 
107
  from sage.services.retrieval import get_candidates
108
+ from sage.services import get_explanation_services
109
 
110
  # Protect existing rated samples from accidental overwrite
111
  if SAMPLES_FILE.exists() and not force:
scripts/lib/__init__.py CHANGED
@@ -1,5 +1,6 @@
1
  """Shared utilities for scripts."""
2
 
3
- from scripts.lib.services import get_explanation_services
 
4
 
5
  __all__ = ["get_explanation_services"]
 
1
  """Shared utilities for scripts."""
2
 
3
+ # Re-export from sage.services for backwards compatibility
4
+ from sage.services import get_explanation_services
5
 
6
  __all__ = ["get_explanation_services"]
scripts/summary.py CHANGED
@@ -51,17 +51,6 @@ def main():
51
  print("SAGE PIPELINE RESULTS")
52
  print(SEP)
53
 
54
- # -- Recommendation Quality (LOO History) ---------------------------------
55
- loo = load_json(RESULTS_DIR / "eval_loo_history_latest.json")
56
- print_section("Recommendation Quality (LOO History):")
57
- if loo and "primary_metrics" in loo:
58
- m = loo["primary_metrics"]
59
- print(f" NDCG@10: {fmt(m.get('ndcg_at_10'))}")
60
- print(f" Hit@10: {fmt(m.get('hit_at_10'))}")
61
- print(f" MRR: {fmt(m.get('mrr'))}")
62
- else:
63
- print(" (not available)")
64
-
65
  # -- Recommendation Quality (Natural Queries) -----------------------------
66
  nat = load_json(RESULTS_DIR / "eval_natural_queries_latest.json")
67
  print_section("Recommendation Quality (Natural Queries):")
@@ -69,6 +58,7 @@ def main():
69
  m = nat["primary_metrics"]
70
  print(f" NDCG@10: {fmt(m.get('ndcg_at_10'))}")
71
  print(f" Hit@10: {fmt(m.get('hit_at_10'))}")
 
72
  else:
73
  print(" (not available)")
74
 
 
51
  print("SAGE PIPELINE RESULTS")
52
  print(SEP)
53
 
 
 
 
 
 
 
 
 
 
 
 
54
  # -- Recommendation Quality (Natural Queries) -----------------------------
55
  nat = load_json(RESULTS_DIR / "eval_natural_queries_latest.json")
56
  print_section("Recommendation Quality (Natural Queries):")
 
58
  m = nat["primary_metrics"]
59
  print(f" NDCG@10: {fmt(m.get('ndcg_at_10'))}")
60
  print(f" Hit@10: {fmt(m.get('hit_at_10'))}")
61
+ print(f" MRR: {fmt(m.get('mrr'))}")
62
  else:
63
  print(" (not available)")
64