Spaces:

vxa8502
/

Sage

Running

App Files Files Community

vxa8502 committed on Feb 6

Commit

bf39698

1 Parent(s): 4a10224

Apply ruff formatting

Browse files

Files changed (42) hide show

sage/adapters/hhem.py +12 -3
sage/adapters/llm.py +3 -1
sage/adapters/vector_store.py +1 -2
sage/api/app.py +10 -12
sage/api/metrics.py +7 -1
sage/api/middleware.py +6 -5
sage/api/routes.py +43 -11
sage/api/run.py +2 -1
sage/config/logging.py +40 -13
sage/core/aggregation.py +18 -14
sage/core/chunking.py +6 -6
sage/core/evidence.py +12 -6
sage/core/models.py +34 -4
sage/core/prompts.py +1 -1
sage/core/verification.py +7 -8
sage/services/__init__.py +1 -0
sage/services/baselines.py +8 -7
sage/services/cache.py +8 -2
sage/services/cold_start.py +4 -1
sage/services/evaluation.py +3 -1
sage/services/explanation.py +12 -6
sage/services/faithfulness.py +10 -7
sage/services/retrieval.py +15 -4
sage/utils.py +1 -0
scripts/build_eval_dataset.py +235 -40
scripts/build_natural_eval_dataset.py +21 -25
scripts/demo.py +7 -6
scripts/e2e_success_rate.py +66 -14
scripts/eda.py +78 -29
scripts/evaluation.py +58 -18
scripts/explanation.py +53 -15
scripts/faithfulness.py +83 -36
scripts/human_eval.py +57 -26
scripts/pipeline.py +63 -20
scripts/sanity_checks.py +93 -28
scripts/summary.py +22 -7
tests/test_aggregation.py +9 -3
tests/test_api.py +29 -7
tests/test_chunking.py +3 -1
tests/test_evidence.py +4 -1
tests/test_faithfulness.py +9 -2
tests/test_models.py +22 -6

sage/adapters/hhem.py CHANGED Viewed

@@ -148,7 +148,9 @@ class HallucinationDetector:
             remaining = [t for t in evidence_texts if hyp_lower not in t.lower()]
             evidence_texts = containing + remaining
-        hypothesis_tokens = len(self.tokenizer(hypothesis, add_special_tokens=False).input_ids)
         budget = HHEM_MAX_TOKENS - HHEM_TEMPLATE_OVERHEAD - hypothesis_tokens
         kept = []
@@ -253,13 +255,20 @@ class HallucinationDetector:
             List of ClaimResult objects, one per claim.
         """
         pairs = [
-            (self._format_premise(evidence_texts, hypothesis=claim, prioritize_hypothesis=True), claim)
             for claim in claims
         ]
         scores = self._predict(pairs)
         return [
-            ClaimResult(claim=claim, score=score, is_hallucinated=score < self.threshold)
             for claim, score in zip(claims, scores)
         ]

             remaining = [t for t in evidence_texts if hyp_lower not in t.lower()]
             evidence_texts = containing + remaining
+        hypothesis_tokens = len(
+            self.tokenizer(hypothesis, add_special_tokens=False).input_ids
+        )
         budget = HHEM_MAX_TOKENS - HHEM_TEMPLATE_OVERHEAD - hypothesis_tokens
         kept = []
             List of ClaimResult objects, one per claim.
         """
         pairs = [
+            (
+                self._format_premise(
+                    evidence_texts, hypothesis=claim, prioritize_hypothesis=True
+                ),
+                claim,
+            )
             for claim in claims
         ]
         scores = self._predict(pairs)
         return [
+            ClaimResult(
+                claim=claim, score=score, is_hallucinated=score < self.threshold
+            )
             for claim, score in zip(claims, scores)
         ]

sage/adapters/llm.py CHANGED Viewed

@@ -356,7 +356,9 @@ def get_llm_client(provider: str | None = None) -> LLMClient:
     elif provider == "openai":
         return OpenAIClient()
     else:
-        raise ValueError(f"Unknown LLM provider: {provider}. Use 'anthropic' or 'openai'.")
 __all__ = [

     elif provider == "openai":
         return OpenAIClient()
     else:
+        raise ValueError(
+            f"Unknown LLM provider: {provider}. Use 'anthropic' or 'openai'."
+        )
 __all__ = [

sage/adapters/vector_store.py CHANGED Viewed

@@ -42,8 +42,7 @@ def get_client():
         from qdrant_client import QdrantClient
     except ImportError:
         raise ImportError(
-            "qdrant-client package required. "
-            "Install with: pip install qdrant-client"
         )
     if QDRANT_API_KEY:

         from qdrant_client import QdrantClient
     except ImportError:
         raise ImportError(
+            "qdrant-client package required. Install with: pip install qdrant-client"
         )
     if QDRANT_API_KEY:

sage/api/app.py CHANGED Viewed

@@ -31,24 +31,19 @@ async def _lifespan(app: FastAPI):
     # Validate LLM credentials early
     from sage.config import ANTHROPIC_API_KEY, LLM_PROVIDER, OPENAI_API_KEY
     if not ANTHROPIC_API_KEY and not OPENAI_API_KEY:
         logger.error(
-            "No LLM API key set -- add ANTHROPIC_API_KEY "
-            "or OPENAI_API_KEY to .env"
         )
     elif LLM_PROVIDER == "anthropic" and not ANTHROPIC_API_KEY:
-        logger.warning(
-            "LLM_PROVIDER=anthropic but ANTHROPIC_API_KEY "
-            "is not set"
-        )
     elif LLM_PROVIDER == "openai" and not OPENAI_API_KEY:
-        logger.warning(
-            "LLM_PROVIDER=openai but OPENAI_API_KEY "
-            "is not set"
-        )
     # Embedder (loads E5-small model) -- required for all requests
     from sage.adapters.embeddings import get_embedder
     try:
         app.state.embedder = get_embedder()
         logger.info("Embedder loaded")
@@ -58,6 +53,7 @@ async def _lifespan(app: FastAPI):
     # Qdrant client
     from sage.adapters.vector_store import get_client, collection_exists
     app.state.qdrant = get_client()
     try:
         if collection_exists(app.state.qdrant):
@@ -69,6 +65,7 @@ async def _lifespan(app: FastAPI):
     # HHEM hallucination detector (loads T5 model) -- required for grounding
     from sage.adapters.hhem import HallucinationDetector
     try:
         app.state.detector = HallucinationDetector()
         logger.info("HHEM detector loaded")
@@ -78,18 +75,19 @@ async def _lifespan(app: FastAPI):
     # LLM explainer -- graceful degradation if unavailable
     from sage.services.explanation import Explainer
     try:
         app.state.explainer = Explainer()
         logger.info("Explainer ready (%s)", app.state.explainer.model)
     except Exception:
         logger.exception(
-            "Failed to initialize explainer -- "
-            "explain=true requests will fail"
         )
         app.state.explainer = None
     # Semantic cache
     from sage.services.cache import SemanticCache
     app.state.cache = SemanticCache()
     logger.info("Semantic cache initialized")

     # Validate LLM credentials early
     from sage.config import ANTHROPIC_API_KEY, LLM_PROVIDER, OPENAI_API_KEY
     if not ANTHROPIC_API_KEY and not OPENAI_API_KEY:
         logger.error(
+            "No LLM API key set -- add ANTHROPIC_API_KEY or OPENAI_API_KEY to .env"
         )
     elif LLM_PROVIDER == "anthropic" and not ANTHROPIC_API_KEY:
+        logger.warning("LLM_PROVIDER=anthropic but ANTHROPIC_API_KEY is not set")
     elif LLM_PROVIDER == "openai" and not OPENAI_API_KEY:
+        logger.warning("LLM_PROVIDER=openai but OPENAI_API_KEY is not set")
     # Embedder (loads E5-small model) -- required for all requests
     from sage.adapters.embeddings import get_embedder
     try:
         app.state.embedder = get_embedder()
         logger.info("Embedder loaded")
     # Qdrant client
     from sage.adapters.vector_store import get_client, collection_exists
     app.state.qdrant = get_client()
     try:
         if collection_exists(app.state.qdrant):
     # HHEM hallucination detector (loads T5 model) -- required for grounding
     from sage.adapters.hhem import HallucinationDetector
     try:
         app.state.detector = HallucinationDetector()
         logger.info("HHEM detector loaded")
     # LLM explainer -- graceful degradation if unavailable
     from sage.services.explanation import Explainer
     try:
         app.state.explainer = Explainer()
         logger.info("Explainer ready (%s)", app.state.explainer.model)
     except Exception:
         logger.exception(
+            "Failed to initialize explainer -- explain=true requests will fail"
         )
         app.state.explainer = None
     # Semantic cache
     from sage.services.cache import SemanticCache
     app.state.cache = SemanticCache()
     logger.info("Semantic cache initialized")

sage/api/metrics.py CHANGED Viewed

@@ -16,7 +16,12 @@ logger = get_logger(__name__)
 # ---------------------------------------------------------------------------
 try:
-    from prometheus_client import Counter, Histogram, generate_latest, CONTENT_TYPE_LATEST
     REQUEST_COUNT = Counter(
         "sage_requests_total",
@@ -48,6 +53,7 @@ except ImportError:
 # Public helpers
 # ---------------------------------------------------------------------------
 def record_request(endpoint: str, method: str, status: int) -> None:
     """Increment the request counter."""
     if _PROMETHEUS_AVAILABLE:

 # ---------------------------------------------------------------------------
 try:
+    from prometheus_client import (
+        Counter,
+        Histogram,
+        generate_latest,
+        CONTENT_TYPE_LATEST,
+    )
     REQUEST_COUNT = Counter(
         "sage_requests_total",
 # Public helpers
 # ---------------------------------------------------------------------------
 def record_request(endpoint: str, method: str, status: int) -> None:
     """Increment the request counter."""
     if _PROMETHEUS_AVAILABLE:

sage/api/middleware.py CHANGED Viewed

@@ -69,9 +69,7 @@ class LatencyMiddleware:
                 # The Prometheus histogram (in finally) measures total time.
                 elapsed_ms = (time.perf_counter() - start) * 1000
                 headers = list(message.get("headers", []))
-                headers.append(
-                    (b"x-response-time-ms", f"{elapsed_ms:.1f}".encode())
-                )
                 headers.append((b"x-request-id", request_id.encode()))
                 message = {**message, "headers": headers}
             await send(message)
@@ -88,6 +86,9 @@ class LatencyMiddleware:
             if path not in _QUIET_PATHS:
                 logger.info(
                     "%s %s %d %.1fms [%s]",
-                    method, path, status,
-                    elapsed_ms, request_id,
                 )

                 # The Prometheus histogram (in finally) measures total time.
                 elapsed_ms = (time.perf_counter() - start) * 1000
                 headers = list(message.get("headers", []))
+                headers.append((b"x-response-time-ms", f"{elapsed_ms:.1f}".encode()))
                 headers.append((b"x-request-id", request_id.encode()))
                 message = {**message, "headers": headers}
             await send(message)
             if path not in _QUIET_PATHS:
                 logger.info(
                     "%s %s %d %.1fms [%s]",
+                    method,
+                    path,
+                    status,
+                    elapsed_ms,
+                    request_id,
                 )

sage/api/routes.py CHANGED Viewed

@@ -24,7 +24,12 @@ from pydantic import BaseModel
 from sage.adapters.vector_store import collection_exists
 from sage.api.metrics import metrics_response, record_cache_event
 from sage.config import MAX_EVIDENCE, get_logger
-from sage.core import AggregationMethod, ExplanationResult, ProductScore, verify_citations
 from sage.services.retrieval import get_candidates
 # Cap parallel LLM+HHEM workers per request. With k=10 and concurrent
@@ -41,6 +46,7 @@ router = APIRouter()
 # Response models
 # ---------------------------------------------------------------------------
 class EvidenceSource(BaseModel):
     id: str
     text: str
@@ -95,6 +101,7 @@ class CacheStatsResponse(BaseModel):
 # Shared helpers
 # ---------------------------------------------------------------------------
 @dataclass
 class RecommendParams:
     """Query parameters shared by /recommend and /recommend/stream."""
@@ -105,7 +112,9 @@ class RecommendParams:
 def _fetch_products(
-    params: RecommendParams, app, query_embedding=None,
 ) -> list[ProductScore]:
     """Run candidate generation with lifespan-managed singletons."""
     return get_candidates(
@@ -138,6 +147,7 @@ def _build_evidence_list(result: ExplanationResult) -> list[dict]:
 # Health
 # ---------------------------------------------------------------------------
 @router.get("/health", response_model=HealthResponse)
 def health(request: Request):
     """Deployment readiness probe. Checks Qdrant connectivity.
@@ -159,6 +169,7 @@ def health(request: Request):
 # Recommend (non-streaming)
 # ---------------------------------------------------------------------------
 @router.get(
     "/recommend",
     response_model=RecommendResponse,
@@ -208,20 +219,27 @@ def recommend(
                 # HHEM model in eval() + no_grad() = read-only forward
                 # pass with no state mutation. Tokenizer is stateless.
                 er = explainer.generate_explanation(
-                    query=q, product=product, max_evidence=MAX_EVIDENCE,
                 )
                 hr = detector.check_explanation(
                     evidence_texts=er.evidence_texts,
                     explanation=er.explanation,
                 )
-                cr = verify_citations(er.explanation, er.evidence_ids, er.evidence_texts)
                 return er, hr, cr
-            with ThreadPoolExecutor(max_workers=min(len(products), _MAX_EXPLAIN_WORKERS)) as pool:
                 results = list(pool.map(_explain, products))
             for i, (product, (er, hr, cr)) in enumerate(
-                zip(products, results), 1,
             ):
                 rec = _build_product_dict(i, product)
                 rec["explanation"] = er.explanation
@@ -257,6 +275,7 @@ def recommend(
 # Recommend (SSE streaming)
 # ---------------------------------------------------------------------------
 def _sse_event(event: str, data: str) -> str:
     """Format a single SSE event."""
     return f"event: {event}\ndata: {data}\n\n"
@@ -267,9 +286,16 @@ def _stream_recommendations(
     app,
 ) -> Iterator[str]:
     """Generator that yields SSE events for streaming recommendations."""
-    yield _sse_event("metadata", json.dumps({
-        "verified": False, "cache": False, "hhem": False,
-    }))
     try:
         products = _fetch_products(params, app)
@@ -285,7 +311,9 @@ def _stream_recommendations(
     explainer = app.state.explainer
     if explainer is None:
-        yield _sse_event("error", json.dumps({"detail": "Explanation service unavailable"}))
         yield _sse_event("done", json.dumps({"status": "error"}))
         return
@@ -314,7 +342,9 @@ def _stream_recommendations(
             yield _sse_event("refusal", json.dumps({"detail": str(exc)}))
         except Exception:
             logger.exception("Streaming error for product %s", product.product_id)
-            yield _sse_event("error", json.dumps({"detail": "Failed to generate explanation"}))
     yield _sse_event("done", json.dumps({"status": "complete"}))
@@ -341,6 +371,7 @@ def recommend_stream(
 # Cache management
 # ---------------------------------------------------------------------------
 @router.get("/cache/stats", response_model=CacheStatsResponse)
 def cache_stats(request: Request):
     """Return cache performance statistics."""
@@ -370,6 +401,7 @@ def cache_clear(request: Request):
 # Prometheus metrics
 # ---------------------------------------------------------------------------
 @router.get("/metrics")
 def metrics():
     """Prometheus metrics endpoint."""

 from sage.adapters.vector_store import collection_exists
 from sage.api.metrics import metrics_response, record_cache_event
 from sage.config import MAX_EVIDENCE, get_logger
+from sage.core import (
+    AggregationMethod,
+    ExplanationResult,
+    ProductScore,
+    verify_citations,
+)
 from sage.services.retrieval import get_candidates
 # Cap parallel LLM+HHEM workers per request. With k=10 and concurrent
 # Response models
 # ---------------------------------------------------------------------------
 class EvidenceSource(BaseModel):
     id: str
     text: str
 # Shared helpers
 # ---------------------------------------------------------------------------
 @dataclass
 class RecommendParams:
     """Query parameters shared by /recommend and /recommend/stream."""
 def _fetch_products(
+    params: RecommendParams,
+    app,
+    query_embedding=None,
 ) -> list[ProductScore]:
     """Run candidate generation with lifespan-managed singletons."""
     return get_candidates(
 # Health
 # ---------------------------------------------------------------------------
 @router.get("/health", response_model=HealthResponse)
 def health(request: Request):
     """Deployment readiness probe. Checks Qdrant connectivity.
 # Recommend (non-streaming)
 # ---------------------------------------------------------------------------
 @router.get(
     "/recommend",
     response_model=RecommendResponse,
                 # HHEM model in eval() + no_grad() = read-only forward
                 # pass with no state mutation. Tokenizer is stateless.
                 er = explainer.generate_explanation(
+                    query=q,
+                    product=product,
+                    max_evidence=MAX_EVIDENCE,
                 )
                 hr = detector.check_explanation(
                     evidence_texts=er.evidence_texts,
                     explanation=er.explanation,
                 )
+                cr = verify_citations(
+                    er.explanation, er.evidence_ids, er.evidence_texts
+                )
                 return er, hr, cr
+            with ThreadPoolExecutor(
+                max_workers=min(len(products), _MAX_EXPLAIN_WORKERS)
+            ) as pool:
                 results = list(pool.map(_explain, products))
             for i, (product, (er, hr, cr)) in enumerate(
+                zip(products, results),
+                1,
             ):
                 rec = _build_product_dict(i, product)
                 rec["explanation"] = er.explanation
 # Recommend (SSE streaming)
 # ---------------------------------------------------------------------------
 def _sse_event(event: str, data: str) -> str:
     """Format a single SSE event."""
     return f"event: {event}\ndata: {data}\n\n"
     app,
 ) -> Iterator[str]:
     """Generator that yields SSE events for streaming recommendations."""
+    yield _sse_event(
+        "metadata",
+        json.dumps(
+            {
+                "verified": False,
+                "cache": False,
+                "hhem": False,
+            }
+        ),
+    )
     try:
         products = _fetch_products(params, app)
     explainer = app.state.explainer
     if explainer is None:
+        yield _sse_event(
+            "error", json.dumps({"detail": "Explanation service unavailable"})
+        )
         yield _sse_event("done", json.dumps({"status": "error"}))
         return
             yield _sse_event("refusal", json.dumps({"detail": str(exc)}))
         except Exception:
             logger.exception("Streaming error for product %s", product.product_id)
+            yield _sse_event(
+                "error", json.dumps({"detail": "Failed to generate explanation"})
+            )
     yield _sse_event("done", json.dumps({"status": "complete"}))
 # Cache management
 # ---------------------------------------------------------------------------
 @router.get("/cache/stats", response_model=CacheStatsResponse)
 def cache_stats(request: Request):
     """Return cache performance statistics."""
 # Prometheus metrics
 # ---------------------------------------------------------------------------
 @router.get("/metrics")
 def metrics():
     """Prometheus metrics endpoint."""

sage/api/run.py CHANGED Viewed

@@ -24,7 +24,8 @@ def main():
     parser = argparse.ArgumentParser(description="Sage API server")
     parser.add_argument("--host", default="0.0.0.0", help="Bind address")
     parser.add_argument(
-        "--port", type=int,
         default=int(os.getenv("PORT", "8000")),
         help="Port (defaults to PORT env var, then 8000)",
     )

     parser = argparse.ArgumentParser(description="Sage API server")
     parser.add_argument("--host", default="0.0.0.0", help="Bind address")
     parser.add_argument(
+        "--port",
+        type=int,
         default=int(os.getenv("PORT", "8000")),
         help="Port (defaults to PORT env var, then 8000)",
     )

sage/config/logging.py CHANGED Viewed

@@ -28,27 +28,48 @@ LOG_FORMAT = os.getenv("SAGE_LOG_FORMAT", "console")  # "console" or "json"
 # Standard LogRecord attributes to ignore when extracting user-specified extras.
 # These are built-in attributes from logging.LogRecord plus taskName from asyncio.
-_STANDARD_LOG_ATTRS = frozenset({
-    "name", "msg", "args", "created", "filename", "funcName",
-    "levelname", "levelno", "lineno", "module", "msecs",
-    "pathname", "process", "processName", "relativeCreated",
-    "stack_info", "exc_info", "exc_text", "thread", "threadName",
-    "message", "asctime", "taskName",
-})
 # ---------------------------------------------------------------------------
 # Custom Formatter (Console)
 # ---------------------------------------------------------------------------
 class ConsoleFormatter(logging.Formatter):
     """Human-readable formatter with visual hierarchy."""
     COLORS = {
-        "DEBUG": "\033[36m",     # Cyan
-        "INFO": "\033[32m",      # Green
-        "WARNING": "\033[33m",   # Yellow
-        "ERROR": "\033[31m",     # Red
         "CRITICAL": "\033[35m",  # Magenta
         "RESET": "\033[0m",
     }
@@ -87,6 +108,7 @@ class ConsoleFormatter(logging.Formatter):
 # Custom Formatter (JSON)
 # ---------------------------------------------------------------------------
 class JSONFormatter(logging.Formatter):
     """Machine-parseable JSON formatter for production."""
@@ -180,14 +202,19 @@ def get_logger(name: str) -> logging.Logger:
 # Convenience functions for visual output
 # ---------------------------------------------------------------------------
-def log_banner(logger: logging.Logger, title: str, char: str = "=", width: int = 60) -> None:
     """Log a visual banner for section headers."""
     logger.info(char * width)
     logger.info(title)
     logger.info(char * width)
-def log_section(logger: logging.Logger, title: str, char: str = "-", width: int = 60) -> None:
     """Log a section divider."""
     logger.info("")
     logger.info(char * width)

 # Standard LogRecord attributes to ignore when extracting user-specified extras.
 # These are built-in attributes from logging.LogRecord plus taskName from asyncio.
+_STANDARD_LOG_ATTRS = frozenset(
+    {
+        "name",
+        "msg",
+        "args",
+        "created",
+        "filename",
+        "funcName",
+        "levelname",
+        "levelno",
+        "lineno",
+        "module",
+        "msecs",
+        "pathname",
+        "process",
+        "processName",
+        "relativeCreated",
+        "stack_info",
+        "exc_info",
+        "exc_text",
+        "thread",
+        "threadName",
+        "message",
+        "asctime",
+        "taskName",
+    }
+)
 # ---------------------------------------------------------------------------
 # Custom Formatter (Console)
 # ---------------------------------------------------------------------------
 class ConsoleFormatter(logging.Formatter):
     """Human-readable formatter with visual hierarchy."""
     COLORS = {
+        "DEBUG": "\033[36m",  # Cyan
+        "INFO": "\033[32m",  # Green
+        "WARNING": "\033[33m",  # Yellow
+        "ERROR": "\033[31m",  # Red
         "CRITICAL": "\033[35m",  # Magenta
         "RESET": "\033[0m",
     }
 # Custom Formatter (JSON)
 # ---------------------------------------------------------------------------
 class JSONFormatter(logging.Formatter):
     """Machine-parseable JSON formatter for production."""
 # Convenience functions for visual output
 # ---------------------------------------------------------------------------
+def log_banner(
+    logger: logging.Logger, title: str, char: str = "=", width: int = 60
+) -> None:
     """Log a visual banner for section headers."""
     logger.info(char * width)
     logger.info(title)
     logger.info(char * width)
+def log_section(
+    logger: logging.Logger, title: str, char: str = "-", width: int = 60
+) -> None:
     """Log a section divider."""
     logger.info("")
     logger.info(char * width)

sage/core/aggregation.py CHANGED Viewed

@@ -55,13 +55,15 @@ def aggregate_chunks_to_products(
         else:
             agg_score = max(scores)
-        product_scores.append(ProductScore(
-            product_id=product_id,
-            score=agg_score,
-            chunk_count=len(prod_chunks),
-            avg_rating=float(np.mean(ratings)),
-            evidence=sorted(prod_chunks, key=lambda c: c.score, reverse=True),
-        ))
     # Sort by score descending
     return sorted(product_scores, key=lambda p: p.score, reverse=True)
@@ -112,12 +114,14 @@ def apply_weighted_ranking(
     # Create new ProductScore objects with updated scores
     reranked = []
     for i, product in enumerate(products):
-        reranked.append(ProductScore(
-            product_id=product.product_id,
-            score=float(final_scores[i]),
-            chunk_count=product.chunk_count,
-            avg_rating=product.avg_rating,
-            evidence=product.evidence,
-        ))
     return sorted(reranked, key=lambda p: p.score, reverse=True)

         else:
             agg_score = max(scores)
+        product_scores.append(
+            ProductScore(
+                product_id=product_id,
+                score=agg_score,
+                chunk_count=len(prod_chunks),
+                avg_rating=float(np.mean(ratings)),
+                evidence=sorted(prod_chunks, key=lambda c: c.score, reverse=True),
+            )
+        )
     # Sort by score descending
     return sorted(product_scores, key=lambda p: p.score, reverse=True)
     # Create new ProductScore objects with updated scores
     reranked = []
     for i, product in enumerate(products):
+        reranked.append(
+            ProductScore(
+                product_id=product.product_id,
+                score=float(final_scores[i]),
+                chunk_count=product.chunk_count,
+                avg_rating=product.avg_rating,
+                evidence=product.evidence,
+            )
+        )
     return sorted(reranked, key=lambda p: p.score, reverse=True)

sage/core/chunking.py CHANGED Viewed

@@ -19,16 +19,16 @@ from sage.config import CHARS_PER_TOKEN
 # Chunking thresholds (tokens)
-NO_CHUNK_THRESHOLD = 200      # Texts under this: no chunking
-SEMANTIC_THRESHOLD = 500      # Texts under this: semantic only
-MAX_CHUNK_TOKENS = 400        # Chunks larger than this get sliding window
 # Semantic chunking config
-SIMILARITY_PERCENTILE = 85    # Split at drops below this percentile
 # Sliding window config (fallback)
-SLIDING_CHUNK_SIZE = 150      # Target tokens per sliding window chunk
-SLIDING_OVERLAP = 30          # Token overlap between chunks
 def estimate_tokens(text: str) -> int:

 # Chunking thresholds (tokens)
+NO_CHUNK_THRESHOLD = 200  # Texts under this: no chunking
+SEMANTIC_THRESHOLD = 500  # Texts under this: semantic only
+MAX_CHUNK_TOKENS = 400  # Chunks larger than this get sliding window
 # Semantic chunking config
+SIMILARITY_PERCENTILE = 85  # Split at drops below this percentile
 # Sliding window config (fallback)
+SLIDING_CHUNK_SIZE = 150  # Target tokens per sliding window chunk
+SLIDING_OVERLAP = 30  # Token overlap between chunks
 def estimate_tokens(text: str) -> int:

sage/core/evidence.py CHANGED Viewed

@@ -101,8 +101,14 @@ def check_evidence_quality(
     # Check thresholds using table-driven validation
     thresholds = [
-        (chunk_count < min_chunks, f"insufficient_chunks: {chunk_count} < {min_chunks}"),
-        (total_tokens < min_tokens, f"insufficient_tokens: {total_tokens} < {min_tokens}"),
         (top_score < min_score, f"low_relevance: {top_score:.3f} < {min_score}"),
     ]
@@ -150,17 +156,17 @@ def generate_refusal_message(
             f"I cannot provide a confident recommendation for this product based on "
             f"the available review evidence. Only {quality.chunk_count} review excerpt(s) "
             f"were found, which is insufficient to make a well-grounded recommendation "
-            f"for your query about \"{query}\"."
         )
     elif "insufficient_tokens" in reason:
         return (
             f"I cannot provide a meaningful recommendation for this product. "
             f"The available review evidence is too brief ({quality.total_tokens} tokens) "
-            f"to support a well-grounded explanation for your query about \"{query}\"."
         )
     elif "low_relevance" in reason:
         return (
-            f"I cannot recommend this product for your query about \"{query}\" because "
             f"the available reviews do not appear to be sufficiently relevant "
             f"(relevance score: {quality.top_score:.2f}). The reviews may discuss "
             f"different aspects or product features than what you're looking for."
@@ -168,5 +174,5 @@ def generate_refusal_message(
     else:
         return (
             f"I cannot provide a recommendation for this product due to "
-            f"insufficient review evidence for your query about \"{query}\"."
         )

     # Check thresholds using table-driven validation
     thresholds = [
+        (
+            chunk_count < min_chunks,
+            f"insufficient_chunks: {chunk_count} < {min_chunks}",
+        ),
+        (
+            total_tokens < min_tokens,
+            f"insufficient_tokens: {total_tokens} < {min_tokens}",
+        ),
         (top_score < min_score, f"low_relevance: {top_score:.3f} < {min_score}"),
     ]
             f"I cannot provide a confident recommendation for this product based on "
             f"the available review evidence. Only {quality.chunk_count} review excerpt(s) "
             f"were found, which is insufficient to make a well-grounded recommendation "
+            f'for your query about "{query}".'
         )
     elif "insufficient_tokens" in reason:
         return (
             f"I cannot provide a meaningful recommendation for this product. "
             f"The available review evidence is too brief ({quality.total_tokens} tokens) "
+            f'to support a well-grounded explanation for your query about "{query}".'
         )
     elif "low_relevance" in reason:
         return (
+            f'I cannot recommend this product for your query about "{query}" because '
             f"the available reviews do not appear to be sufficiently relevant "
             f"(relevance score: {quality.top_score:.2f}). The reviews may discuss "
             f"different aspects or product features than what you're looking for."
     else:
         return (
             f"I cannot provide a recommendation for this product due to "
+            f'insufficient review evidence for your query about "{query}".'
         )

sage/core/models.py CHANGED Viewed

@@ -20,8 +20,10 @@ from typing import Iterator
 # RETRIEVAL & RECOMMENDATION MODELS
 # ============================================================================
 class AggregationMethod(Enum):
     """Methods for aggregating chunk scores to product scores."""
     MAX = "max"
     MEAN = "mean"
     WEIGHTED_MEAN = "weighted_mean"
@@ -35,6 +37,7 @@ class Chunk:
     This is the unit stored in the vector database. Reviews are chunked
     using semantic or sliding-window strategies based on length.
     """
     text: str
     chunk_index: int
     total_chunks: int
@@ -52,6 +55,7 @@ class RetrievedChunk:
     This is returned by semantic search and used as evidence for
     explanation generation.
     """
     text: str
     score: float
     product_id: str
@@ -67,6 +71,7 @@ class ProductScore:
     Multiple chunks may belong to the same product. This dataclass
     holds the aggregated score and all supporting evidence.
     """
     product_id: str
     score: float
     chunk_count: int
@@ -89,6 +94,7 @@ class Recommendation:
     This is the output of the recommendation pipeline, ready for
     display or API response.
     """
     rank: int
     product_id: str
     score: float
@@ -102,6 +108,7 @@ class Recommendation:
 # COLD START MODELS
 # ============================================================================
 @dataclass
 class UserPreferences:
     """
@@ -110,6 +117,7 @@ class UserPreferences:
     In production, these would be collected via an onboarding flow:
     "What categories interest you?" "What's your budget?" etc.
     """
     categories: list[str] | None = None
     budget: str | None = None  # "low", "medium", "high", or specific like "$50-100"
     priorities: list[str] | None = None  # ["quality", "value", "durability"]
@@ -123,6 +131,7 @@ class NewItem:
     In production, this would come from the product catalog.
     """
     product_id: str
     title: str
     description: str | None = None
@@ -136,6 +145,7 @@ class NewItem:
 # EXPLANATION MODELS
 # ============================================================================
 @dataclass
 class ExplanationResult:
     """
@@ -144,6 +154,7 @@ class ExplanationResult:
     Contains the generated explanation along with evidence attribution
     for traceability and faithfulness verification.
     """
     explanation: str
     product_id: str
     query: str
@@ -174,6 +185,7 @@ class StreamingExplanation:
             print(token, end="", flush=True)
         result = stream.get_complete_result()
     """
     token_iterator: Iterator[str]
     product_id: str
     query: str
@@ -215,6 +227,7 @@ class EvidenceQuality:
     when evidence is too thin. Thin evidence (1 chunk, few tokens)
     correlates strongly with LLM overclaiming.
     """
     is_sufficient: bool
     chunk_count: int
     total_tokens: int
@@ -226,9 +239,11 @@ class EvidenceQuality:
 # VERIFICATION MODELS
 # ============================================================================
 @dataclass
 class QuoteVerification:
     """Result of verifying a single quoted claim against evidence."""
     quote: str
     found: bool
     source_text: str | None = None  # Which evidence text contained it
@@ -243,6 +258,7 @@ class VerificationResult:
     exists in the provided evidence. Catches wrong attribution where
     LLM cites quotes that don't exist.
     """
     all_verified: bool
     quotes_found: int
     quotes_missing: int
@@ -254,6 +270,7 @@ class VerificationResult:
 # HALLUCINATION DETECTION MODELS
 # ============================================================================
 @dataclass
 class HallucinationResult:
     """
@@ -263,6 +280,7 @@ class HallucinationResult:
     consistency between evidence (premise) and explanation (hypothesis).
     Score < 0.5 indicates hallucination.
     """
     score: float
     is_hallucinated: bool
     threshold: float
@@ -273,6 +291,7 @@ class HallucinationResult:
 @dataclass
 class ClaimResult:
     """Result of hallucination check for a single claim."""
     claim: str
     score: float
     is_hallucinated: bool
@@ -286,6 +305,7 @@ class AgreementReport:
     Useful for understanding when the two methods disagree and
     calibrating thresholds.
     """
     n_samples: int
     agreement_rate: float  # Proportion where both agree on pass/fail
     hhem_pass_rate: float
@@ -306,6 +326,7 @@ class AdjustedFaithfulnessReport:
     Refusals (e.g., "I cannot recommend...") are correct LLM behavior
     but get penalized by HHEM. This report adjusts for that.
     """
     n_total: int
     n_refusals: int
     n_evaluated: int  # n_total - n_refusals
@@ -334,6 +355,7 @@ class ClaimLevelReport:
     - min_score: Lowest scoring claim (weakest grounding)
     - pass_rate: Proportion of claims scoring >= threshold
     """
     n_explanations: int
     n_claims: int
     avg_score: float
@@ -368,6 +390,7 @@ class MultiMetricFaithfulnessReport:
     individually. Full-explanation HHEM (57%) measures structural patterns
     that HHEM was trained on, not actual hallucination."
     """
     n_samples: int
     # Quote verification (lexical)
     quote_verification_rate: float
@@ -410,19 +433,19 @@ class MultiMetricFaithfulnessReport:
             "=" * 50,
             "",
             "Quote Verification (lexical grounding):",
-            f"  Pass rate: {self.quote_verification_rate*100:.1f}% ({self.quotes_found}/{self.quotes_total})",
             "",
             "Claim-Level HHEM (semantic grounding per claim):",
-            f"  Pass rate: {self.claim_level_pass_rate*100:.1f}%",
             f"  Avg score: {self.claim_level_avg_score:.3f}",
             f"  Min score: {self.claim_level_min_score:.3f}",
             "",
             "Full-Explanation HHEM (structural compatibility):",
-            f"  Pass rate: {self.full_explanation_pass_rate*100:.1f}%",
             f"  Avg score: {self.full_explanation_avg_score:.3f}",
             "",
             "-" * 50,
-            f"PRIMARY METRIC ({self.primary_metric}): {self.claim_level_pass_rate*100:.1f}%",
         ]
         return "\n".join(lines)
@@ -431,9 +454,11 @@ class MultiMetricFaithfulnessReport:
 # FAITHFULNESS EVALUATION MODELS (RAGAS)
 # ============================================================================
 @dataclass
 class FaithfulnessResult:
     """Result of RAGAS faithfulness evaluation for a single explanation."""
     score: float
     query: str
     explanation: str
@@ -444,6 +469,7 @@ class FaithfulnessResult:
 @dataclass
 class FaithfulnessReport:
     """Aggregate report for batch faithfulness evaluation (legacy format)."""
     mean_score: float
     min_score: float
     max_score: float
@@ -459,6 +485,7 @@ class FaithfulnessReport:
 # EVALUATION METRICS MODELS
 # ============================================================================
 @dataclass
 class EvalCase:
     """
@@ -470,6 +497,7 @@ class EvalCase:
                        For binary relevance, use 1 for relevant, 0 for not.
         user_id: Optional user identifier for the case.
     """
     query: str
     relevant_items: dict[str, float]
     user_id: str | None = None
@@ -483,6 +511,7 @@ class EvalCase:
 @dataclass
 class EvalResult:
     """Results from evaluating a single recommendation list."""
     ndcg: float = 0.0
     hit: float = 0.0
     mrr: float = 0.0
@@ -498,6 +527,7 @@ class MetricsReport:
     Includes both accuracy metrics (NDCG, Hit, MRR) and
     beyond-accuracy metrics (diversity, coverage, novelty).
     """
     n_cases: int = 0
     ndcg_at_k: float = 0.0
     hit_at_k: float = 0.0

 # RETRIEVAL & RECOMMENDATION MODELS
 # ============================================================================
 class AggregationMethod(Enum):
     """Methods for aggregating chunk scores to product scores."""
     MAX = "max"
     MEAN = "mean"
     WEIGHTED_MEAN = "weighted_mean"
     This is the unit stored in the vector database. Reviews are chunked
     using semantic or sliding-window strategies based on length.
     """
     text: str
     chunk_index: int
     total_chunks: int
     This is returned by semantic search and used as evidence for
     explanation generation.
     """
     text: str
     score: float
     product_id: str
     Multiple chunks may belong to the same product. This dataclass
     holds the aggregated score and all supporting evidence.
     """
     product_id: str
     score: float
     chunk_count: int
     This is the output of the recommendation pipeline, ready for
     display or API response.
     """
     rank: int
     product_id: str
     score: float
 # COLD START MODELS
 # ============================================================================
 @dataclass
 class UserPreferences:
     """
     In production, these would be collected via an onboarding flow:
     "What categories interest you?" "What's your budget?" etc.
     """
     categories: list[str] | None = None
     budget: str | None = None  # "low", "medium", "high", or specific like "$50-100"
     priorities: list[str] | None = None  # ["quality", "value", "durability"]
     In production, this would come from the product catalog.
     """
     product_id: str
     title: str
     description: str | None = None
 # EXPLANATION MODELS
 # ============================================================================
 @dataclass
 class ExplanationResult:
     """
     Contains the generated explanation along with evidence attribution
     for traceability and faithfulness verification.
     """
     explanation: str
     product_id: str
     query: str
             print(token, end="", flush=True)
         result = stream.get_complete_result()
     """
     token_iterator: Iterator[str]
     product_id: str
     query: str
     when evidence is too thin. Thin evidence (1 chunk, few tokens)
     correlates strongly with LLM overclaiming.
     """
     is_sufficient: bool
     chunk_count: int
     total_tokens: int
 # VERIFICATION MODELS
 # ============================================================================
 @dataclass
 class QuoteVerification:
     """Result of verifying a single quoted claim against evidence."""
     quote: str
     found: bool
     source_text: str | None = None  # Which evidence text contained it
     exists in the provided evidence. Catches wrong attribution where
     LLM cites quotes that don't exist.
     """
     all_verified: bool
     quotes_found: int
     quotes_missing: int
 # HALLUCINATION DETECTION MODELS
 # ============================================================================
 @dataclass
 class HallucinationResult:
     """
     consistency between evidence (premise) and explanation (hypothesis).
     Score < 0.5 indicates hallucination.
     """
     score: float
     is_hallucinated: bool
     threshold: float
 @dataclass
 class ClaimResult:
     """Result of hallucination check for a single claim."""
     claim: str
     score: float
     is_hallucinated: bool
     Useful for understanding when the two methods disagree and
     calibrating thresholds.
     """
     n_samples: int
     agreement_rate: float  # Proportion where both agree on pass/fail
     hhem_pass_rate: float
     Refusals (e.g., "I cannot recommend...") are correct LLM behavior
     but get penalized by HHEM. This report adjusts for that.
     """
     n_total: int
     n_refusals: int
     n_evaluated: int  # n_total - n_refusals
     - min_score: Lowest scoring claim (weakest grounding)
     - pass_rate: Proportion of claims scoring >= threshold
     """
     n_explanations: int
     n_claims: int
     avg_score: float
     individually. Full-explanation HHEM (57%) measures structural patterns
     that HHEM was trained on, not actual hallucination."
     """
     n_samples: int
     # Quote verification (lexical)
     quote_verification_rate: float
             "=" * 50,
             "",
             "Quote Verification (lexical grounding):",
+            f"  Pass rate: {self.quote_verification_rate * 100:.1f}% ({self.quotes_found}/{self.quotes_total})",
             "",
             "Claim-Level HHEM (semantic grounding per claim):",
+            f"  Pass rate: {self.claim_level_pass_rate * 100:.1f}%",
             f"  Avg score: {self.claim_level_avg_score:.3f}",
             f"  Min score: {self.claim_level_min_score:.3f}",
             "",
             "Full-Explanation HHEM (structural compatibility):",
+            f"  Pass rate: {self.full_explanation_pass_rate * 100:.1f}%",
             f"  Avg score: {self.full_explanation_avg_score:.3f}",
             "",
             "-" * 50,
+            f"PRIMARY METRIC ({self.primary_metric}): {self.claim_level_pass_rate * 100:.1f}%",
         ]
         return "\n".join(lines)
 # FAITHFULNESS EVALUATION MODELS (RAGAS)
 # ============================================================================
 @dataclass
 class FaithfulnessResult:
     """Result of RAGAS faithfulness evaluation for a single explanation."""
     score: float
     query: str
     explanation: str
 @dataclass
 class FaithfulnessReport:
     """Aggregate report for batch faithfulness evaluation (legacy format)."""
     mean_score: float
     min_score: float
     max_score: float
 # EVALUATION METRICS MODELS
 # ============================================================================
 @dataclass
 class EvalCase:
     """
                        For binary relevance, use 1 for relevant, 0 for not.
         user_id: Optional user identifier for the case.
     """
     query: str
     relevant_items: dict[str, float]
     user_id: str | None = None
 @dataclass
 class EvalResult:
     """Results from evaluating a single recommendation list."""
     ndcg: float = 0.0
     hit: float = 0.0
     mrr: float = 0.0
     Includes both accuracy metrics (NDCG, Hit, MRR) and
     beyond-accuracy metrics (diversity, coverage, novelty).
     """
     n_cases: int = 0
     ndcg_at_k: float = 0.0
     hit_at_k: float = 0.0

sage/core/prompts.py CHANGED Viewed

@@ -76,7 +76,7 @@ def format_evidence(
         return "(No review evidence available)"
     return "\n\n".join(
-        f"[{chunk.review_id}] ({int(chunk.rating or 0)}/5 stars): \"{chunk.text}\""
         for chunk in chunks[:max_chunks]
     )

         return "(No review evidence available)"
     return "\n\n".join(
+        f'[{chunk.review_id}] ({int(chunk.rating or 0)}/5 stars): "{chunk.text}"'
         for chunk in chunks[:max_chunks]
     )

sage/core/verification.py CHANGED Viewed

@@ -86,9 +86,9 @@ def extract_quotes(text: str, min_length: int = 4) -> list[str]:
         List of unique quoted strings found in the text.
     """
     patterns = [
-        r'"([^"]+)"',      # Regular double quotes
-        r'"([^"]+)"',      # Curly double quotes
-        r"'([^']+)'",      # Single quotes
     ]
     quotes = []
@@ -206,6 +206,7 @@ def verify_explanation(
 # Citation ID Verification
 # =============================================================================
 @dataclass
 class CitationResult:
     """Result of verifying a single citation."""
@@ -256,12 +257,12 @@ def extract_citations(text: str) -> list[tuple[str, str | None]]:
         quote_text = match.group(1)
         citation_block = match.group(2)
         # Split multiple citations like "review_123, review_456"
-        for citation_id in re.findall(r'review_\d+', citation_block):
             citations.append((citation_id, quote_text))
     # Pattern for standalone citations not preceded by a quote
     # Find all citations, then filter out ones already captured with quotes
-    all_citation_ids = set(re.findall(r'review_\d+', text))
     quoted_citation_ids = {c[0] for c in citations}
     standalone_ids = all_citation_ids - quoted_citation_ids
@@ -294,9 +295,7 @@ def verify_citation(
     """
     # Collect all chunks belonging to this citation ID (a single review
     # may produce multiple chunks after splitting long reviews).
-    matching_indices = [
-        i for i, eid in enumerate(evidence_ids) if eid == citation_id
-    ]
     if not matching_indices:
         return CitationResult(

         List of unique quoted strings found in the text.
     """
     patterns = [
+        r'"([^"]+)"',  # Regular double quotes
+        r'"([^"]+)"',  # Curly double quotes
+        r"'([^']+)'",  # Single quotes
     ]
     quotes = []
 # Citation ID Verification
 # =============================================================================
 @dataclass
 class CitationResult:
     """Result of verifying a single citation."""
         quote_text = match.group(1)
         citation_block = match.group(2)
         # Split multiple citations like "review_123, review_456"
+        for citation_id in re.findall(r"review_\d+", citation_block):
             citations.append((citation_id, quote_text))
     # Pattern for standalone citations not preceded by a quote
     # Find all citations, then filter out ones already captured with quotes
+    all_citation_ids = set(re.findall(r"review_\d+", text))
     quoted_citation_ids = {c[0] for c in citations}
     standalone_ids = all_citation_ids - quoted_citation_ids
     """
     # Collect all chunks belonging to this citation ID (a single review
     # may produce multiple chunks after splitting long reviews).
+    matching_indices = [i for i, eid in enumerate(evidence_ids) if eid == citation_id]
     if not matching_indices:
         return CitationResult(

sage/services/__init__.py CHANGED Viewed

@@ -59,6 +59,7 @@ _LAZY_IMPORTS = {
 def __getattr__(name: str):
     """Module-level attribute hook that resolves lazily exported names.

     A name listed in ``_LAZY_IMPORTS`` is looked up on the module whose
     import path is stored under that name; any other name raises
     ``AttributeError``.
     """
     if name not in _LAZY_IMPORTS:
         raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

     # Function-scope import, as in the original implementation.
     import importlib

     target = importlib.import_module(_LAZY_IMPORTS[name])
     return getattr(target, name)

 def __getattr__(name: str):
     """Module-level attribute hook that resolves lazily exported names.

     A name listed in ``_LAZY_IMPORTS`` is looked up on the module whose
     import path is stored under that name; any other name raises
     ``AttributeError``.
     """
     if name not in _LAZY_IMPORTS:
         raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

     # Function-scope import, as in the original implementation.
     import importlib

     target = importlib.import_module(_LAZY_IMPORTS[name])
     return getattr(target, name)

sage/services/baselines.py CHANGED Viewed

@@ -73,9 +73,7 @@ class PopularityBaseline:
         counts = Counter(i[item_key] for i in interactions if item_key in i)
         # Sort by popularity (descending)
-        self.ranked_items = [
-            item for item, _ in counts.most_common()
-        ]
         self.popularity = counts
@@ -119,9 +117,9 @@ class ItemKNNBaseline:
             embedder: E5Embedder instance for query embedding.
         """
         self.product_ids = list(product_embeddings.keys())
-        self.embeddings = np.array([
-            product_embeddings[pid] for pid in self.product_ids
-        ])
         # Normalize embeddings for cosine similarity
         norms = np.linalg.norm(self.embeddings, axis=1, keepdims=True)
@@ -143,6 +141,7 @@ class ItemKNNBaseline:
         """
         if self.embedder is None:
             from sage.adapters.embeddings import get_embedder
             self.embedder = get_embedder()
         # Embed query
@@ -188,7 +187,9 @@ def build_product_embeddings(
         elif aggregation == "max":
             agg_emb = product_embs.max(axis=0)
         else:
-            raise ValueError(f"Unknown aggregation method: {aggregation}. Use 'mean' or 'max'.")
         # Normalize
         agg_emb = agg_emb / (np.linalg.norm(agg_emb) + 1e-8)

         counts = Counter(i[item_key] for i in interactions if item_key in i)
         # Sort by popularity (descending)
+        self.ranked_items = [item for item, _ in counts.most_common()]
         self.popularity = counts
             embedder: E5Embedder instance for query embedding.
         """
         self.product_ids = list(product_embeddings.keys())
+        self.embeddings = np.array(
+            [product_embeddings[pid] for pid in self.product_ids]
+        )
         # Normalize embeddings for cosine similarity
         norms = np.linalg.norm(self.embeddings, axis=1, keepdims=True)
         """
         if self.embedder is None:
             from sage.adapters.embeddings import get_embedder
             self.embedder = get_embedder()
         # Embed query
         elif aggregation == "max":
             agg_emb = product_embs.max(axis=0)
         else:
+            raise ValueError(
+                f"Unknown aggregation method: {aggregation}. Use 'mean' or 'max'."
+            )
         # Normalize
         agg_emb = agg_emb / (np.linalg.norm(agg_emb) + 1e-8)

sage/services/cache.py CHANGED Viewed

@@ -27,6 +27,7 @@ logger = get_logger(__name__)
 # Cache entry
 # ---------------------------------------------------------------------------
 @dataclass
 class _CacheEntry:
     """Single cached result with metadata for eviction."""
@@ -43,6 +44,7 @@ class _CacheEntry:
 # Cache stats
 # ---------------------------------------------------------------------------
 @dataclass
 class CacheStats:
     """Snapshot of cache performance metrics."""
@@ -73,6 +75,7 @@ class CacheStats:
 # Semantic cache
 # ---------------------------------------------------------------------------
 class SemanticCache:
     """Thread-safe in-memory cache with exact-match and semantic-similarity layers.
@@ -116,7 +119,9 @@ class SemanticCache:
     # ------------------------------------------------------------------
     def get(
-        self, query: str, query_embedding: np.ndarray | None = None,
     ) -> tuple[dict | None, str]:
         """Look up a cached result.
@@ -232,7 +237,8 @@ class SemanticCache:
     # ------------------------------------------------------------------
     def _find_semantic_match(
-        self, query_embedding: np.ndarray,
     ) -> tuple[_CacheEntry, float]:
         """Find the best semantic match among cached entries.

 # Cache entry
 # ---------------------------------------------------------------------------
 @dataclass
 class _CacheEntry:
     """Single cached result with metadata for eviction."""
 # Cache stats
 # ---------------------------------------------------------------------------
 @dataclass
 class CacheStats:
     """Snapshot of cache performance metrics."""
 # Semantic cache
 # ---------------------------------------------------------------------------
 class SemanticCache:
     """Thread-safe in-memory cache with exact-match and semantic-similarity layers.
     # ------------------------------------------------------------------
     def get(
+        self,
+        query: str,
+        query_embedding: np.ndarray | None = None,
     ) -> tuple[dict | None, str]:
         """Look up a cached result.
     # ------------------------------------------------------------------
     def _find_semantic_match(
+        self,
+        query_embedding: np.ndarray,
     ) -> tuple[_CacheEntry, float]:
         """Find the best semantic match among cached entries.

sage/services/cold_start.py CHANGED Viewed

@@ -102,6 +102,7 @@ class ColdStartService:
         """Lazy-load retrieval service."""
         if self._retrieval is None:
             from sage.services.retrieval import RetrievalService
             self._retrieval = RetrievalService(collection_name=self.collection_name)
         return self._retrieval
@@ -179,7 +180,9 @@ class ColdStartService:
         item_text = " ".join(text_parts)
         # Embed as a passage
-        item_embedding = self.embedder.embed_passages([item_text], show_progress=False)[0]
         # Search for similar chunks
         results = search(

         """Lazy-load retrieval service."""
         if self._retrieval is None:
             from sage.services.retrieval import RetrievalService
             self._retrieval = RetrievalService(collection_name=self.collection_name)
         return self._retrieval
         item_text = " ".join(text_parts)
         # Embed as a passage
+        item_embedding = self.embedder.embed_passages([item_text], show_progress=False)[
+            0
+        ]
         # Search for similar chunks
         results = search(

sage/services/evaluation.py CHANGED Viewed

@@ -263,7 +263,9 @@ class EvaluationService:
             ndcg_at_k=float(np.mean(ndcg_scores)) if ndcg_scores else 0.0,
             hit_at_k=float(np.mean(hit_scores)) if hit_scores else 0.0,
             mrr=float(np.mean(mrr_scores)) if mrr_scores else 0.0,
-            precision_at_k=float(np.mean(precision_scores)) if precision_scores else 0.0,
             recall_at_k=float(np.mean(recall_scores)) if recall_scores else 0.0,
             diversity=float(np.mean(diversity_scores)) if diversity_scores else 0.0,
             novelty=float(np.mean(novelty_scores)) if novelty_scores else 0.0,

             ndcg_at_k=float(np.mean(ndcg_scores)) if ndcg_scores else 0.0,
             hit_at_k=float(np.mean(hit_scores)) if hit_scores else 0.0,
             mrr=float(np.mean(mrr_scores)) if mrr_scores else 0.0,
+            precision_at_k=float(np.mean(precision_scores))
+            if precision_scores
+            else 0.0,
             recall_at_k=float(np.mean(recall_scores)) if recall_scores else 0.0,
             diversity=float(np.mean(diversity_scores)) if diversity_scores else 0.0,
             novelty=float(np.mean(novelty_scores)) if novelty_scores else 0.0,

sage/services/explanation.py CHANGED Viewed

@@ -109,8 +109,8 @@ class Explainer:
         Returns:
             (explanation, tokens, evidence_texts, evidence_ids, user_prompt).
         """
-        system_prompt, user_prompt, evidence_texts, evidence_ids = build_explanation_prompt(
-            query, product, max_evidence
         )
         t0 = time.perf_counter()
@@ -120,7 +120,9 @@ class Explainer:
         )
         logger.info(
             "LLM generation for %s: %.0fms, %d tokens",
-            product.product_id, (time.perf_counter() - t0) * 1000, tokens,
         )
         return explanation, tokens, evidence_texts, evidence_ids, user_prompt
@@ -241,8 +243,8 @@ class Explainer:
                 f"Client {type(self.client).__name__} does not support streaming."
             )
-        system_prompt, user_prompt, evidence_texts, evidence_ids = build_explanation_prompt(
-            query, product, max_evidence
         )
         token_iterator = self.client.generate_stream(
@@ -335,7 +337,11 @@ class Explainer:
         """
         return [
             self.generate_explanation(
-                query, product, max_evidence, enforce_quality_gate, enforce_forbidden_phrases
             )
             for product in products
         ]

         Returns:
             (explanation, tokens, evidence_texts, evidence_ids, user_prompt).
         """
+        system_prompt, user_prompt, evidence_texts, evidence_ids = (
+            build_explanation_prompt(query, product, max_evidence)
         )
         t0 = time.perf_counter()
         )
         logger.info(
             "LLM generation for %s: %.0fms, %d tokens",
+            product.product_id,
+            (time.perf_counter() - t0) * 1000,
+            tokens,
         )
         return explanation, tokens, evidence_texts, evidence_ids, user_prompt
                 f"Client {type(self.client).__name__} does not support streaming."
             )
+        system_prompt, user_prompt, evidence_texts, evidence_ids = (
+            build_explanation_prompt(query, product, max_evidence)
         )
         token_iterator = self.client.generate_stream(
         """
         return [
             self.generate_explanation(
+                query,
+                product,
+                max_evidence,
+                enforce_quality_gate,
+                enforce_forbidden_phrases,
             )
             for product in products
         ]

sage/services/faithfulness.py CHANGED Viewed

@@ -73,7 +73,9 @@ def create_ragas_sample(query: str, explanation: str, evidence_texts: list[str])
     )
-def _explanation_results_to_samples(explanation_results: list[ExplanationResult]) -> list:
     """Convert ExplanationResults to RAGAS samples."""
     return [
         create_ragas_sample(
@@ -239,6 +241,7 @@ class FaithfulnessEvaluator:
             results=individual_results,
         )
 def evaluate_faithfulness(
     explanation_results: list[ExplanationResult],
     provider: str | None = None,
@@ -396,7 +399,8 @@ def compute_adjusted_faithfulness(
     # - Valid non-recommendations count as passes (correct behavior)
     # - Regular recommendations evaluated by HHEM
     regular_passes = sum(
-        1 for r, is_non_rec in zip(results, valid_non_recs)
         if not is_non_rec and not r.is_hallucinated
     )
     adjusted_passes = regular_passes + n_valid_non_recs
@@ -594,14 +598,13 @@ def compute_multi_metric_faithfulness(
     detector = get_detector()
     # 1. Full-explanation HHEM (structural)
-    full_scores = [
-        detector.check_explanation(ev, exp).score
-        for ev, exp in items
-    ]
     # 2. Claim-level HHEM
     claim_report = compute_claim_level_hhem(
-        items, threshold, full_explanation_scores=full_scores,
     )
     # 3. Quote verification (lexical)

     )
+def _explanation_results_to_samples(
+    explanation_results: list[ExplanationResult],
+) -> list:
     """Convert ExplanationResults to RAGAS samples."""
     return [
         create_ragas_sample(
             results=individual_results,
         )
 def evaluate_faithfulness(
     explanation_results: list[ExplanationResult],
     provider: str | None = None,
     # - Valid non-recommendations count as passes (correct behavior)
     # - Regular recommendations evaluated by HHEM
     regular_passes = sum(
+        1
+        for r, is_non_rec in zip(results, valid_non_recs)
         if not is_non_rec and not r.is_hallucinated
     )
     adjusted_passes = regular_passes + n_valid_non_recs
     detector = get_detector()
     # 1. Full-explanation HHEM (structural)
+    full_scores = [detector.check_explanation(ev, exp).score for ev, exp in items]
     # 2. Claim-level HHEM
     claim_report = compute_claim_level_hhem(
+        items,
+        threshold,
+        full_explanation_scores=full_scores,
     )
     # 3. Quote verification (lexical)

sage/services/retrieval.py CHANGED Viewed

@@ -138,7 +138,11 @@ class RetrievalService:
             limit=limit,
             min_rating=min_rating,
         )
-        logger.info("Qdrant search: %.0fms, %d results", (time.perf_counter() - t0) * 1000, len(results))
         chunks = []
         for r in results:
@@ -157,7 +161,9 @@ class RetrievalService:
             )
         product_ids = {c.product_id for c in chunks}
-        logger.info("Retrieved %d chunks across %d products", len(chunks), len(product_ids))
         return chunks
@@ -247,7 +253,11 @@ def retrieve_chunks(
     """Retrieve relevant chunks from the vector store."""
     service = RetrievalService(client=client, embedder=embedder)
     return service.retrieve_chunks(
-        query, limit, min_rating, exclude_products, query_embedding,
     )
@@ -347,7 +357,8 @@ def recommend_for_user(
     # Get products to exclude
     exclude: set[str] = {
-        pid for r in user_history
         if (pid := r.get("product_id")) is not None and isinstance(pid, str)
     }

             limit=limit,
             min_rating=min_rating,
         )
+        logger.info(
+            "Qdrant search: %.0fms, %d results",
+            (time.perf_counter() - t0) * 1000,
+            len(results),
+        )
         chunks = []
         for r in results:
             )
         product_ids = {c.product_id for c in chunks}
+        logger.info(
+            "Retrieved %d chunks across %d products", len(chunks), len(product_ids)
+        )
         return chunks
     """Retrieve relevant chunks from the vector store."""
     service = RetrievalService(client=client, embedder=embedder)
     return service.retrieve_chunks(
+        query,
+        limit,
+        min_rating,
+        exclude_products,
+        query_embedding,
     )
     # Get products to exclude
     exclude: set[str] = {
+        pid
+        for r in user_history
         if (pid := r.get("product_id")) is not None and isinstance(pid, str)
     }

sage/utils.py CHANGED Viewed

@@ -20,6 +20,7 @@ def save_results(data: dict, prefix: str, directory: Path | None = None) -> Path
     """
     if directory is None:
         from sage.config import RESULTS_DIR
         directory = RESULTS_DIR
     directory.mkdir(parents=True, exist_ok=True)

     """
     if directory is None:
         from sage.config import RESULTS_DIR
         directory = RESULTS_DIR
     directory.mkdir(parents=True, exist_ok=True)

scripts/build_eval_dataset.py CHANGED Viewed

@@ -35,27 +35,199 @@ EVAL_DIR = DATA_DIR / "eval"
 # Common stopwords to filter out
 STOPWORDS = {
-    "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your",
-    "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she",
-    "her", "hers", "herself", "it", "its", "itself", "they", "them", "their",
-    "theirs", "themselves", "what", "which", "who", "whom", "this", "that",
-    "these", "those", "am", "is", "are", "was", "were", "be", "been", "being",
-    "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an",
-    "the", "and", "but", "if", "or", "because", "as", "until", "while", "of",
-    "at", "by", "for", "with", "about", "against", "between", "into", "through",
-    "during", "before", "after", "above", "below", "to", "from", "up", "down",
-    "in", "out", "on", "off", "over", "under", "again", "further", "then",
-    "once", "here", "there", "when", "where", "why", "how", "all", "each",
-    "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only",
-    "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just",
-    "don", "should", "now", "d", "ll", "m", "o", "re", "ve", "y", "ain", "aren",
-    "couldn", "didn", "doesn", "hadn", "hasn", "haven", "isn", "ma", "mightn",
-    "mustn", "needn", "shan", "shouldn", "wasn", "weren", "won", "wouldn",
-    "also", "would", "could", "get", "got", "one", "two", "really", "like",
-    "just", "even", "well", "much", "still", "back", "way", "thing", "things",
-    "make", "made", "work", "works", "worked", "use", "used", "using", "good",
-    "great", "nice", "product", "item", "bought", "buy", "amazon", "review",
-    "ordered", "order", "received", "came", "arrived", "shipping", "shipped",
 }
@@ -75,7 +247,7 @@ def extract_keywords(text: str, max_keywords: int = 8) -> list[str]:
     # Clean text
     text = text.lower()
     text = re.sub(r"<br\s*/?>", " ", text)  # Remove HTML breaks
-    text = re.sub(r"[^a-z\s]", " ", text)   # Keep only letters
     text = re.sub(r"\s+", " ", text).strip()
     # Tokenize and filter
@@ -165,6 +337,7 @@ def generate_query_from_history(
 # Evaluation Dataset Construction
 # ---------------------------------------------------------------------------
 def build_leave_one_out_cases(
     df: pd.DataFrame,
     min_reviews: int = 2,
@@ -231,16 +404,21 @@ def build_leave_one_out_cases(
         # Only include if target has positive relevance
         if relevance > 0:
-            eval_cases.append(EvalCase(
-                query=query,
-                relevant_items={target_product: relevance},
-                user_id=user_id,
-            ))
     if verbose:
         logger.info("Users with enough reviews: %d", len(user_groups) - skipped_users)
         logger.info("Eval cases created: %d", len(eval_cases))
-        logger.info("Skipped (low relevance): %d", len(user_groups) - skipped_users - len(eval_cases))
     return eval_cases
@@ -310,16 +488,20 @@ def build_multi_relevant_cases(
                 )
         if relevant_items:
-            eval_cases.append(EvalCase(
-                query=query,
-                relevant_items=relevant_items,
-                user_id=user_id,
-            ))
     if verbose:
         logger.info("Users with train history: %d", len(train_users))
         logger.info("Eval cases created: %d", len(eval_cases))
-        avg_relevant = np.mean([len(c.relevant_items) for c in eval_cases]) if eval_cases else 0
         logger.info("Avg relevant items per case: %.1f", avg_relevant)
     return eval_cases
@@ -400,7 +582,12 @@ if __name__ == "__main__":
     # Load splits
     log_section(logger, "Loading data splits")
     train_df, val_df, test_df = load_splits()
-    logger.info("Train: %s | Val: %s | Test: %s", f"{len(train_df):,}", f"{len(val_df):,}", f"{len(test_df):,}")
     # Strategy 1: Leave-one-out with keyword queries
     # WARNING: This strategy has TARGET LEAKAGE - queries are generated from
@@ -418,8 +605,12 @@ if __name__ == "__main__":
     # Show examples
     logger.info("Sample queries:")
     for case in loo_keyword_cases[:5]:
-        logger.info("  Query: \"%s\"", case.query)
-        logger.info("  Target: %s (rel=%s)", list(case.relevant_items.keys())[0], list(case.relevant_items.values())[0])
     save_eval_cases(loo_keyword_cases, "eval_loo_keyword.json")
@@ -435,8 +626,12 @@ if __name__ == "__main__":
     # Show examples
     logger.info("Sample queries:")
     for case in loo_history_cases[:5]:
-        logger.info("  Query: \"%s\"", case.query)
-        logger.info("  Target: %s (rel=%s)", list(case.relevant_items.keys())[0], list(case.relevant_items.values())[0])
     save_eval_cases(loo_history_cases, "eval_loo_history.json")
@@ -452,7 +647,7 @@ if __name__ == "__main__":
     if multi_cases:
         logger.info("Sample queries:")
         for case in multi_cases[:3]:
-            logger.info("  Query: \"%s...\"", case.query[:60])
             logger.info("  Relevant: %d items", len(case.relevant_items))
         save_eval_cases(multi_cases, "eval_multi_relevant.json")

 # Common stopwords to filter out
 STOPWORDS = {
+    "i",
+    "me",
+    "my",
+    "myself",
+    "we",
+    "our",
+    "ours",
+    "ourselves",
+    "you",
+    "your",
+    "yours",
+    "yourself",
+    "yourselves",
+    "he",
+    "him",
+    "his",
+    "himself",
+    "she",
+    "her",
+    "hers",
+    "herself",
+    "it",
+    "its",
+    "itself",
+    "they",
+    "them",
+    "their",
+    "theirs",
+    "themselves",
+    "what",
+    "which",
+    "who",
+    "whom",
+    "this",
+    "that",
+    "these",
+    "those",
+    "am",
+    "is",
+    "are",
+    "was",
+    "were",
+    "be",
+    "been",
+    "being",
+    "have",
+    "has",
+    "had",
+    "having",
+    "do",
+    "does",
+    "did",
+    "doing",
+    "a",
+    "an",
+    "the",
+    "and",
+    "but",
+    "if",
+    "or",
+    "because",
+    "as",
+    "until",
+    "while",
+    "of",
+    "at",
+    "by",
+    "for",
+    "with",
+    "about",
+    "against",
+    "between",
+    "into",
+    "through",
+    "during",
+    "before",
+    "after",
+    "above",
+    "below",
+    "to",
+    "from",
+    "up",
+    "down",
+    "in",
+    "out",
+    "on",
+    "off",
+    "over",
+    "under",
+    "again",
+    "further",
+    "then",
+    "once",
+    "here",
+    "there",
+    "when",
+    "where",
+    "why",
+    "how",
+    "all",
+    "each",
+    "few",
+    "more",
+    "most",
+    "other",
+    "some",
+    "such",
+    "no",
+    "nor",
+    "not",
+    "only",
+    "own",
+    "same",
+    "so",
+    "than",
+    "too",
+    "very",
+    "s",
+    "t",
+    "can",
+    "will",
+    "just",
+    "don",
+    "should",
+    "now",
+    "d",
+    "ll",
+    "m",
+    "o",
+    "re",
+    "ve",
+    "y",
+    "ain",
+    "aren",
+    "couldn",
+    "didn",
+    "doesn",
+    "hadn",
+    "hasn",
+    "haven",
+    "isn",
+    "ma",
+    "mightn",
+    "mustn",
+    "needn",
+    "shan",
+    "shouldn",
+    "wasn",
+    "weren",
+    "won",
+    "wouldn",
+    "also",
+    "would",
+    "could",
+    "get",
+    "got",
+    "one",
+    "two",
+    "really",
+    "like",
+    "just",
+    "even",
+    "well",
+    "much",
+    "still",
+    "back",
+    "way",
+    "thing",
+    "things",
+    "make",
+    "made",
+    "work",
+    "works",
+    "worked",
+    "use",
+    "used",
+    "using",
+    "good",
+    "great",
+    "nice",
+    "product",
+    "item",
+    "bought",
+    "buy",
+    "amazon",
+    "review",
+    "ordered",
+    "order",
+    "received",
+    "came",
+    "arrived",
+    "shipping",
+    "shipped",
 }
     # Clean text
     text = text.lower()
     text = re.sub(r"<br\s*/?>", " ", text)  # Remove HTML breaks
+    text = re.sub(r"[^a-z\s]", " ", text)  # Keep only letters
     text = re.sub(r"\s+", " ", text).strip()
     # Tokenize and filter
 # Evaluation Dataset Construction
 # ---------------------------------------------------------------------------
 def build_leave_one_out_cases(
     df: pd.DataFrame,
     min_reviews: int = 2,
         # Only include if target has positive relevance
         if relevance > 0:
+            eval_cases.append(
+                EvalCase(
+                    query=query,
+                    relevant_items={target_product: relevance},
+                    user_id=user_id,
+                )
+            )
     if verbose:
         logger.info("Users with enough reviews: %d", len(user_groups) - skipped_users)
         logger.info("Eval cases created: %d", len(eval_cases))
+        logger.info(
+            "Skipped (low relevance): %d",
+            len(user_groups) - skipped_users - len(eval_cases),
+        )
     return eval_cases
                 )
         if relevant_items:
+            eval_cases.append(
+                EvalCase(
+                    query=query,
+                    relevant_items=relevant_items,
+                    user_id=user_id,
+                )
+            )
     if verbose:
         logger.info("Users with train history: %d", len(train_users))
         logger.info("Eval cases created: %d", len(eval_cases))
+        avg_relevant = (
+            np.mean([len(c.relevant_items) for c in eval_cases]) if eval_cases else 0
+        )
         logger.info("Avg relevant items per case: %.1f", avg_relevant)
     return eval_cases
     # Load splits
     log_section(logger, "Loading data splits")
     train_df, val_df, test_df = load_splits()
+    logger.info(
+        "Train: %s | Val: %s | Test: %s",
+        f"{len(train_df):,}",
+        f"{len(val_df):,}",
+        f"{len(test_df):,}",
+    )
     # Strategy 1: Leave-one-out with keyword queries
     # WARNING: This strategy has TARGET LEAKAGE - queries are generated from
     # Show examples
     logger.info("Sample queries:")
     for case in loo_keyword_cases[:5]:
+        logger.info('  Query: "%s"', case.query)
+        logger.info(
+            "  Target: %s (rel=%s)",
+            list(case.relevant_items.keys())[0],
+            list(case.relevant_items.values())[0],
+        )
     save_eval_cases(loo_keyword_cases, "eval_loo_keyword.json")
     # Show examples
     logger.info("Sample queries:")
     for case in loo_history_cases[:5]:
+        logger.info('  Query: "%s"', case.query)
+        logger.info(
+            "  Target: %s (rel=%s)",
+            list(case.relevant_items.keys())[0],
+            list(case.relevant_items.values())[0],
+        )
     save_eval_cases(loo_history_cases, "eval_loo_history.json")
     if multi_cases:
         logger.info("Sample queries:")
         for case in multi_cases[:3]:
+            logger.info('  Query: "%s..."', case.query[:60])
             logger.info("  Relevant: %d items", len(case.relevant_items))
         save_eval_cases(multi_cases, "eval_multi_relevant.json")

scripts/build_natural_eval_dataset.py CHANGED Viewed

@@ -70,7 +70,6 @@ NATURAL_QUERIES = [
         "category": "echo_devices",
         "intent": "feature_specific",
     },
     # === FIRE TABLET QUERIES ===
     {
         "query": "tablet for reading books and light browsing",
@@ -111,7 +110,6 @@ NATURAL_QUERIES = [
         "category": "fire_tablets",
         "intent": "use_case",
     },
     # === FIRE TV / STREAMING QUERIES ===
     {
         "query": "streaming device for my tv",
@@ -151,7 +149,6 @@ NATURAL_QUERIES = [
         "category": "fire_tv",
         "intent": "use_case",
     },
     # === SMART HOME QUERIES ===
     {
         "query": "smart plug to control lights with alexa",
@@ -191,7 +188,6 @@ NATURAL_QUERIES = [
         "category": "smart_home",
         "intent": "feature_specific",
     },
     # === STORAGE QUERIES ===
     {
         "query": "sd card for camera",
@@ -232,7 +228,6 @@ NATURAL_QUERIES = [
         "category": "storage",
         "intent": "feature_specific",
     },
     # === HEADPHONES / AUDIO QUERIES ===
     {
         "query": "wireless headphones for working out",
@@ -283,7 +278,6 @@ NATURAL_QUERIES = [
         "category": "headphones_audio",
         "intent": "use_case",
     },
     # === CABLES / ADAPTERS QUERIES ===
     {
         "query": "usb c charging cable",
@@ -322,7 +316,6 @@ NATURAL_QUERIES = [
         "category": "cables_adapters",
         "intent": "feature_specific",
     },
     # === KEYBOARD / MOUSE QUERIES ===
     {
         "query": "wireless keyboard for computer",
@@ -353,7 +346,6 @@ NATURAL_QUERIES = [
         "category": "keyboards_mice",
         "intent": "feature_specific",
     },
     # === GIFT QUERIES ===
     {
         "query": "gift for someone who likes music",
@@ -395,7 +387,6 @@ NATURAL_QUERIES = [
         "category": "gifts",
         "intent": "gift",
     },
     # === PROBLEM-SOLVING QUERIES ===
     {
         "query": "headphones that dont hurt ears",
@@ -424,7 +415,6 @@ NATURAL_QUERIES = [
         "category": "fire_tv",
         "intent": "problem_solving",
     },
     # === COMPARISON / BEST QUERIES ===
     {
         "query": "best value fire tablet",
@@ -460,15 +450,19 @@ def build_natural_eval_cases() -> list[EvalCase]:
     """Convert natural queries to EvalCase objects."""
     cases = []
     for item in NATURAL_QUERIES:
-        cases.append(EvalCase(
-            query=item["query"],
-            relevant_items=item["relevant_items"],
-            user_id=None,  # No user for natural queries
-        ))
     return cases
-def save_natural_eval_cases(cases: list[EvalCase], filename: str = "eval_natural_queries.json"):
     """Save evaluation cases with metadata."""
     EVAL_DIR.mkdir(exist_ok=True)
     filepath = EVAL_DIR / filename
@@ -476,12 +470,14 @@ def save_natural_eval_cases(cases: list[EvalCase], filename: str = "eval_natural
     # Include metadata for analysis
     data = []
     for i, item in enumerate(NATURAL_QUERIES):
-        data.append({
-            "query": item["query"],
-            "relevant_items": item["relevant_items"],
-            "category": item.get("category", "unknown"),
-            "intent": item.get("intent", "general"),
-        })
     with open(filepath, "w") as f:
         json.dump(data, f, indent=2)
@@ -522,9 +518,9 @@ def analyze_dataset():
     # Sample queries
     log_section(logger, "SAMPLE QUERIES")
     for q in NATURAL_QUERIES[:5]:
-        logger.info("Query: \"%s\"", q['query'])
-        logger.info("  Category: %s | Intent: %s", q['category'], q['intent'])
-        logger.info("  Relevant: %d products", len(q['relevant_items']))
 if __name__ == "__main__":

         "category": "echo_devices",
         "intent": "feature_specific",
     },
     # === FIRE TABLET QUERIES ===
     {
         "query": "tablet for reading books and light browsing",
         "category": "fire_tablets",
         "intent": "use_case",
     },
     # === FIRE TV / STREAMING QUERIES ===
     {
         "query": "streaming device for my tv",
         "category": "fire_tv",
         "intent": "use_case",
     },
     # === SMART HOME QUERIES ===
     {
         "query": "smart plug to control lights with alexa",
         "category": "smart_home",
         "intent": "feature_specific",
     },
     # === STORAGE QUERIES ===
     {
         "query": "sd card for camera",
         "category": "storage",
         "intent": "feature_specific",
     },
     # === HEADPHONES / AUDIO QUERIES ===
     {
         "query": "wireless headphones for working out",
         "category": "headphones_audio",
         "intent": "use_case",
     },
     # === CABLES / ADAPTERS QUERIES ===
     {
         "query": "usb c charging cable",
         "category": "cables_adapters",
         "intent": "feature_specific",
     },
     # === KEYBOARD / MOUSE QUERIES ===
     {
         "query": "wireless keyboard for computer",
         "category": "keyboards_mice",
         "intent": "feature_specific",
     },
     # === GIFT QUERIES ===
     {
         "query": "gift for someone who likes music",
         "category": "gifts",
         "intent": "gift",
     },
     # === PROBLEM-SOLVING QUERIES ===
     {
         "query": "headphones that dont hurt ears",
         "category": "fire_tv",
         "intent": "problem_solving",
     },
     # === COMPARISON / BEST QUERIES ===
     {
         "query": "best value fire tablet",
     """Convert natural queries to EvalCase objects."""
     cases = []
     for item in NATURAL_QUERIES:
+        cases.append(
+            EvalCase(
+                query=item["query"],
+                relevant_items=item["relevant_items"],
+                user_id=None,  # No user for natural queries
+            )
+        )
     return cases
+def save_natural_eval_cases(
+    cases: list[EvalCase], filename: str = "eval_natural_queries.json"
+):
     """Save evaluation cases with metadata."""
     EVAL_DIR.mkdir(exist_ok=True)
     filepath = EVAL_DIR / filename
     # Include metadata for analysis
     data = []
     for i, item in enumerate(NATURAL_QUERIES):
+        data.append(
+            {
+                "query": item["query"],
+                "relevant_items": item["relevant_items"],
+                "category": item.get("category", "unknown"),
+                "intent": item.get("intent", "general"),
+            }
+        )
     with open(filepath, "w") as f:
         json.dump(data, f, indent=2)
     # Sample queries
     log_section(logger, "SAMPLE QUERIES")
     for q in NATURAL_QUERIES[:5]:
+        logger.info('Query: "%s"', q["query"])
+        logger.info("  Category: %s | Intent: %s", q["category"], q["intent"])
+        logger.info("  Relevant: %d products", len(q["relevant_items"]))
 if __name__ == "__main__":

scripts/demo.py CHANGED Viewed

@@ -31,7 +31,7 @@ def demo_recommendation(query: str, top_k: int = 3, max_evidence: int = 3):
     Returns dict suitable for JSON serialization.
     """
     log_banner(logger, "SAGE RECOMMENDATION DEMO", width=70)
-    logger.info("Query: \"%s\"", query)
     # Get candidates
     products = get_candidates(
@@ -91,7 +91,7 @@ def demo_recommendation(query: str, top_k: int = 3, max_evidence: int = 3):
             # Truncate long evidence for display
             display_text = ev_text[:200] + "..." if len(ev_text) > 200 else ev_text
             logger.info("[%s]:", ev_id)
-            logger.info("  \"%s\"", display_text)
         # Compile result
         result = {
@@ -108,8 +108,7 @@ def demo_recommendation(query: str, top_k: int = 3, max_evidence: int = 3):
             "evidence_sources": [
                 {"id": ev_id, "text": ev_text}
                 for ev_id, ev_text in zip(
-                    explanation_result.evidence_ids,
-                    explanation_result.evidence_texts
                 )
             ],
         }
@@ -131,13 +130,15 @@ def demo_recommendation(query: str, top_k: int = 3, max_evidence: int = 3):
 def main():
     parser = argparse.ArgumentParser(description="Demo recommendation pipeline")
     parser.add_argument(
-        "--query", "-q",
         type=str,
         default="wireless earbuds for running",
         help="Query to demonstrate",
     )
     parser.add_argument(
-        "--top-k", "-k",
         type=int,
         default=1,
         help="Number of products to recommend (default: 1)",

     Returns dict suitable for JSON serialization.
     """
     log_banner(logger, "SAGE RECOMMENDATION DEMO", width=70)
+    logger.info('Query: "%s"', query)
     # Get candidates
     products = get_candidates(
             # Truncate long evidence for display
             display_text = ev_text[:200] + "..." if len(ev_text) > 200 else ev_text
             logger.info("[%s]:", ev_id)
+            logger.info('  "%s"', display_text)
         # Compile result
         result = {
             "evidence_sources": [
                 {"id": ev_id, "text": ev_text}
                 for ev_id, ev_text in zip(
+                    explanation_result.evidence_ids, explanation_result.evidence_texts
                 )
             ],
         }
 def main():
     parser = argparse.ArgumentParser(description="Demo recommendation pipeline")
     parser.add_argument(
+        "--query",
+        "-q",
         type=str,
         default="wireless earbuds for running",
         help="Query to demonstrate",
     )
     parser.add_argument(
+        "--top-k",
+        "-k",
         type=int,
         default=1,
         help="Number of products to recommend (default: 1)",

scripts/e2e_success_rate.py CHANGED Viewed

@@ -149,7 +149,7 @@ def run_e2e_evaluation(n_samples: int = 20) -> E2EReport:
     case_id = 0
     for query in queries:
-        logger.info("Query: \"%s\"", query)
         products = get_candidates(
             query=query,
@@ -275,23 +275,43 @@ def run_e2e_evaluation(n_samples: int = 20) -> E2EReport:
     n_evidence_insufficient = sum(1 for c in all_cases if not c.evidence_sufficient)
     n_generated = sum(1 for c in all_cases if c.evidence_sufficient)
     n_forbidden_violations = sum(1 for c in all_cases if c.has_forbidden_phrases)
-    n_hhem_failures = sum(1 for c in all_cases if c.evidence_sufficient and not c.hhem_pass and not c.is_valid_non_recommendation)
     n_valid_non_recs = sum(1 for c in all_cases if c.is_valid_non_recommendation)
     # Success counts
     n_raw_success = sum(1 for c in all_cases if c.e2e_success)
-    n_adjusted_success = n_raw_success + n_valid_non_recs  # Valid non-recs are correct behavior
     # Rates
     evidence_pass_rate = n_generated / n_total if n_total > 0 else 0
     # Forbidden phrase compliance among generated explanations
     generated_cases = [c for c in all_cases if c.evidence_sufficient]
-    phrase_compliance = sum(1 for c in generated_cases if not c.has_forbidden_phrases) / len(generated_cases) if generated_cases else 0
     # HHEM pass rate among non-refusal generated explanations
-    non_refusal_generated = [c for c in generated_cases if not c.is_valid_non_recommendation]
-    hhem_pass_rate = sum(1 for c in non_refusal_generated if c.hhem_pass) / len(non_refusal_generated) if non_refusal_generated else 0
     raw_e2e = n_raw_success / n_total if n_total > 0 else 0
     adjusted_e2e = n_adjusted_success / n_total if n_total > 0 else 0
@@ -321,11 +341,31 @@ def run_e2e_evaluation(n_samples: int = 20) -> E2EReport:
     log_section(logger, "Stage Breakdown")
     logger.info("Total cases:              %d", n_total)
-    logger.info("Evidence insufficient:    %d (%.1f%%)", n_evidence_insufficient, n_evidence_insufficient / n_total * 100)
-    logger.info("Generated explanations:   %d (%.1f%%)", n_generated, n_generated / n_total * 100)
-    logger.info("Forbidden phrase fails:   %d (%.1f%%)", n_forbidden_violations, n_forbidden_violations / n_total * 100)
-    logger.info("HHEM failures:            %d (%.1f%%)", n_hhem_failures, n_hhem_failures / n_total * 100)
-    logger.info("Valid non-recommendations:%d (%.1f%%)", n_valid_non_recs, n_valid_non_recs / n_total * 100)
     log_section(logger, "Component Rates")
     logger.info("Evidence pass rate:       %.1f%%", evidence_pass_rate * 100)
@@ -333,8 +373,18 @@ def run_e2e_evaluation(n_samples: int = 20) -> E2EReport:
     logger.info("HHEM pass rate:           %.1f%%", hhem_pass_rate * 100)
     log_section(logger, "END-TO-END SUCCESS RATES")
-    logger.info("Raw E2E success:          %d/%d = %.1f%%", n_raw_success, n_total, raw_e2e * 100)
-    logger.info("Adjusted E2E success:     %d/%d = %.1f%%", n_adjusted_success, n_total, adjusted_e2e * 100)
     logger.info("Target:                   %.1f%%", target * 100)
     logger.info("Gap to target:            %.1f%%", report.gap_to_target * 100)
     logger.info("Meets target:             %s", "YES" if report.meets_target else "NO")
@@ -355,7 +405,9 @@ def run_e2e_evaluation(n_samples: int = 20) -> E2EReport:
         "cases": [c.to_dict() for c in all_cases],
     }
-    output_file = RESULTS_DIR / f"e2e_success_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
     with open(output_file, "w") as f:
         json.dump(output, f, indent=2)
     logger.info("Saved: %s", output_file)

     case_id = 0
     for query in queries:
+        logger.info('Query: "%s"', query)
         products = get_candidates(
             query=query,
     n_evidence_insufficient = sum(1 for c in all_cases if not c.evidence_sufficient)
     n_generated = sum(1 for c in all_cases if c.evidence_sufficient)
     n_forbidden_violations = sum(1 for c in all_cases if c.has_forbidden_phrases)
+    n_hhem_failures = sum(
+        1
+        for c in all_cases
+        if c.evidence_sufficient
+        and not c.hhem_pass
+        and not c.is_valid_non_recommendation
+    )
     n_valid_non_recs = sum(1 for c in all_cases if c.is_valid_non_recommendation)
     # Success counts
     n_raw_success = sum(1 for c in all_cases if c.e2e_success)
+    n_adjusted_success = (
+        n_raw_success + n_valid_non_recs
+    )  # Valid non-recs are correct behavior
     # Rates
     evidence_pass_rate = n_generated / n_total if n_total > 0 else 0
     # Forbidden phrase compliance among generated explanations
     generated_cases = [c for c in all_cases if c.evidence_sufficient]
+    phrase_compliance = (
+        sum(1 for c in generated_cases if not c.has_forbidden_phrases)
+        / len(generated_cases)
+        if generated_cases
+        else 0
+    )
     # HHEM pass rate among non-refusal generated explanations
+    non_refusal_generated = [
+        c for c in generated_cases if not c.is_valid_non_recommendation
+    ]
+    hhem_pass_rate = (
+        sum(1 for c in non_refusal_generated if c.hhem_pass)
+        / len(non_refusal_generated)
+        if non_refusal_generated
+        else 0
+    )
     raw_e2e = n_raw_success / n_total if n_total > 0 else 0
     adjusted_e2e = n_adjusted_success / n_total if n_total > 0 else 0
     log_section(logger, "Stage Breakdown")
     logger.info("Total cases:              %d", n_total)
+    logger.info(
+        "Evidence insufficient:    %d (%.1f%%)",
+        n_evidence_insufficient,
+        n_evidence_insufficient / n_total * 100,
+    )
+    logger.info(
+        "Generated explanations:   %d (%.1f%%)",
+        n_generated,
+        n_generated / n_total * 100,
+    )
+    logger.info(
+        "Forbidden phrase fails:   %d (%.1f%%)",
+        n_forbidden_violations,
+        n_forbidden_violations / n_total * 100,
+    )
+    logger.info(
+        "HHEM failures:            %d (%.1f%%)",
+        n_hhem_failures,
+        n_hhem_failures / n_total * 100,
+    )
+    logger.info(
+        "Valid non-recommendations:%d (%.1f%%)",
+        n_valid_non_recs,
+        n_valid_non_recs / n_total * 100,
+    )
     log_section(logger, "Component Rates")
     logger.info("Evidence pass rate:       %.1f%%", evidence_pass_rate * 100)
     logger.info("HHEM pass rate:           %.1f%%", hhem_pass_rate * 100)
     log_section(logger, "END-TO-END SUCCESS RATES")
+    logger.info(
+        "Raw E2E success:          %d/%d = %.1f%%",
+        n_raw_success,
+        n_total,
+        raw_e2e * 100,
+    )
+    logger.info(
+        "Adjusted E2E success:     %d/%d = %.1f%%",
+        n_adjusted_success,
+        n_total,
+        adjusted_e2e * 100,
+    )
     logger.info("Target:                   %.1f%%", target * 100)
     logger.info("Gap to target:            %.1f%%", report.gap_to_target * 100)
     logger.info("Meets target:             %s", "YES" if report.meets_target else "NO")
         "cases": [c.to_dict() for c in all_cases],
     }
+    output_file = (
+        RESULTS_DIR / f"e2e_success_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
+    )
     with open(output_file, "w") as f:
         json.dump(output, f, indent=2)
     logger.info("Saved: %s", output_file)

scripts/eda.py CHANGED Viewed

@@ -17,19 +17,22 @@ FIGURES_DIR.mkdir(exist_ok=True)
 # Plot configuration
 plt.style.use("seaborn-v0_8-whitegrid")
-plt.rcParams.update({
-    "figure.figsize": (10, 5),
-    "figure.dpi": 100,
-    "savefig.dpi": 150,
-    "font.size": 11,
-    "axes.titlesize": 12,
-    "axes.labelsize": 11,
-    "figure.autolayout": True,
-})
 # Enable retina display for Jupyter notebooks
 try:
     from IPython import get_ipython
     if get_ipython() is not None:
         get_ipython().run_line_magic("matplotlib", "inline")
         get_ipython().run_line_magic("config", "InlineBackend.figure_format='retina'")
@@ -56,15 +59,23 @@ for key, value in stats.items():
 # %% Rating distribution
 fig, ax = plt.subplots()
 rating_counts = pd.Series(stats["rating_dist"])
-bars = ax.bar(rating_counts.index, rating_counts.values, color=PRIMARY_COLOR, edgecolor="black")
 ax.set_xlabel("Rating")
 ax.set_ylabel("Count")
 ax.set_title("Rating Distribution")
 ax.set_xticks(rating_counts.index)
 for bar, count in zip(bars, rating_counts.values):
-    ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 50,
-            f"{count:,}", ha="center", va="bottom", fontsize=10)
 plt.tight_layout()
 plt.savefig(FIGURES_DIR / "rating_distribution.png", dpi=150)
@@ -84,16 +95,25 @@ fig, axes = plt.subplots(1, 2, figsize=FIGURE_SIZE_WIDE)
 # Character length histogram
 ax1 = axes[0]
-df["text_length"].clip(upper=2000).hist(bins=50, ax=ax1, color=PRIMARY_COLOR, edgecolor="white")
 ax1.set_xlabel("Character Length (clipped at 2000)")
 ax1.set_ylabel("Count")
 ax1.set_title("Review Length Distribution")
-ax1.axvline(df["text_length"].median(), color="red", linestyle="--", label=f"Median: {df['text_length'].median():.0f}")
 ax1.legend()
 # Token estimate histogram
 ax2 = axes[1]
-df["estimated_tokens"].clip(upper=500).hist(bins=50, ax=ax2, color=SECONDARY_COLOR, edgecolor="white")
 ax2.set_xlabel("Estimated Tokens (clipped at 500)")
 ax2.set_ylabel("Count")
 ax2.set_title("Estimated Token Distribution")
@@ -108,12 +128,19 @@ needs_chunking = (df["estimated_tokens"] > 200).sum()
 print("\nReview length stats:")
 print(f"  Median characters: {df['text_length'].median():.0f}")
 print(f"  Median tokens (est): {df['estimated_tokens'].median():.0f}")
-print(f"  Reviews > 200 tokens: {needs_chunking:,} ({needs_chunking/len(df)*100:.1f}%)")
 # %% Review length by rating
 fig, ax = plt.subplots()
 length_by_rating = df.groupby("rating")["text_length"].median()
-bars = ax.bar(length_by_rating.index, length_by_rating.values, color=PRIMARY_COLOR, edgecolor="white")
 ax.set_xlabel("Rating")
 ax.set_ylabel("Median Review Length (chars)")
 ax.set_title("Review Length by Rating")
@@ -134,7 +161,9 @@ df["year_month"] = df["datetime"].dt.to_period("M")
 reviews_over_time = df.groupby("year_month").size()
 fig, ax = plt.subplots(figsize=FIGURE_SIZE_WIDE)
-reviews_over_time.plot(kind="line", ax=ax, marker="o", markersize=3, linewidth=1, color=PRIMARY_COLOR)
 ax.set_xlabel("Month")
 ax.set_ylabel("Number of Reviews")
 ax.set_title("Reviews Over Time")
@@ -156,7 +185,7 @@ missing = df.isnull().sum()
 print("\nMissing values:")
 for col, count in missing.items():
     if count > 0:
-        print(f"  {col}: {count:,} ({count/len(df)*100:.2f}%)")
 if missing.sum() == 0:
     print("  None!")
@@ -185,14 +214,18 @@ fig, axes = plt.subplots(1, 2, figsize=FIGURE_SIZE_WIDE)
 # Reviews per user
 ax1 = axes[0]
-user_counts.clip(upper=20).value_counts().sort_index().plot(kind="bar", ax=ax1, color=PRIMARY_COLOR)
 ax1.set_xlabel("Reviews per User")
 ax1.set_ylabel("Number of Users")
 ax1.set_title("User Activity Distribution")
 # Reviews per item
 ax2 = axes[1]
-item_counts.clip(upper=20).value_counts().sort_index().plot(kind="bar", ax=ax2, color=SECONDARY_COLOR)
 ax2.set_xlabel("Reviews per Item")
 ax2.set_ylabel("Number of Items")
 ax2.set_title("Item Popularity Distribution")
@@ -202,12 +235,16 @@ plt.savefig(FIGURES_DIR / "user_item_distribution.png", dpi=150)
 plt.show()
 print("\nUser activity:")
-print(f"  Users with 1 review: {(user_counts == 1).sum():,} ({(user_counts == 1).sum()/len(user_counts)*100:.1f}%)")
 print(f"  Users with 5+ reviews: {(user_counts >= 5).sum():,}")
 print(f"  Max reviews by one user: {user_counts.max()}")
 print("\nItem popularity:")
-print(f"  Items with 1 review: {(item_counts == 1).sum():,} ({(item_counts == 1).sum()/len(item_counts)*100:.1f}%)")
 print(f"  Items with 5+ reviews: {(item_counts >= 5).sum():,}")
 print(f"  Max reviews for one item: {item_counts.max()}")
@@ -217,7 +254,9 @@ items_5plus = set(item_counts[item_counts >= 5].index)
 eligible_mask = df["user_id"].isin(users_5plus) & df["parent_asin"].isin(items_5plus)
 print("\n5-core filtering preview:")
-print(f"  Reviews eligible (first pass): {eligible_mask.sum():,} ({eligible_mask.sum()/len(df)*100:.1f}%)")
 # %% Sample reviews across length buckets
 print("\n=== Sample Reviews by Length Bucket ===")
@@ -232,14 +271,18 @@ length_buckets = [
 ]
 for min_tok, max_tok, label in length_buckets:
-    bucket_mask = (df["estimated_tokens"] >= min_tok) & (df["estimated_tokens"] < max_tok)
     bucket_df = df[bucket_mask]
     if len(bucket_df) == 0:
         print(f"{label}: No reviews")
         continue
-    print(f"{label}: {len(bucket_df):,} reviews ({len(bucket_df)/len(df)*100:.1f}%)")
     samples = bucket_df.sample(min(3, len(bucket_df)), random_state=42)
     for _, row in samples.iterrows():
@@ -256,10 +299,14 @@ df_prepared = prepare_data(subset_size=DEV_SUBSET_SIZE, verbose=False)
 prepared_stats = get_review_stats(df_prepared)
 print(f"Raw reviews: {len(df):,}")
-print(f"Prepared reviews: {len(df_prepared):,} ({len(df_prepared)/len(df)*100:.1f}% retained)")
 print(f"Unique users: {prepared_stats['unique_users']:,}")
 print(f"Unique items: {prepared_stats['unique_items']:,}")
-print(f"Avg rating: {prepared_stats['avg_rating']:.2f} (raw: {stats['avg_rating']:.2f})")
 # %% Summary
 print("\n" + "=" * 50)
@@ -269,7 +316,9 @@ print(f"Total reviews: {len(df):,}")
 print(f"Unique users: {df['user_id'].nunique():,}")
 print(f"Unique items: {df['parent_asin'].nunique():,}")
 print(f"Average rating: {df['rating'].mean():.2f}")
-print(f"Reviews needing chunking: {needs_chunking:,} ({needs_chunking/len(df)*100:.1f}%)")
 print(f"Data quality issues: {empty_reviews + very_short + duplicate_texts}")
 print(f"\nPlots saved to: {FIGURES_DIR}")

 # Plot configuration
 plt.style.use("seaborn-v0_8-whitegrid")
+plt.rcParams.update(
+    {
+        "figure.figsize": (10, 5),
+        "figure.dpi": 100,
+        "savefig.dpi": 150,
+        "font.size": 11,
+        "axes.titlesize": 12,
+        "axes.labelsize": 11,
+        "figure.autolayout": True,
+    }
+)
 # Enable retina display for Jupyter notebooks
 try:
     from IPython import get_ipython
     if get_ipython() is not None:
         get_ipython().run_line_magic("matplotlib", "inline")
         get_ipython().run_line_magic("config", "InlineBackend.figure_format='retina'")
 # %% Rating distribution
 fig, ax = plt.subplots()
 rating_counts = pd.Series(stats["rating_dist"])
+bars = ax.bar(
+    rating_counts.index, rating_counts.values, color=PRIMARY_COLOR, edgecolor="black"
+)
 ax.set_xlabel("Rating")
 ax.set_ylabel("Count")
 ax.set_title("Rating Distribution")
 ax.set_xticks(rating_counts.index)
 for bar, count in zip(bars, rating_counts.values):
+    ax.text(
+        bar.get_x() + bar.get_width() / 2,
+        bar.get_height() + 50,
+        f"{count:,}",
+        ha="center",
+        va="bottom",
+        fontsize=10,
+    )
 plt.tight_layout()
 plt.savefig(FIGURES_DIR / "rating_distribution.png", dpi=150)
 # Character length histogram
 ax1 = axes[0]
+df["text_length"].clip(upper=2000).hist(
+    bins=50, ax=ax1, color=PRIMARY_COLOR, edgecolor="white"
+)
 ax1.set_xlabel("Character Length (clipped at 2000)")
 ax1.set_ylabel("Count")
 ax1.set_title("Review Length Distribution")
+ax1.axvline(
+    df["text_length"].median(),
+    color="red",
+    linestyle="--",
+    label=f"Median: {df['text_length'].median():.0f}",
+)
 ax1.legend()
 # Token estimate histogram
 ax2 = axes[1]
+df["estimated_tokens"].clip(upper=500).hist(
+    bins=50, ax=ax2, color=SECONDARY_COLOR, edgecolor="white"
+)
 ax2.set_xlabel("Estimated Tokens (clipped at 500)")
 ax2.set_ylabel("Count")
 ax2.set_title("Estimated Token Distribution")
 print("\nReview length stats:")
 print(f"  Median characters: {df['text_length'].median():.0f}")
 print(f"  Median tokens (est): {df['estimated_tokens'].median():.0f}")
+print(
+    f"  Reviews > 200 tokens: {needs_chunking:,} ({needs_chunking / len(df) * 100:.1f}%)"
+)
 # %% Review length by rating
 fig, ax = plt.subplots()
 length_by_rating = df.groupby("rating")["text_length"].median()
+bars = ax.bar(
+    length_by_rating.index,
+    length_by_rating.values,
+    color=PRIMARY_COLOR,
+    edgecolor="white",
+)
 ax.set_xlabel("Rating")
 ax.set_ylabel("Median Review Length (chars)")
 ax.set_title("Review Length by Rating")
 reviews_over_time = df.groupby("year_month").size()
 fig, ax = plt.subplots(figsize=FIGURE_SIZE_WIDE)
+reviews_over_time.plot(
+    kind="line", ax=ax, marker="o", markersize=3, linewidth=1, color=PRIMARY_COLOR
+)
 ax.set_xlabel("Month")
 ax.set_ylabel("Number of Reviews")
 ax.set_title("Reviews Over Time")
 print("\nMissing values:")
 for col, count in missing.items():
     if count > 0:
+        print(f"  {col}: {count:,} ({count / len(df) * 100:.2f}%)")
 if missing.sum() == 0:
     print("  None!")
 # Reviews per user
 ax1 = axes[0]
+user_counts.clip(upper=20).value_counts().sort_index().plot(
+    kind="bar", ax=ax1, color=PRIMARY_COLOR
+)
 ax1.set_xlabel("Reviews per User")
 ax1.set_ylabel("Number of Users")
 ax1.set_title("User Activity Distribution")
 # Reviews per item
 ax2 = axes[1]
+item_counts.clip(upper=20).value_counts().sort_index().plot(
+    kind="bar", ax=ax2, color=SECONDARY_COLOR
+)
 ax2.set_xlabel("Reviews per Item")
 ax2.set_ylabel("Number of Items")
 ax2.set_title("Item Popularity Distribution")
 plt.show()
 print("\nUser activity:")
+print(
+    f"  Users with 1 review: {(user_counts == 1).sum():,} ({(user_counts == 1).sum() / len(user_counts) * 100:.1f}%)"
+)
 print(f"  Users with 5+ reviews: {(user_counts >= 5).sum():,}")
 print(f"  Max reviews by one user: {user_counts.max()}")
 print("\nItem popularity:")
+print(
+    f"  Items with 1 review: {(item_counts == 1).sum():,} ({(item_counts == 1).sum() / len(item_counts) * 100:.1f}%)"
+)
 print(f"  Items with 5+ reviews: {(item_counts >= 5).sum():,}")
 print(f"  Max reviews for one item: {item_counts.max()}")
 eligible_mask = df["user_id"].isin(users_5plus) & df["parent_asin"].isin(items_5plus)
 print("\n5-core filtering preview:")
+print(
+    f"  Reviews eligible (first pass): {eligible_mask.sum():,} ({eligible_mask.sum() / len(df) * 100:.1f}%)"
+)
 # %% Sample reviews across length buckets
 print("\n=== Sample Reviews by Length Bucket ===")
 ]
 for min_tok, max_tok, label in length_buckets:
+    bucket_mask = (df["estimated_tokens"] >= min_tok) & (
+        df["estimated_tokens"] < max_tok
+    )
     bucket_df = df[bucket_mask]
     if len(bucket_df) == 0:
         print(f"{label}: No reviews")
         continue
+    print(
+        f"{label}: {len(bucket_df):,} reviews ({len(bucket_df) / len(df) * 100:.1f}%)"
+    )
     samples = bucket_df.sample(min(3, len(bucket_df)), random_state=42)
     for _, row in samples.iterrows():
 prepared_stats = get_review_stats(df_prepared)
 print(f"Raw reviews: {len(df):,}")
+print(
+    f"Prepared reviews: {len(df_prepared):,} ({len(df_prepared) / len(df) * 100:.1f}% retained)"
+)
 print(f"Unique users: {prepared_stats['unique_users']:,}")
 print(f"Unique items: {prepared_stats['unique_items']:,}")
+print(
+    f"Avg rating: {prepared_stats['avg_rating']:.2f} (raw: {stats['avg_rating']:.2f})"
+)
 # %% Summary
 print("\n" + "=" * 50)
 print(f"Unique users: {df['user_id'].nunique():,}")
 print(f"Unique items: {df['parent_asin'].nunique():,}")
 print(f"Average rating: {df['rating'].mean():.2f}")
+print(
+    f"Reviews needing chunking: {needs_chunking:,} ({needs_chunking / len(df) * 100:.1f}%)"
+)
 print(f"Data quality issues: {empty_reviews + very_short + duplicate_texts}")
 print(f"\nPlots saved to: {FIGURES_DIR}")

scripts/evaluation.py CHANGED Viewed

@@ -48,6 +48,7 @@ def create_recommend_fn(
     rating_weight: float = 0.0,
 ):
     """Create a recommend function for evaluation."""
     def _recommend(query: str) -> list[str]:
         recs = recommend(
             query=query,
@@ -59,10 +60,13 @@ def create_recommend_fn(
             rating_weight=rating_weight,
         )
         return [r.product_id for r in recs]
     return _recommend
-def save_results(results: dict, filename: str | None = None, dataset: str | None = None) -> Path:
     """Save evaluation results to JSON file.
     Also writes a fixed-name "latest" file so downstream scripts (e.g.
@@ -89,6 +93,7 @@ def save_results(results: dict, filename: str | None = None, dataset: str | None
 # SECTION: Primary Evaluation
 # ============================================================================
 def run_primary_evaluation(cases, item_embeddings, item_popularity, total_items):
     """Run primary evaluation on leave-one-out dataset."""
     log_banner(logger, "EVALUATION: Leave-One-Out (History Queries)")
@@ -124,6 +129,7 @@ def run_primary_evaluation(cases, item_embeddings, item_popularity, total_items)
 # SECTION: Aggregation Methods
 # ============================================================================
 def run_aggregation_comparison(cases):
     """Compare different aggregation methods."""
     log_banner(logger, "AGGREGATION METHOD COMPARISON")
@@ -154,6 +160,7 @@ def run_aggregation_comparison(cases):
 # SECTION: Rating Filter
 # ============================================================================
 def run_rating_filter_comparison(cases):
     """Compare different rating filters."""
     log_banner(logger, "RATING FILTER COMPARISON")
@@ -177,6 +184,7 @@ def run_rating_filter_comparison(cases):
 # SECTION: K Values
 # ============================================================================
 def run_k_value_comparison(cases):
     """Compare metrics at different K values."""
     log_banner(logger, "METRICS AT DIFFERENT K VALUES")
@@ -200,16 +208,23 @@ def run_k_value_comparison(cases):
 # SECTION: Weight Tuning
 # ============================================================================
 def run_weight_tuning(cases):
     """Run ranking weight tuning experiment."""
     log_banner(logger, "RANKING WEIGHT TUNING (alpha*sim + beta*rating)")
     weight_configs = [
-        (1.0, 0.0), (0.9, 0.1), (0.8, 0.2),
-        (0.7, 0.3), (0.6, 0.4), (0.5, 0.5),
     ]
-    logger.info("%-10s %-12s %-10s %-10s %-10s", "alpha", "beta", "NDCG@10", "Hit@10", "MRR")
     logger.info("-" * 52)
     results = []
@@ -227,15 +242,22 @@ def run_weight_tuning(cases):
             k=10,
             verbose=False,
         )
-        results.append({
-            "alpha": alpha, "beta": beta,
-            "ndcg_at_10": report.ndcg_at_k,
-            "hit_at_10": report.hit_at_k,
-            "mrr": report.mrr,
-        })
         logger.info(
             "%-10.1f %-12.1f %-10.4f %-10.4f %-10.4f",
-            alpha, beta, report.ndcg_at_k, report.hit_at_k, report.mrr
         )
         if report.ndcg_at_k > best_ndcg:
@@ -245,7 +267,9 @@ def run_weight_tuning(cases):
     logger.info("-" * 52)
     logger.info(
         "Best: alpha=%.1f, beta=%.1f (NDCG@10=%.4f)",
-        best_weights[0], best_weights[1], best_ndcg
     )
     return results, best_weights, best_ndcg
@@ -255,6 +279,7 @@ def run_weight_tuning(cases):
 # SECTION: Baseline Comparison
 # ============================================================================
 def run_baseline_comparison(cases, train_records, all_products, product_embeddings):
     """Compare against baselines: Random, Popularity, ItemKNN."""
     log_banner(logger, "BASELINE COMPARISON")
@@ -274,7 +299,12 @@ def run_baseline_comparison(cases, train_records, all_products, product_embeddin
         return itemknn_baseline.recommend(query, top_k=10)
     def rag_recommend(query: str) -> list[str]:
-        recs = recommend(query=query, top_k=10, candidate_limit=100, aggregation=AggregationMethod.MAX)
         return [r.product_id for r in recs]
     results = {}
@@ -305,7 +335,10 @@ def run_baseline_comparison(cases, train_records, all_products, product_embeddin
     for name, report in results.items():
         logger.info(
             "%-15s %10.4f %10.4f %10.4f",
-            name, report.ndcg_at_k, report.hit_at_k, report.mrr
         )
     # Relative improvements
@@ -323,17 +356,22 @@ def run_baseline_comparison(cases, train_records, all_products, product_embeddin
 # Main
 # ============================================================================
 def main():
     parser = argparse.ArgumentParser(description="Run recommendation evaluation")
-    parser.add_argument("--baselines", action="store_true", help="Include baseline comparison")
     parser.add_argument(
-        "--section", "-s",
         choices=["all", "primary", "aggregation", "rating", "k", "weights"],
         default="primary",
         help="Which section to run (default: primary)",
     )
     parser.add_argument(
-        "--dataset", "-d",
         default="eval_loo_history.json",
         help="Evaluation dataset file (default: eval_loo_history.json)",
     )
@@ -375,7 +413,9 @@ def main():
         )
     if args.section in ("all", "aggregation"):
-        all_results["experiments"]["aggregation_methods"] = run_aggregation_comparison(cases)
     if args.section in ("all", "rating"):
         run_rating_filter_comparison(cases)

     rating_weight: float = 0.0,
 ):
     """Create a recommend function for evaluation."""
     def _recommend(query: str) -> list[str]:
         recs = recommend(
             query=query,
             rating_weight=rating_weight,
         )
         return [r.product_id for r in recs]
     return _recommend
+def save_results(
+    results: dict, filename: str | None = None, dataset: str | None = None
+) -> Path:
     """Save evaluation results to JSON file.
     Also writes a fixed-name "latest" file so downstream scripts (e.g.
 # SECTION: Primary Evaluation
 # ============================================================================
 def run_primary_evaluation(cases, item_embeddings, item_popularity, total_items):
     """Run primary evaluation on leave-one-out dataset."""
     log_banner(logger, "EVALUATION: Leave-One-Out (History Queries)")
 # SECTION: Aggregation Methods
 # ============================================================================
 def run_aggregation_comparison(cases):
     """Compare different aggregation methods."""
     log_banner(logger, "AGGREGATION METHOD COMPARISON")
 # SECTION: Rating Filter
 # ============================================================================
 def run_rating_filter_comparison(cases):
     """Compare different rating filters."""
     log_banner(logger, "RATING FILTER COMPARISON")
 # SECTION: K Values
 # ============================================================================
 def run_k_value_comparison(cases):
     """Compare metrics at different K values."""
     log_banner(logger, "METRICS AT DIFFERENT K VALUES")
 # SECTION: Weight Tuning
 # ============================================================================
 def run_weight_tuning(cases):
     """Run ranking weight tuning experiment."""
     log_banner(logger, "RANKING WEIGHT TUNING (alpha*sim + beta*rating)")
     weight_configs = [
+        (1.0, 0.0),
+        (0.9, 0.1),
+        (0.8, 0.2),
+        (0.7, 0.3),
+        (0.6, 0.4),
+        (0.5, 0.5),
     ]
+    logger.info(
+        "%-10s %-12s %-10s %-10s %-10s", "alpha", "beta", "NDCG@10", "Hit@10", "MRR"
+    )
     logger.info("-" * 52)
     results = []
             k=10,
             verbose=False,
         )
+        results.append(
+            {
+                "alpha": alpha,
+                "beta": beta,
+                "ndcg_at_10": report.ndcg_at_k,
+                "hit_at_10": report.hit_at_k,
+                "mrr": report.mrr,
+            }
+        )
         logger.info(
             "%-10.1f %-12.1f %-10.4f %-10.4f %-10.4f",
+            alpha,
+            beta,
+            report.ndcg_at_k,
+            report.hit_at_k,
+            report.mrr,
         )
         if report.ndcg_at_k > best_ndcg:
     logger.info("-" * 52)
     logger.info(
         "Best: alpha=%.1f, beta=%.1f (NDCG@10=%.4f)",
+        best_weights[0],
+        best_weights[1],
+        best_ndcg,
     )
     return results, best_weights, best_ndcg
 # SECTION: Baseline Comparison
 # ============================================================================
 def run_baseline_comparison(cases, train_records, all_products, product_embeddings):
     """Compare against baselines: Random, Popularity, ItemKNN."""
     log_banner(logger, "BASELINE COMPARISON")
         return itemknn_baseline.recommend(query, top_k=10)
     def rag_recommend(query: str) -> list[str]:
+        recs = recommend(
+            query=query,
+            top_k=10,
+            candidate_limit=100,
+            aggregation=AggregationMethod.MAX,
+        )
         return [r.product_id for r in recs]
     results = {}
     for name, report in results.items():
         logger.info(
             "%-15s %10.4f %10.4f %10.4f",
+            name,
+            report.ndcg_at_k,
+            report.hit_at_k,
+            report.mrr,
         )
     # Relative improvements
 # Main
 # ============================================================================
 def main():
     parser = argparse.ArgumentParser(description="Run recommendation evaluation")
     parser.add_argument(
+        "--baselines", action="store_true", help="Include baseline comparison"
+    )
+    parser.add_argument(
+        "--section",
+        "-s",
         choices=["all", "primary", "aggregation", "rating", "k", "weights"],
         default="primary",
         help="Which section to run (default: primary)",
     )
     parser.add_argument(
+        "--dataset",
+        "-d",
         default="eval_loo_history.json",
         help="Evaluation dataset file (default: eval_loo_history.json)",
     )
         )
     if args.section in ("all", "aggregation"):
+        all_results["experiments"]["aggregation_methods"] = run_aggregation_comparison(
+            cases
+        )
     if args.section in ("all", "rating"):
         run_rating_filter_comparison(cases)

scripts/explanation.py CHANGED Viewed

@@ -40,6 +40,7 @@ PRODUCTS_PER_QUERY = 2
 # SECTION: Basic Explanation Generation
 # ============================================================================
 def run_basic_tests():
     """Test basic explanation generation and HHEM detection."""
     from sage.services.explanation import Explainer
@@ -59,10 +60,13 @@ def run_basic_tests():
     query_results = {}
     for query in test_queries:
         products = get_candidates(
-            query=query, k=TOP_K_PRODUCTS, min_rating=4.0, aggregation=AggregationMethod.MAX
         )
         query_results[query] = products
-        logger.info("Query: \"%s\"", query)
         logger.info("  Found %d products", len(products))
     # Generate explanations
@@ -71,7 +75,7 @@ def run_basic_tests():
     all_explanations = []
     for query, products in query_results.items():
-        logger.info("--- Query: \"%s\" ---", query)
         for product in products[:PRODUCTS_PER_QUERY]:
             result = explainer.generate_explanation(query, product)
             all_explanations.append(result)
@@ -100,7 +104,7 @@ def run_basic_tests():
     if query_results:
         test_query = list(query_results.keys())[0]
         test_product = query_results[test_query][0]
-        logger.info("Query: \"%s\"", test_query)
         logger.info("Streaming: ")
         stream = explainer.generate_explanation_stream(test_query, test_product)
@@ -110,7 +114,9 @@ def run_basic_tests():
         logger.info("".join(chunks))
         streamed_result = stream.get_complete_result()
-        hhem = detector.check_explanation(streamed_result.evidence_texts, streamed_result.explanation)
         logger.info("HHEM Score: %.3f", hhem.score)
     log_banner(logger, "BASIC TESTS COMPLETE")
@@ -120,7 +126,10 @@ def run_basic_tests():
 # SECTION: Evidence Quality Gate
 # ============================================================================
-def create_mock_product(n_chunks: int, tokens_per_chunk: int = 100, product_score: float = 0.85) -> ProductScore:
     """Create a mock ProductScore for testing."""
     chunks = [
         RetrievedChunk(
@@ -145,7 +154,11 @@ def run_quality_gate_tests():
     """Test the evidence quality gate."""
     from sage.core.evidence import check_evidence_quality, generate_refusal_message
     from sage.services.faithfulness import is_refusal
-    from sage.config import MIN_EVIDENCE_CHUNKS, MIN_EVIDENCE_TOKENS, MIN_RETRIEVAL_SCORE
     log_banner(logger, "EVIDENCE QUALITY GATE TESTS")
@@ -161,7 +174,14 @@ def run_quality_gate_tests():
         product = create_mock_product(n_chunks, tok, score)
         quality = check_evidence_quality(product)
         status = "PASS" if quality.is_sufficient == expected else "FAIL"
-        logger.info("[%s] %d chunks, %d tok, score=%.2f -> %s", status, n_chunks, tok, score, reason)
         assert quality.is_sufficient == expected
     log_section(logger, "2. REFUSAL GENERATION")
@@ -172,12 +192,18 @@ def run_quality_gate_tests():
         quality = check_evidence_quality(product)
         refusal = generate_refusal_message(query, quality)
         detected = is_refusal(refusal)
-        logger.info("[%s] Refusal detected for %s", "PASS" if detected else "FAIL", quality.failure_reason)
         assert detected
     logger.info(
         "Thresholds: chunks=%d, tokens=%d, score=%.2f",
-        MIN_EVIDENCE_CHUNKS, MIN_EVIDENCE_TOKENS, MIN_RETRIEVAL_SCORE
     )
     log_banner(logger, "QUALITY GATE TESTS COMPLETE")
@@ -186,9 +212,14 @@ def run_quality_gate_tests():
 # SECTION: Verification Loop
 # ============================================================================
 def run_verification_tests():
     """Test the post-generation verification loop."""
-    from sage.core.verification import extract_quotes, verify_quote_in_evidence, verify_explanation
     log_banner(logger, "VERIFICATION LOOP TESTS")
@@ -235,6 +266,7 @@ def run_verification_tests():
 # SECTION: Cold-Start
 # ============================================================================
 def run_cold_start_tests():
     """Test cold-start handling."""
     from sage.services.cold_start import (
@@ -264,7 +296,9 @@ def run_cold_start_tests():
     for count in test_counts:
         level = get_warmup_level(count)
         weight = get_content_weight(count)
-        logger.info("  %d interactions: level=%s, content_weight=%.1f", count, level, weight)
     # Test preferences to query
     log_section(logger, "2. PREFERENCES TO QUERY")
@@ -277,7 +311,7 @@ def run_cold_start_tests():
     )
     query = preferences_to_query(prefs)
     logger.info("Preferences: %s", prefs)
-    logger.info("Query: \"%s\"", query)
     # Test cold-start recommendations
     log_section(logger, "3. COLD-START RECOMMENDATIONS")
@@ -290,7 +324,9 @@ def run_cold_start_tests():
     )
     logger.info("Got %d recommendations", len(recs))
     for r in recs[:3]:
-        logger.info("  %s: score=%.3f, rating=%.1f", r.product_id, r.score, r.avg_rating)
     logger.info("Query-based (cold user):")
     recs = recommend_cold_start_user(
@@ -337,10 +373,12 @@ def run_cold_start_tests():
 # Main
 # ============================================================================
 def main():
     parser = argparse.ArgumentParser(description="Run explanation tests")
     parser.add_argument(
-        "--section", "-s",
         choices=["all", "basic", "gate", "verify", "cold"],
         default="all",
         help="Which section to run",

 # SECTION: Basic Explanation Generation
 # ============================================================================
 def run_basic_tests():
     """Test basic explanation generation and HHEM detection."""
     from sage.services.explanation import Explainer
     query_results = {}
     for query in test_queries:
         products = get_candidates(
+            query=query,
+            k=TOP_K_PRODUCTS,
+            min_rating=4.0,
+            aggregation=AggregationMethod.MAX,
         )
         query_results[query] = products
+        logger.info('Query: "%s"', query)
         logger.info("  Found %d products", len(products))
     # Generate explanations
     all_explanations = []
     for query, products in query_results.items():
+        logger.info('--- Query: "%s" ---', query)
         for product in products[:PRODUCTS_PER_QUERY]:
             result = explainer.generate_explanation(query, product)
             all_explanations.append(result)
     if query_results:
         test_query = list(query_results.keys())[0]
         test_product = query_results[test_query][0]
+        logger.info('Query: "%s"', test_query)
         logger.info("Streaming: ")
         stream = explainer.generate_explanation_stream(test_query, test_product)
         logger.info("".join(chunks))
         streamed_result = stream.get_complete_result()
+        hhem = detector.check_explanation(
+            streamed_result.evidence_texts, streamed_result.explanation
+        )
         logger.info("HHEM Score: %.3f", hhem.score)
     log_banner(logger, "BASIC TESTS COMPLETE")
 # SECTION: Evidence Quality Gate
 # ============================================================================
+def create_mock_product(
+    n_chunks: int, tokens_per_chunk: int = 100, product_score: float = 0.85
+) -> ProductScore:
     """Create a mock ProductScore for testing."""
     chunks = [
         RetrievedChunk(
     """Test the evidence quality gate."""
     from sage.core.evidence import check_evidence_quality, generate_refusal_message
     from sage.services.faithfulness import is_refusal
+    from sage.config import (
+        MIN_EVIDENCE_CHUNKS,
+        MIN_EVIDENCE_TOKENS,
+        MIN_RETRIEVAL_SCORE,
+    )
     log_banner(logger, "EVIDENCE QUALITY GATE TESTS")
         product = create_mock_product(n_chunks, tok, score)
         quality = check_evidence_quality(product)
         status = "PASS" if quality.is_sufficient == expected else "FAIL"
+        logger.info(
+            "[%s] %d chunks, %d tok, score=%.2f -> %s",
+            status,
+            n_chunks,
+            tok,
+            score,
+            reason,
+        )
         assert quality.is_sufficient == expected
     log_section(logger, "2. REFUSAL GENERATION")
         quality = check_evidence_quality(product)
         refusal = generate_refusal_message(query, quality)
         detected = is_refusal(refusal)
+        logger.info(
+            "[%s] Refusal detected for %s",
+            "PASS" if detected else "FAIL",
+            quality.failure_reason,
+        )
         assert detected
     logger.info(
         "Thresholds: chunks=%d, tokens=%d, score=%.2f",
+        MIN_EVIDENCE_CHUNKS,
+        MIN_EVIDENCE_TOKENS,
+        MIN_RETRIEVAL_SCORE,
     )
     log_banner(logger, "QUALITY GATE TESTS COMPLETE")
 # SECTION: Verification Loop
 # ============================================================================
 def run_verification_tests():
     """Test the post-generation verification loop."""
+    from sage.core.verification import (
+        extract_quotes,
+        verify_quote_in_evidence,
+        verify_explanation,
+    )
     log_banner(logger, "VERIFICATION LOOP TESTS")
 # SECTION: Cold-Start
 # ============================================================================
 def run_cold_start_tests():
     """Test cold-start handling."""
     from sage.services.cold_start import (
     for count in test_counts:
         level = get_warmup_level(count)
         weight = get_content_weight(count)
+        logger.info(
+            "  %d interactions: level=%s, content_weight=%.1f", count, level, weight
+        )
     # Test preferences to query
     log_section(logger, "2. PREFERENCES TO QUERY")
     )
     query = preferences_to_query(prefs)
     logger.info("Preferences: %s", prefs)
+    logger.info('Query: "%s"', query)
     # Test cold-start recommendations
     log_section(logger, "3. COLD-START RECOMMENDATIONS")
     )
     logger.info("Got %d recommendations", len(recs))
     for r in recs[:3]:
+        logger.info(
+            "  %s: score=%.3f, rating=%.1f", r.product_id, r.score, r.avg_rating
+        )
     logger.info("Query-based (cold user):")
     recs = recommend_cold_start_user(
 # Main
 # ============================================================================
 def main():
     parser = argparse.ArgumentParser(description="Run explanation tests")
     parser.add_argument(
+        "--section",
+        "-s",
         choices=["all", "basic", "gate", "verify", "cold"],
         default="all",
         help="Which section to run",

scripts/faithfulness.py CHANGED Viewed

@@ -47,6 +47,7 @@ TOP_K_PRODUCTS = 3
 # SECTION: Core Evaluation
 # ============================================================================
 def run_evaluation(n_samples: int, run_ragas: bool = False):
     """Run faithfulness evaluation on sample queries."""
     from sage.services.explanation import Explainer
@@ -64,8 +65,13 @@ def run_evaluation(n_samples: int, run_ragas: bool = False):
     all_explanations = []
     for i, query in enumerate(queries, 1):
-        logger.info("[%d/%d] \"%s\"", i, len(queries), query)
-        products = get_candidates(query=query, k=TOP_K_PRODUCTS, min_rating=4.0, aggregation=AggregationMethod.MAX)
         if not products:
             logger.info("  No products found")
@@ -73,7 +79,9 @@ def run_evaluation(n_samples: int, run_ragas: bool = False):
         product = products[0]
         try:
-            result = explainer.generate_explanation(query, product, max_evidence=MAX_EVIDENCE)
             all_explanations.append(result)
             logger.info("  %s: %s...", product.product_id, result.explanation[:60])
         except Exception:
@@ -101,7 +109,9 @@ def run_evaluation(n_samples: int, run_ragas: bool = False):
     logger.info(
         "HHEM (full-explanation): %d/%d grounded, mean=%.3f",
-        len(hhem_results) - n_hallucinated, len(hhem_results), np.mean(hhem_scores),
     )
     # Multi-metric faithfulness (claim-level as primary)
@@ -109,20 +119,24 @@ def run_evaluation(n_samples: int, run_ragas: bool = False):
     from sage.services.faithfulness import compute_multi_metric_faithfulness
-    multi_items = [
-        (expl.evidence_texts, expl.explanation) for expl in all_explanations
-    ]
     multi_report = compute_multi_metric_faithfulness(multi_items)
-    logger.info("Quote verification: %d/%d (%.1f%%)",
-        multi_report.quotes_found, multi_report.quotes_total,
         multi_report.quote_verification_rate * 100,
     )
-    logger.info("Claim-level HHEM:   %.3f avg, %.1f%% pass rate",
-        multi_report.claim_level_avg_score, multi_report.claim_level_pass_rate * 100,
     )
-    logger.info("Full-explanation:   %.3f avg, %.1f%% pass rate (reference only)",
-        multi_report.full_explanation_avg_score, multi_report.full_explanation_pass_rate * 100,
     )
     # RAGAS (optional)
@@ -132,16 +146,17 @@ def run_evaluation(n_samples: int, run_ragas: bool = False):
         try:
             from sage.services.faithfulness import FaithfulnessEvaluator
             evaluator = FaithfulnessEvaluator()
             ragas_report = evaluator.evaluate_batch(all_explanations)
             logger.info(
                 "Faithfulness: %.3f +/- %.3f",
-                ragas_report.mean_score, ragas_report.std_score
             )
             logger.info(
-                "Passing: %d/%d",
-                ragas_report.n_passing, ragas_report.n_samples
             )
         except Exception:
             logger.exception("RAGAS evaluation failed")
@@ -217,8 +232,10 @@ def run_failure_analysis():
     case_id = 0
     for query in ANALYSIS_QUERIES:
-        logger.info("Query: \"%s\"", query)
-        products = get_candidates(query=query, k=3, min_rating=3.5, aggregation=AggregationMethod.MAX)
         if not products:
             continue
@@ -226,21 +243,27 @@ def run_failure_analysis():
         for product in products[:2]:
             try:
                 result = explainer.generate_explanation(query, product, max_evidence=3)
-                hhem = detector.check_explanation(result.evidence_texts, result.explanation)
                 case_id += 1
-                all_cases.append({
-                    "case_id": case_id,
-                    "query": query,
-                    "product_id": product.product_id,
-                    "explanation": result.explanation,
-                    "evidence_texts": result.evidence_texts,
-                    "hhem_score": hhem.score,
-                    "is_hallucinated": hhem.is_hallucinated,
-                })
                 status = "FAIL" if hhem.is_hallucinated else "PASS"
-                logger.info("  [%s] %.3f - %s...", status, hhem.score, product.product_id[:20])
             except Exception:
                 logger.exception("  Error processing product")
@@ -254,7 +277,9 @@ def run_failure_analysis():
     log_banner(logger, "ANALYSIS SUMMARY")
     logger.info("Total cases: %d", len(all_cases))
-    logger.info("Failures: %d (%.1f%%)", len(failures), len(failures) / len(all_cases) * 100)
     logger.info("Passes: %d", len(passes))
     # Categorize failures
@@ -274,6 +299,7 @@ def run_failure_analysis():
 # SECTION: Adjusted Faithfulness
 # ============================================================================
 def run_adjusted_calculation():
     """Calculate adjusted faithfulness with refusals excluded."""
     from sage.services.faithfulness import is_refusal
@@ -296,8 +322,14 @@ def run_adjusted_calculation():
     # Classify
     refusals = [c for c in cases if is_refusal(c["explanation"])]
-    non_refusal_passes = [c for c in cases if not is_refusal(c["explanation"]) and not c["is_hallucinated"]]
-    non_refusal_fails = [c for c in cases if not is_refusal(c["explanation"]) and c["is_hallucinated"]]
     n_total = len(cases)
     raw_pass = sum(1 for c in cases if not c["is_hallucinated"])
@@ -309,9 +341,21 @@ def run_adjusted_calculation():
     logger.info("Non-refusal fails: %d", len(non_refusal_fails))
     log_section(logger, "Metrics")
-    logger.info("Raw pass rate:      %d/%d = %.1f%%", raw_pass, n_total, raw_pass / n_total * 100)
-    logger.info("Adjusted pass rate: %d/%d = %.1f%%", adjusted_pass, n_total, adjusted_pass / n_total * 100)
-    logger.info("Improvement: +%.1f%%", (adjusted_pass / n_total - raw_pass / n_total) * 100)
     # Save
     output = {
@@ -328,12 +372,15 @@ def run_adjusted_calculation():
 # Main
 # ============================================================================
 def main():
     parser = argparse.ArgumentParser(description="Run faithfulness evaluation")
     parser.add_argument("--samples", "-n", type=int, default=DEFAULT_SAMPLES)
     parser.add_argument("--ragas", action="store_true", help="Include RAGAS evaluation")
     parser.add_argument("--analyze", action="store_true", help="Run failure analysis")
-    parser.add_argument("--adjusted", action="store_true", help="Calculate adjusted metrics")
     args = parser.parse_args()
     if args.analyze:

 # SECTION: Core Evaluation
 # ============================================================================
 def run_evaluation(n_samples: int, run_ragas: bool = False):
     """Run faithfulness evaluation on sample queries."""
     from sage.services.explanation import Explainer
     all_explanations = []
     for i, query in enumerate(queries, 1):
+        logger.info('[%d/%d] "%s"', i, len(queries), query)
+        products = get_candidates(
+            query=query,
+            k=TOP_K_PRODUCTS,
+            min_rating=4.0,
+            aggregation=AggregationMethod.MAX,
+        )
         if not products:
             logger.info("  No products found")
         product = products[0]
         try:
+            result = explainer.generate_explanation(
+                query, product, max_evidence=MAX_EVIDENCE
+            )
             all_explanations.append(result)
             logger.info("  %s: %s...", product.product_id, result.explanation[:60])
         except Exception:
     logger.info(
         "HHEM (full-explanation): %d/%d grounded, mean=%.3f",
+        len(hhem_results) - n_hallucinated,
+        len(hhem_results),
+        np.mean(hhem_scores),
     )
     # Multi-metric faithfulness (claim-level as primary)
     from sage.services.faithfulness import compute_multi_metric_faithfulness
+    multi_items = [(expl.evidence_texts, expl.explanation) for expl in all_explanations]
     multi_report = compute_multi_metric_faithfulness(multi_items)
+    logger.info(
+        "Quote verification: %d/%d (%.1f%%)",
+        multi_report.quotes_found,
+        multi_report.quotes_total,
         multi_report.quote_verification_rate * 100,
     )
+    logger.info(
+        "Claim-level HHEM:   %.3f avg, %.1f%% pass rate",
+        multi_report.claim_level_avg_score,
+        multi_report.claim_level_pass_rate * 100,
     )
+    logger.info(
+        "Full-explanation:   %.3f avg, %.1f%% pass rate (reference only)",
+        multi_report.full_explanation_avg_score,
+        multi_report.full_explanation_pass_rate * 100,
     )
     # RAGAS (optional)
         try:
             from sage.services.faithfulness import FaithfulnessEvaluator
             evaluator = FaithfulnessEvaluator()
             ragas_report = evaluator.evaluate_batch(all_explanations)
             logger.info(
                 "Faithfulness: %.3f +/- %.3f",
+                ragas_report.mean_score,
+                ragas_report.std_score,
             )
             logger.info(
+                "Passing: %d/%d", ragas_report.n_passing, ragas_report.n_samples
             )
         except Exception:
             logger.exception("RAGAS evaluation failed")
     case_id = 0
     for query in ANALYSIS_QUERIES:
+        logger.info('Query: "%s"', query)
+        products = get_candidates(
+            query=query, k=3, min_rating=3.5, aggregation=AggregationMethod.MAX
+        )
         if not products:
             continue
         for product in products[:2]:
             try:
                 result = explainer.generate_explanation(query, product, max_evidence=3)
+                hhem = detector.check_explanation(
+                    result.evidence_texts, result.explanation
+                )
                 case_id += 1
+                all_cases.append(
+                    {
+                        "case_id": case_id,
+                        "query": query,
+                        "product_id": product.product_id,
+                        "explanation": result.explanation,
+                        "evidence_texts": result.evidence_texts,
+                        "hhem_score": hhem.score,
+                        "is_hallucinated": hhem.is_hallucinated,
+                    }
+                )
                 status = "FAIL" if hhem.is_hallucinated else "PASS"
+                logger.info(
+                    "  [%s] %.3f - %s...", status, hhem.score, product.product_id[:20]
+                )
             except Exception:
                 logger.exception("  Error processing product")
     log_banner(logger, "ANALYSIS SUMMARY")
     logger.info("Total cases: %d", len(all_cases))
+    logger.info(
+        "Failures: %d (%.1f%%)", len(failures), len(failures) / len(all_cases) * 100
+    )
     logger.info("Passes: %d", len(passes))
     # Categorize failures
 # SECTION: Adjusted Faithfulness
 # ============================================================================
 def run_adjusted_calculation():
     """Calculate adjusted faithfulness with refusals excluded."""
     from sage.services.faithfulness import is_refusal
     # Classify
     refusals = [c for c in cases if is_refusal(c["explanation"])]
+    non_refusal_passes = [
+        c
+        for c in cases
+        if not is_refusal(c["explanation"]) and not c["is_hallucinated"]
+    ]
+    non_refusal_fails = [
+        c for c in cases if not is_refusal(c["explanation"]) and c["is_hallucinated"]
+    ]
     n_total = len(cases)
     raw_pass = sum(1 for c in cases if not c["is_hallucinated"])
     logger.info("Non-refusal fails: %d", len(non_refusal_fails))
     log_section(logger, "Metrics")
+    logger.info(
+        "Raw pass rate:      %d/%d = %.1f%%",
+        raw_pass,
+        n_total,
+        raw_pass / n_total * 100,
+    )
+    logger.info(
+        "Adjusted pass rate: %d/%d = %.1f%%",
+        adjusted_pass,
+        n_total,
+        adjusted_pass / n_total * 100,
+    )
+    logger.info(
+        "Improvement: +%.1f%%", (adjusted_pass / n_total - raw_pass / n_total) * 100
+    )
     # Save
     output = {
 # Main
 # ============================================================================
 def main():
     parser = argparse.ArgumentParser(description="Run faithfulness evaluation")
     parser.add_argument("--samples", "-n", type=int, default=DEFAULT_SAMPLES)
     parser.add_argument("--ragas", action="store_true", help="Include RAGAS evaluation")
     parser.add_argument("--analyze", action="store_true", help="Run failure analysis")
+    parser.add_argument(
+        "--adjusted", action="store_true", help="Calculate adjusted metrics"
+    )
     args = parser.parse_args()
     if args.analyze:

scripts/human_eval.py CHANGED Viewed

@@ -51,6 +51,7 @@ NATURAL_QUERIES_FILE = DATA_DIR / "eval" / "eval_natural_queries.json"
 # Sample Generation
 # ============================================================================
 def _select_diverse_natural_queries(target: int = 35) -> list[str]:
     """Select diverse queries from natural eval dataset, balanced by category."""
     if not NATURAL_QUERIES_FILE.exists():
@@ -114,7 +115,8 @@ def generate_samples(force: bool = False):
             logger.error(
                 "%s contains %d rated samples. "
                 "Use --force to overwrite, or run --annotate to continue.",
-                SAMPLES_FILE, rated,
             )
             sys.exit(1)
@@ -129,7 +131,9 @@ def generate_samples(force: bool = False):
     all_queries = natural + config
     logger.info(
         "Queries: %d natural + %d config = %d total",
-        len(natural), len(config), len(all_queries),
     )
     if len(all_queries) < TARGET_SAMPLES:
@@ -137,7 +141,8 @@ def generate_samples(force: bool = False):
             "Only %d unique queries available (target: %d). "
             "Results will lack statistical power. "
             "Run 'make eval' to build natural query dataset.",
-            len(all_queries), TARGET_SAMPLES,
         )
     # Initialize services
@@ -146,10 +151,12 @@ def generate_samples(force: bool = False):
     samples = []
     for i, query in enumerate(all_queries, 1):
-        logger.info("[%d/%d] \"%s\"", i, len(all_queries), query)
         products = get_candidates(
-            query=query, k=1, min_rating=4.0,
             aggregation=AggregationMethod.MAX,
         )
         if not products:
@@ -159,10 +166,13 @@ def generate_samples(force: bool = False):
         product = products[0]
         try:
             expl = explainer.generate_explanation(
-                query, product, max_evidence=MAX_EVIDENCE,
             )
             hhem = detector.check_explanation(
-                expl.evidence_texts, expl.explanation,
             )
             sample = {
@@ -178,7 +188,9 @@ def generate_samples(force: bool = False):
             samples.append(sample)
             logger.info(
                 "  %s (%.1f stars) HHEM=%.3f",
-                product.product_id, product.avg_rating, hhem.score,
             )
         except ValueError as exc:
             logger.info("  Quality gate refusal: %s", exc)
@@ -197,6 +209,7 @@ def generate_samples(force: bool = False):
 # Interactive Annotation
 # ============================================================================
 def _load_samples() -> list[dict]:
     """Load samples from disk."""
     if not SAMPLES_FILE.exists():
@@ -261,7 +274,7 @@ def annotate_samples():
                 text = ev["text"]
                 if len(text) > 200:
                     text = text[:200] + "..."
-                print(f"  [{ev['id']}]: \"{text}\"")
             print()
             # Collect ratings
@@ -286,6 +299,7 @@ def annotate_samples():
 # Analysis
 # ============================================================================
 def analyze_results():
     """Compute aggregate metrics from rated samples."""
     samples = _load_samples()
@@ -306,7 +320,7 @@ def analyze_results():
         n = len(scores)
         mean = sum(scores) / n
         variance = sum((x - mean) ** 2 for x in scores) / (n - 1) if n > 1 else 0.0
-        std = variance ** 0.5
         dimensions_results[dim_key] = {
             "mean": round(mean, 2),
             "std": round(std, 2),
@@ -315,7 +329,11 @@ def analyze_results():
         }
         logger.info(
             "  %-15s mean=%.2f  std=%.2f  range=[%d, %d]",
-            dim_key + ":", mean, std, min(scores), max(scores),
         )
     # Overall helpfulness: mean of per-sample averages
@@ -328,15 +346,20 @@ def analyze_results():
     passed = overall >= HELPFULNESS_TARGET
     logger.info("")
-    logger.info("Overall helpfulness: %.2f (target: %.1f) [%s]",
-                overall, HELPFULNESS_TARGET, "PASS" if passed else "FAIL")
     # HHEM vs Trust correlation (Spearman)
     correlation = _compute_hhem_trust_correlation(rated)
     if correlation:
         logger.info(
             "HHEM-Trust correlation: r=%.3f, p=%.4f",
-            correlation["spearman_r"], correlation["p_value"],
         )
     # Save results
@@ -368,6 +391,7 @@ def _compute_hhem_trust_correlation(rated: list[dict]) -> dict | None:
     try:
         from scipy.stats import spearmanr
         r, p = spearmanr(hhem_scores, trust_scores)
         return {"spearman_r": round(float(r), 4), "p_value": round(float(p), 4)}
     except ImportError:
@@ -399,13 +423,13 @@ def _manual_spearman(x: list[float], y: list[float]) -> dict | None:
     ry = _rank(y)
     d_sq = sum((rx[i] - ry[i]) ** 2 for i in range(n))
-    rho = 1 - (6 * d_sq) / (n * (n ** 2 - 1))
     # Approximate p-value via t-distribution (large sample)
     if abs(rho) >= 1.0:
         p = 0.0
     else:
-        t = rho * math.sqrt((n - 2) / (1 - rho ** 2))
         # Two-tailed p-value approximation
         p = 2 * (1 - _t_cdf_approx(abs(t), n - 2))
@@ -427,6 +451,7 @@ def _t_cdf_approx(t: float, df: int) -> float:
 # Status
 # ============================================================================
 def show_status():
     """Show annotation progress."""
     if not SAMPLES_FILE.exists():
@@ -450,21 +475,27 @@ def show_status():
 # Main
 # ============================================================================
 def main():
     parser = argparse.ArgumentParser(
         description="Human evaluation of recommendation explanations",
     )
     group = parser.add_mutually_exclusive_group(required=True)
-    group.add_argument("--generate", action="store_true",
-                       help="Generate recommendation samples")
-    group.add_argument("--annotate", action="store_true",
-                       help="Rate samples interactively (resumable)")
-    group.add_argument("--analyze", action="store_true",
-                       help="Compute aggregate results from ratings")
-    group.add_argument("--status", action="store_true",
-                       help="Show annotation progress")
-    parser.add_argument("--force", action="store_true",
-                        help="Overwrite existing rated samples (with --generate)")
     args = parser.parse_args()
     if args.force and not args.generate:

 # Sample Generation
 # ============================================================================
 def _select_diverse_natural_queries(target: int = 35) -> list[str]:
     """Select diverse queries from natural eval dataset, balanced by category."""
     if not NATURAL_QUERIES_FILE.exists():
             logger.error(
                 "%s contains %d rated samples. "
                 "Use --force to overwrite, or run --annotate to continue.",
+                SAMPLES_FILE,
+                rated,
             )
             sys.exit(1)
     all_queries = natural + config
     logger.info(
         "Queries: %d natural + %d config = %d total",
+        len(natural),
+        len(config),
+        len(all_queries),
     )
     if len(all_queries) < TARGET_SAMPLES:
             "Only %d unique queries available (target: %d). "
             "Results will lack statistical power. "
             "Run 'make eval' to build natural query dataset.",
+            len(all_queries),
+            TARGET_SAMPLES,
         )
     # Initialize services
     samples = []
     for i, query in enumerate(all_queries, 1):
+        logger.info('[%d/%d] "%s"', i, len(all_queries), query)
         products = get_candidates(
+            query=query,
+            k=1,
+            min_rating=4.0,
             aggregation=AggregationMethod.MAX,
         )
         if not products:
         product = products[0]
         try:
             expl = explainer.generate_explanation(
+                query,
+                product,
+                max_evidence=MAX_EVIDENCE,
             )
             hhem = detector.check_explanation(
+                expl.evidence_texts,
+                expl.explanation,
             )
             sample = {
             samples.append(sample)
             logger.info(
                 "  %s (%.1f stars) HHEM=%.3f",
+                product.product_id,
+                product.avg_rating,
+                hhem.score,
             )
         except ValueError as exc:
             logger.info("  Quality gate refusal: %s", exc)
 # Interactive Annotation
 # ============================================================================
 def _load_samples() -> list[dict]:
     """Load samples from disk."""
     if not SAMPLES_FILE.exists():
                 text = ev["text"]
                 if len(text) > 200:
                     text = text[:200] + "..."
+                print(f'  [{ev["id"]}]: "{text}"')
             print()
             # Collect ratings
 # Analysis
 # ============================================================================
 def analyze_results():
     """Compute aggregate metrics from rated samples."""
     samples = _load_samples()
         n = len(scores)
         mean = sum(scores) / n
         variance = sum((x - mean) ** 2 for x in scores) / (n - 1) if n > 1 else 0.0
+        std = variance**0.5
         dimensions_results[dim_key] = {
             "mean": round(mean, 2),
             "std": round(std, 2),
         }
         logger.info(
             "  %-15s mean=%.2f  std=%.2f  range=[%d, %d]",
+            dim_key + ":",
+            mean,
+            std,
+            min(scores),
+            max(scores),
         )
     # Overall helpfulness: mean of per-sample averages
     passed = overall >= HELPFULNESS_TARGET
     logger.info("")
+    logger.info(
+        "Overall helpfulness: %.2f (target: %.1f) [%s]",
+        overall,
+        HELPFULNESS_TARGET,
+        "PASS" if passed else "FAIL",
+    )
     # HHEM vs Trust correlation (Spearman)
     correlation = _compute_hhem_trust_correlation(rated)
     if correlation:
         logger.info(
             "HHEM-Trust correlation: r=%.3f, p=%.4f",
+            correlation["spearman_r"],
+            correlation["p_value"],
         )
     # Save results
     try:
         from scipy.stats import spearmanr
         r, p = spearmanr(hhem_scores, trust_scores)
         return {"spearman_r": round(float(r), 4), "p_value": round(float(p), 4)}
     except ImportError:
     ry = _rank(y)
     d_sq = sum((rx[i] - ry[i]) ** 2 for i in range(n))
+    rho = 1 - (6 * d_sq) / (n * (n**2 - 1))
     # Approximate p-value via t-distribution (large sample)
     if abs(rho) >= 1.0:
         p = 0.0
     else:
+        t = rho * math.sqrt((n - 2) / (1 - rho**2))
         # Two-tailed p-value approximation
         p = 2 * (1 - _t_cdf_approx(abs(t), n - 2))
 # Status
 # ============================================================================
 def show_status():
     """Show annotation progress."""
     if not SAMPLES_FILE.exists():
 # Main
 # ============================================================================
 def main():
     parser = argparse.ArgumentParser(
         description="Human evaluation of recommendation explanations",
     )
     group = parser.add_mutually_exclusive_group(required=True)
+    group.add_argument(
+        "--generate", action="store_true", help="Generate recommendation samples"
+    )
+    group.add_argument(
+        "--annotate", action="store_true", help="Rate samples interactively (resumable)"
+    )
+    group.add_argument(
+        "--analyze", action="store_true", help="Compute aggregate results from ratings"
+    )
+    group.add_argument("--status", action="store_true", help="Show annotation progress")
+    parser.add_argument(
+        "--force",
+        action="store_true",
+        help="Overwrite existing rated samples (with --generate)",
+    )
     args = parser.parse_args()
     if args.force and not args.generate:

scripts/pipeline.py CHANGED Viewed

@@ -54,6 +54,7 @@ logger = get_logger(__name__)
 # TOKENIZER VALIDATION (--validate-tokenizer)
 # ============================================================================
 def run_tokenizer_validation():
     """Validate the chars/token ratio assumption used in chunker.py."""
     from transformers import AutoTokenizer
@@ -90,10 +91,16 @@ def run_tokenizer_validation():
 # CHUNKING QUALITY TEST (--test-chunking)
 # ============================================================================
 def run_chunking_test():
     """Test chunking quality on long reviews."""
     import pandas as pd
-    from sage.core.chunking import chunk_text, split_sentences, estimate_tokens, NO_CHUNK_THRESHOLD
     log_banner(logger, "CHUNKING QUALITY TEST", width=70)
@@ -113,28 +120,36 @@ def run_chunking_test():
         chunks = chunk_text(text, embedder=embedder)
         sentences = split_sentences(text)
-        results.append({
-            "tokens": tokens,
-            "sentences": len(sentences),
-            "chunks": len(chunks),
-            "avg_chunk_tokens": np.mean([estimate_tokens(c) for c in chunks]),
-        })
         if idx < 5:
             logger.info(
                 "Review %d [%d*] (%d tok) -> %d chunks",
-                idx + 1, rating, tokens, len(chunks)
             )
     results_df = pd.DataFrame(results)
     log_section(logger, f"Summary ({len(results_df)} reviews)")
     logger.info(
         "Chunks per review: %.2f (median: %.0f)",
-        results_df["chunks"].mean(), results_df["chunks"].median()
     )
     logger.info("Avg tokens/chunk: %.0f", results_df["avg_chunk_tokens"].mean())
-    expansion = (results_df["chunks"] * results_df["avg_chunk_tokens"]).sum() / results_df["tokens"].sum()
     logger.info("Expansion ratio: %.2fx", expansion)
@@ -142,6 +157,7 @@ def run_chunking_test():
 # MAIN PIPELINE
 # ============================================================================
 def run_pipeline(subset_size: int, force: bool):
     """Run the full data pipeline: load, chunk, embed, upload."""
     logger.info("Config", extra={"subset_size": subset_size, "force": force})
@@ -174,7 +190,8 @@ def run_pipeline(subset_size: int, force: bool):
     needs_chunking = (df["estimated_tokens"] > 200).sum()
     logger.info(
         "Reviews needing chunking (>200 tokens): %d (%.1f%%)",
-        needs_chunking, needs_chunking / len(df) * 100
     )
     # Prepare reviews for chunking
@@ -192,7 +209,9 @@ def run_pipeline(subset_size: int, force: bool):
     chunks = chunk_reviews_batch(reviews_for_chunking, embedder=embedder)
     logger.info(
         "Created %d chunks from %d reviews (expansion: %.2fx)",
-        len(chunks), len(reviews_for_chunking), len(chunks) / len(reviews_for_chunking)
     )
     # Generate embeddings
@@ -200,7 +219,9 @@ def run_pipeline(subset_size: int, force: bool):
     cache_path = DATA_DIR / f"embeddings_{len(chunks)}.npy"
     logger.info("Embedding %d chunks...", len(chunk_texts))
-    embeddings = embedder.embed_passages(chunk_texts, cache_path=cache_path, force=force)
     logger.info("Embeddings shape: %s", embeddings.shape)
     # Embedding technical validation
@@ -243,12 +264,21 @@ def run_pipeline(subset_size: int, force: bool):
     sim_out = float(np.dot(emb_query, emb_out))
     logger.info("Query: '%s'", test_query)
-    logger.info("  In-domain (same topic):  '%s' = %.3f", in_domain_similar, sim_in_similar)
-    logger.info("  In-domain (diff topic):  '%s' = %.3f", in_domain_different, sim_in_different)
     logger.info("  Out-of-domain:           '%s' = %.3f", out_of_domain, sim_out)
     if sim_in_similar > sim_in_different > sim_out:
-        logger.info("Ranking correct: %.3f > %.3f > %.3f", sim_in_similar, sim_in_different, sim_out)
     else:
         logger.warning("Unexpected ranking")
@@ -307,10 +337,23 @@ def run_pipeline(subset_size: int, force: bool):
 def main():
     parser = argparse.ArgumentParser(description="Run the data pipeline")
-    parser.add_argument("--force", action="store_true", help="Force recreate collection")
-    parser.add_argument("--subset-size", type=int, default=DEV_SUBSET_SIZE, help="Number of reviews to load initially")
-    parser.add_argument("--validate-tokenizer", action="store_true", help="Run tokenizer validation only")
-    parser.add_argument("--test-chunking", action="store_true", help="Run chunking quality test only")
     args = parser.parse_args()
     if args.validate_tokenizer:

 # TOKENIZER VALIDATION (--validate-tokenizer)
 # ============================================================================
 def run_tokenizer_validation():
     """Validate the chars/token ratio assumption used in chunker.py."""
     from transformers import AutoTokenizer
 # CHUNKING QUALITY TEST (--test-chunking)
 # ============================================================================
 def run_chunking_test():
     """Test chunking quality on long reviews."""
     import pandas as pd
+    from sage.core.chunking import (
+        chunk_text,
+        split_sentences,
+        estimate_tokens,
+        NO_CHUNK_THRESHOLD,
+    )
     log_banner(logger, "CHUNKING QUALITY TEST", width=70)
         chunks = chunk_text(text, embedder=embedder)
         sentences = split_sentences(text)
+        results.append(
+            {
+                "tokens": tokens,
+                "sentences": len(sentences),
+                "chunks": len(chunks),
+                "avg_chunk_tokens": np.mean([estimate_tokens(c) for c in chunks]),
+            }
+        )
         if idx < 5:
             logger.info(
                 "Review %d [%d*] (%d tok) -> %d chunks",
+                idx + 1,
+                rating,
+                tokens,
+                len(chunks),
             )
     results_df = pd.DataFrame(results)
     log_section(logger, f"Summary ({len(results_df)} reviews)")
     logger.info(
         "Chunks per review: %.2f (median: %.0f)",
+        results_df["chunks"].mean(),
+        results_df["chunks"].median(),
     )
     logger.info("Avg tokens/chunk: %.0f", results_df["avg_chunk_tokens"].mean())
+    expansion = (
+        results_df["chunks"] * results_df["avg_chunk_tokens"]
+    ).sum() / results_df["tokens"].sum()
     logger.info("Expansion ratio: %.2fx", expansion)
 # MAIN PIPELINE
 # ============================================================================
 def run_pipeline(subset_size: int, force: bool):
     """Run the full data pipeline: load, chunk, embed, upload."""
     logger.info("Config", extra={"subset_size": subset_size, "force": force})
     needs_chunking = (df["estimated_tokens"] > 200).sum()
     logger.info(
         "Reviews needing chunking (>200 tokens): %d (%.1f%%)",
+        needs_chunking,
+        needs_chunking / len(df) * 100,
     )
     # Prepare reviews for chunking
     chunks = chunk_reviews_batch(reviews_for_chunking, embedder=embedder)
     logger.info(
         "Created %d chunks from %d reviews (expansion: %.2fx)",
+        len(chunks),
+        len(reviews_for_chunking),
+        len(chunks) / len(reviews_for_chunking),
     )
     # Generate embeddings
     cache_path = DATA_DIR / f"embeddings_{len(chunks)}.npy"
     logger.info("Embedding %d chunks...", len(chunk_texts))
+    embeddings = embedder.embed_passages(
+        chunk_texts, cache_path=cache_path, force=force
+    )
     logger.info("Embeddings shape: %s", embeddings.shape)
     # Embedding technical validation
     sim_out = float(np.dot(emb_query, emb_out))
     logger.info("Query: '%s'", test_query)
+    logger.info(
+        "  In-domain (same topic):  '%s' = %.3f", in_domain_similar, sim_in_similar
+    )
+    logger.info(
+        "  In-domain (diff topic):  '%s' = %.3f", in_domain_different, sim_in_different
+    )
     logger.info("  Out-of-domain:           '%s' = %.3f", out_of_domain, sim_out)
     if sim_in_similar > sim_in_different > sim_out:
+        logger.info(
+            "Ranking correct: %.3f > %.3f > %.3f",
+            sim_in_similar,
+            sim_in_different,
+            sim_out,
+        )
     else:
         logger.warning("Unexpected ranking")
 def main():
     parser = argparse.ArgumentParser(description="Run the data pipeline")
+    parser.add_argument(
+        "--force", action="store_true", help="Force recreate collection"
+    )
+    parser.add_argument(
+        "--subset-size",
+        type=int,
+        default=DEV_SUBSET_SIZE,
+        help="Number of reviews to load initially",
+    )
+    parser.add_argument(
+        "--validate-tokenizer",
+        action="store_true",
+        help="Run tokenizer validation only",
+    )
+    parser.add_argument(
+        "--test-chunking", action="store_true", help="Run chunking quality test only"
+    )
     args = parser.parse_args()
     if args.validate_tokenizer:

scripts/sanity_checks.py CHANGED Viewed

@@ -42,6 +42,7 @@ RESULTS_DIR.mkdir(exist_ok=True)
 # SECTION: Spot-Check
 # ============================================================================
 def run_spot_check():
     """Manual spot-check of explanations vs evidence."""
     from sage.services.explanation import Explainer
@@ -56,18 +57,24 @@ def run_spot_check():
     queries = EVALUATION_QUERIES[:5]
     for query in queries:
-        products = get_candidates(query=query, k=2, min_rating=4.0, aggregation=AggregationMethod.MAX)
         for product in products[:2]:
             result = explainer.generate_explanation(query, product, max_evidence=3)
             hhem = detector.check_explanation(result.evidence_texts, result.explanation)
             log_section(logger, f"SAMPLE {len(results) + 1}")
-            logger.info("Query: \"%s\"", query)
-            logger.info("HHEM: %.3f (%s)", hhem.score, "PASS" if not hhem.is_hallucinated else "FAIL")
             logger.info("EVIDENCE:")
             for ev in result.evidence_texts[:2]:
-                logger.info("  \"%s...\"", ev[:100])
             logger.info("EXPLANATION:")
             logger.info("  %s", result.explanation)
@@ -86,6 +93,7 @@ def run_spot_check():
 # SECTION: Adversarial Tests
 # ============================================================================
 def run_adversarial_tests():
     """Test with contradictory evidence."""
     from sage.services.explanation import Explainer
@@ -118,10 +126,28 @@ def run_adversarial_tests():
         log_section(logger, case["name"])
         chunks = [
-            RetrievedChunk(text=case["positive"], score=0.9, product_id="TEST", rating=5.0, review_id="pos"),
-            RetrievedChunk(text=case["negative"], score=0.85, product_id="TEST", rating=1.0, review_id="neg"),
         ]
-        product = ProductScore(product_id="TEST", score=0.85, chunk_count=2, avg_rating=3.0, evidence=chunks)
         result = explainer.generate_explanation(case["query"], product, max_evidence=2)
         hhem = detector.check_explanation(result.evidence_texts, result.explanation)
@@ -142,6 +168,7 @@ def run_adversarial_tests():
 # SECTION: Empty Context Tests
 # ============================================================================
 def run_empty_context_tests():
     """Test graceful refusal with irrelevant evidence."""
     from sage.services.explanation import Explainer
@@ -153,19 +180,46 @@ def run_empty_context_tests():
     detector = HallucinationDetector()
     cases = [
-        {"name": "Irrelevant", "query": "quantum computing textbook", "evidence": "Great USB cable."},
         {"name": "Minimal", "query": "high-quality camera lens", "evidence": "OK."},
-        {"name": "Foreign", "query": "wireless mouse", "evidence": "Muy bueno el producto."},
     ]
-    refusal_words = ["cannot", "can't", "unable", "no evidence", "insufficient", "limited"]
     results = []
     for case in cases:
         log_section(logger, case["name"])
-        chunk = RetrievedChunk(text=case["evidence"], score=0.3, product_id="TEST", rating=3.0, review_id="r1")
-        product = ProductScore(product_id="TEST", score=0.3, chunk_count=1, avg_rating=3.0, evidence=[chunk])
         result = explainer.generate_explanation(case["query"], product, max_evidence=1)
         _hhem = detector.check_explanation(result.evidence_texts, result.explanation)
@@ -185,6 +239,7 @@ def run_empty_context_tests():
 # SECTION: Calibration Check
 # ============================================================================
 @dataclass
 class CalibrationSample:
     query: str
@@ -210,21 +265,27 @@ def run_calibration_check():
     logger.info("Generating samples...")
     for query in queries:
-        products = get_candidates(query=query, k=5, min_rating=3.0, aggregation=AggregationMethod.MAX)
         for product in products[:2]:
             try:
                 result = explainer.generate_explanation(query, product, max_evidence=3)
-                hhem = detector.check_explanation(result.evidence_texts, result.explanation)
-                samples.append(CalibrationSample(
-                    query=query,
-                    product_id=product.product_id,
-                    retrieval_score=product.score,
-                    evidence_count=product.chunk_count,
-                    avg_rating=product.avg_rating,
-                    hhem_score=hhem.score,
-                ))
             except Exception:
                 logger.debug("Error generating sample", exc_info=True)
@@ -251,24 +312,28 @@ def run_calibration_check():
     # Stratified analysis
     sorted_samples = sorted(samples, key=lambda s: s.retrieval_score)
     n = len(sorted_samples)
-    low = sorted_samples[:n//3]
-    mid = sorted_samples[n//3:2*n//3]
-    high = sorted_samples[2*n//3:]
     log_section(logger, "HHEM by Confidence Tier")
     logger.info("  LOW  (n=%2d): %.3f", len(low), np.mean([s.hhem_score for s in low]))
     logger.info("  MED  (n=%2d): %.3f", len(mid), np.mean([s.hhem_score for s in mid]))
-    logger.info("  HIGH (n=%2d): %.3f", len(high), np.mean([s.hhem_score for s in high]))
 # ============================================================================
 # Main
 # ============================================================================
 def main():
     parser = argparse.ArgumentParser(description="Run pipeline sanity checks")
     parser.add_argument(
-        "--section", "-s",
         choices=["all", "spot", "adversarial", "empty", "calibration"],
         default="all",
         help="Which section to run",

 # SECTION: Spot-Check
 # ============================================================================
 def run_spot_check():
     """Manual spot-check of explanations vs evidence."""
     from sage.services.explanation import Explainer
     queries = EVALUATION_QUERIES[:5]
     for query in queries:
+        products = get_candidates(
+            query=query, k=2, min_rating=4.0, aggregation=AggregationMethod.MAX
+        )
         for product in products[:2]:
             result = explainer.generate_explanation(query, product, max_evidence=3)
             hhem = detector.check_explanation(result.evidence_texts, result.explanation)
             log_section(logger, f"SAMPLE {len(results) + 1}")
+            logger.info('Query: "%s"', query)
+            logger.info(
+                "HHEM: %.3f (%s)",
+                hhem.score,
+                "PASS" if not hhem.is_hallucinated else "FAIL",
+            )
             logger.info("EVIDENCE:")
             for ev in result.evidence_texts[:2]:
+                logger.info('  "%s..."', ev[:100])
             logger.info("EXPLANATION:")
             logger.info("  %s", result.explanation)
 # SECTION: Adversarial Tests
 # ============================================================================
 def run_adversarial_tests():
     """Test with contradictory evidence."""
     from sage.services.explanation import Explainer
         log_section(logger, case["name"])
         chunks = [
+            RetrievedChunk(
+                text=case["positive"],
+                score=0.9,
+                product_id="TEST",
+                rating=5.0,
+                review_id="pos",
+            ),
+            RetrievedChunk(
+                text=case["negative"],
+                score=0.85,
+                product_id="TEST",
+                rating=1.0,
+                review_id="neg",
+            ),
         ]
+        product = ProductScore(
+            product_id="TEST",
+            score=0.85,
+            chunk_count=2,
+            avg_rating=3.0,
+            evidence=chunks,
+        )
         result = explainer.generate_explanation(case["query"], product, max_evidence=2)
         hhem = detector.check_explanation(result.evidence_texts, result.explanation)
 # SECTION: Empty Context Tests
 # ============================================================================
 def run_empty_context_tests():
     """Test graceful refusal with irrelevant evidence."""
     from sage.services.explanation import Explainer
     detector = HallucinationDetector()
     cases = [
+        {
+            "name": "Irrelevant",
+            "query": "quantum computing textbook",
+            "evidence": "Great USB cable.",
+        },
         {"name": "Minimal", "query": "high-quality camera lens", "evidence": "OK."},
+        {
+            "name": "Foreign",
+            "query": "wireless mouse",
+            "evidence": "Muy bueno el producto.",
+        },
     ]
+    refusal_words = [
+        "cannot",
+        "can't",
+        "unable",
+        "no evidence",
+        "insufficient",
+        "limited",
+    ]
     results = []
     for case in cases:
         log_section(logger, case["name"])
+        chunk = RetrievedChunk(
+            text=case["evidence"],
+            score=0.3,
+            product_id="TEST",
+            rating=3.0,
+            review_id="r1",
+        )
+        product = ProductScore(
+            product_id="TEST",
+            score=0.3,
+            chunk_count=1,
+            avg_rating=3.0,
+            evidence=[chunk],
+        )
         result = explainer.generate_explanation(case["query"], product, max_evidence=1)
         _hhem = detector.check_explanation(result.evidence_texts, result.explanation)
 # SECTION: Calibration Check
 # ============================================================================
 @dataclass
 class CalibrationSample:
     query: str
     logger.info("Generating samples...")
     for query in queries:
+        products = get_candidates(
+            query=query, k=5, min_rating=3.0, aggregation=AggregationMethod.MAX
+        )
         for product in products[:2]:
             try:
                 result = explainer.generate_explanation(query, product, max_evidence=3)
+                hhem = detector.check_explanation(
+                    result.evidence_texts, result.explanation
+                )
+                samples.append(
+                    CalibrationSample(
+                        query=query,
+                        product_id=product.product_id,
+                        retrieval_score=product.score,
+                        evidence_count=product.chunk_count,
+                        avg_rating=product.avg_rating,
+                        hhem_score=hhem.score,
+                    )
+                )
             except Exception:
                 logger.debug("Error generating sample", exc_info=True)
     # Stratified analysis
     sorted_samples = sorted(samples, key=lambda s: s.retrieval_score)
     n = len(sorted_samples)
+    low = sorted_samples[: n // 3]
+    mid = sorted_samples[n // 3 : 2 * n // 3]
+    high = sorted_samples[2 * n // 3 :]
     log_section(logger, "HHEM by Confidence Tier")
     logger.info("  LOW  (n=%2d): %.3f", len(low), np.mean([s.hhem_score for s in low]))
     logger.info("  MED  (n=%2d): %.3f", len(mid), np.mean([s.hhem_score for s in mid]))
+    logger.info(
+        "  HIGH (n=%2d): %.3f", len(high), np.mean([s.hhem_score for s in high])
+    )
 # ============================================================================
 # Main
 # ============================================================================
 def main():
     parser = argparse.ArgumentParser(description="Run pipeline sanity checks")
     parser.add_argument(
+        "--section",
+        "-s",
         choices=["all", "spot", "adversarial", "empty", "calibration"],
         default="all",
         help="Which section to run",

scripts/summary.py CHANGED Viewed

@@ -16,7 +16,12 @@ import json
 import sys
 from pathlib import Path
-from sage.config import EVAL_DIMENSIONS, FAITHFULNESS_TARGET, HELPFULNESS_TARGET, RESULTS_DIR
 WIDTH = 60
 SEP = "=" * WIDTH
@@ -82,14 +87,20 @@ def main():
         quotes_total = mm.get("quotes_total", 0)
         if claim_pass is not None:
-            print(f"  Claim HHEM:     {fmt(claim_avg, 3)}  ({claim_pass*100:.0f}% pass)")
-            print(f"  Quote Verif:    {fmt(quote_rate, 3)}  ({quotes_found}/{quotes_total})")
         # Full-explanation HHEM (reference)
         h = faith["hhem"]
         n_grounded = n_samples - h.get("n_hallucinated", 0)
         full_avg = h.get("mean_score")
-        print(f"  Full HHEM:      {fmt(full_avg, 3)}  ({n_grounded}/{n_samples} grounded, reference)")
         # RAGAS if available
         ragas = faith.get("ragas", {})
@@ -98,8 +109,10 @@ def main():
             print(f"  RAGAS Faith:    {fmt(ragas_faith, 3)}")
         # Pass/fail: use claim-level as primary, fall back to RAGAS, then full HHEM
-        effective = claim_avg if claim_avg is not None else (
-            ragas_faith if ragas_faith is not None else full_avg
         )
         if effective is not None:
             status = "PASS" if effective >= FAITHFULNESS_TARGET else "FAIL"
@@ -123,7 +136,9 @@ def main():
             print(f"  {label + ':':<15s} {fmt(m, 2) if m is not None else '   ---'}")
         if overall is not None:
             status = "PASS" if human.get("pass", False) else "FAIL"
-            print(f"  Helpfulness:    {fmt(overall, 2)}  (target: {target:.1f})  [{status}]")
         corr = human.get("hhem_trust_correlation", {})
         r = corr.get("spearman_r")
         if r is not None:

 import sys
 from pathlib import Path
+from sage.config import (
+    EVAL_DIMENSIONS,
+    FAITHFULNESS_TARGET,
+    HELPFULNESS_TARGET,
+    RESULTS_DIR,
+)
 WIDTH = 60
 SEP = "=" * WIDTH
         quotes_total = mm.get("quotes_total", 0)
         if claim_pass is not None:
+            print(
+                f"  Claim HHEM:     {fmt(claim_avg, 3)}  ({claim_pass * 100:.0f}% pass)"
+            )
+            print(
+                f"  Quote Verif:    {fmt(quote_rate, 3)}  ({quotes_found}/{quotes_total})"
+            )
         # Full-explanation HHEM (reference)
         h = faith["hhem"]
         n_grounded = n_samples - h.get("n_hallucinated", 0)
         full_avg = h.get("mean_score")
+        print(
+            f"  Full HHEM:      {fmt(full_avg, 3)}  ({n_grounded}/{n_samples} grounded, reference)"
+        )
         # RAGAS if available
         ragas = faith.get("ragas", {})
             print(f"  RAGAS Faith:    {fmt(ragas_faith, 3)}")
         # Pass/fail: use claim-level as primary, fall back to RAGAS, then full HHEM
+        effective = (
+            claim_avg
+            if claim_avg is not None
+            else (ragas_faith if ragas_faith is not None else full_avg)
         )
         if effective is not None:
             status = "PASS" if effective >= FAITHFULNESS_TARGET else "FAIL"
             print(f"  {label + ':':<15s} {fmt(m, 2) if m is not None else '   ---'}")
         if overall is not None:
             status = "PASS" if human.get("pass", False) else "FAIL"
+            print(
+                f"  Helpfulness:    {fmt(overall, 2)}  (target: {target:.1f})  [{status}]"
+            )
         corr = human.get("hhem_trust_correlation", {})
         r = corr.get("spearman_r")
         if r is not None:

tests/test_aggregation.py CHANGED Viewed

@@ -82,7 +82,9 @@ class TestApplyWeightedRanking:
             ProductScore(product_id="A", score=0.9, chunk_count=2, avg_rating=3.0),
             ProductScore(product_id="B", score=0.7, chunk_count=1, avg_rating=5.0),
         ]
-        ranked = apply_weighted_ranking(products, similarity_weight=0.5, rating_weight=0.5)
         assert len(ranked) == 2
         # B has higher rating, so with 50/50 weights it might rank higher
         assert all(isinstance(p, ProductScore) for p in ranked)
@@ -92,7 +94,9 @@ class TestApplyWeightedRanking:
             ProductScore(product_id="A", score=0.9, chunk_count=1, avg_rating=1.0),
             ProductScore(product_id="B", score=0.5, chunk_count=1, avg_rating=5.0),
         ]
-        ranked = apply_weighted_ranking(products, similarity_weight=1.0, rating_weight=0.0)
         assert ranked[0].product_id == "A"
     def test_pure_rating_reranks(self):
@@ -100,7 +104,9 @@ class TestApplyWeightedRanking:
             ProductScore(product_id="A", score=0.9, chunk_count=1, avg_rating=1.0),
             ProductScore(product_id="B", score=0.5, chunk_count=1, avg_rating=5.0),
         ]
-        ranked = apply_weighted_ranking(products, similarity_weight=0.0, rating_weight=1.0)
         assert ranked[0].product_id == "B"
     def test_single_product(self):

             ProductScore(product_id="A", score=0.9, chunk_count=2, avg_rating=3.0),
             ProductScore(product_id="B", score=0.7, chunk_count=1, avg_rating=5.0),
         ]
+        ranked = apply_weighted_ranking(
+            products, similarity_weight=0.5, rating_weight=0.5
+        )
         assert len(ranked) == 2
         # B has higher rating, so with 50/50 weights it might rank higher
         assert all(isinstance(p, ProductScore) for p in ranked)
             ProductScore(product_id="A", score=0.9, chunk_count=1, avg_rating=1.0),
             ProductScore(product_id="B", score=0.5, chunk_count=1, avg_rating=5.0),
         ]
+        ranked = apply_weighted_ranking(
+            products, similarity_weight=1.0, rating_weight=0.0
+        )
         assert ranked[0].product_id == "A"
     def test_pure_rating_reranks(self):
             ProductScore(product_id="A", score=0.9, chunk_count=1, avg_rating=1.0),
             ProductScore(product_id="B", score=0.5, chunk_count=1, avg_rating=5.0),
         ]
+        ranked = apply_weighted_ranking(
+            products, similarity_weight=0.0, rating_weight=1.0
+        )
         assert ranked[0].product_id == "B"
     def test_single_product(self):

tests/test_api.py CHANGED Viewed

@@ -27,9 +27,16 @@ def _make_app(**state_overrides) -> FastAPI:
     mock_cache = MagicMock()
     mock_cache.get.return_value = (None, "miss")
     mock_cache.stats.return_value = SimpleNamespace(
-        size=0, max_entries=100, exact_hits=0, semantic_hits=0,
-        misses=0, evictions=0, hit_rate=0.0, ttl_seconds=3600.0,
-        similarity_threshold=0.92, avg_semantic_similarity=0.0,
     )
     app.state.qdrant = state_overrides.get("qdrant", mock_qdrant)
@@ -56,6 +63,7 @@ class TestHealthEndpoint:
         with TestClient(app) as c:
             # Patch collection_exists to return True
             import sage.api.routes as routes_mod
             original = routes_mod.collection_exists
             routes_mod.collection_exists = lambda client: True
             try:
@@ -70,6 +78,7 @@ class TestHealthEndpoint:
     def test_degraded_when_collection_missing(self):
         app = _make_app()
         import sage.api.routes as routes_mod
         original = routes_mod.collection_exists
         routes_mod.collection_exists = lambda client: False
         try:
@@ -90,6 +99,7 @@ class TestRecommendEndpoint:
     def test_empty_results(self, client):
         import sage.api.routes as routes_mod
         original = routes_mod.get_candidates
         routes_mod.get_candidates = lambda **kw: []
         try:
@@ -102,12 +112,18 @@ class TestRecommendEndpoint:
     def test_returns_products_without_explain(self):
         product = ProductScore(
-            product_id="P1", score=0.9, chunk_count=2, avg_rating=4.5,
             evidence=[
-                RetrievedChunk(text="Good", score=0.9, product_id="P1", rating=4.5, review_id="r1"),
             ],
         )
         import sage.api.routes as routes_mod
         original = routes_mod.get_candidates
         routes_mod.get_candidates = lambda **kw: [product]
         app = _make_app()
@@ -126,12 +142,18 @@ class TestRecommendEndpoint:
     def test_explainer_unavailable_returns_503(self):
         product = ProductScore(
-            product_id="P1", score=0.9, chunk_count=2, avg_rating=4.5,
             evidence=[
-                RetrievedChunk(text="Good", score=0.9, product_id="P1", rating=4.5, review_id="r1"),
             ],
         )
         import sage.api.routes as routes_mod
         original = routes_mod.get_candidates
         routes_mod.get_candidates = lambda **kw: [product]

     mock_cache = MagicMock()
     mock_cache.get.return_value = (None, "miss")
     mock_cache.stats.return_value = SimpleNamespace(
+        size=0,
+        max_entries=100,
+        exact_hits=0,
+        semantic_hits=0,
+        misses=0,
+        evictions=0,
+        hit_rate=0.0,
+        ttl_seconds=3600.0,
+        similarity_threshold=0.92,
+        avg_semantic_similarity=0.0,
     )
     app.state.qdrant = state_overrides.get("qdrant", mock_qdrant)
         with TestClient(app) as c:
             # Patch collection_exists to return True
             import sage.api.routes as routes_mod
             original = routes_mod.collection_exists
             routes_mod.collection_exists = lambda client: True
             try:
     def test_degraded_when_collection_missing(self):
         app = _make_app()
         import sage.api.routes as routes_mod
         original = routes_mod.collection_exists
         routes_mod.collection_exists = lambda client: False
         try:
     def test_empty_results(self, client):
         import sage.api.routes as routes_mod
         original = routes_mod.get_candidates
         routes_mod.get_candidates = lambda **kw: []
         try:
     def test_returns_products_without_explain(self):
         product = ProductScore(
+            product_id="P1",
+            score=0.9,
+            chunk_count=2,
+            avg_rating=4.5,
             evidence=[
+                RetrievedChunk(
+                    text="Good", score=0.9, product_id="P1", rating=4.5, review_id="r1"
+                ),
             ],
         )
         import sage.api.routes as routes_mod
         original = routes_mod.get_candidates
         routes_mod.get_candidates = lambda **kw: [product]
         app = _make_app()
     def test_explainer_unavailable_returns_503(self):
         product = ProductScore(
+            product_id="P1",
+            score=0.9,
+            chunk_count=2,
+            avg_rating=4.5,
             evidence=[
+                RetrievedChunk(
+                    text="Good", score=0.9, product_id="P1", rating=4.5, review_id="r1"
+                ),
             ],
         )
         import sage.api.routes as routes_mod
         original = routes_mod.get_candidates
         routes_mod.get_candidates = lambda **kw: [product]

tests/test_chunking.py CHANGED Viewed

@@ -67,7 +67,9 @@ class TestSlidingWindowChunk:
     def test_long_text_creates_multiple_chunks(self):
         # Create text long enough to require multiple chunks
-        sentences = [f"This is sentence number {i} with some padding text." for i in range(20)]
         text = " ".join(sentences)
         chunks = sliding_window_chunk(text, chunk_size=50, overlap=10)
         assert len(chunks) > 1

     def test_long_text_creates_multiple_chunks(self):
         # Create text long enough to require multiple chunks
+        sentences = [
+            f"This is sentence number {i} with some padding text." for i in range(20)
+        ]
         text = " ".join(sentences)
         chunks = sliding_window_chunk(text, chunk_size=50, overlap=10)
         assert len(chunks) > 1

tests/test_evidence.py CHANGED Viewed

@@ -50,7 +50,10 @@ class TestCheckEvidenceQuality:
         product = _product(score=0.3, n_chunks=3, text_len=300)
         quality = check_evidence_quality(product, min_score=0.7)
         assert quality.is_sufficient is False
-        assert "relevance" in quality.failure_reason.lower() or "score" in quality.failure_reason.lower()
     def test_tracks_chunk_count(self):
         product = _product(score=0.85, n_chunks=4, text_len=200)

         product = _product(score=0.3, n_chunks=3, text_len=300)
         quality = check_evidence_quality(product, min_score=0.7)
         assert quality.is_sufficient is False
+        assert (
+            "relevance" in quality.failure_reason.lower()
+            or "score" in quality.failure_reason.lower()
+        )
     def test_tracks_chunk_count(self):
         product = _product(score=0.85, n_chunks=4, text_len=200)

tests/test_faithfulness.py CHANGED Viewed

@@ -29,13 +29,20 @@ class TestIsRefusal:
 class TestIsMismatchWarning:
     def test_detects_not_best_match(self):
-        assert is_mismatch_warning("This product may not be the best match for your needs.") is True
     def test_detects_not_designed_for(self):
         assert is_mismatch_warning("This is not designed for that purpose.") is True
     def test_detects_not_suitable(self):
-        assert is_mismatch_warning("This product is not suitable for heavy use.") is True
     def test_normal_explanation_not_mismatch(self):
         assert is_mismatch_warning("Great headphones with noise cancellation.") is False

 class TestIsMismatchWarning:
     def test_detects_not_best_match(self):
+        assert (
+            is_mismatch_warning(
+                "This product may not be the best match for your needs."
+            )
+            is True
+        )
     def test_detects_not_designed_for(self):
         assert is_mismatch_warning("This is not designed for that purpose.") is True
     def test_detects_not_suitable(self):
+        assert (
+            is_mismatch_warning("This product is not suitable for heavy use.") is True
+        )
     def test_normal_explanation_not_mismatch(self):
         assert is_mismatch_warning("Great headphones with noise cancellation.") is False

tests/test_models.py CHANGED Viewed

@@ -35,19 +35,32 @@ class TestNewItem:
 class TestProductScore:
     def test_top_evidence_returns_highest(self):
         chunks = [
-            RetrievedChunk(text="low", score=0.5, product_id="P1", rating=4.0, review_id="r1"),
-            RetrievedChunk(text="high", score=0.9, product_id="P1", rating=4.0, review_id="r2"),
-            RetrievedChunk(text="mid", score=0.7, product_id="P1", rating=4.0, review_id="r3"),
         ]
         product = ProductScore(
-            product_id="P1", score=0.9, chunk_count=3, avg_rating=4.0, evidence=chunks,
         )
         assert product.top_evidence.text == "high"
         assert product.top_evidence.score == 0.9
     def test_top_evidence_empty(self):
         product = ProductScore(
-            product_id="P1", score=0.5, chunk_count=0, avg_rating=4.0,
         )
         assert product.top_evidence is None
@@ -116,7 +129,10 @@ class TestStreamingExplanation:
 class TestEvidenceQuality:
     def test_sufficient(self):
         eq = EvidenceQuality(
-            is_sufficient=True, chunk_count=3, total_tokens=150, top_score=0.9,
         )
         assert eq.is_sufficient is True
         assert eq.failure_reason is None

 class TestProductScore:
     def test_top_evidence_returns_highest(self):
         chunks = [
+            RetrievedChunk(
+                text="low", score=0.5, product_id="P1", rating=4.0, review_id="r1"
+            ),
+            RetrievedChunk(
+                text="high", score=0.9, product_id="P1", rating=4.0, review_id="r2"
+            ),
+            RetrievedChunk(
+                text="mid", score=0.7, product_id="P1", rating=4.0, review_id="r3"
+            ),
         ]
         product = ProductScore(
+            product_id="P1",
+            score=0.9,
+            chunk_count=3,
+            avg_rating=4.0,
+            evidence=chunks,
         )
         assert product.top_evidence.text == "high"
         assert product.top_evidence.score == 0.9
     def test_top_evidence_empty(self):
         product = ProductScore(
+            product_id="P1",
+            score=0.5,
+            chunk_count=0,
+            avg_rating=4.0,
         )
         assert product.top_evidence is None
 class TestEvidenceQuality:
     def test_sufficient(self):
         eq = EvidenceQuality(
+            is_sufficient=True,
+            chunk_count=3,
+            total_tokens=150,
+            top_score=0.9,
         )
         assert eq.is_sufficient is True
         assert eq.failure_reason is None