Harden CORS defaults (empty by default, explicit whitelist)
Browse files- .env.example +5 -0
- sage/api/app.py +58 -15
- sage/api/context.py +21 -0
- sage/api/middleware.py +14 -2
- sage/api/routes.py +131 -57
- sage/config/__init__.py +9 -0
- sage/config/logging.py +23 -0
- sage/core/__init__.py +0 -2
- sage/core/verification.py +4 -2
- sage/services/faithfulness.py +10 -17
- sage/utils.py +16 -0
- tests/test_production.py +293 -0
.env.example
CHANGED
|
@@ -16,6 +16,11 @@ ANTHROPIC_API_KEY=your_anthropic_api_key
|
|
| 16 |
# QDRANT_URL=https://your-cluster.cloud.qdrant.io
|
| 17 |
# QDRANT_API_KEY=your_qdrant_api_key
|
| 18 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
# =============================================================================
|
| 20 |
# Optional
|
| 21 |
# =============================================================================
|
|
|
|
| 16 |
# QDRANT_URL=https://your-cluster.cloud.qdrant.io
|
| 17 |
# QDRANT_API_KEY=your_qdrant_api_key
|
| 18 |
|
| 19 |
+
# =============================================================================
|
| 20 |
+
# Security
|
| 21 |
+
# =============================================================================
|
| 22 |
+
# CORS_ORIGINS=https://your-domain.com,http://localhost:3000 # Comma-separated
|
| 23 |
+
|
| 24 |
# =============================================================================
|
| 25 |
# Optional
|
| 26 |
# =============================================================================
|
sage/api/app.py
CHANGED
|
@@ -26,7 +26,13 @@ from sage.api.middleware import (
|
|
| 26 |
from sage.api.routes import router
|
| 27 |
from sage.config import get_logger
|
| 28 |
|
| 29 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
|
| 31 |
# Graceful shutdown timeout (seconds to wait for active requests)
|
| 32 |
SHUTDOWN_TIMEOUT = float(os.getenv("SHUTDOWN_TIMEOUT", "30.0"))
|
|
@@ -49,17 +55,41 @@ async def _lifespan(app: FastAPI):
|
|
| 49 |
reset_shutdown_coordinator()
|
| 50 |
coordinator = get_shutdown_coordinator()
|
| 51 |
|
| 52 |
-
# Validate LLM credentials early
|
| 53 |
from sage.config import ANTHROPIC_API_KEY, LLM_PROVIDER, OPENAI_API_KEY
|
| 54 |
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
)
|
| 59 |
-
elif LLM_PROVIDER == "anthropic" and not ANTHROPIC_API_KEY:
|
| 60 |
-
logger.warning("LLM_PROVIDER=anthropic but ANTHROPIC_API_KEY is not set")
|
| 61 |
-
elif LLM_PROVIDER == "openai" and not OPENAI_API_KEY:
|
| 62 |
-
logger.warning("LLM_PROVIDER=openai but OPENAI_API_KEY is not set")
|
| 63 |
|
| 64 |
# Embedder (loads E5-small model) -- required for all requests
|
| 65 |
from sage.adapters.embeddings import get_embedder
|
|
@@ -134,11 +164,24 @@ def create_app() -> FastAPI:
|
|
| 134 |
lifespan=_lifespan,
|
| 135 |
)
|
| 136 |
app.add_middleware(LatencyMiddleware)
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 143 |
app.include_router(router)
|
| 144 |
return app
|
|
|
|
| 26 |
from sage.api.routes import router
|
| 27 |
from sage.config import get_logger
|
| 28 |
|
| 29 |
+
# CORS configuration - explicit origins required for security.
|
| 30 |
+
# Default to empty (no CORS) rather than "*" (all origins).
|
| 31 |
+
# Set CORS_ORIGINS="https://your-domain.com,http://localhost:3000" in production.
|
| 32 |
+
_cors_env = os.getenv("CORS_ORIGINS", "")
# Blank/unset variable yields an empty whitelist (CORS middleware not installed).
CORS_ORIGINS: list[str] = [
    origin.strip() for origin in _cors_env.split(",") if origin.strip()
]
|
| 36 |
|
| 37 |
# Graceful shutdown timeout (seconds to wait for active requests)
|
| 38 |
SHUTDOWN_TIMEOUT = float(os.getenv("SHUTDOWN_TIMEOUT", "30.0"))
|
|
|
|
| 55 |
reset_shutdown_coordinator()
|
| 56 |
coordinator = get_shutdown_coordinator()
|
| 57 |
|
| 58 |
+
# Validate LLM credentials early - fail fast if invalid
|
| 59 |
from sage.config import ANTHROPIC_API_KEY, LLM_PROVIDER, OPENAI_API_KEY
|
| 60 |
|
| 61 |
+
def _validate_api_key(key: str | None, provider: str) -> bool:
|
| 62 |
+
"""Validate API key format. Returns True if valid."""
|
| 63 |
+
if not key:
|
| 64 |
+
return False
|
| 65 |
+
if provider == "anthropic":
|
| 66 |
+
# Anthropic keys start with "sk-ant-" and are 100+ chars
|
| 67 |
+
return key.startswith("sk-ant-") and len(key) > 50
|
| 68 |
+
if provider == "openai":
|
| 69 |
+
# OpenAI keys start with "sk-" and are 40+ chars
|
| 70 |
+
return key.startswith("sk-") and len(key) > 20
|
| 71 |
+
return bool(key) # Unknown provider - just check non-empty
|
| 72 |
+
|
| 73 |
+
if LLM_PROVIDER == "anthropic":
|
| 74 |
+
if not ANTHROPIC_API_KEY:
|
| 75 |
+
logger.error("LLM_PROVIDER=anthropic but ANTHROPIC_API_KEY is not set")
|
| 76 |
+
raise ValueError("ANTHROPIC_API_KEY required when LLM_PROVIDER=anthropic")
|
| 77 |
+
if not _validate_api_key(ANTHROPIC_API_KEY, "anthropic"):
|
| 78 |
+
logger.error("ANTHROPIC_API_KEY has invalid format")
|
| 79 |
+
raise ValueError(
|
| 80 |
+
"ANTHROPIC_API_KEY has invalid format (expected sk-ant-...)"
|
| 81 |
+
)
|
| 82 |
+
elif LLM_PROVIDER == "openai":
|
| 83 |
+
if not OPENAI_API_KEY:
|
| 84 |
+
logger.error("LLM_PROVIDER=openai but OPENAI_API_KEY is not set")
|
| 85 |
+
raise ValueError("OPENAI_API_KEY required when LLM_PROVIDER=openai")
|
| 86 |
+
if not _validate_api_key(OPENAI_API_KEY, "openai"):
|
| 87 |
+
logger.error("OPENAI_API_KEY has invalid format")
|
| 88 |
+
raise ValueError("OPENAI_API_KEY has invalid format (expected sk-...)")
|
| 89 |
+
else:
|
| 90 |
+
logger.warning(
|
| 91 |
+
"Unknown LLM_PROVIDER=%s, skipping credential validation", LLM_PROVIDER
|
| 92 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
|
| 94 |
# Embedder (loads E5-small model) -- required for all requests
|
| 95 |
from sage.adapters.embeddings import get_embedder
|
|
|
|
| 164 |
lifespan=_lifespan,
|
| 165 |
)
|
| 166 |
app.add_middleware(LatencyMiddleware)
|
| 167 |
+
|
| 168 |
+
# CORS middleware with security hardening
|
| 169 |
+
if CORS_ORIGINS:
|
| 170 |
+
if "*" in CORS_ORIGINS:
|
| 171 |
+
logger.warning(
|
| 172 |
+
"CORS_ORIGINS contains '*' - this allows requests from any origin. "
|
| 173 |
+
"Set explicit origins in production."
|
| 174 |
+
)
|
| 175 |
+
app.add_middleware(
|
| 176 |
+
CORSMiddleware,
|
| 177 |
+
allow_origins=CORS_ORIGINS,
|
| 178 |
+
allow_methods=["GET", "POST"],
|
| 179 |
+
allow_headers=["Content-Type", "Accept", "Authorization"],
|
| 180 |
+
allow_credentials=False,
|
| 181 |
+
max_age=3600, # Cache preflight for 1 hour
|
| 182 |
+
)
|
| 183 |
+
else:
|
| 184 |
+
logger.info("CORS disabled (no CORS_ORIGINS configured)")
|
| 185 |
+
|
| 186 |
app.include_router(router)
|
| 187 |
return app
|
sage/api/context.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""
Request context management using contextvars.

Provides thread-safe request context propagation for logging and tracing.
Request ID set in middleware is accessible throughout the request lifecycle.
"""

from contextvars import ContextVar

# Correlation ID for the current request; "-" means "no request context".
request_id_var: ContextVar[str] = ContextVar("request_id", default="-")


def get_request_id() -> str:
    """Return the request ID bound to the current context ("-" if unset)."""
    return request_id_var.get()


def set_request_id(request_id: str) -> None:
    """Bind *request_id* to the current context for downstream log records."""
    # NOTE(review): the ContextVar token is discarded, so the value is never
    # explicitly reset; assumes each request runs in its own task - confirm.
    request_id_var.set(request_id)
|
sage/api/middleware.py
CHANGED
|
@@ -1,8 +1,9 @@
|
|
| 1 |
"""
|
| 2 |
-
Request latency middleware and graceful shutdown coordinator.
|
| 3 |
|
| 4 |
Logs method/path/status/elapsed_ms for every request and records
|
| 5 |
-
Prometheus histogram observations. Adds
|
|
|
|
| 6 |
|
| 7 |
Uses a pure ASGI middleware (not BaseHTTPMiddleware) to avoid buffering
|
| 8 |
SSE streams.
|
|
@@ -24,6 +25,7 @@ from dataclasses import dataclass, field
|
|
| 24 |
from starlette.responses import JSONResponse
|
| 25 |
from starlette.types import ASGIApp, Message, Receive, Scope, Send
|
| 26 |
|
|
|
|
| 27 |
from sage.api.metrics import observe_duration, record_request
|
| 28 |
from sage.config import get_logger
|
| 29 |
|
|
@@ -192,6 +194,7 @@ class LatencyMiddleware:
|
|
| 192 |
|
| 193 |
start = time.perf_counter()
|
| 194 |
request_id = uuid.uuid4().hex[:12]
|
|
|
|
| 195 |
status = 500 # default until we see http.response.start
|
| 196 |
|
| 197 |
async def send_wrapper(message: Message) -> None:
|
|
@@ -202,8 +205,17 @@ class LatencyMiddleware:
|
|
| 202 |
# The Prometheus histogram (in finally) measures total time.
|
| 203 |
elapsed_ms = (time.perf_counter() - start) * 1000
|
| 204 |
headers = list(message.get("headers", []))
|
|
|
|
| 205 |
headers.append((b"x-response-time-ms", f"{elapsed_ms:.1f}".encode()))
|
| 206 |
headers.append((b"x-request-id", request_id.encode()))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 207 |
message = {**message, "headers": headers}
|
| 208 |
await send(message)
|
| 209 |
|
|
|
|
| 1 |
"""
|
| 2 |
+
Request latency middleware, security headers, and graceful shutdown coordinator.
|
| 3 |
|
| 4 |
Logs method/path/status/elapsed_ms for every request and records
|
| 5 |
+
Prometheus histogram observations. Adds security headers and
|
| 6 |
+
``X-Response-Time-Ms`` header.
|
| 7 |
|
| 8 |
Uses a pure ASGI middleware (not BaseHTTPMiddleware) to avoid buffering
|
| 9 |
SSE streams.
|
|
|
|
| 25 |
from starlette.responses import JSONResponse
|
| 26 |
from starlette.types import ASGIApp, Message, Receive, Scope, Send
|
| 27 |
|
| 28 |
+
from sage.api.context import set_request_id
|
| 29 |
from sage.api.metrics import observe_duration, record_request
|
| 30 |
from sage.config import get_logger
|
| 31 |
|
|
|
|
| 194 |
|
| 195 |
start = time.perf_counter()
|
| 196 |
request_id = uuid.uuid4().hex[:12]
|
| 197 |
+
set_request_id(request_id) # Propagate to all child operations
|
| 198 |
status = 500 # default until we see http.response.start
|
| 199 |
|
| 200 |
async def send_wrapper(message: Message) -> None:
|
|
|
|
| 205 |
# The Prometheus histogram (in finally) measures total time.
|
| 206 |
elapsed_ms = (time.perf_counter() - start) * 1000
|
| 207 |
headers = list(message.get("headers", []))
|
| 208 |
+
# Timing and correlation headers
|
| 209 |
headers.append((b"x-response-time-ms", f"{elapsed_ms:.1f}".encode()))
|
| 210 |
headers.append((b"x-request-id", request_id.encode()))
|
| 211 |
+
# Security headers
|
| 212 |
+
headers.append((b"x-content-type-options", b"nosniff"))
|
| 213 |
+
headers.append((b"x-frame-options", b"DENY"))
|
| 214 |
+
headers.append((b"x-xss-protection", b"1; mode=block"))
|
| 215 |
+
headers.append((b"referrer-policy", b"strict-origin-when-cross-origin"))
|
| 216 |
+
headers.append(
|
| 217 |
+
(b"cache-control", b"no-store, no-cache, must-revalidate")
|
| 218 |
+
)
|
| 219 |
message = {**message, "headers": headers}
|
| 220 |
await send(message)
|
| 221 |
|
sage/api/routes.py
CHANGED
|
@@ -15,7 +15,7 @@ from __future__ import annotations
|
|
| 15 |
import asyncio
|
| 16 |
import json
|
| 17 |
import os
|
| 18 |
-
from concurrent.futures import ThreadPoolExecutor
|
| 19 |
from typing import AsyncIterator
|
| 20 |
|
| 21 |
import numpy as np
|
|
@@ -26,6 +26,7 @@ from pydantic import BaseModel, Field
|
|
| 26 |
from sage.adapters.vector_store import collection_exists
|
| 27 |
from sage.api.metrics import metrics_response, record_cache_event, record_error
|
| 28 |
from sage.config import MAX_EVIDENCE, get_logger
|
|
|
|
| 29 |
from sage.core import (
|
| 30 |
AggregationMethod,
|
| 31 |
ExplanationResult,
|
|
@@ -39,6 +40,9 @@ from sage.services.retrieval import get_candidates
|
|
| 39 |
# good parallelism while bounding total concurrent LLM calls.
|
| 40 |
_MAX_EXPLAIN_WORKERS = 4
|
| 41 |
|
|
|
|
|
|
|
|
|
|
| 42 |
# Request timeout in seconds. David's rule: 10s max end-to-end.
|
| 43 |
# If the LLM hangs, cut it off and return what we have.
|
| 44 |
REQUEST_TIMEOUT_SECONDS = float(os.getenv("REQUEST_TIMEOUT_SECONDS", "10.0"))
|
|
@@ -206,6 +210,16 @@ def _build_evidence_list(result: ExplanationResult) -> list[dict]:
|
|
| 206 |
return result.to_evidence_dicts()
|
| 207 |
|
| 208 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 209 |
# ---------------------------------------------------------------------------
|
| 210 |
# Health
|
| 211 |
# ---------------------------------------------------------------------------
|
|
@@ -313,11 +327,7 @@ async def ready(request: Request):
|
|
| 313 |
|
| 314 |
# Core components must be ready (explainer is optional)
|
| 315 |
core_ready = all(
|
| 316 |
-
|
| 317 |
-
components.get("qdrant", False),
|
| 318 |
-
components.get("embedder", False),
|
| 319 |
-
components.get("hhem", False),
|
| 320 |
-
]
|
| 321 |
)
|
| 322 |
|
| 323 |
if core_ready and components.get("explainer", False):
|
|
@@ -349,6 +359,103 @@ async def ready(request: Request):
|
|
| 349 |
# ---------------------------------------------------------------------------
|
| 350 |
|
| 351 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 352 |
def _sync_recommend(
|
| 353 |
body: RecommendationRequest,
|
| 354 |
app,
|
|
@@ -361,77 +468,44 @@ def _sync_recommend(
|
|
| 361 |
cache = app.state.cache
|
| 362 |
q = body.query
|
| 363 |
explain = body.explain
|
|
|
|
|
|
|
| 364 |
|
| 365 |
-
# Check cache before any heavy work (
|
| 366 |
-
#
|
| 367 |
-
# avoiding the cost of a second embed_single_query call.
|
| 368 |
if explain:
|
| 369 |
query_embedding = app.state.embedder.embed_single_query(q)
|
| 370 |
-
cached
|
| 371 |
-
record_cache_event(f"hit_{hit_type}" if hit_type != "miss" else "miss")
|
| 372 |
-
if cached is not None:
|
| 373 |
return cached
|
| 374 |
else:
|
| 375 |
query_embedding = None
|
| 376 |
|
| 377 |
products = _fetch_products(body, app, query_embedding=query_embedding)
|
| 378 |
-
|
| 379 |
if not products:
|
| 380 |
return {"query": q, "recommendations": []}
|
| 381 |
|
| 382 |
-
recommendations
|
| 383 |
-
|
| 384 |
if explain:
|
| 385 |
if app.state.explainer is None:
|
| 386 |
raise RuntimeError("Explanation service unavailable")
|
| 387 |
|
| 388 |
-
|
| 389 |
-
|
| 390 |
-
|
| 391 |
-
|
| 392 |
-
|
| 393 |
-
|
| 394 |
-
|
| 395 |
-
er = explainer.generate_explanation(
|
| 396 |
-
query=q,
|
| 397 |
-
product=product,
|
| 398 |
-
max_evidence=MAX_EVIDENCE,
|
| 399 |
-
)
|
| 400 |
-
hr = detector.check_explanation(
|
| 401 |
-
evidence_texts=er.evidence_texts,
|
| 402 |
-
explanation=er.explanation,
|
| 403 |
-
)
|
| 404 |
-
cr = verify_citations(er.explanation, er.evidence_ids, er.evidence_texts)
|
| 405 |
-
return er, hr, cr
|
| 406 |
-
|
| 407 |
-
with ThreadPoolExecutor(
|
| 408 |
-
max_workers=min(len(products), _MAX_EXPLAIN_WORKERS)
|
| 409 |
-
) as pool:
|
| 410 |
-
results = list(pool.map(_explain, products))
|
| 411 |
-
|
| 412 |
-
for i, (product, (er, hr, cr)) in enumerate(
|
| 413 |
-
zip(products, results, strict=True),
|
| 414 |
-
1,
|
| 415 |
-
):
|
| 416 |
-
rec = _build_product_dict(i, product)
|
| 417 |
-
rec["explanation"] = er.explanation
|
| 418 |
-
rec["confidence"] = {
|
| 419 |
-
"hhem_score": round(hr.score, 3),
|
| 420 |
-
"is_grounded": not hr.is_hallucinated,
|
| 421 |
-
"threshold": hr.threshold,
|
| 422 |
-
}
|
| 423 |
-
rec["citations_verified"] = cr.all_valid
|
| 424 |
-
rec["evidence_sources"] = _build_evidence_list(er)
|
| 425 |
-
recommendations.append(rec)
|
| 426 |
else:
|
| 427 |
-
|
| 428 |
-
|
|
|
|
| 429 |
|
| 430 |
result = {"query": q, "recommendations": recommendations}
|
| 431 |
|
| 432 |
-
# Store in cache (explain path only
|
| 433 |
if explain:
|
| 434 |
-
cache.put(
|
| 435 |
|
| 436 |
return result
|
| 437 |
|
|
|
|
| 15 |
import asyncio
|
| 16 |
import json
|
| 17 |
import os
|
| 18 |
+
from concurrent.futures import ThreadPoolExecutor, TimeoutError as FuturesTimeoutError
|
| 19 |
from typing import AsyncIterator
|
| 20 |
|
| 21 |
import numpy as np
|
|
|
|
| 26 |
from sage.adapters.vector_store import collection_exists
|
| 27 |
from sage.api.metrics import metrics_response, record_cache_event, record_error
|
| 28 |
from sage.config import MAX_EVIDENCE, get_logger
|
| 29 |
+
from sage.utils import normalize_text
|
| 30 |
from sage.core import (
|
| 31 |
AggregationMethod,
|
| 32 |
ExplanationResult,
|
|
|
|
| 40 |
# good parallelism while bounding total concurrent LLM calls.
|
| 41 |
_MAX_EXPLAIN_WORKERS = 4
|
| 42 |
|
| 43 |
+
# Per-worker timeout for explanation generation (prevents hung workers)
|
| 44 |
+
_EXPLAIN_WORKER_TIMEOUT = 30.0
|
| 45 |
+
|
| 46 |
# Request timeout in seconds. David's rule: 10s max end-to-end.
|
| 47 |
# If the LLM hangs, cut it off and return what we have.
|
| 48 |
REQUEST_TIMEOUT_SECONDS = float(os.getenv("REQUEST_TIMEOUT_SECONDS", "10.0"))
|
|
|
|
| 210 |
return result.to_evidence_dicts()
|
| 211 |
|
| 212 |
|
| 213 |
+
def _build_cache_key(query: str, k: int, explain: bool, min_rating: float) -> str:
    """Build a cache key that includes all request parameters.

    Keys on the normalized query plus every parameter that affects the
    response, so e.g. a request with k=3 never returns a result cached
    for k=5.
    """
    parts = (
        normalize_text(query),
        f"k={k}",
        f"explain={explain}",
        f"rating={min_rating:.1f}",
    )
    return ":".join(parts)
|
| 221 |
+
|
| 222 |
+
|
| 223 |
# ---------------------------------------------------------------------------
|
| 224 |
# Health
|
| 225 |
# ---------------------------------------------------------------------------
|
|
|
|
| 327 |
|
| 328 |
# Core components must be ready (explainer is optional)
|
| 329 |
core_ready = all(
|
| 330 |
+
components.get(key, False) for key in ("qdrant", "embedder", "hhem")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 331 |
)
|
| 332 |
|
| 333 |
if core_ready and components.get("explainer", False):
|
|
|
|
| 359 |
# ---------------------------------------------------------------------------
|
| 360 |
|
| 361 |
|
| 362 |
+
def _check_cache(
    cache,
    cache_key: str,
    query_embedding: np.ndarray,
) -> dict | None:
    """Look up *cache_key* in the cache and record a metrics event.

    Returns the cached result dict on a hit, or None on a miss.
    """
    cached, hit_type = cache.get(cache_key, query_embedding)
    event = "miss" if hit_type == "miss" else f"hit_{hit_type}"
    record_cache_event(event)
    return cached
|
| 374 |
+
|
| 375 |
+
|
| 376 |
+
def _generate_explanation_for_product(
    query: str,
    product: ProductScore,
    explainer,
    detector,
) -> tuple:
    """Run the full explanation pipeline for a single product.

    Generates the LLM explanation, scores it with the HHEM hallucination
    detector, and verifies that its citations refer to real evidence.
    Thread-safe: LLM clients use httpx, HHEM model is read-only.

    Returns:
        (ExplanationResult, HallucinationResult, CitationVerificationResult)
    """
    er = explainer.generate_explanation(
        query=query, product=product, max_evidence=MAX_EVIDENCE
    )
    hr = detector.check_explanation(
        evidence_texts=er.evidence_texts,
        explanation=er.explanation,
    )
    cr = verify_citations(er.explanation, er.evidence_ids, er.evidence_texts)
    return er, hr, cr
|
| 398 |
+
|
| 399 |
+
|
| 400 |
+
def _generate_explanations_parallel(
    query: str,
    products: list[ProductScore],
    explainer,
    detector,
) -> list[tuple[ProductScore, tuple]]:
    """Generate explanations for multiple products in parallel.

    Uses ThreadPoolExecutor with a per-future timeout so a hung LLM call
    cannot stall result collection indefinitely. Products whose explanation
    times out or raises are logged and skipped. Results preserve the input
    product order (futures are collected in submission order).

    NOTE(review): a running worker cannot be interrupted; the executor's
    context exit still waits for it to finish after a timeout - confirm
    this fits the end-to-end request-timeout budget.
    """
    results: list[tuple[ProductScore, tuple]] = []
    with ThreadPoolExecutor(
        max_workers=min(len(products), _MAX_EXPLAIN_WORKERS)
    ) as pool:
        futures = {
            pool.submit(
                _generate_explanation_for_product, query, p, explainer, detector
            ): p
            for p in products
        }
        # Collect in submission order so results stay aligned with ranking.
        for future, product in futures.items():
            try:
                result = future.result(timeout=_EXPLAIN_WORKER_TIMEOUT)
                results.append((product, result))
            except FuturesTimeoutError:
                # Best-effort cancel: drops the task if it is still queued;
                # an already-running call cannot be cancelled.
                future.cancel()
                logger.warning(
                    "Explanation timeout for product %s after %.1fs",
                    product.product_id,
                    _EXPLAIN_WORKER_TIMEOUT,
                )
            except Exception:
                logger.exception(
                    "Explanation failed for product %s", product.product_id
                )
    return results
|
| 437 |
+
|
| 438 |
+
|
| 439 |
+
def _build_recommendation_with_explanation(
    rank: int,
    product: ProductScore,
    er: ExplanationResult,
    hr,
    cr,
) -> dict:
    """Build recommendation dict with explanation and confidence metrics."""
    rec = _build_product_dict(rank, product)
    confidence = {
        "hhem_score": round(hr.score, 3),
        "is_grounded": not hr.is_hallucinated,
        "threshold": hr.threshold,
    }
    rec.update(
        explanation=er.explanation,
        confidence=confidence,
        citations_verified=cr.all_valid,
        evidence_sources=_build_evidence_list(er),
    )
    return rec
|
| 457 |
+
|
| 458 |
+
|
| 459 |
def _sync_recommend(
|
| 460 |
body: RecommendationRequest,
|
| 461 |
app,
|
|
|
|
| 468 |
cache = app.state.cache
|
| 469 |
q = body.query
|
| 470 |
explain = body.explain
|
| 471 |
+
min_rating = body.filters.min_rating if body.filters else 4.0
|
| 472 |
+
cache_key = _build_cache_key(q, body.k, explain, min_rating)
|
| 473 |
|
| 474 |
+
# Check cache before any heavy work (explain path only).
|
| 475 |
+
# Embedding computed here is reused for candidate retrieval.
|
|
|
|
| 476 |
if explain:
|
| 477 |
query_embedding = app.state.embedder.embed_single_query(q)
|
| 478 |
+
if (cached := _check_cache(cache, cache_key, query_embedding)) is not None:
|
|
|
|
|
|
|
| 479 |
return cached
|
| 480 |
else:
|
| 481 |
query_embedding = None
|
| 482 |
|
| 483 |
products = _fetch_products(body, app, query_embedding=query_embedding)
|
|
|
|
| 484 |
if not products:
|
| 485 |
return {"query": q, "recommendations": []}
|
| 486 |
|
| 487 |
+
# Build recommendations with or without explanations
|
|
|
|
| 488 |
if explain:
|
| 489 |
if app.state.explainer is None:
|
| 490 |
raise RuntimeError("Explanation service unavailable")
|
| 491 |
|
| 492 |
+
explanation_results = _generate_explanations_parallel(
|
| 493 |
+
q, products, app.state.explainer, app.state.detector
|
| 494 |
+
)
|
| 495 |
+
recommendations = [
|
| 496 |
+
_build_recommendation_with_explanation(i, product, er, hr, cr)
|
| 497 |
+
for i, (product, (er, hr, cr)) in enumerate(explanation_results, 1)
|
| 498 |
+
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 499 |
else:
|
| 500 |
+
recommendations = [
|
| 501 |
+
_build_product_dict(i, product) for i, product in enumerate(products, 1)
|
| 502 |
+
]
|
| 503 |
|
| 504 |
result = {"query": q, "recommendations": recommendations}
|
| 505 |
|
| 506 |
+
# Store in cache (explain path only)
|
| 507 |
if explain:
|
| 508 |
+
cache.put(cache_key, query_embedding, result)
|
| 509 |
|
| 510 |
return result
|
| 511 |
|
sage/config/__init__.py
CHANGED
|
@@ -135,6 +135,13 @@ CACHE_MAX_ENTRIES = int(os.getenv("CACHE_MAX_ENTRIES", "1000"))
|
|
| 135 |
CACHE_TTL_SECONDS = float(os.getenv("CACHE_TTL_SECONDS", "3600"))
|
| 136 |
|
| 137 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 138 |
# ---------------------------------------------------------------------------
|
| 139 |
# Evidence Quality Gate
|
| 140 |
# ---------------------------------------------------------------------------
|
|
@@ -244,6 +251,8 @@ __all__ = [
|
|
| 244 |
"CACHE_SIMILARITY_THRESHOLD",
|
| 245 |
"CACHE_MAX_ENTRIES",
|
| 246 |
"CACHE_TTL_SECONDS",
|
|
|
|
|
|
|
| 247 |
# Evidence gate
|
| 248 |
"MAX_EVIDENCE",
|
| 249 |
"MIN_EVIDENCE_CHUNKS",
|
|
|
|
| 135 |
CACHE_TTL_SECONDS = float(os.getenv("CACHE_TTL_SECONDS", "3600"))
|
| 136 |
|
| 137 |
|
| 138 |
+
# ---------------------------------------------------------------------------
|
| 139 |
+
# Citation Format
|
| 140 |
+
# ---------------------------------------------------------------------------
|
| 141 |
+
|
| 142 |
+
CITATION_PREFIX = "review_" # Prefix for citation IDs (e.g., "review_123")
|
| 143 |
+
|
| 144 |
+
|
| 145 |
# ---------------------------------------------------------------------------
|
| 146 |
# Evidence Quality Gate
|
| 147 |
# ---------------------------------------------------------------------------
|
|
|
|
| 251 |
"CACHE_SIMILARITY_THRESHOLD",
|
| 252 |
"CACHE_MAX_ENTRIES",
|
| 253 |
"CACHE_TTL_SECONDS",
|
| 254 |
+
# Citation
|
| 255 |
+
"CITATION_PREFIX",
|
| 256 |
# Evidence gate
|
| 257 |
"MAX_EVIDENCE",
|
| 258 |
"MIN_EVIDENCE_CHUNKS",
|
sage/config/logging.py
CHANGED
|
@@ -72,12 +72,21 @@ class ConsoleFormatter(logging.Formatter):
|
|
| 72 |
"ERROR": "\033[31m", # Red
|
| 73 |
"CRITICAL": "\033[35m", # Magenta
|
| 74 |
"RESET": "\033[0m",
|
|
|
|
| 75 |
}
|
| 76 |
|
| 77 |
def format(self, record: logging.LogRecord) -> str:
|
| 78 |
# Check if we're in a TTY (supports colors)
|
| 79 |
use_colors = hasattr(sys.stdout, "isatty") and sys.stdout.isatty()
|
| 80 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
# Format timestamp
|
| 82 |
timestamp = self.formatTime(record, "%H:%M:%S")
|
| 83 |
|
|
@@ -86,9 +95,12 @@ class ConsoleFormatter(logging.Formatter):
|
|
| 86 |
if use_colors:
|
| 87 |
color = self.COLORS.get(level, "")
|
| 88 |
reset = self.COLORS["RESET"]
|
|
|
|
| 89 |
level_str = f"{color}{level:<8}{reset}"
|
|
|
|
| 90 |
else:
|
| 91 |
level_str = f"{level:<8}"
|
|
|
|
| 92 |
|
| 93 |
# Format message
|
| 94 |
message = record.getMessage()
|
|
@@ -101,6 +113,8 @@ class ConsoleFormatter(logging.Formatter):
|
|
| 101 |
|
| 102 |
extra_str = f" [{', '.join(extras)}]" if extras else ""
|
| 103 |
|
|
|
|
|
|
|
| 104 |
return f"{timestamp} {level_str} {message}{extra_str}"
|
| 105 |
|
| 106 |
|
|
@@ -116,10 +130,19 @@ class JSONFormatter(logging.Formatter):
|
|
| 116 |
import json
|
| 117 |
from datetime import datetime, timezone
|
| 118 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 119 |
log_entry = {
|
| 120 |
"timestamp": datetime.now(timezone.utc).isoformat(),
|
| 121 |
"level": record.levelname,
|
| 122 |
"logger": record.name,
|
|
|
|
| 123 |
"message": record.getMessage(),
|
| 124 |
}
|
| 125 |
|
|
|
|
| 72 |
"ERROR": "\033[31m", # Red
|
| 73 |
"CRITICAL": "\033[35m", # Magenta
|
| 74 |
"RESET": "\033[0m",
|
| 75 |
+
"DIM": "\033[2m", # Dim for request ID
|
| 76 |
}
|
| 77 |
|
| 78 |
def format(self, record: logging.LogRecord) -> str:
|
| 79 |
# Check if we're in a TTY (supports colors)
|
| 80 |
use_colors = hasattr(sys.stdout, "isatty") and sys.stdout.isatty()
|
| 81 |
|
| 82 |
+
# Get request ID from context
|
| 83 |
+
try:
|
| 84 |
+
from sage.api.context import get_request_id
|
| 85 |
+
|
| 86 |
+
request_id = get_request_id()
|
| 87 |
+
except ImportError:
|
| 88 |
+
request_id = "-"
|
| 89 |
+
|
| 90 |
# Format timestamp
|
| 91 |
timestamp = self.formatTime(record, "%H:%M:%S")
|
| 92 |
|
|
|
|
| 95 |
if use_colors:
|
| 96 |
color = self.COLORS.get(level, "")
|
| 97 |
reset = self.COLORS["RESET"]
|
| 98 |
+
dim = self.COLORS["DIM"]
|
| 99 |
level_str = f"{color}{level:<8}{reset}"
|
| 100 |
+
rid_str = f"{dim}[{request_id}]{reset}" if request_id != "-" else ""
|
| 101 |
else:
|
| 102 |
level_str = f"{level:<8}"
|
| 103 |
+
rid_str = f"[{request_id}]" if request_id != "-" else ""
|
| 104 |
|
| 105 |
# Format message
|
| 106 |
message = record.getMessage()
|
|
|
|
| 113 |
|
| 114 |
extra_str = f" [{', '.join(extras)}]" if extras else ""
|
| 115 |
|
| 116 |
+
if rid_str:
|
| 117 |
+
return f"{timestamp} {level_str} {rid_str} {message}{extra_str}"
|
| 118 |
return f"{timestamp} {level_str} {message}{extra_str}"
|
| 119 |
|
| 120 |
|
|
|
|
| 130 |
import json
|
| 131 |
from datetime import datetime, timezone
|
| 132 |
|
| 133 |
+
# Import here to avoid circular imports
|
| 134 |
+
try:
|
| 135 |
+
from sage.api.context import get_request_id
|
| 136 |
+
|
| 137 |
+
request_id = get_request_id()
|
| 138 |
+
except ImportError:
|
| 139 |
+
request_id = "-"
|
| 140 |
+
|
| 141 |
log_entry = {
|
| 142 |
"timestamp": datetime.now(timezone.utc).isoformat(),
|
| 143 |
"level": record.levelname,
|
| 144 |
"logger": record.name,
|
| 145 |
+
"request_id": request_id,
|
| 146 |
"message": record.getMessage(),
|
| 147 |
}
|
| 148 |
|
sage/core/__init__.py
CHANGED
|
@@ -65,7 +65,6 @@ from sage.core.verification import (
|
|
| 65 |
check_forbidden_phrases,
|
| 66 |
extract_citations,
|
| 67 |
extract_quotes,
|
| 68 |
-
normalize_text,
|
| 69 |
verify_citation,
|
| 70 |
verify_citations,
|
| 71 |
verify_explanation,
|
|
@@ -132,7 +131,6 @@ __all__ = [
|
|
| 132 |
"check_forbidden_phrases",
|
| 133 |
"extract_citations",
|
| 134 |
"extract_quotes",
|
| 135 |
-
"normalize_text",
|
| 136 |
"verify_citation",
|
| 137 |
"verify_citations",
|
| 138 |
"verify_explanation",
|
|
|
|
| 65 |
check_forbidden_phrases,
|
| 66 |
extract_citations,
|
| 67 |
extract_quotes,
|
|
|
|
| 68 |
verify_citation,
|
| 69 |
verify_citations,
|
| 70 |
verify_explanation,
|
|
|
|
| 131 |
"check_forbidden_phrases",
|
| 132 |
"extract_citations",
|
| 133 |
"extract_quotes",
|
|
|
|
| 134 |
"verify_citation",
|
| 135 |
"verify_citations",
|
| 136 |
"verify_explanation",
|
sage/core/verification.py
CHANGED
|
@@ -13,6 +13,7 @@ non-existent review IDs.
|
|
| 13 |
import re
|
| 14 |
from dataclasses import dataclass
|
| 15 |
|
|
|
|
| 16 |
from sage.core.models import (
|
| 17 |
CitationResult,
|
| 18 |
CitationVerificationResult,
|
|
@@ -218,16 +219,17 @@ def extract_citations(text: str) -> list[tuple[str, str | None]]:
|
|
| 218 |
|
| 219 |
# Pattern for quote followed by citation(s): "quote" [review_123] or [review_123, review_456]
|
| 220 |
quote_citation_pattern = r'"([^"]+)"\s*\[([^\]]+)\]'
|
|
|
|
| 221 |
for match in re.finditer(quote_citation_pattern, text):
|
| 222 |
quote_text = match.group(1)
|
| 223 |
citation_block = match.group(2)
|
| 224 |
# Split multiple citations like "review_123, review_456"
|
| 225 |
-
for citation_id in re.findall(
|
| 226 |
citations.append((citation_id, quote_text))
|
| 227 |
|
| 228 |
# Pattern for standalone citations not preceded by a quote
|
| 229 |
# Find all citations, then filter out ones already captured with quotes
|
| 230 |
-
all_citation_ids = set(re.findall(
|
| 231 |
quoted_citation_ids = {c[0] for c in citations}
|
| 232 |
standalone_ids = all_citation_ids - quoted_citation_ids
|
| 233 |
|
|
|
|
| 13 |
import re
|
| 14 |
from dataclasses import dataclass
|
| 15 |
|
| 16 |
+
from sage.config import CITATION_PREFIX
|
| 17 |
from sage.core.models import (
|
| 18 |
CitationResult,
|
| 19 |
CitationVerificationResult,
|
|
|
|
| 219 |
|
| 220 |
# Pattern for quote followed by citation(s): "quote" [review_123] or [review_123, review_456]
|
| 221 |
quote_citation_pattern = r'"([^"]+)"\s*\[([^\]]+)\]'
|
| 222 |
+
citation_id_pattern = rf"{re.escape(CITATION_PREFIX)}\d+"
|
| 223 |
for match in re.finditer(quote_citation_pattern, text):
|
| 224 |
quote_text = match.group(1)
|
| 225 |
citation_block = match.group(2)
|
| 226 |
# Split multiple citations like "review_123, review_456"
|
| 227 |
+
for citation_id in re.findall(citation_id_pattern, citation_block):
|
| 228 |
citations.append((citation_id, quote_text))
|
| 229 |
|
| 230 |
# Pattern for standalone citations not preceded by a quote
|
| 231 |
# Find all citations, then filter out ones already captured with quotes
|
| 232 |
+
all_citation_ids = set(re.findall(citation_id_pattern, text))
|
| 233 |
quoted_citation_ids = {c[0] for c in citations}
|
| 234 |
standalone_ids = all_citation_ids - quoted_citation_ids
|
| 235 |
|
sage/services/faithfulness.py
CHANGED
|
@@ -15,6 +15,7 @@ import asyncio
|
|
| 15 |
|
| 16 |
import numpy as np
|
| 17 |
|
|
|
|
| 18 |
from sage.core import (
|
| 19 |
AdjustedFaithfulnessReport,
|
| 20 |
AgreementReport,
|
|
@@ -120,10 +121,8 @@ def create_ragas_sample(query: str, explanation: str, evidence_texts: list[str])
|
|
| 120 |
Raises:
|
| 121 |
ImportError: If ragas is not installed.
|
| 122 |
"""
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
except ImportError:
|
| 126 |
-
raise ImportError("ragas package required. Install with: pip install ragas")
|
| 127 |
|
| 128 |
# Clean explanation for RAGAS evaluation
|
| 129 |
cleaned_explanation = _clean_explanation_for_ragas(explanation)
|
|
@@ -162,10 +161,8 @@ def get_ragas_llm(provider: str | None = None):
|
|
| 162 |
Returns:
|
| 163 |
RAGAS-compatible LLM wrapper.
|
| 164 |
"""
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
except ImportError:
|
| 168 |
-
raise ImportError("ragas package required. Install with: pip install ragas")
|
| 169 |
|
| 170 |
provider = provider or LLM_PROVIDER
|
| 171 |
|
|
@@ -211,10 +208,8 @@ class FaithfulnessEvaluator:
|
|
| 211 |
provider: LLM provider for RAGAS.
|
| 212 |
target: Faithfulness target score (default 0.85).
|
| 213 |
"""
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
except ImportError:
|
| 217 |
-
raise ImportError("ragas package required. Install with: pip install ragas")
|
| 218 |
|
| 219 |
self.llm = get_ragas_llm(provider)
|
| 220 |
self.scorer = Faithfulness(llm=self.llm)
|
|
@@ -262,11 +257,9 @@ class FaithfulnessEvaluator:
|
|
| 262 |
explanation_results: list[ExplanationResult],
|
| 263 |
) -> FaithfulnessReport:
|
| 264 |
"""Evaluate faithfulness for multiple explanations."""
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
except ImportError:
|
| 269 |
-
raise ImportError("ragas package required. Install with: pip install ragas")
|
| 270 |
|
| 271 |
samples = _explanation_results_to_samples(explanation_results)
|
| 272 |
dataset = EvaluationDataset(samples=samples)
|
|
|
|
| 15 |
|
| 16 |
import numpy as np
|
| 17 |
|
| 18 |
+
from sage.utils import ensure_ragas_installed
|
| 19 |
from sage.core import (
|
| 20 |
AdjustedFaithfulnessReport,
|
| 21 |
AgreementReport,
|
|
|
|
| 121 |
Raises:
|
| 122 |
ImportError: If ragas is not installed.
|
| 123 |
"""
|
| 124 |
+
ensure_ragas_installed()
|
| 125 |
+
from ragas import SingleTurnSample
|
|
|
|
|
|
|
| 126 |
|
| 127 |
# Clean explanation for RAGAS evaluation
|
| 128 |
cleaned_explanation = _clean_explanation_for_ragas(explanation)
|
|
|
|
| 161 |
Returns:
|
| 162 |
RAGAS-compatible LLM wrapper.
|
| 163 |
"""
|
| 164 |
+
ensure_ragas_installed()
|
| 165 |
+
from ragas.llms import llm_factory
|
|
|
|
|
|
|
| 166 |
|
| 167 |
provider = provider or LLM_PROVIDER
|
| 168 |
|
|
|
|
| 208 |
provider: LLM provider for RAGAS.
|
| 209 |
target: Faithfulness target score (default 0.85).
|
| 210 |
"""
|
| 211 |
+
ensure_ragas_installed()
|
| 212 |
+
from ragas.metrics import Faithfulness
|
|
|
|
|
|
|
| 213 |
|
| 214 |
self.llm = get_ragas_llm(provider)
|
| 215 |
self.scorer = Faithfulness(llm=self.llm)
|
|
|
|
| 257 |
explanation_results: list[ExplanationResult],
|
| 258 |
) -> FaithfulnessReport:
|
| 259 |
"""Evaluate faithfulness for multiple explanations."""
|
| 260 |
+
ensure_ragas_installed()
|
| 261 |
+
from ragas import EvaluationDataset, evaluate
|
| 262 |
+
from ragas.metrics import Faithfulness
|
|
|
|
|
|
|
| 263 |
|
| 264 |
samples = _explanation_results_to_samples(explanation_results)
|
| 265 |
dataset = EvaluationDataset(samples=samples)
|
sage/utils.py
CHANGED
|
@@ -92,6 +92,22 @@ def require_imports(*packages: str | tuple[str, str]) -> list[ModuleType]:
|
|
| 92 |
return modules
|
| 93 |
|
| 94 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
# ---------------------------------------------------------------------------
|
| 96 |
# Lazy Loading Utilities
|
| 97 |
# ---------------------------------------------------------------------------
|
|
|
|
| 92 |
return modules
|
| 93 |
|
| 94 |
|
| 95 |
+
def ensure_ragas_installed() -> None:
|
| 96 |
+
"""Ensure RAGAS package is installed.
|
| 97 |
+
|
| 98 |
+
Centralizes the RAGAS availability check used across faithfulness evaluation.
|
| 99 |
+
Call this before importing RAGAS components to get a clear error message.
|
| 100 |
+
|
| 101 |
+
Usage:
|
| 102 |
+
ensure_ragas_installed()
|
| 103 |
+
from ragas import SingleTurnSample # Safe to import now
|
| 104 |
+
|
| 105 |
+
Raises:
|
| 106 |
+
ImportError: If ragas is not installed with install instructions.
|
| 107 |
+
"""
|
| 108 |
+
require_import("ragas")
|
| 109 |
+
|
| 110 |
+
|
| 111 |
# ---------------------------------------------------------------------------
|
| 112 |
# Lazy Loading Utilities
|
| 113 |
# ---------------------------------------------------------------------------
|
tests/test_production.py
ADDED
|
@@ -0,0 +1,293 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests for production hardening fixes.
|
| 2 |
+
|
| 3 |
+
Tests security headers, cache key generation, request ID propagation,
|
| 4 |
+
and other production-critical behaviors.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import pytest
|
| 8 |
+
from types import SimpleNamespace
|
| 9 |
+
from unittest.mock import MagicMock, patch
|
| 10 |
+
|
| 11 |
+
from fastapi import FastAPI
|
| 12 |
+
from fastapi.testclient import TestClient
|
| 13 |
+
|
| 14 |
+
from sage.api.middleware import LatencyMiddleware
|
| 15 |
+
from sage.api.routes import router, _build_cache_key
|
| 16 |
+
from sage.services.cache import SemanticCache
|
| 17 |
+
import numpy as np
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def _make_app_with_middleware(**state_overrides) -> FastAPI:
|
| 21 |
+
"""Create a test app with middleware and mocked state."""
|
| 22 |
+
app = FastAPI()
|
| 23 |
+
|
| 24 |
+
# Add latency middleware (includes security headers)
|
| 25 |
+
app.add_middleware(LatencyMiddleware)
|
| 26 |
+
|
| 27 |
+
app.include_router(router)
|
| 28 |
+
|
| 29 |
+
# Mock Qdrant client
|
| 30 |
+
mock_qdrant = MagicMock()
|
| 31 |
+
mock_qdrant.get_collections.return_value = MagicMock(collections=[])
|
| 32 |
+
|
| 33 |
+
# Mock cache
|
| 34 |
+
mock_cache = MagicMock()
|
| 35 |
+
mock_cache.get.return_value = (None, "miss")
|
| 36 |
+
mock_cache.stats.return_value = SimpleNamespace(
|
| 37 |
+
size=0,
|
| 38 |
+
max_entries=100,
|
| 39 |
+
exact_hits=0,
|
| 40 |
+
semantic_hits=0,
|
| 41 |
+
misses=0,
|
| 42 |
+
evictions=0,
|
| 43 |
+
hit_rate=0.0,
|
| 44 |
+
ttl_seconds=3600.0,
|
| 45 |
+
similarity_threshold=0.92,
|
| 46 |
+
avg_semantic_similarity=0.0,
|
| 47 |
+
)
|
| 48 |
+
|
| 49 |
+
# Mock explainer with client attribute for health check
|
| 50 |
+
mock_explainer = MagicMock()
|
| 51 |
+
mock_explainer.client = MagicMock()
|
| 52 |
+
|
| 53 |
+
app.state.qdrant = state_overrides.get("qdrant", mock_qdrant)
|
| 54 |
+
app.state.embedder = state_overrides.get("embedder", MagicMock())
|
| 55 |
+
app.state.detector = state_overrides.get("detector", MagicMock())
|
| 56 |
+
app.state.explainer = state_overrides.get("explainer", mock_explainer)
|
| 57 |
+
app.state.cache = state_overrides.get("cache", mock_cache)
|
| 58 |
+
|
| 59 |
+
return app
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
class TestSecurityHeaders:
|
| 63 |
+
"""Test that security headers are added to all responses."""
|
| 64 |
+
|
| 65 |
+
@pytest.fixture
|
| 66 |
+
def client(self):
|
| 67 |
+
app = _make_app_with_middleware()
|
| 68 |
+
return TestClient(app)
|
| 69 |
+
|
| 70 |
+
@patch("sage.api.routes.collection_exists", return_value=True)
|
| 71 |
+
def test_security_headers_present(self, mock_collection_exists, client):
|
| 72 |
+
resp = client.get("/health")
|
| 73 |
+
assert resp.status_code == 200
|
| 74 |
+
|
| 75 |
+
# Check security headers
|
| 76 |
+
assert resp.headers.get("x-content-type-options") == "nosniff"
|
| 77 |
+
assert resp.headers.get("x-frame-options") == "DENY"
|
| 78 |
+
assert resp.headers.get("x-xss-protection") == "1; mode=block"
|
| 79 |
+
assert resp.headers.get("referrer-policy") == "strict-origin-when-cross-origin"
|
| 80 |
+
assert "no-store" in resp.headers.get("cache-control", "")
|
| 81 |
+
|
| 82 |
+
@patch("sage.api.routes.collection_exists", return_value=True)
|
| 83 |
+
def test_request_id_header_present(self, mock_collection_exists, client):
|
| 84 |
+
resp = client.get("/health")
|
| 85 |
+
assert resp.status_code == 200
|
| 86 |
+
|
| 87 |
+
# Check request ID is present and has expected format
|
| 88 |
+
request_id = resp.headers.get("x-request-id")
|
| 89 |
+
assert request_id is not None
|
| 90 |
+
assert len(request_id) == 12 # UUID hex[:12]
|
| 91 |
+
|
| 92 |
+
@patch("sage.api.routes.collection_exists", return_value=True)
|
| 93 |
+
def test_response_time_header_present(self, mock_collection_exists, client):
|
| 94 |
+
resp = client.get("/health")
|
| 95 |
+
assert resp.status_code == 200
|
| 96 |
+
|
| 97 |
+
# Check response time header
|
| 98 |
+
response_time = resp.headers.get("x-response-time-ms")
|
| 99 |
+
assert response_time is not None
|
| 100 |
+
assert float(response_time) >= 0
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
class TestCacheKeyGeneration:
|
| 104 |
+
"""Test that cache keys include all request parameters."""
|
| 105 |
+
|
| 106 |
+
def test_cache_key_includes_query(self):
|
| 107 |
+
key1 = _build_cache_key("headphones", k=3, explain=True, min_rating=4.0)
|
| 108 |
+
key2 = _build_cache_key("earbuds", k=3, explain=True, min_rating=4.0)
|
| 109 |
+
assert key1 != key2
|
| 110 |
+
|
| 111 |
+
def test_cache_key_includes_k(self):
|
| 112 |
+
key1 = _build_cache_key("headphones", k=3, explain=True, min_rating=4.0)
|
| 113 |
+
key2 = _build_cache_key("headphones", k=5, explain=True, min_rating=4.0)
|
| 114 |
+
assert key1 != key2
|
| 115 |
+
assert "k=3" in key1
|
| 116 |
+
assert "k=5" in key2
|
| 117 |
+
|
| 118 |
+
def test_cache_key_includes_explain(self):
|
| 119 |
+
key1 = _build_cache_key("headphones", k=3, explain=True, min_rating=4.0)
|
| 120 |
+
key2 = _build_cache_key("headphones", k=3, explain=False, min_rating=4.0)
|
| 121 |
+
assert key1 != key2
|
| 122 |
+
assert "explain=True" in key1
|
| 123 |
+
assert "explain=False" in key2
|
| 124 |
+
|
| 125 |
+
def test_cache_key_includes_rating(self):
|
| 126 |
+
key1 = _build_cache_key("headphones", k=3, explain=True, min_rating=4.0)
|
| 127 |
+
key2 = _build_cache_key("headphones", k=3, explain=True, min_rating=3.5)
|
| 128 |
+
assert key1 != key2
|
| 129 |
+
assert "rating=4.0" in key1
|
| 130 |
+
assert "rating=3.5" in key2
|
| 131 |
+
|
| 132 |
+
def test_cache_key_normalizes_query(self):
|
| 133 |
+
key1 = _build_cache_key(
|
| 134 |
+
" Best Headphones ", k=3, explain=True, min_rating=4.0
|
| 135 |
+
)
|
| 136 |
+
key2 = _build_cache_key("best headphones", k=3, explain=True, min_rating=4.0)
|
| 137 |
+
assert key1 == key2
|
| 138 |
+
|
| 139 |
+
def test_cache_key_case_insensitive(self):
|
| 140 |
+
key1 = _build_cache_key("HEADPHONES", k=3, explain=True, min_rating=4.0)
|
| 141 |
+
key2 = _build_cache_key("headphones", k=3, explain=True, min_rating=4.0)
|
| 142 |
+
assert key1 == key2
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
class TestCacheIntegration:
|
| 146 |
+
"""Integration tests for cache with request parameters."""
|
| 147 |
+
|
| 148 |
+
def test_same_query_different_k_different_cache_entries(self):
|
| 149 |
+
cache = SemanticCache(max_entries=100, ttl_seconds=3600)
|
| 150 |
+
|
| 151 |
+
# Create fake embeddings
|
| 152 |
+
embedding = np.random.rand(384).astype(np.float32)
|
| 153 |
+
|
| 154 |
+
# Store result with k=3
|
| 155 |
+
key1 = _build_cache_key("headphones", k=3, explain=True, min_rating=4.0)
|
| 156 |
+
result1 = {"query": "headphones", "recommendations": ["p1", "p2", "p3"]}
|
| 157 |
+
cache.put(key1, embedding, result1)
|
| 158 |
+
|
| 159 |
+
# Store result with k=5
|
| 160 |
+
key2 = _build_cache_key("headphones", k=5, explain=True, min_rating=4.0)
|
| 161 |
+
result2 = {
|
| 162 |
+
"query": "headphones",
|
| 163 |
+
"recommendations": ["p1", "p2", "p3", "p4", "p5"],
|
| 164 |
+
}
|
| 165 |
+
cache.put(key2, embedding, result2)
|
| 166 |
+
|
| 167 |
+
# Retrieve k=3 result
|
| 168 |
+
cached1, hit_type1 = cache.get(key1, embedding)
|
| 169 |
+
assert cached1 is not None
|
| 170 |
+
assert len(cached1["recommendations"]) == 3
|
| 171 |
+
|
| 172 |
+
# Retrieve k=5 result
|
| 173 |
+
cached2, hit_type2 = cache.get(key2, embedding)
|
| 174 |
+
assert cached2 is not None
|
| 175 |
+
assert len(cached2["recommendations"]) == 5
|
| 176 |
+
|
| 177 |
+
def test_same_query_different_rating_different_cache_entries(self):
|
| 178 |
+
cache = SemanticCache(max_entries=100, ttl_seconds=3600)
|
| 179 |
+
embedding = np.random.rand(384).astype(np.float32)
|
| 180 |
+
|
| 181 |
+
# Store with rating=4.0
|
| 182 |
+
key1 = _build_cache_key("headphones", k=3, explain=True, min_rating=4.0)
|
| 183 |
+
cache.put(key1, embedding, {"rating_filter": 4.0})
|
| 184 |
+
|
| 185 |
+
# Store with rating=3.5
|
| 186 |
+
key2 = _build_cache_key("headphones", k=3, explain=True, min_rating=3.5)
|
| 187 |
+
cache.put(key2, embedding, {"rating_filter": 3.5})
|
| 188 |
+
|
| 189 |
+
# Verify they're separate entries
|
| 190 |
+
cached1, _ = cache.get(key1, embedding)
|
| 191 |
+
cached2, _ = cache.get(key2, embedding)
|
| 192 |
+
assert cached1["rating_filter"] == 4.0
|
| 193 |
+
assert cached2["rating_filter"] == 3.5
|
| 194 |
+
|
| 195 |
+
|
| 196 |
+
class TestRequestContext:
|
| 197 |
+
"""Test request ID context propagation."""
|
| 198 |
+
|
| 199 |
+
def test_request_id_context_var(self):
|
| 200 |
+
from sage.api.context import get_request_id, set_request_id
|
| 201 |
+
|
| 202 |
+
# Default value
|
| 203 |
+
assert get_request_id() == "-"
|
| 204 |
+
|
| 205 |
+
# Set and get
|
| 206 |
+
set_request_id("abc123")
|
| 207 |
+
assert get_request_id() == "abc123"
|
| 208 |
+
|
| 209 |
+
# Reset for other tests
|
| 210 |
+
set_request_id("-")
|
| 211 |
+
|
| 212 |
+
|
| 213 |
+
class TestCORSConfiguration:
|
| 214 |
+
"""Test CORS configuration security."""
|
| 215 |
+
|
| 216 |
+
def test_cors_not_applied_when_empty(self):
|
| 217 |
+
"""When CORS_ORIGINS is empty, no CORS middleware should be added."""
|
| 218 |
+
from sage.api.app import CORS_ORIGINS
|
| 219 |
+
|
| 220 |
+
# This test verifies the default behavior
|
| 221 |
+
# In production, CORS_ORIGINS should be explicitly set
|
| 222 |
+
# Default is empty list (no CORS)
|
| 223 |
+
assert isinstance(CORS_ORIGINS, list)
|
| 224 |
+
|
| 225 |
+
def test_cors_origins_parsing(self):
|
| 226 |
+
"""Test that CORS origins are parsed correctly."""
|
| 227 |
+
import os
|
| 228 |
+
|
| 229 |
+
# Save original
|
| 230 |
+
original = os.environ.get("CORS_ORIGINS")
|
| 231 |
+
|
| 232 |
+
try:
|
| 233 |
+
# Test with explicit origins
|
| 234 |
+
os.environ["CORS_ORIGINS"] = "https://example.com,http://localhost:3000"
|
| 235 |
+
# Would need to reload the module to test this properly
|
| 236 |
+
# Just verify the format is correct
|
| 237 |
+
origins = [
|
| 238 |
+
o.strip() for o in os.environ["CORS_ORIGINS"].split(",") if o.strip()
|
| 239 |
+
]
|
| 240 |
+
assert origins == ["https://example.com", "http://localhost:3000"]
|
| 241 |
+
|
| 242 |
+
# Test with empty string
|
| 243 |
+
os.environ["CORS_ORIGINS"] = ""
|
| 244 |
+
origins = [
|
| 245 |
+
o.strip() for o in os.environ["CORS_ORIGINS"].split(",") if o.strip()
|
| 246 |
+
]
|
| 247 |
+
assert origins == []
|
| 248 |
+
|
| 249 |
+
finally:
|
| 250 |
+
# Restore original
|
| 251 |
+
if original is not None:
|
| 252 |
+
os.environ["CORS_ORIGINS"] = original
|
| 253 |
+
elif "CORS_ORIGINS" in os.environ:
|
| 254 |
+
del os.environ["CORS_ORIGINS"]
|
| 255 |
+
|
| 256 |
+
|
| 257 |
+
class TestInputValidation:
|
| 258 |
+
"""Test input validation edge cases."""
|
| 259 |
+
|
| 260 |
+
@pytest.fixture
|
| 261 |
+
def client(self):
|
| 262 |
+
app = _make_app_with_middleware()
|
| 263 |
+
return TestClient(app)
|
| 264 |
+
|
| 265 |
+
def test_empty_query_rejected(self, client):
|
| 266 |
+
resp = client.post("/recommend", json={"query": ""})
|
| 267 |
+
assert resp.status_code == 422
|
| 268 |
+
|
| 269 |
+
def test_query_too_long_rejected(self, client):
|
| 270 |
+
resp = client.post("/recommend", json={"query": "x" * 501})
|
| 271 |
+
assert resp.status_code == 422
|
| 272 |
+
|
| 273 |
+
def test_k_zero_rejected(self, client):
|
| 274 |
+
resp = client.post("/recommend", json={"query": "test", "k": 0})
|
| 275 |
+
assert resp.status_code == 422
|
| 276 |
+
|
| 277 |
+
def test_k_too_large_rejected(self, client):
|
| 278 |
+
resp = client.post("/recommend", json={"query": "test", "k": 11})
|
| 279 |
+
assert resp.status_code == 422
|
| 280 |
+
|
| 281 |
+
def test_invalid_min_rating_rejected(self, client):
|
| 282 |
+
resp = client.post(
|
| 283 |
+
"/recommend",
|
| 284 |
+
json={"query": "test", "filters": {"min_rating": 10.0}},
|
| 285 |
+
)
|
| 286 |
+
assert resp.status_code == 422
|
| 287 |
+
|
| 288 |
+
def test_negative_price_rejected(self, client):
|
| 289 |
+
resp = client.post(
|
| 290 |
+
"/recommend",
|
| 291 |
+
json={"query": "test", "filters": {"min_price": -1.0}},
|
| 292 |
+
)
|
| 293 |
+
assert resp.status_code == 422
|