Spaces:

vxa8502
/

Sage

Running

App Files Community

vxa8502 committed on Mar 5

Commit

ca96fbf

1 Parent(s): a9bab1a

Add bootstrap confidence intervals to evaluation metrics

Browse files

Files changed (9) hide show

data/eval_results/eval_natural_queries_20260305_161900_824897.json +34 -0
data/eval_results/eval_natural_queries_latest.json +0 -16
data/eval_results/eval_natural_queries_latest.json +1 -0
sage/core/__init__.py +2 -0
sage/core/models.py +55 -13
sage/services/baselines.py +0 -4
sage/services/evaluation.py +24 -10
scripts/evaluation.py +12 -1
scripts/summary.py +17 -3

data/eval_results/eval_natural_queries_20260305_161900_824897.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+  "timestamp": "2026-03-05T16:18:49.380747",
+  "dataset": "eval_natural_queries.json",
+  "catalog_size": 21827,
+  "experiments": {},
+  "primary_metrics": {
+    "ndcg_at_10": 0.4871922222425982,
+    "hit_at_10": 0.7380952380952381,
+    "mrr": 0.42086167800453517,
+    "precision_at_10": 0.12857142857142856,
+    "recall_at_10": 0.4722222222222222,
+    "diversity": 0.01957190520646696,
+    "coverage": 0.015531222797452697,
+    "novelty": 9.808908578271737,
+    "ndcg_ci": {
+      "mean": 0.4872,
+      "ci_lower": 0.3779,
+      "ci_upper": 0.6078,
+      "confidence": 0.95
+    },
+    "hit_ci": {
+      "mean": 0.7381,
+      "ci_lower": 0.5952,
+      "ci_upper": 0.8571,
+      "confidence": 0.95
+    },
+    "mrr_ci": {
+      "mean": 0.4209,
+      "ci_lower": 0.301,
+      "ci_upper": 0.5545,
+      "confidence": 0.95
+    }
+  }
+}

data/eval_results/eval_natural_queries_latest.json DELETED Viewed

@@ -1,16 +0,0 @@
-{
-  "timestamp": "2026-02-10T11:49:08.500849",
-  "dataset": "eval_natural_queries.json",
-  "catalog_size": 21827,
-  "experiments": {},
-  "primary_metrics": {
-    "ndcg_at_10": 0.4871922222425982,
-    "hit_at_10": 0.7380952380952381,
-    "mrr": 0.42086167800453517,
-    "precision_at_10": 0.12857142857142856,
-    "recall_at_10": 0.4722222222222222,
-    "diversity": 0.01957190520646696,
-    "coverage": 0.015531222797452697,
-    "novelty": 9.808908578271737
-  }
-}

data/eval_results/eval_natural_queries_latest.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ eval_natural_queries_20260305_161900_824897.json

sage/core/__init__.py CHANGED Viewed

@@ -37,6 +37,7 @@ from sage.core.models import (
     FaithfulnessReport,
     FaithfulnessResult,
     # Evaluation
     EvalCase,
     EvalResult,
     MetricsReport,
@@ -110,6 +111,7 @@ __all__ = [
     "MultiMetricFaithfulnessReport",
     "FaithfulnessReport",
     "FaithfulnessResult",
     "EvalCase",
     "EvalResult",
     "MetricsReport",

     FaithfulnessReport,
     FaithfulnessResult,
     # Evaluation
+    ConfidenceInterval,
     EvalCase,
     EvalResult,
     MetricsReport,
     "MultiMetricFaithfulnessReport",
     "FaithfulnessReport",
     "FaithfulnessResult",
+    "ConfidenceInterval",
     "EvalCase",
     "EvalResult",
     "MetricsReport",

sage/core/models.py CHANGED Viewed

@@ -13,7 +13,7 @@ from __future__ import annotations
 from dataclasses import dataclass, field
 from enum import Enum
-from typing import Iterator
 # ============================================================================
@@ -555,6 +555,27 @@ class EvalResult:
     recall: float = 0.0
 @dataclass
 class MetricsReport:
     """
@@ -575,9 +596,14 @@ class MetricsReport:
     novelty: float = 0.0
     k: int = 10
-    def to_dict(self) -> dict:
         """Convert to dictionary for easy serialization."""
-        return {
             "n_cases": self.n_cases,
             f"ndcg@{self.k}": round(self.ndcg_at_k, 4),
             f"hit@{self.k}": round(self.hit_at_k, 4),
@@ -588,19 +614,35 @@ class MetricsReport:
             "coverage": round(self.coverage, 4),
             "novelty": round(self.novelty, 4),
         }
     def __str__(self) -> str:
         lines = [
             f"Evaluation Results (n={self.n_cases}, k={self.k})",
-            "-" * 40,
-            f"NDCG@{self.k}:      {self.ndcg_at_k:.4f}",
-            f"Hit@{self.k}:       {self.hit_at_k:.4f}",
-            f"MRR:           {self.mrr:.4f}",
-            f"Precision@{self.k}: {self.precision_at_k:.4f}",
-            f"Recall@{self.k}:    {self.recall_at_k:.4f}",
-            "-" * 40,
-            f"Diversity:     {self.diversity:.4f}",
-            f"Coverage:      {self.coverage:.4f}",
-            f"Novelty:       {self.novelty:.4f}",
         ]
         return "\n".join(lines)

 from dataclasses import dataclass, field
 from enum import Enum
+from typing import Any, Iterator
 # ============================================================================
     recall: float = 0.0
+@dataclass
+class ConfidenceInterval:
+    """Bootstrap confidence interval for a metric.
+
+    Bundles a mean value with the lower and upper bounds of its interval
+    and the confidence level the bounds were computed for.
+    """
+    # Mean value reported for the metric.
+    mean: float
+    # Lower bound of the interval.
+    lower: float
+    # Upper bound of the interval.
+    upper: float
+    # Confidence level of the interval; defaults to 0.95.
+    confidence: float = 0.95
+    def __str__(self) -> str:
+        """Render as ``mean [lower, upper]`` with three decimal places."""
+        return f"{self.mean:.3f} [{self.lower:.3f}, {self.upper:.3f}]"
+    def to_dict(self) -> dict[str, float]:
+        """Return a plain dict suitable for serialization.
+
+        ``mean`` and both bounds are rounded to four decimals. The bounds are
+        emitted under the keys ``ci_lower`` / ``ci_upper`` (not the attribute
+        names); ``confidence`` is passed through unrounded.
+        """
+        return {
+            "mean": round(self.mean, 4),
+            "ci_lower": round(self.lower, 4),
+            "ci_upper": round(self.upper, 4),
+            "confidence": self.confidence,
+        }
 @dataclass
 class MetricsReport:
     """
     novelty: float = 0.0
     k: int = 10
+    # Bootstrap confidence intervals (optional)
+    ndcg_ci: ConfidenceInterval | None = None
+    hit_ci: ConfidenceInterval | None = None
+    mrr_ci: ConfidenceInterval | None = None
+    def to_dict(self) -> dict[str, Any]:
         """Convert to dictionary for easy serialization."""
+        result: dict[str, Any] = {
             "n_cases": self.n_cases,
             f"ndcg@{self.k}": round(self.ndcg_at_k, 4),
             f"hit@{self.k}": round(self.hit_at_k, 4),
             "coverage": round(self.coverage, 4),
             "novelty": round(self.novelty, 4),
         }
+        for name, ci in [
+            ("ndcg_ci", self.ndcg_ci),
+            ("hit_ci", self.hit_ci),
+            ("mrr_ci", self.mrr_ci),
+        ]:
+            if ci:
+                result[name] = ci.to_dict()
+        return result
+    def _fmt_metric(
+        self, name: str, value: float, ci: ConfidenceInterval | None
+    ) -> str:
+        """Format a metric with optional CI.
+
+        Args:
+            name: Label for the row; left-aligned and padded to 14 characters.
+            value: Metric value, printed with four decimal places.
+            ci: Interval whose bounds are appended as ``[lower, upper]`` with
+                three decimal places, or ``None`` to print the value alone.
+
+        Returns:
+            A single formatted line (no trailing newline).
+        """
+        # A falsy ``ci`` (i.e. None) falls through to the plain format below.
+        if ci:
+            return f"{name:<14s} {value:.4f}  [{ci.lower:.3f}, {ci.upper:.3f}]"
+        return f"{name:<14s} {value:.4f}"
     def __str__(self) -> str:
+        """Render the report as a multi-line, column-aligned text summary.
+
+        The NDCG, Hit and MRR rows include their confidence-interval bounds
+        when the matching ``*_ci`` attribute is set; all other rows are
+        printed as plain values.
+        """
         lines = [
             f"Evaluation Results (n={self.n_cases}, k={self.k})",
+            "-" * 50,
+            self._fmt_metric(f"NDCG@{self.k}:", self.ndcg_at_k, self.ndcg_ci),
+            self._fmt_metric(f"Hit@{self.k}:", self.hit_at_k, self.hit_ci),
+            self._fmt_metric("MRR:", self.mrr, self.mrr_ci),
+            self._fmt_metric(f"Precision@{self.k}:", self.precision_at_k, None),
+            self._fmt_metric(f"Recall@{self.k}:", self.recall_at_k, None),
+            "-" * 50,
+            self._fmt_metric("Diversity:", self.diversity, None),
+            self._fmt_metric("Coverage:", self.coverage, None),
+            self._fmt_metric("Novelty:", self.novelty, None),
         ]
         return "\n".join(lines)

sage/services/baselines.py CHANGED Viewed

@@ -230,8 +230,6 @@ def load_product_embeddings_from_qdrant() -> dict[str, np.ndarray]:
         product_id = point.payload.get("product_id")
         product_vectors[product_id].append(np.array(point.vector))
-    client.close()
     # Mean aggregation + normalize
     return {
         product_id: normalize_vectors(np.mean(vectors, axis=0))
@@ -265,8 +263,6 @@ def compute_item_popularity_from_qdrant(
         if point.payload.get("product_id")
     )
-    client.close()
     if not normalize:
         return dict(counts)

         product_id = point.payload.get("product_id")
         product_vectors[product_id].append(np.array(point.vector))
     # Mean aggregation + normalize
     return {
         product_id: normalize_vectors(np.mean(vectors, axis=0))
         if point.payload.get("product_id")
     )
     if not normalize:
         return dict(counts)

sage/services/evaluation.py CHANGED Viewed

@@ -20,7 +20,7 @@ from typing import Callable
 import numpy as np
-from sage.core import EvalCase, EvalResult, MetricsReport
 from sage.utils import normalize_vectors
@@ -160,6 +160,19 @@ def compute_item_popularity(
     return {item: count / total for item, count in counts.items()}
 class EvaluationService:
     """
     Service for evaluating recommendation quality.
@@ -260,15 +273,16 @@ class EvaluationService:
         report = MetricsReport(
             n_cases=len(eval_cases),
             k=self.k,
-            ndcg_at_k=float(np.mean(ndcg_scores)) if ndcg_scores else 0.0,
-            hit_at_k=float(np.mean(hit_scores)) if hit_scores else 0.0,
-            mrr=float(np.mean(mrr_scores)) if mrr_scores else 0.0,
-            precision_at_k=float(np.mean(precision_scores))
-            if precision_scores
-            else 0.0,
-            recall_at_k=float(np.mean(recall_scores)) if recall_scores else 0.0,
-            diversity=float(np.mean(diversity_scores)) if diversity_scores else 0.0,
-            novelty=float(np.mean(novelty_scores)) if novelty_scores else 0.0,
         )
         if self.total_items:

 import numpy as np
+from sage.core import ConfidenceInterval, EvalCase, EvalResult, MetricsReport
 from sage.utils import normalize_vectors
     return {item: count / total for item, count in counts.items()}
+def _safe_mean(scores: list[float]) -> float:
+    """Compute mean of scores, returning 0.0 for empty list.
+
+    The result of ``np.mean`` is converted to a built-in ``float``.
+    """
+    # The emptiness check keeps np.mean from ever being called on an empty list.
+    return float(np.mean(scores)) if scores else 0.0
+def _compute_ci(scores: list[float]) -> ConfidenceInterval | None:
+    """Compute bootstrap CI for scores, returning None for empty list.
+
+    The returned ``ConfidenceInterval`` uses its default ``confidence``
+    value, since only mean and bounds are passed to the constructor.
+    """
+    if not scores:
+        return None
+    # NOTE(review): bootstrap_confidence_interval is not visible in this diff;
+    # presumably defined elsewhere in this module. It is called with defaults
+    # and its result is unpacked as (mean, lower, upper) - confirm.
+    mean, lower, upper = bootstrap_confidence_interval(scores)
+    return ConfidenceInterval(mean=mean, lower=lower, upper=upper)
 class EvaluationService:
     """
     Service for evaluating recommendation quality.
         report = MetricsReport(
             n_cases=len(eval_cases),
             k=self.k,
+            ndcg_at_k=_safe_mean(ndcg_scores),
+            hit_at_k=_safe_mean(hit_scores),
+            mrr=_safe_mean(mrr_scores),
+            precision_at_k=_safe_mean(precision_scores),
+            recall_at_k=_safe_mean(recall_scores),
+            diversity=_safe_mean(diversity_scores),
+            novelty=_safe_mean(novelty_scores),
+            ndcg_ci=_compute_ci(ndcg_scores),
+            hit_ci=_compute_ci(hit_scores),
+            mrr_ci=_compute_ci(mrr_scores),
         )
         if self.total_items:

scripts/evaluation.py CHANGED Viewed

@@ -87,7 +87,7 @@ def run_primary_evaluation(cases, item_embeddings, item_popularity, total_items)
     )
     logger.info(str(report))
-    return {
         "ndcg_at_10": report.ndcg_at_k,
         "hit_at_10": report.hit_at_k,
         "mrr": report.mrr,
@@ -98,6 +98,17 @@ def run_primary_evaluation(cases, item_embeddings, item_popularity, total_items)
         "novelty": report.novelty,
     }
 # ============================================================================
 # SECTION: Aggregation Methods

     )
     logger.info(str(report))
+    result = {
         "ndcg_at_10": report.ndcg_at_k,
         "hit_at_10": report.hit_at_k,
         "mrr": report.mrr,
         "novelty": report.novelty,
     }
+    # Add confidence intervals if available
+    for name, ci in [
+        ("ndcg_ci", report.ndcg_ci),
+        ("hit_ci", report.hit_ci),
+        ("mrr_ci", report.mrr_ci),
+    ]:
+        if ci:
+            result[name] = ci.to_dict()
+    return result
 # ============================================================================
 # SECTION: Aggregation Methods

scripts/summary.py CHANGED Viewed

@@ -42,6 +42,17 @@ def fmt(value: float | None, decimals: int = 4) -> str:
     return f"{value:.{decimals}f}"
 def print_section(title: str):
     print(f"\n{title}")
@@ -56,9 +67,12 @@ def main():
     print_section("Recommendation Quality (Natural Queries):")
     if nat and "primary_metrics" in nat:
         m = nat["primary_metrics"]
-        print(f"  NDCG@10:    {fmt(m.get('ndcg_at_10'))}")
-        print(f"  Hit@10:     {fmt(m.get('hit_at_10'))}")
-        print(f"  MRR:        {fmt(m.get('mrr'))}")
     else:
         print("  (not available)")

     return f"{value:.{decimals}f}"
+def fmt_with_ci(value: float | None, ci: dict | None, decimals: int = 3) -> str:
+    """Format a value with optional confidence interval.
+
+    Args:
+        value: Metric value to print, or ``None`` when it is unavailable.
+        ci: Mapping expected to hold ``ci_lower`` and ``ci_upper``; the
+            bounds are shown only when both keys are present.
+        decimals: Number of decimal places used for the value and the bounds.
+
+    Returns:
+        ``"   ---"`` when ``value`` is ``None``; otherwise the formatted value,
+        followed by ``[lower, upper]`` when the interval is available.
+    """
+    if value is None:
+        return "   ---"
+    # Results without CI keys lack these entries; fall back to the bare value.
+    if ci and "ci_lower" in ci and "ci_upper" in ci:
+        lower = ci["ci_lower"]
+        upper = ci["ci_upper"]
+        return f"{value:.{decimals}f}  [{lower:.{decimals}f}, {upper:.{decimals}f}]"
+    return f"{value:.{decimals}f}"
 def print_section(title: str):
     print(f"\n{title}")
     print_section("Recommendation Quality (Natural Queries):")
     if nat and "primary_metrics" in nat:
         m = nat["primary_metrics"]
+        for label, key, ci_key in [
+            ("NDCG@10", "ndcg_at_10", "ndcg_ci"),
+            ("Hit@10", "hit_at_10", "hit_ci"),
+            ("MRR", "mrr", "mrr_ci"),
+        ]:
+            print(f"  {label + ':':<10s} {fmt_with_ci(m.get(key), m.get(ci_key))}")
     else:
         print("  (not available)")