Spaces:

vxa8502
/

Sage

Running

App Files Files Community

vxa8502 committed on Feb 10

Commit

ae4342a

1 Parent(s): 1f0ea41

Fix eval workflow and align README metrics with eval_results JSON

Browse files

Files changed (23) hide show

.gitignore +14 -2
Makefile +55 -34
README.md +9 -9
data/eval_results/adjusted_faithfulness_20260210_115509.json +6 -0
data/eval_results/adjusted_faithfulness_latest.json +6 -0
data/eval_results/eval_natural_queries_20260210_114459.json +83 -0
data/eval_results/eval_natural_queries_20260210_114955.json +16 -0
data/eval_results/eval_natural_queries_latest.json +16 -0
data/eval_results/failure_analysis_20260210_115508.json +332 -0
data/eval_results/failure_analysis_latest.json +332 -0
data/eval_results/faithfulness_20260210_115238.json +36 -0
data/eval_results/faithfulness_latest.json +36 -0
data/eval_results/grounding_delta_20260210_115418.json +30 -0
data/eval_results/grounding_delta_latest.json +30 -0
data/eval_results/human_eval_20260210_124705.json +50 -0
data/eval_results/human_eval_latest.json +50 -0
data/eval_results/load_test_20260210_115634.json +23 -0
data/eval_results/load_test_latest.json +23 -0
reports/eda_report.md +4 -4
sage/data/__init__.py +35 -0
sage/data/eval.py +36 -0
sage/data/loader.py +479 -0
scripts/build_natural_eval_dataset.py +35 -27

.gitignore CHANGED Viewed

@@ -6,8 +6,20 @@ __pycache__/
 *.pyc
 .Python
-# Data (too large for git)
-data/
 # IDE
 .vscode/

 *.pyc
 .Python
+# Data (large files - ignore selectively)
+# Use /data/ prefix to ONLY match top-level data/, not sage/data/
+/data/splits/
+/data/cache/
+/data/qdrant_storage/
+/data/figures/
+/data/eval/
+/data/human_eval/
+# But TRACK eval results (small JSONs, source of truth for metrics)
+!/data/eval_results/
+# Assets (PNGs break HF deployment - hosted on assets-only branch)
+assets/
 # IDE
 .vscode/

Makefile CHANGED Viewed

@@ -1,4 +1,4 @@
-.PHONY: all setup data data-validate eval eval-all eval-full eval-quick demo demo-interview reset reset-eval reset-hard check-env qdrant-up qdrant-down qdrant-status eda serve serve-dev docker-build docker-run deploy-info deploy-health human-eval-generate human-eval human-eval-analyze fmt test lint typecheck ci info summary metrics-snapshot health load-test load-test-quick kaggle-test help
 # ---------------------------------------------------------------------------
 # Configurable Variables (override: make demo QUERY="gaming mouse")
@@ -165,45 +165,29 @@ eval-all: check-env
 	python scripts/summary.py && \
 	echo "=== COMPLETE EVALUATION DONE ==="
-# Full reproducibility: eval-all + human eval + load test (~15 min + ~1 hour manual)
 # Run after: make reset-eval
-# This is the COMPLETE evaluation for portfolio close-out
 eval-full: check-env
 	@echo "=== FULL REPRODUCIBLE EVALUATION ===" && \
 	echo "" && \
 	echo "=== PART 1: AUTOMATED METRICS (~15 min) ===" && \
 	$(MAKE) eval-all && \
 	echo "" && \
-	echo "=== PART 2: HUMAN EVALUATION ===" && \
-	echo "" && \
-	echo "--- Generating 50 samples ---" && \
-	python scripts/human_eval.py --generate --force && \
-	echo "" && \
-	echo "--- Interactive annotation (50 samples, ~1 hour) ---" && \
-	echo "Rate each sample 1-5 on: comprehension, trust, usefulness, satisfaction" && \
-	echo "Press Ctrl+C to pause and resume later with 'make human-eval'" && \
-	echo "" && \
-	python scripts/human_eval.py --annotate && \
-	echo "" && \
-	echo "--- Computing human eval results ---" && \
-	python scripts/human_eval.py --analyze && \
-	echo "" && \
-	echo "=== PART 3: LOAD TEST ===" && \
 	python scripts/load_test.py --url $(URL) --requests $(REQUESTS) --save && \
 	echo "" && \
-	echo "=== PART 4: FINAL SUMMARY ===" && \
-	python scripts/summary.py && \
-	echo "" && \
-	echo "=== FULL REPRODUCIBLE EVALUATION COMPLETE ===" && \
 	echo "" && \
 	echo "Results saved to: data/eval_results/" && \
 	echo "  - eval_natural_queries_latest.json  (NDCG, Hit@K, MRR)" && \
 	echo "  - faithfulness_latest.json          (HHEM, RAGAS)" && \
 	echo "  - grounding_delta_latest.json       (WITH vs WITHOUT evidence)" && \
-	echo "  - human_eval_latest.json            (50-sample ratings)" && \
 	echo "  - load_test_latest.json             (P99 latency)" && \
 	echo "" && \
-	echo "To verify docs match results: check README.md and home/*.md"
 # ---------------------------------------------------------------------------
 # Demo
@@ -269,21 +253,54 @@ deploy-health:
 		(echo "Deployment not healthy at $(URL)" && exit 1)
 # ---------------------------------------------------------------------------
-# Human Evaluation
 # ---------------------------------------------------------------------------
 human-eval-generate: check-env
 	@echo "=== GENERATING HUMAN EVAL SAMPLES ==="
 	python scripts/human_eval.py --generate --seed $(SEED)
 human-eval: check-env
 	@echo "=== HUMAN EVALUATION ==="
 	python scripts/human_eval.py --annotate
 human-eval-analyze: check-env
 	@echo "=== HUMAN EVAL ANALYSIS ==="
 	python scripts/human_eval.py --analyze
 # ---------------------------------------------------------------------------
 # Quality
 # ---------------------------------------------------------------------------
@@ -318,7 +335,8 @@ info:
 	print(f'Qdrant: {QDRANT_URL}'); \
 	print(f'LLM: {LLM_PROVIDER} ({ANTHROPIC_MODEL if LLM_PROVIDER == \"anthropic\" else OPENAI_MODEL})')"
-summary:
 	@python scripts/summary.py
 metrics-snapshot:
@@ -468,10 +486,10 @@ help:
 	@echo "  make demo-interview            3-query showcase (includes cache hit)"
 	@echo ""
 	@echo "INFO & METRICS:"
-	@echo "  make info            Show version, models, and URLs"
-	@echo "  make summary         Print evaluation summary"
-	@echo "  make metrics-snapshot Quick metrics display"
-	@echo "  make health          Check API health (requires running server)"
 	@echo ""
 	@echo "PIPELINE:"
 	@echo "  make data            Load, chunk, embed, and index reviews (local)"
@@ -484,8 +502,9 @@ help:
 	@echo "  make eval            Standard: metrics + explanation + faithfulness (~5 min)"
 	@echo "  make eval-all        Complete: everything automated (~15 min)"
 	@echo "                       Includes: EDA, ablations, baselines, delta, analysis"
-	@echo "  make eval-full       Full reproducibility: eval-all + human eval + load test"
-	@echo "                       Includes: 50-sample manual annotation (~1 hour)"
 	@echo ""
 	@echo "LOAD TESTING:"
 	@echo "  make load-test             Run 50 requests against production (P99 target)"
@@ -500,9 +519,11 @@ help:
 	@echo "  make deploy-info     Show HuggingFace Spaces deployment info"
 	@echo "  make deploy-health   Check production deployment health"
 	@echo ""
-	@echo "HUMAN EVALUATION:"
 	@echo "  make human-eval-generate  Generate 50 eval samples (SEED=42)"
-	@echo "  make human-eval           Rate samples interactively"
 	@echo "  make human-eval-analyze   Compute results from ratings"
 	@echo ""
 	@echo "QUALITY:"

+.PHONY: all setup data data-validate eval eval-all eval-full eval-quick eval-summary demo demo-interview reset reset-eval reset-hard check-env qdrant-up qdrant-down qdrant-status eda serve serve-dev docker-build docker-run deploy-info deploy-health human-eval-workflow human-eval-generate human-eval human-eval-analyze human-eval-status fmt test lint typecheck ci info metrics-snapshot health load-test load-test-quick kaggle-test help
 # ---------------------------------------------------------------------------
 # Configurable Variables (override: make demo QUERY="gaming mouse")
 	python scripts/summary.py && \
 	echo "=== COMPLETE EVALUATION DONE ==="
+# eval-full: fully automated evaluation = eval-all + load test (~17 min). Human evaluation is a SEPARATE workflow (see: make human-eval-workflow).
+# NOTE(review): the recipe is one backslash-continued line that contains $(MAKE), so GNU make executes the whole line even under `make -n` (load test included) - confirm this is acceptable or split the line.
 # Run after: make reset-eval
 eval-full: check-env
 	@echo "=== FULL REPRODUCIBLE EVALUATION ===" && \
 	echo "" && \
 	echo "=== PART 1: AUTOMATED METRICS (~15 min) ===" && \
 	$(MAKE) eval-all && \
 	echo "" && \
+	echo "=== PART 2: LOAD TEST ===" && \
 	python scripts/load_test.py --url $(URL) --requests $(REQUESTS) --save && \
 	echo "" && \
+	echo "=== AUTOMATED EVALUATION COMPLETE ===" && \
 	echo "" && \
 	echo "Results saved to: data/eval_results/" && \
 	echo "  - eval_natural_queries_latest.json  (NDCG, Hit@K, MRR)" && \
 	echo "  - faithfulness_latest.json          (HHEM, RAGAS)" && \
 	echo "  - grounding_delta_latest.json       (WITH vs WITHOUT evidence)" && \
 	echo "  - load_test_latest.json             (P99 latency)" && \
 	echo "" && \
+	echo "NEXT STEPS:" && \
+	echo "  1. make human-eval-workflow   # ~1 hour manual annotation" && \
+	echo "  2. make eval-summary          # view complete results"
 # ---------------------------------------------------------------------------
 # Demo
 		(echo "Deployment not healthy at $(URL)" && exit 1)
 # ---------------------------------------------------------------------------
+# Human Evaluation (separate workflow from automated eval)
 # ---------------------------------------------------------------------------
+# Complete human eval workflow: generate → annotate → analyze, run as one &&-chained command so a failing step stops the remaining ones. Sampling is seeded with $(SEED).
+# Run this AFTER make eval-full completes. NOTE(review): the pre-change eval-full flow passed --force to --generate; this one does not - confirm how --generate behaves when samples already exist.
+human-eval-workflow: check-env
+	@echo "=== HUMAN EVALUATION WORKFLOW ===" && \
+	echo "" && \
+	echo "This is a separate ~1 hour manual process." && \
+	echo "You can pause anytime with Ctrl+C and resume with 'make human-eval'" && \
+	echo "" && \
+	echo "--- Step 1/3: Generating 50 samples ---" && \
+	python scripts/human_eval.py --generate --seed $(SEED) && \
+	echo "" && \
+	echo "--- Step 2/3: Interactive annotation ---" && \
+	echo "Rate each sample 1-5 on: comprehension, trust, usefulness, satisfaction" && \
+	echo "" && \
+	python scripts/human_eval.py --annotate && \
+	echo "" && \
+	echo "--- Step 3/3: Computing results ---" && \
+	python scripts/human_eval.py --analyze && \
+	echo "" && \
+	echo "=== HUMAN EVALUATION COMPLETE ===" && \
+	echo "Results: data/eval_results/human_eval_latest.json" && \
+	echo "" && \
+	echo "Run 'make eval-summary' to see updated metrics."
+# Generate the human-eval samples only (no annotation step); runs human_eval.py --generate seeded with $(SEED) after check-env.
 human-eval-generate: check-env
 	@echo "=== GENERATING HUMAN EVAL SAMPLES ==="
 	python scripts/human_eval.py --generate --seed $(SEED)
+# Interactive annotation via human_eval.py --annotate. Per the banner below, Ctrl+C pauses and re-running this target resumes (the resume logic lives in the script and is not visible here).
 human-eval: check-env
 	@echo "=== HUMAN EVALUATION ==="
+	@echo "Pause anytime with Ctrl+C. Resume with 'make human-eval'"
+	@echo ""
 	python scripts/human_eval.py --annotate
+# Compute results from the saved annotations by running human_eval.py --analyze (after check-env).
 human-eval-analyze: check-env
 	@echo "=== HUMAN EVAL ANALYSIS ==="
 	python scripts/human_eval.py --analyze
+# Check annotation progress via human_eval.py --status (no check-env prerequisite). NOTE(review): stderr is discarded and ANY non-zero exit prints the "No samples yet" hint, masking real failures such as an unsupported --status flag or an import error; scripts/human_eval.py is not among the files changed in this commit - verify it accepts --status.
+human-eval-status:
+	@python scripts/human_eval.py --status 2>/dev/null || echo "No samples yet. Run: make human-eval-generate"
 # ---------------------------------------------------------------------------
 # Quality
 # ---------------------------------------------------------------------------
 	print(f'Qdrant: {QDRANT_URL}'); \
 	print(f'LLM: {LLM_PROVIDER} ({ANTHROPIC_MODEL if LLM_PROVIDER == \"anthropic\" else OPENAI_MODEL})')"
+# Comprehensive evaluation summary: runs scripts/summary.py. NOTE(review): graceful handling of missing human-eval data depends on summary.py, which is not part of this diff - confirm; this target also replaces the former `summary` target name, so existing `make summary` callers will break unless an alias is kept.
+eval-summary:
 	@python scripts/summary.py
 metrics-snapshot:
 	@echo "  make demo-interview            3-query showcase (includes cache hit)"
 	@echo ""
 	@echo "INFO & METRICS:"
+	@echo "  make info              Show version, models, and URLs"
+	@echo "  make eval-summary      Print comprehensive evaluation results"
+	@echo "  make metrics-snapshot  Quick metrics display"
+	@echo "  make health            Check API health (requires running server)"
 	@echo ""
 	@echo "PIPELINE:"
 	@echo "  make data            Load, chunk, embed, and index reviews (local)"
 	@echo "  make eval            Standard: metrics + explanation + faithfulness (~5 min)"
 	@echo "  make eval-all        Complete: everything automated (~15 min)"
 	@echo "                       Includes: EDA, ablations, baselines, delta, analysis"
+	@echo "  make eval-full       Full automated eval + load test (~17 min)"
+	@echo "                       Does NOT include human eval (see below)"
+	@echo "  make eval-summary    View comprehensive results (handles missing data)"
 	@echo ""
 	@echo "LOAD TESTING:"
 	@echo "  make load-test             Run 50 requests against production (P99 target)"
 	@echo "  make deploy-info     Show HuggingFace Spaces deployment info"
 	@echo "  make deploy-health   Check production deployment health"
 	@echo ""
+	@echo "HUMAN EVALUATION (separate workflow, ~1 hour):"
+	@echo "  make human-eval-workflow  Complete workflow: generate → annotate → analyze"
+	@echo "  make human-eval-status    Check annotation progress"
 	@echo "  make human-eval-generate  Generate 50 eval samples (SEED=42)"
+	@echo "  make human-eval           Rate samples interactively (Ctrl+C to pause)"
 	@echo "  make human-eval-analyze   Compute results from ratings"
 	@echo ""
 	@echo "QUALITY:"

README.md CHANGED Viewed

@@ -34,7 +34,7 @@ A recommendation system that refuses to hallucinate.
 Product recommendations without explanations are black boxes. Users see "You might like X" but never learn *why*. When you ask an LLM to explain, it confidently invents features and fabricates reviews.
-**Sage is different:** Every claim is a verified quote from real customer reviews. When evidence is sparse, it refuses rather than guesses. Human evaluation scored trust at **4.3/5** because honesty beats confident fabrication.
 ---
@@ -42,13 +42,13 @@ Product recommendations without explanations are black boxes. Users see "You mig
 | Metric | Target | Achieved | Status |
 |--------|--------|----------|--------|
-| NDCG@10 (recommendation quality) | > 0.30 | 0.295 | 98% |
-| Claim-level faithfulness (HHEM) | > 0.85 | 0.967 | Pass |
-| Human evaluation (n=50) | > 3.5/5 | 3.85/5 | Pass |
-| P99 latency (production) | < 500ms | 200ms | Pass |
-| P99 latency (cache hit) | < 100ms | 86ms | Pass |
-**Grounding impact:** Explanations generated WITH evidence score 71% on HHEM. WITHOUT evidence: 2.5%. RAG grounding reduces hallucination by 68 percentage points.
 ---
@@ -96,7 +96,7 @@ When you give an LLM one short review as context, it fills in the gaps with plau
 | **Semantic chunking** | Fixed-window | Preserves complete arguments; better quote verification. |
 | **HHEM** (Vectara) | GPT-4 judge, NLI models | Purpose-built for RAG hallucination; no API cost. |
 | **Claim-level evaluation** | Full-explanation | Isolates which claims hallucinate; more actionable. |
-| **Quality gate** (refuse) | Always answer | 48% refusal rate → 4.3/5 trust. Honesty > coverage. |
 ---
@@ -108,7 +108,7 @@ When you give an LLM one short review as context, it fills in the gaps with plau
 | **No image features** | Misses visual product attributes | Could add CLIP embeddings in future |
 | **English only** | Non-English reviews have lower retrieval quality | E5 is primarily English-trained |
 | **Cache invalidation manual** | Stale explanations possible | TTL-based expiry (1 hour); manual `/cache/clear` |
-| **LLM latency on free tier** | P99 ~4s with explanations | Retrieval alone is 200ms; cache hits are 86ms |
 | **No user personalization** | Same results for all users | Would need user history for collaborative filtering |
 ---

 Product recommendations without explanations are black boxes. Users see "You might like X" but never learn *why*. When you ask an LLM to explain, it confidently invents features and fabricates reviews.
+**Sage is different:** Every claim is a verified quote from real customer reviews. When evidence is sparse, it refuses rather than guesses. Human evaluation scored trust at **4.0/5** because honesty beats confident fabrication.
 ---
 | Metric | Target | Achieved | Status |
 |--------|--------|----------|--------|
+| NDCG@10 (recommendation quality) | > 0.30 | 0.487 | Pass |
+| Claim-level faithfulness (HHEM) | > 0.85 | 0.968 | Pass |
+| Human evaluation (n=50) | > 3.5/5 | 3.6/5 | Pass |
+| P99 latency (production) | < 500ms | 283ms | Pass |
+| P99 latency (cache hit) | < 100ms | 88ms | Pass |
+**Grounding impact:** Explanations generated WITH evidence score 73% on HHEM. WITHOUT evidence: 2.6%. RAG grounding reduces hallucination by 70 percentage points.
 ---
 | **Semantic chunking** | Fixed-window | Preserves complete arguments; better quote verification. |
 | **HHEM** (Vectara) | GPT-4 judge, NLI models | Purpose-built for RAG hallucination; no API cost. |
 | **Claim-level evaluation** | Full-explanation | Isolates which claims hallucinate; more actionable. |
+| **Quality gate** (refuse) | Always answer | 64% refusal rate → 4.0/5 trust. Honesty > coverage. |
 ---
 | **No image features** | Misses visual product attributes | Could add CLIP embeddings in future |
 | **English only** | Non-English reviews have lower retrieval quality | E5 is primarily English-trained |
 | **Cache invalidation manual** | Stale explanations possible | TTL-based expiry (1 hour); manual `/cache/clear` |
+| **LLM latency on free tier** | P99 ~4s with explanations | Retrieval alone is 283ms; cache hits are 88ms |
 | **No user personalization** | Same results for all users | Would need user history for collaborative filtering |
 ---

data/eval_results/adjusted_faithfulness_20260210_115509.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "n_total": 28,
+  "n_refusals": 18,
+  "raw_pass_rate": 0.17857142857142858,
+  "adjusted_pass_rate": 0.7857142857142857
+}

data/eval_results/adjusted_faithfulness_latest.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "n_total": 28,
+  "n_refusals": 18,
+  "raw_pass_rate": 0.17857142857142858,
+  "adjusted_pass_rate": 0.7857142857142857
+}

data/eval_results/eval_natural_queries_20260210_114459.json ADDED Viewed

	@@ -0,0 +1,83 @@

+{
+  "timestamp": "2026-02-10T11:39:34.313213",
+  "dataset": "eval_natural_queries.json",
+  "catalog_size": 21827,
+  "experiments": {
+    "aggregation_methods": {
+      "max": {
+        "ndcg_at_10": 0.4871922222425982,
+        "hit_at_10": 0.7380952380952381,
+        "mrr": 0.42086167800453517
+      },
+      "mean": {
+        "ndcg_at_10": 0.26098878491132294,
+        "hit_at_10": 0.40476190476190477,
+        "mrr": 0.23921957671957672
+      },
+      "weighted_mean": {
+        "ndcg_at_10": 0.26256611709364913,
+        "hit_at_10": 0.40476190476190477,
+        "mrr": 0.23895502645502648
+      }
+    },
+    "weight_tuning": [
+      {
+        "alpha": 1.0,
+        "beta": 0.0,
+        "ndcg_at_10": 0.4871922222425982,
+        "hit_at_10": 0.7380952380952381,
+        "mrr": 0.42086167800453517
+      },
+      {
+        "alpha": 0.9,
+        "beta": 0.1,
+        "ndcg_at_10": 0.4847806754077341,
+        "hit_at_10": 0.7380952380952381,
+        "mrr": 0.419340513983371
+      },
+      {
+        "alpha": 0.8,
+        "beta": 0.2,
+        "ndcg_at_10": 0.48823514035842697,
+        "hit_at_10": 0.7380952380952381,
+        "mrr": 0.4292894935752078
+      },
+      {
+        "alpha": 0.7,
+        "beta": 0.3,
+        "ndcg_at_10": 0.4649563743541078,
+        "hit_at_10": 0.6904761904761905,
+        "mrr": 0.4129535147392291
+      },
+      {
+        "alpha": 0.6,
+        "beta": 0.4,
+        "ndcg_at_10": 0.4676786860620628,
+        "hit_at_10": 0.6904761904761905,
+        "mrr": 0.4146069538926682
+      },
+      {
+        "alpha": 0.5,
+        "beta": 0.5,
+        "ndcg_at_10": 0.4707198108786378,
+        "hit_at_10": 0.6904761904761905,
+        "mrr": 0.4125661375661376
+      }
+    ]
+  },
+  "primary_metrics": {
+    "ndcg_at_10": 0.4871922222425982,
+    "hit_at_10": 0.7380952380952381,
+    "mrr": 0.42086167800453517,
+    "precision_at_10": 0.12857142857142856,
+    "recall_at_10": 0.4722222222222222,
+    "diversity": 0.01957190520646696,
+    "coverage": 0.015531222797452697,
+    "novelty": 9.808908578271737
+  },
+  "best_weights": {
+    "alpha": 0.8,
+    "beta": 0.2,
+    "ndcg_at_10": 0.48823514035842697
+  }
+}

data/eval_results/eval_natural_queries_20260210_114955.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+  "timestamp": "2026-02-10T11:49:08.500849",
+  "dataset": "eval_natural_queries.json",
+  "catalog_size": 21827,
+  "experiments": {},
+  "primary_metrics": {
+    "ndcg_at_10": 0.4871922222425982,
+    "hit_at_10": 0.7380952380952381,
+    "mrr": 0.42086167800453517,
+    "precision_at_10": 0.12857142857142856,
+    "recall_at_10": 0.4722222222222222,
+    "diversity": 0.01957190520646696,
+    "coverage": 0.015531222797452697,
+    "novelty": 9.808908578271737
+  }
+}

data/eval_results/eval_natural_queries_latest.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+  "timestamp": "2026-02-10T11:49:08.500849",
+  "dataset": "eval_natural_queries.json",
+  "catalog_size": 21827,
+  "experiments": {},
+  "primary_metrics": {
+    "ndcg_at_10": 0.4871922222425982,
+    "hit_at_10": 0.7380952380952381,
+    "mrr": 0.42086167800453517,
+    "precision_at_10": 0.12857142857142856,
+    "recall_at_10": 0.4722222222222222,
+    "diversity": 0.01957190520646696,
+    "coverage": 0.015531222797452697,
+    "novelty": 9.808908578271737
+  }
+}

data/eval_results/failure_analysis_20260210_115508.json ADDED Viewed

	@@ -0,0 +1,332 @@

+{
+  "cases": [
+    {
+      "case_id": 1,
+      "query": "wireless headphones with noise cancellation",
+      "product_id": "B01EL7VTC0",
+      "explanation": "Reviewers describe these as \"the best noise cancellation, wireless headphones on the market\" [review_137128] and \"Best Bluetooth headphones set with noise cancellation\" [review_117666]. One user notes \"when playing music via Bluetooth mild background sound does seem to be suppressed\" [review_291616].",
+      "evidence_texts": [
+        "These are the best noise cancellation, wireless headphones on the market. They offer audiophile quality sound not usually found with wireless headphones. They are very comfortable to wear for long periods of time and strangely enough, my ears do not sweat or become hot while wearing them. (I do not exercise with these on). You can change the volume, pause, skip tracks and take phone calls with the headset on. The battery life is impressive and the build quality is great.  These are worth every penny. I would give it 6 stars if I could!!  Way to go Bose!",
+        "Best Bluetooth headphones set with noise cancellation. Very comfortable for long time.",
+        "I'm not sure what I should have expected from the noise cancellation. I have found that when playing music via Bluetooth mild background sound does seem to be suppressed, but louder sounds come through. I tested the noise cancellation playing a variety of white sounds on my iPhone while playing music via my MacBook. The sounds do seem to be suppressed at low volume, but are easy to hear at higher volume. My conclusion is that noise cancellation is limited even in these highly rated earphones. They are superb earphones, well built, with great sound quality, but not sure they are worth the price."
+      ],
+      "hhem_score": 0.13946935534477234,
+      "is_hallucinated": true
+    },
+    {
+      "case_id": 2,
+      "query": "wireless headphones with noise cancellation",
+      "product_id": "B0C7V6PQ5D",
+      "explanation": "I cannot provide a confident recommendation for this product based on the available review evidence. Only 1 review excerpt(s) were found, which is insufficient to make a well-grounded recommendation for your query about \"wireless headphones with noise cancellation\".",
+      "evidence_texts": [
+        "These seem to be good wireless noise cancelling headphones.  I have been using them a few days and I am very happy with the results.  The background noise is almost all cancelled out when I listen to music.  The clarity and quality of the music is very good thru these headphones.  The headphones sit comfortably on my head and over my ears.  These are lightweight.  The headphone comes with the following:  1 Srhythm Over-Ear Bluetooth Headphone,  1 USB charging cable,  1 3.5mm Audio cable,  1 Airplane adapter,  1 Carrying Zip Case,  and 1 User manual.  I recommend these headphones and I think they are well worth the money."
+      ],
+      "hhem_score": 0.47767794132232666,
+      "is_hallucinated": true
+    },
+    {
+      "case_id": 3,
+      "query": "laptop charger for MacBook",
+      "product_id": "B08PNSRG33",
+      "explanation": "I cannot provide a confident recommendation for this product based on the available review evidence. Only 1 review excerpt(s) were found, which is insufficient to make a well-grounded recommendation for your query about \"laptop charger for MacBook\".",
+      "evidence_texts": [
+        "I always liked the old MacBooks with their magnetic charging cable feature...before they switched to USB C.  And I really like the current all-aluminum Acer Chromebooks, which cosmetically are a MacBook clone (they are a premium Chromebook).  This adapter works great and rekindles the MacBook magnetic charging idea.  The adapter looks great and works great!  Well done!"
+      ],
+      "hhem_score": 0.376436322927475,
+      "is_hallucinated": true
+    },
+    {
+      "case_id": 4,
+      "query": "laptop charger for MacBook",
+      "product_id": "B0BLMFG5D2",
+      "explanation": "I cannot provide a confident recommendation for this product based on the available review evidence. Only 1 review excerpt(s) were found, which is insufficient to make a well-grounded recommendation for your query about \"laptop charger for MacBook\".",
+      "evidence_texts": [
+        "Note that this model is the standard USB A charger. The other Apple version has a USB-C connector and fits the MacBook Pro and other more recent notebook computers that only have USB-C ports. Be sure you order the right one."
+      ],
+      "hhem_score": 0.2948443591594696,
+      "is_hallucinated": true
+    },
+    {
+      "case_id": 5,
+      "query": "USB hub with multiple ports",
+      "product_id": "B0BVQQY5KD",
+      "explanation": "This product \"allows 4 devices with USB-A connectors to connect to a single USB-C port\" [review_288811] and \"provides the means of making these connections including devices that operate at 10Gps speeds\" [review_288811]. The reviewer notes it \"makes a very good accessory for connecting USB-A devices\" [review_288811].",
+      "evidence_texts": [
+        "This is a hub that allows 4 devices with USB-A connectors to connect to a single USB-C port on another device, which would usually be a computer.",
+        "In this case, power and video transmission is not supported. Even though most of the new devices like solid state drives are using USB-C connectors there still exists a lot of equipment with USB-A connectors that have to connected. This Hub provides the means of making these connections including devices that operate at 10Gps speeds. Since this is not a powered hub the power to the connected devices is limited. It is unlikely that more than two solid state drives can be connected without affecting their performance. Lower power devices like mice and keyboards are easily accommodated. It is constructed very nicely and makes a very good accessory for connecting USB-A devices."
+      ],
+      "hhem_score": 0.9468567371368408,
+      "is_hallucinated": false
+    },
+    {
+      "case_id": 6,
+      "query": "USB hub with multiple ports",
+      "product_id": "B00483WRZ6",
+      "explanation": "Users report \"it's great to have 10 ports\" [review_49478] and describe \"The hub has 6 regular USB ports on one side, and 4 on the side where the power line heads off to the wall plug\" [review_183047]. One reviewer notes \"it has performed flawlessly\" [review_49478] after a year of use.",
+      "evidence_texts": [
+        "The hub has 6 regular USB ports on one side, and 4 on the side where the power line heads off to the wall plug. In addition, one side has the mini-USB connection for the line to the USB source--probably but not necessarily the computer--so there are technically 11 USB ports on this hub, of which one has a specific use. I already have two 4-port USB 1.0 hubs for my older equipment, and two 5-port USB 2.0 hubs. After I reorganize all the cables to nowhere or everywhere, I'll be using the Plugable 10-port hub and one of the 5-port USB 2.0 hubs, with a couple of ports to spare for the first time in years.",
+        "I've been using this hub for about a year now and it has performed flawlessly. With so many devices using USB, it's great to have 10 ports. This hub has enough ports that I can leave the iPod and Camera cords plugged in and just attach the devices when needed.",
+        "The hub arrived quickly from the vendor. I waded into my spaghetti, extracted the old hub and put in my new hub. Many of my connections I have routed behind a shelf so I found the 6 connections on one side to work well with those. That left the two rotating ports and 2 facing outward-I can access the remainder to use for charging things etc. The rotating ports did help orienting the unit so I can easily access those ports. The USB 3.0 TB HD thinks the hub is dandy."
+      ],
+      "hhem_score": 0.8917474150657654,
+      "is_hallucinated": false
+    },
+    {
+      "case_id": 7,
+      "query": "portable battery pack for travel",
+      "product_id": "B078XDLDNM",
+      "explanation": "I cannot provide a confident recommendation for this product based on the available review evidence. Only 1 review excerpt(s) were found, which is insufficient to make a well-grounded recommendation for your query about \"portable battery pack for travel\".",
+      "evidence_texts": [
+        "Excellent for use with portable battery packs."
+      ],
+      "hhem_score": 0.25542503595352173,
+      "is_hallucinated": true
+    },
+    {
+      "case_id": 8,
+      "query": "portable battery pack for travel",
+      "product_id": "B071YYK9CB",
+      "explanation": "I cannot provide a confident recommendation for this product based on the available review evidence. Only 1 review excerpt(s) were found, which is insufficient to make a well-grounded recommendation for your query about \"portable battery pack for travel\".",
+      "evidence_texts": [
+        "Excellent for use with portable battery packs."
+      ],
+      "hhem_score": 0.25542503595352173,
+      "is_hallucinated": true
+    },
+    {
+      "case_id": 9,
+      "query": "bluetooth speaker with good bass",
+      "product_id": "B081HC253G",
+      "explanation": "I cannot provide a confident recommendation for this product based on the available review evidence. Only 1 review excerpt(s) were found, which is insufficient to make a well-grounded recommendation for your query about \"bluetooth speaker with good bass\".",
+      "evidence_texts": [
+        "Love this Bluetooth speaker has a good amount of bass and clear and loud. Can put on a shelf and looks nice."
+      ],
+      "hhem_score": 0.41683968901634216,
+      "is_hallucinated": true
+    },
+    {
+      "case_id": 10,
+      "query": "bluetooth speaker with good bass",
+      "product_id": "B08CS391BR",
+      "explanation": "I cannot provide a confident recommendation for this product based on the available review evidence. Only 1 review excerpt(s) were found, which is insufficient to make a well-grounded recommendation for your query about \"bluetooth speaker with good bass\".",
+      "evidence_texts": [
+        "Nice loud bluetooth speaker with very loud bass."
+      ],
+      "hhem_score": 0.2693043053150177,
+      "is_hallucinated": true
+    },
+    {
+      "case_id": 11,
+      "query": "cheap but good quality earbuds",
+      "product_id": "B07S764D9V",
+      "explanation": "Reviewers describe these as \"the best cheap earbuds on the market\" [review_33223] and note they are \"cheap (less than $10) and solid\" [review_99276]. One customer simply calls them \"Great cheap earbuds\" [review_221704].",
+      "evidence_texts": [
+        "These are probably the best cheap earbuds on the market. They sound fairly good, they don't fall out of my ears, and they are comfortable. I've used earbuds that cost a lot more but fail at those criteria.<br /><br />The cords seem a bit fragile, but that's really all I can complain about.",
+        "I was looking for earbuds that were cheap (less than $10) and solid.  These are it.  They fit well, they block (some) of the background noise, and they are well-constructed.  I especially like the plug, as it's  at a right angle, which prevents me from accidentally pulling the wire free of the plug when I take out the plug.  Good design.<br /><br />These are NOT noise cancelling earbuds.  They block noise because they're in your ear.",
+        "Great cheap earbuds"
+      ],
+      "hhem_score": 0.10299108922481537,
+      "is_hallucinated": true
+    },
+    {
+      "case_id": 12,
+      "query": "cheap but good quality earbuds",
+      "product_id": "B072JJNYQY",
+      "explanation": "I cannot provide a confident recommendation for this product based on the available review evidence. Only 1 review excerpt(s) were found, which is insufficient to make a well-grounded recommendation for your query about \"cheap but good quality earbuds\".",
+      "evidence_texts": [
+        "These are great, cheap wireless earbuds.  The sound quality is pretty good for the price.  No complaints so far.  My wife and I both have a pair of these."
+      ],
+      "hhem_score": 0.40427908301353455,
+      "is_hallucinated": true
+    },
+    {
+      "case_id": 13,
+      "query": "durable phone case that looks nice",
+      "product_id": "B085LCT6HY",
+      "explanation": "I cannot provide a confident recommendation for this product based on the available review evidence. Only 1 review excerpt(s) were found, which is insufficient to make a well-grounded recommendation for your query about \"durable phone case that looks nice\".",
+      "evidence_texts": [
+        "Nice little case. Very durable and nice looking."
+      ],
+      "hhem_score": 0.40621230006217957,
+      "is_hallucinated": true
+    },
+    {
+      "case_id": 14,
+      "query": "durable phone case that looks nice",
+      "product_id": "B00APU8IY0",
+      "explanation": "I cannot provide a confident recommendation for this product based on the available review evidence. Only 1 review excerpt(s) were found, which is insufficient to make a well-grounded recommendation for your query about \"durable phone case that looks nice\".",
+      "evidence_texts": [
+        "These cases are sharp looking and very durable, and well made.  The best part of it is the price.  I have one for my Nexus 7 as well."
+      ],
+      "hhem_score": 0.3042883574962616,
+      "is_hallucinated": true
+    },
+    {
+      "case_id": 15,
+      "query": "fast charging cable that won't break",
+      "product_id": "B07MLVKBQM",
+      "explanation": "I cannot provide a confident recommendation for this product based on the available review evidence. Only 1 review excerpt(s) were found, which is insufficient to make a well-grounded recommendation for your query about \"fast charging cable that won't break\".",
+      "evidence_texts": [
+        "Great fast charging cable. Seems very durable."
+      ],
+      "hhem_score": 0.2593702971935272,
+      "is_hallucinated": true
+    },
+    {
+      "case_id": 16,
+      "query": "fast charging cable that won't break",
+      "product_id": "B0982Q1WZ8",
+      "explanation": "I cannot provide a meaningful recommendation for this product. The available review evidence is too brief (15 tokens) to support a well-grounded explanation for your query about \"fast charging cable that won't break\".",
+      "evidence_texts": [
+        "Very good durable fast charging cable",
+        "durable and fast charging"
+      ],
+      "hhem_score": 0.07533111423254013,
+      "is_hallucinated": true
+    },
+    {
+      "case_id": 17,
+      "query": "comfortable headphones for long sessions",
+      "product_id": "B08HTNC8PJ",
+      "explanation": "I cannot provide a confident recommendation for this product based on the available review evidence. Only 1 review excerpt(s) were found, which is insufficient to make a well-grounded recommendation for your query about \"comfortable headphones for long sessions\".",
+      "evidence_texts": [
+        "Awesome, simple headphones. Very comfy for long periods"
+      ],
+      "hhem_score": 0.5048479437828064,
+      "is_hallucinated": false
+    },
+    {
+      "case_id": 18,
+      "query": "comfortable headphones for long sessions",
+      "product_id": "B016XJCQR6",
+      "explanation": "I cannot provide a confident recommendation for this product based on the available review evidence. Only 1 review excerpt(s) were found, which is insufficient to make a well-grounded recommendation for your query about \"comfortable headphones for long sessions\".",
+      "evidence_texts": [
+        "Nice headphones, I use them to listen to online lectures. Sound is good and is heard in both ears. Pretty comfortable - I can go about 2.5 hours before they get uncomfortable. I Like the single cord to headset."
+      ],
+      "hhem_score": 0.3649515211582184,
+      "is_hallucinated": true
+    },
+    {
+      "case_id": 19,
+      "query": "quiet keyboard for office",
+      "product_id": "B0BK3LYMR2",
+      "explanation": "Reviewers describe it as \"super quiet\" [review_151577] and \"the quietest I've ever owned\" [review_320508]. One notes \"this keyboard is pretty quiet, no clacking noise\" [review_322050] and calls it \"a perfect office keyboard\" [review_320508].",
+      "evidence_texts": [
+        "I purchased this keyboard and mouse combo for my home office.  It's super quiet and feels great to type on.  I liked it so much I ended up purchasing a second one for the business office.",
+        "Both the keyboard and mouse are the quietest I've ever owned. I wish all my co-workers would make the switch to these. It is a perfect office keyboard and mouse, it just works and works well. Also, this keyboard has drain holes! If you spill something on it, your chances of still having a working keyboard are muuuuch higher with this keyboard.",
+        "This keyboard is pretty quiet, no clacking noise.  Instead you here a rubbery membrane squish from a key being pressed, which is also audible but not noisy.  The typing experience on this keyboard is not pleasant due to the stiff membrane, requiring much more effort/force to push down individual keys.  Typing fast and typing a lot on this keyboard would not be recommended because it's not easy to type on this keyboard.<br /><br />The silent mouse has silent buttons as well as a silent scroll wheel.  I mention this because I bought a separate Logitech silent mouse and that scroll wheel is loud."
+      ],
+      "hhem_score": 0.5948902368545532,
+      "is_hallucinated": false
+    },
+    {
+      "case_id": 20,
+      "query": "quiet keyboard for office",
+      "product_id": "B004MF11MU",
+      "explanation": "Reviewers describe it as a \"Good, quiet keyboard\" [review_155782] with \"Very responsive keys\" [review_155782]. One user notes \"It's fast, easy to type\" [review_245043].",
+      "evidence_texts": [
+        "Good, quiet keyboard.  Very responsive keys.",
+        "I purchased this keyboard back in 2018 and haven't had to change a battery nor charge it at all.  The natural light coming in from the windows or the lamps on your desk are enough to keep it going.<br />It's fast, easy to type and use the function keys, and works at a long distance from the computer.<br />The only complaint is that the keyboard isn't wide enough as it puts your hands a little too close together.<br />It's worth getting."
+      ],
+      "hhem_score": 0.38308340311050415,
+      "is_hallucinated": true
+    },
+    {
+      "case_id": 21,
+      "query": "headphones that don't hurt ears",
+      "product_id": "B00JB31OUW",
+      "explanation": "I cannot provide a confident recommendation for this product based on the available review evidence. Only 1 review excerpt(s) were found, which is insufficient to make a well-grounded recommendation for your query about \"headphones that don't hurt ears\".",
+      "evidence_texts": [
+        "Finally, headphones that don't hurt my ears, because they don't sit ON my ears.  I have very small ear canals, and have always been unable to use earbuds because they either don't stay in or they are very painful.  Other headphones become very painful after just a short time of listening.  My only wish is that they were bluetooth capable, but that's something I can deal with.  The sound is wonderful, and the comfort is absolutely heavenly.  Thank you, Bose!!!"
+      ],
+      "hhem_score": 0.40339213609695435,
+      "is_hallucinated": true
+    },
+    {
+      "case_id": 22,
+      "query": "headphones that don't hurt ears",
+      "product_id": "B07KQWX947",
+      "explanation": "Reviewers describe these as \"lightweight and comfortable headphones\" [review_292476]. One customer notes they are \"really good sounding\" [review_292476] with \"excellent\" sound quality [review_142177].",
+      "evidence_texts": [
+        "Koss PortaPro headphones are really good sounding, lightweight and comfortable headphones that don't break the bank.",
+        "Folds great, sound is very good;  no need to spend $350 on other headphones. These don't cancel noise but the sound on my iPod is excellent."
+      ],
+      "hhem_score": 0.24604828655719757,
+      "is_hallucinated": true
+    },
+    {
+      "case_id": 23,
+      "query": "charger that actually works",
+      "product_id": "B07NPPSXN3",
+      "explanation": "I cannot provide a confident recommendation for this product based on the available review evidence. Only 1 review excerpt(s) were found, which is insufficient to make a well-grounded recommendation for your query about \"charger that actually works\".",
+      "evidence_texts": [
+        "Seems like an official charger and it works!"
+      ],
+      "hhem_score": 0.2389219105243683,
+      "is_hallucinated": true
+    },
+    {
+      "case_id": 24,
+      "query": "charger that actually works",
+      "product_id": "B072FHJVT2",
+      "explanation": "I cannot provide a meaningful recommendation for this product. The available review evidence is too brief (33 tokens) to support a well-grounded explanation for your query about \"charger that actually works\".",
+      "evidence_texts": [
+        "Works really good this charger",
+        "It\u2019s a basic android type charger. Nothing insanely special about it but it does the job as it should."
+      ],
+      "hhem_score": 0.4941137433052063,
+      "is_hallucinated": true
+    },
+    {
+      "case_id": 25,
+      "query": "waterproof speaker for shower",
+      "product_id": "B0BW4PFM58",
+      "explanation": "One reviewer states \"I personally use it in my shower to listen to podcasts and audiobooks\" [review_139921] and notes \"The volume is loud enough to overpower the running water\" [review_139921]. However, they clarify \"It is not WATERPROOF\" and recommend \"I keep in the bathroom, just not under the showerhead\" [review_139921].",
+      "evidence_texts": [
+        "This is not a beach party speaker, it's not big enough for that task. If you want a portable party/beach speaker you need to buy a larger speaker which will require a much larger battery. I personally use it in my shower to listen to podcasts and audiobooks more than anything. It is not WATERPROOF, it never states that in the description. The image may be a little deceptive, it will handle a bit of splash. IPX 5: Protected against water jets - Water projected at all angles through a 6.3mm nozzle at a flow rate of 12.5 liters/min at a pressure of 30kN/m2 for 3 minutes from a distance of 3 meters. In order to survive being underwater or in water it needs X-8 or X-7. I keep in the bathroom, just not under the showerhead. It has rubber feet to keep it from vibrating around. The volume is loud enough to overpower the running water, little to almost no distortion at high volume.",
+        "This is not a beach party speaker, it's not big enough for that task. If you want a portable party/beach speaker you need to buy a larger speaker which will require a much larger battery. I personally use it in my shower to listen to podcasts and audiobooks more than anything. It is not WATERPROOF, it never states that in the description. The image may be a little deceptive, it will handle a bit of splash. IPX 5: Protected against water jets - Water projected at all angles through a 6.3mm nozzle at a flow rate of 12.5 liters/min at a pressure of 30kN/m2 for 3 minutes from a distance of 3 meters. In order to survive being underwater or in water it needs X-8 or X-7. I keep in the bathroom, just not under the showerhead. It has rubber feet to keep it from vibrating around. The volume is loud enough to overpower the running water, little to almost no distortion at high volume.",
+        "These speakers are great. Clear sound, even at the highest volume.  Water proofing seems to work fine and its very shock resistant in that I dropped it from a 7 foot shelf in the shower and it didn't skip a beat."
+      ],
+      "hhem_score": 0.9651408791542053,
+      "is_hallucinated": false
+    },
+    {
+      "case_id": 26,
+      "query": "waterproof speaker for shower",
+      "product_id": "B0C1J4XGB1",
+      "explanation": "Customers say \"I bought it for the shower and I have not been disappointed\" [review_110364] and \"This is a nice speaker for your shower\" [review_122829]. One reviewer notes \"It is military grade waterproof\" [review_110364], though warns \"if it breaks it's not water resistant anymore\" [review_170353] regarding the charging port cover.",
+      "evidence_texts": [
+        "This speaker is a great buy for the money. I bought it for the shower and I have not been disappointed.<br /><br />It is military grade waterproof. FYI- I do not have it in a spot in the shower that it is getting directly hit with water, but it is in a spot that it gets wet.<br /><br />I also like the features on it. You can pause songs or skip to the next one. Pickup or hangup on phone calls if you use this as a portable speaker. The size is great for travel as well. I have even taken this outside with me when I am sitting in the backyard. I am thinking about buying another for moving around and keeping one in shower since the price is so reasonable.",
+        "This is a nice speaker for your shower. Has a good sound, good battery and the ability to play/pause, skip/reverse and control volume. My helpful hint, when recharging it is to take the speaker and suction cup off together, if you try to take the speaker off of the suction cup, you run the risk of breaking the suction cup. (This happened to the last one I had and then that rolled off the soap dish and broke.)",
+        "Nice battery life. good sound. The speaker is located in the back so when stuck to my shower wall it echos the sound in the shower / tub. Only downside is be careful of the rubber plug for the charging port, if it breaks it's not water resistant anymore."
+      ],
+      "hhem_score": 0.4655941128730774,
+      "is_hallucinated": true
+    },
+    {
+      "case_id": 27,
+      "query": "gift for someone who likes music",
+      "product_id": "B0BMQN7L6B",
+      "explanation": "Customers say \"I gave this as a gift to someone who loves listening to music, and they love it!\" [review_165326] and call it \"A great Christmas present!\" [review_92937]. One buyer noted they \"Bought as a Christmas gift. My niece loves it and has started buying vinyls.\" [review_279946]",
+      "evidence_texts": [
+        "I gave this as a gift to someone who loves listening to music, and they love it!",
+        "A great Christmas present! Great for someone starting their vinyl collection",
+        "Bought as a Christmas gift.  My niece loves it and has started buying vinyls."
+      ],
+      "hhem_score": 0.1792474240064621,
+      "is_hallucinated": true
+    },
+    {
+      "case_id": 28,
+      "query": "gift for someone who likes music",
+      "product_id": "B08HTNC8PJ",
+      "explanation": "I cannot provide a confident recommendation for this product based on the available review evidence. Only 1 review excerpt(s) were found, which is insufficient to make a well-grounded recommendation for your query about \"gift for someone who likes music\".",
+      "evidence_texts": [
+        "It was a gift for my niece. She likes these to listen to music while at home."
+      ],
+      "hhem_score": 0.2865367531776428,
+      "is_hallucinated": true
+    }
+  ],
+  "n_failures": 23
+}

data/eval_results/failure_analysis_latest.json ADDED Viewed

	@@ -0,0 +1,332 @@

+{
+  "cases": [
+    {
+      "case_id": 1,
+      "query": "wireless headphones with noise cancellation",
+      "product_id": "B01EL7VTC0",
+      "explanation": "Reviewers describe these as \"the best noise cancellation, wireless headphones on the market\" [review_137128] and \"Best Bluetooth headphones set with noise cancellation\" [review_117666]. One user notes \"when playing music via Bluetooth mild background sound does seem to be suppressed\" [review_291616].",
+      "evidence_texts": [
+        "These are the best noise cancellation, wireless headphones on the market. They offer audiophile quality sound not usually found with wireless headphones. They are very comfortable to wear for long periods of time and strangely enough, my ears do not sweat or become hot while wearing them. (I do not exercise with these on). You can change the volume, pause, skip tracks and take phone calls with the headset on. The battery life is impressive and the build quality is great.  These are worth every penny. I would give it 6 stars if I could!!  Way to go Bose!",
+        "Best Bluetooth headphones set with noise cancellation. Very comfortable for long time.",
+        "I'm not sure what I should have expected from the noise cancellation. I have found that when playing music via Bluetooth mild background sound does seem to be suppressed, but louder sounds come through. I tested the noise cancellation playing a variety of white sounds on my iPhone while playing music via my MacBook. The sounds do seem to be suppressed at low volume, but are easy to hear at higher volume. My conclusion is that noise cancellation is limited even in these highly rated earphones. They are superb earphones, well built, with great sound quality, but not sure they are worth the price."
+      ],
+      "hhem_score": 0.13946935534477234,
+      "is_hallucinated": true
+    },
+    {
+      "case_id": 2,
+      "query": "wireless headphones with noise cancellation",
+      "product_id": "B0C7V6PQ5D",
+      "explanation": "I cannot provide a confident recommendation for this product based on the available review evidence. Only 1 review excerpt(s) were found, which is insufficient to make a well-grounded recommendation for your query about \"wireless headphones with noise cancellation\".",
+      "evidence_texts": [
+        "These seem to be good wireless noise cancelling headphones.  I have been using them a few days and I am very happy with the results.  The background noise is almost all cancelled out when I listen to music.  The clarity and quality of the music is very good thru these headphones.  The headphones sit comfortably on my head and over my ears.  These are lightweight.  The headphone comes with the following:  1 Srhythm Over-Ear Bluetooth Headphone,  1 USB charging cable,  1 3.5mm Audio cable,  1 Airplane adapter,  1 Carrying Zip Case,  and 1 User manual.  I recommend these headphones and I think they are well worth the money."
+      ],
+      "hhem_score": 0.47767794132232666,
+      "is_hallucinated": true
+    },
+    {
+      "case_id": 3,
+      "query": "laptop charger for MacBook",
+      "product_id": "B08PNSRG33",
+      "explanation": "I cannot provide a confident recommendation for this product based on the available review evidence. Only 1 review excerpt(s) were found, which is insufficient to make a well-grounded recommendation for your query about \"laptop charger for MacBook\".",
+      "evidence_texts": [
+        "I always liked the old MacBooks with their magnetic charging cable feature...before they switched to USB C.  And I really like the current all-aluminum Acer Chromebooks, which cosmetically are a MacBook clone (they are a premium Chromebook).  This adapter works great and rekindles the MacBook magnetic charging idea.  The adapter looks great and works great!  Well done!"
+      ],
+      "hhem_score": 0.376436322927475,
+      "is_hallucinated": true
+    },
+    {
+      "case_id": 4,
+      "query": "laptop charger for MacBook",
+      "product_id": "B0BLMFG5D2",
+      "explanation": "I cannot provide a confident recommendation for this product based on the available review evidence. Only 1 review excerpt(s) were found, which is insufficient to make a well-grounded recommendation for your query about \"laptop charger for MacBook\".",
+      "evidence_texts": [
+        "Note that this model is the standard USB A charger. The other Apple version has a USB-C connector and fits the MacBook Pro and other more recent notebook computers that only have USB-C ports. Be sure you order the right one."
+      ],
+      "hhem_score": 0.2948443591594696,
+      "is_hallucinated": true
+    },
+    {
+      "case_id": 5,
+      "query": "USB hub with multiple ports",
+      "product_id": "B0BVQQY5KD",
+      "explanation": "This product \"allows 4 devices with USB-A connectors to connect to a single USB-C port\" [review_288811] and \"provides the means of making these connections including devices that operate at 10Gps speeds\" [review_288811]. The reviewer notes it \"makes a very good accessory for connecting USB-A devices\" [review_288811].",
+      "evidence_texts": [
+        "This is a hub that allows 4 devices with USB-A connectors to connect to a single USB-C port on another device, which would usually be a computer.",
+        "In this case, power and video transmission is not supported. Even though most of the new devices like solid state drives are using USB-C connectors there still exists a lot of equipment with USB-A connectors that have to connected. This Hub provides the means of making these connections including devices that operate at 10Gps speeds. Since this is not a powered hub the power to the connected devices is limited. It is unlikely that more than two solid state drives can be connected without affecting their performance. Lower power devices like mice and keyboards are easily accommodated. It is constructed very nicely and makes a very good accessory for connecting USB-A devices."
+      ],
+      "hhem_score": 0.9468567371368408,
+      "is_hallucinated": false
+    },
+    {
+      "case_id": 6,
+      "query": "USB hub with multiple ports",
+      "product_id": "B00483WRZ6",
+      "explanation": "Users report \"it's great to have 10 ports\" [review_49478] and describe \"The hub has 6 regular USB ports on one side, and 4 on the side where the power line heads off to the wall plug\" [review_183047]. One reviewer notes \"it has performed flawlessly\" [review_49478] after a year of use.",
+      "evidence_texts": [
+        "The hub has 6 regular USB ports on one side, and 4 on the side where the power line heads off to the wall plug. In addition, one side has the mini-USB connection for the line to the USB source--probably but not necessarily the computer--so there are technically 11 USB ports on this hub, of which one has a specific use. I already have two 4-port USB 1.0 hubs for my older equipment, and two 5-port USB 2.0 hubs. After I reorganize all the cables to nowhere or everywhere, I'll be using the Plugable 10-port hub and one of the 5-port USB 2.0 hubs, with a couple of ports to spare for the first time in years.",
+        "I've been using this hub for about a year now and it has performed flawlessly. With so many devices using USB, it's great to have 10 ports. This hub has enough ports that I can leave the iPod and Camera cords plugged in and just attach the devices when needed.",
+        "The hub arrived quickly from the vendor. I waded into my spaghetti, extracted the old hub and put in my new hub. Many of my connections I have routed behind a shelf so I found the 6 connections on one side to work well with those. That left the two rotating ports and 2 facing outward-I can access the remainder to use for charging things etc. The rotating ports did help orienting the unit so I can easily access those ports. The USB 3.0 TB HD thinks the hub is dandy."
+      ],
+      "hhem_score": 0.8917474150657654,
+      "is_hallucinated": false
+    },
+    {
+      "case_id": 7,
+      "query": "portable battery pack for travel",
+      "product_id": "B078XDLDNM",
+      "explanation": "I cannot provide a confident recommendation for this product based on the available review evidence. Only 1 review excerpt(s) were found, which is insufficient to make a well-grounded recommendation for your query about \"portable battery pack for travel\".",
+      "evidence_texts": [
+        "Excellent for use with portable battery packs."
+      ],
+      "hhem_score": 0.25542503595352173,
+      "is_hallucinated": true
+    },
+    {
+      "case_id": 8,
+      "query": "portable battery pack for travel",
+      "product_id": "B071YYK9CB",
+      "explanation": "I cannot provide a confident recommendation for this product based on the available review evidence. Only 1 review excerpt(s) were found, which is insufficient to make a well-grounded recommendation for your query about \"portable battery pack for travel\".",
+      "evidence_texts": [
+        "Excellent for use with portable battery packs."
+      ],
+      "hhem_score": 0.25542503595352173,
+      "is_hallucinated": true
+    },
+    {
+      "case_id": 9,
+      "query": "bluetooth speaker with good bass",
+      "product_id": "B081HC253G",
+      "explanation": "I cannot provide a confident recommendation for this product based on the available review evidence. Only 1 review excerpt(s) were found, which is insufficient to make a well-grounded recommendation for your query about \"bluetooth speaker with good bass\".",
+      "evidence_texts": [
+        "Love this Bluetooth speaker has a good amount of bass and clear and loud. Can put on a shelf and looks nice."
+      ],
+      "hhem_score": 0.41683968901634216,
+      "is_hallucinated": true
+    },
+    {
+      "case_id": 10,
+      "query": "bluetooth speaker with good bass",
+      "product_id": "B08CS391BR",
+      "explanation": "I cannot provide a confident recommendation for this product based on the available review evidence. Only 1 review excerpt(s) were found, which is insufficient to make a well-grounded recommendation for your query about \"bluetooth speaker with good bass\".",
+      "evidence_texts": [
+        "Nice loud bluetooth speaker with very loud bass."
+      ],
+      "hhem_score": 0.2693043053150177,
+      "is_hallucinated": true
+    },
+    {
+      "case_id": 11,
+      "query": "cheap but good quality earbuds",
+      "product_id": "B07S764D9V",
+      "explanation": "Reviewers describe these as \"the best cheap earbuds on the market\" [review_33223] and note they are \"cheap (less than $10) and solid\" [review_99276]. One customer simply calls them \"Great cheap earbuds\" [review_221704].",
+      "evidence_texts": [
+        "These are probably the best cheap earbuds on the market. They sound fairly good, they don't fall out of my ears, and they are comfortable. I've used earbuds that cost a lot more but fail at those criteria.<br /><br />The cords seem a bit fragile, but that's really all I can complain about.",
+        "I was looking for earbuds that were cheap (less than $10) and solid.  These are it.  They fit well, they block (some) of the background noise, and they are well-constructed.  I especially like the plug, as it's  at a right angle, which prevents me from accidentally pulling the wire free of the plug when I take out the plug.  Good design.<br /><br />These are NOT noise cancelling earbuds.  They block noise because they're in your ear.",
+        "Great cheap earbuds"
+      ],
+      "hhem_score": 0.10299108922481537,
+      "is_hallucinated": true
+    },
+    {
+      "case_id": 12,
+      "query": "cheap but good quality earbuds",
+      "product_id": "B072JJNYQY",
+      "explanation": "I cannot provide a confident recommendation for this product based on the available review evidence. Only 1 review excerpt(s) were found, which is insufficient to make a well-grounded recommendation for your query about \"cheap but good quality earbuds\".",
+      "evidence_texts": [
+        "These are great, cheap wireless earbuds.  The sound quality is pretty good for the price.  No complaints so far.  My wife and I both have a pair of these."
+      ],
+      "hhem_score": 0.40427908301353455,
+      "is_hallucinated": true
+    },
+    {
+      "case_id": 13,
+      "query": "durable phone case that looks nice",
+      "product_id": "B085LCT6HY",
+      "explanation": "I cannot provide a confident recommendation for this product based on the available review evidence. Only 1 review excerpt(s) were found, which is insufficient to make a well-grounded recommendation for your query about \"durable phone case that looks nice\".",
+      "evidence_texts": [
+        "Nice little case. Very durable and nice looking."
+      ],
+      "hhem_score": 0.40621230006217957,
+      "is_hallucinated": true
+    },
+    {
+      "case_id": 14,
+      "query": "durable phone case that looks nice",
+      "product_id": "B00APU8IY0",
+      "explanation": "I cannot provide a confident recommendation for this product based on the available review evidence. Only 1 review excerpt(s) were found, which is insufficient to make a well-grounded recommendation for your query about \"durable phone case that looks nice\".",
+      "evidence_texts": [
+        "These cases are sharp looking and very durable, and well made.  The best part of it is the price.  I have one for my Nexus 7 as well."
+      ],
+      "hhem_score": 0.3042883574962616,
+      "is_hallucinated": true
+    },
+    {
+      "case_id": 15,
+      "query": "fast charging cable that won't break",
+      "product_id": "B07MLVKBQM",
+      "explanation": "I cannot provide a confident recommendation for this product based on the available review evidence. Only 1 review excerpt(s) were found, which is insufficient to make a well-grounded recommendation for your query about \"fast charging cable that won't break\".",
+      "evidence_texts": [
+        "Great fast charging cable. Seems very durable."
+      ],
+      "hhem_score": 0.2593702971935272,
+      "is_hallucinated": true
+    },
+    {
+      "case_id": 16,
+      "query": "fast charging cable that won't break",
+      "product_id": "B0982Q1WZ8",
+      "explanation": "I cannot provide a meaningful recommendation for this product. The available review evidence is too brief (15 tokens) to support a well-grounded explanation for your query about \"fast charging cable that won't break\".",
+      "evidence_texts": [
+        "Very good durable fast charging cable",
+        "durable and fast charging"
+      ],
+      "hhem_score": 0.07533111423254013,
+      "is_hallucinated": true
+    },
+    {
+      "case_id": 17,
+      "query": "comfortable headphones for long sessions",
+      "product_id": "B08HTNC8PJ",
+      "explanation": "I cannot provide a confident recommendation for this product based on the available review evidence. Only 1 review excerpt(s) were found, which is insufficient to make a well-grounded recommendation for your query about \"comfortable headphones for long sessions\".",
+      "evidence_texts": [
+        "Awesome, simple headphones. Very comfy for long periods"
+      ],
+      "hhem_score": 0.5048479437828064,
+      "is_hallucinated": false
+    },
+    {
+      "case_id": 18,
+      "query": "comfortable headphones for long sessions",
+      "product_id": "B016XJCQR6",
+      "explanation": "I cannot provide a confident recommendation for this product based on the available review evidence. Only 1 review excerpt(s) were found, which is insufficient to make a well-grounded recommendation for your query about \"comfortable headphones for long sessions\".",
+      "evidence_texts": [
+        "Nice headphones, I use them to listen to online lectures. Sound is good and is heard in both ears. Pretty comfortable - I can go about 2.5 hours before they get uncomfortable. I Like the single cord to headset."
+      ],
+      "hhem_score": 0.3649515211582184,
+      "is_hallucinated": true
+    },
+    {
+      "case_id": 19,
+      "query": "quiet keyboard for office",
+      "product_id": "B0BK3LYMR2",
+      "explanation": "Reviewers describe it as \"super quiet\" [review_151577] and \"the quietest I've ever owned\" [review_320508]. One notes \"this keyboard is pretty quiet, no clacking noise\" [review_322050] and calls it \"a perfect office keyboard\" [review_320508].",
+      "evidence_texts": [
+        "I purchased this keyboard and mouse combo for my home office.  It's super quiet and feels great to type on.  I liked it so much I ended up purchasing a second one for the business office.",
+        "Both the keyboard and mouse are the quietest I've ever owned. I wish all my co-workers would make the switch to these. It is a perfect office keyboard and mouse, it just works and works well. Also, this keyboard has drain holes! If you spill something on it, your chances of still having a working keyboard are muuuuch higher with this keyboard.",
+        "This keyboard is pretty quiet, no clacking noise.  Instead you here a rubbery membrane squish from a key being pressed, which is also audible but not noisy.  The typing experience on this keyboard is not pleasant due to the stiff membrane, requiring much more effort/force to push down individual keys.  Typing fast and typing a lot on this keyboard would not be recommended because it's not easy to type on this keyboard.<br /><br />The silent mouse has silent buttons as well as a silent scroll wheel.  I mention this because I bought a separate Logitech silent mouse and that scroll wheel is loud."
+      ],
+      "hhem_score": 0.5948902368545532,
+      "is_hallucinated": false
+    },
+    {
+      "case_id": 20,
+      "query": "quiet keyboard for office",
+      "product_id": "B004MF11MU",
+      "explanation": "Reviewers describe it as a \"Good, quiet keyboard\" [review_155782] with \"Very responsive keys\" [review_155782]. One user notes \"It's fast, easy to type\" [review_245043].",
+      "evidence_texts": [
+        "Good, quiet keyboard.  Very responsive keys.",
+        "I purchased this keyboard back in 2018 and haven't had to change a battery nor charge it at all.  The natural light coming in from the windows or the lamps on your desk are enough to keep it going.<br />It's fast, easy to type and use the function keys, and works at a long distance from the computer.<br />The only complaint is that the keyboard isn't wide enough as it puts your hands a little too close together.<br />It's worth getting."
+      ],
+      "hhem_score": 0.38308340311050415,
+      "is_hallucinated": true
+    },
+    {
+      "case_id": 21,
+      "query": "headphones that don't hurt ears",
+      "product_id": "B00JB31OUW",
+      "explanation": "I cannot provide a confident recommendation for this product based on the available review evidence. Only 1 review excerpt(s) were found, which is insufficient to make a well-grounded recommendation for your query about \"headphones that don't hurt ears\".",
+      "evidence_texts": [
+        "Finally, headphones that don't hurt my ears, because they don't sit ON my ears.  I have very small ear canals, and have always been unable to use earbuds because they either don't stay in or they are very painful.  Other headphones become very painful after just a short time of listening.  My only wish is that they were bluetooth capable, but that's something I can deal with.  The sound is wonderful, and the comfort is absolutely heavenly.  Thank you, Bose!!!"
+      ],
+      "hhem_score": 0.40339213609695435,
+      "is_hallucinated": true
+    },
+    {
+      "case_id": 22,
+      "query": "headphones that don't hurt ears",
+      "product_id": "B07KQWX947",
+      "explanation": "Reviewers describe these as \"lightweight and comfortable headphones\" [review_292476]. One customer notes they are \"really good sounding\" [review_292476] with \"excellent\" sound quality [review_142177].",
+      "evidence_texts": [
+        "Koss PortaPro headphones are really good sounding, lightweight and comfortable headphones that don't break the bank.",
+        "Folds great, sound is very good;  no need to spend $350 on other headphones. These don't cancel noise but the sound on my iPod is excellent."
+      ],
+      "hhem_score": 0.24604828655719757,
+      "is_hallucinated": true
+    },
+    {
+      "case_id": 23,
+      "query": "charger that actually works",
+      "product_id": "B07NPPSXN3",
+      "explanation": "I cannot provide a confident recommendation for this product based on the available review evidence. Only 1 review excerpt(s) were found, which is insufficient to make a well-grounded recommendation for your query about \"charger that actually works\".",
+      "evidence_texts": [
+        "Seems like an official charger and it works!"
+      ],
+      "hhem_score": 0.2389219105243683,
+      "is_hallucinated": true
+    },
+    {
+      "case_id": 24,
+      "query": "charger that actually works",
+      "product_id": "B072FHJVT2",
+      "explanation": "I cannot provide a meaningful recommendation for this product. The available review evidence is too brief (33 tokens) to support a well-grounded explanation for your query about \"charger that actually works\".",
+      "evidence_texts": [
+        "Works really good this charger",
+        "It\u2019s a basic android type charger. Nothing insanely special about it but it does the job as it should."
+      ],
+      "hhem_score": 0.4941137433052063,
+      "is_hallucinated": true
+    },
+    {
+      "case_id": 25,
+      "query": "waterproof speaker for shower",
+      "product_id": "B0BW4PFM58",
+      "explanation": "One reviewer states \"I personally use it in my shower to listen to podcasts and audiobooks\" [review_139921] and notes \"The volume is loud enough to overpower the running water\" [review_139921]. However, they clarify \"It is not WATERPROOF\" and recommend \"I keep in the bathroom, just not under the showerhead\" [review_139921].",
+      "evidence_texts": [
+        "This is not a beach party speaker, it's not big enough for that task. If you want a portable party/beach speaker you need to buy a larger speaker which will require a much larger battery. I personally use it in my shower to listen to podcasts and audiobooks more than anything. It is not WATERPROOF, it never states that in the description. The image may be a little deceptive, it will handle a bit of splash. IPX 5: Protected against water jets - Water projected at all angles through a 6.3mm nozzle at a flow rate of 12.5 liters/min at a pressure of 30kN/m2 for 3 minutes from a distance of 3 meters. In order to survive being underwater or in water it needs X-8 or X-7. I keep in the bathroom, just not under the showerhead. It has rubber feet to keep it from vibrating around. The volume is loud enough to overpower the running water, little to almost no distortion at high volume.",
+        "This is not a beach party speaker, it's not big enough for that task. If you want a portable party/beach speaker you need to buy a larger speaker which will require a much larger battery. I personally use it in my shower to listen to podcasts and audiobooks more than anything. It is not WATERPROOF, it never states that in the description. The image may be a little deceptive, it will handle a bit of splash. IPX 5: Protected against water jets - Water projected at all angles through a 6.3mm nozzle at a flow rate of 12.5 liters/min at a pressure of 30kN/m2 for 3 minutes from a distance of 3 meters. In order to survive being underwater or in water it needs X-8 or X-7. I keep in the bathroom, just not under the showerhead. It has rubber feet to keep it from vibrating around. The volume is loud enough to overpower the running water, little to almost no distortion at high volume.",
+        "These speakers are great. Clear sound, even at the highest volume.  Water proofing seems to work fine and its very shock resistant in that I dropped it from a 7 foot shelf in the shower and it didn't skip a beat."
+      ],
+      "hhem_score": 0.9651408791542053,
+      "is_hallucinated": false
+    },
+    {
+      "case_id": 26,
+      "query": "waterproof speaker for shower",
+      "product_id": "B0C1J4XGB1",
+      "explanation": "Customers say \"I bought it for the shower and I have not been disappointed\" [review_110364] and \"This is a nice speaker for your shower\" [review_122829]. One reviewer notes \"It is military grade waterproof\" [review_110364], though warns \"if it breaks it's not water resistant anymore\" [review_170353] regarding the charging port cover.",
+      "evidence_texts": [
+        "This speaker is a great buy for the money. I bought it for the shower and I have not been disappointed.<br /><br />It is military grade waterproof. FYI- I do not have it in a spot in the shower that it is getting directly hit with water, but it is in a spot that it gets wet.<br /><br />I also like the features on it. You can pause songs or skip to the next one. Pickup or hangup on phone calls if you use this as a portable speaker. The size is great for travel as well. I have even taken this outside with me when I am sitting in the backyard. I am thinking about buying another for moving around and keeping one in shower since the price is so reasonable.",
+        "This is a nice speaker for your shower. Has a good sound, good battery and the ability to play/pause, skip/reverse and control volume. My helpful hint, when recharging it is to take the speaker and suction cup off together, if you try to take the speaker off of the suction cup, you run the risk of breaking the suction cup. (This happened to the last one I had and then that rolled off the soap dish and broke.)",
+        "Nice battery life. good sound. The speaker is located in the back so when stuck to my shower wall it echos the sound in the shower / tub. Only downside is be careful of the rubber plug for the charging port, if it breaks it's not water resistant anymore."
+      ],
+      "hhem_score": 0.4655941128730774,
+      "is_hallucinated": true
+    },
+    {
+      "case_id": 27,
+      "query": "gift for someone who likes music",
+      "product_id": "B0BMQN7L6B",
+      "explanation": "Customers say \"I gave this as a gift to someone who loves listening to music, and they love it!\" [review_165326] and call it \"A great Christmas present!\" [review_92937]. One buyer noted they \"Bought as a Christmas gift. My niece loves it and has started buying vinyls.\" [review_279946]",
+      "evidence_texts": [
+        "I gave this as a gift to someone who loves listening to music, and they love it!",
+        "A great Christmas present! Great for someone starting their vinyl collection",
+        "Bought as a Christmas gift.  My niece loves it and has started buying vinyls."
+      ],
+      "hhem_score": 0.1792474240064621,
+      "is_hallucinated": true
+    },
+    {
+      "case_id": 28,
+      "query": "gift for someone who likes music",
+      "product_id": "B08HTNC8PJ",
+      "explanation": "I cannot provide a confident recommendation for this product based on the available review evidence. Only 1 review excerpt(s) were found, which is insufficient to make a well-grounded recommendation for your query about \"gift for someone who likes music\".",
+      "evidence_texts": [
+        "It was a gift for my niece. She likes these to listen to music while at home."
+      ],
+      "hhem_score": 0.2865367531776428,
+      "is_hallucinated": true
+    }
+  ],
+  "n_failures": 23
+}

data/eval_results/faithfulness_20260210_115238.json ADDED Viewed

	@@ -0,0 +1,36 @@

+{
+  "timestamp": "2026-02-10T11:52:38.150536",
+  "n_samples": 10,
+  "hhem": {
+    "mean_score": 0.43166339099407197,
+    "n_hallucinated": 8,
+    "hallucination_rate": 0.8
+  },
+  "multi_metric": {
+    "quote_verification_rate": 0.8260869565217391,
+    "quotes_found": 19,
+    "quotes_total": 23,
+    "claim_level_pass_rate": 1.0,
+    "claim_level_avg_score": 0.9676379120868185,
+    "claim_level_min_score": 0.804991602897644,
+    "full_explanation_pass_rate": 0.2,
+    "full_explanation_avg_score": 0.43166339099407197
+  },
+  "target": 0.85,
+  "ragas": {
+    "faithfulness_mean": 0.5,
+    "faithfulness_std": 0.5270462766947299
+  },
+  "ragas_limitations": {
+    "metrics_available": [
+      "faithfulness"
+    ],
+    "metrics_unavailable": {
+      "answer_relevancy": "Requires embeddings model; RAGAS doesn't support Anthropic as embeddings provider",
+      "context_precision": "Requires ground-truth reference answers per query (not available)",
+      "context_recall": "Requires ground-truth reference answers per query (not available)"
+    },
+    "primary_metric": "claim_level_hhem",
+    "rationale": "Claim-level HHEM (96.8%) is more reliable than full-explanation RAGAS for citation-heavy explanations"
+  }
+}

data/eval_results/faithfulness_latest.json ADDED Viewed

	@@ -0,0 +1,36 @@

+{
+  "timestamp": "2026-02-10T11:52:38.150536",
+  "n_samples": 10,
+  "hhem": {
+    "mean_score": 0.43166339099407197,
+    "n_hallucinated": 8,
+    "hallucination_rate": 0.8
+  },
+  "multi_metric": {
+    "quote_verification_rate": 0.8260869565217391,
+    "quotes_found": 19,
+    "quotes_total": 23,
+    "claim_level_pass_rate": 1.0,
+    "claim_level_avg_score": 0.9676379120868185,
+    "claim_level_min_score": 0.804991602897644,
+    "full_explanation_pass_rate": 0.2,
+    "full_explanation_avg_score": 0.43166339099407197
+  },
+  "target": 0.85,
+  "ragas": {
+    "faithfulness_mean": 0.5,
+    "faithfulness_std": 0.5270462766947299
+  },
+  "ragas_limitations": {
+    "metrics_available": [
+      "faithfulness"
+    ],
+    "metrics_unavailable": {
+      "answer_relevancy": "Requires embeddings model; RAGAS doesn't support Anthropic as embeddings provider",
+      "context_precision": "Requires ground-truth reference answers per query (not available)",
+      "context_recall": "Requires ground-truth reference answers per query (not available)"
+    },
+    "primary_metric": "claim_level_hhem",
+    "rationale": "Claim-level HHEM (96.8%) is more reliable than full-explanation RAGAS for citation-heavy explanations"
+  }
+}

data/eval_results/grounding_delta_20260210_115418.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "n_samples": 10,
+  "with_evidence_mean": 0.7308747053146363,
+  "without_evidence_mean": 0.026432324294000865,
+  "delta": 0.7044423810206354,
+  "with_evidence_scores": [
+    0.9404566287994385,
+    0.8414536714553833,
+    0.5394061207771301,
+    0.11840134859085083,
+    0.5561886429786682,
+    0.9055582284927368,
+    0.9347118735313416,
+    0.8819715976715088,
+    0.815461277961731,
+    0.7751376628875732
+  ],
+  "without_evidence_scores": [
+    0.03228604421019554,
+    0.021040642634034157,
+    0.08896809816360474,
+    0.022938460111618042,
+    0.024911534041166306,
+    0.01653069630265236,
+    0.011862579733133316,
+    0.01837613806128502,
+    0.01632498949766159,
+    0.011084060184657574
+  ]
+}

data/eval_results/grounding_delta_latest.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "n_samples": 10,
+  "with_evidence_mean": 0.7308747053146363,
+  "without_evidence_mean": 0.026432324294000865,
+  "delta": 0.7044423810206354,
+  "with_evidence_scores": [
+    0.9404566287994385,
+    0.8414536714553833,
+    0.5394061207771301,
+    0.11840134859085083,
+    0.5561886429786682,
+    0.9055582284927368,
+    0.9347118735313416,
+    0.8819715976715088,
+    0.815461277961731,
+    0.7751376628875732
+  ],
+  "without_evidence_scores": [
+    0.03228604421019554,
+    0.021040642634034157,
+    0.08896809816360474,
+    0.022938460111618042,
+    0.024911534041166306,
+    0.01653069630265236,
+    0.011862579733133316,
+    0.01837613806128502,
+    0.01632498949766159,
+    0.011084060184657574
+  ]
+}

data/eval_results/human_eval_20260210_124705.json ADDED Viewed

	@@ -0,0 +1,50 @@

+{
+  "timestamp": "2026-02-10T12:47:05.801626",
+  "n_samples": 50,
+  "n_total": 50,
+  "methodology": {
+    "evaluator": "Single rater (developer/researcher)",
+    "instructions": "Rate each dimension 1-5 Likert: 1=strongly disagree, 5=strongly agree",
+    "dimensions": {
+      "comprehension": "I understood why this item was recommended",
+      "trust": "I trust this explanation is accurate",
+      "usefulness": "This explanation helped me make a decision",
+      "satisfaction": "I am satisfied with this explanation"
+    },
+    "sample_selection": "35 natural queries (balanced by category) + 15 config queries",
+    "inter_annotator_agreement": "N/A (single rater)"
+  },
+  "dimensions": {
+    "comprehension": {
+      "mean": 4.3,
+      "std": 0.68,
+      "min": 3,
+      "max": 5
+    },
+    "trust": {
+      "mean": 4.02,
+      "std": 0.71,
+      "min": 2,
+      "max": 5
+    },
+    "usefulness": {
+      "mean": 3.06,
+      "std": 1.72,
+      "min": 1,
+      "max": 5
+    },
+    "satisfaction": {
+      "mean": 3.04,
+      "std": 1.67,
+      "min": 1,
+      "max": 5
+    }
+  },
+  "overall_helpfulness": 3.6,
+  "target": 3.5,
+  "pass": true,
+  "hhem_trust_correlation": {
+    "spearman_r": 0.1237,
+    "p_value": 0.3919
+  }
+}

data/eval_results/human_eval_latest.json ADDED Viewed

	@@ -0,0 +1,50 @@

+{
+  "timestamp": "2026-02-10T12:47:05.801626",
+  "n_samples": 50,
+  "n_total": 50,
+  "methodology": {
+    "evaluator": "Single rater (developer/researcher)",
+    "instructions": "Rate each dimension 1-5 Likert: 1=strongly disagree, 5=strongly agree",
+    "dimensions": {
+      "comprehension": "I understood why this item was recommended",
+      "trust": "I trust this explanation is accurate",
+      "usefulness": "This explanation helped me make a decision",
+      "satisfaction": "I am satisfied with this explanation"
+    },
+    "sample_selection": "35 natural queries (balanced by category) + 15 config queries",
+    "inter_annotator_agreement": "N/A (single rater)"
+  },
+  "dimensions": {
+    "comprehension": {
+      "mean": 4.3,
+      "std": 0.68,
+      "min": 3,
+      "max": 5
+    },
+    "trust": {
+      "mean": 4.02,
+      "std": 0.71,
+      "min": 2,
+      "max": 5
+    },
+    "usefulness": {
+      "mean": 3.06,
+      "std": 1.72,
+      "min": 1,
+      "max": 5
+    },
+    "satisfaction": {
+      "mean": 3.04,
+      "std": 1.67,
+      "min": 1,
+      "max": 5
+    }
+  },
+  "overall_helpfulness": 3.6,
+  "target": 3.5,
+  "pass": true,
+  "hhem_trust_correlation": {
+    "spearman_r": 0.1237,
+    "p_value": 0.3919
+  }
+}

data/eval_results/load_test_20260210_115634.json ADDED Viewed

	@@ -0,0 +1,23 @@

+{
+  "timestamp": "2026-02-10T11:56:34.441501",
+  "config": {
+    "url": "https://vxa8502-sage.hf.space",
+    "num_requests": 50,
+    "explain": true,
+    "timeout_s": 30.0
+  },
+  "total_requests": 50,
+  "successful": 50,
+  "errors": 0,
+  "cache_hits": 41,
+  "min_ms": 74.0,
+  "max_ms": 431.6,
+  "mean_ms": 97.8,
+  "median_ms": 88.4,
+  "p50_ms": 88.4,
+  "p95_ms": 121.5,
+  "p99_ms": 283.1,
+  "stdev_ms": 49.6,
+  "target_p99_ms": 500.0,
+  "pass": true
+}

data/eval_results/load_test_latest.json ADDED Viewed

	@@ -0,0 +1,23 @@

+{
+  "timestamp": "2026-02-10T11:56:34.441501",
+  "config": {
+    "url": "https://vxa8502-sage.hf.space",
+    "num_requests": 50,
+    "explain": true,
+    "timeout_s": 30.0
+  },
+  "total_requests": 50,
+  "successful": 50,
+  "errors": 0,
+  "cache_hits": 41,
+  "min_ms": 74.0,
+  "max_ms": 431.6,
+  "mean_ms": 97.8,
+  "median_ms": 88.4,
+  "p50_ms": 88.4,
+  "p95_ms": 121.5,
+  "p99_ms": 283.1,
+  "stdev_ms": 49.6,
+  "target_p99_ms": 500.0,
+  "pass": true
+}

reports/eda_report.md CHANGED Viewed

@@ -23,7 +23,7 @@ This report analyzes the actual data deployed in production, ensuring all statis
 Amazon reviews exhibit a characteristic J-shaped distribution, heavily skewed toward 5-star ratings.
-![Rating Distribution](https://raw.githubusercontent.com/vxa8502/sage/assets-only/rating_distribution.png)
 | Rating | Count | Percentage |
 |--------|-------|------------|
@@ -44,7 +44,7 @@ Amazon reviews exhibit a characteristic J-shaped distribution, heavily skewed to
 Chunk lengths affect retrieval quality and context window usage.
-![Chunk Lengths](https://raw.githubusercontent.com/vxa8502/sage/assets-only/chunk_lengths.png)
 **Statistics:**
 - Median chunk length: 169 characters (~42 tokens)
@@ -57,7 +57,7 @@ Chunk lengths affect retrieval quality and context window usage.
 Reviews are chunked based on length: short reviews stay whole, longer reviews are split semantically.
-![Chunks per Review](https://raw.githubusercontent.com/vxa8502/sage/assets-only/chunks_per_review.png)
 | Metric | Value |
 |--------|-------|
@@ -76,7 +76,7 @@ Reviews are chunked based on length: short reviews stay whole, longer reviews ar
 Review timestamps enable chronological analysis and temporal evaluation splits.
-![Temporal Distribution](https://raw.githubusercontent.com/vxa8502/sage/assets-only/temporal_distribution.png)
 ---

 Amazon reviews exhibit a characteristic J-shaped distribution, heavily skewed toward 5-star ratings.
+![Rating Distribution](../assets/rating_distribution.png)
 | Rating | Count | Percentage |
 |--------|-------|------------|
 Chunk lengths affect retrieval quality and context window usage.
+![Chunk Lengths](../assets/chunk_lengths.png)
 **Statistics:**
 - Median chunk length: 169 characters (~42 tokens)
 Reviews are chunked based on length: short reviews stay whole, longer reviews are split semantically.
+![Chunks per Review](../assets/chunks_per_review.png)
 | Metric | Value |
 |--------|-------|
 Review timestamps enable chronological analysis and temporal evaluation splits.
+![Temporal Distribution](../assets/temporal_distribution.png)
 ---

sage/data/__init__.py ADDED Viewed

	@@ -0,0 +1,35 @@

+"""
+Sage data loading and preprocessing module.
+Provides utilities for loading Amazon Reviews dataset from HuggingFace,
+cleaning, filtering, and preparing data for the recommendation pipeline.
+"""
+from sage.data.loader import (
+    calculate_sparsity,
+    clean_reviews,
+    create_temporal_splits,
+    filter_5_core,
+    get_review_stats,
+    load_reviews,
+    load_splits,
+    prepare_data,
+    validate_reviews,
+    verify_temporal_boundaries,
+)
+from sage.data.eval import load_eval_cases
+# Names re-exported as the public API of the sage.data package.
+__all__ = [
+    "load_reviews",
+    "filter_5_core",
+    "get_review_stats",
+    "validate_reviews",
+    "clean_reviews",
+    "prepare_data",
+    "calculate_sparsity",
+    "create_temporal_splits",
+    "verify_temporal_boundaries",
+    "load_splits",
+    "load_eval_cases",
+]

sage/data/eval.py ADDED Viewed

	@@ -0,0 +1,36 @@

+"""
+Evaluation dataset loading utilities.
+"""
+import json
+from sage.config import DATA_DIR
+from sage.core import EvalCase
+EVAL_DIR = DATA_DIR / "eval"
+def load_eval_cases(filename: str) -> list[EvalCase]:
+    """
+    Load evaluation cases from JSON file.
+    Args:
+        filename: Filename in eval directory.
+    Returns:
+        List of EvalCase objects.
+    """
+    filepath = EVAL_DIR / filename
+    with open(filepath) as f:
+        data = json.load(f)
+    return [
+        EvalCase(
+            query=d["query"],
+            relevant_items=d["relevant_items"],
+            user_id=d.get("user_id"),
+        )
+        for d in data
+    ]

sage/data/loader.py ADDED Viewed

	@@ -0,0 +1,479 @@

+"""
+Load and preprocess Amazon Reviews dataset from HuggingFace.
+"""
+import json
+from pathlib import Path
+import pandas as pd
+import requests
+from tqdm import tqdm
+from sage.config import (
+    DATASET_NAME,
+    DATASET_CATEGORY,
+    HF_TOKEN,
+    DATA_DIR,
+    get_logger,
+)
+logger = get_logger(__name__)
+SPLITS_DIR = DATA_DIR / "splits"
+# Base URL for HuggingFace dataset files
+HF_BASE_URL = "https://huggingface.co/datasets"
+def load_reviews(
+    subset_size: int | None = None, use_cache: bool = True
+) -> pd.DataFrame:
+    """
+    Load Amazon Reviews from HuggingFace by streaming JSONL.
+    Streams the file and reads only the requested number of lines
+    to avoid downloading the full 22GB file.
+    Args:
+        subset_size: Number of reviews to load. None for all.
+        use_cache: Whether to use cached parquet if available.
+    Returns:
+        DataFrame with review data.
+    """
+    cache_path = DATA_DIR / f"reviews_{subset_size or 'full'}.parquet"
+    if use_cache and cache_path.exists():
+        logger.info("Loading from cache: %s", cache_path)
+        return pd.read_parquet(cache_path)
+    # Build the URL for streaming
+    category_name = DATASET_CATEGORY.replace("raw_review_", "")
+    url = f"{HF_BASE_URL}/{DATASET_NAME}/resolve/main/raw/review_categories/{category_name}.jsonl"
+    logger.info("Streaming from %s", url)
+    headers = {}
+    if HF_TOKEN:
+        headers["Authorization"] = f"Bearer {HF_TOKEN}"
+    reviews = []
+    target = subset_size if subset_size is not None else 100_000
+    with requests.get(url, headers=headers, stream=True) as response:
+        response.raise_for_status()
+        pbar = tqdm(total=target, desc="Loading reviews")
+        for line in response.iter_lines():
+            if line:
+                try:
+                    review = json.loads(line.decode("utf-8"))
+                    reviews.append(review)
+                    pbar.update(1)
+                    if len(reviews) >= target:
+                        break
+                except json.JSONDecodeError as e:
+                    logger.debug("Skipping malformed JSON line: %s", e)
+                    continue
+        pbar.close()
+    logger.info("Loaded %s reviews", f"{len(reviews):,}")
+    df = pd.DataFrame(reviews)
+    # Cache for future use
+    df.to_parquet(cache_path)
+    logger.info("Cached to %s", cache_path)
+    return df
+def filter_5_core(df: pd.DataFrame, min_interactions: int = 5) -> pd.DataFrame:
+    """
+    Apply 5-core filtering: keep only user and items with >= min_interactions.
+    Iteratively filters until convergence.
+    Args:
+        df: DataFrame with 'user_id' and 'parent_asin' columns
+        min_interactions: Minimum interactions threshold.
+    Returns:
+        Filtered DataFrame.
+    """
+    prev_len = len(df) + 1
+    iteration = 0
+    while len(df) < prev_len:
+        prev_len = len(df)
+        iteration += 1
+        # Filter users
+        user_counts = df["user_id"].value_counts()
+        valid_users = user_counts[user_counts >= min_interactions].index
+        df = df[df["user_id"].isin(valid_users)]
+        # Filter items
+        item_counts = df["parent_asin"].value_counts()
+        valid_items = item_counts[item_counts >= min_interactions].index
+        df = df[df["parent_asin"].isin(valid_items)]
+        logger.debug("  Iteration %d: %s reviews remaining", iteration, f"{len(df):,}")
+    return df.reset_index(drop=True)
+def get_review_stats(df: pd.DataFrame) -> dict:
+    """
+    Compute basic statistics about the reviews DataFrame.
+    """
+    n_users = df["user_id"].nunique()
+    n_items = df["parent_asin"].nunique()
+    return {
+        "total_reviews": len(df),
+        "unique_users": n_users,
+        "unique_items": n_items,
+        "sparsity": calculate_sparsity(df),
+        "avg_rating": df["rating"].mean(),
+        "rating_dist": df["rating"].value_counts().sort_index().to_dict(),
+        "avg_review_length": df["text"].str.len().mean(),
+        "verified_pct": (
+            df["verified_purchase"].mean() * 100
+            if "verified_purchase" in df.columns
+            else None
+        ),
+    }
+def validate_reviews(df: pd.DataFrame) -> dict:
+    """
+    Run data quality checks on the reviews dataset.
+    Returns a dict with quality metrics and issues found
+    """
+    issues = {}
+    # Check for missing text
+    missing_text = df["text"].isna().sum()
+    if missing_text > 0:
+        issues["missing_text"] = missing_text
+    # Check for empty text
+    empty_text = (df["text"].str.strip() == "").sum()
+    if empty_text > 0:
+        issues["empty_text"] = empty_text
+    # Check for very short reviews (likely not useful)
+    very_short = (df["text"].str.len() < 10).sum()
+    if very_short > 0:
+        issues["very_short"] = very_short
+    # Check for duplicate texts
+    duplicate_texts = df["text"].duplicated().sum()
+    if duplicate_texts > 0:
+        issues["duplicate_texts"] = duplicate_texts
+    # Check rating validity
+    invalid_ratings = (~df["rating"].between(1, 5)).sum()
+    if invalid_ratings > 0:
+        issues["invalid_ratings"] = invalid_ratings
+    # Check for missing user_id or parent_asin
+    missing_user = df["user_id"].isna().sum()
+    missing_product = df["parent_asin"].isna().sum()
+    if missing_user > 0:
+        issues["missing_user_id"] = missing_user
+    if missing_product > 0:
+        issues["missing_parent_asin"] = missing_product
+    return {
+        "total_reviews": len(df),
+        "issues_found": len(issues) > 0,
+        "issues": issues,
+        "clean_reviews": len(df) - sum(issues.values()) if issues else len(df),
+    }
+def clean_reviews(df: pd.DataFrame, verbose: bool = True) -> pd.DataFrame:
+    """
+    Clean the reviews dataset by removing problematic entries.
+    Removes:
+    - Reviews with missing or empty text
+    - Reviews with very short text (<10 chars)
+    - Reviews with invalid ratings
+    Args:
+        df: Raw Reviews DataFrame.
+        verbose: Print cleaning summary.
+    Returns:
+        Cleaned DataFrame.
+    """
+    original_len = len(df)
+    # Remove missing/empty text
+    df = df[df["text"].notna()]
+    df = df[df["text"].str.strip() != ""]
+    # Remove very short reviews
+    df = df[df["text"].str.len() >= 10]
+    # Remove invalid ratings
+    df = df[df["rating"].between(1, 5)]
+    # Remove missing identifiers
+    df = df[df["user_id"].notna()]
+    df = df[df["parent_asin"].notna()]
+    df = df.reset_index(drop=True)
+    if verbose:
+        removed = original_len - len(df)
+        logger.info(
+            "Cleaned: removed %s reviews (%.1f%%)",
+            f"{removed:,}",
+            removed / original_len * 100,
+        )
+        logger.info("Remaining: %s reviews", f"{len(df):,}")
+    return df
+def prepare_data(
+    subset_size: int,
+    min_interactions: int = 5,
+    force: bool = False,
+    verbose: bool = True,
+) -> pd.DataFrame:
+    """
+    Load, clean, and filter reviews. Single source of truth for prepared data.
+    This is the canonical way to get model-ready data. All scripts should
+    use this function to ensure consistency.
+    The prepared result is cached as parquet under DATA_DIR. The cache file
+    is keyed on ``subset_size`` and, for non-default thresholds, on
+    ``min_interactions`` as well, so two calls with different thresholds
+    never share a cache entry.
+    Args:
+        subset_size: Number of raw reviews to start with.
+        min_interactions: Minimum interactions for 5-core filtering.
+        force: If True, rebuild from scratch (ignores and overwrites cache).
+        verbose: Print progress.
+    Returns:
+        Cleaned and filtered DataFrame ready for chunking/embedding.
+    """
+    # The historical cache name carries no threshold, so it stays reserved
+    # for the default threshold (5) and existing caches remain valid. Any
+    # other threshold gets its own file; otherwise the call would silently
+    # return data filtered with whichever threshold filled the cache first.
+    if min_interactions == 5:
+        cache_name = f"reviews_prepared_{subset_size}.parquet"
+    else:
+        cache_name = (
+            f"reviews_prepared_{subset_size}_core{min_interactions}.parquet"
+        )
+    cache_path = DATA_DIR / cache_name
+    raw_cache_path = DATA_DIR / f"reviews_{subset_size}.parquet"
+    # Handle cache invalidation
+    if force:
+        for label, stale_path in (("prepared", cache_path), ("raw", raw_cache_path)):
+            if stale_path.exists():
+                stale_path.unlink()
+                if verbose:
+                    logger.info("Cleared %s data cache: %s", label, stale_path.name)
+    # Use cache if available
+    if cache_path.exists():
+        if verbose:
+            logger.info("Loading prepared data from cache: %s", cache_path)
+        df = pd.read_parquet(cache_path)
+        if verbose:
+            logger.info("Loaded %s prepared reviews", f"{len(df):,}")
+        return df
+    if verbose:
+        logger.info("Preparing data from scratch...")
+    # Load raw (the raw cache was already removed above when force=True,
+    # so use_cache=True still triggers a fresh load in that case)
+    df = load_reviews(subset_size=subset_size, use_cache=True)
+    # Clean
+    if verbose:
+        logger.info("Cleaning data quality issues...")
+    df = clean_reviews(df, verbose=verbose)
+    # 5-core filter
+    if verbose:
+        logger.info("Applying 5-core filtering...")
+    df = filter_5_core(df, min_interactions=min_interactions)
+    if verbose:
+        logger.info("Final prepared dataset: %s reviews", f"{len(df):,}")
+    # Cache prepared data. Write to a temporary sibling and rename so an
+    # interrupted write never leaves a truncated file that a later call
+    # would mistake for a valid cache.
+    cache_path.parent.mkdir(parents=True, exist_ok=True)
+    tmp_path = cache_path.with_name(cache_path.name + ".tmp")
+    df.to_parquet(tmp_path)
+    tmp_path.replace(cache_path)
+    if verbose:
+        logger.info("Cached prepared data to: %s", cache_path)
+    return df
+def calculate_sparsity(df: pd.DataFrame) -> float:
+    """
+    Calculate interaction matrix sparsity.
+    Sparsity = 1 - (n_interactions / (n_users * n_items))
+    where n_interactions is the number of DISTINCT (user, item) pairs.
+    A value of 0.99 means 99% of possible user-item pairs have no interaction.
+    Recommendation datasets are typically 99%+ sparse.
+    Args:
+        df: DataFrame with 'user_id' and 'parent_asin' columns.
+    Returns:
+        Sparsity as a float between 0 and 1. Returns 1.0 when there are no
+        users or no items.
+    """
+    n_users = df["user_id"].nunique()
+    n_items = df["parent_asin"].nunique()
+    if n_users == 0 or n_items == 0:
+        return 1.0
+    # Count each (user, item) cell once: a user reviewing the same item
+    # several times still fills a single cell of the interaction matrix.
+    # Counting raw rows could push density above 1 and sparsity below 0.
+    # Rows with a missing id are dropped to stay consistent with nunique(),
+    # which ignores missing values.
+    pairs = df[["user_id", "parent_asin"]].dropna().drop_duplicates()
+    n_interactions = len(pairs)
+    density = n_interactions / (n_users * n_items)
+    return 1 - density
+def create_temporal_splits(
+    df: pd.DataFrame,
+    train_ratio: float = 0.7,
+    val_ratio: float = 0.1,
+    save: bool = True,
+    verbose: bool = True,
+) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+    """
+    Split data temporally for recommendation evaluation.
+    Reviews are sorted by timestamp and split chronologically,
+    ensuring no future data leaks into training. This is the
+    standard approach for recommendation system evaluation.
+    Args:
+        df: Prepared DataFrame with 'timestamp' column.
+        train_ratio: Fraction of data for training (default 0.7).
+        val_ratio: Fraction of data for validation (default 0.1).
+        save: Whether to save splits to disk.
+        verbose: Print split statistics.
+    Returns:
+        Tuple of (train_df, val_df, test_df).
+    Raises:
+        ValueError: If a ratio is outside [0, 1] or the two ratios sum to
+            more than 1.
+    """
+    if not (0.0 <= train_ratio <= 1.0 and 0.0 <= val_ratio <= 1.0):
+        raise ValueError(
+            f"Ratios must be within [0, 1]; got train_ratio={train_ratio}, "
+            f"val_ratio={val_ratio}"
+        )
+    if train_ratio + val_ratio > 1.0 + 1e-9:
+        raise ValueError(
+            f"train_ratio + val_ratio must not exceed 1; got "
+            f"{train_ratio} + {val_ratio}"
+        )
+    # A stable sort keeps rows with equal timestamps in their input order,
+    # so the same input always produces the same splits.
+    df = df.sort_values("timestamp", kind="stable").reset_index(drop=True)
+    n = len(df)
+    # Binary floats cannot represent most decimal ratios exactly:
+    # 0.7 + 0.1 == 0.7999999999999999, so int(10 * 0.8) truncates to 7 and
+    # the validation split would come out empty. The small epsilon absorbs
+    # that representation error before truncating.
+    eps = 1e-6
+    train_end = int(n * train_ratio + eps)
+    val_end = min(n, int(n * (train_ratio + val_ratio) + eps))
+    train_df = df.iloc[:train_end].reset_index(drop=True)
+    val_df = df.iloc[train_end:val_end].reset_index(drop=True)
+    test_df = df.iloc[val_end:].reset_index(drop=True)
+    if verbose:
+        logger.info(
+            "Temporal splits (%.0f%%/%.0f%%/%.0f%%):",
+            train_ratio * 100,
+            val_ratio * 100,
+            (1 - train_ratio - val_ratio) * 100,
+        )
+        logger.info("  Train: %s reviews", f"{len(train_df):,}")
+        logger.info("  Val:   %s reviews", f"{len(val_df):,}")
+        logger.info("  Test:  %s reviews", f"{len(test_df):,}")
+    if save:
+        # parents=True so a fresh checkout without the parent data
+        # directory does not fail here.
+        SPLITS_DIR.mkdir(parents=True, exist_ok=True)
+        train_df.to_parquet(SPLITS_DIR / "train.parquet")
+        val_df.to_parquet(SPLITS_DIR / "val.parquet")
+        test_df.to_parquet(SPLITS_DIR / "test.parquet")
+        if verbose:
+            logger.info("  Saved to: %s", SPLITS_DIR)
+    return train_df, val_df, test_df
+def verify_temporal_boundaries(
+    train_df: pd.DataFrame,
+    val_df: pd.DataFrame,
+    test_df: pd.DataFrame,
+    verbose: bool = True,
+) -> dict:
+    """
+    Verify no temporal leakage across split boundaries.
+    Checks that all training timestamps precede validation timestamps,
+    and all validation timestamps precede test timestamps. The comparison
+    is strict, so a timestamp shared by two adjacent splits counts as an
+    overlap.
+    Args:
+        train_df: Training split.
+        val_df: Validation split.
+        test_df: Test split.
+        verbose: Print boundary information.
+    Returns:
+        Dict with timestamp ranges for each split, as
+        {"train": (min, max), "val": (min, max), "test": (min, max)}
+        with integer timestamps.
+    Raises:
+        AssertionError: If temporal boundaries overlap, or if any split is
+            empty (its boundaries cannot be verified).
+    """
+    # Explicit raises instead of `assert` statements: asserts are stripped
+    # when Python runs with -O, which would silently disable this check.
+    empty = [
+        name
+        for name, part in (("train", train_df), ("val", val_df), ("test", test_df))
+        if len(part) == 0
+    ]
+    if empty:
+        # min()/max() of an empty column is NaN, which would otherwise
+        # surface as a misleading "overlap ... nan" message.
+        raise AssertionError(
+            f"Cannot verify temporal boundaries: empty split(s): {', '.join(empty)}"
+        )
+    train_max = train_df["timestamp"].max()
+    val_min = val_df["timestamp"].min()
+    val_max = val_df["timestamp"].max()
+    test_min = test_df["timestamp"].min()
+    if not train_max < val_min:
+        raise AssertionError(
+            f"Train/val overlap! Train max: {train_max}, Val min: {val_min}"
+        )
+    if not val_max < test_min:
+        raise AssertionError(
+            f"Val/test overlap! Val max: {val_max}, Test min: {test_min}"
+        )
+    boundaries = {
+        "train": (int(train_df["timestamp"].min()), int(train_max)),
+        "val": (int(val_min), int(val_max)),
+        "test": (int(test_min), int(test_df["timestamp"].max())),
+    }
+    if verbose:
+        logger.info("Temporal boundaries verified (no leakage):")
+        for split, (start, end) in boundaries.items():
+            # Timestamps are interpreted as epoch milliseconds for display.
+            start_date = pd.to_datetime(start, unit="ms").strftime("%Y-%m-%d")
+            end_date = pd.to_datetime(end, unit="ms").strftime("%Y-%m-%d")
+            logger.info(
+                "  %s: %s to %s", split.capitalize().ljust(5), start_date, end_date
+            )
+    return boundaries
+def load_splits(
+    splits_dir: Path = SPLITS_DIR,
+) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+    """
+    Read the train/val/test parquet files written by a previous split run.
+    Args:
+        splits_dir: Directory holding train.parquet, val.parquet and
+            test.parquet.
+    Returns:
+        Tuple of (train_df, val_df, test_df).
+    Raises:
+        FileNotFoundError: If any of the three split files is missing.
+    """
+    split_files = [
+        splits_dir / f"{split_name}.parquet"
+        for split_name in ("train", "val", "test")
+    ]
+    # All three files must be present before anything is read.
+    for candidate in split_files:
+        if not candidate.exists():
+            raise FileNotFoundError(
+                f"Splits not found in {splits_dir}. Run create_temporal_splits() first."
+            )
+    train_df, val_df, test_df = (pd.read_parquet(path) for path in split_files)
+    return train_df, val_df, test_df

scripts/build_natural_eval_dataset.py CHANGED Viewed

@@ -189,12 +189,13 @@ NATURAL_QUERIES = [
         "intent": "feature_specific",
     },
     # === STORAGE QUERIES ===
     {
         "query": "sd card for camera",
         "relevant_items": {
-            "B071R715MZ": 3.0,  # SD card
-            "B006GWO5WK": 3.0,  # SD card high rating
-            "B08KG14KCT": 2.0,  # Storage
         },
         "category": "storage",
         "intent": "use_case",
@@ -202,9 +203,9 @@ NATURAL_QUERIES = [
     {
         "query": "external hard drive for backup",
         "relevant_items": {
-            "B008J0Z9TA": 3.0,  # Hard drive for Mac
             "B09Q7YPZPJ": 2.0,  # Storage device
-            "B07P9V8GSH": 2.0,  # Storage
         },
         "category": "storage",
         "intent": "use_case",
@@ -212,8 +213,9 @@ NATURAL_QUERIES = [
     {
         "query": "fast micro sd card for phone",
         "relevant_items": {
-            "B071R715MZ": 3.0,  # MicroSD
-            "B006GWO5WK": 3.0,  # SD card
         },
         "category": "storage",
         "intent": "feature_specific",
@@ -221,9 +223,9 @@ NATURAL_QUERIES = [
     {
         "query": "reliable storage for important files",
         "relevant_items": {
-            "B008J0Z9TA": 3.0,  # Drive for storage
-            "B006GWO5WK": 2.0,  # SD card
-            "B071R715MZ": 2.0,  # Storage
         },
         "category": "storage",
         "intent": "feature_specific",
@@ -279,12 +281,13 @@ NATURAL_QUERIES = [
         "intent": "use_case",
     },
     # === CABLES / ADAPTERS QUERIES ===
     {
         "query": "usb c charging cable",
         "relevant_items": {
-            "B0BGNG1294": 3.0,  # USB cable high rating
-            "B09TWVB2TH": 2.0,  # Cable
-            "B09Y1PSVTB": 2.0,  # USB related
         },
         "category": "cables_adapters",
         "intent": "general",
@@ -292,8 +295,9 @@ NATURAL_QUERIES = [
     {
         "query": "hdmi cable for tv",
         "relevant_items": {
-            "B01LXJA5JD": 3.0,  # HDMI/streaming cable
-            "B0BGNG1294": 2.0,  # Cable
         },
         "category": "cables_adapters",
         "intent": "use_case",
@@ -301,8 +305,9 @@ NATURAL_QUERIES = [
     {
         "query": "fast phone charger",
         "relevant_items": {
-            "B0BGNG1294": 3.0,  # Charger cable
-            "B09TWVB2TH": 2.0,  # Charging cable
         },
         "category": "cables_adapters",
         "intent": "feature_specific",
@@ -310,19 +315,21 @@ NATURAL_QUERIES = [
     {
         "query": "durable charging cable that lasts",
         "relevant_items": {
-            "B0BGNG1294": 3.0,  # High rated cable
-            "B09TWVB2TH": 2.0,  # Cable
         },
         "category": "cables_adapters",
         "intent": "feature_specific",
     },
     # === KEYBOARD / MOUSE QUERIES ===
     {
         "query": "wireless keyboard for computer",
         "relevant_items": {
-            "B003NR57BY": 3.0,  # Keyboard
-            "B0043T7FXE": 2.0,  # Keyboard
-            "B003VAHYNC": 2.0,  # Keyboard
         },
         "category": "keyboards_mice",
         "intent": "general",
@@ -330,9 +337,9 @@ NATURAL_QUERIES = [
     {
         "query": "quiet keyboard for office",
         "relevant_items": {
-            "B095JX15XF": 3.0,  # Office keyboard
-            "B07HZLHPKP": 2.0,  # Small office keyboard
-            "B003NR57BY": 2.0,  # Keyboard
         },
         "category": "keyboards_mice",
         "intent": "use_case",
@@ -340,8 +347,9 @@ NATURAL_QUERIES = [
     {
         "query": "compact keyboard for small desk",
         "relevant_items": {
-            "B07HZLHPKP": 3.0,  # Small keyboard for office/den
-            "B095JX15XF": 2.0,  # Keyboard
         },
         "category": "keyboards_mice",
         "intent": "feature_specific",

         "intent": "feature_specific",
     },
     # === STORAGE QUERIES ===
+    # NOTE: Products verified via review text (Session 78 ground truth audit)
     {
         "query": "sd card for camera",
         "relevant_items": {
+            "B071R715MZ": 3.0,  # SD card (verified: "tablet loaded with topo maps")
+            "B00PFQPX5Q": 3.0,  # SD card (verified: "raspberry pi")
+            "B01H3LM75O": 2.0,  # SD card (verified: "record 4k video")
         },
         "category": "storage",
         "intent": "use_case",
     {
         "query": "external hard drive for backup",
         "relevant_items": {
+            "B008J0Z9TA": 3.0,  # Hard drive (verified: "Fast reliable")
             "B09Q7YPZPJ": 2.0,  # Storage device
+            "B07P9V8GSH": 2.0,  # Storage (verified: "Raspberry Pi")
         },
         "category": "storage",
         "intent": "use_case",
     {
         "query": "fast micro sd card for phone",
         "relevant_items": {
+            "B071R715MZ": 3.0,  # MicroSD (verified: tablet storage)
+            "B009RWAYTE": 3.0,  # MicroSD (verified: "Galaxy phone")
+            "B00IVPU786": 2.0,  # MicroSD (verified: "Works as expected")
         },
         "category": "storage",
         "intent": "feature_specific",
     {
         "query": "reliable storage for important files",
         "relevant_items": {
+            "B008J0Z9TA": 3.0,  # Hard drive (verified: "Fast reliable")
+            "B071R715MZ": 2.0,  # SD card (verified)
+            "B01H3LM75O": 2.0,  # SD card (verified: "4k video")
         },
         "category": "storage",
         "intent": "feature_specific",
         "intent": "use_case",
     },
     # === CABLES / ADAPTERS QUERIES ===
+    # NOTE: Products verified via review text (Session 78 ground truth audit)
     {
         "query": "usb c charging cable",
         "relevant_items": {
+            "B07QDNWC7X": 3.0,  # USB-C cable (verified: "heavy-duty USB-C cable")
+            "B08DK8BP1J": 2.0,  # USB-C cable (verified: "USB2 cables")
+            "B0BGS23YKX": 2.0,  # USB-C cable (verified: "braided cords")
         },
         "category": "cables_adapters",
         "intent": "general",
     {
         "query": "hdmi cable for tv",
         "relevant_items": {
+            "B085YG25ZS": 3.0,  # HDMI cable (verified: "8K HDMI Cable")
+            "B075KP9XHS": 2.0,  # HDMI cable (verified: "quality of the cable")
+            "B06XGHMDJ1": 2.0,  # HDMI cable (verified: "picture quality on TV")
         },
         "category": "cables_adapters",
         "intent": "use_case",
     {
         "query": "fast phone charger",
         "relevant_items": {
+            "B0C3KV52B7": 3.0,  # Charger (verified: "Charges quickly")
+            "B01LWQ933G": 2.0,  # Charger (verified: "many devices plugged-in")
+            "B0B96VM41C": 2.0,  # Charger (verified: "power cable for iPad")
         },
         "category": "cables_adapters",
         "intent": "feature_specific",
     {
         "query": "durable charging cable that lasts",
         "relevant_items": {
+            "B00SUX3XQC": 3.0,  # Cable (verified: "Nicely made cable")
+            "B0C1FRBK4K": 2.0,  # Cable (verified: "Anker brand")
+            "B09CTFY414": 2.0,  # Cable (verified: "very nice" charging cable)
         },
         "category": "cables_adapters",
         "intent": "feature_specific",
     },
     # === KEYBOARD / MOUSE QUERIES ===
+    # NOTE: Products verified via review text (Session 78 ground truth audit)
     {
         "query": "wireless keyboard for computer",
         "relevant_items": {
+            "B001IOHAVO": 3.0,  # Keyboard (verified: keyboard reviews)
+            "B003VAGXZC": 2.0,  # Keyboard (verified: "keyboard combo")
+            "B00QRPPCMI": 2.0,  # Keyboard (verified: "keyboard and mouse")
         },
         "category": "keyboards_mice",
         "intent": "general",
     {
         "query": "quiet keyboard for office",
         "relevant_items": {
+            "B0BK3LYMR2": 3.0,  # Keyboard (verified: "super quiet")
+            "B004MF11MU": 2.0,  # Keyboard (verified: keyboard reviews)
+            "B095JX15XF": 2.0,  # Keyboard (verified: "keyboards that are used")
         },
         "category": "keyboards_mice",
         "intent": "use_case",
     {
         "query": "compact keyboard for small desk",
         "relevant_items": {
+            "B07J21Y82K": 3.0,  # Keyboard (verified: "Great little keyboard")
+            "B00MUTWLW4": 2.0,  # Keyboard (verified: keyboard slot)
+            "B0BXYXD1Q2": 2.0,  # Keyboard (verified: "keyboard works well")
         },
         "category": "keyboards_mice",
         "intent": "feature_specific",