vxa8502 committed on
Commit
ae4342a
·
1 Parent(s): 1f0ea41

Fix eval workflow and align README metrics with eval_results JSON

Browse files
.gitignore CHANGED
@@ -6,8 +6,20 @@ __pycache__/
6
  *.pyc
7
  .Python
8
 
9
- # Data (too large for git)
10
- data/
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
  # IDE
13
  .vscode/
 
6
  *.pyc
7
  .Python
8
 
9
+ # Data (large files - ignore selectively)
10
+ # Use /data/ prefix to ONLY match top-level data/, not sage/data/
11
+ /data/splits/
12
+ /data/cache/
13
+ /data/qdrant_storage/
14
+ /data/figures/
15
+ /data/eval/
16
+ /data/human_eval/
17
+
18
+ # But TRACK eval results (small JSONs, source of truth for metrics)
19
+ !/data/eval_results/
20
+
21
+ # Assets (PNGs break HF deployment - hosted on assets-only branch)
22
+ assets/
23
 
24
  # IDE
25
  .vscode/
Makefile CHANGED
@@ -1,4 +1,4 @@
1
- .PHONY: all setup data data-validate eval eval-all eval-full eval-quick demo demo-interview reset reset-eval reset-hard check-env qdrant-up qdrant-down qdrant-status eda serve serve-dev docker-build docker-run deploy-info deploy-health human-eval-generate human-eval human-eval-analyze fmt test lint typecheck ci info summary metrics-snapshot health load-test load-test-quick kaggle-test help
2
 
3
  # ---------------------------------------------------------------------------
4
  # Configurable Variables (override: make demo QUERY="gaming mouse")
@@ -165,45 +165,29 @@ eval-all: check-env
165
  python scripts/summary.py && \
166
  echo "=== COMPLETE EVALUATION DONE ==="
167
 
168
- # Full reproducibility: eval-all + human eval + load test (~15 min + ~1 hour manual)
 
169
  # Run after: make reset-eval
170
- # This is the COMPLETE evaluation for portfolio close-out
171
  eval-full: check-env
172
  @echo "=== FULL REPRODUCIBLE EVALUATION ===" && \
173
  echo "" && \
174
  echo "=== PART 1: AUTOMATED METRICS (~15 min) ===" && \
175
  $(MAKE) eval-all && \
176
  echo "" && \
177
- echo "=== PART 2: HUMAN EVALUATION ===" && \
178
- echo "" && \
179
- echo "--- Generating 50 samples ---" && \
180
- python scripts/human_eval.py --generate --force && \
181
- echo "" && \
182
- echo "--- Interactive annotation (50 samples, ~1 hour) ---" && \
183
- echo "Rate each sample 1-5 on: comprehension, trust, usefulness, satisfaction" && \
184
- echo "Press Ctrl+C to pause and resume later with 'make human-eval'" && \
185
- echo "" && \
186
- python scripts/human_eval.py --annotate && \
187
- echo "" && \
188
- echo "--- Computing human eval results ---" && \
189
- python scripts/human_eval.py --analyze && \
190
- echo "" && \
191
- echo "=== PART 3: LOAD TEST ===" && \
192
  python scripts/load_test.py --url $(URL) --requests $(REQUESTS) --save && \
193
  echo "" && \
194
- echo "=== PART 4: FINAL SUMMARY ===" && \
195
- python scripts/summary.py && \
196
- echo "" && \
197
- echo "=== FULL REPRODUCIBLE EVALUATION COMPLETE ===" && \
198
  echo "" && \
199
  echo "Results saved to: data/eval_results/" && \
200
  echo " - eval_natural_queries_latest.json (NDCG, Hit@K, MRR)" && \
201
  echo " - faithfulness_latest.json (HHEM, RAGAS)" && \
202
  echo " - grounding_delta_latest.json (WITH vs WITHOUT evidence)" && \
203
- echo " - human_eval_latest.json (50-sample ratings)" && \
204
  echo " - load_test_latest.json (P99 latency)" && \
205
  echo "" && \
206
- echo "To verify docs match results: check README.md and home/*.md"
 
 
207
 
208
  # ---------------------------------------------------------------------------
209
  # Demo
@@ -269,21 +253,54 @@ deploy-health:
269
  (echo "Deployment not healthy at $(URL)" && exit 1)
270
 
271
  # ---------------------------------------------------------------------------
272
- # Human Evaluation
273
  # ---------------------------------------------------------------------------
274
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
275
  human-eval-generate: check-env
276
  @echo "=== GENERATING HUMAN EVAL SAMPLES ==="
277
  python scripts/human_eval.py --generate --seed $(SEED)
278
 
 
279
  human-eval: check-env
280
  @echo "=== HUMAN EVALUATION ==="
 
 
281
  python scripts/human_eval.py --annotate
282
 
 
283
  human-eval-analyze: check-env
284
  @echo "=== HUMAN EVAL ANALYSIS ==="
285
  python scripts/human_eval.py --analyze
286
 
 
 
 
 
287
  # ---------------------------------------------------------------------------
288
  # Quality
289
  # ---------------------------------------------------------------------------
@@ -318,7 +335,8 @@ info:
318
  print(f'Qdrant: {QDRANT_URL}'); \
319
  print(f'LLM: {LLM_PROVIDER} ({ANTHROPIC_MODEL if LLM_PROVIDER == \"anthropic\" else OPENAI_MODEL})')"
320
 
321
- summary:
 
322
  @python scripts/summary.py
323
 
324
  metrics-snapshot:
@@ -468,10 +486,10 @@ help:
468
  @echo " make demo-interview 3-query showcase (includes cache hit)"
469
  @echo ""
470
  @echo "INFO & METRICS:"
471
- @echo " make info Show version, models, and URLs"
472
- @echo " make summary Print evaluation summary"
473
- @echo " make metrics-snapshot Quick metrics display"
474
- @echo " make health Check API health (requires running server)"
475
  @echo ""
476
  @echo "PIPELINE:"
477
  @echo " make data Load, chunk, embed, and index reviews (local)"
@@ -484,8 +502,9 @@ help:
484
  @echo " make eval Standard: metrics + explanation + faithfulness (~5 min)"
485
  @echo " make eval-all Complete: everything automated (~15 min)"
486
  @echo " Includes: EDA, ablations, baselines, delta, analysis"
487
- @echo " make eval-full Full reproducibility: eval-all + human eval + load test"
488
- @echo " Includes: 50-sample manual annotation (~1 hour)"
 
489
  @echo ""
490
  @echo "LOAD TESTING:"
491
  @echo " make load-test Run 50 requests against production (P99 target)"
@@ -500,9 +519,11 @@ help:
500
  @echo " make deploy-info Show HuggingFace Spaces deployment info"
501
  @echo " make deploy-health Check production deployment health"
502
  @echo ""
503
- @echo "HUMAN EVALUATION:"
 
 
504
  @echo " make human-eval-generate Generate 50 eval samples (SEED=42)"
505
- @echo " make human-eval Rate samples interactively"
506
  @echo " make human-eval-analyze Compute results from ratings"
507
  @echo ""
508
  @echo "QUALITY:"
 
1
+ .PHONY: all setup data data-validate eval eval-all eval-full eval-quick eval-summary demo demo-interview reset reset-eval reset-hard check-env qdrant-up qdrant-down qdrant-status eda serve serve-dev docker-build docker-run deploy-info deploy-health human-eval-workflow human-eval-generate human-eval human-eval-analyze human-eval-status fmt test lint typecheck ci info metrics-snapshot health load-test load-test-quick kaggle-test help
2
 
3
  # ---------------------------------------------------------------------------
4
  # Configurable Variables (override: make demo QUERY="gaming mouse")
 
165
  python scripts/summary.py && \
166
  echo "=== COMPLETE EVALUATION DONE ==="
167
 
168
+ # Full reproducibility: eval-all + load test (~17 min, fully automated)
169
+ # Human evaluation is a SEPARATE workflow (see: make human-eval-workflow)
170
  # Run after: make reset-eval
 
171
  eval-full: check-env
172
  @echo "=== FULL REPRODUCIBLE EVALUATION ===" && \
173
  echo "" && \
174
  echo "=== PART 1: AUTOMATED METRICS (~15 min) ===" && \
175
  $(MAKE) eval-all && \
176
  echo "" && \
177
+ echo "=== PART 2: LOAD TEST ===" && \
 
 
 
 
 
 
 
 
 
 
 
 
 
 
178
  python scripts/load_test.py --url $(URL) --requests $(REQUESTS) --save && \
179
  echo "" && \
180
+ echo "=== AUTOMATED EVALUATION COMPLETE ===" && \
 
 
 
181
  echo "" && \
182
  echo "Results saved to: data/eval_results/" && \
183
  echo " - eval_natural_queries_latest.json (NDCG, Hit@K, MRR)" && \
184
  echo " - faithfulness_latest.json (HHEM, RAGAS)" && \
185
  echo " - grounding_delta_latest.json (WITH vs WITHOUT evidence)" && \
 
186
  echo " - load_test_latest.json (P99 latency)" && \
187
  echo "" && \
188
+ echo "NEXT STEPS:" && \
189
+ echo " 1. make human-eval-workflow # ~1 hour manual annotation" && \
190
+ echo " 2. make eval-summary # view complete results"
191
 
192
  # ---------------------------------------------------------------------------
193
  # Demo
 
253
  (echo "Deployment not healthy at $(URL)" && exit 1)
254
 
255
  # ---------------------------------------------------------------------------
256
+ # Human Evaluation (separate workflow from automated eval)
257
  # ---------------------------------------------------------------------------
258
 
259
+ # Complete human eval workflow: generate → annotate → analyze
260
+ # Run this AFTER make eval-full completes
261
+ human-eval-workflow: check-env
262
+ @echo "=== HUMAN EVALUATION WORKFLOW ===" && \
263
+ echo "" && \
264
+ echo "This is a separate ~1 hour manual process." && \
265
+ echo "You can pause anytime with Ctrl+C and resume with 'make human-eval'" && \
266
+ echo "" && \
267
+ echo "--- Step 1/3: Generating 50 samples ---" && \
268
+ python scripts/human_eval.py --generate --seed $(SEED) && \
269
+ echo "" && \
270
+ echo "--- Step 2/3: Interactive annotation ---" && \
271
+ echo "Rate each sample 1-5 on: comprehension, trust, usefulness, satisfaction" && \
272
+ echo "" && \
273
+ python scripts/human_eval.py --annotate && \
274
+ echo "" && \
275
+ echo "--- Step 3/3: Computing results ---" && \
276
+ python scripts/human_eval.py --analyze && \
277
+ echo "" && \
278
+ echo "=== HUMAN EVALUATION COMPLETE ===" && \
279
+ echo "Results: data/eval_results/human_eval_latest.json" && \
280
+ echo "" && \
281
+ echo "Run 'make eval-summary' to see updated metrics."
282
+
283
+ # Generate samples only (non-blocking)
284
  human-eval-generate: check-env
285
  @echo "=== GENERATING HUMAN EVAL SAMPLES ==="
286
  python scripts/human_eval.py --generate --seed $(SEED)
287
 
288
+ # Interactive annotation (can pause with Ctrl+C, resume anytime)
289
  human-eval: check-env
290
  @echo "=== HUMAN EVALUATION ==="
291
+ @echo "Pause anytime with Ctrl+C. Resume with 'make human-eval'"
292
+ @echo ""
293
  python scripts/human_eval.py --annotate
294
 
295
+ # Compute results from annotations
296
  human-eval-analyze: check-env
297
  @echo "=== HUMAN EVAL ANALYSIS ==="
298
  python scripts/human_eval.py --analyze
299
 
300
+ # Check annotation progress
301
+ human-eval-status:
302
+ @python scripts/human_eval.py --status 2>/dev/null || echo "No samples yet. Run: make human-eval-generate"
303
+
304
  # ---------------------------------------------------------------------------
305
  # Quality
306
  # ---------------------------------------------------------------------------
 
335
  print(f'Qdrant: {QDRANT_URL}'); \
336
  print(f'LLM: {LLM_PROVIDER} ({ANTHROPIC_MODEL if LLM_PROVIDER == \"anthropic\" else OPENAI_MODEL})')"
337
 
338
+ # Comprehensive evaluation summary (handles missing human eval gracefully)
339
+ eval-summary:
340
  @python scripts/summary.py
341
 
342
  metrics-snapshot:
 
486
  @echo " make demo-interview 3-query showcase (includes cache hit)"
487
  @echo ""
488
  @echo "INFO & METRICS:"
489
+ @echo " make info Show version, models, and URLs"
490
+ @echo " make eval-summary Print comprehensive evaluation results"
491
+ @echo " make metrics-snapshot Quick metrics display"
492
+ @echo " make health Check API health (requires running server)"
493
  @echo ""
494
  @echo "PIPELINE:"
495
  @echo " make data Load, chunk, embed, and index reviews (local)"
 
502
  @echo " make eval Standard: metrics + explanation + faithfulness (~5 min)"
503
  @echo " make eval-all Complete: everything automated (~15 min)"
504
  @echo " Includes: EDA, ablations, baselines, delta, analysis"
505
+ @echo " make eval-full Full automated eval + load test (~17 min)"
506
+ @echo " Does NOT include human eval (see below)"
507
+ @echo " make eval-summary View comprehensive results (handles missing data)"
508
  @echo ""
509
  @echo "LOAD TESTING:"
510
  @echo " make load-test Run 50 requests against production (P99 target)"
 
519
  @echo " make deploy-info Show HuggingFace Spaces deployment info"
520
  @echo " make deploy-health Check production deployment health"
521
  @echo ""
522
+ @echo "HUMAN EVALUATION (separate workflow, ~1 hour):"
523
+ @echo " make human-eval-workflow Complete workflow: generate → annotate → analyze"
524
+ @echo " make human-eval-status Check annotation progress"
525
  @echo " make human-eval-generate Generate 50 eval samples (SEED=42)"
526
+ @echo " make human-eval Rate samples interactively (Ctrl+C to pause)"
527
  @echo " make human-eval-analyze Compute results from ratings"
528
  @echo ""
529
  @echo "QUALITY:"
README.md CHANGED
@@ -34,7 +34,7 @@ A recommendation system that refuses to hallucinate.
34
 
35
  Product recommendations without explanations are black boxes. Users see "You might like X" but never learn *why*. When you ask an LLM to explain, it confidently invents features and fabricates reviews.
36
 
37
- **Sage is different:** Every claim is a verified quote from real customer reviews. When evidence is sparse, it refuses rather than guesses. Human evaluation scored trust at **4.3/5** because honesty beats confident fabrication.
38
 
39
  ---
40
 
@@ -42,13 +42,13 @@ Product recommendations without explanations are black boxes. Users see "You mig
42
 
43
  | Metric | Target | Achieved | Status |
44
  |--------|--------|----------|--------|
45
- | NDCG@10 (recommendation quality) | > 0.30 | 0.295 | 98% |
46
- | Claim-level faithfulness (HHEM) | > 0.85 | 0.967 | Pass |
47
- | Human evaluation (n=50) | > 3.5/5 | 3.85/5 | Pass |
48
- | P99 latency (production) | < 500ms | 200ms | Pass |
49
- | P99 latency (cache hit) | < 100ms | 86ms | Pass |
50
 
51
- **Grounding impact:** Explanations generated WITH evidence score 71% on HHEM. WITHOUT evidence: 2.5%. RAG grounding reduces hallucination by 68 percentage points.
52
 
53
  ---
54
 
@@ -96,7 +96,7 @@ When you give an LLM one short review as context, it fills in the gaps with plau
96
  | **Semantic chunking** | Fixed-window | Preserves complete arguments; better quote verification. |
97
  | **HHEM** (Vectara) | GPT-4 judge, NLI models | Purpose-built for RAG hallucination; no API cost. |
98
  | **Claim-level evaluation** | Full-explanation | Isolates which claims hallucinate; more actionable. |
99
- | **Quality gate** (refuse) | Always answer | 48% refusal rate → 4.3/5 trust. Honesty > coverage. |
100
 
101
  ---
102
 
@@ -108,7 +108,7 @@ When you give an LLM one short review as context, it fills in the gaps with plau
108
  | **No image features** | Misses visual product attributes | Could add CLIP embeddings in future |
109
  | **English only** | Non-English reviews have lower retrieval quality | E5 is primarily English-trained |
110
  | **Cache invalidation manual** | Stale explanations possible | TTL-based expiry (1 hour); manual `/cache/clear` |
111
- | **LLM latency on free tier** | P99 ~4s with explanations | Retrieval alone is 200ms; cache hits are 86ms |
112
  | **No user personalization** | Same results for all users | Would need user history for collaborative filtering |
113
 
114
  ---
 
34
 
35
  Product recommendations without explanations are black boxes. Users see "You might like X" but never learn *why*. When you ask an LLM to explain, it confidently invents features and fabricates reviews.
36
 
37
+ **Sage is different:** Every claim is a verified quote from real customer reviews. When evidence is sparse, it refuses rather than guesses. Human evaluation scored trust at **4.0/5** because honesty beats confident fabrication.
38
 
39
  ---
40
 
 
42
 
43
  | Metric | Target | Achieved | Status |
44
  |--------|--------|----------|--------|
45
+ | NDCG@10 (recommendation quality) | > 0.30 | 0.487 | Pass |
46
+ | Claim-level faithfulness (HHEM) | > 0.85 | 0.968 | Pass |
47
+ | Human evaluation (n=50) | > 3.5/5 | 3.6/5 | Pass |
48
+ | P99 latency (production) | < 500ms | 283ms | Pass |
49
+ | P99 latency (cache hit) | < 100ms | 88ms | Pass |
50
 
51
+ **Grounding impact:** Explanations generated WITH evidence score 73% on HHEM. WITHOUT evidence: 2.6%. RAG grounding reduces hallucination by 70 percentage points.
52
 
53
  ---
54
 
 
96
  | **Semantic chunking** | Fixed-window | Preserves complete arguments; better quote verification. |
97
  | **HHEM** (Vectara) | GPT-4 judge, NLI models | Purpose-built for RAG hallucination; no API cost. |
98
  | **Claim-level evaluation** | Full-explanation | Isolates which claims hallucinate; more actionable. |
99
+ | **Quality gate** (refuse) | Always answer | 64% refusal rate → 4.0/5 trust. Honesty > coverage. |
100
 
101
  ---
102
 
 
108
  | **No image features** | Misses visual product attributes | Could add CLIP embeddings in future |
109
  | **English only** | Non-English reviews have lower retrieval quality | E5 is primarily English-trained |
110
  | **Cache invalidation manual** | Stale explanations possible | TTL-based expiry (1 hour); manual `/cache/clear` |
111
+ | **LLM latency on free tier** | P99 ~4s with explanations | Retrieval alone is 283ms; cache hits are 88ms |
112
  | **No user personalization** | Same results for all users | Would need user history for collaborative filtering |
113
 
114
  ---
data/eval_results/adjusted_faithfulness_20260210_115509.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "n_total": 28,
3
+ "n_refusals": 18,
4
+ "raw_pass_rate": 0.17857142857142858,
5
+ "adjusted_pass_rate": 0.7857142857142857
6
+ }
data/eval_results/adjusted_faithfulness_latest.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "n_total": 28,
3
+ "n_refusals": 18,
4
+ "raw_pass_rate": 0.17857142857142858,
5
+ "adjusted_pass_rate": 0.7857142857142857
6
+ }
data/eval_results/eval_natural_queries_20260210_114459.json ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "timestamp": "2026-02-10T11:39:34.313213",
3
+ "dataset": "eval_natural_queries.json",
4
+ "catalog_size": 21827,
5
+ "experiments": {
6
+ "aggregation_methods": {
7
+ "max": {
8
+ "ndcg_at_10": 0.4871922222425982,
9
+ "hit_at_10": 0.7380952380952381,
10
+ "mrr": 0.42086167800453517
11
+ },
12
+ "mean": {
13
+ "ndcg_at_10": 0.26098878491132294,
14
+ "hit_at_10": 0.40476190476190477,
15
+ "mrr": 0.23921957671957672
16
+ },
17
+ "weighted_mean": {
18
+ "ndcg_at_10": 0.26256611709364913,
19
+ "hit_at_10": 0.40476190476190477,
20
+ "mrr": 0.23895502645502648
21
+ }
22
+ },
23
+ "weight_tuning": [
24
+ {
25
+ "alpha": 1.0,
26
+ "beta": 0.0,
27
+ "ndcg_at_10": 0.4871922222425982,
28
+ "hit_at_10": 0.7380952380952381,
29
+ "mrr": 0.42086167800453517
30
+ },
31
+ {
32
+ "alpha": 0.9,
33
+ "beta": 0.1,
34
+ "ndcg_at_10": 0.4847806754077341,
35
+ "hit_at_10": 0.7380952380952381,
36
+ "mrr": 0.419340513983371
37
+ },
38
+ {
39
+ "alpha": 0.8,
40
+ "beta": 0.2,
41
+ "ndcg_at_10": 0.48823514035842697,
42
+ "hit_at_10": 0.7380952380952381,
43
+ "mrr": 0.4292894935752078
44
+ },
45
+ {
46
+ "alpha": 0.7,
47
+ "beta": 0.3,
48
+ "ndcg_at_10": 0.4649563743541078,
49
+ "hit_at_10": 0.6904761904761905,
50
+ "mrr": 0.4129535147392291
51
+ },
52
+ {
53
+ "alpha": 0.6,
54
+ "beta": 0.4,
55
+ "ndcg_at_10": 0.4676786860620628,
56
+ "hit_at_10": 0.6904761904761905,
57
+ "mrr": 0.4146069538926682
58
+ },
59
+ {
60
+ "alpha": 0.5,
61
+ "beta": 0.5,
62
+ "ndcg_at_10": 0.4707198108786378,
63
+ "hit_at_10": 0.6904761904761905,
64
+ "mrr": 0.4125661375661376
65
+ }
66
+ ]
67
+ },
68
+ "primary_metrics": {
69
+ "ndcg_at_10": 0.4871922222425982,
70
+ "hit_at_10": 0.7380952380952381,
71
+ "mrr": 0.42086167800453517,
72
+ "precision_at_10": 0.12857142857142856,
73
+ "recall_at_10": 0.4722222222222222,
74
+ "diversity": 0.01957190520646696,
75
+ "coverage": 0.015531222797452697,
76
+ "novelty": 9.808908578271737
77
+ },
78
+ "best_weights": {
79
+ "alpha": 0.8,
80
+ "beta": 0.2,
81
+ "ndcg_at_10": 0.48823514035842697
82
+ }
83
+ }
data/eval_results/eval_natural_queries_20260210_114955.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "timestamp": "2026-02-10T11:49:08.500849",
3
+ "dataset": "eval_natural_queries.json",
4
+ "catalog_size": 21827,
5
+ "experiments": {},
6
+ "primary_metrics": {
7
+ "ndcg_at_10": 0.4871922222425982,
8
+ "hit_at_10": 0.7380952380952381,
9
+ "mrr": 0.42086167800453517,
10
+ "precision_at_10": 0.12857142857142856,
11
+ "recall_at_10": 0.4722222222222222,
12
+ "diversity": 0.01957190520646696,
13
+ "coverage": 0.015531222797452697,
14
+ "novelty": 9.808908578271737
15
+ }
16
+ }
data/eval_results/eval_natural_queries_latest.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "timestamp": "2026-02-10T11:49:08.500849",
3
+ "dataset": "eval_natural_queries.json",
4
+ "catalog_size": 21827,
5
+ "experiments": {},
6
+ "primary_metrics": {
7
+ "ndcg_at_10": 0.4871922222425982,
8
+ "hit_at_10": 0.7380952380952381,
9
+ "mrr": 0.42086167800453517,
10
+ "precision_at_10": 0.12857142857142856,
11
+ "recall_at_10": 0.4722222222222222,
12
+ "diversity": 0.01957190520646696,
13
+ "coverage": 0.015531222797452697,
14
+ "novelty": 9.808908578271737
15
+ }
16
+ }
data/eval_results/failure_analysis_20260210_115508.json ADDED
@@ -0,0 +1,332 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cases": [
3
+ {
4
+ "case_id": 1,
5
+ "query": "wireless headphones with noise cancellation",
6
+ "product_id": "B01EL7VTC0",
7
+ "explanation": "Reviewers describe these as \"the best noise cancellation, wireless headphones on the market\" [review_137128] and \"Best Bluetooth headphones set with noise cancellation\" [review_117666]. One user notes \"when playing music via Bluetooth mild background sound does seem to be suppressed\" [review_291616].",
8
+ "evidence_texts": [
9
+ "These are the best noise cancellation, wireless headphones on the market. They offer audiophile quality sound not usually found with wireless headphones. They are very comfortable to wear for long periods of time and strangely enough, my ears do not sweat or become hot while wearing them. (I do not exercise with these on). You can change the volume, pause, skip tracks and take phone calls with the headset on. The battery life is impressive and the build quality is great. These are worth every penny. I would give it 6 stars if I could!! Way to go Bose!",
10
+ "Best Bluetooth headphones set with noise cancellation. Very comfortable for long time.",
11
+ "I'm not sure what I should have expected from the noise cancellation. I have found that when playing music via Bluetooth mild background sound does seem to be suppressed, but louder sounds come through. I tested the noise cancellation playing a variety of white sounds on my iPhone while playing music via my MacBook. The sounds do seem to be suppressed at low volume, but are easy to hear at higher volume. My conclusion is that noise cancellation is limited even in these highly rated earphones. They are superb earphones, well built, with great sound quality, but not sure they are worth the price."
12
+ ],
13
+ "hhem_score": 0.13946935534477234,
14
+ "is_hallucinated": true
15
+ },
16
+ {
17
+ "case_id": 2,
18
+ "query": "wireless headphones with noise cancellation",
19
+ "product_id": "B0C7V6PQ5D",
20
+ "explanation": "I cannot provide a confident recommendation for this product based on the available review evidence. Only 1 review excerpt(s) were found, which is insufficient to make a well-grounded recommendation for your query about \"wireless headphones with noise cancellation\".",
21
+ "evidence_texts": [
22
+ "These seem to be good wireless noise cancelling headphones. I have been using them a few days and I am very happy with the results. The background noise is almost all cancelled out when I listen to music. The clarity and quality of the music is very good thru these headphones. The headphones sit comfortably on my head and over my ears. These are lightweight. The headphone comes with the following: 1 Srhythm Over-Ear Bluetooth Headphone, 1 USB charging cable, 1 3.5mm Audio cable, 1 Airplane adapter, 1 Carrying Zip Case, and 1 User manual. I recommend these headphones and I think they are well worth the money."
23
+ ],
24
+ "hhem_score": 0.47767794132232666,
25
+ "is_hallucinated": true
26
+ },
27
+ {
28
+ "case_id": 3,
29
+ "query": "laptop charger for MacBook",
30
+ "product_id": "B08PNSRG33",
31
+ "explanation": "I cannot provide a confident recommendation for this product based on the available review evidence. Only 1 review excerpt(s) were found, which is insufficient to make a well-grounded recommendation for your query about \"laptop charger for MacBook\".",
32
+ "evidence_texts": [
33
+ "I always liked the old MacBooks with their magnetic charging cable feature...before they switched to USB C. And I really like the current all-aluminum Acer Chromebooks, which cosmetically are a MacBook clone (they are a premium Chromebook). This adapter works great and rekindles the MacBook magnetic charging idea. The adapter looks great and works great! Well done!"
34
+ ],
35
+ "hhem_score": 0.376436322927475,
36
+ "is_hallucinated": true
37
+ },
38
+ {
39
+ "case_id": 4,
40
+ "query": "laptop charger for MacBook",
41
+ "product_id": "B0BLMFG5D2",
42
+ "explanation": "I cannot provide a confident recommendation for this product based on the available review evidence. Only 1 review excerpt(s) were found, which is insufficient to make a well-grounded recommendation for your query about \"laptop charger for MacBook\".",
43
+ "evidence_texts": [
44
+ "Note that this model is the standard USB A charger. The other Apple version has a USB-C connector and fits the MacBook Pro and other more recent notebook computers that only have USB-C ports. Be sure you order the right one."
45
+ ],
46
+ "hhem_score": 0.2948443591594696,
47
+ "is_hallucinated": true
48
+ },
49
+ {
50
+ "case_id": 5,
51
+ "query": "USB hub with multiple ports",
52
+ "product_id": "B0BVQQY5KD",
53
+ "explanation": "This product \"allows 4 devices with USB-A connectors to connect to a single USB-C port\" [review_288811] and \"provides the means of making these connections including devices that operate at 10Gps speeds\" [review_288811]. The reviewer notes it \"makes a very good accessory for connecting USB-A devices\" [review_288811].",
54
+ "evidence_texts": [
55
+ "This is a hub that allows 4 devices with USB-A connectors to connect to a single USB-C port on another device, which would usually be a computer.",
56
+ "In this case, power and video transmission is not supported. Even though most of the new devices like solid state drives are using USB-C connectors there still exists a lot of equipment with USB-A connectors that have to connected. This Hub provides the means of making these connections including devices that operate at 10Gps speeds. Since this is not a powered hub the power to the connected devices is limited. It is unlikely that more than two solid state drives can be connected without affecting their performance. Lower power devices like mice and keyboards are easily accommodated. It is constructed very nicely and makes a very good accessory for connecting USB-A devices."
57
+ ],
58
+ "hhem_score": 0.9468567371368408,
59
+ "is_hallucinated": false
60
+ },
61
+ {
62
+ "case_id": 6,
63
+ "query": "USB hub with multiple ports",
64
+ "product_id": "B00483WRZ6",
65
+ "explanation": "Users report \"it's great to have 10 ports\" [review_49478] and describe \"The hub has 6 regular USB ports on one side, and 4 on the side where the power line heads off to the wall plug\" [review_183047]. One reviewer notes \"it has performed flawlessly\" [review_49478] after a year of use.",
66
+ "evidence_texts": [
67
+ "The hub has 6 regular USB ports on one side, and 4 on the side where the power line heads off to the wall plug. In addition, one side has the mini-USB connection for the line to the USB source--probably but not necessarily the computer--so there are technically 11 USB ports on this hub, of which one has a specific use. I already have two 4-port USB 1.0 hubs for my older equipment, and two 5-port USB 2.0 hubs. After I reorganize all the cables to nowhere or everywhere, I'll be using the Plugable 10-port hub and one of the 5-port USB 2.0 hubs, with a couple of ports to spare for the first time in years.",
68
+ "I've been using this hub for about a year now and it has performed flawlessly. With so many devices using USB, it's great to have 10 ports. This hub has enough ports that I can leave the iPod and Camera cords plugged in and just attach the devices when needed.",
69
+ "The hub arrived quickly from the vendor. I waded into my spaghetti, extracted the old hub and put in my new hub. Many of my connections I have routed behind a shelf so I found the 6 connections on one side to work well with those. That left the two rotating ports and 2 facing outward-I can access the remainder to use for charging things etc. The rotating ports did help orienting the unit so I can easily access those ports. The USB 3.0 TB HD thinks the hub is dandy."
70
+ ],
71
+ "hhem_score": 0.8917474150657654,
72
+ "is_hallucinated": false
73
+ },
74
+ {
75
+ "case_id": 7,
76
+ "query": "portable battery pack for travel",
77
+ "product_id": "B078XDLDNM",
78
+ "explanation": "I cannot provide a confident recommendation for this product based on the available review evidence. Only 1 review excerpt(s) were found, which is insufficient to make a well-grounded recommendation for your query about \"portable battery pack for travel\".",
79
+ "evidence_texts": [
80
+ "Excellent for use with portable battery packs."
81
+ ],
82
+ "hhem_score": 0.25542503595352173,
83
+ "is_hallucinated": true
84
+ },
85
+ {
86
+ "case_id": 8,
87
+ "query": "portable battery pack for travel",
88
+ "product_id": "B071YYK9CB",
89
+ "explanation": "I cannot provide a confident recommendation for this product based on the available review evidence. Only 1 review excerpt(s) were found, which is insufficient to make a well-grounded recommendation for your query about \"portable battery pack for travel\".",
90
+ "evidence_texts": [
91
+ "Excellent for use with portable battery packs."
92
+ ],
93
+ "hhem_score": 0.25542503595352173,
94
+ "is_hallucinated": true
95
+ },
96
+ {
97
+ "case_id": 9,
98
+ "query": "bluetooth speaker with good bass",
99
+ "product_id": "B081HC253G",
100
+ "explanation": "I cannot provide a confident recommendation for this product based on the available review evidence. Only 1 review excerpt(s) were found, which is insufficient to make a well-grounded recommendation for your query about \"bluetooth speaker with good bass\".",
101
+ "evidence_texts": [
102
+ "Love this Bluetooth speaker has a good amount of bass and clear and loud. Can put on a shelf and looks nice."
103
+ ],
104
+ "hhem_score": 0.41683968901634216,
105
+ "is_hallucinated": true
106
+ },
107
+ {
108
+ "case_id": 10,
109
+ "query": "bluetooth speaker with good bass",
110
+ "product_id": "B08CS391BR",
111
+ "explanation": "I cannot provide a confident recommendation for this product based on the available review evidence. Only 1 review excerpt(s) were found, which is insufficient to make a well-grounded recommendation for your query about \"bluetooth speaker with good bass\".",
112
+ "evidence_texts": [
113
+ "Nice loud bluetooth speaker with very loud bass."
114
+ ],
115
+ "hhem_score": 0.2693043053150177,
116
+ "is_hallucinated": true
117
+ },
118
+ {
119
+ "case_id": 11,
120
+ "query": "cheap but good quality earbuds",
121
+ "product_id": "B07S764D9V",
122
+ "explanation": "Reviewers describe these as \"the best cheap earbuds on the market\" [review_33223] and note they are \"cheap (less than $10) and solid\" [review_99276]. One customer simply calls them \"Great cheap earbuds\" [review_221704].",
123
+ "evidence_texts": [
124
+ "These are probably the best cheap earbuds on the market. They sound fairly good, they don't fall out of my ears, and they are comfortable. I've used earbuds that cost a lot more but fail at those criteria.<br /><br />The cords seem a bit fragile, but that's really all I can complain about.",
125
+ "I was looking for earbuds that were cheap (less than $10) and solid. These are it. They fit well, they block (some) of the background noise, and they are well-constructed. I especially like the plug, as it's at a right angle, which prevents me from accidentally pulling the wire free of the plug when I take out the plug. Good design.<br /><br />These are NOT noise cancelling earbuds. They block noise because they're in your ear.",
126
+ "Great cheap earbuds"
127
+ ],
128
+ "hhem_score": 0.10299108922481537,
129
+ "is_hallucinated": true
130
+ },
131
+ {
132
+ "case_id": 12,
133
+ "query": "cheap but good quality earbuds",
134
+ "product_id": "B072JJNYQY",
135
+ "explanation": "I cannot provide a confident recommendation for this product based on the available review evidence. Only 1 review excerpt(s) were found, which is insufficient to make a well-grounded recommendation for your query about \"cheap but good quality earbuds\".",
136
+ "evidence_texts": [
137
+ "These are great, cheap wireless earbuds. The sound quality is pretty good for the price. No complaints so far. My wife and I both have a pair of these."
138
+ ],
139
+ "hhem_score": 0.40427908301353455,
140
+ "is_hallucinated": true
141
+ },
142
+ {
143
+ "case_id": 13,
144
+ "query": "durable phone case that looks nice",
145
+ "product_id": "B085LCT6HY",
146
+ "explanation": "I cannot provide a confident recommendation for this product based on the available review evidence. Only 1 review excerpt(s) were found, which is insufficient to make a well-grounded recommendation for your query about \"durable phone case that looks nice\".",
147
+ "evidence_texts": [
148
+ "Nice little case. Very durable and nice looking."
149
+ ],
150
+ "hhem_score": 0.40621230006217957,
151
+ "is_hallucinated": true
152
+ },
153
+ {
154
+ "case_id": 14,
155
+ "query": "durable phone case that looks nice",
156
+ "product_id": "B00APU8IY0",
157
+ "explanation": "I cannot provide a confident recommendation for this product based on the available review evidence. Only 1 review excerpt(s) were found, which is insufficient to make a well-grounded recommendation for your query about \"durable phone case that looks nice\".",
158
+ "evidence_texts": [
159
+ "These cases are sharp looking and very durable, and well made. The best part of it is the price. I have one for my Nexus 7 as well."
160
+ ],
161
+ "hhem_score": 0.3042883574962616,
162
+ "is_hallucinated": true
163
+ },
164
+ {
165
+ "case_id": 15,
166
+ "query": "fast charging cable that won't break",
167
+ "product_id": "B07MLVKBQM",
168
+ "explanation": "I cannot provide a confident recommendation for this product based on the available review evidence. Only 1 review excerpt(s) were found, which is insufficient to make a well-grounded recommendation for your query about \"fast charging cable that won't break\".",
169
+ "evidence_texts": [
170
+ "Great fast charging cable. Seems very durable."
171
+ ],
172
+ "hhem_score": 0.2593702971935272,
173
+ "is_hallucinated": true
174
+ },
175
+ {
176
+ "case_id": 16,
177
+ "query": "fast charging cable that won't break",
178
+ "product_id": "B0982Q1WZ8",
179
+ "explanation": "I cannot provide a meaningful recommendation for this product. The available review evidence is too brief (15 tokens) to support a well-grounded explanation for your query about \"fast charging cable that won't break\".",
180
+ "evidence_texts": [
181
+ "Very good durable fast charging cable",
182
+ "durable and fast charging"
183
+ ],
184
+ "hhem_score": 0.07533111423254013,
185
+ "is_hallucinated": true
186
+ },
187
+ {
188
+ "case_id": 17,
189
+ "query": "comfortable headphones for long sessions",
190
+ "product_id": "B08HTNC8PJ",
191
+ "explanation": "I cannot provide a confident recommendation for this product based on the available review evidence. Only 1 review excerpt(s) were found, which is insufficient to make a well-grounded recommendation for your query about \"comfortable headphones for long sessions\".",
192
+ "evidence_texts": [
193
+ "Awesome, simple headphones. Very comfy for long periods"
194
+ ],
195
+ "hhem_score": 0.5048479437828064,
196
+ "is_hallucinated": false
197
+ },
198
+ {
199
+ "case_id": 18,
200
+ "query": "comfortable headphones for long sessions",
201
+ "product_id": "B016XJCQR6",
202
+ "explanation": "I cannot provide a confident recommendation for this product based on the available review evidence. Only 1 review excerpt(s) were found, which is insufficient to make a well-grounded recommendation for your query about \"comfortable headphones for long sessions\".",
203
+ "evidence_texts": [
204
+ "Nice headphones, I use them to listen to online lectures. Sound is good and is heard in both ears. Pretty comfortable - I can go about 2.5 hours before they get uncomfortable. I Like the single cord to headset."
205
+ ],
206
+ "hhem_score": 0.3649515211582184,
207
+ "is_hallucinated": true
208
+ },
209
+ {
210
+ "case_id": 19,
211
+ "query": "quiet keyboard for office",
212
+ "product_id": "B0BK3LYMR2",
213
+ "explanation": "Reviewers describe it as \"super quiet\" [review_151577] and \"the quietest I've ever owned\" [review_320508]. One notes \"this keyboard is pretty quiet, no clacking noise\" [review_322050] and calls it \"a perfect office keyboard\" [review_320508].",
214
+ "evidence_texts": [
215
+ "I purchased this keyboard and mouse combo for my home office. It's super quiet and feels great to type on. I liked it so much I ended up purchasing a second one for the business office.",
216
+ "Both the keyboard and mouse are the quietest I've ever owned. I wish all my co-workers would make the switch to these. It is a perfect office keyboard and mouse, it just works and works well. Also, this keyboard has drain holes! If you spill something on it, your chances of still having a working keyboard are muuuuch higher with this keyboard.",
217
+ "This keyboard is pretty quiet, no clacking noise. Instead you here a rubbery membrane squish from a key being pressed, which is also audible but not noisy. The typing experience on this keyboard is not pleasant due to the stiff membrane, requiring much more effort/force to push down individual keys. Typing fast and typing a lot on this keyboard would not be recommended because it's not easy to type on this keyboard.<br /><br />The silent mouse has silent buttons as well as a silent scroll wheel. I mention this because I bought a separate Logitech silent mouse and that scroll wheel is loud."
218
+ ],
219
+ "hhem_score": 0.5948902368545532,
220
+ "is_hallucinated": false
221
+ },
222
+ {
223
+ "case_id": 20,
224
+ "query": "quiet keyboard for office",
225
+ "product_id": "B004MF11MU",
226
+ "explanation": "Reviewers describe it as a \"Good, quiet keyboard\" [review_155782] with \"Very responsive keys\" [review_155782]. One user notes \"It's fast, easy to type\" [review_245043].",
227
+ "evidence_texts": [
228
+ "Good, quiet keyboard. Very responsive keys.",
229
+ "I purchased this keyboard back in 2018 and haven't had to change a battery nor charge it at all. The natural light coming in from the windows or the lamps on your desk are enough to keep it going.<br />It's fast, easy to type and use the function keys, and works at a long distance from the computer.<br />The only complaint is that the keyboard isn't wide enough as it puts your hands a little too close together.<br />It's worth getting."
230
+ ],
231
+ "hhem_score": 0.38308340311050415,
232
+ "is_hallucinated": true
233
+ },
234
+ {
235
+ "case_id": 21,
236
+ "query": "headphones that don't hurt ears",
237
+ "product_id": "B00JB31OUW",
238
+ "explanation": "I cannot provide a confident recommendation for this product based on the available review evidence. Only 1 review excerpt(s) were found, which is insufficient to make a well-grounded recommendation for your query about \"headphones that don't hurt ears\".",
239
+ "evidence_texts": [
240
+ "Finally, headphones that don't hurt my ears, because they don't sit ON my ears. I have very small ear canals, and have always been unable to use earbuds because they either don't stay in or they are very painful. Other headphones become very painful after just a short time of listening. My only wish is that they were bluetooth capable, but that's something I can deal with. The sound is wonderful, and the comfort is absolutely heavenly. Thank you, Bose!!!"
241
+ ],
242
+ "hhem_score": 0.40339213609695435,
243
+ "is_hallucinated": true
244
+ },
245
+ {
246
+ "case_id": 22,
247
+ "query": "headphones that don't hurt ears",
248
+ "product_id": "B07KQWX947",
249
+ "explanation": "Reviewers describe these as \"lightweight and comfortable headphones\" [review_292476]. One customer notes they are \"really good sounding\" [review_292476] with \"excellent\" sound quality [review_142177].",
250
+ "evidence_texts": [
251
+ "Koss PortaPro headphones are really good sounding, lightweight and comfortable headphones that don't break the bank.",
252
+ "Folds great, sound is very good; no need to spend $350 on other headphones. These don't cancel noise but the sound on my iPod is excellent."
253
+ ],
254
+ "hhem_score": 0.24604828655719757,
255
+ "is_hallucinated": true
256
+ },
257
+ {
258
+ "case_id": 23,
259
+ "query": "charger that actually works",
260
+ "product_id": "B07NPPSXN3",
261
+ "explanation": "I cannot provide a confident recommendation for this product based on the available review evidence. Only 1 review excerpt(s) were found, which is insufficient to make a well-grounded recommendation for your query about \"charger that actually works\".",
262
+ "evidence_texts": [
263
+ "Seems like an official charger and it works!"
264
+ ],
265
+ "hhem_score": 0.2389219105243683,
266
+ "is_hallucinated": true
267
+ },
268
+ {
269
+ "case_id": 24,
270
+ "query": "charger that actually works",
271
+ "product_id": "B072FHJVT2",
272
+ "explanation": "I cannot provide a meaningful recommendation for this product. The available review evidence is too brief (33 tokens) to support a well-grounded explanation for your query about \"charger that actually works\".",
273
+ "evidence_texts": [
274
+ "Works really good this charger",
275
+ "It\u2019s a basic android type charger. Nothing insanely special about it but it does the job as it should."
276
+ ],
277
+ "hhem_score": 0.4941137433052063,
278
+ "is_hallucinated": true
279
+ },
280
+ {
281
+ "case_id": 25,
282
+ "query": "waterproof speaker for shower",
283
+ "product_id": "B0BW4PFM58",
284
+ "explanation": "One reviewer states \"I personally use it in my shower to listen to podcasts and audiobooks\" [review_139921] and notes \"The volume is loud enough to overpower the running water\" [review_139921]. However, they clarify \"It is not WATERPROOF\" and recommend \"I keep in the bathroom, just not under the showerhead\" [review_139921].",
285
+ "evidence_texts": [
286
+ "This is not a beach party speaker, it's not big enough for that task. If you want a portable party/beach speaker you need to buy a larger speaker which will require a much larger battery. I personally use it in my shower to listen to podcasts and audiobooks more than anything. It is not WATERPROOF, it never states that in the description. The image may be a little deceptive, it will handle a bit of splash. IPX 5: Protected against water jets - Water projected at all angles through a 6.3mm nozzle at a flow rate of 12.5 liters/min at a pressure of 30kN/m2 for 3 minutes from a distance of 3 meters. In order to survive being underwater or in water it needs X-8 or X-7. I keep in the bathroom, just not under the showerhead. It has rubber feet to keep it from vibrating around. The volume is loud enough to overpower the running water, little to almost no distortion at high volume.",
287
+ "This is not a beach party speaker, it's not big enough for that task. If you want a portable party/beach speaker you need to buy a larger speaker which will require a much larger battery. I personally use it in my shower to listen to podcasts and audiobooks more than anything. It is not WATERPROOF, it never states that in the description. The image may be a little deceptive, it will handle a bit of splash. IPX 5: Protected against water jets - Water projected at all angles through a 6.3mm nozzle at a flow rate of 12.5 liters/min at a pressure of 30kN/m2 for 3 minutes from a distance of 3 meters. In order to survive being underwater or in water it needs X-8 or X-7. I keep in the bathroom, just not under the showerhead. It has rubber feet to keep it from vibrating around. The volume is loud enough to overpower the running water, little to almost no distortion at high volume.",
288
+ "These speakers are great. Clear sound, even at the highest volume. Water proofing seems to work fine and its very shock resistant in that I dropped it from a 7 foot shelf in the shower and it didn't skip a beat."
289
+ ],
290
+ "hhem_score": 0.9651408791542053,
291
+ "is_hallucinated": false
292
+ },
293
+ {
294
+ "case_id": 26,
295
+ "query": "waterproof speaker for shower",
296
+ "product_id": "B0C1J4XGB1",
297
+ "explanation": "Customers say \"I bought it for the shower and I have not been disappointed\" [review_110364] and \"This is a nice speaker for your shower\" [review_122829]. One reviewer notes \"It is military grade waterproof\" [review_110364], though warns \"if it breaks it's not water resistant anymore\" [review_170353] regarding the charging port cover.",
298
+ "evidence_texts": [
299
+ "This speaker is a great buy for the money. I bought it for the shower and I have not been disappointed.<br /><br />It is military grade waterproof. FYI- I do not have it in a spot in the shower that it is getting directly hit with water, but it is in a spot that it gets wet.<br /><br />I also like the features on it. You can pause songs or skip to the next one. Pickup or hangup on phone calls if you use this as a portable speaker. The size is great for travel as well. I have even taken this outside with me when I am sitting in the backyard. I am thinking about buying another for moving around and keeping one in shower since the price is so reasonable.",
300
+ "This is a nice speaker for your shower. Has a good sound, good battery and the ability to play/pause, skip/reverse and control volume. My helpful hint, when recharging it is to take the speaker and suction cup off together, if you try to take the speaker off of the suction cup, you run the risk of breaking the suction cup. (This happened to the last one I had and then that rolled off the soap dish and broke.)",
301
+ "Nice battery life. good sound. The speaker is located in the back so when stuck to my shower wall it echos the sound in the shower / tub. Only downside is be careful of the rubber plug for the charging port, if it breaks it's not water resistant anymore."
302
+ ],
303
+ "hhem_score": 0.4655941128730774,
304
+ "is_hallucinated": true
305
+ },
306
+ {
307
+ "case_id": 27,
308
+ "query": "gift for someone who likes music",
309
+ "product_id": "B0BMQN7L6B",
310
+ "explanation": "Customers say \"I gave this as a gift to someone who loves listening to music, and they love it!\" [review_165326] and call it \"A great Christmas present!\" [review_92937]. One buyer noted they \"Bought as a Christmas gift. My niece loves it and has started buying vinyls.\" [review_279946]",
311
+ "evidence_texts": [
312
+ "I gave this as a gift to someone who loves listening to music, and they love it!",
313
+ "A great Christmas present! Great for someone starting their vinyl collection",
314
+ "Bought as a Christmas gift. My niece loves it and has started buying vinyls."
315
+ ],
316
+ "hhem_score": 0.1792474240064621,
317
+ "is_hallucinated": true
318
+ },
319
+ {
320
+ "case_id": 28,
321
+ "query": "gift for someone who likes music",
322
+ "product_id": "B08HTNC8PJ",
323
+ "explanation": "I cannot provide a confident recommendation for this product based on the available review evidence. Only 1 review excerpt(s) were found, which is insufficient to make a well-grounded recommendation for your query about \"gift for someone who likes music\".",
324
+ "evidence_texts": [
325
+ "It was a gift for my niece. She likes these to listen to music while at home."
326
+ ],
327
+ "hhem_score": 0.2865367531776428,
328
+ "is_hallucinated": true
329
+ }
330
+ ],
331
+ "n_failures": 23
332
+ }
data/eval_results/failure_analysis_latest.json ADDED
@@ -0,0 +1,332 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cases": [
3
+ {
4
+ "case_id": 1,
5
+ "query": "wireless headphones with noise cancellation",
6
+ "product_id": "B01EL7VTC0",
7
+ "explanation": "Reviewers describe these as \"the best noise cancellation, wireless headphones on the market\" [review_137128] and \"Best Bluetooth headphones set with noise cancellation\" [review_117666]. One user notes \"when playing music via Bluetooth mild background sound does seem to be suppressed\" [review_291616].",
8
+ "evidence_texts": [
9
+ "These are the best noise cancellation, wireless headphones on the market. They offer audiophile quality sound not usually found with wireless headphones. They are very comfortable to wear for long periods of time and strangely enough, my ears do not sweat or become hot while wearing them. (I do not exercise with these on). You can change the volume, pause, skip tracks and take phone calls with the headset on. The battery life is impressive and the build quality is great. These are worth every penny. I would give it 6 stars if I could!! Way to go Bose!",
10
+ "Best Bluetooth headphones set with noise cancellation. Very comfortable for long time.",
11
+ "I'm not sure what I should have expected from the noise cancellation. I have found that when playing music via Bluetooth mild background sound does seem to be suppressed, but louder sounds come through. I tested the noise cancellation playing a variety of white sounds on my iPhone while playing music via my MacBook. The sounds do seem to be suppressed at low volume, but are easy to hear at higher volume. My conclusion is that noise cancellation is limited even in these highly rated earphones. They are superb earphones, well built, with great sound quality, but not sure they are worth the price."
12
+ ],
13
+ "hhem_score": 0.13946935534477234,
14
+ "is_hallucinated": true
15
+ },
16
+ {
17
+ "case_id": 2,
18
+ "query": "wireless headphones with noise cancellation",
19
+ "product_id": "B0C7V6PQ5D",
20
+ "explanation": "I cannot provide a confident recommendation for this product based on the available review evidence. Only 1 review excerpt(s) were found, which is insufficient to make a well-grounded recommendation for your query about \"wireless headphones with noise cancellation\".",
21
+ "evidence_texts": [
22
+ "These seem to be good wireless noise cancelling headphones. I have been using them a few days and I am very happy with the results. The background noise is almost all cancelled out when I listen to music. The clarity and quality of the music is very good thru these headphones. The headphones sit comfortably on my head and over my ears. These are lightweight. The headphone comes with the following: 1 Srhythm Over-Ear Bluetooth Headphone, 1 USB charging cable, 1 3.5mm Audio cable, 1 Airplane adapter, 1 Carrying Zip Case, and 1 User manual. I recommend these headphones and I think they are well worth the money."
23
+ ],
24
+ "hhem_score": 0.47767794132232666,
25
+ "is_hallucinated": true
26
+ },
27
+ {
28
+ "case_id": 3,
29
+ "query": "laptop charger for MacBook",
30
+ "product_id": "B08PNSRG33",
31
+ "explanation": "I cannot provide a confident recommendation for this product based on the available review evidence. Only 1 review excerpt(s) were found, which is insufficient to make a well-grounded recommendation for your query about \"laptop charger for MacBook\".",
32
+ "evidence_texts": [
33
+ "I always liked the old MacBooks with their magnetic charging cable feature...before they switched to USB C. And I really like the current all-aluminum Acer Chromebooks, which cosmetically are a MacBook clone (they are a premium Chromebook). This adapter works great and rekindles the MacBook magnetic charging idea. The adapter looks great and works great! Well done!"
34
+ ],
35
+ "hhem_score": 0.376436322927475,
36
+ "is_hallucinated": true
37
+ },
38
+ {
39
+ "case_id": 4,
40
+ "query": "laptop charger for MacBook",
41
+ "product_id": "B0BLMFG5D2",
42
+ "explanation": "I cannot provide a confident recommendation for this product based on the available review evidence. Only 1 review excerpt(s) were found, which is insufficient to make a well-grounded recommendation for your query about \"laptop charger for MacBook\".",
43
+ "evidence_texts": [
44
+ "Note that this model is the standard USB A charger. The other Apple version has a USB-C connector and fits the MacBook Pro and other more recent notebook computers that only have USB-C ports. Be sure you order the right one."
45
+ ],
46
+ "hhem_score": 0.2948443591594696,
47
+ "is_hallucinated": true
48
+ },
49
+ {
50
+ "case_id": 5,
51
+ "query": "USB hub with multiple ports",
52
+ "product_id": "B0BVQQY5KD",
53
+ "explanation": "This product \"allows 4 devices with USB-A connectors to connect to a single USB-C port\" [review_288811] and \"provides the means of making these connections including devices that operate at 10Gps speeds\" [review_288811]. The reviewer notes it \"makes a very good accessory for connecting USB-A devices\" [review_288811].",
54
+ "evidence_texts": [
55
+ "This is a hub that allows 4 devices with USB-A connectors to connect to a single USB-C port on another device, which would usually be a computer.",
56
+ "In this case, power and video transmission is not supported. Even though most of the new devices like solid state drives are using USB-C connectors there still exists a lot of equipment with USB-A connectors that have to connected. This Hub provides the means of making these connections including devices that operate at 10Gps speeds. Since this is not a powered hub the power to the connected devices is limited. It is unlikely that more than two solid state drives can be connected without affecting their performance. Lower power devices like mice and keyboards are easily accommodated. It is constructed very nicely and makes a very good accessory for connecting USB-A devices."
57
+ ],
58
+ "hhem_score": 0.9468567371368408,
59
+ "is_hallucinated": false
60
+ },
61
+ {
62
+ "case_id": 6,
63
+ "query": "USB hub with multiple ports",
64
+ "product_id": "B00483WRZ6",
65
+ "explanation": "Users report \"it's great to have 10 ports\" [review_49478] and describe \"The hub has 6 regular USB ports on one side, and 4 on the side where the power line heads off to the wall plug\" [review_183047]. One reviewer notes \"it has performed flawlessly\" [review_49478] after a year of use.",
66
+ "evidence_texts": [
67
+ "The hub has 6 regular USB ports on one side, and 4 on the side where the power line heads off to the wall plug. In addition, one side has the mini-USB connection for the line to the USB source--probably but not necessarily the computer--so there are technically 11 USB ports on this hub, of which one has a specific use. I already have two 4-port USB 1.0 hubs for my older equipment, and two 5-port USB 2.0 hubs. After I reorganize all the cables to nowhere or everywhere, I'll be using the Plugable 10-port hub and one of the 5-port USB 2.0 hubs, with a couple of ports to spare for the first time in years.",
68
+ "I've been using this hub for about a year now and it has performed flawlessly. With so many devices using USB, it's great to have 10 ports. This hub has enough ports that I can leave the iPod and Camera cords plugged in and just attach the devices when needed.",
69
+ "The hub arrived quickly from the vendor. I waded into my spaghetti, extracted the old hub and put in my new hub. Many of my connections I have routed behind a shelf so I found the 6 connections on one side to work well with those. That left the two rotating ports and 2 facing outward-I can access the remainder to use for charging things etc. The rotating ports did help orienting the unit so I can easily access those ports. The USB 3.0 TB HD thinks the hub is dandy."
70
+ ],
71
+ "hhem_score": 0.8917474150657654,
72
+ "is_hallucinated": false
73
+ },
74
+ {
75
+ "case_id": 7,
76
+ "query": "portable battery pack for travel",
77
+ "product_id": "B078XDLDNM",
78
+ "explanation": "I cannot provide a confident recommendation for this product based on the available review evidence. Only 1 review excerpt(s) were found, which is insufficient to make a well-grounded recommendation for your query about \"portable battery pack for travel\".",
79
+ "evidence_texts": [
80
+ "Excellent for use with portable battery packs."
81
+ ],
82
+ "hhem_score": 0.25542503595352173,
83
+ "is_hallucinated": true
84
+ },
85
+ {
86
+ "case_id": 8,
87
+ "query": "portable battery pack for travel",
88
+ "product_id": "B071YYK9CB",
89
+ "explanation": "I cannot provide a confident recommendation for this product based on the available review evidence. Only 1 review excerpt(s) were found, which is insufficient to make a well-grounded recommendation for your query about \"portable battery pack for travel\".",
90
+ "evidence_texts": [
91
+ "Excellent for use with portable battery packs."
92
+ ],
93
+ "hhem_score": 0.25542503595352173,
94
+ "is_hallucinated": true
95
+ },
96
+ {
97
+ "case_id": 9,
98
+ "query": "bluetooth speaker with good bass",
99
+ "product_id": "B081HC253G",
100
+ "explanation": "I cannot provide a confident recommendation for this product based on the available review evidence. Only 1 review excerpt(s) were found, which is insufficient to make a well-grounded recommendation for your query about \"bluetooth speaker with good bass\".",
101
+ "evidence_texts": [
102
+ "Love this Bluetooth speaker has a good amount of bass and clear and loud. Can put on a shelf and looks nice."
103
+ ],
104
+ "hhem_score": 0.41683968901634216,
105
+ "is_hallucinated": true
106
+ },
107
+ {
108
+ "case_id": 10,
109
+ "query": "bluetooth speaker with good bass",
110
+ "product_id": "B08CS391BR",
111
+ "explanation": "I cannot provide a confident recommendation for this product based on the available review evidence. Only 1 review excerpt(s) were found, which is insufficient to make a well-grounded recommendation for your query about \"bluetooth speaker with good bass\".",
112
+ "evidence_texts": [
113
+ "Nice loud bluetooth speaker with very loud bass."
114
+ ],
115
+ "hhem_score": 0.2693043053150177,
116
+ "is_hallucinated": true
117
+ },
118
+ {
119
+ "case_id": 11,
120
+ "query": "cheap but good quality earbuds",
121
+ "product_id": "B07S764D9V",
122
+ "explanation": "Reviewers describe these as \"the best cheap earbuds on the market\" [review_33223] and note they are \"cheap (less than $10) and solid\" [review_99276]. One customer simply calls them \"Great cheap earbuds\" [review_221704].",
123
+ "evidence_texts": [
124
+ "These are probably the best cheap earbuds on the market. They sound fairly good, they don't fall out of my ears, and they are comfortable. I've used earbuds that cost a lot more but fail at those criteria.<br /><br />The cords seem a bit fragile, but that's really all I can complain about.",
125
+ "I was looking for earbuds that were cheap (less than $10) and solid. These are it. They fit well, they block (some) of the background noise, and they are well-constructed. I especially like the plug, as it's at a right angle, which prevents me from accidentally pulling the wire free of the plug when I take out the plug. Good design.<br /><br />These are NOT noise cancelling earbuds. They block noise because they're in your ear.",
126
+ "Great cheap earbuds"
127
+ ],
128
+ "hhem_score": 0.10299108922481537,
129
+ "is_hallucinated": true
130
+ },
131
+ {
132
+ "case_id": 12,
133
+ "query": "cheap but good quality earbuds",
134
+ "product_id": "B072JJNYQY",
135
+ "explanation": "I cannot provide a confident recommendation for this product based on the available review evidence. Only 1 review excerpt(s) were found, which is insufficient to make a well-grounded recommendation for your query about \"cheap but good quality earbuds\".",
136
+ "evidence_texts": [
137
+ "These are great, cheap wireless earbuds. The sound quality is pretty good for the price. No complaints so far. My wife and I both have a pair of these."
138
+ ],
139
+ "hhem_score": 0.40427908301353455,
140
+ "is_hallucinated": true
141
+ },
142
+ {
143
+ "case_id": 13,
144
+ "query": "durable phone case that looks nice",
145
+ "product_id": "B085LCT6HY",
146
+ "explanation": "I cannot provide a confident recommendation for this product based on the available review evidence. Only 1 review excerpt(s) were found, which is insufficient to make a well-grounded recommendation for your query about \"durable phone case that looks nice\".",
147
+ "evidence_texts": [
148
+ "Nice little case. Very durable and nice looking."
149
+ ],
150
+ "hhem_score": 0.40621230006217957,
151
+ "is_hallucinated": true
152
+ },
153
+ {
154
+ "case_id": 14,
155
+ "query": "durable phone case that looks nice",
156
+ "product_id": "B00APU8IY0",
157
+ "explanation": "I cannot provide a confident recommendation for this product based on the available review evidence. Only 1 review excerpt(s) were found, which is insufficient to make a well-grounded recommendation for your query about \"durable phone case that looks nice\".",
158
+ "evidence_texts": [
159
+ "These cases are sharp looking and very durable, and well made. The best part of it is the price. I have one for my Nexus 7 as well."
160
+ ],
161
+ "hhem_score": 0.3042883574962616,
162
+ "is_hallucinated": true
163
+ },
164
+ {
165
+ "case_id": 15,
166
+ "query": "fast charging cable that won't break",
167
+ "product_id": "B07MLVKBQM",
168
+ "explanation": "I cannot provide a confident recommendation for this product based on the available review evidence. Only 1 review excerpt(s) were found, which is insufficient to make a well-grounded recommendation for your query about \"fast charging cable that won't break\".",
169
+ "evidence_texts": [
170
+ "Great fast charging cable. Seems very durable."
171
+ ],
172
+ "hhem_score": 0.2593702971935272,
173
+ "is_hallucinated": true
174
+ },
175
+ {
176
+ "case_id": 16,
177
+ "query": "fast charging cable that won't break",
178
+ "product_id": "B0982Q1WZ8",
179
+ "explanation": "I cannot provide a meaningful recommendation for this product. The available review evidence is too brief (15 tokens) to support a well-grounded explanation for your query about \"fast charging cable that won't break\".",
180
+ "evidence_texts": [
181
+ "Very good durable fast charging cable",
182
+ "durable and fast charging"
183
+ ],
184
+ "hhem_score": 0.07533111423254013,
185
+ "is_hallucinated": true
186
+ },
187
+ {
188
+ "case_id": 17,
189
+ "query": "comfortable headphones for long sessions",
190
+ "product_id": "B08HTNC8PJ",
191
+ "explanation": "I cannot provide a confident recommendation for this product based on the available review evidence. Only 1 review excerpt(s) were found, which is insufficient to make a well-grounded recommendation for your query about \"comfortable headphones for long sessions\".",
192
+ "evidence_texts": [
193
+ "Awesome, simple headphones. Very comfy for long periods"
194
+ ],
195
+ "hhem_score": 0.5048479437828064,
196
+ "is_hallucinated": false
197
+ },
198
+ {
199
+ "case_id": 18,
200
+ "query": "comfortable headphones for long sessions",
201
+ "product_id": "B016XJCQR6",
202
+ "explanation": "I cannot provide a confident recommendation for this product based on the available review evidence. Only 1 review excerpt(s) were found, which is insufficient to make a well-grounded recommendation for your query about \"comfortable headphones for long sessions\".",
203
+ "evidence_texts": [
204
+ "Nice headphones, I use them to listen to online lectures. Sound is good and is heard in both ears. Pretty comfortable - I can go about 2.5 hours before they get uncomfortable. I Like the single cord to headset."
205
+ ],
206
+ "hhem_score": 0.3649515211582184,
207
+ "is_hallucinated": true
208
+ },
209
+ {
210
+ "case_id": 19,
211
+ "query": "quiet keyboard for office",
212
+ "product_id": "B0BK3LYMR2",
213
+ "explanation": "Reviewers describe it as \"super quiet\" [review_151577] and \"the quietest I've ever owned\" [review_320508]. One notes \"this keyboard is pretty quiet, no clacking noise\" [review_322050] and calls it \"a perfect office keyboard\" [review_320508].",
214
+ "evidence_texts": [
215
+ "I purchased this keyboard and mouse combo for my home office. It's super quiet and feels great to type on. I liked it so much I ended up purchasing a second one for the business office.",
216
+ "Both the keyboard and mouse are the quietest I've ever owned. I wish all my co-workers would make the switch to these. It is a perfect office keyboard and mouse, it just works and works well. Also, this keyboard has drain holes! If you spill something on it, your chances of still having a working keyboard are muuuuch higher with this keyboard.",
217
+ "This keyboard is pretty quiet, no clacking noise. Instead you here a rubbery membrane squish from a key being pressed, which is also audible but not noisy. The typing experience on this keyboard is not pleasant due to the stiff membrane, requiring much more effort/force to push down individual keys. Typing fast and typing a lot on this keyboard would not be recommended because it's not easy to type on this keyboard.<br /><br />The silent mouse has silent buttons as well as a silent scroll wheel. I mention this because I bought a separate Logitech silent mouse and that scroll wheel is loud."
218
+ ],
219
+ "hhem_score": 0.5948902368545532,
220
+ "is_hallucinated": false
221
+ },
222
+ {
223
+ "case_id": 20,
224
+ "query": "quiet keyboard for office",
225
+ "product_id": "B004MF11MU",
226
+ "explanation": "Reviewers describe it as a \"Good, quiet keyboard\" [review_155782] with \"Very responsive keys\" [review_155782]. One user notes \"It's fast, easy to type\" [review_245043].",
227
+ "evidence_texts": [
228
+ "Good, quiet keyboard. Very responsive keys.",
229
+ "I purchased this keyboard back in 2018 and haven't had to change a battery nor charge it at all. The natural light coming in from the windows or the lamps on your desk are enough to keep it going.<br />It's fast, easy to type and use the function keys, and works at a long distance from the computer.<br />The only complaint is that the keyboard isn't wide enough as it puts your hands a little too close together.<br />It's worth getting."
230
+ ],
231
+ "hhem_score": 0.38308340311050415,
232
+ "is_hallucinated": true
233
+ },
234
+ {
235
+ "case_id": 21,
236
+ "query": "headphones that don't hurt ears",
237
+ "product_id": "B00JB31OUW",
238
+ "explanation": "I cannot provide a confident recommendation for this product based on the available review evidence. Only 1 review excerpt(s) were found, which is insufficient to make a well-grounded recommendation for your query about \"headphones that don't hurt ears\".",
239
+ "evidence_texts": [
240
+ "Finally, headphones that don't hurt my ears, because they don't sit ON my ears. I have very small ear canals, and have always been unable to use earbuds because they either don't stay in or they are very painful. Other headphones become very painful after just a short time of listening. My only wish is that they were bluetooth capable, but that's something I can deal with. The sound is wonderful, and the comfort is absolutely heavenly. Thank you, Bose!!!"
241
+ ],
242
+ "hhem_score": 0.40339213609695435,
243
+ "is_hallucinated": true
244
+ },
245
+ {
246
+ "case_id": 22,
247
+ "query": "headphones that don't hurt ears",
248
+ "product_id": "B07KQWX947",
249
+ "explanation": "Reviewers describe these as \"lightweight and comfortable headphones\" [review_292476]. One customer notes they are \"really good sounding\" [review_292476] with \"excellent\" sound quality [review_142177].",
250
+ "evidence_texts": [
251
+ "Koss PortaPro headphones are really good sounding, lightweight and comfortable headphones that don't break the bank.",
252
+ "Folds great, sound is very good; no need to spend $350 on other headphones. These don't cancel noise but the sound on my iPod is excellent."
253
+ ],
254
+ "hhem_score": 0.24604828655719757,
255
+ "is_hallucinated": true
256
+ },
257
+ {
258
+ "case_id": 23,
259
+ "query": "charger that actually works",
260
+ "product_id": "B07NPPSXN3",
261
+ "explanation": "I cannot provide a confident recommendation for this product based on the available review evidence. Only 1 review excerpt(s) were found, which is insufficient to make a well-grounded recommendation for your query about \"charger that actually works\".",
262
+ "evidence_texts": [
263
+ "Seems like an official charger and it works!"
264
+ ],
265
+ "hhem_score": 0.2389219105243683,
266
+ "is_hallucinated": true
267
+ },
268
+ {
269
+ "case_id": 24,
270
+ "query": "charger that actually works",
271
+ "product_id": "B072FHJVT2",
272
+ "explanation": "I cannot provide a meaningful recommendation for this product. The available review evidence is too brief (33 tokens) to support a well-grounded explanation for your query about \"charger that actually works\".",
273
+ "evidence_texts": [
274
+ "Works really good this charger",
275
+ "It\u2019s a basic android type charger. Nothing insanely special about it but it does the job as it should."
276
+ ],
277
+ "hhem_score": 0.4941137433052063,
278
+ "is_hallucinated": true
279
+ },
280
+ {
281
+ "case_id": 25,
282
+ "query": "waterproof speaker for shower",
283
+ "product_id": "B0BW4PFM58",
284
+ "explanation": "One reviewer states \"I personally use it in my shower to listen to podcasts and audiobooks\" [review_139921] and notes \"The volume is loud enough to overpower the running water\" [review_139921]. However, they clarify \"It is not WATERPROOF\" and recommend \"I keep in the bathroom, just not under the showerhead\" [review_139921].",
285
+ "evidence_texts": [
286
+ "This is not a beach party speaker, it's not big enough for that task. If you want a portable party/beach speaker you need to buy a larger speaker which will require a much larger battery. I personally use it in my shower to listen to podcasts and audiobooks more than anything. It is not WATERPROOF, it never states that in the description. The image may be a little deceptive, it will handle a bit of splash. IPX 5: Protected against water jets - Water projected at all angles through a 6.3mm nozzle at a flow rate of 12.5 liters/min at a pressure of 30kN/m2 for 3 minutes from a distance of 3 meters. In order to survive being underwater or in water it needs X-8 or X-7. I keep in the bathroom, just not under the showerhead. It has rubber feet to keep it from vibrating around. The volume is loud enough to overpower the running water, little to almost no distortion at high volume.",
287
+ "This is not a beach party speaker, it's not big enough for that task. If you want a portable party/beach speaker you need to buy a larger speaker which will require a much larger battery. I personally use it in my shower to listen to podcasts and audiobooks more than anything. It is not WATERPROOF, it never states that in the description. The image may be a little deceptive, it will handle a bit of splash. IPX 5: Protected against water jets - Water projected at all angles through a 6.3mm nozzle at a flow rate of 12.5 liters/min at a pressure of 30kN/m2 for 3 minutes from a distance of 3 meters. In order to survive being underwater or in water it needs X-8 or X-7. I keep in the bathroom, just not under the showerhead. It has rubber feet to keep it from vibrating around. The volume is loud enough to overpower the running water, little to almost no distortion at high volume.",
288
+ "These speakers are great. Clear sound, even at the highest volume. Water proofing seems to work fine and its very shock resistant in that I dropped it from a 7 foot shelf in the shower and it didn't skip a beat."
289
+ ],
290
+ "hhem_score": 0.9651408791542053,
291
+ "is_hallucinated": false
292
+ },
293
+ {
294
+ "case_id": 26,
295
+ "query": "waterproof speaker for shower",
296
+ "product_id": "B0C1J4XGB1",
297
+ "explanation": "Customers say \"I bought it for the shower and I have not been disappointed\" [review_110364] and \"This is a nice speaker for your shower\" [review_122829]. One reviewer notes \"It is military grade waterproof\" [review_110364], though warns \"if it breaks it's not water resistant anymore\" [review_170353] regarding the charging port cover.",
298
+ "evidence_texts": [
299
+ "This speaker is a great buy for the money. I bought it for the shower and I have not been disappointed.<br /><br />It is military grade waterproof. FYI- I do not have it in a spot in the shower that it is getting directly hit with water, but it is in a spot that it gets wet.<br /><br />I also like the features on it. You can pause songs or skip to the next one. Pickup or hangup on phone calls if you use this as a portable speaker. The size is great for travel as well. I have even taken this outside with me when I am sitting in the backyard. I am thinking about buying another for moving around and keeping one in shower since the price is so reasonable.",
300
+ "This is a nice speaker for your shower. Has a good sound, good battery and the ability to play/pause, skip/reverse and control volume. My helpful hint, when recharging it is to take the speaker and suction cup off together, if you try to take the speaker off of the suction cup, you run the risk of breaking the suction cup. (This happened to the last one I had and then that rolled off the soap dish and broke.)",
301
+ "Nice battery life. good sound. The speaker is located in the back so when stuck to my shower wall it echos the sound in the shower / tub. Only downside is be careful of the rubber plug for the charging port, if it breaks it's not water resistant anymore."
302
+ ],
303
+ "hhem_score": 0.4655941128730774,
304
+ "is_hallucinated": true
305
+ },
306
+ {
307
+ "case_id": 27,
308
+ "query": "gift for someone who likes music",
309
+ "product_id": "B0BMQN7L6B",
310
+ "explanation": "Customers say \"I gave this as a gift to someone who loves listening to music, and they love it!\" [review_165326] and call it \"A great Christmas present!\" [review_92937]. One buyer noted they \"Bought as a Christmas gift. My niece loves it and has started buying vinyls.\" [review_279946]",
311
+ "evidence_texts": [
312
+ "I gave this as a gift to someone who loves listening to music, and they love it!",
313
+ "A great Christmas present! Great for someone starting their vinyl collection",
314
+ "Bought as a Christmas gift. My niece loves it and has started buying vinyls."
315
+ ],
316
+ "hhem_score": 0.1792474240064621,
317
+ "is_hallucinated": true
318
+ },
319
+ {
320
+ "case_id": 28,
321
+ "query": "gift for someone who likes music",
322
+ "product_id": "B08HTNC8PJ",
323
+ "explanation": "I cannot provide a confident recommendation for this product based on the available review evidence. Only 1 review excerpt(s) were found, which is insufficient to make a well-grounded recommendation for your query about \"gift for someone who likes music\".",
324
+ "evidence_texts": [
325
+ "It was a gift for my niece. She likes these to listen to music while at home."
326
+ ],
327
+ "hhem_score": 0.2865367531776428,
328
+ "is_hallucinated": true
329
+ }
330
+ ],
331
+ "n_failures": 23
332
+ }
data/eval_results/faithfulness_20260210_115238.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "timestamp": "2026-02-10T11:52:38.150536",
3
+ "n_samples": 10,
4
+ "hhem": {
5
+ "mean_score": 0.43166339099407197,
6
+ "n_hallucinated": 8,
7
+ "hallucination_rate": 0.8
8
+ },
9
+ "multi_metric": {
10
+ "quote_verification_rate": 0.8260869565217391,
11
+ "quotes_found": 19,
12
+ "quotes_total": 23,
13
+ "claim_level_pass_rate": 1.0,
14
+ "claim_level_avg_score": 0.9676379120868185,
15
+ "claim_level_min_score": 0.804991602897644,
16
+ "full_explanation_pass_rate": 0.2,
17
+ "full_explanation_avg_score": 0.43166339099407197
18
+ },
19
+ "target": 0.85,
20
+ "ragas": {
21
+ "faithfulness_mean": 0.5,
22
+ "faithfulness_std": 0.5270462766947299
23
+ },
24
+ "ragas_limitations": {
25
+ "metrics_available": [
26
+ "faithfulness"
27
+ ],
28
+ "metrics_unavailable": {
29
+ "answer_relevancy": "Requires embeddings model; RAGAS doesn't support Anthropic as embeddings provider",
30
+ "context_precision": "Requires ground-truth reference answers per query (not available)",
31
+ "context_recall": "Requires ground-truth reference answers per query (not available)"
32
+ },
33
+ "primary_metric": "claim_level_hhem",
34
+ "rationale": "Claim-level HHEM (96.8%) is more reliable than full-explanation RAGAS for citation-heavy explanations"
35
+ }
36
+ }
data/eval_results/faithfulness_latest.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "timestamp": "2026-02-10T11:52:38.150536",
3
+ "n_samples": 10,
4
+ "hhem": {
5
+ "mean_score": 0.43166339099407197,
6
+ "n_hallucinated": 8,
7
+ "hallucination_rate": 0.8
8
+ },
9
+ "multi_metric": {
10
+ "quote_verification_rate": 0.8260869565217391,
11
+ "quotes_found": 19,
12
+ "quotes_total": 23,
13
+ "claim_level_pass_rate": 1.0,
14
+ "claim_level_avg_score": 0.9676379120868185,
15
+ "claim_level_min_score": 0.804991602897644,
16
+ "full_explanation_pass_rate": 0.2,
17
+ "full_explanation_avg_score": 0.43166339099407197
18
+ },
19
+ "target": 0.85,
20
+ "ragas": {
21
+ "faithfulness_mean": 0.5,
22
+ "faithfulness_std": 0.5270462766947299
23
+ },
24
+ "ragas_limitations": {
25
+ "metrics_available": [
26
+ "faithfulness"
27
+ ],
28
+ "metrics_unavailable": {
29
+ "answer_relevancy": "Requires embeddings model; RAGAS doesn't support Anthropic as embeddings provider",
30
+ "context_precision": "Requires ground-truth reference answers per query (not available)",
31
+ "context_recall": "Requires ground-truth reference answers per query (not available)"
32
+ },
33
+ "primary_metric": "claim_level_hhem",
34
+ "rationale": "Claim-level HHEM (96.8%) is more reliable than full-explanation RAGAS for citation-heavy explanations"
35
+ }
36
+ }
data/eval_results/grounding_delta_20260210_115418.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "n_samples": 10,
3
+ "with_evidence_mean": 0.7308747053146363,
4
+ "without_evidence_mean": 0.026432324294000865,
5
+ "delta": 0.7044423810206354,
6
+ "with_evidence_scores": [
7
+ 0.9404566287994385,
8
+ 0.8414536714553833,
9
+ 0.5394061207771301,
10
+ 0.11840134859085083,
11
+ 0.5561886429786682,
12
+ 0.9055582284927368,
13
+ 0.9347118735313416,
14
+ 0.8819715976715088,
15
+ 0.815461277961731,
16
+ 0.7751376628875732
17
+ ],
18
+ "without_evidence_scores": [
19
+ 0.03228604421019554,
20
+ 0.021040642634034157,
21
+ 0.08896809816360474,
22
+ 0.022938460111618042,
23
+ 0.024911534041166306,
24
+ 0.01653069630265236,
25
+ 0.011862579733133316,
26
+ 0.01837613806128502,
27
+ 0.01632498949766159,
28
+ 0.011084060184657574
29
+ ]
30
+ }
data/eval_results/grounding_delta_latest.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "n_samples": 10,
3
+ "with_evidence_mean": 0.7308747053146363,
4
+ "without_evidence_mean": 0.026432324294000865,
5
+ "delta": 0.7044423810206354,
6
+ "with_evidence_scores": [
7
+ 0.9404566287994385,
8
+ 0.8414536714553833,
9
+ 0.5394061207771301,
10
+ 0.11840134859085083,
11
+ 0.5561886429786682,
12
+ 0.9055582284927368,
13
+ 0.9347118735313416,
14
+ 0.8819715976715088,
15
+ 0.815461277961731,
16
+ 0.7751376628875732
17
+ ],
18
+ "without_evidence_scores": [
19
+ 0.03228604421019554,
20
+ 0.021040642634034157,
21
+ 0.08896809816360474,
22
+ 0.022938460111618042,
23
+ 0.024911534041166306,
24
+ 0.01653069630265236,
25
+ 0.011862579733133316,
26
+ 0.01837613806128502,
27
+ 0.01632498949766159,
28
+ 0.011084060184657574
29
+ ]
30
+ }
data/eval_results/human_eval_20260210_124705.json ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "timestamp": "2026-02-10T12:47:05.801626",
3
+ "n_samples": 50,
4
+ "n_total": 50,
5
+ "methodology": {
6
+ "evaluator": "Single rater (developer/researcher)",
7
+ "instructions": "Rate each dimension 1-5 Likert: 1=strongly disagree, 5=strongly agree",
8
+ "dimensions": {
9
+ "comprehension": "I understood why this item was recommended",
10
+ "trust": "I trust this explanation is accurate",
11
+ "usefulness": "This explanation helped me make a decision",
12
+ "satisfaction": "I am satisfied with this explanation"
13
+ },
14
+ "sample_selection": "35 natural queries (balanced by category) + 15 config queries",
15
+ "inter_annotator_agreement": "N/A (single rater)"
16
+ },
17
+ "dimensions": {
18
+ "comprehension": {
19
+ "mean": 4.3,
20
+ "std": 0.68,
21
+ "min": 3,
22
+ "max": 5
23
+ },
24
+ "trust": {
25
+ "mean": 4.02,
26
+ "std": 0.71,
27
+ "min": 2,
28
+ "max": 5
29
+ },
30
+ "usefulness": {
31
+ "mean": 3.06,
32
+ "std": 1.72,
33
+ "min": 1,
34
+ "max": 5
35
+ },
36
+ "satisfaction": {
37
+ "mean": 3.04,
38
+ "std": 1.67,
39
+ "min": 1,
40
+ "max": 5
41
+ }
42
+ },
43
+ "overall_helpfulness": 3.6,
44
+ "target": 3.5,
45
+ "pass": true,
46
+ "hhem_trust_correlation": {
47
+ "spearman_r": 0.1237,
48
+ "p_value": 0.3919
49
+ }
50
+ }
data/eval_results/human_eval_latest.json ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "timestamp": "2026-02-10T12:47:05.801626",
3
+ "n_samples": 50,
4
+ "n_total": 50,
5
+ "methodology": {
6
+ "evaluator": "Single rater (developer/researcher)",
7
+ "instructions": "Rate each dimension 1-5 Likert: 1=strongly disagree, 5=strongly agree",
8
+ "dimensions": {
9
+ "comprehension": "I understood why this item was recommended",
10
+ "trust": "I trust this explanation is accurate",
11
+ "usefulness": "This explanation helped me make a decision",
12
+ "satisfaction": "I am satisfied with this explanation"
13
+ },
14
+ "sample_selection": "35 natural queries (balanced by category) + 15 config queries",
15
+ "inter_annotator_agreement": "N/A (single rater)"
16
+ },
17
+ "dimensions": {
18
+ "comprehension": {
19
+ "mean": 4.3,
20
+ "std": 0.68,
21
+ "min": 3,
22
+ "max": 5
23
+ },
24
+ "trust": {
25
+ "mean": 4.02,
26
+ "std": 0.71,
27
+ "min": 2,
28
+ "max": 5
29
+ },
30
+ "usefulness": {
31
+ "mean": 3.06,
32
+ "std": 1.72,
33
+ "min": 1,
34
+ "max": 5
35
+ },
36
+ "satisfaction": {
37
+ "mean": 3.04,
38
+ "std": 1.67,
39
+ "min": 1,
40
+ "max": 5
41
+ }
42
+ },
43
+ "overall_helpfulness": 3.6,
44
+ "target": 3.5,
45
+ "pass": true,
46
+ "hhem_trust_correlation": {
47
+ "spearman_r": 0.1237,
48
+ "p_value": 0.3919
49
+ }
50
+ }
data/eval_results/load_test_20260210_115634.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "timestamp": "2026-02-10T11:56:34.441501",
3
+ "config": {
4
+ "url": "https://vxa8502-sage.hf.space",
5
+ "num_requests": 50,
6
+ "explain": true,
7
+ "timeout_s": 30.0
8
+ },
9
+ "total_requests": 50,
10
+ "successful": 50,
11
+ "errors": 0,
12
+ "cache_hits": 41,
13
+ "min_ms": 74.0,
14
+ "max_ms": 431.6,
15
+ "mean_ms": 97.8,
16
+ "median_ms": 88.4,
17
+ "p50_ms": 88.4,
18
+ "p95_ms": 121.5,
19
+ "p99_ms": 283.1,
20
+ "stdev_ms": 49.6,
21
+ "target_p99_ms": 500.0,
22
+ "pass": true
23
+ }
data/eval_results/load_test_latest.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "timestamp": "2026-02-10T11:56:34.441501",
3
+ "config": {
4
+ "url": "https://vxa8502-sage.hf.space",
5
+ "num_requests": 50,
6
+ "explain": true,
7
+ "timeout_s": 30.0
8
+ },
9
+ "total_requests": 50,
10
+ "successful": 50,
11
+ "errors": 0,
12
+ "cache_hits": 41,
13
+ "min_ms": 74.0,
14
+ "max_ms": 431.6,
15
+ "mean_ms": 97.8,
16
+ "median_ms": 88.4,
17
+ "p50_ms": 88.4,
18
+ "p95_ms": 121.5,
19
+ "p99_ms": 283.1,
20
+ "stdev_ms": 49.6,
21
+ "target_p99_ms": 500.0,
22
+ "pass": true
23
+ }
reports/eda_report.md CHANGED
@@ -23,7 +23,7 @@ This report analyzes the actual data deployed in production, ensuring all statis
23
 
24
  Amazon reviews exhibit a characteristic J-shaped distribution, heavily skewed toward 5-star ratings.
25
 
26
- ![Rating Distribution](https://raw.githubusercontent.com/vxa8502/sage/assets-only/rating_distribution.png)
27
 
28
  | Rating | Count | Percentage |
29
  |--------|-------|------------|
@@ -44,7 +44,7 @@ Amazon reviews exhibit a characteristic J-shaped distribution, heavily skewed to
44
 
45
  Chunk lengths affect retrieval quality and context window usage.
46
 
47
- ![Chunk Lengths](https://raw.githubusercontent.com/vxa8502/sage/assets-only/chunk_lengths.png)
48
 
49
  **Statistics:**
50
  - Median chunk length: 169 characters (~42 tokens)
@@ -57,7 +57,7 @@ Chunk lengths affect retrieval quality and context window usage.
57
 
58
  Reviews are chunked based on length: short reviews stay whole, longer reviews are split semantically.
59
 
60
- ![Chunks per Review](https://raw.githubusercontent.com/vxa8502/sage/assets-only/chunks_per_review.png)
61
 
62
  | Metric | Value |
63
  |--------|-------|
@@ -76,7 +76,7 @@ Reviews are chunked based on length: short reviews stay whole, longer reviews ar
76
 
77
  Review timestamps enable chronological analysis and temporal evaluation splits.
78
 
79
- ![Temporal Distribution](https://raw.githubusercontent.com/vxa8502/sage/assets-only/temporal_distribution.png)
80
 
81
  ---
82
 
 
23
 
24
  Amazon reviews exhibit a characteristic J-shaped distribution, heavily skewed toward 5-star ratings.
25
 
26
+ ![Rating Distribution](../assets/rating_distribution.png)
27
 
28
  | Rating | Count | Percentage |
29
  |--------|-------|------------|
 
44
 
45
  Chunk lengths affect retrieval quality and context window usage.
46
 
47
+ ![Chunk Lengths](../assets/chunk_lengths.png)
48
 
49
  **Statistics:**
50
  - Median chunk length: 169 characters (~42 tokens)
 
57
 
58
  Reviews are chunked based on length: short reviews stay whole, longer reviews are split semantically.
59
 
60
+ ![Chunks per Review](../assets/chunks_per_review.png)
61
 
62
  | Metric | Value |
63
  |--------|-------|
 
76
 
77
  Review timestamps enable chronological analysis and temporal evaluation splits.
78
 
79
+ ![Temporal Distribution](../assets/temporal_distribution.png)
80
 
81
  ---
82
 
sage/data/__init__.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Sage data loading and preprocessing module.
3
+
4
+ Provides utilities for loading Amazon Reviews dataset from HuggingFace,
5
+ cleaning, filtering, and preparing data for the recommendation pipeline.
6
+ """
7
+
8
+ from sage.data.loader import (
9
+ calculate_sparsity,
10
+ clean_reviews,
11
+ create_temporal_splits,
12
+ filter_5_core,
13
+ get_review_stats,
14
+ load_reviews,
15
+ load_splits,
16
+ prepare_data,
17
+ validate_reviews,
18
+ verify_temporal_boundaries,
19
+ )
20
+
21
+ from sage.data.eval import load_eval_cases
22
+
23
+ __all__ = [
24
+ "load_reviews",
25
+ "filter_5_core",
26
+ "get_review_stats",
27
+ "validate_reviews",
28
+ "clean_reviews",
29
+ "prepare_data",
30
+ "calculate_sparsity",
31
+ "create_temporal_splits",
32
+ "verify_temporal_boundaries",
33
+ "load_splits",
34
+ "load_eval_cases",
35
+ ]
sage/data/eval.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Evaluation dataset loading utilities.
3
+ """
4
+
5
+ import json
6
+
7
+ from sage.config import DATA_DIR
8
+ from sage.core import EvalCase
9
+
10
+
11
+ EVAL_DIR = DATA_DIR / "eval"
12
+
13
+
14
def load_eval_cases(filename: str) -> list[EvalCase]:
    """
    Load evaluation cases from a JSON file in the eval directory.

    Args:
        filename: Name of the JSON file, resolved relative to EVAL_DIR.

    Returns:
        List of EvalCase objects, one per entry in the file. The optional
        "user_id" field is passed as None when an entry does not have it.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If an entry lacks "query" or "relevant_items".
    """
    filepath = EVAL_DIR / filename

    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, which differs between operating systems.
    with open(filepath, encoding="utf-8") as f:
        data = json.load(f)

    return [
        EvalCase(
            query=d["query"],
            relevant_items=d["relevant_items"],
            user_id=d.get("user_id"),
        )
        for d in data
    ]
sage/data/loader.py ADDED
@@ -0,0 +1,479 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Load and preprocess Amazon Reviews dataset from HuggingFace.
3
+ """
4
+
5
+ import json
6
+ from pathlib import Path
7
+
8
+ import pandas as pd
9
+ import requests
10
+ from tqdm import tqdm
11
+
12
+ from sage.config import (
13
+ DATASET_NAME,
14
+ DATASET_CATEGORY,
15
+ HF_TOKEN,
16
+ DATA_DIR,
17
+ get_logger,
18
+ )
19
+
20
+ logger = get_logger(__name__)
21
+
22
+ SPLITS_DIR = DATA_DIR / "splits"
23
+
24
+ # Base URL for HuggingFace dataset files
25
+ HF_BASE_URL = "https://huggingface.co/datasets"
26
+
27
+
28
def load_reviews(
    subset_size: int | None = None, use_cache: bool = True
) -> pd.DataFrame:
    """
    Load Amazon Reviews from HuggingFace by streaming JSONL.

    Streams the file and reads only the requested number of lines
    to avoid downloading the full 22GB file.

    Args:
        subset_size: Number of reviews to load. None loads the first
            100,000 reviews and caches them under the "full" label.
        use_cache: Whether to use cached parquet if available.

    Returns:
        DataFrame with review data.

    Raises:
        requests.HTTPError: If the dataset URL returns an error status.
    """
    # Only None maps to the "full" cache label; an explicit 0 must not
    # collide with it.
    cache_label = "full" if subset_size is None else subset_size
    cache_path = DATA_DIR / f"reviews_{cache_label}.parquet"

    if use_cache and cache_path.exists():
        logger.info("Loading from cache: %s", cache_path)
        return pd.read_parquet(cache_path)

    # Build the URL for streaming
    category_name = DATASET_CATEGORY.replace("raw_review_", "")
    url = f"{HF_BASE_URL}/{DATASET_NAME}/resolve/main/raw/review_categories/{category_name}.jsonl"

    logger.info("Streaming from %s", url)

    headers = {}
    if HF_TOKEN:
        headers["Authorization"] = f"Bearer {HF_TOKEN}"

    reviews = []
    target = subset_size if subset_size is not None else 100_000

    with requests.get(url, headers=headers, stream=True) as response:
        response.raise_for_status()

        # The context manager closes the progress bar even when streaming
        # fails part-way through.
        with tqdm(total=target, desc="Loading reviews") as pbar:
            for line in response.iter_lines():
                # Checked before parsing so a target of 0 loads nothing.
                if len(reviews) >= target:
                    break
                if not line:
                    continue

                try:
                    review = json.loads(line.decode("utf-8"))
                except json.JSONDecodeError as e:
                    logger.debug("Skipping malformed JSON line: %s", e)
                    continue

                reviews.append(review)
                pbar.update(1)

    logger.info("Loaded %s reviews", f"{len(reviews):,}")
    df = pd.DataFrame(reviews)

    # Cache for future use; make sure the target directory exists first.
    cache_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_parquet(cache_path)
    logger.info("Cached to %s", cache_path)

    return df
91
+
92
+
93
def filter_5_core(df: pd.DataFrame, min_interactions: int = 5) -> pd.DataFrame:
    """
    Apply k-core filtering (5-core by default).

    Repeatedly drops users and then items that have fewer than
    min_interactions reviews, until a full pass removes nothing.

    Args:
        df: DataFrame with 'user_id' and 'parent_asin' columns.
        min_interactions: Minimum number of reviews a user or item must have.

    Returns:
        Filtered DataFrame with a fresh 0-based index.
    """
    iteration = 0

    while True:
        size_before = len(df)
        iteration += 1

        # Users are filtered first, then items, matching the column order here.
        for column in ("user_id", "parent_asin"):
            counts = df[column].value_counts()
            keep = counts[counts >= min_interactions].index
            df = df[df[column].isin(keep)]

        logger.debug(" Iteration %d: %s reviews remaining", iteration, f"{len(df):,}")

        # Converged: this pass removed no rows.
        if len(df) >= size_before:
            break

    return df.reset_index(drop=True)
125
+
126
+
127
def get_review_stats(df: pd.DataFrame) -> dict:
    """
    Summarize a reviews DataFrame.

    Returns a dict with the review count, unique user and item counts,
    sparsity, mean rating, rating histogram, mean review length in
    characters, and the percentage of verified purchases (None when the
    'verified_purchase' column is absent).
    """
    if "verified_purchase" in df.columns:
        verified_pct = df["verified_purchase"].mean() * 100
    else:
        verified_pct = None

    stats = {
        "total_reviews": len(df),
        "unique_users": df["user_id"].nunique(),
        "unique_items": df["parent_asin"].nunique(),
        "sparsity": calculate_sparsity(df),
        "avg_rating": df["rating"].mean(),
        "rating_dist": df["rating"].value_counts().sort_index().to_dict(),
        "avg_review_length": df["text"].str.len().mean(),
        "verified_pct": verified_pct,
    }
    return stats
148
+
149
+
150
def validate_reviews(df: pd.DataFrame) -> dict:
    """
    Run data quality checks on the reviews dataset.

    Args:
        df: DataFrame with 'text', 'rating', 'user_id' and 'parent_asin'
            columns.

    Returns:
        Dict with:
            total_reviews: Number of rows in df.
            issues_found: True if any check flagged at least one row.
            issues: Mapping of issue name to the number of rows flagged,
                containing only the checks that flagged something.
            clean_reviews: Number of rows not flagged by any check.
    """
    text = df["text"]

    # One boolean mask per check; the insertion order fixes the order of
    # the keys in the returned "issues" dict.
    checks = {
        "missing_text": text.isna(),
        "empty_text": text.str.strip() == "",
        # Very short reviews are likely not useful.
        "very_short": text.str.len() < 10,
        "duplicate_texts": text.duplicated(),
        "invalid_ratings": ~df["rating"].between(1, 5),
        "missing_user_id": df["user_id"].isna(),
        "missing_parent_asin": df["parent_asin"].isna(),
    }

    issues = {}
    flagged = pd.Series(False, index=df.index)

    for name, mask in checks.items():
        count = int(mask.sum())
        if count > 0:
            issues[name] = count
            flagged = flagged | mask

    # Count flagged rows through the union of the masks. Summing the
    # per-check counts would count a row once per failed check (an empty
    # text is also "very short") and could make this value negative.
    return {
        "total_reviews": len(df),
        "issues_found": len(issues) > 0,
        "issues": issues,
        "clean_reviews": len(df) - int(flagged.sum()),
    }
196
+
197
+
198
def clean_reviews(df: pd.DataFrame, verbose: bool = True) -> pd.DataFrame:
    """
    Clean the reviews dataset by removing problematic entries.

    Removes:
    - Reviews with missing or empty (whitespace-only) text
    - Reviews with very short text (<10 chars)
    - Reviews with ratings outside 1-5
    - Reviews with a missing user_id or parent_asin

    Args:
        df: Raw Reviews DataFrame.
        verbose: Log a cleaning summary.

    Returns:
        Cleaned DataFrame with a fresh 0..n-1 index.
    """
    original_len = len(df)

    # Remove missing/empty text. Missing values go first so the string
    # operations below only see real text.
    df = df[df["text"].notna()]
    df = df[df["text"].str.strip() != ""]

    # Remove very short reviews
    df = df[df["text"].str.len() >= 10]

    # Remove invalid ratings
    df = df[df["rating"].between(1, 5)]

    # Remove missing identifiers
    df = df[df["user_id"].notna()]
    df = df[df["parent_asin"].notna()]

    df = df.reset_index(drop=True)

    if verbose:
        removed = original_len - len(df)
        # An empty input has nothing to remove; avoid dividing by zero.
        removed_pct = removed / original_len * 100 if original_len else 0.0
        logger.info(
            "Cleaned: removed %s reviews (%.1f%%)",
            f"{removed:,}",
            removed_pct,
        )
        logger.info("Remaining: %s reviews", f"{len(df):,}")

    return df
242
+
243
+
244
def prepare_data(
    subset_size: int,
    min_interactions: int = 5,
    force: bool = False,
    verbose: bool = True,
) -> pd.DataFrame:
    """
    Load, clean, and filter reviews. Single source of truth for prepared data.

    This is the canonical way to get model-ready data. All scripts should
    use this function to ensure consistency.

    The prepared result is cached as a parquet file under DATA_DIR. The cache
    file name depends on both ``subset_size`` and ``min_interactions`` so that
    a call with a different filtering threshold never returns data that was
    filtered with another one. The default threshold (5) keeps the original
    ``reviews_prepared_{subset_size}.parquet`` name so existing caches and
    tooling keep working.

    Args:
        subset_size: Number of raw reviews to start with.
        min_interactions: Minimum interactions for 5-core filtering.
        force: If True, rebuild from scratch (ignores and overwrites cache).
        verbose: Print progress.

    Returns:
        Cleaned and filtered DataFrame ready for chunking/embedding.
    """
    if min_interactions == 5:
        prepared_name = f"reviews_prepared_{subset_size}.parquet"
    else:
        # A non-default threshold gets its own cache file to avoid stale hits.
        prepared_name = (
            f"reviews_prepared_{subset_size}_core{min_interactions}.parquet"
        )
    cache_path = DATA_DIR / prepared_name
    # NOTE(review): presumably the file load_reviews() writes its raw cache to;
    # confirm against load_reviews().
    raw_cache_path = DATA_DIR / f"reviews_{subset_size}.parquet"

    # Handle cache invalidation
    if force:
        if cache_path.exists():
            cache_path.unlink()
            if verbose:
                logger.info("Cleared prepared data cache: %s", cache_path.name)
        if raw_cache_path.exists():
            raw_cache_path.unlink()
            if verbose:
                logger.info("Cleared raw data cache: %s", raw_cache_path.name)

    # Use cache if available
    if cache_path.exists():
        if verbose:
            logger.info("Loading prepared data from cache: %s", cache_path)
        df = pd.read_parquet(cache_path)
        if verbose:
            logger.info("Loaded %s prepared reviews", f"{len(df):,}")
        return df

    if verbose:
        logger.info("Preparing data from scratch...")

    # Load raw
    df = load_reviews(subset_size=subset_size, use_cache=True)

    # Clean
    if verbose:
        logger.info("Cleaning data quality issues...")
    df = clean_reviews(df, verbose=verbose)

    # 5-core filter
    if verbose:
        logger.info("Applying 5-core filtering...")
    df = filter_5_core(df, min_interactions=min_interactions)

    if verbose:
        logger.info("Final prepared dataset: %s reviews", f"{len(df):,}")

    # Cache prepared data (make sure the target directory exists first)
    cache_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_parquet(cache_path)
    if verbose:
        logger.info("Cached prepared data to: %s", cache_path)

    return df
313
+
314
+
315
def calculate_sparsity(df: pd.DataFrame) -> float:
    """
    Calculate interaction matrix sparsity.

    Sparsity = 1 - (n_interactions / (n_users * n_items))

    where n_interactions is the number of distinct (user, item) pairs, i.e.
    the number of filled cells in the user-item matrix. Several reviews by
    the same user for the same item fill a single cell, so they are counted
    once; counting every row instead could push the result below 0.

    A value of 0.99 means 99% of possible user-item pairs have no interaction.
    Recommendation datasets are typically 99%+ sparse.

    Args:
        df: DataFrame with 'user_id' and 'parent_asin' columns.

    Returns:
        Sparsity as a float between 0 and 1. Returns 1.0 when there are no
        users or no items.
    """
    n_users = df["user_id"].nunique()
    n_items = df["parent_asin"].nunique()

    if n_users == 0 or n_items == 0:
        return 1.0

    # nunique() ignores missing ids, so drop rows with a missing id here too
    # before counting the distinct filled cells.
    pairs = df[["user_id", "parent_asin"]].dropna().drop_duplicates()
    n_interactions = len(pairs)

    density = n_interactions / (n_users * n_items)
    return 1 - density
339
+
340
+
341
def create_temporal_splits(
    df: pd.DataFrame,
    train_ratio: float = 0.7,
    val_ratio: float = 0.1,
    save: bool = True,
    verbose: bool = True,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Split data temporally for recommendation evaluation.

    Reviews are sorted by timestamp and split chronologically by row
    position: the first ``train_ratio`` of rows form the training split, the
    next ``val_ratio`` the validation split, and the remainder the test split.

    Args:
        df: Prepared DataFrame with 'timestamp' column.
        train_ratio: Fraction of data for training (default 0.7).
        val_ratio: Fraction of data for validation (default 0.1).
        save: Whether to save splits to disk (parquet files in SPLITS_DIR).
        verbose: Print split statistics.

    Returns:
        Tuple of (train_df, val_df, test_df), each with a fresh 0..n-1 index.
    """
    # A stable sort keeps the input order of rows with equal timestamps, so
    # the split is reproducible from run to run.
    df = df.sort_values("timestamp", kind="mergesort").reset_index(drop=True)

    n = len(df)
    # Binary floating point makes e.g. 10 * (0.7 + 0.1) == 7.999999999999999.
    # Plain int() truncation would then cut at 7 instead of 8 and leave the
    # validation split empty. Rounding away the representation error first
    # yields the intended floor.
    train_end = int(round(n * train_ratio, 6))
    val_end = int(round(n * (train_ratio + val_ratio), 6))

    train_df = df.iloc[:train_end].reset_index(drop=True)
    val_df = df.iloc[train_end:val_end].reset_index(drop=True)
    test_df = df.iloc[val_end:].reset_index(drop=True)

    if verbose:
        logger.info(
            "Temporal splits (%.0f%%/%.0f%%/%.0f%%):",
            train_ratio * 100,
            val_ratio * 100,
            (1 - train_ratio - val_ratio) * 100,
        )
        logger.info("  Train: %s reviews", f"{len(train_df):,}")
        logger.info("  Val:   %s reviews", f"{len(val_df):,}")
        logger.info("  Test:  %s reviews", f"{len(test_df):,}")

    if save:
        # parents=True so a missing parent data directory does not abort the save.
        SPLITS_DIR.mkdir(parents=True, exist_ok=True)
        train_df.to_parquet(SPLITS_DIR / "train.parquet")
        val_df.to_parquet(SPLITS_DIR / "val.parquet")
        test_df.to_parquet(SPLITS_DIR / "test.parquet")
        if verbose:
            logger.info("  Saved to: %s", SPLITS_DIR)

    return train_df, val_df, test_df
395
+
396
+
397
def verify_temporal_boundaries(
    train_df: pd.DataFrame,
    val_df: pd.DataFrame,
    test_df: pd.DataFrame,
    verbose: bool = True,
) -> dict:
    """
    Verify no temporal leakage across split boundaries.

    Checks that all training timestamps strictly precede validation
    timestamps, and all validation timestamps strictly precede test
    timestamps.

    The checks raise AssertionError explicitly rather than using ``assert``
    statements, so they still run when Python is started with ``-O``.

    Args:
        train_df: Training split.
        val_df: Validation split.
        test_df: Test split.
        verbose: Print boundary information.

    Returns:
        Dict mapping 'train', 'val' and 'test' to (min_timestamp,
        max_timestamp) integer tuples.

    Raises:
        AssertionError: If temporal boundaries overlap, or if a split is
            empty (its boundaries cannot be determined).
    """
    # An empty split has no min/max timestamp; report that directly instead
    # of a confusing "overlap" message containing NaN.
    for split_name, split_df in (
        ("train", train_df),
        ("val", val_df),
        ("test", test_df),
    ):
        if len(split_df) == 0:
            raise AssertionError(
                f"Cannot verify temporal boundaries: {split_name} split is empty"
            )

    train_max = train_df["timestamp"].max()
    val_min = val_df["timestamp"].min()
    val_max = val_df["timestamp"].max()
    test_min = test_df["timestamp"].min()

    if not train_max < val_min:
        raise AssertionError(
            f"Train/val overlap! Train max: {train_max}, Val min: {val_min}"
        )
    if not val_max < test_min:
        raise AssertionError(
            f"Val/test overlap! Val max: {val_max}, Test min: {test_min}"
        )

    boundaries = {
        "train": (int(train_df["timestamp"].min()), int(train_max)),
        "val": (int(val_min), int(val_max)),
        "test": (int(test_min), int(test_df["timestamp"].max())),
    }

    if verbose:
        logger.info("Temporal boundaries verified (no leakage):")
        for split, (start, end) in boundaries.items():
            # Timestamps are interpreted as milliseconds since the epoch.
            start_date = pd.to_datetime(start, unit="ms").strftime("%Y-%m-%d")
            end_date = pd.to_datetime(end, unit="ms").strftime("%Y-%m-%d")
            logger.info(
                "  %s: %s to %s", split.capitalize().ljust(5), start_date, end_date
            )

    return boundaries
449
+
450
+
451
def load_splits(
    splits_dir: Path = SPLITS_DIR,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Read the train/val/test parquet files from a splits directory.

    Args:
        splits_dir: Directory expected to contain train.parquet,
            val.parquet and test.parquet.

    Returns:
        Tuple of (train_df, val_df, test_df).

    Raises:
        FileNotFoundError: If any of the three split files is missing.
    """
    split_paths = [
        splits_dir / file_name
        for file_name in ("train.parquet", "val.parquet", "test.parquet")
    ]

    # Check every file up front so nothing is read when a split is missing.
    for path in split_paths:
        if not path.exists():
            raise FileNotFoundError(
                f"Splits not found in {splits_dir}. Run create_temporal_splits() first."
            )

    train_df, val_df, test_df = (pd.read_parquet(path) for path in split_paths)
    return train_df, val_df, test_df
scripts/build_natural_eval_dataset.py CHANGED
@@ -189,12 +189,13 @@ NATURAL_QUERIES = [
189
  "intent": "feature_specific",
190
  },
191
  # === STORAGE QUERIES ===
 
192
  {
193
  "query": "sd card for camera",
194
  "relevant_items": {
195
- "B071R715MZ": 3.0, # SD card
196
- "B006GWO5WK": 3.0, # SD card high rating
197
- "B08KG14KCT": 2.0, # Storage
198
  },
199
  "category": "storage",
200
  "intent": "use_case",
@@ -202,9 +203,9 @@ NATURAL_QUERIES = [
202
  {
203
  "query": "external hard drive for backup",
204
  "relevant_items": {
205
- "B008J0Z9TA": 3.0, # Hard drive for Mac
206
  "B09Q7YPZPJ": 2.0, # Storage device
207
- "B07P9V8GSH": 2.0, # Storage
208
  },
209
  "category": "storage",
210
  "intent": "use_case",
@@ -212,8 +213,9 @@ NATURAL_QUERIES = [
212
  {
213
  "query": "fast micro sd card for phone",
214
  "relevant_items": {
215
- "B071R715MZ": 3.0, # MicroSD
216
- "B006GWO5WK": 3.0, # SD card
 
217
  },
218
  "category": "storage",
219
  "intent": "feature_specific",
@@ -221,9 +223,9 @@ NATURAL_QUERIES = [
221
  {
222
  "query": "reliable storage for important files",
223
  "relevant_items": {
224
- "B008J0Z9TA": 3.0, # Drive for storage
225
- "B006GWO5WK": 2.0, # SD card
226
- "B071R715MZ": 2.0, # Storage
227
  },
228
  "category": "storage",
229
  "intent": "feature_specific",
@@ -279,12 +281,13 @@ NATURAL_QUERIES = [
279
  "intent": "use_case",
280
  },
281
  # === CABLES / ADAPTERS QUERIES ===
 
282
  {
283
  "query": "usb c charging cable",
284
  "relevant_items": {
285
- "B0BGNG1294": 3.0, # USB cable high rating
286
- "B09TWVB2TH": 2.0, # Cable
287
- "B09Y1PSVTB": 2.0, # USB related
288
  },
289
  "category": "cables_adapters",
290
  "intent": "general",
@@ -292,8 +295,9 @@ NATURAL_QUERIES = [
292
  {
293
  "query": "hdmi cable for tv",
294
  "relevant_items": {
295
- "B01LXJA5JD": 3.0, # HDMI/streaming cable
296
- "B0BGNG1294": 2.0, # Cable
 
297
  },
298
  "category": "cables_adapters",
299
  "intent": "use_case",
@@ -301,8 +305,9 @@ NATURAL_QUERIES = [
301
  {
302
  "query": "fast phone charger",
303
  "relevant_items": {
304
- "B0BGNG1294": 3.0, # Charger cable
305
- "B09TWVB2TH": 2.0, # Charging cable
 
306
  },
307
  "category": "cables_adapters",
308
  "intent": "feature_specific",
@@ -310,19 +315,21 @@ NATURAL_QUERIES = [
310
  {
311
  "query": "durable charging cable that lasts",
312
  "relevant_items": {
313
- "B0BGNG1294": 3.0, # High rated cable
314
- "B09TWVB2TH": 2.0, # Cable
 
315
  },
316
  "category": "cables_adapters",
317
  "intent": "feature_specific",
318
  },
319
  # === KEYBOARD / MOUSE QUERIES ===
 
320
  {
321
  "query": "wireless keyboard for computer",
322
  "relevant_items": {
323
- "B003NR57BY": 3.0, # Keyboard
324
- "B0043T7FXE": 2.0, # Keyboard
325
- "B003VAHYNC": 2.0, # Keyboard
326
  },
327
  "category": "keyboards_mice",
328
  "intent": "general",
@@ -330,9 +337,9 @@ NATURAL_QUERIES = [
330
  {
331
  "query": "quiet keyboard for office",
332
  "relevant_items": {
333
- "B095JX15XF": 3.0, # Office keyboard
334
- "B07HZLHPKP": 2.0, # Small office keyboard
335
- "B003NR57BY": 2.0, # Keyboard
336
  },
337
  "category": "keyboards_mice",
338
  "intent": "use_case",
@@ -340,8 +347,9 @@ NATURAL_QUERIES = [
340
  {
341
  "query": "compact keyboard for small desk",
342
  "relevant_items": {
343
- "B07HZLHPKP": 3.0, # Small keyboard for office/den
344
- "B095JX15XF": 2.0, # Keyboard
 
345
  },
346
  "category": "keyboards_mice",
347
  "intent": "feature_specific",
 
189
  "intent": "feature_specific",
190
  },
191
  # === STORAGE QUERIES ===
192
+ # NOTE: Products verified via review text (Session 78 ground truth audit)
193
  {
194
  "query": "sd card for camera",
195
  "relevant_items": {
196
+ "B071R715MZ": 3.0, # SD card (verified: "tablet loaded with topo maps")
197
+ "B00PFQPX5Q": 3.0, # SD card (verified: "raspberry pi")
198
+ "B01H3LM75O": 2.0, # SD card (verified: "record 4k video")
199
  },
200
  "category": "storage",
201
  "intent": "use_case",
 
203
  {
204
  "query": "external hard drive for backup",
205
  "relevant_items": {
206
+ "B008J0Z9TA": 3.0, # Hard drive (verified: "Fast reliable")
207
  "B09Q7YPZPJ": 2.0, # Storage device
208
+ "B07P9V8GSH": 2.0, # Storage (verified: "Raspberry Pi")
209
  },
210
  "category": "storage",
211
  "intent": "use_case",
 
213
  {
214
  "query": "fast micro sd card for phone",
215
  "relevant_items": {
216
+ "B071R715MZ": 3.0, # MicroSD (verified: tablet storage)
217
+ "B009RWAYTE": 3.0, # MicroSD (verified: "Galaxy phone")
218
+ "B00IVPU786": 2.0, # MicroSD (verified: "Works as expected")
219
  },
220
  "category": "storage",
221
  "intent": "feature_specific",
 
223
  {
224
  "query": "reliable storage for important files",
225
  "relevant_items": {
226
+ "B008J0Z9TA": 3.0, # Hard drive (verified: "Fast reliable")
227
+ "B071R715MZ": 2.0, # SD card (verified)
228
+ "B01H3LM75O": 2.0, # SD card (verified: "4k video")
229
  },
230
  "category": "storage",
231
  "intent": "feature_specific",
 
281
  "intent": "use_case",
282
  },
283
  # === CABLES / ADAPTERS QUERIES ===
284
+ # NOTE: Products verified via review text (Session 78 ground truth audit)
285
  {
286
  "query": "usb c charging cable",
287
  "relevant_items": {
288
+ "B07QDNWC7X": 3.0, # USB-C cable (verified: "heavy-duty USB-C cable")
289
+ "B08DK8BP1J": 2.0, # USB-C cable (verified: "USB2 cables")
290
+ "B0BGS23YKX": 2.0, # USB-C cable (verified: "braided cords")
291
  },
292
  "category": "cables_adapters",
293
  "intent": "general",
 
295
  {
296
  "query": "hdmi cable for tv",
297
  "relevant_items": {
298
+ "B085YG25ZS": 3.0, # HDMI cable (verified: "8K HDMI Cable")
299
+ "B075KP9XHS": 2.0, # HDMI cable (verified: "quality of the cable")
300
+ "B06XGHMDJ1": 2.0, # HDMI cable (verified: "picture quality on TV")
301
  },
302
  "category": "cables_adapters",
303
  "intent": "use_case",
 
305
  {
306
  "query": "fast phone charger",
307
  "relevant_items": {
308
+ "B0C3KV52B7": 3.0, # Charger (verified: "Charges quickly")
309
+ "B01LWQ933G": 2.0, # Charger (verified: "many devices plugged-in")
310
+ "B0B96VM41C": 2.0, # Charger (verified: "power cable for iPad")
311
  },
312
  "category": "cables_adapters",
313
  "intent": "feature_specific",
 
315
  {
316
  "query": "durable charging cable that lasts",
317
  "relevant_items": {
318
+ "B00SUX3XQC": 3.0, # Cable (verified: "Nicely made cable")
319
+ "B0C1FRBK4K": 2.0, # Cable (verified: "Anker brand")
320
+ "B09CTFY414": 2.0, # Cable (verified: "very nice" charging cable)
321
  },
322
  "category": "cables_adapters",
323
  "intent": "feature_specific",
324
  },
325
  # === KEYBOARD / MOUSE QUERIES ===
326
+ # NOTE: Products verified via review text (Session 78 ground truth audit)
327
  {
328
  "query": "wireless keyboard for computer",
329
  "relevant_items": {
330
+ "B001IOHAVO": 3.0, # Keyboard (verified: keyboard reviews)
331
+ "B003VAGXZC": 2.0, # Keyboard (verified: "keyboard combo")
332
+ "B00QRPPCMI": 2.0, # Keyboard (verified: "keyboard and mouse")
333
  },
334
  "category": "keyboards_mice",
335
  "intent": "general",
 
337
  {
338
  "query": "quiet keyboard for office",
339
  "relevant_items": {
340
+ "B0BK3LYMR2": 3.0, # Keyboard (verified: "super quiet")
341
+ "B004MF11MU": 2.0, # Keyboard (verified: keyboard reviews)
342
+ "B095JX15XF": 2.0, # Keyboard (verified: "keyboards that are used")
343
  },
344
  "category": "keyboards_mice",
345
  "intent": "use_case",
 
347
  {
348
  "query": "compact keyboard for small desk",
349
  "relevant_items": {
350
+ "B07J21Y82K": 3.0, # Keyboard (verified: "Great little keyboard")
351
+ "B00MUTWLW4": 2.0, # Keyboard (verified: keyboard slot)
352
+ "B0BXYXD1Q2": 2.0, # Keyboard (verified: "keyboard works well")
353
  },
354
  "category": "keyboards_mice",
355
  "intent": "feature_specific",