vxa8502 committed on
Commit
66926c8
·
1 Parent(s): f9c51d8

Replace EDA with production Qdrant queries

Browse files
Makefile CHANGED
@@ -67,14 +67,12 @@ data-validate:
67
  assert emb is not None and emb.shape[1] == 384, 'Embedding dimension mismatch'; \
68
  print('Validation passed')"
69
 
70
- # Exploratory data analysis (generates figures + report)
71
- eda:
72
- @echo "=== EDA ANALYSIS ==="
73
  @mkdir -p data/figures
74
  @mkdir -p reports
75
  python scripts/eda.py
76
- @echo "Figures saved to data/figures/"
77
- @echo "Report generated: reports/eda_report.md"
78
 
79
  # ---------------------------------------------------------------------------
80
  # Evaluation Suite
@@ -82,16 +80,11 @@ eda:
82
 
83
  # Standard evaluation: primary metrics, spot-checks, explanation tests, faithfulness
84
  eval: check-env
85
- @test -d data/splits || (echo "ERROR: Run 'make data' first" && exit 1)
86
  @echo "=== EVALUATION SUITE ===" && \
87
  echo "" && \
88
- echo "--- Building evaluation datasets ---" && \
89
- python scripts/build_eval_dataset.py && \
90
  python scripts/build_natural_eval_dataset.py && \
91
  echo "" && \
92
- echo "--- Recommendation evaluation (LOO history) ---" && \
93
- python scripts/evaluation.py --dataset eval_loo_history.json --section primary && \
94
- echo "" && \
95
  echo "--- Recommendation evaluation (natural queries) ---" && \
96
  python scripts/evaluation.py --dataset eval_natural_queries.json --section primary && \
97
  echo "" && \
@@ -114,9 +107,6 @@ eval-deep: check-env
114
  @test -d data/eval || (echo "ERROR: Run 'make eval' first to build eval datasets" && exit 1)
115
  @echo "=== DEEP EVALUATION (ablations + baselines) ===" && \
116
  echo "" && \
117
- echo "--- Full recommendation evaluation (LOO history) ---" && \
118
- python scripts/evaluation.py --dataset eval_loo_history.json --section all --baselines && \
119
- echo "" && \
120
  echo "--- Full recommendation evaluation (natural queries) ---" && \
121
  python scripts/evaluation.py --dataset eval_natural_queries.json --section all && \
122
  echo "" && \
@@ -131,11 +121,9 @@ eval-deep: check-env
131
 
132
  # Quick eval: skip RAGAS (faster iteration)
133
  eval-quick: check-env
134
- @test -d data/splits || (echo "ERROR: Run 'make data' first" && exit 1)
135
  @echo "=== QUICK EVALUATION (no RAGAS) ==="
136
- python scripts/build_eval_dataset.py && \
137
  python scripts/build_natural_eval_dataset.py && \
138
- python scripts/evaluation.py --dataset eval_loo_history.json --section primary && \
139
  python scripts/faithfulness.py --samples 5
140
  @echo "Quick eval complete"
141
 
@@ -248,10 +236,10 @@ metrics-snapshot:
248
  @python -c "\
249
  import json; from pathlib import Path; \
250
  r = Path('data/eval_results'); \
251
- loo = json.load(open(r/'eval_loo_history_latest.json', encoding='utf-8')) if (r/'eval_loo_history_latest.json').exists() else {}; \
252
  faith = json.load(open(r/'faithfulness_latest.json', encoding='utf-8')) if (r/'faithfulness_latest.json').exists() else {}; \
253
  human = json.load(open(r/'human_eval_latest.json', encoding='utf-8')) if (r/'human_eval_latest.json').exists() else {}; \
254
- pm = loo.get('primary_metrics', {}); mm = faith.get('multi_metric', {}); \
255
  print('=== SAGE METRICS ==='); \
256
  print(f'NDCG@10: {pm.get(\"ndcg_at_10\", \"n/a\")}'); \
257
  print(f'Claim HHEM: {mm.get(\"claim_level_avg_score\", \"n/a\")}'); \
 
67
  assert emb is not None and emb.shape[1] == 384, 'Embedding dimension mismatch'; \
68
  print('Validation passed')"
69
 
70
+ # Exploratory data analysis (queries production Qdrant)
71
+ eda: check-env
72
+ @echo "=== PRODUCTION EDA ==="
73
  @mkdir -p data/figures
74
  @mkdir -p reports
75
  python scripts/eda.py
 
 
76
 
77
  # ---------------------------------------------------------------------------
78
  # Evaluation Suite
 
80
 
81
  # Standard evaluation: primary metrics, spot-checks, explanation tests, faithfulness
82
  eval: check-env
 
83
  @echo "=== EVALUATION SUITE ===" && \
84
  echo "" && \
85
+ echo "--- Building natural query evaluation dataset ---" && \
 
86
  python scripts/build_natural_eval_dataset.py && \
87
  echo "" && \
 
 
 
88
  echo "--- Recommendation evaluation (natural queries) ---" && \
89
  python scripts/evaluation.py --dataset eval_natural_queries.json --section primary && \
90
  echo "" && \
 
107
  @test -d data/eval || (echo "ERROR: Run 'make eval' first to build eval datasets" && exit 1)
108
  @echo "=== DEEP EVALUATION (ablations + baselines) ===" && \
109
  echo "" && \
 
 
 
110
  echo "--- Full recommendation evaluation (natural queries) ---" && \
111
  python scripts/evaluation.py --dataset eval_natural_queries.json --section all && \
112
  echo "" && \
 
121
 
122
  # Quick eval: skip RAGAS (faster iteration)
123
  eval-quick: check-env
 
124
  @echo "=== QUICK EVALUATION (no RAGAS) ==="
 
125
  python scripts/build_natural_eval_dataset.py && \
126
+ python scripts/evaluation.py --dataset eval_natural_queries.json --section primary && \
127
  python scripts/faithfulness.py --samples 5
128
  @echo "Quick eval complete"
129
 
 
236
  @python -c "\
237
  import json; from pathlib import Path; \
238
  r = Path('data/eval_results'); \
239
+ nq = json.load(open(r/'eval_natural_queries_latest.json', encoding='utf-8')) if (r/'eval_natural_queries_latest.json').exists() else {}; \
240
  faith = json.load(open(r/'faithfulness_latest.json', encoding='utf-8')) if (r/'faithfulness_latest.json').exists() else {}; \
241
  human = json.load(open(r/'human_eval_latest.json', encoding='utf-8')) if (r/'human_eval_latest.json').exists() else {}; \
242
+ pm = nq.get('primary_metrics', {}); mm = faith.get('multi_metric', {}); \
243
  print('=== SAGE METRICS ==='); \
244
  print(f'NDCG@10: {pm.get(\"ndcg_at_10\", \"n/a\")}'); \
245
  print(f'Claim HHEM: {mm.get(\"claim_level_avg_score\", \"n/a\")}'); \
reports/eda_report.md ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Exploratory Data Analysis: Production Data
2
+
3
+ **Source:** Qdrant Cloud (Collection: `sage_reviews`)
4
+ **Status:** green
5
+ **Generated from live production data**
6
+
7
+ ---
8
+
9
+ ## Dataset Overview
10
+
11
+ This report analyzes the actual data deployed in production, ensuring all statistics match what the recommendation system uses.
12
+
13
+ | Metric | Value |
14
+ |--------|-------|
15
+ | Total Chunks | 423,165 |
16
+ | Unique Reviews | 334,282 |
17
+ | Unique Products | 21,827 |
18
+ | Expansion Ratio | 1.27x |
19
+
20
+ ---
21
+
22
+ ## Rating Distribution
23
+
24
+ Amazon reviews exhibit a characteristic J-shaped distribution, heavily skewed toward 5-star ratings.
25
+
26
+ ![Rating Distribution](../data/figures/rating_distribution.png)
27
+
28
+ | Rating | Count | Percentage |
29
+ |--------|-------|------------|
30
+ | 1 | 31,924 | 7.5% |
31
+ | 2 | 21,301 | 5.0% |
32
+ | 3 | 34,078 | 8.1% |
33
+ | 4 | 71,153 | 16.8% |
34
+ | 5 | 264,709 | 62.6% |
35
+
36
+ **Key Observations:**
37
+ - 5-star ratings: 62.6% of chunks
38
+ - 1-star ratings: 7.5% of chunks
39
+ - This polarization is typical for e-commerce review data
40
+
41
+ ---
42
+
43
+ ## Chunk Length Analysis
44
+
45
+ Chunk lengths affect retrieval quality and context window usage.
46
+
47
+ ![Chunk Lengths](../data/figures/chunk_lengths.png)
48
+
49
+ **Statistics:**
50
+ - Median chunk length: 169 characters (~42 tokens)
51
+ - Mean chunk length: 258 characters
52
+ - Most chunks fit comfortably within embedding model context
53
+
54
+ ---
55
+
56
+ ## Chunking Distribution
57
+
58
+ Reviews are chunked based on length: short reviews stay whole, longer reviews are split semantically.
59
+
60
+ ![Chunks per Review](../data/figures/chunks_per_review.png)
61
+
62
+ | Metric | Value |
63
+ |--------|-------|
64
+ | Single-chunk reviews | 303,550 |
65
+ | Multi-chunk reviews | 30,732 |
66
+ | Expansion ratio | 1.27x |
67
+
68
+ **Chunking Strategy:**
69
+ - Reviews < 200 tokens: No chunking (embedded whole)
70
+ - Reviews 200-500 tokens: Semantic chunking
71
+ - Reviews > 500 tokens: Semantic + sliding window
72
+
73
+ ---
74
+
75
+ ## Temporal Distribution
76
+
77
+ Review timestamps enable chronological analysis and temporal evaluation splits.
78
+
79
+ ![Temporal Distribution](../data/figures/temporal_distribution.png)
80
+
81
+ ---
82
+
83
+ ## Data Quality
84
+
85
+ The production dataset has been through 5-core filtering (users and items with 5+ interactions) and quality checks:
86
+
87
+ - All chunks have valid text content
88
+ - All ratings are in [1, 5] range
89
+ - All product identifiers present
90
+ - Deterministic chunk IDs (MD5 hash of review_id + chunk_index)
91
+
92
+ ---
93
+
94
+ ## Summary
95
+
96
+ This production EDA confirms the deployed data characteristics:
97
+
98
+ 1. **Scale:** 423,165 chunks across 21,827 products
99
+ 2. **Quality:** 5-core filtered, validated payloads
100
+ 3. **Distribution:** J-shaped ratings, typical e-commerce pattern
101
+ 4. **Chunking:** 1.27x expansion from reviews to chunks
102
+
103
+ The data matches what the recommendation API queries in real-time.
104
+
105
+ ---
106
+
107
+ *Report generated from Qdrant Cloud. Run `make eda` to regenerate.*
sage/services/__init__.py CHANGED
@@ -31,6 +31,21 @@ from sage.services.cold_start import (
31
  recommend_cold_start_user,
32
  )
33
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  # Evaluation and faithfulness services are loaded lazily to avoid
35
  # pulling in ragas/langchain when only retrieval is needed.
36
  # Import from sage.services.evaluation or sage.services.faithfulness directly.
@@ -75,6 +90,7 @@ __all__ = [
75
  # Explanation
76
  "Explainer",
77
  "explain_recommendations",
 
78
  # Cold-start
79
  "ColdStartService",
80
  "recommend_cold_start_user",
 
31
  recommend_cold_start_user,
32
  )
33
 
34
+
35
+ def get_explanation_services():
36
+ """Initialize Explainer and HallucinationDetector.
37
+
38
+ Centralizes the common pattern of creating both services together.
39
+ Import is deferred to avoid loading heavy models until needed.
40
+
41
+ Returns:
42
+ Tuple of (Explainer, HallucinationDetector) instances.
43
+ """
44
+ from sage.adapters.hhem import HallucinationDetector
45
+
46
+ return Explainer(), HallucinationDetector()
47
+
48
+
49
  # Evaluation and faithfulness services are loaded lazily to avoid
50
  # pulling in ragas/langchain when only retrieval is needed.
51
  # Import from sage.services.evaluation or sage.services.faithfulness directly.
 
90
  # Explanation
91
  "Explainer",
92
  "explain_recommendations",
93
+ "get_explanation_services",
94
  # Cold-start
95
  "ColdStartService",
96
  "recommend_cold_start_user",
scripts/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # Scripts package marker for relative imports.
scripts/build_eval_dataset.py DELETED
@@ -1,660 +0,0 @@
1
- """
2
- Build evaluation dataset from test split using leave-one-out protocol.
3
-
4
- For each user with 2+ reviews in the test set:
5
- 1. Hold out their most recent review (the "target" item)
6
- 2. Generate a query from:
7
- - Keywords extracted from held-out review (simulates search)
8
- - OR user's historical reviews (profile-based)
9
- 3. Create EvalCase with target item as relevant
10
-
11
- Run from project root:
12
- python scripts/build_eval_dataset.py
13
- """
14
-
15
- import re
16
- import json
17
- from collections import Counter
18
- from pathlib import Path
19
-
20
- import pandas as pd
21
- import numpy as np
22
-
23
- from sage.core import EvalCase
24
- from sage.config import DATA_DIR, get_logger, log_banner, log_section
25
- from sage.services.evaluation import rating_to_relevance
26
-
27
- logger = get_logger(__name__)
28
-
29
- EVAL_DIR = DATA_DIR / "eval"
30
-
31
-
32
- # ---------------------------------------------------------------------------
33
- # Query Generation Strategies
34
- # ---------------------------------------------------------------------------
35
-
36
- # Common stopwords to filter out
37
- STOPWORDS = {
38
- "i",
39
- "me",
40
- "my",
41
- "myself",
42
- "we",
43
- "our",
44
- "ours",
45
- "ourselves",
46
- "you",
47
- "your",
48
- "yours",
49
- "yourself",
50
- "yourselves",
51
- "he",
52
- "him",
53
- "his",
54
- "himself",
55
- "she",
56
- "her",
57
- "hers",
58
- "herself",
59
- "it",
60
- "its",
61
- "itself",
62
- "they",
63
- "them",
64
- "their",
65
- "theirs",
66
- "themselves",
67
- "what",
68
- "which",
69
- "who",
70
- "whom",
71
- "this",
72
- "that",
73
- "these",
74
- "those",
75
- "am",
76
- "is",
77
- "are",
78
- "was",
79
- "were",
80
- "be",
81
- "been",
82
- "being",
83
- "have",
84
- "has",
85
- "had",
86
- "having",
87
- "do",
88
- "does",
89
- "did",
90
- "doing",
91
- "a",
92
- "an",
93
- "the",
94
- "and",
95
- "but",
96
- "if",
97
- "or",
98
- "because",
99
- "as",
100
- "until",
101
- "while",
102
- "of",
103
- "at",
104
- "by",
105
- "for",
106
- "with",
107
- "about",
108
- "against",
109
- "between",
110
- "into",
111
- "through",
112
- "during",
113
- "before",
114
- "after",
115
- "above",
116
- "below",
117
- "to",
118
- "from",
119
- "up",
120
- "down",
121
- "in",
122
- "out",
123
- "on",
124
- "off",
125
- "over",
126
- "under",
127
- "again",
128
- "further",
129
- "then",
130
- "once",
131
- "here",
132
- "there",
133
- "when",
134
- "where",
135
- "why",
136
- "how",
137
- "all",
138
- "each",
139
- "few",
140
- "more",
141
- "most",
142
- "other",
143
- "some",
144
- "such",
145
- "no",
146
- "nor",
147
- "not",
148
- "only",
149
- "own",
150
- "same",
151
- "so",
152
- "than",
153
- "too",
154
- "very",
155
- "s",
156
- "t",
157
- "can",
158
- "will",
159
- "just",
160
- "don",
161
- "should",
162
- "now",
163
- "d",
164
- "ll",
165
- "m",
166
- "o",
167
- "re",
168
- "ve",
169
- "y",
170
- "ain",
171
- "aren",
172
- "couldn",
173
- "didn",
174
- "doesn",
175
- "hadn",
176
- "hasn",
177
- "haven",
178
- "isn",
179
- "ma",
180
- "mightn",
181
- "mustn",
182
- "needn",
183
- "shan",
184
- "shouldn",
185
- "wasn",
186
- "weren",
187
- "won",
188
- "wouldn",
189
- "also",
190
- "would",
191
- "could",
192
- "get",
193
- "got",
194
- "one",
195
- "two",
196
- "really",
197
- "like",
198
- "just",
199
- "even",
200
- "well",
201
- "much",
202
- "still",
203
- "back",
204
- "way",
205
- "thing",
206
- "things",
207
- "make",
208
- "made",
209
- "work",
210
- "works",
211
- "worked",
212
- "use",
213
- "used",
214
- "using",
215
- "good",
216
- "great",
217
- "nice",
218
- "product",
219
- "item",
220
- "bought",
221
- "buy",
222
- "amazon",
223
- "review",
224
- "ordered",
225
- "order",
226
- "received",
227
- "came",
228
- "arrived",
229
- "shipping",
230
- "shipped",
231
- }
232
-
233
-
234
- def extract_keywords(text: str, max_keywords: int = 8) -> list[str]:
235
- """
236
- Extract keywords from review text using simple frequency analysis.
237
-
238
- Focuses on nouns and adjectives that describe product attributes.
239
-
240
- Args:
241
- text: Review text.
242
- max_keywords: Maximum keywords to extract.
243
-
244
- Returns:
245
- List of keyword strings.
246
- """
247
- # Clean text
248
- text = text.lower()
249
- text = re.sub(r"<br\s*/?>", " ", text) # Remove HTML breaks
250
- text = re.sub(r"[^a-z\s]", " ", text) # Keep only letters
251
- text = re.sub(r"\s+", " ", text).strip()
252
-
253
- # Tokenize and filter
254
- words = text.split()
255
- words = [w for w in words if len(w) > 2 and w not in STOPWORDS]
256
-
257
- # Count frequencies
258
- counts = Counter(words)
259
-
260
- # Get top keywords
261
- keywords = [word for word, _ in counts.most_common(max_keywords)]
262
-
263
- return keywords
264
-
265
-
266
- def generate_query_from_review(
267
- title: str,
268
- text: str,
269
- max_words: int = 10,
270
- ) -> str:
271
- """
272
- Generate a search query from a review's title and text.
273
-
274
- Combines title keywords with text keywords to create a realistic
275
- query that a user might type to find this product.
276
-
277
- Args:
278
- title: Review title.
279
- text: Review text.
280
- max_words: Maximum words in generated query.
281
-
282
- Returns:
283
- Query string.
284
- """
285
- # Extract from title (usually more specific)
286
- title_keywords = extract_keywords(title or "", max_keywords=4)
287
-
288
- # Extract from text
289
- text_keywords = extract_keywords(text or "", max_keywords=8)
290
-
291
- # Combine, prioritizing title
292
- all_keywords = []
293
- seen = set()
294
-
295
- for kw in title_keywords + text_keywords:
296
- if kw not in seen:
297
- all_keywords.append(kw)
298
- seen.add(kw)
299
-
300
- # Limit length
301
- query_words = all_keywords[:max_words]
302
-
303
- return " ".join(query_words) if query_words else "electronics product"
304
-
305
-
306
- def generate_query_from_history(
307
- reviews: list[dict],
308
- max_words: int = 15,
309
- ) -> str:
310
- """
311
- Generate a query from user's review history (profile-based).
312
-
313
- Concatenates positive review texts and extracts common themes.
314
-
315
- Args:
316
- reviews: List of review dicts with 'text' and 'rating' keys.
317
- max_words: Maximum words in generated query.
318
-
319
- Returns:
320
- Query string.
321
- """
322
- # Filter to positive reviews
323
- positive = [r for r in reviews if r.get("rating", 0) >= 4]
324
- if not positive:
325
- positive = reviews
326
-
327
- # Combine texts
328
- combined_text = " ".join(r.get("text", "")[:500] for r in positive[:5])
329
-
330
- # Extract keywords
331
- keywords = extract_keywords(combined_text, max_keywords=max_words)
332
-
333
- return " ".join(keywords) if keywords else "electronics product"
334
-
335
-
336
- # ---------------------------------------------------------------------------
337
- # Evaluation Dataset Construction
338
- # ---------------------------------------------------------------------------
339
-
340
-
341
- def build_leave_one_out_cases(
342
- df: pd.DataFrame,
343
- min_reviews: int = 2,
344
- query_strategy: str = "keyword",
345
- verbose: bool = True,
346
- ) -> list[EvalCase]:
347
- """
348
- Build evaluation cases using leave-one-out protocol.
349
-
350
- For each user with enough reviews:
351
- 1. Sort reviews by timestamp
352
- 2. Hold out the most recent review as target
353
- 3. Generate query based on strategy
354
- 4. Create EvalCase with graded relevance
355
-
356
- Args:
357
- df: DataFrame with review data.
358
- min_reviews: Minimum reviews per user to include.
359
- query_strategy: "keyword" (from target) or "history" (from past reviews).
360
- verbose: Print progress.
361
-
362
- Returns:
363
- List of EvalCase objects.
364
- """
365
- if verbose:
366
- logger.info("Building eval cases with strategy: %s", query_strategy)
367
- logger.info("Minimum reviews per user: %d", min_reviews)
368
-
369
- # Group by user
370
- user_groups = df.groupby("user_id")
371
-
372
- eval_cases = []
373
- skipped_users = 0
374
-
375
- for user_id, group in user_groups:
376
- if len(group) < min_reviews:
377
- skipped_users += 1
378
- continue
379
-
380
- # Sort by timestamp (ascending)
381
- group = group.sort_values("timestamp")
382
- reviews = group.to_dict("records")
383
-
384
- # Hold out the most recent review
385
- target_review = reviews[-1]
386
- history_reviews = reviews[:-1]
387
-
388
- # Generate query
389
- if query_strategy == "keyword":
390
- query = generate_query_from_review(
391
- title=target_review.get("title", ""),
392
- text=target_review.get("text", ""),
393
- )
394
- elif query_strategy == "history":
395
- query = generate_query_from_history(history_reviews)
396
- else:
397
- raise ValueError(f"Unknown query strategy: {query_strategy}")
398
-
399
- # Build relevance dict
400
- # Target item gets relevance based on rating
401
- target_product = target_review.get("parent_asin")
402
- target_rating = target_review.get("rating", 3)
403
- relevance = rating_to_relevance(target_rating)
404
-
405
- # Only include if target has positive relevance
406
- if relevance > 0:
407
- eval_cases.append(
408
- EvalCase(
409
- query=query,
410
- relevant_items={target_product: relevance},
411
- user_id=user_id,
412
- )
413
- )
414
-
415
- if verbose:
416
- logger.info("Users with enough reviews: %d", len(user_groups) - skipped_users)
417
- logger.info("Eval cases created: %d", len(eval_cases))
418
- logger.info(
419
- "Skipped (low relevance): %d",
420
- len(user_groups) - skipped_users - len(eval_cases),
421
- )
422
-
423
- return eval_cases
424
-
425
-
426
- def build_multi_relevant_cases(
427
- df: pd.DataFrame,
428
- train_df: pd.DataFrame,
429
- min_test_reviews: int = 1,
430
- verbose: bool = True,
431
- ) -> list[EvalCase]:
432
- """
433
- Build cases where ALL user's test reviews are relevant.
434
-
435
- Uses user's training history to generate query, and ALL their
436
- test reviews as relevant items. Better for users with multiple
437
- test items.
438
-
439
- Args:
440
- df: Test split DataFrame.
441
- train_df: Training split DataFrame.
442
- min_test_reviews: Minimum test reviews to include user.
443
- verbose: Print progress.
444
-
445
- Returns:
446
- List of EvalCase objects.
447
- """
448
- if verbose:
449
- logger.info("Building multi-relevant eval cases...")
450
-
451
- # Get users with training history
452
- train_users = set(train_df["user_id"].unique())
453
-
454
- # Group test reviews by user
455
- test_groups = df.groupby("user_id")
456
-
457
- eval_cases = []
458
-
459
- for user_id, group in test_groups:
460
- if len(group) < min_test_reviews:
461
- continue
462
-
463
- # Skip if no training history
464
- if user_id not in train_users:
465
- continue
466
-
467
- # Get training reviews for query generation
468
- user_train = train_df[train_df["user_id"] == user_id]
469
- train_reviews = user_train.to_dict("records")
470
-
471
- if not train_reviews:
472
- continue
473
-
474
- # Generate query from training history
475
- query = generate_query_from_history(train_reviews)
476
-
477
- # All test reviews are relevant
478
- relevant_items = {}
479
- for row in group.to_dict("records"):
480
- product_id = row["parent_asin"]
481
- rating = row["rating"]
482
- relevance = rating_to_relevance(rating)
483
- if relevance > 0:
484
- # Take max relevance if product appears multiple times
485
- relevant_items[product_id] = max(
486
- relevant_items.get(product_id, 0),
487
- relevance,
488
- )
489
-
490
- if relevant_items:
491
- eval_cases.append(
492
- EvalCase(
493
- query=query,
494
- relevant_items=relevant_items,
495
- user_id=user_id,
496
- )
497
- )
498
-
499
- if verbose:
500
- logger.info("Users with train history: %d", len(train_users))
501
- logger.info("Eval cases created: %d", len(eval_cases))
502
- avg_relevant = (
503
- np.mean([len(c.relevant_items) for c in eval_cases]) if eval_cases else 0
504
- )
505
- logger.info("Avg relevant items per case: %.1f", avg_relevant)
506
-
507
- return eval_cases
508
-
509
-
510
- def save_eval_cases(
511
- cases: list[EvalCase],
512
- filename: str,
513
- verbose: bool = True,
514
- ) -> Path:
515
- """
516
- Save evaluation cases to JSON file.
517
-
518
- Args:
519
- cases: List of EvalCase objects.
520
- filename: Output filename (without directory).
521
- verbose: Print confirmation.
522
-
523
- Returns:
524
- Path to saved file.
525
- """
526
- EVAL_DIR.mkdir(exist_ok=True)
527
- filepath = EVAL_DIR / filename
528
-
529
- # Convert to serializable format
530
- data = [
531
- {
532
- "query": c.query,
533
- "relevant_items": c.relevant_items,
534
- "user_id": c.user_id,
535
- }
536
- for c in cases
537
- ]
538
-
539
- with open(filepath, "w", encoding="utf-8") as f:
540
- json.dump(data, f, indent=2)
541
-
542
- if verbose:
543
- logger.info("Saved %d eval cases to: %s", len(cases), filepath)
544
-
545
- return filepath
546
-
547
-
548
- def load_eval_cases(filename: str) -> list[EvalCase]:
549
- """
550
- Load evaluation cases from JSON file.
551
-
552
- Args:
553
- filename: Filename in eval directory.
554
-
555
- Returns:
556
- List of EvalCase objects.
557
- """
558
- filepath = EVAL_DIR / filename
559
-
560
- with open(filepath, encoding="utf-8") as f:
561
- data = json.load(f)
562
-
563
- return [
564
- EvalCase(
565
- query=d["query"],
566
- relevant_items=d["relevant_items"],
567
- user_id=d.get("user_id"),
568
- )
569
- for d in data
570
- ]
571
-
572
-
573
- # ---------------------------------------------------------------------------
574
- # Main
575
- # ---------------------------------------------------------------------------
576
-
577
- if __name__ == "__main__":
578
- from sage.data import load_splits
579
-
580
- log_banner(logger, "BUILD EVALUATION DATASET")
581
-
582
- # Load splits
583
- log_section(logger, "Loading data splits")
584
- train_df, val_df, test_df = load_splits()
585
- logger.info(
586
- "Train: %s | Val: %s | Test: %s",
587
- f"{len(train_df):,}",
588
- f"{len(val_df):,}",
589
- f"{len(test_df):,}",
590
- )
591
-
592
- # Strategy 1: Leave-one-out with keyword queries
593
- # WARNING: This strategy has TARGET LEAKAGE - queries are generated from
594
- # the held-out review itself. Only use as a retrieval sanity check,
595
- # NOT for measuring recommendation quality.
596
- log_section(logger, "Strategy 1: Leave-One-Out (Keyword Queries)")
597
- logger.warning("Target leakage - use for sanity check only!")
598
-
599
- loo_keyword_cases = build_leave_one_out_cases(
600
- test_df,
601
- min_reviews=2,
602
- query_strategy="keyword",
603
- )
604
-
605
- # Show examples
606
- logger.info("Sample queries:")
607
- for case in loo_keyword_cases[:5]:
608
- logger.info(' Query: "%s"', case.query)
609
- logger.info(
610
- " Target: %s (rel=%s)",
611
- list(case.relevant_items.keys())[0],
612
- list(case.relevant_items.values())[0],
613
- )
614
-
615
- save_eval_cases(loo_keyword_cases, "eval_loo_keyword.json")
616
-
617
- # Strategy 2: Leave-one-out with history queries
618
- log_section(logger, "Strategy 2: Leave-One-Out (History Queries)")
619
-
620
- loo_history_cases = build_leave_one_out_cases(
621
- test_df,
622
- min_reviews=2,
623
- query_strategy="history",
624
- )
625
-
626
- # Show examples
627
- logger.info("Sample queries:")
628
- for case in loo_history_cases[:5]:
629
- logger.info(' Query: "%s"', case.query)
630
- logger.info(
631
- " Target: %s (rel=%s)",
632
- list(case.relevant_items.keys())[0],
633
- list(case.relevant_items.values())[0],
634
- )
635
-
636
- save_eval_cases(loo_history_cases, "eval_loo_history.json")
637
-
638
- # Strategy 3: Multi-relevant (all test items)
639
- log_section(logger, "Strategy 3: Multi-Relevant (Train->Test)")
640
-
641
- multi_cases = build_multi_relevant_cases(
642
- test_df,
643
- train_df,
644
- min_test_reviews=1,
645
- )
646
-
647
- if multi_cases:
648
- logger.info("Sample queries:")
649
- for case in multi_cases[:3]:
650
- logger.info(' Query: "%s..."', case.query[:60])
651
- logger.info(" Relevant: %d items", len(case.relevant_items))
652
-
653
- save_eval_cases(multi_cases, "eval_multi_relevant.json")
654
-
655
- # Summary
656
- log_banner(logger, "EVALUATION DATASETS CREATED")
657
- logger.info(" eval_loo_keyword.json: %d cases", len(loo_keyword_cases))
658
- logger.info(" eval_loo_history.json: %d cases", len(loo_history_cases))
659
- logger.info(" eval_multi_relevant.json: %d cases", len(multi_cases))
660
- logger.info(" Location: %s", EVAL_DIR)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scripts/demo.py CHANGED
@@ -44,7 +44,7 @@ def demo_recommendation(query: str, top_k: int = 3, max_evidence: int = 3):
44
  return None
45
 
46
  # Initialize services
47
- from scripts.lib.services import get_explanation_services
48
 
49
  explainer, detector = get_explanation_services()
50
 
 
44
  return None
45
 
46
  # Initialize services
47
+ from sage.services import get_explanation_services
48
 
49
  explainer, detector = get_explanation_services()
50
 
scripts/e2e_success_rate.py CHANGED
@@ -104,7 +104,7 @@ class E2EReport:
104
 
105
  def run_e2e_evaluation(n_samples: int = 20) -> E2EReport:
106
  """Run end-to-end success rate evaluation."""
107
- from scripts.lib.services import get_explanation_services
108
  from sage.services.faithfulness import (
109
  is_refusal,
110
  is_mismatch_warning,
 
104
 
105
  def run_e2e_evaluation(n_samples: int = 20) -> E2EReport:
106
  """Run end-to-end success rate evaluation."""
107
+ from sage.services import get_explanation_services
108
  from sage.services.faithfulness import (
109
  is_refusal,
110
  is_mismatch_warning,
scripts/eda.py CHANGED
@@ -1,21 +1,46 @@
1
- # %% [markdown]
2
- # # Exploratory Data Analysis
3
- #
4
- # Analyze the Amazon Electronics reviews dataset to understand
5
- # data distributions, quality issues, and inform modeling decisions.
 
 
 
 
 
 
 
 
 
6
 
7
- # %% Imports
 
 
 
 
8
  from pathlib import Path
9
 
10
- import pandas as pd
 
 
 
 
 
 
 
 
 
11
  import matplotlib.pyplot as plt
 
12
 
13
- from sage.config import CHARS_PER_TOKEN, DEV_SUBSET_SIZE, DATA_DIR
14
- from sage.data import load_reviews, get_review_stats, prepare_data
15
 
16
- # Output directory for figures
17
  FIGURES_DIR = DATA_DIR / "figures"
18
- FIGURES_DIR.mkdir(exist_ok=True)
 
 
 
19
 
20
  # Plot configuration
21
  plt.style.use("seaborn-v0_8-whitegrid")
@@ -23,7 +48,7 @@ plt.rcParams.update(
23
  {
24
  "figure.figsize": (10, 5),
25
  "figure.dpi": 100,
26
- "savefig.dpi": 300, # High-res for markdown reports
27
  "savefig.bbox": "tight",
28
  "savefig.pad_inches": 0.1,
29
  "font.size": 11,
@@ -33,481 +58,412 @@ plt.rcParams.update(
33
  }
34
  )
35
 
36
- # Enable retina display for Jupyter notebooks
37
- try:
38
- from IPython import get_ipython
39
-
40
- if get_ipython() is not None:
41
- get_ipython().run_line_magic("matplotlib", "inline")
42
- get_ipython().run_line_magic("config", "InlineBackend.figure_format='retina'")
43
- except (ImportError, AttributeError):
44
- pass
45
-
46
  PRIMARY_COLOR = "#05A0D1"
47
  SECONDARY_COLOR = "#FF9900"
48
  FIGURE_SIZE_WIDE = (12, 5)
49
 
50
- # %% Load data
51
- df = load_reviews(subset_size=DEV_SUBSET_SIZE)
52
- print(f"Loaded {len(df):,} reviews")
53
-
54
- # %% Basic statistics
55
- stats = get_review_stats(df)
56
- print("\n=== Dataset Overview ===")
57
- for key, value in stats.items():
58
- if isinstance(value, float):
59
- print(f"{key}: {value:.2f}")
60
- else:
61
- print(f"{key}: {value}")
62
-
63
- # %% Rating distribution
64
- fig, ax = plt.subplots()
65
- rating_counts = pd.Series(stats["rating_dist"])
66
- bars = ax.bar(
67
- rating_counts.index, rating_counts.values, color=PRIMARY_COLOR, edgecolor="black"
68
- )
69
- ax.set_xlabel("Rating")
70
- ax.set_ylabel("Count")
71
- ax.set_title("Rating Distribution")
72
- ax.set_xticks(rating_counts.index)
73
-
74
- for bar, count in zip(bars, rating_counts.values, strict=True):
75
- ax.text(
76
- bar.get_x() + bar.get_width() / 2,
77
- bar.get_height() + 50,
78
- f"{count:,}",
79
- ha="center",
80
- va="bottom",
81
- fontsize=10,
82
- )
83
-
84
- plt.savefig(FIGURES_DIR / "rating_distribution.png")
85
-
86
- print("\nRating breakdown:")
87
- for rating, count in rating_counts.items():
88
- pct = count / len(df) * 100
89
- print(f" {int(rating)} stars: {count:,} ({pct:.1f}%)")
90
-
91
- # %% Review length analysis
92
- df["text_length"] = df["text"].str.len()
93
- df["word_count"] = df["text"].str.split().str.len()
94
- df["estimated_tokens"] = df["text_length"] // CHARS_PER_TOKEN
95
-
96
- fig, axes = plt.subplots(1, 2, figsize=FIGURE_SIZE_WIDE)
97
-
98
- # Character length histogram
99
- ax1 = axes[0]
100
- df["text_length"].clip(upper=2000).hist(
101
- bins=50, ax=ax1, color=PRIMARY_COLOR, edgecolor="white"
102
- )
103
- ax1.set_xlabel("Character Length (clipped at 2000)")
104
- ax1.set_ylabel("Count")
105
- ax1.set_title("Review Length Distribution")
106
- ax1.axvline(
107
- df["text_length"].median(),
108
- color="red",
109
- linestyle="--",
110
- label=f"Median: {df['text_length'].median():.0f}",
111
- )
112
- ax1.legend()
113
-
114
- # Token estimate histogram
115
- ax2 = axes[1]
116
- df["estimated_tokens"].clip(upper=500).hist(
117
- bins=50, ax=ax2, color=SECONDARY_COLOR, edgecolor="white"
118
- )
119
- ax2.set_xlabel("Estimated Tokens (clipped at 500)")
120
- ax2.set_ylabel("Count")
121
- ax2.set_title("Estimated Token Distribution")
122
- ax2.axvline(200, color="red", linestyle="--", label="Chunking threshold (200)")
123
- ax2.legend()
124
-
125
- plt.savefig(FIGURES_DIR / "review_lengths.png")
126
-
127
- needs_chunking = (df["estimated_tokens"] > 200).sum()
128
- print("\nReview length stats:")
129
- print(f" Median characters: {df['text_length'].median():.0f}")
130
- print(f" Median tokens (est): {df['estimated_tokens'].median():.0f}")
131
- print(
132
- f" Reviews > 200 tokens: {needs_chunking:,} ({needs_chunking / len(df) * 100:.1f}%)"
133
- )
134
-
135
- # %% Review length by rating
136
- fig, ax = plt.subplots()
137
- length_by_rating = df.groupby("rating")["text_length"].median()
138
- bars = ax.bar(
139
- length_by_rating.index,
140
- length_by_rating.values,
141
- color=PRIMARY_COLOR,
142
- edgecolor="white",
143
- )
144
- ax.set_xlabel("Rating")
145
- ax.set_ylabel("Median Review Length (chars)")
146
- ax.set_title("Review Length by Rating")
147
- ax.set_xticks([1, 2, 3, 4, 5])
148
-
149
- plt.savefig(FIGURES_DIR / "length_by_rating.png")
150
-
151
- print("\nMedian review length by rating:")
152
- for rating, length in length_by_rating.items():
153
- print(f" {int(rating)} stars: {length:.0f} chars")
154
-
155
- # %% Temporal analysis
156
- df["datetime"] = pd.to_datetime(df["timestamp"], unit="ms")
157
- df["year_month"] = df["datetime"].dt.to_period("M")
158
-
159
- reviews_over_time = df.groupby("year_month").size()
160
-
161
- fig, ax = plt.subplots(figsize=FIGURE_SIZE_WIDE)
162
- reviews_over_time.plot(
163
- kind="line", ax=ax, marker="o", markersize=3, linewidth=1, color=PRIMARY_COLOR
164
- )
165
- ax.set_xlabel("Month")
166
- ax.set_ylabel("Number of Reviews")
167
- ax.set_title("Reviews Over Time")
168
- plt.xticks(rotation=45)
169
-
170
- plt.savefig(FIGURES_DIR / "reviews_over_time.png")
171
-
172
- print("\nTemporal range:")
173
- print(f" Earliest: {df['datetime'].min()}")
174
- print(f" Latest: {df['datetime'].max()}")
175
-
176
- # %% Data quality checks
177
- print("\n=== Data Quality Checks ===")
178
-
179
- # Missing values
180
- missing = df.isnull().sum()
181
- print("\nMissing values:")
182
- for col, count in missing.items():
183
- if count > 0:
184
- print(f" {col}: {count:,} ({count / len(df) * 100:.2f}%)")
185
- if missing.sum() == 0:
186
- print(" None!")
187
-
188
- # Empty reviews
189
- empty_reviews = (df["text"].str.strip() == "").sum()
190
- print(f"\nEmpty reviews: {empty_reviews:,}")
191
-
192
- # Very short reviews (< 10 chars)
193
- very_short = (df["text_length"] < 10).sum()
194
- print(f"Very short reviews (<10 chars): {very_short:,}")
195
-
196
- # Duplicate reviews
197
- duplicate_texts = df["text"].duplicated().sum()
198
- print(f"Duplicate review texts: {duplicate_texts:,}")
199
-
200
- # Verified vs unverified
201
- if "verified_purchase" in df.columns:
202
- verified_pct = df["verified_purchase"].mean() * 100
203
- print(f"\nVerified purchases: {verified_pct:.1f}%")
204
-
205
- # %% User and item coverage
206
- user_counts = df["user_id"].value_counts()
207
- item_counts = df["parent_asin"].value_counts()
208
-
209
- fig, axes = plt.subplots(1, 2, figsize=FIGURE_SIZE_WIDE)
210
-
211
- # Reviews per user
212
- ax1 = axes[0]
213
- user_counts.clip(upper=20).value_counts().sort_index().plot(
214
- kind="bar", ax=ax1, color=PRIMARY_COLOR
215
- )
216
- ax1.set_xlabel("Reviews per User")
217
- ax1.set_ylabel("Number of Users")
218
- ax1.set_title("User Activity Distribution")
219
-
220
- # Reviews per item
221
- ax2 = axes[1]
222
- item_counts.clip(upper=20).value_counts().sort_index().plot(
223
- kind="bar", ax=ax2, color=SECONDARY_COLOR
224
- )
225
- ax2.set_xlabel("Reviews per Item")
226
- ax2.set_ylabel("Number of Items")
227
- ax2.set_title("Item Popularity Distribution")
228
-
229
- plt.savefig(FIGURES_DIR / "user_item_distribution.png")
230
-
231
- print("\nUser activity:")
232
- print(
233
- f" Users with 1 review: {(user_counts == 1).sum():,} ({(user_counts == 1).sum() / len(user_counts) * 100:.1f}%)"
234
- )
235
- print(f" Users with 5+ reviews: {(user_counts >= 5).sum():,}")
236
- print(f" Max reviews by one user: {user_counts.max()}")
237
 
238
- print("\nItem popularity:")
239
- print(
240
- f" Items with 1 review: {(item_counts == 1).sum():,} ({(item_counts == 1).sum() / len(item_counts) * 100:.1f}%)"
241
- )
242
- print(f" Items with 5+ reviews: {(item_counts >= 5).sum():,}")
243
- print(f" Max reviews for one item: {item_counts.max()}")
244
-
245
- # %% 5-core eligibility
246
- users_5plus = set(user_counts[user_counts >= 5].index)
247
- items_5plus = set(item_counts[item_counts >= 5].index)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
248
 
249
- eligible_mask = df["user_id"].isin(users_5plus) & df["parent_asin"].isin(items_5plus)
250
- print("\n5-core filtering preview:")
251
- print(
252
- f" Reviews eligible (first pass): {eligible_mask.sum():,} ({eligible_mask.sum() / len(df) * 100:.1f}%)"
253
- )
254
 
255
- # %% Sample reviews across length buckets
256
- print("\n=== Sample Reviews by Length Bucket ===")
257
- print("(Understanding content patterns before chunking)\n")
258
-
259
- length_buckets = [
260
- (0, 50, "Very short (0-50 tokens)"),
261
- (50, 100, "Short (50-100 tokens)"),
262
- (100, 200, "Medium (100-200 tokens)"),
263
- (200, 400, "Long (200-400 tokens)"),
264
- (400, float("inf"), "Very long (400+ tokens)"),
265
- ]
266
-
267
- for min_tok, max_tok, label in length_buckets:
268
- bucket_mask = (df["estimated_tokens"] >= min_tok) & (
269
- df["estimated_tokens"] < max_tok
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
270
  )
271
- bucket_df = df[bucket_mask]
272
-
273
- if len(bucket_df) == 0:
274
- print(f"{label}: No reviews")
275
- continue
276
-
277
- print(
278
- f"{label}: {len(bucket_df):,} reviews ({len(bucket_df) / len(df) * 100:.1f}%)"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
279
  )
280
 
281
- samples = bucket_df.sample(min(3, len(bucket_df)), random_state=42)
282
- for _, row in samples.iterrows():
283
- rating = int(row["rating"])
284
- tokens = row["estimated_tokens"]
285
- text = row["text"][:200] + "..." if len(row["text"]) > 200 else row["text"]
286
- text = text.replace("\n", " ")
287
- print(f" [{rating}*] ({tokens} tok) {text}")
288
- print()
289
-
290
- # %% Prepared data comparison
291
- print("\n=== Prepared Data (what the model sees) ===")
292
- df_prepared = prepare_data(subset_size=DEV_SUBSET_SIZE, verbose=False)
293
- prepared_stats = get_review_stats(df_prepared)
294
-
295
- print(f"Raw reviews: {len(df):,}")
296
- print(
297
- f"Prepared reviews: {len(df_prepared):,} ({len(df_prepared) / len(df) * 100:.1f}% retained)"
298
- )
299
- print(f"Unique users: {prepared_stats['unique_users']:,}")
300
- print(f"Unique items: {prepared_stats['unique_items']:,}")
301
- print(
302
- f"Avg rating: {prepared_stats['avg_rating']:.2f} (raw: {stats['avg_rating']:.2f})"
303
- )
304
 
305
- # %% Summary
306
- print("\n" + "=" * 50)
307
- print("EDA SUMMARY")
308
- print("=" * 50)
309
- print(f"Total reviews: {len(df):,}")
310
- print(f"Unique users: {df['user_id'].nunique():,}")
311
- print(f"Unique items: {df['parent_asin'].nunique():,}")
312
- print(f"Average rating: {df['rating'].mean():.2f}")
313
- print(
314
- f"Reviews needing chunking: {needs_chunking:,} ({needs_chunking / len(df) * 100:.1f}%)"
315
- )
316
- print(f"Data quality issues: {empty_reviews + very_short + duplicate_texts}")
317
- print(f"\nPlots saved to: {FIGURES_DIR}")
318
 
319
- # %% Generate markdown report
320
- REPORTS_DIR = Path("reports")
321
- REPORTS_DIR.mkdir(exist_ok=True)
 
 
 
 
 
322
 
323
- # Compute all stats for report
324
- raw_total = len(df)
325
- prepared_total = len(df_prepared)
326
- unique_users_raw = df["user_id"].nunique()
327
- unique_items_raw = df["parent_asin"].nunique()
328
- unique_users_prepared = prepared_stats["unique_users"]
329
- unique_items_prepared = prepared_stats["unique_items"]
330
- avg_rating_raw = stats["avg_rating"]
331
- avg_rating_prepared = prepared_stats["avg_rating"]
332
- retention_pct = prepared_total / raw_total * 100
333
-
334
- median_chars = df["text_length"].median()
335
- mean_chars = df["text_length"].mean()
336
- median_tokens = df["estimated_tokens"].median()
337
- chunking_pct = needs_chunking / len(df) * 100
338
-
339
- five_star_pct = rating_counts.get(5, 0) / len(df) * 100
340
- one_star_pct = rating_counts.get(1, 0) / len(df) * 100
341
- middle_pct = 100 - five_star_pct - one_star_pct
342
-
343
- users_one_review = (user_counts == 1).sum()
344
- users_one_review_pct = users_one_review / len(user_counts) * 100
345
- users_5plus = (user_counts >= 5).sum()
346
- max_user_reviews = user_counts.max()
347
-
348
- items_one_review = (item_counts == 1).sum()
349
- items_one_review_pct = items_one_review / len(item_counts) * 100
350
- items_5plus = (item_counts >= 5).sum()
351
- max_item_reviews = item_counts.max()
352
-
353
- length_1star = length_by_rating.get(1, 0)
354
- length_2star = length_by_rating.get(2, 0)
355
- length_3star = length_by_rating.get(3, 0)
356
- length_4star = length_by_rating.get(4, 0)
357
- length_5star = length_by_rating.get(5, 0)
358
-
359
- report_content = f"""# Exploratory Data Analysis: Amazon Electronics Reviews
360
-
361
- **Dataset:** McAuley-Lab/Amazon-Reviews-2023 (Electronics category)
362
- **Subset:** {raw_total:,} raw reviews -> {prepared_total:,} after 5-core filtering
363
 
364
  ---
365
 
366
  ## Dataset Overview
367
 
368
- The Amazon Electronics reviews dataset provides rich user feedback data for building recommendation systems. After standard preprocessing and 5-core filtering (requiring users and items to have at least 5 interactions), the dataset exhibits the characteristic sparsity of real-world recommendation scenarios.
369
 
370
- | Metric | Raw | After 5-Core |
371
- |--------|-----|--------------|
372
- | Total Reviews | {raw_total:,} | {prepared_total:,} |
373
- | Unique Users | {unique_users_raw:,} | {unique_users_prepared:,} |
374
- | Unique Items | {unique_items_raw:,} | {unique_items_prepared:,} |
375
- | Avg Rating | {avg_rating_raw:.2f} | {avg_rating_prepared:.2f} |
376
- | Retention | - | {retention_pct:.1f}% |
377
 
378
  ---
379
 
380
  ## Rating Distribution
381
 
382
- Amazon reviews exhibit a well-known J-shaped distribution, heavily skewed toward 5-star ratings. This reflects both genuine satisfaction and selection bias (dissatisfied customers often don't leave reviews).
383
 
384
  ![Rating Distribution](../data/figures/rating_distribution.png)
385
 
 
 
 
 
386
  **Key Observations:**
387
- - 5-star ratings dominate ({five_star_pct:.1f}% of reviews)
388
- - 1-star reviews form the second largest group ({one_star_pct:.1f}%)
389
- - Middle ratings (2-4 stars) are relatively rare ({middle_pct:.1f}% combined)
390
  - This polarization is typical for e-commerce review data
391
 
392
- **Implications for Modeling:**
393
- - Binary classification (positive/negative) may be more robust than regression
394
- - Rating-weighted aggregation should account for the skewed distribution
395
- - Evidence from 4-5 star reviews carries stronger positive signal
396
-
397
  ---
398
 
399
- ## Review Length Analysis
400
-
401
- Review length varies significantly and correlates with the chunking strategy for the RAG pipeline. Most reviews are short enough to embed directly without chunking.
402
 
403
- ![Review Length Distribution](../data/figures/review_lengths.png)
404
 
405
- **Length Statistics:**
406
- - Median: {median_chars:.0f} characters (~{median_tokens:.0f} tokens)
407
- - Mean: {mean_chars:.0f} characters (~{mean_chars / 4:.0f} tokens)
408
- - Reviews exceeding 200 tokens: {chunking_pct:.1f}% (require chunking)
409
 
410
- **Chunking Strategy Validation:**
411
- The tiered chunking approach is well-suited to this distribution:
412
- - **Short (<200 tokens):** No chunking needed - majority of reviews
413
- - **Medium (200-500 tokens):** Semantic chunking at topic boundaries
414
- - **Long (>500 tokens):** Semantic + sliding window fallback
415
 
416
  ---
417
 
418
- ## Review Length by Rating
419
 
420
- Negative reviews tend to be longer than positive ones. Users who are dissatisfied often provide detailed explanations of issues, while satisfied users may simply express approval.
421
 
422
- ![Review Length by Rating](../data/figures/length_by_rating.png)
423
 
424
- **Pattern:**
425
- - 1-star reviews: {length_1star:.0f} chars median
426
- - 2-3 star reviews: {length_2star:.0f}-{length_3star:.0f} chars median (users explain nuance)
427
- - 4-star reviews: {length_4star:.0f} chars median
428
- - 5-star reviews: {length_5star:.0f} chars median
429
 
430
- **Implications:**
431
- - Negative reviews provide richer evidence for issue identification
432
- - Positive reviews may require multiple chunks for substantive explanations
433
- - Rating filters (min_rating=4) naturally bias toward shorter evidence
434
 
435
  ---
436
 
437
  ## Temporal Distribution
438
 
439
- The dataset spans multiple years of reviews, enabling proper temporal train/validation/test splits that prevent data leakage.
440
 
441
- ![Reviews Over Time](../data/figures/reviews_over_time.png)
442
 
443
- **Temporal Split Strategy:**
444
- - **Train (70%):** Oldest reviews - model learns from historical patterns
445
- - **Validation (10%):** Middle period - hyperparameter tuning
446
- - **Test (20%):** Most recent - simulates production deployment
447
 
448
- This chronological ordering ensures the model never sees "future" data during training.
449
 
450
- ---
451
 
452
- ## User and Item Activity
 
 
 
453
 
454
- The long-tail distribution is pronounced: most users write few reviews, and most items receive few reviews. This sparsity is the fundamental challenge recommendation systems address.
455
 
456
- ![User and Item Distribution](../data/figures/user_item_distribution.png)
457
 
458
- **User Activity:**
459
- - Users with only 1 review: {users_one_review_pct:.1f}%
460
- - Users with 5+ reviews: {users_5plus:,}
461
- - Power user max: {max_user_reviews} reviews
462
 
463
- **Item Popularity:**
464
- - Items with only 1 review: {items_one_review_pct:.1f}%
465
- - Items with 5+ reviews: {items_5plus:,}
466
- - Most reviewed item: {max_item_reviews} reviews
467
 
468
- **Cold-Start Implications:**
469
- - Many items have sparse evidence - content-based features are critical
470
- - User cold-start is common - onboarding preferences help
471
- - 5-core filtering ensures minimum evidence density for evaluation
472
 
473
  ---
474
 
475
- ## Data Quality Assessment
 
476
 
477
- The raw dataset contains several quality issues addressed during preprocessing.
 
 
478
 
479
- | Issue | Count | Resolution |
480
- |-------|-------|------------|
481
- | Missing text | 0 | - |
482
- | Empty reviews | {empty_reviews} | Removed |
483
- | Very short (<10 chars) | {very_short:,} | Removed |
484
- | Duplicate texts | {duplicate_texts:,} | Kept (valid re-purchases) |
485
- | Invalid ratings | 0 | - |
486
 
487
- **Post-Cleaning:**
488
- - All reviews have valid text content
489
- - All ratings are in [1, 5] range
490
- - All user/product identifiers present
491
 
492
- ---
493
 
494
- ## Summary
 
 
 
 
 
 
 
 
 
495
 
496
- The Amazon Electronics dataset, after 5-core filtering and cleaning, provides a solid foundation for building and evaluating a RAG-based recommendation system:
 
 
497
 
498
- 1. **Scale:** {prepared_total:,} reviews across {unique_users_prepared:,} users and {unique_items_prepared:,} items
499
- 2. **Sparsity:** {100 - retention_pct:.1f}% filtered - realistic for recommendation evaluation
500
- 3. **Quality:** Clean text, valid ratings, proper identifiers
501
- 4. **Temporal:** Supports chronological train/val/test splits
502
- 5. **Content:** Review lengths suit the tiered chunking strategy
503
 
504
- The J-shaped rating distribution and long-tail user/item activity are characteristic of real e-commerce data, making this an appropriate benchmark for portfolio demonstration.
 
 
505
 
506
- ---
 
 
 
 
 
 
507
 
508
- *Report auto-generated by `scripts/eda.py`. Run `make eda` to regenerate.*
509
- """
510
 
511
- report_path = REPORTS_DIR / "eda_report.md"
512
- report_path.write_text(report_content)
513
- print(f"\nReport generated: {report_path}")
 
1
+ # ruff: noqa: E402
2
+ """
3
+ Production EDA: Analyze data directly from Qdrant Cloud.
4
+
5
+ Queries the production vector store to generate accurate statistics
6
+ and visualizations. This ensures EDA reports match deployed data.
7
+
8
+ Usage:
9
+ python scripts/eda.py
10
+ make eda
11
+
12
+ Requires:
13
+ QDRANT_URL and QDRANT_API_KEY environment variables.
14
+ """
15
 
16
+ from __future__ import annotations
17
+
18
+ import os
19
+ import sys
20
+ from collections import Counter
21
  from pathlib import Path
22
 
23
+ from dotenv import load_dotenv
24
+
25
+ load_dotenv()
26
+
27
+ # Validate environment before imports
28
+ if not os.getenv("QDRANT_URL"):
29
+ print("ERROR: QDRANT_URL not set. Cannot run production EDA.")
30
+ print("Set QDRANT_URL and QDRANT_API_KEY in .env or environment.")
31
+ sys.exit(1)
32
+
33
  import matplotlib.pyplot as plt
34
+ import numpy as np
35
 
36
+ from sage.adapters.vector_store import get_client, get_collection_info
37
+ from sage.config import COLLECTION_NAME, CHARS_PER_TOKEN, DATA_DIR
38
 
 
39
  FIGURES_DIR = DATA_DIR / "figures"
40
+ FIGURES_DIR.mkdir(parents=True, exist_ok=True)
41
+
42
+ REPORTS_DIR = Path("reports")
43
+ REPORTS_DIR.mkdir(exist_ok=True)
44
 
45
  # Plot configuration
46
  plt.style.use("seaborn-v0_8-whitegrid")
 
48
  {
49
  "figure.figsize": (10, 5),
50
  "figure.dpi": 100,
51
+ "savefig.dpi": 300,
52
  "savefig.bbox": "tight",
53
  "savefig.pad_inches": 0.1,
54
  "font.size": 11,
 
58
  }
59
  )
60
 
 
 
 
 
 
 
 
 
 
 
61
  PRIMARY_COLOR = "#05A0D1"
62
  SECONDARY_COLOR = "#FF9900"
63
  FIGURE_SIZE_WIDE = (12, 5)
64
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
 
66
+ def scroll_all_payloads(client, batch_size: int = 1000, limit: int | None = None):
67
+ """
68
+ Scroll through all points in the collection and yield payloads.
69
+
70
+ Args:
71
+ client: Qdrant client.
72
+ batch_size: Points per scroll request.
73
+ limit: Optional max points to retrieve (None = all).
74
+
75
+ Yields:
76
+ Payload dicts from each point.
77
+ """
78
+ offset = None
79
+ total = 0
80
+
81
+ while True:
82
+ results = client.scroll(
83
+ collection_name=COLLECTION_NAME,
84
+ limit=batch_size,
85
+ offset=offset,
86
+ with_payload=True,
87
+ with_vectors=False,
88
+ )
89
+
90
+ points, next_offset = results
91
+
92
+ if not points:
93
+ break
94
+
95
+ for point in points:
96
+ yield point.payload
97
+ total += 1
98
+ if limit and total >= limit:
99
+ return
100
+
101
+ offset = next_offset
102
+ if offset is None:
103
+ break
104
+
105
+
106
+ def compute_stats(client, sample_size: int | None = None) -> dict:
107
+ """
108
+ Compute statistics from production Qdrant data.
109
+
110
+ Args:
111
+ client: Qdrant client.
112
+ sample_size: Optional limit for faster iteration.
113
+
114
+ Returns:
115
+ Dict with computed statistics.
116
+ """
117
+ print("Scanning Qdrant collection...")
118
+
119
+ ratings = []
120
+ text_lengths = []
121
+ timestamps = []
122
+ product_ids = set()
123
+ review_ids = set()
124
+ chunks_per_review = {}
125
+
126
+ for i, payload in enumerate(scroll_all_payloads(client, limit=sample_size)):
127
+ if i % 10000 == 0 and i > 0:
128
+ print(f" Processed {i:,} chunks...")
129
+
130
+ ratings.append(payload.get("rating", 0))
131
+ text_lengths.append(len(payload.get("text", "")))
132
+ timestamps.append(payload.get("timestamp", 0))
133
+ product_ids.add(payload.get("product_id"))
134
+ review_ids.add(payload.get("review_id"))
135
+
136
+ # Track chunks per review
137
+ review_id = payload.get("review_id")
138
+ total_chunks = payload.get("total_chunks", 1)
139
+ if review_id:
140
+ chunks_per_review[review_id] = total_chunks
141
+
142
+ print(f" Scanned {len(ratings):,} total chunks")
143
+
144
+ # Compute distributions
145
+ rating_dist = Counter(ratings)
146
+ chunk_dist = Counter(chunks_per_review.values())
147
+
148
+ # Estimate tokens from text length
149
+ token_lengths = [length // CHARS_PER_TOKEN for length in text_lengths]
150
+
151
+ return {
152
+ "total_chunks": len(ratings),
153
+ "unique_reviews": len(review_ids),
154
+ "unique_products": len(product_ids),
155
+ "ratings": ratings,
156
+ "rating_dist": dict(sorted(rating_dist.items())),
157
+ "text_lengths": text_lengths,
158
+ "token_lengths": token_lengths,
159
+ "timestamps": timestamps,
160
+ "chunks_per_review": list(chunks_per_review.values()),
161
+ "chunk_dist": dict(sorted(chunk_dist.items())),
162
+ }
163
 
 
 
 
 
 
164
 
165
+ def generate_figures(stats: dict) -> None:
166
+ """Generate EDA figures from computed stats."""
167
+
168
+ # 1. Rating distribution
169
+ fig, ax = plt.subplots()
170
+ rating_counts = stats["rating_dist"]
171
+ ratings = list(rating_counts.keys())
172
+ counts = list(rating_counts.values())
173
+
174
+ bars = ax.bar(ratings, counts, color=PRIMARY_COLOR, edgecolor="black")
175
+ ax.set_xlabel("Rating")
176
+ ax.set_ylabel("Chunk Count")
177
+ ax.set_title("Rating Distribution (Production Data)")
178
+ ax.set_xticks(ratings)
179
+
180
+ for bar, count in zip(bars, counts, strict=True):
181
+ ax.text(
182
+ bar.get_x() + bar.get_width() / 2,
183
+ bar.get_height() + max(counts) * 0.01,
184
+ f"{count:,}",
185
+ ha="center",
186
+ va="bottom",
187
+ fontsize=9,
188
+ )
189
+
190
+ plt.savefig(FIGURES_DIR / "rating_distribution.png")
191
+ plt.close()
192
+ print(f" Saved: {FIGURES_DIR / 'rating_distribution.png'}")
193
+
194
+ # 2. Chunk text length distribution
195
+ fig, axes = plt.subplots(1, 2, figsize=FIGURE_SIZE_WIDE)
196
+
197
+ ax1 = axes[0]
198
+ lengths = np.array(stats["text_lengths"])
199
+ ax1.hist(lengths.clip(max=2000), bins=50, color=PRIMARY_COLOR, edgecolor="black")
200
+ ax1.set_xlabel("Characters")
201
+ ax1.set_ylabel("Chunk Count")
202
+ ax1.set_title("Chunk Length Distribution")
203
+ ax1.axvline(
204
+ np.median(lengths),
205
+ color=SECONDARY_COLOR,
206
+ linestyle="--",
207
+ label=f"Median: {np.median(lengths):.0f}",
208
  )
209
+ ax1.legend()
210
+
211
+ ax2 = axes[1]
212
+ tokens = np.array(stats["token_lengths"])
213
+ ax2.hist(tokens.clip(max=500), bins=50, color=SECONDARY_COLOR, edgecolor="black")
214
+ ax2.set_xlabel("Estimated Tokens")
215
+ ax2.set_ylabel("Chunk Count")
216
+ ax2.set_title("Chunk Token Distribution")
217
+ ax2.axvline(
218
+ np.median(tokens),
219
+ color=PRIMARY_COLOR,
220
+ linestyle="--",
221
+ label=f"Median: {np.median(tokens):.0f}",
222
+ )
223
+ ax2.legend()
224
+
225
+ plt.savefig(FIGURES_DIR / "chunk_lengths.png")
226
+ plt.close()
227
+ print(f" Saved: {FIGURES_DIR / 'chunk_lengths.png'}")
228
+
229
+ # 3. Chunks per review distribution
230
+ fig, ax = plt.subplots()
231
+ chunk_counts = stats["chunk_dist"]
232
+ x = list(chunk_counts.keys())
233
+ y = list(chunk_counts.values())
234
+
235
+ ax.bar(x, y, color=PRIMARY_COLOR, edgecolor="black")
236
+ ax.set_xlabel("Chunks per Review")
237
+ ax.set_ylabel("Number of Reviews")
238
+ ax.set_title("Review Chunking Distribution")
239
+
240
+ plt.savefig(FIGURES_DIR / "chunks_per_review.png")
241
+ plt.close()
242
+ print(f" Saved: {FIGURES_DIR / 'chunks_per_review.png'}")
243
+
244
+ # 4. Temporal distribution (if timestamps exist)
245
+ timestamps = [t for t in stats["timestamps"] if t and t > 0]
246
+ if timestamps:
247
+ from datetime import datetime
248
+
249
+ fig, ax = plt.subplots()
250
+
251
+ # Convert to dates and count by month
252
+ dates = [datetime.fromtimestamp(t / 1000) for t in timestamps]
253
+ months = [d.strftime("%Y-%m") for d in dates]
254
+ month_counts = Counter(months)
255
+ sorted_months = sorted(month_counts.items())
256
+
257
+ if len(sorted_months) > 24:
258
+ # Show only last 24 months if too many
259
+ sorted_months = sorted_months[-24:]
260
+
261
+ x = [m[0] for m in sorted_months]
262
+ y = [m[1] for m in sorted_months]
263
+
264
+ ax.bar(range(len(x)), y, color=PRIMARY_COLOR)
265
+ ax.set_xlabel("Month")
266
+ ax.set_ylabel("Chunk Count")
267
+ ax.set_title("Temporal Distribution")
268
+ ax.set_xticks(range(0, len(x), max(1, len(x) // 6)))
269
+ ax.set_xticklabels(
270
+ [x[i] for i in range(0, len(x), max(1, len(x) // 6))], rotation=45
271
+ )
272
+
273
+ plt.savefig(FIGURES_DIR / "temporal_distribution.png")
274
+ plt.close()
275
+ print(f" Saved: {FIGURES_DIR / 'temporal_distribution.png'}")
276
+
277
+
278
+ def generate_report(stats: dict, collection_info: dict) -> None:
279
+ """Generate markdown EDA report."""
280
+
281
+ total_chunks = stats["total_chunks"]
282
+ unique_reviews = stats["unique_reviews"]
283
+ unique_products = stats["unique_products"]
284
+
285
+ # Rating stats
286
+ rating_dist = stats["rating_dist"]
287
+ total_ratings = sum(rating_dist.values())
288
+ five_star_pct = (
289
+ rating_dist.get(5.0, rating_dist.get(5, 0)) / total_ratings * 100
290
+ if total_ratings
291
+ else 0
292
+ )
293
+ one_star_pct = (
294
+ rating_dist.get(1.0, rating_dist.get(1, 0)) / total_ratings * 100
295
+ if total_ratings
296
+ else 0
297
  )
298
 
299
+ # Length stats
300
+ lengths = stats["text_lengths"]
301
+ tokens = stats["token_lengths"]
302
+ median_chars = int(np.median(lengths)) if lengths else 0
303
+ median_tokens = int(np.median(tokens)) if tokens else 0
304
+ mean_chars = int(np.mean(lengths)) if lengths else 0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
305
 
306
+ # Chunk distribution
307
+ chunk_dist = stats["chunk_dist"]
308
+ single_chunk_reviews = chunk_dist.get(1, 0)
309
+ multi_chunk_reviews = unique_reviews - single_chunk_reviews
310
+ expansion_ratio = total_chunks / unique_reviews if unique_reviews else 0
 
 
 
 
 
 
 
 
311
 
312
+ # Rating breakdown
313
+ rating_lines = []
314
+ for rating in sorted(rating_dist.keys()):
315
+ count = rating_dist[rating]
316
+ pct = count / total_ratings * 100 if total_ratings else 0
317
+ rating_lines.append(f"| {int(rating)} | {count:,} | {pct:.1f}% |")
318
+
319
+ report_content = f"""# Exploratory Data Analysis: Production Data
320
 
321
+ **Source:** Qdrant Cloud (Collection: `{collection_info.get("name", COLLECTION_NAME)}`)
322
+ **Status:** {collection_info.get("status", "unknown")}
323
+ **Generated from live production data**
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
324
 
325
  ---
326
 
327
  ## Dataset Overview
328
 
329
+ This report analyzes the actual data deployed in production, ensuring all statistics match what the recommendation system uses.
330
 
331
+ | Metric | Value |
332
+ |--------|-------|
333
+ | Total Chunks | {total_chunks:,} |
334
+ | Unique Reviews | {unique_reviews:,} |
335
+ | Unique Products | {unique_products:,} |
336
+ | Expansion Ratio | {expansion_ratio:.2f}x |
 
337
 
338
  ---
339
 
340
  ## Rating Distribution
341
 
342
+ Amazon reviews exhibit a characteristic J-shaped distribution, heavily skewed toward 5-star ratings.
343
 
344
  ![Rating Distribution](../data/figures/rating_distribution.png)
345
 
346
+ | Rating | Count | Percentage |
347
+ |--------|-------|------------|
348
+ {chr(10).join(rating_lines)}
349
+
350
  **Key Observations:**
351
+ - 5-star ratings: {five_star_pct:.1f}% of chunks
352
+ - 1-star ratings: {one_star_pct:.1f}% of chunks
 
353
  - This polarization is typical for e-commerce review data
354
 
 
 
 
 
 
355
  ---
356
 
357
+ ## Chunk Length Analysis
 
 
358
 
359
+ Chunk lengths affect retrieval quality and context window usage.
360
 
361
+ ![Chunk Lengths](../data/figures/chunk_lengths.png)
 
 
 
362
 
363
+ **Statistics:**
364
+ - Median chunk length: {median_chars:,} characters (~{median_tokens} tokens)
365
+ - Mean chunk length: {mean_chars:,} characters
366
+ - Most chunks fit comfortably within embedding model context
 
367
 
368
  ---
369
 
370
+ ## Chunking Distribution
371
 
372
+ Reviews are chunked based on length: short reviews stay whole, longer reviews are split semantically.
373
 
374
+ ![Chunks per Review](../data/figures/chunks_per_review.png)
375
 
376
+ | Metric | Value |
377
+ |--------|-------|
378
+ | Single-chunk reviews | {single_chunk_reviews:,} |
379
+ | Multi-chunk reviews | {multi_chunk_reviews:,} |
380
+ | Expansion ratio | {expansion_ratio:.2f}x |
381
 
382
+ **Chunking Strategy:**
383
+ - Reviews < 200 tokens: No chunking (embedded whole)
384
+ - Reviews 200-500 tokens: Semantic chunking
385
+ - Reviews > 500 tokens: Semantic + sliding window
386
 
387
  ---
388
 
389
  ## Temporal Distribution
390
 
391
+ Review timestamps enable chronological analysis and temporal evaluation splits.
392
 
393
+ ![Temporal Distribution](../data/figures/temporal_distribution.png)
394
 
395
+ ---
 
 
 
396
 
397
+ ## Data Quality
398
 
399
+ The production dataset has been through 5-core filtering (users and items with 5+ interactions) and quality checks:
400
 
401
+ - All chunks have valid text content
402
+ - All ratings are in [1, 5] range
403
+ - All product identifiers present
404
+ - Deterministic chunk IDs (MD5 hash of review_id + chunk_index)
405
 
406
+ ---
407
 
408
+ ## Summary
409
 
410
+ This production EDA confirms the deployed data characteristics:
 
 
 
411
 
412
+ 1. **Scale:** {total_chunks:,} chunks across {unique_products:,} products
413
+ 2. **Quality:** 5-core filtered, validated payloads
414
+ 3. **Distribution:** J-shaped ratings, typical e-commerce pattern
415
+ 4. **Chunking:** {expansion_ratio:.2f}x expansion from reviews to chunks
416
 
417
+ The data matches what the recommendation API queries in real-time.
 
 
 
418
 
419
  ---
420
 
421
+ *Report generated from Qdrant Cloud. Run `make eda` to regenerate.*
422
+ """
423
 
424
+ report_path = REPORTS_DIR / "eda_report.md"
425
+ report_path.write_text(report_content)
426
+ print(f" Report: {report_path}")
427
 
 
 
 
 
 
 
 
428
 
429
+ def main():
430
+ print("=" * 60)
431
+ print("PRODUCTION EDA: Querying Qdrant Cloud")
432
+ print("=" * 60)
433
 
434
+ client = get_client()
435
 
436
+ # Get collection info
437
+ try:
438
+ info = get_collection_info(client)
439
+ print(f"\nCollection: {info['name']}")
440
+ print(f"Points: {info['points_count']:,}")
441
+ print(f"Status: {info['status']}")
442
+ except Exception as e:
443
+ print(f"ERROR: Cannot access collection: {e}")
444
+ print("Ensure QDRANT_URL and QDRANT_API_KEY are correct.")
445
+ sys.exit(1)
446
 
447
+ # Compute stats
448
+ print("\n--- Computing Statistics ---")
449
+ stats = compute_stats(client)
450
 
451
+ # Generate figures
452
+ print("\n--- Generating Figures ---")
453
+ generate_figures(stats)
 
 
454
 
455
+ # Generate report
456
+ print("\n--- Generating Report ---")
457
+ generate_report(stats, info)
458
 
459
+ print("\n" + "=" * 60)
460
+ print("EDA COMPLETE")
461
+ print("=" * 60)
462
+ print(f"Figures: {FIGURES_DIR}/")
463
+ print(f"Report: {REPORTS_DIR / 'eda_report.md'}")
464
+
465
+ client.close()
466
 
 
 
467
 
468
+ if __name__ == "__main__":
469
+ main()
 
scripts/evaluation.py CHANGED
@@ -338,27 +338,32 @@ def main():
338
  parser.add_argument(
339
  "--dataset",
340
  "-d",
341
- default="eval_loo_history.json",
342
- help="Evaluation dataset file (default: eval_loo_history.json)",
343
  )
344
  args = parser.parse_args()
345
 
346
  log_banner(logger, "OFFLINE EVALUATION")
347
 
348
- # Load data
349
- logger.info("Loading data...")
350
- train_df, _, test_df = load_splits()
351
- train_records = train_df.to_dict("records")
352
- all_products = list(train_df["parent_asin"].unique())
353
-
354
- item_popularity = compute_item_popularity(train_records, item_key="parent_asin")
355
-
356
  logger.info("Loading product embeddings from Qdrant...")
357
  item_embeddings = load_product_embeddings_from_qdrant()
358
  total_items = len(item_embeddings)
359
-
360
  logger.info("Products in catalog: %d", total_items)
361
 
 
 
 
 
 
 
 
 
 
 
 
 
 
362
  # Load eval cases
363
  logger.info("Loading evaluation dataset: %s", args.dataset)
364
  cases = load_eval_cases(args.dataset)
@@ -398,9 +403,14 @@ def main():
398
  "ndcg_at_10": best_ndcg,
399
  }
400
 
401
- # Baseline comparison
402
  if args.baselines:
403
- run_baseline_comparison(cases, train_records, all_products, item_embeddings)
 
 
 
 
 
404
 
405
  # Save results (uses dataset stem as prefix for both timestamped and latest files)
406
  prefix = Path(args.dataset).stem
 
338
  parser.add_argument(
339
  "--dataset",
340
  "-d",
341
+ default="eval_natural_queries.json",
342
+ help="Evaluation dataset file (default: eval_natural_queries.json)",
343
  )
344
  args = parser.parse_args()
345
 
346
  log_banner(logger, "OFFLINE EVALUATION")
347
 
348
+ # Load product embeddings from Qdrant (always available)
 
 
 
 
 
 
 
349
  logger.info("Loading product embeddings from Qdrant...")
350
  item_embeddings = load_product_embeddings_from_qdrant()
351
  total_items = len(item_embeddings)
 
352
  logger.info("Products in catalog: %d", total_items)
353
 
354
+ # Try to load splits for beyond-accuracy metrics (optional)
355
+ item_popularity = None
356
+ train_records = None
357
+ all_products = None
358
+ try:
359
+ train_df, _, _ = load_splits()
360
+ train_records = train_df.to_dict("records")
361
+ all_products = list(train_df["parent_asin"].unique())
362
+ item_popularity = compute_item_popularity(train_records, item_key="parent_asin")
363
+ logger.info("Loaded splits for beyond-accuracy metrics")
364
+ except FileNotFoundError:
365
+ logger.info("Splits not available - beyond-accuracy metrics will be skipped")
366
+
367
  # Load eval cases
368
  logger.info("Loading evaluation dataset: %s", args.dataset)
369
  cases = load_eval_cases(args.dataset)
 
403
  "ndcg_at_10": best_ndcg,
404
  }
405
 
406
+ # Baseline comparison (requires splits)
407
  if args.baselines:
408
+ if train_records is None:
409
+ logger.warning(
410
+ "Skipping baselines - requires local splits (run 'make splits')"
411
+ )
412
+ else:
413
+ run_baseline_comparison(cases, train_records, all_products, item_embeddings)
414
 
415
  # Save results (uses dataset stem as prefix for both timestamped and latest files)
416
  prefix = Path(args.dataset).stem
scripts/explanation.py CHANGED
@@ -43,7 +43,7 @@ PRODUCTS_PER_QUERY = 2
43
 
44
  def run_basic_tests():
45
  """Test basic explanation generation and HHEM detection."""
46
- from scripts.lib.services import get_explanation_services
47
 
48
  log_banner(logger, "BASIC EXPLANATION TESTS")
49
  logger.info("Using LLM provider: %s", LLM_PROVIDER)
@@ -105,15 +105,18 @@ def run_basic_tests():
105
  logger.info('Query: "%s"', test_query)
106
  logger.info("Streaming: ")
107
 
108
- stream = explainer.generate_explanation_stream(test_query, test_product)
109
- chunks = list(stream)
110
- logger.info("".join(chunks))
 
111
 
112
- streamed_result = stream.get_complete_result()
113
- hhem = detector.check_explanation(
114
- streamed_result.evidence_texts, streamed_result.explanation
115
- )
116
- logger.info("HHEM Score: %.3f", hhem.score)
 
 
117
 
118
  log_banner(logger, "BASIC TESTS COMPLETE")
119
 
@@ -273,17 +276,20 @@ def run_cold_start_tests():
273
  )
274
  from sage.core import UserPreferences
275
  from sage.services.cold_start import preferences_to_query
276
- from sage.data import load_splits
277
 
278
  log_banner(logger, "COLD-START HANDLING TESTS")
279
 
280
- # Load data
281
- logger.info("Loading data...")
282
- train_df, val_df, test_df = load_splits()
283
-
284
- user_counts = train_df.groupby("user_id").size().to_dict()
285
 
286
- logger.info("Training users: %d", len(user_counts))
 
 
 
 
287
 
288
  # Test warmup levels
289
  log_section(logger, "1. WARMUP LEVEL DETECTION")
@@ -347,20 +353,23 @@ def run_cold_start_tests():
347
  for r in recs:
348
  logger.info(" %s: score=%.3f", r.product_id, r.score)
349
 
350
- # Find a warm user
351
- warm_users = [u for u, c in user_counts.items() if c >= 5]
352
- if warm_users:
353
- warm_user = warm_users[0]
354
- user_history = train_df[train_df["user_id"] == warm_user].to_dict("records")
355
-
356
- logger.info("Warm user (%d interactions):", len(user_history))
357
- recs = hybrid_recommend(
358
- query="similar products",
359
- user_history=user_history,
360
- top_k=3,
361
- )
362
- for r in recs:
363
- logger.info(" %s: score=%.3f", r.product_id, r.score)
 
 
 
364
 
365
  log_banner(logger, "COLD-START TESTS COMPLETE")
366
 
 
43
 
44
  def run_basic_tests():
45
  """Test basic explanation generation and HHEM detection."""
46
+ from sage.services import get_explanation_services
47
 
48
  log_banner(logger, "BASIC EXPLANATION TESTS")
49
  logger.info("Using LLM provider: %s", LLM_PROVIDER)
 
105
  logger.info('Query: "%s"', test_query)
106
  logger.info("Streaming: ")
107
 
108
+ try:
109
+ stream = explainer.generate_explanation_stream(test_query, test_product)
110
+ chunks = list(stream)
111
+ logger.info("".join(chunks))
112
 
113
+ streamed_result = stream.get_complete_result()
114
+ hhem = detector.check_explanation(
115
+ streamed_result.evidence_texts, streamed_result.explanation
116
+ )
117
+ logger.info("HHEM Score: %.3f", hhem.score)
118
+ except ValueError as e:
119
+ logger.info("Quality gate refused streaming: %s", e)
120
 
121
  log_banner(logger, "BASIC TESTS COMPLETE")
122
 
 
276
  )
277
  from sage.core import UserPreferences
278
  from sage.services.cold_start import preferences_to_query
 
279
 
280
  log_banner(logger, "COLD-START HANDLING TESTS")
281
 
282
+ # Try to load splits for warm user tests (optional)
283
+ train_df = None
284
+ user_counts = {}
285
+ try:
286
+ from sage.data import load_splits
287
 
288
+ train_df, _, _ = load_splits()
289
+ user_counts = train_df.groupby("user_id").size().to_dict()
290
+ logger.info("Loaded splits: %d training users", len(user_counts))
291
+ except FileNotFoundError:
292
+ logger.info("Splits not available - warm user tests will be skipped")
293
 
294
  # Test warmup levels
295
  log_section(logger, "1. WARMUP LEVEL DETECTION")
 
353
  for r in recs:
354
  logger.info(" %s: score=%.3f", r.product_id, r.score)
355
 
356
+ # Find a warm user (only if splits available)
357
+ if train_df is not None:
358
+ warm_users = [u for u, c in user_counts.items() if c >= 5]
359
+ if warm_users:
360
+ warm_user = warm_users[0]
361
+ user_history = train_df[train_df["user_id"] == warm_user].to_dict("records")
362
+
363
+ logger.info("Warm user (%d interactions):", len(user_history))
364
+ recs = hybrid_recommend(
365
+ query="similar products",
366
+ user_history=user_history,
367
+ top_k=3,
368
+ )
369
+ for r in recs:
370
+ logger.info(" %s: score=%.3f", r.product_id, r.score)
371
+ else:
372
+ logger.info("Skipping warm user test (no splits)")
373
 
374
  log_banner(logger, "COLD-START TESTS COMPLETE")
375
 
scripts/faithfulness.py CHANGED
@@ -51,7 +51,7 @@ TOP_K_PRODUCTS = 3
51
 
52
  def run_evaluation(n_samples: int, run_ragas: bool = False):
53
  """Run faithfulness evaluation on sample queries."""
54
- from scripts.lib.services import get_explanation_services
55
 
56
  queries = EVALUATION_QUERIES[:n_samples]
57
 
@@ -202,7 +202,7 @@ def run_evaluation(n_samples: int, run_ragas: bool = False):
202
 
203
  def run_failure_analysis():
204
  """Analyze failure cases to identify root causes."""
205
- from scripts.lib.services import get_explanation_services
206
 
207
  log_banner(logger, "FAILURE CASE ANALYSIS")
208
 
 
51
 
52
  def run_evaluation(n_samples: int, run_ragas: bool = False):
53
  """Run faithfulness evaluation on sample queries."""
54
+ from sage.services import get_explanation_services
55
 
56
  queries = EVALUATION_QUERIES[:n_samples]
57
 
 
202
 
203
  def run_failure_analysis():
204
  """Analyze failure cases to identify root causes."""
205
+ from sage.services import get_explanation_services
206
 
207
  log_banner(logger, "FAILURE CASE ANALYSIS")
208
 
scripts/human_eval.py CHANGED
@@ -105,7 +105,7 @@ def generate_samples(force: bool = False, seed: int = 42):
105
  import random
106
 
107
  from sage.services.retrieval import get_candidates
108
- from scripts.lib.services import get_explanation_services
109
 
110
  # Protect existing rated samples from accidental overwrite
111
  if SAMPLES_FILE.exists() and not force:
 
105
  import random
106
 
107
  from sage.services.retrieval import get_candidates
108
+ from sage.services import get_explanation_services
109
 
110
  # Protect existing rated samples from accidental overwrite
111
  if SAMPLES_FILE.exists() and not force:
scripts/lib/__init__.py CHANGED
@@ -1,5 +1,6 @@
1
  """Shared utilities for scripts."""
2
 
3
- from scripts.lib.services import get_explanation_services
 
4
 
5
  __all__ = ["get_explanation_services"]
 
1
  """Shared utilities for scripts."""
2
 
3
+ # Re-export from sage.services for backwards compatibility
4
+ from sage.services import get_explanation_services
5
 
6
  __all__ = ["get_explanation_services"]
scripts/summary.py CHANGED
@@ -51,17 +51,6 @@ def main():
51
  print("SAGE PIPELINE RESULTS")
52
  print(SEP)
53
 
54
- # -- Recommendation Quality (LOO History) ---------------------------------
55
- loo = load_json(RESULTS_DIR / "eval_loo_history_latest.json")
56
- print_section("Recommendation Quality (LOO History):")
57
- if loo and "primary_metrics" in loo:
58
- m = loo["primary_metrics"]
59
- print(f" NDCG@10: {fmt(m.get('ndcg_at_10'))}")
60
- print(f" Hit@10: {fmt(m.get('hit_at_10'))}")
61
- print(f" MRR: {fmt(m.get('mrr'))}")
62
- else:
63
- print(" (not available)")
64
-
65
  # -- Recommendation Quality (Natural Queries) -----------------------------
66
  nat = load_json(RESULTS_DIR / "eval_natural_queries_latest.json")
67
  print_section("Recommendation Quality (Natural Queries):")
@@ -69,6 +58,7 @@ def main():
69
  m = nat["primary_metrics"]
70
  print(f" NDCG@10: {fmt(m.get('ndcg_at_10'))}")
71
  print(f" Hit@10: {fmt(m.get('hit_at_10'))}")
 
72
  else:
73
  print(" (not available)")
74
 
 
51
  print("SAGE PIPELINE RESULTS")
52
  print(SEP)
53
 
 
 
 
 
 
 
 
 
 
 
 
54
  # -- Recommendation Quality (Natural Queries) -----------------------------
55
  nat = load_json(RESULTS_DIR / "eval_natural_queries_latest.json")
56
  print_section("Recommendation Quality (Natural Queries):")
 
58
  m = nat["primary_metrics"]
59
  print(f" NDCG@10: {fmt(m.get('ndcg_at_10'))}")
60
  print(f" Hit@10: {fmt(m.get('hit_at_10'))}")
61
+ print(f" MRR: {fmt(m.get('mrr'))}")
62
  else:
63
  print(" (not available)")
64