Fix RAGAS faithfulness scoring (0.64 → 0.82)
Browse files- .dockerignore +1 -0
- Makefile +9 -7
- reports/eda_report.md +150 -0
- sage/adapters/hhem.py +2 -2
- sage/adapters/vector_store.py +10 -3
- sage/api/app.py +1 -2
- sage/api/routes.py +8 -5
- sage/config/__init__.py +7 -1
- sage/config/queries.py +43 -0
- sage/core/__init__.py +2 -2
- sage/core/chunking.py +2 -2
- sage/core/evidence.py +1 -1
- sage/core/models.py +27 -1
- sage/core/verification.py +7 -28
- sage/services/faithfulness.py +69 -4
- scripts/build_eval_dataset.py +2 -2
- scripts/build_natural_eval_dataset.py +1 -1
- scripts/demo.py +9 -2
- scripts/e2e_success_rate.py +4 -30
- scripts/eda.py +11 -21
- scripts/evaluation.py +5 -7
- scripts/explanation.py +1 -1
- scripts/faithfulness.py +2 -18
- scripts/pipeline.py +5 -4
- tests/test_chunking.py +8 -6
.dockerignore
CHANGED
|
@@ -2,6 +2,7 @@
|
|
| 2 |
.env.*
|
| 3 |
!.env.example
|
| 4 |
venv/
|
|
|
|
| 5 |
data/
|
| 6 |
home/
|
| 7 |
scripts/
|
|
|
|
| 2 |
.env.*
|
| 3 |
!.env.example
|
| 4 |
venv/
|
| 5 |
+
.venv/
|
| 6 |
data/
|
| 7 |
home/
|
| 8 |
scripts/
|
Makefile
CHANGED
|
@@ -23,10 +23,10 @@ check-env:
|
|
| 23 |
|
| 24 |
setup:
|
| 25 |
@echo "=== SETUP ==="
|
| 26 |
-
python -m venv venv
|
| 27 |
-
. venv/bin/activate && pip install -e ".[pipeline,api,anthropic,openai]"
|
| 28 |
@echo ""
|
| 29 |
-
@echo "Setup complete. Activate with: source venv/bin/activate"
|
| 30 |
|
| 31 |
# ---------------------------------------------------------------------------
|
| 32 |
# Data Pipeline
|
|
@@ -41,11 +41,13 @@ data: check-env
|
|
| 41 |
@test -f data/splits/train.parquet || (echo "FAIL: train.parquet not created" && exit 1)
|
| 42 |
@echo "Data pipeline complete"
|
| 43 |
|
| 44 |
-
# Exploratory data analysis
|
| 45 |
-
eda:
|
| 46 |
-
@test -d data/splits || (echo "ERROR: Run 'make data' first" && exit 1)
|
| 47 |
@echo "=== EDA ANALYSIS ==="
|
|
|
|
| 48 |
python scripts/eda.py
|
|
|
|
|
|
|
| 49 |
|
| 50 |
# ---------------------------------------------------------------------------
|
| 51 |
# Evaluation Suite
|
|
@@ -262,7 +264,7 @@ help:
|
|
| 262 |
@echo ""
|
| 263 |
@echo "PIPELINE:"
|
| 264 |
@echo " make data Load, chunk, embed, and index reviews"
|
| 265 |
-
@echo " make eda Exploratory data analysis"
|
| 266 |
@echo " make eval Standard evaluation (primary metrics + RAGAS + spot-checks)"
|
| 267 |
@echo " make eval-deep Deep evaluation (all ablations + baselines + calibration)"
|
| 268 |
@echo " make eval-quick Quick eval (skip RAGAS)"
|
|
|
|
| 23 |
|
| 24 |
setup:
|
| 25 |
@echo "=== SETUP ==="
|
| 26 |
+
python -m venv .venv
|
| 27 |
+
. .venv/bin/activate && pip install -e ".[pipeline,api,anthropic,openai]"
|
| 28 |
@echo ""
|
| 29 |
+
@echo "Setup complete. Activate with: source .venv/bin/activate"
|
| 30 |
|
| 31 |
# ---------------------------------------------------------------------------
|
| 32 |
# Data Pipeline
|
|
|
|
| 41 |
@test -f data/splits/train.parquet || (echo "FAIL: train.parquet not created" && exit 1)
|
| 42 |
@echo "Data pipeline complete"
|
| 43 |
|
| 44 |
+
# Exploratory data analysis (generates figures for reports/eda_report.md)
|
| 45 |
+
eda:
|
|
|
|
| 46 |
@echo "=== EDA ANALYSIS ==="
|
| 47 |
+
@mkdir -p data/figures
|
| 48 |
python scripts/eda.py
|
| 49 |
+
@echo "Figures saved to data/figures/"
|
| 50 |
+
@echo "View report: reports/eda_report.md"
|
| 51 |
|
| 52 |
# ---------------------------------------------------------------------------
|
| 53 |
# Evaluation Suite
|
|
|
|
| 264 |
@echo ""
|
| 265 |
@echo "PIPELINE:"
|
| 266 |
@echo " make data Load, chunk, embed, and index reviews"
|
| 267 |
+
@echo " make eda Exploratory data analysis (generates figures)"
|
| 268 |
@echo " make eval Standard evaluation (primary metrics + RAGAS + spot-checks)"
|
| 269 |
@echo " make eval-deep Deep evaluation (all ablations + baselines + calibration)"
|
| 270 |
@echo " make eval-quick Quick eval (skip RAGAS)"
|
reports/eda_report.md
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Exploratory Data Analysis: Amazon Electronics Reviews
|
| 2 |
+
|
| 3 |
+
**Dataset:** McAuley-Lab/Amazon-Reviews-2023 (Electronics category)
|
| 4 |
+
**Subset:** 100,000 raw reviews → 2,635 after 5-core filtering
|
| 5 |
+
|
| 6 |
+
---
|
| 7 |
+
|
| 8 |
+
## Dataset Overview
|
| 9 |
+
|
| 10 |
+
The Amazon Electronics reviews dataset provides rich user feedback data for building recommendation systems. After standard preprocessing and 5-core filtering (requiring users and items to have at least 5 interactions), the dataset exhibits the characteristic sparsity of real-world recommendation scenarios.
|
| 11 |
+
|
| 12 |
+
| Metric | Raw | After 5-Core |
|
| 13 |
+
|--------|-----|--------------|
|
| 14 |
+
| Total Reviews | 100,000 | 2,635 |
|
| 15 |
+
| Unique Users | 15,322 | 334 |
|
| 16 |
+
| Unique Items | 59,429 | 318 |
|
| 17 |
+
| Avg Rating | 4.26 | 4.44 |
|
| 18 |
+
| Retention | — | 2.6% |
|
| 19 |
+
|
| 20 |
+
---
|
| 21 |
+
|
| 22 |
+
## Rating Distribution
|
| 23 |
+
|
| 24 |
+
Amazon reviews exhibit a well-known J-shaped distribution, heavily skewed toward 5-star ratings. This reflects both genuine satisfaction and selection bias (dissatisfied customers often don't leave reviews).
|
| 25 |
+
|
| 26 |
+

|
| 27 |
+
|
| 28 |
+
**Key Observations:**
|
| 29 |
+
- 5-star ratings dominate (65.4% of reviews)
|
| 30 |
+
- 1-star reviews form the second largest group (8.0%)
|
| 31 |
+
- Middle ratings (2-4 stars) are relatively rare (26.6% combined)
|
| 32 |
+
- This polarization is typical for e-commerce review data
|
| 33 |
+
|
| 34 |
+
**Implications for Modeling:**
|
| 35 |
+
- Binary classification (positive/negative) may be more robust than regression
|
| 36 |
+
- Rating-weighted aggregation should account for the skewed distribution
|
| 37 |
+
- Evidence from 4-5 star reviews carries stronger positive signal
|
| 38 |
+
|
| 39 |
+
---
|
| 40 |
+
|
| 41 |
+
## Review Length Analysis
|
| 42 |
+
|
| 43 |
+
Review length varies significantly and correlates with the chunking strategy for the RAG pipeline. Most reviews are short enough to embed directly without chunking.
|
| 44 |
+
|
| 45 |
+

|
| 46 |
+
|
| 47 |
+
**Length Statistics:**
|
| 48 |
+
- Median: 183 characters (~45 tokens)
|
| 49 |
+
- Mean: 369 characters (~92 tokens)
|
| 50 |
+
- Reviews exceeding 200 tokens: 11.2% (require chunking)
|
| 51 |
+
|
| 52 |
+
**Chunking Strategy Validation:**
|
| 53 |
+
The tiered chunking approach is well-suited to this distribution:
|
| 54 |
+
- **Short (<200 tokens):** No chunking needed — majority of reviews
|
| 55 |
+
- **Medium (200-500 tokens):** Semantic chunking at topic boundaries
|
| 56 |
+
- **Long (>500 tokens):** Semantic + sliding window fallback
|
| 57 |
+
|
| 58 |
+
---
|
| 59 |
+
|
| 60 |
+
## Review Length by Rating
|
| 61 |
+
|
| 62 |
+
Review length varies with rating rather than simply increasing for negative reviews: 5-star reviews are the shortest (quick endorsements), while middle ratings (2–4 stars) tend to be the longest, as users explain nuanced or mixed experiences in detail.
|
| 63 |
+
|
| 64 |
+

|
| 65 |
+
|
| 66 |
+
**Pattern:**
|
| 67 |
+
- 1-star reviews: 187 chars median
|
| 68 |
+
- 2-3 star reviews: 258-265 chars median (users explain nuance)
|
| 69 |
+
- 4-star reviews: 297 chars median (longest — detailed positive feedback)
|
| 70 |
+
- 5-star reviews: 152 chars median (shortest — quick endorsements)
|
| 71 |
+
|
| 72 |
+
**Implications:**
|
| 73 |
+
- Negative reviews provide richer evidence for issue identification
|
| 74 |
+
- Positive reviews may require multiple chunks for substantive explanations
|
| 75 |
+
- Rating filters (min_rating=4) naturally bias toward shorter evidence
|
| 76 |
+
|
| 77 |
+
---
|
| 78 |
+
|
| 79 |
+
## Temporal Distribution
|
| 80 |
+
|
| 81 |
+
The dataset spans multiple years of reviews, enabling proper temporal train/validation/test splits that prevent data leakage.
|
| 82 |
+
|
| 83 |
+

|
| 84 |
+
|
| 85 |
+
**Temporal Split Strategy:**
|
| 86 |
+
- **Train (70%):** Oldest reviews — model learns from historical patterns
|
| 87 |
+
- **Validation (10%):** Middle period — hyperparameter tuning
|
| 88 |
+
- **Test (20%):** Most recent — simulates production deployment
|
| 89 |
+
|
| 90 |
+
This chronological ordering ensures the model never sees "future" data during training.
|
| 91 |
+
|
| 92 |
+
---
|
| 93 |
+
|
| 94 |
+
## User and Item Activity
|
| 95 |
+
|
| 96 |
+
The long-tail distribution is pronounced: most users write few reviews, and most items receive few reviews. This sparsity is the fundamental challenge recommendation systems address.
|
| 97 |
+
|
| 98 |
+

|
| 99 |
+
|
| 100 |
+
**User Activity:**
|
| 101 |
+
- Users with only 1 review: 30.1%
|
| 102 |
+
- Users with 5+ reviews: 4,991 (32.6%)
|
| 103 |
+
- Power user max: 820 reviews
|
| 104 |
+
|
| 105 |
+
**Item Popularity:**
|
| 106 |
+
- Items with only 1 review: 76.0%
|
| 107 |
+
- Items with 5+ reviews: 2,434 (4.1%)
|
| 108 |
+
- Most reviewed item: 326 reviews
|
| 109 |
+
|
| 110 |
+
**Cold-Start Implications:**
|
| 111 |
+
- Many items have sparse evidence — content-based features are critical
|
| 112 |
+
- User cold-start is common — onboarding preferences help
|
| 113 |
+
- 5-core filtering ensures minimum evidence density for evaluation
|
| 114 |
+
|
| 115 |
+
---
|
| 116 |
+
|
| 117 |
+
## Data Quality Assessment
|
| 118 |
+
|
| 119 |
+
The raw dataset contains several quality issues addressed during preprocessing.
|
| 120 |
+
|
| 121 |
+
| Issue | Count | Resolution |
|
| 122 |
+
|-------|-------|------------|
|
| 123 |
+
| Missing text | 0 | — |
|
| 124 |
+
| Empty reviews | 21 | Removed |
|
| 125 |
+
| Very short (<10 chars) | 2,512 | Removed |
|
| 126 |
+
| Duplicate texts | 5,219 | Kept (valid re-purchases) |
|
| 127 |
+
| Invalid ratings | 0 | — |
|
| 128 |
+
|
| 129 |
+
**Post-Cleaning:**
|
| 130 |
+
- All reviews have valid text content
|
| 131 |
+
- All ratings are in [1, 5] range
|
| 132 |
+
- All user/product identifiers present
|
| 133 |
+
|
| 134 |
+
---
|
| 135 |
+
|
| 136 |
+
## Summary
|
| 137 |
+
|
| 138 |
+
The Amazon Electronics dataset, after 5-core filtering and cleaning, provides a solid foundation for building and evaluating a RAG-based recommendation system:
|
| 139 |
+
|
| 140 |
+
1. **Scale:** 2,635 reviews across 334 users and 318 items
|
| 141 |
+
2. **Sparsity:** 97.5% — realistic for recommendation evaluation
|
| 142 |
+
3. **Quality:** Clean text, valid ratings, proper identifiers
|
| 143 |
+
4. **Temporal:** Supports chronological train/val/test splits
|
| 144 |
+
5. **Content:** Review lengths suit the tiered chunking strategy
|
| 145 |
+
|
| 146 |
+
The J-shaped rating distribution and long-tail user/item activity are characteristic of real e-commerce data, making this an appropriate benchmark for portfolio demonstration.
|
| 147 |
+
|
| 148 |
+
---
|
| 149 |
+
|
| 150 |
+
*Figures generated by `scripts/eda.py` at 300 DPI. Run `make eda` to regenerate.*
|
sage/adapters/hhem.py
CHANGED
|
@@ -269,7 +269,7 @@ class HallucinationDetector:
|
|
| 269 |
ClaimResult(
|
| 270 |
claim=claim, score=score, is_hallucinated=score < self.threshold
|
| 271 |
)
|
| 272 |
-
for claim, score in zip(claims, scores)
|
| 273 |
]
|
| 274 |
|
| 275 |
def check_batch(
|
|
@@ -293,7 +293,7 @@ class HallucinationDetector:
|
|
| 293 |
|
| 294 |
return [
|
| 295 |
self._make_result(score, explanation, len(premise))
|
| 296 |
-
for (premise, explanation), score in zip(pairs, scores)
|
| 297 |
]
|
| 298 |
|
| 299 |
|
|
|
|
| 269 |
ClaimResult(
|
| 270 |
claim=claim, score=score, is_hallucinated=score < self.threshold
|
| 271 |
)
|
| 272 |
+
for claim, score in zip(claims, scores, strict=True)
|
| 273 |
]
|
| 274 |
|
| 275 |
def check_batch(
|
|
|
|
| 293 |
|
| 294 |
return [
|
| 295 |
self._make_result(score, explanation, len(premise))
|
| 296 |
+
for (premise, explanation), score in zip(pairs, scores, strict=True)
|
| 297 |
]
|
| 298 |
|
| 299 |
|
sage/adapters/vector_store.py
CHANGED
|
@@ -4,7 +4,13 @@ Qdrant vector store adapter.
|
|
| 4 |
Wraps Qdrant client operations for storing and searching review embeddings.
|
| 5 |
"""
|
| 6 |
|
|
|
|
|
|
|
| 7 |
import hashlib
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
from sage.core import Chunk
|
| 10 |
from sage.config import (
|
|
@@ -114,7 +120,7 @@ def create_payload_indexes(client, collection_name: str = COLLECTION_NAME) -> No
|
|
| 114 |
def upload_chunks(
|
| 115 |
client,
|
| 116 |
chunks: list[Chunk],
|
| 117 |
-
embeddings: list,
|
| 118 |
collection_name: str = COLLECTION_NAME,
|
| 119 |
batch_size: int = 100,
|
| 120 |
) -> None:
|
|
@@ -133,7 +139,7 @@ def upload_chunks(
|
|
| 133 |
|
| 134 |
points = []
|
| 135 |
|
| 136 |
-
for chunk, embedding in zip(chunks, embeddings):
|
| 137 |
point_id = _generate_point_id(chunk.review_id, chunk.chunk_index)
|
| 138 |
point = PointStruct(
|
| 139 |
id=point_id,
|
|
@@ -251,5 +257,6 @@ def collection_exists(client, collection_name: str = COLLECTION_NAME) -> bool:
|
|
| 251 |
return False
|
| 252 |
info = client.get_collection(collection_name)
|
| 253 |
return info.points_count > 0
|
| 254 |
-
except Exception:
|
|
|
|
| 255 |
return False
|
|
|
|
| 4 |
Wraps Qdrant client operations for storing and searching review embeddings.
|
| 5 |
"""
|
| 6 |
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
import hashlib
|
| 10 |
+
from typing import TYPE_CHECKING
|
| 11 |
+
|
| 12 |
+
if TYPE_CHECKING:
|
| 13 |
+
import numpy as np
|
| 14 |
|
| 15 |
from sage.core import Chunk
|
| 16 |
from sage.config import (
|
|
|
|
| 120 |
def upload_chunks(
|
| 121 |
client,
|
| 122 |
chunks: list[Chunk],
|
| 123 |
+
embeddings: list | "np.ndarray",
|
| 124 |
collection_name: str = COLLECTION_NAME,
|
| 125 |
batch_size: int = 100,
|
| 126 |
) -> None:
|
|
|
|
| 139 |
|
| 140 |
points = []
|
| 141 |
|
| 142 |
+
for chunk, embedding in zip(chunks, embeddings, strict=True):
|
| 143 |
point_id = _generate_point_id(chunk.review_id, chunk.chunk_index)
|
| 144 |
point = PointStruct(
|
| 145 |
id=point_id,
|
|
|
|
| 257 |
return False
|
| 258 |
info = client.get_collection(collection_name)
|
| 259 |
return info.points_count > 0
|
| 260 |
+
except Exception as e:
|
| 261 |
+
logger.debug("collection_exists check failed: %s", e)
|
| 262 |
return False
|
sage/api/app.py
CHANGED
|
@@ -8,13 +8,12 @@ once at startup and shared across requests.
|
|
| 8 |
|
| 9 |
from __future__ import annotations
|
| 10 |
|
|
|
|
| 11 |
from contextlib import asynccontextmanager
|
| 12 |
|
| 13 |
from fastapi import FastAPI
|
| 14 |
from starlette.middleware.cors import CORSMiddleware
|
| 15 |
|
| 16 |
-
import os
|
| 17 |
-
|
| 18 |
from sage.api.middleware import LatencyMiddleware
|
| 19 |
from sage.api.routes import router
|
| 20 |
from sage.config import get_logger
|
|
|
|
| 8 |
|
| 9 |
from __future__ import annotations
|
| 10 |
|
| 11 |
+
import os
|
| 12 |
from contextlib import asynccontextmanager
|
| 13 |
|
| 14 |
from fastapi import FastAPI
|
| 15 |
from starlette.middleware.cors import CORSMiddleware
|
| 16 |
|
|
|
|
|
|
|
| 17 |
from sage.api.middleware import LatencyMiddleware
|
| 18 |
from sage.api.routes import router
|
| 19 |
from sage.config import get_logger
|
sage/api/routes.py
CHANGED
|
@@ -15,9 +15,12 @@ from __future__ import annotations
|
|
| 15 |
import json
|
| 16 |
from concurrent.futures import ThreadPoolExecutor
|
| 17 |
from dataclasses import dataclass
|
| 18 |
-
from typing import Iterator
|
| 19 |
|
| 20 |
-
from fastapi import APIRouter, Depends, Query, Request, Response
|
|
|
|
|
|
|
|
|
|
| 21 |
from fastapi.responses import JSONResponse, StreamingResponse
|
| 22 |
from pydantic import BaseModel
|
| 23 |
|
|
@@ -113,8 +116,8 @@ class RecommendParams:
|
|
| 113 |
|
| 114 |
def _fetch_products(
|
| 115 |
params: RecommendParams,
|
| 116 |
-
app,
|
| 117 |
-
query_embedding=None,
|
| 118 |
) -> list[ProductScore]:
|
| 119 |
"""Run candidate generation with lifespan-managed singletons."""
|
| 120 |
return get_candidates(
|
|
@@ -238,7 +241,7 @@ def recommend(
|
|
| 238 |
results = list(pool.map(_explain, products))
|
| 239 |
|
| 240 |
for i, (product, (er, hr, cr)) in enumerate(
|
| 241 |
-
zip(products, results),
|
| 242 |
1,
|
| 243 |
):
|
| 244 |
rec = _build_product_dict(i, product)
|
|
|
|
| 15 |
import json
|
| 16 |
from concurrent.futures import ThreadPoolExecutor
|
| 17 |
from dataclasses import dataclass
|
| 18 |
+
from typing import TYPE_CHECKING, Iterator
|
| 19 |
|
| 20 |
+
from fastapi import APIRouter, Depends, FastAPI, Query, Request, Response
|
| 21 |
+
|
| 22 |
+
if TYPE_CHECKING:
|
| 23 |
+
import numpy as np
|
| 24 |
from fastapi.responses import JSONResponse, StreamingResponse
|
| 25 |
from pydantic import BaseModel
|
| 26 |
|
|
|
|
| 116 |
|
| 117 |
def _fetch_products(
|
| 118 |
params: RecommendParams,
|
| 119 |
+
app: FastAPI,
|
| 120 |
+
query_embedding: "np.ndarray | None" = None,
|
| 121 |
) -> list[ProductScore]:
|
| 122 |
"""Run candidate generation with lifespan-managed singletons."""
|
| 123 |
return get_candidates(
|
|
|
|
| 241 |
results = list(pool.map(_explain, products))
|
| 242 |
|
| 243 |
for i, (product, (er, hr, cr)) in enumerate(
|
| 244 |
+
zip(products, results, strict=True),
|
| 245 |
1,
|
| 246 |
):
|
| 247 |
rec = _build_product_dict(i, product)
|
sage/config/__init__.py
CHANGED
|
@@ -156,7 +156,11 @@ EVAL_DIMENSIONS = {
|
|
| 156 |
}
|
| 157 |
|
| 158 |
|
| 159 |
-
from sage.config.queries import
|
|
|
|
|
|
|
|
|
|
|
|
|
| 160 |
|
| 161 |
|
| 162 |
# ---------------------------------------------------------------------------
|
|
@@ -246,6 +250,8 @@ __all__ = [
|
|
| 246 |
# Evaluation
|
| 247 |
"EVAL_DIMENSIONS",
|
| 248 |
"EVALUATION_QUERIES",
|
|
|
|
|
|
|
| 249 |
# Utilities
|
| 250 |
"save_results",
|
| 251 |
# Logging
|
|
|
|
| 156 |
}
|
| 157 |
|
| 158 |
|
| 159 |
+
from sage.config.queries import ( # noqa: E402
|
| 160 |
+
ANALYSIS_QUERIES,
|
| 161 |
+
E2E_EVAL_QUERIES,
|
| 162 |
+
EVALUATION_QUERIES,
|
| 163 |
+
)
|
| 164 |
|
| 165 |
|
| 166 |
# ---------------------------------------------------------------------------
|
|
|
|
| 250 |
# Evaluation
|
| 251 |
"EVAL_DIMENSIONS",
|
| 252 |
"EVALUATION_QUERIES",
|
| 253 |
+
"ANALYSIS_QUERIES",
|
| 254 |
+
"E2E_EVAL_QUERIES",
|
| 255 |
# Utilities
|
| 256 |
"save_results",
|
| 257 |
# Logging
|
sage/config/queries.py
CHANGED
|
@@ -5,6 +5,7 @@ Separated from main config to keep configuration declarative.
|
|
| 5 |
These are test fixtures used by evaluation scripts.
|
| 6 |
"""
|
| 7 |
|
|
|
|
| 8 |
EVALUATION_QUERIES = [
|
| 9 |
# Common product categories (high confidence expected)
|
| 10 |
"wireless headphones with noise cancellation",
|
|
@@ -29,3 +30,45 @@ EVALUATION_QUERIES = [
|
|
| 29 |
"noise cancelling headphones for travel",
|
| 30 |
"portable speaker with good bass",
|
| 31 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
These are test fixtures used by evaluation scripts.
|
| 6 |
"""
|
| 7 |
|
| 8 |
+
# Primary evaluation queries - used for general RAGAS/HHEM evaluation
|
| 9 |
EVALUATION_QUERIES = [
|
| 10 |
# Common product categories (high confidence expected)
|
| 11 |
"wireless headphones with noise cancellation",
|
|
|
|
| 30 |
"noise cancelling headphones for travel",
|
| 31 |
"portable speaker with good bass",
|
| 32 |
]
|
| 33 |
+
|
| 34 |
+
# Queries for failure analysis - focused on edge cases and challenging queries
|
| 35 |
+
ANALYSIS_QUERIES = [
|
| 36 |
+
"wireless headphones with noise cancellation",
|
| 37 |
+
"laptop charger for MacBook",
|
| 38 |
+
"USB hub with multiple ports",
|
| 39 |
+
"portable battery pack for travel",
|
| 40 |
+
"bluetooth speaker with good bass",
|
| 41 |
+
"cheap but good quality earbuds",
|
| 42 |
+
"durable phone case that looks nice",
|
| 43 |
+
"fast charging cable that won't break",
|
| 44 |
+
"comfortable headphones for long sessions",
|
| 45 |
+
"quiet keyboard for office",
|
| 46 |
+
"headphones that don't hurt ears",
|
| 47 |
+
"charger that actually works",
|
| 48 |
+
"waterproof speaker for shower",
|
| 49 |
+
"gift for someone who likes music",
|
| 50 |
+
]
|
| 51 |
+
|
| 52 |
+
# Queries for end-to-end success rate evaluation - comprehensive coverage
|
| 53 |
+
E2E_EVAL_QUERIES = [
|
| 54 |
+
"wireless headphones with noise cancellation",
|
| 55 |
+
"laptop charger for MacBook",
|
| 56 |
+
"USB hub with multiple ports",
|
| 57 |
+
"portable battery pack for travel",
|
| 58 |
+
"bluetooth speaker with good bass",
|
| 59 |
+
"cheap but good quality earbuds",
|
| 60 |
+
"durable phone case that looks nice",
|
| 61 |
+
"fast charging cable that won't break",
|
| 62 |
+
"comfortable headphones for long sessions",
|
| 63 |
+
"quiet keyboard for office",
|
| 64 |
+
"headphones that don't hurt ears",
|
| 65 |
+
"charger that actually works",
|
| 66 |
+
"waterproof speaker for shower",
|
| 67 |
+
"gift for someone who likes music",
|
| 68 |
+
"tablet stand for kitchen",
|
| 69 |
+
"wireless mouse for laptop",
|
| 70 |
+
"HDMI cable for monitor",
|
| 71 |
+
"phone mount for car",
|
| 72 |
+
"screen protector for phone",
|
| 73 |
+
"backup battery for camping",
|
| 74 |
+
]
|
sage/core/__init__.py
CHANGED
|
@@ -21,6 +21,8 @@ from sage.core.models import (
|
|
| 21 |
ExplanationResult,
|
| 22 |
StreamingExplanation,
|
| 23 |
# Verification
|
|
|
|
|
|
|
| 24 |
QuoteVerification,
|
| 25 |
VerificationResult,
|
| 26 |
# Hallucination Detection
|
|
@@ -59,8 +61,6 @@ from sage.core.aggregation import (
|
|
| 59 |
# Verification
|
| 60 |
from sage.core.verification import (
|
| 61 |
FORBIDDEN_PHRASES,
|
| 62 |
-
CitationResult,
|
| 63 |
-
CitationVerificationResult,
|
| 64 |
ForbiddenPhraseResult,
|
| 65 |
check_forbidden_phrases,
|
| 66 |
extract_citations,
|
|
|
|
| 21 |
ExplanationResult,
|
| 22 |
StreamingExplanation,
|
| 23 |
# Verification
|
| 24 |
+
CitationResult,
|
| 25 |
+
CitationVerificationResult,
|
| 26 |
QuoteVerification,
|
| 27 |
VerificationResult,
|
| 28 |
# Hallucination Detection
|
|
|
|
| 61 |
# Verification
|
| 62 |
from sage.core.verification import (
|
| 63 |
FORBIDDEN_PHRASES,
|
|
|
|
|
|
|
| 64 |
ForbiddenPhraseResult,
|
| 65 |
check_forbidden_phrases,
|
| 66 |
extract_citations,
|
sage/core/chunking.py
CHANGED
|
@@ -91,8 +91,8 @@ def sliding_window_chunk(
|
|
| 91 |
Returns:
|
| 92 |
List of chunk texts.
|
| 93 |
"""
|
| 94 |
-
chars_per_chunk = chunk_size *
|
| 95 |
-
chars_overlap = overlap *
|
| 96 |
|
| 97 |
chunks = []
|
| 98 |
start = 0
|
|
|
|
| 91 |
Returns:
|
| 92 |
List of chunk texts.
|
| 93 |
"""
|
| 94 |
+
chars_per_chunk = chunk_size * CHARS_PER_TOKEN
|
| 95 |
+
chars_overlap = overlap * CHARS_PER_TOKEN
|
| 96 |
|
| 97 |
chunks = []
|
| 98 |
start = 0
|
sage/core/evidence.py
CHANGED
|
@@ -17,7 +17,7 @@ from sage.core.models import EvidenceQuality, ProductScore
|
|
| 17 |
# due to insufficient evidence. They prevent hallucination by declining to
|
| 18 |
# explain when the LLM would be forced to fabricate claims.
|
| 19 |
#
|
| 20 |
-
# Threshold selection rationale based on failure analysis
|
| 21 |
# =============================================================================
|
| 22 |
|
| 23 |
# Minimum number of evidence chunks required for explanation generation.
|
|
|
|
| 17 |
# due to insufficient evidence. They prevent hallucination by declining to
|
| 18 |
# explain when the LLM would be forced to fabricate claims.
|
| 19 |
#
|
| 20 |
+
# Threshold selection rationale based on failure analysis:
|
| 21 |
# =============================================================================
|
| 22 |
|
| 23 |
# Minimum number of evidence chunks required for explanation generation.
|
sage/core/models.py
CHANGED
|
@@ -167,7 +167,7 @@ class ExplanationResult:
|
|
| 167 |
"""Build serializable evidence list from ids and texts."""
|
| 168 |
return [
|
| 169 |
{"id": eid, "text": etxt}
|
| 170 |
-
for eid, etxt in zip(self.evidence_ids, self.evidence_texts)
|
| 171 |
]
|
| 172 |
|
| 173 |
|
|
@@ -266,6 +266,32 @@ class VerificationResult:
|
|
| 266 |
missing_quotes: list[str] = field(default_factory=list)
|
| 267 |
|
| 268 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 269 |
# ============================================================================
|
| 270 |
# HALLUCINATION DETECTION MODELS
|
| 271 |
# ============================================================================
|
|
|
|
| 167 |
"""Build serializable evidence list from ids and texts."""
|
| 168 |
return [
|
| 169 |
{"id": eid, "text": etxt}
|
| 170 |
+
for eid, etxt in zip(self.evidence_ids, self.evidence_texts, strict=True)
|
| 171 |
]
|
| 172 |
|
| 173 |
|
|
|
|
| 266 |
missing_quotes: list[str] = field(default_factory=list)
|
| 267 |
|
| 268 |
|
| 269 |
+
@dataclass
|
| 270 |
+
class CitationResult:
|
| 271 |
+
"""Result of verifying a single citation."""
|
| 272 |
+
|
| 273 |
+
citation_id: str
|
| 274 |
+
found: bool
|
| 275 |
+
quote_text: str | None = None # The quote associated with this citation
|
| 276 |
+
source_text: str | None = None # The evidence text if found
|
| 277 |
+
|
| 278 |
+
|
| 279 |
+
@dataclass
|
| 280 |
+
class CitationVerificationResult:
|
| 281 |
+
"""Result of citation verification for an explanation."""
|
| 282 |
+
|
| 283 |
+
all_valid: bool
|
| 284 |
+
citations_found: int
|
| 285 |
+
citations_invalid: int
|
| 286 |
+
valid_citations: list[CitationResult] = field(default_factory=list)
|
| 287 |
+
invalid_citations: list[CitationResult] = field(default_factory=list)
|
| 288 |
+
|
| 289 |
+
@property
|
| 290 |
+
def n_citations(self) -> int:
|
| 291 |
+
"""Total number of citations in explanation."""
|
| 292 |
+
return self.citations_found + self.citations_invalid
|
| 293 |
+
|
| 294 |
+
|
| 295 |
# ============================================================================
|
| 296 |
# HALLUCINATION DETECTION MODELS
|
| 297 |
# ============================================================================
|
sage/core/verification.py
CHANGED
|
@@ -11,9 +11,14 @@ non-existent review IDs.
|
|
| 11 |
"""
|
| 12 |
|
| 13 |
import re
|
| 14 |
-
from dataclasses import dataclass
|
| 15 |
|
| 16 |
-
from sage.core.models import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
|
| 18 |
|
| 19 |
# Forbidden phrases that violate prompt constraints.
|
|
@@ -207,32 +212,6 @@ def verify_explanation(
|
|
| 207 |
# =============================================================================
|
| 208 |
|
| 209 |
|
| 210 |
-
@dataclass
|
| 211 |
-
class CitationResult:
|
| 212 |
-
"""Result of verifying a single citation."""
|
| 213 |
-
|
| 214 |
-
citation_id: str
|
| 215 |
-
found: bool
|
| 216 |
-
quote_text: str | None = None # The quote associated with this citation
|
| 217 |
-
source_text: str | None = None # The evidence text if found
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
@dataclass
|
| 221 |
-
class CitationVerificationResult:
|
| 222 |
-
"""Result of citation verification for an explanation."""
|
| 223 |
-
|
| 224 |
-
all_valid: bool
|
| 225 |
-
citations_found: int
|
| 226 |
-
citations_invalid: int
|
| 227 |
-
valid_citations: list[CitationResult] = field(default_factory=list)
|
| 228 |
-
invalid_citations: list[CitationResult] = field(default_factory=list)
|
| 229 |
-
|
| 230 |
-
@property
|
| 231 |
-
def n_citations(self) -> int:
|
| 232 |
-
"""Total number of citations in explanation."""
|
| 233 |
-
return self.citations_found + self.citations_invalid
|
| 234 |
-
|
| 235 |
-
|
| 236 |
def extract_citations(text: str) -> list[tuple[str, str | None]]:
|
| 237 |
"""
|
| 238 |
Extract citation IDs and their associated quotes from explanation text.
|
|
|
|
| 11 |
"""
|
| 12 |
|
| 13 |
import re
|
| 14 |
+
from dataclasses import dataclass
|
| 15 |
|
| 16 |
+
from sage.core.models import (
|
| 17 |
+
CitationResult,
|
| 18 |
+
CitationVerificationResult,
|
| 19 |
+
QuoteVerification,
|
| 20 |
+
VerificationResult,
|
| 21 |
+
)
|
| 22 |
|
| 23 |
|
| 24 |
# Forbidden phrases that violate prompt constraints.
|
|
|
|
| 212 |
# =============================================================================
|
| 213 |
|
| 214 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 215 |
def extract_citations(text: str) -> list[tuple[str, str | None]]:
|
| 216 |
"""
|
| 217 |
Extract citation IDs and their associated quotes from explanation text.
|
sage/services/faithfulness.py
CHANGED
|
@@ -46,10 +46,69 @@ def is_event_loop_running() -> bool:
|
|
| 46 |
return False
|
| 47 |
|
| 48 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
def create_ragas_sample(query: str, explanation: str, evidence_texts: list[str]):
|
| 50 |
"""
|
| 51 |
Create a RAGAS SingleTurnSample from explanation data.
|
| 52 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
Args:
|
| 54 |
query: User's original query.
|
| 55 |
explanation: Generated explanation text.
|
|
@@ -66,10 +125,16 @@ def create_ragas_sample(query: str, explanation: str, evidence_texts: list[str])
|
|
| 66 |
except ImportError:
|
| 67 |
raise ImportError("ragas package required. Install with: pip install ragas")
|
| 68 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
return SingleTurnSample(
|
| 70 |
user_input=query,
|
| 71 |
-
response=
|
| 72 |
-
retrieved_contexts=
|
| 73 |
)
|
| 74 |
|
| 75 |
|
|
@@ -223,7 +288,7 @@ class FaithfulnessEvaluator:
|
|
| 223 |
evidence_count=len(er.evidence_texts),
|
| 224 |
meets_target=float(score) >= self.target,
|
| 225 |
)
|
| 226 |
-
for er, score in zip(explanation_results, scores)
|
| 227 |
]
|
| 228 |
|
| 229 |
scores_arr = np.array(scores)
|
|
@@ -400,7 +465,7 @@ def compute_adjusted_faithfulness(
|
|
| 400 |
# - Regular recommendations evaluated by HHEM
|
| 401 |
regular_passes = sum(
|
| 402 |
1
|
| 403 |
-
for r, is_non_rec in zip(results, valid_non_recs)
|
| 404 |
if not is_non_rec and not r.is_hallucinated
|
| 405 |
)
|
| 406 |
adjusted_passes = regular_passes + n_valid_non_recs
|
|
|
|
| 46 |
return False
|
| 47 |
|
| 48 |
|
| 49 |
+
def _clean_explanation_for_ragas(explanation: str) -> str:
|
| 50 |
+
"""
|
| 51 |
+
Clean explanation text for RAGAS evaluation.
|
| 52 |
+
|
| 53 |
+
RAGAS fails on explanations with quotes + citations together, even when
|
| 54 |
+
the quoted content is verbatim from evidence. This is a known limitation.
|
| 55 |
+
We clean the explanation to remove metadata (citations, framing) while
|
| 56 |
+
preserving the factual claims for evaluation.
|
| 57 |
+
|
| 58 |
+
Args:
|
| 59 |
+
explanation: Original explanation with framing and citations.
|
| 60 |
+
|
| 61 |
+
Returns:
|
| 62 |
+
Cleaned explanation suitable for RAGAS faithfulness evaluation.
|
| 63 |
+
"""
|
| 64 |
+
import re
|
| 65 |
+
|
| 66 |
+
text = explanation
|
| 67 |
+
|
| 68 |
+
# Remove [review_X] citations - these are metadata, not claims
|
| 69 |
+
text = re.sub(r"\s*\[review_\d+\]", "", text)
|
| 70 |
+
|
| 71 |
+
# Remove framing phrases that aren't factual claims (order matters - longer first)
|
| 72 |
+
framing_patterns = [
|
| 73 |
+
r"According to reviews?,?\s*",
|
| 74 |
+
r"Customers report\s+",
|
| 75 |
+
r"Reviewers say\s+",
|
| 76 |
+
r"One user said\s+",
|
| 77 |
+
r"One user found\s+",
|
| 78 |
+
r"One reviewer found\s+",
|
| 79 |
+
r"One reviewer confirms?\s+(it\s+)?",
|
| 80 |
+
r"One reviewer\s+",
|
| 81 |
+
r"Users mention\s+",
|
| 82 |
+
r"Users also note\s+",
|
| 83 |
+
r"Users note\s+",
|
| 84 |
+
r"Reviewers?\s+(also\s+)?note\s+",
|
| 85 |
+
r"Reviewers?\s+(also\s+)?mention\s+",
|
| 86 |
+
r"Reviewers?\s+confirm\s+",
|
| 87 |
+
r"Reviewers?\s+praise\s+",
|
| 88 |
+
r"Reviewers?\s+highlight\s+",
|
| 89 |
+
]
|
| 90 |
+
for pattern in framing_patterns:
|
| 91 |
+
text = re.sub(pattern, "", text, flags=re.IGNORECASE)
|
| 92 |
+
|
| 93 |
+
# Clean up "and" between quotes to make separate sentences
|
| 94 |
+
text = re.sub(r'\s+and\s+"', '. "', text)
|
| 95 |
+
|
| 96 |
+
# Clean up residual empty/hanging parts
|
| 97 |
+
text = re.sub(r"\s+\.", ".", text)
|
| 98 |
+
text = re.sub(r"\s+,", ",", text)
|
| 99 |
+
text = re.sub(r"\s{2,}", " ", text)
|
| 100 |
+
|
| 101 |
+
return text.strip()
|
| 102 |
+
|
| 103 |
+
|
| 104 |
def create_ragas_sample(query: str, explanation: str, evidence_texts: list[str]):
|
| 105 |
"""
|
| 106 |
Create a RAGAS SingleTurnSample from explanation data.
|
| 107 |
|
| 108 |
+
Cleans the explanation to remove citations and framing that RAGAS
|
| 109 |
+
incorrectly penalizes, and combines evidence into a single context
|
| 110 |
+
for proper claim verification.
|
| 111 |
+
|
| 112 |
Args:
|
| 113 |
query: User's original query.
|
| 114 |
explanation: Generated explanation text.
|
|
|
|
| 125 |
except ImportError:
|
| 126 |
raise ImportError("ragas package required. Install with: pip install ragas")
|
| 127 |
|
| 128 |
+
# Clean explanation for RAGAS evaluation
|
| 129 |
+
cleaned_explanation = _clean_explanation_for_ragas(explanation)
|
| 130 |
+
|
| 131 |
+
# Combine evidence into single context (RAGAS has issues with multiple contexts)
|
| 132 |
+
combined_evidence = " ".join(evidence_texts)
|
| 133 |
+
|
| 134 |
return SingleTurnSample(
|
| 135 |
user_input=query,
|
| 136 |
+
response=cleaned_explanation,
|
| 137 |
+
retrieved_contexts=[combined_evidence],
|
| 138 |
)
|
| 139 |
|
| 140 |
|
|
|
|
| 288 |
evidence_count=len(er.evidence_texts),
|
| 289 |
meets_target=float(score) >= self.target,
|
| 290 |
)
|
| 291 |
+
for er, score in zip(explanation_results, scores, strict=True)
|
| 292 |
]
|
| 293 |
|
| 294 |
scores_arr = np.array(scores)
|
|
|
|
| 465 |
# - Regular recommendations evaluated by HHEM
|
| 466 |
regular_passes = sum(
|
| 467 |
1
|
| 468 |
+
for r, is_non_rec in zip(results, valid_non_recs, strict=True)
|
| 469 |
if not is_non_rec and not r.is_hallucinated
|
| 470 |
)
|
| 471 |
adjusted_passes = regular_passes + n_valid_non_recs
|
scripts/build_eval_dataset.py
CHANGED
|
@@ -536,7 +536,7 @@ def save_eval_cases(
|
|
| 536 |
for c in cases
|
| 537 |
]
|
| 538 |
|
| 539 |
-
with open(filepath, "w") as f:
|
| 540 |
json.dump(data, f, indent=2)
|
| 541 |
|
| 542 |
if verbose:
|
|
@@ -557,7 +557,7 @@ def load_eval_cases(filename: str) -> list[EvalCase]:
|
|
| 557 |
"""
|
| 558 |
filepath = EVAL_DIR / filename
|
| 559 |
|
| 560 |
-
with open(filepath) as f:
|
| 561 |
data = json.load(f)
|
| 562 |
|
| 563 |
return [
|
|
|
|
| 536 |
for c in cases
|
| 537 |
]
|
| 538 |
|
| 539 |
+
with open(filepath, "w", encoding="utf-8") as f:
|
| 540 |
json.dump(data, f, indent=2)
|
| 541 |
|
| 542 |
if verbose:
|
|
|
|
| 557 |
"""
|
| 558 |
filepath = EVAL_DIR / filename
|
| 559 |
|
| 560 |
+
with open(filepath, encoding="utf-8") as f:
|
| 561 |
data = json.load(f)
|
| 562 |
|
| 563 |
return [
|
scripts/build_natural_eval_dataset.py
CHANGED
|
@@ -479,7 +479,7 @@ def save_natural_eval_cases(
|
|
| 479 |
}
|
| 480 |
)
|
| 481 |
|
| 482 |
-
with open(filepath, "w") as f:
|
| 483 |
json.dump(data, f, indent=2)
|
| 484 |
|
| 485 |
logger.info("Saved %d natural language eval cases to: %s", len(data), filepath)
|
|
|
|
| 479 |
}
|
| 480 |
)
|
| 481 |
|
| 482 |
+
with open(filepath, "w", encoding="utf-8") as f:
|
| 483 |
json.dump(data, f, indent=2)
|
| 484 |
|
| 485 |
logger.info("Saved %d natural language eval cases to: %s", len(data), filepath)
|
scripts/demo.py
CHANGED
|
@@ -86,7 +86,12 @@ def demo_recommendation(query: str, top_k: int = 3, max_evidence: int = 3):
|
|
| 86 |
# Display evidence traceability
|
| 87 |
log_section(logger, "EVIDENCE SOURCES")
|
| 88 |
for j, (ev_id, ev_text) in enumerate(
|
| 89 |
-
zip(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
):
|
| 91 |
# Truncate long evidence for display
|
| 92 |
display_text = ev_text[:200] + "..." if len(ev_text) > 200 else ev_text
|
|
@@ -108,7 +113,9 @@ def demo_recommendation(query: str, top_k: int = 3, max_evidence: int = 3):
|
|
| 108 |
"evidence_sources": [
|
| 109 |
{"id": ev_id, "text": ev_text}
|
| 110 |
for ev_id, ev_text in zip(
|
| 111 |
-
explanation_result.evidence_ids,
|
|
|
|
|
|
|
| 112 |
)
|
| 113 |
],
|
| 114 |
}
|
|
|
|
| 86 |
# Display evidence traceability
|
| 87 |
log_section(logger, "EVIDENCE SOURCES")
|
| 88 |
for j, (ev_id, ev_text) in enumerate(
|
| 89 |
+
zip(
|
| 90 |
+
explanation_result.evidence_ids,
|
| 91 |
+
explanation_result.evidence_texts,
|
| 92 |
+
strict=True,
|
| 93 |
+
),
|
| 94 |
+
1,
|
| 95 |
):
|
| 96 |
# Truncate long evidence for display
|
| 97 |
display_text = ev_text[:200] + "..." if len(ev_text) > 200 else ev_text
|
|
|
|
| 113 |
"evidence_sources": [
|
| 114 |
{"id": ev_id, "text": ev_text}
|
| 115 |
for ev_id, ev_text in zip(
|
| 116 |
+
explanation_result.evidence_ids,
|
| 117 |
+
explanation_result.evidence_texts,
|
| 118 |
+
strict=True,
|
| 119 |
)
|
| 120 |
],
|
| 121 |
}
|
scripts/e2e_success_rate.py
CHANGED
|
@@ -19,7 +19,8 @@ from dataclasses import dataclass, asdict
|
|
| 19 |
from datetime import datetime
|
| 20 |
|
| 21 |
from sage.config import (
|
| 22 |
-
|
|
|
|
| 23 |
get_logger,
|
| 24 |
log_banner,
|
| 25 |
log_section,
|
|
@@ -31,33 +32,6 @@ from sage.services.retrieval import get_candidates
|
|
| 31 |
|
| 32 |
logger = get_logger(__name__)
|
| 33 |
|
| 34 |
-
RESULTS_DIR = DATA_DIR / "eval_results"
|
| 35 |
-
RESULTS_DIR.mkdir(exist_ok=True)
|
| 36 |
-
|
| 37 |
-
# Evaluation queries - mix of natural language intents
|
| 38 |
-
EVAL_QUERIES = [
|
| 39 |
-
"wireless headphones with noise cancellation",
|
| 40 |
-
"laptop charger for MacBook",
|
| 41 |
-
"USB hub with multiple ports",
|
| 42 |
-
"portable battery pack for travel",
|
| 43 |
-
"bluetooth speaker with good bass",
|
| 44 |
-
"cheap but good quality earbuds",
|
| 45 |
-
"durable phone case that looks nice",
|
| 46 |
-
"fast charging cable that won't break",
|
| 47 |
-
"comfortable headphones for long sessions",
|
| 48 |
-
"quiet keyboard for office",
|
| 49 |
-
"headphones that don't hurt ears",
|
| 50 |
-
"charger that actually works",
|
| 51 |
-
"waterproof speaker for shower",
|
| 52 |
-
"gift for someone who likes music",
|
| 53 |
-
"tablet stand for kitchen",
|
| 54 |
-
"wireless mouse for laptop",
|
| 55 |
-
"HDMI cable for monitor",
|
| 56 |
-
"phone mount for car",
|
| 57 |
-
"screen protector for phone",
|
| 58 |
-
"backup battery for camping",
|
| 59 |
-
]
|
| 60 |
-
|
| 61 |
|
| 62 |
@dataclass
|
| 63 |
class CaseResult:
|
|
@@ -137,7 +111,7 @@ def run_e2e_evaluation(n_samples: int = 20) -> E2EReport:
|
|
| 137 |
is_valid_non_recommendation,
|
| 138 |
)
|
| 139 |
|
| 140 |
-
queries =
|
| 141 |
|
| 142 |
log_banner(logger, "END-TO-END SUCCESS RATE EVALUATION")
|
| 143 |
logger.info("Samples: %d", len(queries))
|
|
@@ -408,7 +382,7 @@ def run_e2e_evaluation(n_samples: int = 20) -> E2EReport:
|
|
| 408 |
output_file = (
|
| 409 |
RESULTS_DIR / f"e2e_success_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
|
| 410 |
)
|
| 411 |
-
with open(output_file, "w") as f:
|
| 412 |
json.dump(output, f, indent=2)
|
| 413 |
logger.info("Saved: %s", output_file)
|
| 414 |
|
|
|
|
| 19 |
from datetime import datetime
|
| 20 |
|
| 21 |
from sage.config import (
|
| 22 |
+
E2E_EVAL_QUERIES,
|
| 23 |
+
RESULTS_DIR,
|
| 24 |
get_logger,
|
| 25 |
log_banner,
|
| 26 |
log_section,
|
|
|
|
| 32 |
|
| 33 |
logger = get_logger(__name__)
|
| 34 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
|
| 36 |
@dataclass
|
| 37 |
class CaseResult:
|
|
|
|
| 111 |
is_valid_non_recommendation,
|
| 112 |
)
|
| 113 |
|
| 114 |
+
queries = E2E_EVAL_QUERIES[:n_samples]
|
| 115 |
|
| 116 |
log_banner(logger, "END-TO-END SUCCESS RATE EVALUATION")
|
| 117 |
logger.info("Samples: %d", len(queries))
|
|
|
|
| 382 |
output_file = (
|
| 383 |
RESULTS_DIR / f"e2e_success_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
|
| 384 |
)
|
| 385 |
+
with open(output_file, "w", encoding="utf-8") as f:
|
| 386 |
json.dump(output, f, indent=2)
|
| 387 |
logger.info("Saved: %s", output_file)
|
| 388 |
|
scripts/eda.py
CHANGED
|
@@ -8,7 +8,7 @@
|
|
| 8 |
import pandas as pd
|
| 9 |
import matplotlib.pyplot as plt
|
| 10 |
|
| 11 |
-
from sage.config import DEV_SUBSET_SIZE, DATA_DIR
|
| 12 |
from sage.data import load_reviews, get_review_stats, prepare_data
|
| 13 |
|
| 14 |
# Output directory for figures
|
|
@@ -21,7 +21,9 @@ plt.rcParams.update(
|
|
| 21 |
{
|
| 22 |
"figure.figsize": (10, 5),
|
| 23 |
"figure.dpi": 100,
|
| 24 |
-
"savefig.dpi":
|
|
|
|
|
|
|
| 25 |
"font.size": 11,
|
| 26 |
"axes.titlesize": 12,
|
| 27 |
"axes.labelsize": 11,
|
|
@@ -67,7 +69,7 @@ ax.set_ylabel("Count")
|
|
| 67 |
ax.set_title("Rating Distribution")
|
| 68 |
ax.set_xticks(rating_counts.index)
|
| 69 |
|
| 70 |
-
for bar, count in zip(bars, rating_counts.values):
|
| 71 |
ax.text(
|
| 72 |
bar.get_x() + bar.get_width() / 2,
|
| 73 |
bar.get_height() + 50,
|
|
@@ -77,9 +79,7 @@ for bar, count in zip(bars, rating_counts.values):
|
|
| 77 |
fontsize=10,
|
| 78 |
)
|
| 79 |
|
| 80 |
-
plt.
|
| 81 |
-
plt.savefig(FIGURES_DIR / "rating_distribution.png", dpi=150)
|
| 82 |
-
plt.show()
|
| 83 |
|
| 84 |
print("\nRating breakdown:")
|
| 85 |
for rating, count in rating_counts.items():
|
|
@@ -89,7 +89,7 @@ for rating, count in rating_counts.items():
|
|
| 89 |
# %% Review length analysis
|
| 90 |
df["text_length"] = df["text"].str.len()
|
| 91 |
df["word_count"] = df["text"].str.split().str.len()
|
| 92 |
-
df["estimated_tokens"] = df["text_length"] //
|
| 93 |
|
| 94 |
fig, axes = plt.subplots(1, 2, figsize=FIGURE_SIZE_WIDE)
|
| 95 |
|
|
@@ -120,9 +120,7 @@ ax2.set_title("Estimated Token Distribution")
|
|
| 120 |
ax2.axvline(200, color="red", linestyle="--", label="Chunking threshold (200)")
|
| 121 |
ax2.legend()
|
| 122 |
|
| 123 |
-
plt.
|
| 124 |
-
plt.savefig(FIGURES_DIR / "review_lengths.png", dpi=150)
|
| 125 |
-
plt.show()
|
| 126 |
|
| 127 |
needs_chunking = (df["estimated_tokens"] > 200).sum()
|
| 128 |
print("\nReview length stats:")
|
|
@@ -146,9 +144,7 @@ ax.set_ylabel("Median Review Length (chars)")
|
|
| 146 |
ax.set_title("Review Length by Rating")
|
| 147 |
ax.set_xticks([1, 2, 3, 4, 5])
|
| 148 |
|
| 149 |
-
plt.
|
| 150 |
-
plt.savefig(FIGURES_DIR / "length_by_rating.png", dpi=150)
|
| 151 |
-
plt.show()
|
| 152 |
|
| 153 |
print("\nMedian review length by rating:")
|
| 154 |
for rating, length in length_by_rating.items():
|
|
@@ -169,9 +165,7 @@ ax.set_ylabel("Number of Reviews")
|
|
| 169 |
ax.set_title("Reviews Over Time")
|
| 170 |
plt.xticks(rotation=45)
|
| 171 |
|
| 172 |
-
plt.
|
| 173 |
-
plt.savefig(FIGURES_DIR / "reviews_over_time.png", dpi=150)
|
| 174 |
-
plt.show()
|
| 175 |
|
| 176 |
print("\nTemporal range:")
|
| 177 |
print(f" Earliest: {df['datetime'].min()}")
|
|
@@ -230,9 +224,7 @@ ax2.set_xlabel("Reviews per Item")
|
|
| 230 |
ax2.set_ylabel("Number of Items")
|
| 231 |
ax2.set_title("Item Popularity Distribution")
|
| 232 |
|
| 233 |
-
plt.
|
| 234 |
-
plt.savefig(FIGURES_DIR / "user_item_distribution.png", dpi=150)
|
| 235 |
-
plt.show()
|
| 236 |
|
| 237 |
print("\nUser activity:")
|
| 238 |
print(
|
|
@@ -321,5 +313,3 @@ print(
|
|
| 321 |
)
|
| 322 |
print(f"Data quality issues: {empty_reviews + very_short + duplicate_texts}")
|
| 323 |
print(f"\nPlots saved to: {FIGURES_DIR}")
|
| 324 |
-
|
| 325 |
-
# %%
|
|
|
|
| 8 |
import pandas as pd
|
| 9 |
import matplotlib.pyplot as plt
|
| 10 |
|
| 11 |
+
from sage.config import CHARS_PER_TOKEN, DEV_SUBSET_SIZE, DATA_DIR
|
| 12 |
from sage.data import load_reviews, get_review_stats, prepare_data
|
| 13 |
|
| 14 |
# Output directory for figures
|
|
|
|
| 21 |
{
|
| 22 |
"figure.figsize": (10, 5),
|
| 23 |
"figure.dpi": 100,
|
| 24 |
+
"savefig.dpi": 300, # High-res for markdown reports
|
| 25 |
+
"savefig.bbox": "tight",
|
| 26 |
+
"savefig.pad_inches": 0.1,
|
| 27 |
"font.size": 11,
|
| 28 |
"axes.titlesize": 12,
|
| 29 |
"axes.labelsize": 11,
|
|
|
|
| 69 |
ax.set_title("Rating Distribution")
|
| 70 |
ax.set_xticks(rating_counts.index)
|
| 71 |
|
| 72 |
+
for bar, count in zip(bars, rating_counts.values, strict=True):
|
| 73 |
ax.text(
|
| 74 |
bar.get_x() + bar.get_width() / 2,
|
| 75 |
bar.get_height() + 50,
|
|
|
|
| 79 |
fontsize=10,
|
| 80 |
)
|
| 81 |
|
| 82 |
+
plt.savefig(FIGURES_DIR / "rating_distribution.png")
|
|
|
|
|
|
|
| 83 |
|
| 84 |
print("\nRating breakdown:")
|
| 85 |
for rating, count in rating_counts.items():
|
|
|
|
| 89 |
# %% Review length analysis
|
| 90 |
df["text_length"] = df["text"].str.len()
|
| 91 |
df["word_count"] = df["text"].str.split().str.len()
|
| 92 |
+
df["estimated_tokens"] = df["text_length"] // CHARS_PER_TOKEN
|
| 93 |
|
| 94 |
fig, axes = plt.subplots(1, 2, figsize=FIGURE_SIZE_WIDE)
|
| 95 |
|
|
|
|
| 120 |
ax2.axvline(200, color="red", linestyle="--", label="Chunking threshold (200)")
|
| 121 |
ax2.legend()
|
| 122 |
|
| 123 |
+
plt.savefig(FIGURES_DIR / "review_lengths.png")
|
|
|
|
|
|
|
| 124 |
|
| 125 |
needs_chunking = (df["estimated_tokens"] > 200).sum()
|
| 126 |
print("\nReview length stats:")
|
|
|
|
| 144 |
ax.set_title("Review Length by Rating")
|
| 145 |
ax.set_xticks([1, 2, 3, 4, 5])
|
| 146 |
|
| 147 |
+
plt.savefig(FIGURES_DIR / "length_by_rating.png")
|
|
|
|
|
|
|
| 148 |
|
| 149 |
print("\nMedian review length by rating:")
|
| 150 |
for rating, length in length_by_rating.items():
|
|
|
|
| 165 |
ax.set_title("Reviews Over Time")
|
| 166 |
plt.xticks(rotation=45)
|
| 167 |
|
| 168 |
+
plt.savefig(FIGURES_DIR / "reviews_over_time.png")
|
|
|
|
|
|
|
| 169 |
|
| 170 |
print("\nTemporal range:")
|
| 171 |
print(f" Earliest: {df['datetime'].min()}")
|
|
|
|
| 224 |
ax2.set_ylabel("Number of Items")
|
| 225 |
ax2.set_title("Item Popularity Distribution")
|
| 226 |
|
| 227 |
+
plt.savefig(FIGURES_DIR / "user_item_distribution.png")
|
|
|
|
|
|
|
| 228 |
|
| 229 |
print("\nUser activity:")
|
| 230 |
print(
|
|
|
|
| 313 |
)
|
| 314 |
print(f"Data quality issues: {empty_reviews + very_short + duplicate_texts}")
|
| 315 |
print(f"\nPlots saved to: {FIGURES_DIR}")
|
|
|
|
|
|
scripts/evaluation.py
CHANGED
|
@@ -19,6 +19,7 @@ Run from project root.
|
|
| 19 |
|
| 20 |
import argparse
|
| 21 |
import json
|
|
|
|
| 22 |
from datetime import datetime
|
| 23 |
from pathlib import Path
|
| 24 |
|
|
@@ -29,16 +30,13 @@ from sage.services.baselines import (
|
|
| 29 |
RandomBaseline,
|
| 30 |
load_product_embeddings_from_qdrant,
|
| 31 |
)
|
| 32 |
-
from sage.config import
|
| 33 |
from sage.data import load_eval_cases, load_splits
|
| 34 |
from sage.services.evaluation import compute_item_popularity, evaluate_recommendations
|
| 35 |
from sage.services.retrieval import recommend
|
| 36 |
|
| 37 |
logger = get_logger(__name__)
|
| 38 |
|
| 39 |
-
RESULTS_DIR = DATA_DIR / "eval_results"
|
| 40 |
-
RESULTS_DIR.mkdir(exist_ok=True)
|
| 41 |
-
|
| 42 |
|
| 43 |
def create_recommend_fn(
|
| 44 |
top_k: int = 10,
|
|
@@ -46,7 +44,7 @@ def create_recommend_fn(
|
|
| 46 |
min_rating: float | None = None,
|
| 47 |
similarity_weight: float = 1.0,
|
| 48 |
rating_weight: float = 0.0,
|
| 49 |
-
):
|
| 50 |
"""Create a recommend function for evaluation."""
|
| 51 |
|
| 52 |
def _recommend(query: str) -> list[str]:
|
|
@@ -76,14 +74,14 @@ def save_results(
|
|
| 76 |
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
| 77 |
filename = f"eval_results_{timestamp}.json"
|
| 78 |
filepath = RESULTS_DIR / filename
|
| 79 |
-
with open(filepath, "w") as f:
|
| 80 |
json.dump(results, f, indent=2)
|
| 81 |
|
| 82 |
# Write latest symlink for the summary script
|
| 83 |
if dataset:
|
| 84 |
stem = Path(dataset).stem # e.g. "eval_loo_history"
|
| 85 |
latest_path = RESULTS_DIR / f"{stem}_latest.json"
|
| 86 |
-
with open(latest_path, "w") as f:
|
| 87 |
json.dump(results, f, indent=2)
|
| 88 |
|
| 89 |
return filepath
|
|
|
|
| 19 |
|
| 20 |
import argparse
|
| 21 |
import json
|
| 22 |
+
from collections.abc import Callable
|
| 23 |
from datetime import datetime
|
| 24 |
from pathlib import Path
|
| 25 |
|
|
|
|
| 30 |
RandomBaseline,
|
| 31 |
load_product_embeddings_from_qdrant,
|
| 32 |
)
|
| 33 |
+
from sage.config import RESULTS_DIR, get_logger, log_banner, log_section, log_kv
|
| 34 |
from sage.data import load_eval_cases, load_splits
|
| 35 |
from sage.services.evaluation import compute_item_popularity, evaluate_recommendations
|
| 36 |
from sage.services.retrieval import recommend
|
| 37 |
|
| 38 |
logger = get_logger(__name__)
|
| 39 |
|
|
|
|
|
|
|
|
|
|
| 40 |
|
| 41 |
def create_recommend_fn(
|
| 42 |
top_k: int = 10,
|
|
|
|
| 44 |
min_rating: float | None = None,
|
| 45 |
similarity_weight: float = 1.0,
|
| 46 |
rating_weight: float = 0.0,
|
| 47 |
+
) -> Callable[[str], list[str]]:
|
| 48 |
"""Create a recommend function for evaluation."""
|
| 49 |
|
| 50 |
def _recommend(query: str) -> list[str]:
|
|
|
|
| 74 |
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
| 75 |
filename = f"eval_results_{timestamp}.json"
|
| 76 |
filepath = RESULTS_DIR / filename
|
| 77 |
+
with open(filepath, "w", encoding="utf-8") as f:
|
| 78 |
json.dump(results, f, indent=2)
|
| 79 |
|
| 80 |
# Write latest symlink for the summary script
|
| 81 |
if dataset:
|
| 82 |
stem = Path(dataset).stem # e.g. "eval_loo_history"
|
| 83 |
latest_path = RESULTS_DIR / f"{stem}_latest.json"
|
| 84 |
+
with open(latest_path, "w", encoding="utf-8") as f:
|
| 85 |
json.dump(results, f, indent=2)
|
| 86 |
|
| 87 |
return filepath
|
scripts/explanation.py
CHANGED
|
@@ -90,7 +90,7 @@ def run_basic_tests():
|
|
| 90 |
for expl in all_explanations
|
| 91 |
]
|
| 92 |
|
| 93 |
-
for expl, result in zip(all_explanations, hhem_results):
|
| 94 |
status = "GROUNDED" if not result.is_hallucinated else "HALLUCINATED"
|
| 95 |
logger.info("[%s] Score: %.3f - %s", status, result.score, expl.product_id)
|
| 96 |
|
|
|
|
| 90 |
for expl in all_explanations
|
| 91 |
]
|
| 92 |
|
| 93 |
+
for expl, result in zip(all_explanations, hhem_results, strict=True):
|
| 94 |
status = "GROUNDED" if not result.is_hallucinated else "HALLUCINATED"
|
| 95 |
logger.info("[%s] Score: %.3f - %s", status, result.score, expl.product_id)
|
| 96 |
|
scripts/faithfulness.py
CHANGED
|
@@ -25,6 +25,7 @@ import numpy as np
|
|
| 25 |
|
| 26 |
from sage.core import AggregationMethod
|
| 27 |
from sage.config import (
|
|
|
|
| 28 |
EVALUATION_QUERIES,
|
| 29 |
FAITHFULNESS_TARGET,
|
| 30 |
MAX_EVIDENCE,
|
|
@@ -100,7 +101,7 @@ def run_evaluation(n_samples: int, run_ragas: bool = False):
|
|
| 100 |
for expl in all_explanations
|
| 101 |
]
|
| 102 |
|
| 103 |
-
for expl, result in zip(all_explanations, hhem_results):
|
| 104 |
status = "GROUNDED" if not result.is_hallucinated else "HALLUCINATED"
|
| 105 |
logger.info(" [%s] %.3f - %s", status, result.score, expl.product_id)
|
| 106 |
|
|
@@ -200,23 +201,6 @@ def run_evaluation(n_samples: int, run_ragas: bool = False):
|
|
| 200 |
# SECTION: Failure Analysis
|
| 201 |
# ============================================================================
|
| 202 |
|
| 203 |
-
ANALYSIS_QUERIES = [
|
| 204 |
-
"wireless headphones with noise cancellation",
|
| 205 |
-
"laptop charger for MacBook",
|
| 206 |
-
"USB hub with multiple ports",
|
| 207 |
-
"portable battery pack for travel",
|
| 208 |
-
"bluetooth speaker with good bass",
|
| 209 |
-
"cheap but good quality earbuds",
|
| 210 |
-
"durable phone case that looks nice",
|
| 211 |
-
"fast charging cable that won't break",
|
| 212 |
-
"comfortable headphones for long sessions",
|
| 213 |
-
"quiet keyboard for office",
|
| 214 |
-
"headphones that don't hurt ears",
|
| 215 |
-
"charger that actually works",
|
| 216 |
-
"waterproof speaker for shower",
|
| 217 |
-
"gift for someone who likes music",
|
| 218 |
-
]
|
| 219 |
-
|
| 220 |
|
| 221 |
def run_failure_analysis():
|
| 222 |
"""Analyze failure cases to identify root causes."""
|
|
|
|
| 25 |
|
| 26 |
from sage.core import AggregationMethod
|
| 27 |
from sage.config import (
|
| 28 |
+
ANALYSIS_QUERIES,
|
| 29 |
EVALUATION_QUERIES,
|
| 30 |
FAITHFULNESS_TARGET,
|
| 31 |
MAX_EVIDENCE,
|
|
|
|
| 101 |
for expl in all_explanations
|
| 102 |
]
|
| 103 |
|
| 104 |
+
for expl, result in zip(all_explanations, hhem_results, strict=True):
|
| 105 |
status = "GROUNDED" if not result.is_hallucinated else "HALLUCINATED"
|
| 106 |
logger.info(" [%s] %.3f - %s", status, result.score, expl.product_id)
|
| 107 |
|
|
|
|
| 201 |
# SECTION: Failure Analysis
|
| 202 |
# ============================================================================
|
| 203 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 204 |
|
| 205 |
def run_failure_analysis():
|
| 206 |
"""Analyze failure cases to identify root causes."""
|
scripts/pipeline.py
CHANGED
|
@@ -23,6 +23,7 @@ import argparse
|
|
| 23 |
import numpy as np
|
| 24 |
|
| 25 |
from sage.config import (
|
|
|
|
| 26 |
DEV_SUBSET_SIZE,
|
| 27 |
DATA_DIR,
|
| 28 |
get_logger,
|
|
@@ -80,7 +81,7 @@ def run_tokenizer_validation():
|
|
| 80 |
|
| 81 |
log_section(logger, "Results")
|
| 82 |
log_kv(logger, "Mean chars/token", np.mean(ratios))
|
| 83 |
-
log_kv(logger, "Std", np.std(ratios))
|
| 84 |
log_kv(logger, "Current assumption", 4.0)
|
| 85 |
|
| 86 |
status = "VALID" if abs(np.mean(ratios) - 4.0) <= 0.5 else "UPDATE NEEDED"
|
|
@@ -115,8 +116,8 @@ def run_chunking_test():
|
|
| 115 |
sample = long_reviews.sample(min(50, len(long_reviews)), random_state=42)
|
| 116 |
results = []
|
| 117 |
|
| 118 |
-
for idx,
|
| 119 |
-
text, tokens, rating = row
|
| 120 |
chunks = chunk_text(text, embedder=embedder)
|
| 121 |
sentences = split_sentences(text)
|
| 122 |
|
|
@@ -185,7 +186,7 @@ def run_pipeline(subset_size: int, force: bool):
|
|
| 185 |
|
| 186 |
# Review length analysis
|
| 187 |
df["text_length"] = df["text"].str.len()
|
| 188 |
-
df["estimated_tokens"] = df["text_length"] //
|
| 189 |
|
| 190 |
needs_chunking = (df["estimated_tokens"] > 200).sum()
|
| 191 |
logger.info(
|
|
|
|
| 23 |
import numpy as np
|
| 24 |
|
| 25 |
from sage.config import (
|
| 26 |
+
CHARS_PER_TOKEN,
|
| 27 |
DEV_SUBSET_SIZE,
|
| 28 |
DATA_DIR,
|
| 29 |
get_logger,
|
|
|
|
| 81 |
|
| 82 |
log_section(logger, "Results")
|
| 83 |
log_kv(logger, "Mean chars/token", np.mean(ratios))
|
| 84 |
+
log_kv(logger, "Std", np.std(ratios, ddof=1))
|
| 85 |
log_kv(logger, "Current assumption", 4.0)
|
| 86 |
|
| 87 |
status = "VALID" if abs(np.mean(ratios) - 4.0) <= 0.5 else "UPDATE NEEDED"
|
|
|
|
| 116 |
sample = long_reviews.sample(min(50, len(long_reviews)), random_state=42)
|
| 117 |
results = []
|
| 118 |
|
| 119 |
+
for idx, row in enumerate(sample.itertuples(index=False)):
|
| 120 |
+
text, tokens, rating = row.text, row.tokens, int(row.rating)
|
| 121 |
chunks = chunk_text(text, embedder=embedder)
|
| 122 |
sentences = split_sentences(text)
|
| 123 |
|
|
|
|
| 186 |
|
| 187 |
# Review length analysis
|
| 188 |
df["text_length"] = df["text"].str.len()
|
| 189 |
+
df["estimated_tokens"] = df["text_length"] // CHARS_PER_TOKEN
|
| 190 |
|
| 191 |
needs_chunking = (df["estimated_tokens"] > 200).sum()
|
| 192 |
logger.info(
|
tests/test_chunking.py
CHANGED
|
@@ -85,12 +85,14 @@ class TestSlidingWindowChunk:
|
|
| 85 |
sentences = [f"Unique sentence {i} here." for i in range(20)]
|
| 86 |
text = " ".join(sentences)
|
| 87 |
chunks = sliding_window_chunk(text, chunk_size=30, overlap=10)
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
|
|
|
|
|
|
| 94 |
|
| 95 |
|
| 96 |
class TestFindSplitPoints:
|
|
|
|
| 85 |
sentences = [f"Unique sentence {i} here." for i in range(20)]
|
| 86 |
text = " ".join(sentences)
|
| 87 |
chunks = sliding_window_chunk(text, chunk_size=30, overlap=10)
|
| 88 |
+
assert len(chunks) >= 2, "Expected multiple chunks for overlap test"
|
| 89 |
+
# With overlap, adjacent chunks should share some words
|
| 90 |
+
words_0 = set(chunks[0].split())
|
| 91 |
+
words_1 = set(chunks[1].split())
|
| 92 |
+
shared_words = words_0 & words_1
|
| 93 |
+
assert len(shared_words) > 0, (
|
| 94 |
+
"Adjacent chunks should share words due to overlap"
|
| 95 |
+
)
|
| 96 |
|
| 97 |
|
| 98 |
class TestFindSplitPoints:
|