Spaces:
Running
Running
Commit ·
4b0e0a6
1
Parent(s): 2cfed75
added evaluation
Browse files
backend/app/evaluation/ablation.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Ablation study for AtlasRAG retrieval."""
|
| 2 |
+
|
| 3 |
+
from app.evaluation.metrics import coverage, diversity, recall_at_k
|
| 4 |
+
from app.evaluation.test_queries import TEST_QUERIES
|
| 5 |
+
from app.evaluation.utils import extract_pages
|
| 6 |
+
from app.retrieval.retrieve import hybrid_graph_search
|
| 7 |
+
from app.retrieval.vector_store import vector_search
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def _report(label: str, pages: list[int], expected: set[int]) -> None:
    """Print Recall@5 / Coverage / Diversity for one retriever's page list.

    Output format matches the previous inline print statements exactly,
    so the ablation report is unchanged.
    """
    print(label)
    print(f"Recall@5: {recall_at_k(pages, expected):.2f}")
    print(f"Coverage: {coverage(pages)}")
    print(f"Diversity: {diversity(pages):.2f}\n")


def run_ablation() -> None:
    """Run the ablation study: vector-only vs. vector+graph retrieval.

    For every query in ``TEST_QUERIES``, retrieves the top-5 chunks with
    each strategy and prints Recall@5, Coverage, and Diversity so the two
    configurations can be compared side by side.  Results go to stdout;
    nothing is returned.
    """
    print("\n=== AtlasRAG Ablation Study ===\n")

    for item in TEST_QUERIES:
        query = item["query"]
        expected = item["expected_pages"]

        print("-" * 70)
        print(f"Query: {query}\n")

        # Run both retrieval strategies on the same query/top_k so the
        # metrics differ only because of the graph expansion step.
        vector_pages = extract_pages(vector_search(query, top_k=5))
        hybrid_pages = extract_pages(hybrid_graph_search(query, top_k=5))

        # Previously these two report sections were duplicated inline;
        # the shared helper keeps them consistent.
        _report("VECTOR ONLY", vector_pages, expected)
        _report("VECTOR + GRAPH", hybrid_pages, expected)

    print("Ablation complete.\n")
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
# Allow running the study directly, e.g. ``python -m app.evaluation.ablation``.
if __name__ == "__main__":
    run_ablation()
|
backend/app/evaluation/compare_baseline.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Compare Vector Search vs Hybrid Graph-RAG."""
|
| 2 |
+
|
| 3 |
+
from app.evaluation.metrics import coverage, diversity, recall_at_k
|
| 4 |
+
from app.evaluation.test_queries import TEST_QUERIES
|
| 5 |
+
from app.evaluation.utils import extract_pages
|
| 6 |
+
from app.retrieval.retrieve import hybrid_graph_search
|
| 7 |
+
from app.retrieval.vector_store import vector_search
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def _print_block(
    *,
    name: str,
    pages: list[int],
    expected: set[int],
) -> None:
    """Print a labelled metrics summary for one retrieval strategy.

    Emits the strategy name, the retrieved page list, and the three
    evaluation metrics, followed by a blank separator line.
    """
    summary = [
        name,
        f"Pages: {pages}",
        f"Recall@5: {recall_at_k(pages, expected):.2f}",
        f"Coverage: {coverage(pages)}",
        f"Diversity: {diversity(pages):.2f}",
        "",  # trailing blank line between blocks
    ]
    print("\n".join(summary))
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def run_comparison() -> None:
    """Compare plain vector search against hybrid Graph-RAG retrieval.

    For each evaluation query, runs both retrievers with top_k=5 and
    prints a metrics block for each via ``_print_block``.  Output goes
    to stdout; nothing is returned.
    """
    print("\n=== AtlasRAG Retrieval Comparison ===\n")

    for item in TEST_QUERIES:
        query = item["query"]
        expected = item["expected_pages"]

        print("-" * 70)
        print(f"Query ({item['type']}): {query}")
        print(f"Expected pages: {sorted(expected)}\n")

        # Run both retrievers first, then report — keeps any output from
        # the retrievers themselves ahead of the metric blocks.
        vector_pages = extract_pages(vector_search(query, top_k=5))
        graph_pages = extract_pages(hybrid_graph_search(query, top_k=5))

        for label, pages in (
            ("VECTOR SEARCH", vector_pages),
            ("HYBRID GRAPH-RAG", graph_pages),
        ):
            _print_block(name=label, pages=pages, expected=expected)

    print("Comparison complete.\n")
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
# Allow running the comparison directly, e.g.
# ``python -m app.evaluation.compare_baseline``.
if __name__ == "__main__":
    run_comparison()
|
backend/app/evaluation/metrics.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Evaluation metrics for retrieval quality."""
|
| 2 |
+
|
| 3 |
+
from typing import Iterable, Set
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def recall_at_k(retrieved_pages: Iterable[int], expected_pages: Set[int]) -> float:
    """Compute Recall@K: the fraction of expected pages that were retrieved.

    Args:
        retrieved_pages: Pages of the top-K retrieved chunks (duplicates are
            collapsed before scoring).
        expected_pages: Ground-truth relevant pages for the query.

    Returns:
        ``|retrieved ∩ expected| / |expected|`` in [0.0, 1.0]; 0.0 when
        ``expected_pages`` is empty (recall is undefined with no relevant
        items, so score conservatively).

    Note: the previous implementation returned ``float(bool(...))`` — a
    binary "any hit" indicator, which is Hit@K rather than recall, and
    over-reported quality whenever more than one page was expected.
    """
    if not expected_pages:
        return 0.0
    hits = set(retrieved_pages) & set(expected_pages)
    return len(hits) / len(expected_pages)
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def coverage(retrieved_pages: Iterable[int]) -> int:
    """Count the distinct pages represented in the retrieved results."""
    unique_pages = set(retrieved_pages)
    return len(unique_pages)
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def diversity(retrieved_pages: Iterable[int]) -> float:
    """Ratio of unique pages to total retrieved pages.

    1.0 means every retrieved chunk came from a different page; values
    near 0 indicate heavy duplication.  An empty result scores 0.0.
    """
    pages = list(retrieved_pages)
    total = len(pages)
    return len(set(pages)) / total if total else 0.0
|
backend/app/evaluation/retrievers.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Evaluation-only retrievers."""
|
| 2 |
+
|
| 3 |
+
from typing import List
|
| 4 |
+
|
| 5 |
+
from app.models.retrieval import ScoredChunk
|
| 6 |
+
from app.retrieval.vector_store import vector_search
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def vector_only_search(query: str, top_k: int) -> List[ScoredChunk]:
    """Baseline retriever: plain vector similarity search, no graph step.

    Thin pass-through to ``vector_search`` so evaluation code can refer
    to the baseline by a distinct, self-describing name.
    """
    results = vector_search(query, top_k=top_k)
    return results
|
backend/app/evaluation/test_queries.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Evaluation queries for AtlasRAG."""
|
| 2 |
+
|
| 3 |
+
# Each entry pairs a natural-language query with:
#   - "expected_pages": the ground-truth set of source-document pages that
#     answer it (used by recall_at_k), and
#   - "type": a label for how the answer is spread across the document
#     ("localized", "distributed", or "comparative").
# NOTE(review): the page numbers appear to reference a specific indexed
# paper (queries are about Transformer attention) — confirm they match the
# corpus actually loaded into the vector store.
TEST_QUERIES = [
    {
        "query": "What is scaled dot-product attention?",
        "expected_pages": {3, 4},
        "type": "localized",
    },
    {
        "query": "How does self-attention replace recurrence and convolution?",
        "expected_pages": {1, 2, 5},
        "type": "distributed",
    },
    {
        "query": "Compare encoder, decoder, and encoder-decoder architectures",
        "expected_pages": {2, 3},
        "type": "comparative",
    },
    {
        "query": "What role does positional encoding play in the Transformer model?",
        "expected_pages": {2, 6},
        "type": "distributed",
    },
]
|
backend/app/evaluation/utils.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Utility helpers for evaluation."""
|
| 2 |
+
|
| 3 |
+
from typing import Iterable
|
| 4 |
+
|
| 5 |
+
from app.models.retrieval import ScoredChunk
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def extract_pages(results: Iterable[ScoredChunk]) -> list[int]:
    """Collect the starting page number of each retrieved chunk, in order.

    Duplicates are kept: downstream metrics (coverage/diversity) rely on
    seeing one entry per retrieved chunk.
    """
    pages: list[int] = []
    for scored in results:
        pages.append(scored.chunk.page_start)
    return pages
|
backend/app/utils/__init__.py
DELETED
|
File without changes
|