Spaces:

Rom89823974978
/

RAG_Eval

Build error

App Files Community

Rom89823974978 committed on Jun 6, 2025

Commit

4dc151e

1 Parent(s): 79bdbbe

Resolved issues

Browse files

Files changed (9) hide show

evaluation/metrics/composite.py +34 -7
evaluation/pipeline.py +1 -1
evaluation/retrievers/bm25.py +1 -1
evaluation/retrievers/hybrid.py +1 -1
evaluation/stats/robustness.py +1 -1
evaluation/stats/significance.py +1 -9
tests/test_pipeline.py +1 -1
tests/test_sparse_retriever.py +2 -2
tests/test_stats.py +4 -3

evaluation/metrics/composite.py CHANGED Viewed

@@ -9,14 +9,41 @@ def harmonic_mean(scores: Mapping[str, float]) -> float:
     """Compute the harmonic mean of positive scores."""
     if not scores:
         return 0.0
-    inv_sum = sum(1.0 / (v) for v in scores.values() if v > 0)
-    return len(scores) / inv_sum if inv_sum and inv_sum != 0 else 0.0
-def rag_score(scores: Mapping[str, float]) -> float:
     """
-    Compute a composite RAG score as the harmonic mean of all sub‐scores.
-    Tests expect to call rag_score(...) with a single mapping of sub‐scores.
     """
-    return harmonic_mean(scores)

     """Compute the harmonic mean of positive scores."""
     if not scores:
         return 0.0
+    if any(v <= 0 for v in scores.values()):
+        return 0.0
+    else:
+        inv_sum = sum(1.0 / (v) for v in scores.values())
+        return len(scores) / inv_sum if inv_sum and inv_sum != 0 else 0.0
+def rag_score(
+    scores: Mapping[str, float],
+    *,
+    alpha: float = 0.5,
+) -> float:
     """
+    Combine retrieval & generation sub-scores (0-1) via weighted HM.
     """
+    # Split the incoming flat mapping into two maps: retrieval vs generation
+    retr_map: dict[str, float] = {}
+    gen_map: dict[str, float] = {}
+    for k, v in scores.items():
+        if k.startswith("retrieval_"):
+            retr_map[k[len("retrieval_"):]] = v
+        elif k.startswith("generation_"):
+            gen_map[k[len("generation_"):]] = v
+        else:
+            # ignore any key that doesn't start with 'retrieval_' or 'generation_'
+            pass
+    # If either side is empty, we cannot score
+    if not retr_map or not gen_map:
+        return 0.0
+    retr_score = harmonic_mean(retr_map)
+    gen_score = harmonic_mean(gen_map)
+    if retr_score == 0 or gen_score == 0:
+        return 0.0
+    return 1.0 / (alpha / retr_score + (1 - alpha) / gen_score)

evaluation/pipeline.py CHANGED Viewed

@@ -63,7 +63,7 @@ class RAGPipeline:
         r=cfg.retriever
         name = r.name
         if name == "bm25":
-            return bm25.BM25Retriever(bm25_idx=str(r.bm25_index), doc_store_path=str(r.doc_store))
         if name == "dense":
             return dense.DenseRetriever(
                 faiss_index=str(r.faiss_index),

         r=cfg.retriever
         name = r.name
         if name == "bm25":
+            return bm25.BM25Retriever(bm25_idx=str(r.bm25_idx), doc_store=str(r.doc_store))
         if name == "dense":
             return dense.DenseRetriever(
                 faiss_index=str(r.faiss_index),

evaluation/retrievers/bm25.py CHANGED Viewed

@@ -19,7 +19,7 @@ class BM25Retriever(Retriever):
     def __init__(
         self,
         bm25_idx: str | None,
-        doc_store_path: str | None = None,
         threads: int = 1,
     ):
         if bm25_idx is None:

     def __init__(
         self,
         bm25_idx: str | None,
+        doc_store: str | None = None,
         threads: int = 1,
     ):
         if bm25_idx is None:

evaluation/retrievers/hybrid.py CHANGED Viewed

@@ -24,7 +24,7 @@ class HybridRetriever(Retriever):
         device: str = "cpu",
     ):
         # 1) BM25 retriever
-        self.bm25 = BM25Retriever(bm25_idx, doc_store_path=doc_store)
         # 2) Dense retriever
         self.dense = DenseRetriever(

         device: str = "cpu",
     ):
         # 1) BM25 retriever
+        self.bm25 = BM25Retriever(bm25_idx, doc_store=doc_store)
         # 2) Dense retriever
         self.dense = DenseRetriever(

evaluation/stats/robustness.py CHANGED Viewed

@@ -77,7 +77,7 @@ def chi2_error_propagation(
     ]
     try:
         chi2, p, dof, expected = chi2_contingency(table)
-        return dict(chi2=chi2, p=p, dof=dof, expected=expected, table=table)
     except ValueError:
         default_expected = [[0, 0], [0, 0]]
         return dict(chi2=0.0, p=1.0, dof=0, expected=default_expected, table=table)

     ]
     try:
         chi2, p, dof, expected = chi2_contingency(table)
+        return dict(chi2=float(chi2), p=float(p), dof=int(dof), expected= expected.tolist(), table=table)
     except ValueError:
         default_expected = [[0, 0], [0, 0]]
         return dict(chi2=0.0, p=1.0, dof=0, expected=default_expected, table=table)

evaluation/stats/significance.py CHANGED Viewed

@@ -36,12 +36,4 @@ def holm_bonferroni(pvalues: Mapping[str, float]) -> Mapping[str, float]:
     adjusted: dict[str, float] = {}
     for i, (name, p) in enumerate(sorted_items, start=1):
         adjusted[name] = min((m - i + 1) * p, 1.0)
-    return adjusted
-def delta_metric(base: Sequence[float], new: Sequence[float]) -> list[float]:
-    """Compute per‐element differences `new[i] - base[i]` as a list of floats."""
-    diffs: list[float] = []
-    for b, n in zip(base, new):
-        diffs.append(float(n - b))
-    return diffs

     adjusted: dict[str, float] = {}
     for i, (name, p) in enumerate(sorted_items, start=1):
         adjusted[name] = min((m - i + 1) * p, 1.0)
+    return adjusted

tests/test_pipeline.py CHANGED Viewed

@@ -7,7 +7,7 @@ from evaluation.pipeline import RAGPipeline
 def test_pipeline_init():
     # Using bm25 + dummy index path
     cfg = PipelineConfig(
-        retriever=RetrieverConfig(name="bm25", index_path="dummy"),
         generator=GeneratorConfig(model_name="google/flan-t5-base"),
     )
     pipeline = RAGPipeline(cfg)

 def test_pipeline_init():
     # Using bm25 + dummy index path
     cfg = PipelineConfig(
+        retriever=RetrieverConfig(name="bm25", bm25_idx="dummy"),
         generator=GeneratorConfig(model_name="google/flan-t5-base"),
     )
     pipeline = RAGPipeline(cfg)

tests/test_sparse_retriever.py CHANGED Viewed

@@ -59,7 +59,7 @@ def test_bm25_index_build_and_query(tmp_path):
     assert not index_dir.exists()
     # Instantiate BM25Retriever; __init__ should “build” the index (subprocess.run no‐ops)
-    retriever = BM25Retriever(index_path=str(index_dir), doc_store_path=str(doc_store_path))
     # After init, index_dir “exists” (because build_index created it)
     assert index_dir.exists()
@@ -89,6 +89,6 @@ def test_bm25_retrieve_when_pyserini_missing(monkeypatch, tmp_path):
     doc_store_path.write_text('{"id":0,"text":"hello"}\n')
     index_dir = tmp_path / "bm25_index2"
-    retriever = BM25Retriever(index_path=str(index_dir), doc_store_path=str(doc_store_path))
     # If SimpleSearcher failed to import, retrieve() returns []
     assert retriever.retrieve("whatever", top_k=5) == []

     assert not index_dir.exists()
     # Instantiate BM25Retriever; __init__ should “build” the index (subprocess.run no‐ops)
+    retriever = BM25Retriever(bm25_idx=str(index_dir), doc_store=str(doc_store_path))
     # After init, index_dir “exists” (because build_index created it)
     assert index_dir.exists()
     doc_store_path.write_text('{"id":0,"text":"hello"}\n')
     index_dir = tmp_path / "bm25_index2"
+    retriever = BM25Retriever(bm25_idx=str(index_dir), doc_store_path=str(doc_store_path))
     # If SimpleSearcher failed to import, retrieve() returns []
     assert retriever.retrieve("whatever", top_k=5) == []

tests/test_stats.py CHANGED Viewed

@@ -41,7 +41,7 @@ def test_delta_and_failure_rate():
     base = [0.9, 0.8, 0.7]
     new = [0.85, 0.75, 0.65]
     deltas = delta_metric(base, new)
-    assert isinstance(deltas, list) and len(deltas) == 3
     rate = conditional_failure_rate([0, 1, 0, 1], threshold=0.5)
     assert 0 <= rate <= 1
@@ -50,5 +50,6 @@ def test_chi2_error_propagation():
     arr1 = [10, 20, 30]
     arr2 = [15, 25, 35]
     err = chi2_error_propagation(arr1, arr2)
-    assert isinstance(err, float)
-    assert err >= 0

     base = [0.9, 0.8, 0.7]
     new = [0.85, 0.75, 0.65]
     deltas = delta_metric(base, new)
+    assert isinstance(deltas, tuple) and len(deltas) == 2
     rate = conditional_failure_rate([0, 1, 0, 1], threshold=0.5)
     assert 0 <= rate <= 1
     arr1 = [10, 20, 30]
     arr2 = [15, 25, 35]
     err = chi2_error_propagation(arr1, arr2)
+    assert isinstance(err, dict)
+    assert isinstance(err.get("chi2"), float)
+    assert isinstance(err.get("p"), float)