Rom89823974978 committed on
Commit
4dc151e
·
1 Parent(s): 79bdbbe

Resolved issues

Browse files
evaluation/metrics/composite.py CHANGED
@@ -9,14 +9,41 @@ def harmonic_mean(scores: Mapping[str, float]) -> float:
9
  """Compute the harmonic mean of positive scores."""
10
  if not scores:
11
  return 0.0
12
- inv_sum = sum(1.0 / (v) for v in scores.values() if v > 0)
13
- return len(scores) / inv_sum if inv_sum and inv_sum != 0 else 0.0
 
 
 
14
 
15
 
16
- def rag_score(scores: Mapping[str, float]) -> float:
 
 
 
 
17
  """
18
- Compute a composite RAG score as the harmonic mean of all sub‐scores.
19
-
20
- Tests expect to call rag_score(...) with a single mapping of sub‐scores.
21
  """
22
- return harmonic_mean(scores)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  """Compute the harmonic mean of positive scores."""
10
  if not scores:
11
  return 0.0
12
+ if any(v <= 0 for v in scores.values()):
13
+ return 0.0
14
+ else:
15
+ inv_sum = sum(1.0 / (v) for v in scores.values())
16
+ return len(scores) / inv_sum if inv_sum and inv_sum != 0 else 0.0
17
 
18
 
19
+ def rag_score(
20
+ scores: Mapping[str, float],
21
+ *,
22
+ alpha: float = 0.5,
23
+ ) -> float:
24
  """
25
+ Combine retrieval & generation sub-scores (0-1) via weighted HM.
 
 
26
  """
27
+ # Split the incoming flat mapping into two maps: retrieval vs generation
28
+ retr_map: dict[str, float] = {}
29
+ gen_map: dict[str, float] = {}
30
+ for k, v in scores.items():
31
+ if k.startswith("retrieval_"):
32
+ retr_map[k[len("retrieval_"):]] = v
33
+ elif k.startswith("generation_"):
34
+ gen_map[k[len("generation_"):]] = v
35
+ else:
36
+ # ignore any key that doesn't start with 'retrieval_' or 'generation_'
37
+ pass
38
+
39
+ # If either side is empty, we cannot score
40
+ if not retr_map or not gen_map:
41
+ return 0.0
42
+
43
+ retr_score = harmonic_mean(retr_map)
44
+ gen_score = harmonic_mean(gen_map)
45
+
46
+ if retr_score == 0 or gen_score == 0:
47
+ return 0.0
48
+
49
+ return 1.0 / (alpha / retr_score + (1 - alpha) / gen_score)
evaluation/pipeline.py CHANGED
@@ -63,7 +63,7 @@ class RAGPipeline:
63
  r=cfg.retriever
64
  name = r.name
65
  if name == "bm25":
66
- return bm25.BM25Retriever(bm25_idx=str(r.bm25_index), doc_store_path=str(r.doc_store))
67
  if name == "dense":
68
  return dense.DenseRetriever(
69
  faiss_index=str(r.faiss_index),
 
63
  r=cfg.retriever
64
  name = r.name
65
  if name == "bm25":
66
+ return bm25.BM25Retriever(bm25_idx=str(r.bm25_idx), doc_store=str(r.doc_store))
67
  if name == "dense":
68
  return dense.DenseRetriever(
69
  faiss_index=str(r.faiss_index),
evaluation/retrievers/bm25.py CHANGED
@@ -19,7 +19,7 @@ class BM25Retriever(Retriever):
19
  def __init__(
20
  self,
21
  bm25_idx: str | None,
22
- doc_store_path: str | None = None,
23
  threads: int = 1,
24
  ):
25
  if bm25_idx is None:
 
19
  def __init__(
20
  self,
21
  bm25_idx: str | None,
22
+ doc_store: str | None = None,
23
  threads: int = 1,
24
  ):
25
  if bm25_idx is None:
evaluation/retrievers/hybrid.py CHANGED
@@ -24,7 +24,7 @@ class HybridRetriever(Retriever):
24
  device: str = "cpu",
25
  ):
26
  # 1) BM25 retriever
27
- self.bm25 = BM25Retriever(bm25_idx, doc_store_path=doc_store)
28
 
29
  # 2) Dense retriever
30
  self.dense = DenseRetriever(
 
24
  device: str = "cpu",
25
  ):
26
  # 1) BM25 retriever
27
+ self.bm25 = BM25Retriever(bm25_idx, doc_store=doc_store)
28
 
29
  # 2) Dense retriever
30
  self.dense = DenseRetriever(
evaluation/stats/robustness.py CHANGED
@@ -77,7 +77,7 @@ def chi2_error_propagation(
77
  ]
78
  try:
79
  chi2, p, dof, expected = chi2_contingency(table)
80
- return dict(chi2=chi2, p=p, dof=dof, expected=expected, table=table)
81
  except ValueError:
82
  default_expected = [[0, 0], [0, 0]]
83
  return dict(chi2=0.0, p=1.0, dof=0, expected=default_expected, table=table)
 
77
  ]
78
  try:
79
  chi2, p, dof, expected = chi2_contingency(table)
80
+ return dict(chi2=float(chi2), p=float(p), dof=int(dof), expected= expected.tolist(), table=table)
81
  except ValueError:
82
  default_expected = [[0, 0], [0, 0]]
83
  return dict(chi2=0.0, p=1.0, dof=0, expected=default_expected, table=table)
evaluation/stats/significance.py CHANGED
@@ -36,12 +36,4 @@ def holm_bonferroni(pvalues: Mapping[str, float]) -> Mapping[str, float]:
36
  adjusted: dict[str, float] = {}
37
  for i, (name, p) in enumerate(sorted_items, start=1):
38
  adjusted[name] = min((m - i + 1) * p, 1.0)
39
- return adjusted
40
-
41
-
42
- def delta_metric(base: Sequence[float], new: Sequence[float]) -> list[float]:
43
- """Compute per‐element differences `new[i] - base[i]` as a list of floats."""
44
- diffs: list[float] = []
45
- for b, n in zip(base, new):
46
- diffs.append(float(n - b))
47
- return diffs
 
36
  adjusted: dict[str, float] = {}
37
  for i, (name, p) in enumerate(sorted_items, start=1):
38
  adjusted[name] = min((m - i + 1) * p, 1.0)
39
+ return adjusted
 
 
 
 
 
 
 
 
tests/test_pipeline.py CHANGED
@@ -7,7 +7,7 @@ from evaluation.pipeline import RAGPipeline
7
  def test_pipeline_init():
8
  # Using bm25 + dummy index path
9
  cfg = PipelineConfig(
10
- retriever=RetrieverConfig(name="bm25", index_path="dummy"),
11
  generator=GeneratorConfig(model_name="google/flan-t5-base"),
12
  )
13
  pipeline = RAGPipeline(cfg)
 
7
  def test_pipeline_init():
8
  # Using bm25 + dummy index path
9
  cfg = PipelineConfig(
10
+ retriever=RetrieverConfig(name="bm25", bm25_idx="dummy"),
11
  generator=GeneratorConfig(model_name="google/flan-t5-base"),
12
  )
13
  pipeline = RAGPipeline(cfg)
tests/test_sparse_retriever.py CHANGED
@@ -59,7 +59,7 @@ def test_bm25_index_build_and_query(tmp_path):
59
  assert not index_dir.exists()
60
 
61
  # Instantiate BM25Retriever; __init__ should “build” the index (subprocess.run no‐ops)
62
- retriever = BM25Retriever(index_path=str(index_dir), doc_store_path=str(doc_store_path))
63
 
64
  # After init, index_dir “exists” (because build_index created it)
65
  assert index_dir.exists()
@@ -89,6 +89,6 @@ def test_bm25_retrieve_when_pyserini_missing(monkeypatch, tmp_path):
89
  doc_store_path.write_text('{"id":0,"text":"hello"}\n')
90
 
91
  index_dir = tmp_path / "bm25_index2"
92
- retriever = BM25Retriever(index_path=str(index_dir), doc_store_path=str(doc_store_path))
93
  # If SimpleSearcher failed to import, retrieve() returns []
94
  assert retriever.retrieve("whatever", top_k=5) == []
 
59
  assert not index_dir.exists()
60
 
61
  # Instantiate BM25Retriever; __init__ should “build” the index (subprocess.run no‐ops)
62
+ retriever = BM25Retriever(bm25_idx=str(index_dir), doc_store=str(doc_store_path))
63
 
64
  # After init, index_dir “exists” (because build_index created it)
65
  assert index_dir.exists()
 
89
  doc_store_path.write_text('{"id":0,"text":"hello"}\n')
90
 
91
  index_dir = tmp_path / "bm25_index2"
92
+ retriever = BM25Retriever(bm25_idx=str(index_dir), doc_store_path=str(doc_store_path))
93
  # If SimpleSearcher failed to import, retrieve() returns []
94
  assert retriever.retrieve("whatever", top_k=5) == []
tests/test_stats.py CHANGED
@@ -41,7 +41,7 @@ def test_delta_and_failure_rate():
41
  base = [0.9, 0.8, 0.7]
42
  new = [0.85, 0.75, 0.65]
43
  deltas = delta_metric(base, new)
44
- assert isinstance(deltas, list) and len(deltas) == 3
45
  rate = conditional_failure_rate([0, 1, 0, 1], threshold=0.5)
46
  assert 0 <= rate <= 1
47
 
@@ -50,5 +50,6 @@ def test_chi2_error_propagation():
50
  arr1 = [10, 20, 30]
51
  arr2 = [15, 25, 35]
52
  err = chi2_error_propagation(arr1, arr2)
53
- assert isinstance(err, float)
54
- assert err >= 0
 
 
41
  base = [0.9, 0.8, 0.7]
42
  new = [0.85, 0.75, 0.65]
43
  deltas = delta_metric(base, new)
44
+ assert isinstance(deltas, tuple) and len(deltas) == 2
45
  rate = conditional_failure_rate([0, 1, 0, 1], threshold=0.5)
46
  assert 0 <= rate <= 1
47
 
 
50
  arr1 = [10, 20, 30]
51
  arr2 = [15, 25, 35]
52
  err = chi2_error_propagation(arr1, arr2)
53
+ assert isinstance(err, dict)
54
+ assert isinstance(err.get("chi2"), float)
55
+ assert isinstance(err.get("p"), float)