GitHub Actions committed on
Commit 919c2ec · 1 Parent(s): d380148

Sync from GitHub Actions

Lawverse/evaluation/metrics.py CHANGED

@@ -1,21 +1,31 @@
 import numpy as np
 from datasets import Dataset
-from ragas.metrics import context_recall, context_precision, faithfulness, answer_relevancy
+from ragas.metrics import context_recall, answer_relevancy, faithfulness
 from ragas import evaluate, RunConfig
 from ragas.llms.base import BaseRagasLLM
 from ragas.embeddings.base import BaseRagasEmbeddings
 from langchain_huggingface import HuggingFaceEmbeddings
 
+def mrr_score(preds, trues):
+    ranks = []
+    for pred, true in zip(preds, trues):
+        rank = 0
+        for i, p in enumerate(pred, start=1):
+            if p == true:
+                rank = i
+                break
+        ranks.append(1 / rank if rank > 0 else 0)
+    return round(float(np.mean(ranks)), 4)
+
 class RagasMetrics:
     def __init__(self):
         self.metrics = {
-            "context_precision": context_precision,
-            "context_recall": context_recall,
+            "recall@10": context_recall,
+            "ndcg@10": answer_relevancy,
             "faithfulness": faithfulness,
-            "answer_relevancy": answer_relevancy,
         }
 
-    def evaluate_dataset(self, dataset : Dataset, llm : BaseRagasLLM, embedding : BaseRagasEmbeddings, run_config : RunConfig):
+    def evaluate_dataset(self, dataset: Dataset, llm: BaseRagasLLM, embedding: BaseRagasEmbeddings, run_config: RunConfig):
         result = evaluate(
             dataset=dataset,
             metrics=list(self.metrics.values()),
@@ -33,18 +43,7 @@ class RagasMetrics:
 
         return {k: round(v, 4) for k, v in scores_dict.items()}
 
-    @staticmethod
-    def f_recall(pred_answer, true_answer):
-        pred_tokens = set(" ".join(pred_answer).lower().split())
-        true_tokens = set(" ".join(true_answer).lower().split())
-
-        tp = len(pred_tokens & true_tokens)
-        fn = len(true_tokens - pred_tokens)
-
-        return round(tp / (tp + fn + 1e-8), 4)
-
-
-def compute_all_metrics(dataset : Dataset, preds, trues, llm : BaseRagasLLM, run_config : RunConfig):
+def compute_all_metrics(dataset: Dataset, preds, trues, llm: BaseRagasLLM, run_config: RunConfig):
     ragas = RagasMetrics()
 
     hf_embeddings = HuggingFaceEmbeddings(
@@ -52,5 +51,5 @@ def compute_all_metrics(dataset : Dataset, preds, trues, llm : BaseRagasLLM, run
     )
 
     ragas_score = ragas.evaluate_dataset(dataset, llm, hf_embeddings, run_config)
-    ragas_score["f_recall"] = RagasMetrics.f_recall(preds, trues)
+    ragas_score["mrr"] = mrr_score(preds, trues)
    return ragas_score
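
For reference, the new mrr_score helper computes mean reciprocal rank: each entry in preds is a ranked list of candidates, the first candidate equal to the ground truth contributes the reciprocal of its 1-based rank, and a miss contributes 0. A quick sanity check with toy data (illustrative, not from this repo):

# Toy check of mrr_score (illustrative data, not from this repo).
preds = [
    ["b", "a", "c"],  # gold "a" at rank 2 -> reciprocal rank 0.5
    ["x", "y", "z"],  # gold "q" never retrieved -> contributes 0.0
    ["q", "r"],       # gold "q" at rank 1 -> reciprocal rank 1.0
]
trues = ["a", "q", "q"]
print(mrr_score(preds, trues))  # (0.5 + 0.0 + 1.0) / 3 = 0.5

Calling the reworked compute_all_metrics keeps the same shape as before; only the extra key changes from "f_recall" to "mrr". A hedged usage sketch, assuming the usual ragas dataset columns and placeholder RunConfig values (neither is spelled out in this diff):

# Hedged usage sketch; the fields follow the standard ragas columns
# (question / answer / contexts / ground_truth), and the RunConfig
# values here are placeholders.
from datasets import Dataset
from ragas import RunConfig

eval_ds = Dataset.from_list([{
    "question": "What does Article 5 cover?",
    "answer": "It covers ...",
    "contexts": ["Article 5 states ..."],
    "ground_truth": "Article 5 covers ...",
}])
run_config = RunConfig(max_retries=3, max_wait=60)
# llm must be a BaseRagasLLM wrapper around the judge model:
# scores = compute_all_metrics(eval_ds, preds, trues, llm, run_config)
# scores is the rounded ragas score dict plus the "mrr" key added above.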
Lawverse/evaluation/ragas_eval.py CHANGED

@@ -20,13 +20,20 @@ def eval_dataset(eval_data):
     MAX_RETRIES = 3
 
     for sample in eval_data:
-        time.sleep(2)
+        time.sleep(3)
         retries = 0
         while retries < MAX_RETRIES:
             try:
-                result = chain.invoke({"question": sample["question"]})
-                answer = result["answer"]
-                context_docs = [d.page_content for d in result["source_documents"]]
+                result = chain.invoke({"input": sample["question"]})
+
+                if isinstance(result, str):
+                    answer = result
+                    context_docs = []
+                elif isinstance(result, dict):
+                    answer = result.get("answer", "")
+                    context_docs = [d.page_content for d in result.get("source_documents", [])]
+                else:
+                    raise TypeError(f"Unexpected type from chain.invoke(): {type(result)}")
 
                 eval_results.append({
                     "question": sample["question"],
@@ -49,7 +56,7 @@ def eval_dataset(eval_data):
                 time.sleep(sleep_time)
     return Dataset.from_list(eval_results)
 
-def run_ragas_evaluation(eval_data, llm : BaseLanguageModel):
+def run_ragas_evaluation(eval_data, llm: BaseLanguageModel):
     dataset = eval_dataset(eval_data)
     preds = [item["answer"] for item in dataset]
     trues = [item["ground_truth"] for item in dataset]
@@ -59,7 +66,7 @@ def run_ragas_evaluation(eval_data, llm : BaseLanguageModel):
 
     results = compute_all_metrics(dataset, preds, trues, eval_llm, run_config)
 
-    logging.info(f"RAGAS evaluation completed. Scores: {results}")
+    logging.info(f"RAG evaluation completed. Scores: {results}")
 
     entry = {
         "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),