Update app.py
app.py
CHANGED
@@ -3,10 +3,12 @@ import os
 from typing import List, Dict
 from ragas import evaluate
 from ragas.metrics import (
-
-
-
-
+    answer_relevancy,
+    faithfulness,
+    context_recall,
+    context_precision,
+    answer_correctness,
+    answer_similarity
 )
 from datasets import load_dataset
 from langchain.text_splitter import (
@@ -87,12 +89,11 @@ def load_evaluation_dataset():
     dataset = load_dataset("explodinggradients/fiqa", split="test")
     return dataset
 
-def evaluate_rag_pipeline(qa_chain, dataset):
+def prepare_ragas_dataset(qa_chain, dataset):
     # Sample a few examples for evaluation
     eval_samples = dataset.select(range(5))
 
-
-    eval_data = []
+    ragas_dataset = []
     for sample in eval_samples:
         question = sample["question"]
 
@@ -102,33 +103,39 @@ def evaluate_rag_pipeline(qa_chain, dataset):
             "chat_history": []
         })
 
-        eval_data.append({
+        ragas_dataset.append({
             "question": question,
             "answer": response["answer"],
-            "
-            "
+            "contexts": [doc.page_content for doc in response["source_documents"]],
+            "ground_truth": sample["answer"]
         })
 
-
-
-
-
-        Faithfulness(),
-        AnswerRelevancy()
-    ]
+    return ragas_dataset
+
+def evaluate_rag_pipeline(qa_chain, dataset):
+    ragas_dataset = prepare_ragas_dataset(qa_chain, dataset)
 
-    # Run evaluation
+    # Run RAGAS evaluation
     results = evaluate(
-
-        metrics=
+        ragas_dataset,
+        metrics=[
+            context_precision,
+            faithfulness,
+            answer_relevancy,
+            context_recall,
+            answer_correctness,
+            answer_similarity
+        ]
     )
 
-    # Convert results to dictionary
+    # Convert results to a dictionary
     return {
-        "
-        "context_relevancy": float(results["context_relevancy"]),
+        "context_precision": float(results["context_precision"]),
         "faithfulness": float(results["faithfulness"]),
-        "answer_relevancy": float(results["answer_relevancy"])
+        "answer_relevancy": float(results["answer_relevancy"]),
+        "context_recall": float(results["context_recall"]),
+        "answer_correctness": float(results["answer_correctness"]),
+        "answer_similarity": float(results["answer_similarity"])
     }
 
 # Initialize langchain LLM chain
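For reference, here is a minimal sketch of the updated evaluation path end to end. It assumes the prepare_ragas_dataset helper from this commit, and a ragas version whose evaluate function takes a Hugging Face datasets.Dataset with question, answer, contexts, and ground_truth columns; if evaluate is handed the raw Python list built above, recent ragas versions reject it, which is one plausible source of a runtime error. Dataset.from_list is the standard datasets constructor for that conversion.

from datasets import Dataset
from ragas import evaluate
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
    answer_correctness,
    answer_similarity,
)

def evaluate_rag_pipeline(qa_chain, dataset):
    # Build the list of {"question", "answer", "contexts", "ground_truth"}
    # records using the helper introduced in this commit.
    ragas_dataset = prepare_ragas_dataset(qa_chain, dataset)

    # ragas.evaluate expects a datasets.Dataset, not a plain list of dicts,
    # so convert before evaluating.
    eval_dataset = Dataset.from_list(ragas_dataset)

    results = evaluate(
        eval_dataset,
        metrics=[
            context_precision,
            faithfulness,
            answer_relevancy,
            context_recall,
            answer_correctness,
            answer_similarity,
        ],
    )

    # Index the result object by metric name, as the commit does.
    metric_names = [
        "context_precision", "faithfulness", "answer_relevancy",
        "context_recall", "answer_correctness", "answer_similarity",
    ]
    return {name: float(results[name]) for name in metric_names}

Note also that the LLM-graded metrics typically call OpenAI models by default, so an OPENAI_API_KEY would need to be set in the Space's environment, and answer_correctness and answer_similarity only work when the ground_truth column populated above is present.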