Spaces:
No application file
No application file
Update metrics_v2.py
#1
by
rodrigomasini
- opened
- metrics_v2.py +63 -1
metrics_v2.py
CHANGED
|
@@ -239,4 +239,66 @@ if __name__ == "__main__":
|
|
| 239 |
|
| 240 |
print("\n----- Evaluation Result -----")
|
| 241 |
print(f"Review Flag: {evaluation_result['review_flag']}")
|
| 242 |
-
print(f"Explanation: {evaluation_result['explanation']}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 239 |
|
| 240 |
print("\n----- Evaluation Result -----")
|
| 241 |
print(f"Review Flag: {evaluation_result['review_flag']}")
|
| 242 |
+
print(f"Explanation: {evaluation_result['explanation']}")
|
| 243 |
+
|
| 244 |
+
|
| 245 |
+
#######
|
| 246 |
+
from typing import List, Tuple, Callable
|
| 247 |
+
|
| 248 |
+
def evaluate_retrieval_precision(
    questions: List[str],
    system: Callable[[str], List[str]],
    evaluator: Callable[[str, str], int],
    num_chunks_expected: int = 3,
    verbose: bool = True
) -> dict:
    """
    Evaluate the retrieval precision of a system using a relevance evaluator.

    For every question the system is asked for chunks, each chunk is scored
    by the evaluator, and the per-question precision is the number of
    relevant chunks divided by the number of chunks actually returned
    (not by ``num_chunks_expected``).

    Args:
        questions: The evaluation questions. Any iterable of strings works.
        system: A function that takes a question and returns the retrieved
            chunks. The result may be any iterable (including a generator);
            ``None`` is treated as "no chunks returned".
        evaluator: A function that takes a question and a chunk and returns
            a relevance score (0 or 1).
        num_chunks_expected: The number of chunks the system is expected to
            return. Only used to decide whether to print a warning.
            Defaults to 3.
        verbose: Whether to print a warning for questions with fewer
            returned chunks than expected.

    Returns:
        A dictionary containing:
            - 'mean_precision': The mean precision across all questions
              (0.0 when there are no questions).
            - 'precision_scores': The precision for each question, in input
              order (0.0 for a question with no returned chunks).
            - 'question_relevance': A list of ``(question, relevant_count)``
              tuples, in input order.
    """
    precision_scores = []
    question_relevance = []

    for position, question in enumerate(questions, start=1):
        # Materialize the result so len() works even when the system yields
        # chunks lazily; a None result is treated as an empty retrieval.
        retrieved_chunks = list(system(question) or [])

        # Warn if fewer chunks are returned than expected.
        if verbose and len(retrieved_chunks) < num_chunks_expected:
            print(f"Warning: System returned {len(retrieved_chunks)} chunks (expected {num_chunks_expected}) for question {position}: {question}")

        relevant_chunks = sum(evaluator(question, chunk) for chunk in retrieved_chunks)

        # Guard against division by zero and keep the score type a float.
        if retrieved_chunks:
            precision = relevant_chunks / len(retrieved_chunks)
        else:
            precision = 0.0

        precision_scores.append(precision)
        question_relevance.append((question, relevant_chunks))

    # Average over the questions actually processed, so a non-sized iterable
    # of questions is handled as well.
    if precision_scores:
        mean_precision = sum(precision_scores) / len(precision_scores)
    else:
        mean_precision = 0.0

    return {
        'mean_precision': mean_precision,
        'precision_scores': precision_scores,
        'question_relevance': question_relevance,
    }
|
| 297 |
+
|
| 298 |
+
# Example usage (assuming you've defined 'questions', 'system', and 'evaluator'):
# Runs the precision evaluation once and prints the mean precision (two
# decimals), the per-question scores, and the (question, relevant_count) pairs.
# NOTE(review): 'questions', 'system', and 'evaluator' are not defined anywhere
# in the visible source; if these statements sit at module level rather than
# under the `if __name__ == "__main__":` guard they will raise NameError on
# import -- confirm placement and define the inputs first.
|
| 299 |
+
evaluation_results = evaluate_retrieval_precision(
|
| 300 |
+
questions, system, evaluator, num_chunks_expected=3, verbose=True
|
| 301 |
+
)
|
| 302 |
+
print(f"Mean Retrieval Precision: {evaluation_results['mean_precision']:.2f}")
|
| 303 |
+
print(f"Precision Scores for Each Question: {evaluation_results['precision_scores']}")
|
| 304 |
+
print(f"Question Relevance: {evaluation_results['question_relevance']}")
|