Ilia Tambovtsev committed on
Commit
d06b625
·
1 Parent(s): 683b3ae

feat: implement BestChunkMatch metric

Browse files
Files changed (1) hide show
  1. src/eval/eval_mlflow.py +34 -1
src/eval/eval_mlflow.py CHANGED
@@ -168,6 +168,34 @@ class PresentationCount(BaseMetric):
168
  )
169
 
170
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
171
  class LLMRelevance(BaseMetric):
172
  """LLM-based relevance scoring"""
173
 
@@ -267,8 +295,9 @@ class MetricsRegistry:
267
  "presentationfound": PresentationFound,
268
  "pagematch": PageMatch,
269
  "pagefound": PageFound,
270
- "llmrelevance": LLMRelevance,
271
  "presentationcount": PresentationCount,
 
 
272
  }
273
 
274
  @classmethod
@@ -290,6 +319,7 @@ class MetricPresets:
290
  "pagematch",
291
  "pagefound",
292
  "presentationcount",
 
293
  ]
294
 
295
  LLM = ["llmrelevance"]
@@ -483,6 +513,7 @@ class RAGEvaluatorMlflow:
483
  "question": row["question"],
484
  "pres_name": row["pres_name"],
485
  "pages": [int(x) for x in row["page"].split(",") if x],
 
486
  }
487
 
488
  try:
@@ -499,6 +530,7 @@ class RAGEvaluatorMlflow:
499
  # Update aggregated results
500
  result_row = {
501
  "question": row["question"],
 
502
  "expected_presentation": row["pres_name"],
503
  "expected_pages": row["page"],
504
  "retrieved_presentations": [
@@ -507,6 +539,7 @@ class RAGEvaluatorMlflow:
507
  "retrieved_pages": [
508
  ",".join(map(str, p["pages"])) for p in output["contexts"]
509
  ],
 
510
  }
511
 
512
  for metric_name, metric_result in results.items():
 
168
  )
169
 
170
 
171
class BestChunkMatch(BaseMetric):
    """Check whether the top retrieved chunk's type matches the expected content type.

    Compares ``ground_truth["content_type"]`` against the ``chunk_type`` of the
    best chunk of the top-ranked retrieved presentation.
    """

    async def acalculate(self, run_output: Dict, ground_truth: Dict) -> MetricResult:
        """Return a binary MetricResult: 1.0 on a content-type match, else 0.0.

        Args:
            run_output: Retrieval output; ``run_output["contexts"][0]["best_chunk"]``
                must carry a ``chunk_type`` string (schema assumed from callers —
                TODO confirm).
            ground_truth: Expected values; reads ``ground_truth["content_type"]``.

        Returns:
            MetricResult with score 1.0 or 0.0 and a human-readable explanation.
        """
        contexts = run_output["contexts"]
        if not contexts:
            # Nothing retrieved at all — cannot match anything.
            return MetricResult(
                name=self.name,
                score=0.0,
                explanation="No contexts retrieved",
            )

        best_chunk = contexts[0]["best_chunk"]
        true_content_type = ground_truth["content_type"]
        found_content_type = best_chunk["chunk_type"]

        score = 0
        # Substring test covers e.g. "text" -> "text_content",
        # "visual" -> "visual_content".
        if true_content_type in found_content_type:
            score = 1
        # A "general" expectation accepts any of the descriptive chunk types.
        if true_content_type == "general" and found_content_type in [
            "general_description",
            "conclusions_and_insights",
            "layout_and_composition",
        ]:
            score = 1

        # Explanation must reflect the actual outcome, not always claim a match.
        verdict = "matches" if score else "does not match"
        return MetricResult(
            name=self.name,
            score=float(score),
            explanation=f"Found content type '{found_content_type}' {verdict} ground truth '{true_content_type}'",
        )
197
+
198
+
199
  class LLMRelevance(BaseMetric):
200
  """LLM-based relevance scoring"""
201
 
 
295
  "presentationfound": PresentationFound,
296
  "pagematch": PageMatch,
297
  "pagefound": PageFound,
 
298
  "presentationcount": PresentationCount,
299
+ "bestchunkmatch": BestChunkMatch,
300
+ "llmrelevance": LLMRelevance,
301
  }
302
 
303
  @classmethod
 
319
  "pagematch",
320
  "pagefound",
321
  "presentationcount",
322
+ "bestchunkmatch",
323
  ]
324
 
325
  LLM = ["llmrelevance"]
 
513
  "question": row["question"],
514
  "pres_name": row["pres_name"],
515
  "pages": [int(x) for x in row["page"].split(",") if x],
516
+ "content_type": row["content"],
517
  }
518
 
519
  try:
 
530
  # Update aggregated results
531
  result_row = {
532
  "question": row["question"],
533
+ "expected_content_type": row["content"],
534
  "expected_presentation": row["pres_name"],
535
  "expected_pages": row["page"],
536
  "retrieved_presentations": [
 
539
  "retrieved_pages": [
540
  ",".join(map(str, p["pages"])) for p in output["contexts"]
541
  ],
542
+ "best_chunk_type": output["contexts"][0]["best_chunk"]["chunk_type"], # fmt: skip
543
  }
544
 
545
  for metric_name, metric_result in results.items():