Doanh Van Vu committed
Commit dbdb72a · 1 Parent(s): ee8ceae

Enhance evaluation metrics and reporting in recommendation system


- Updated `evaluate_recommendations.py` to add Mean Reciprocal Rank (MRR) as a new evaluation metric alongside the existing Precision@K, Recall@K, Hit Rate@K, and NDCG@K (see the definition below).
- Extended the report generation so MRR appears in the aggregate results table, the detailed statistics, and the per-mentee tables.
- Regenerated `evaluation_report.md` and `sample_mentee_evaluation.json` with the updated ground truth data and evaluation results.
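For reference, the reciprocal rank that the new `mrr_at_k` helper computes for a single mentee, and the mean that the report aggregates across all evaluated mentees, are:

$$
\mathrm{RR@K} = \begin{cases} 1/r & \text{if the first relevant mentor appears at rank } r \le K \\ 0 & \text{if no relevant mentor appears in the top-}K \end{cases}
\qquad
\mathrm{MRR@K} = \frac{1}{|M|} \sum_{m \in M} \mathrm{RR@K}(m)
$$

where $M$ is the set of evaluated mentees.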

evaluation/evaluate_recommendations.py CHANGED
@@ -4,7 +4,7 @@ Evaluation script for the MentorMe recommendation system.
 This script evaluates the performance of the recommendation system by:
 1. Sending recommendation requests for the mentees in the dataset
 2. Comparing the results against the ground truth
-3. Computing the metrics: Precision@K, Recall@K, Hit Rate@K, NDCG@K
+3. Computing the metrics: Precision@K, Recall@K, Hit Rate@K, NDCG@K, MRR@K
 4. Generating an evaluation report in a scientific-research format
 """
 
@@ -114,6 +114,25 @@ def ndcg_at_k(recommended: List[str], relevant_list: List[str], k: int) -> float
     return dcg / idcg if idcg > 0 else 0.0
 
 
+def mrr_at_k(recommended: List[str], relevant: Set[str], k: int) -> float:
+    """
+    Compute Mean Reciprocal Rank@K.
+
+    Reciprocal Rank = 1 / position of the first relevant item in the top-k.
+    If no relevant item appears in the top-k, RR = 0.
+    """
+    if len(relevant) == 0:
+        return 0.0
+
+    top_k = recommended[:k]
+
+    for i, mentor_id in enumerate(top_k, 1):
+        if mentor_id in relevant:
+            return 1.0 / i
+
+    return 0.0
+
+
 def evaluate_recommendation(
     recommended: List[Dict[str, Any]],
     ground_truth: List[int],
@@ -138,7 +157,8 @@ def evaluate_recommendation(
         "precision": {},
         "recall": {},
         "hit": {},
-        "ndcg": {}
+        "ndcg": {},
+        "mrr": {}
     }
 
     for k in k_values:
@@ -146,6 +166,7 @@ def evaluate_recommendation(
         results["recall"][k] = recall_at_k(recommended_ids, ground_truth_set, k)
         results["hit"][k] = hit_at_k(recommended_ids, ground_truth_set, k)
         results["ndcg"][k] = ndcg_at_k(recommended_ids, ground_truth_list, k)
+        results["mrr"][k] = mrr_at_k(recommended_ids, ground_truth_set, k)
 
     return results
 
@@ -200,7 +221,7 @@ def generate_research_report(
 
     # Compute basic statistics
     stats_by_metric = {}
-    for metric_name in ['precision', 'recall', 'hit', 'ndcg']:
+    for metric_name in ['precision', 'recall', 'hit', 'ndcg', 'mrr']:
         stats_by_metric[metric_name] = {}
         for k in k_values:
             metrics_list = [r['metrics'][metric_name][k] for r in all_results]
@@ -231,6 +252,7 @@ def generate_research_report(
 | **Recall** | {aggregate_metrics['recall'][1]:.4f} | {aggregate_metrics['recall'][3]:.4f} | {aggregate_metrics['recall'][6]:.4f} |
 | **Hit Rate** | {aggregate_metrics['hit'][1]:.4f} | {aggregate_metrics['hit'][3]:.4f} | {aggregate_metrics['hit'][6]:.4f} |
 | **NDCG** | {aggregate_metrics['ndcg'][1]:.4f} | {aggregate_metrics['ndcg'][3]:.4f} | {aggregate_metrics['ndcg'][6]:.4f} |
+| **MRR** | {aggregate_metrics['mrr'][1]:.4f} | {aggregate_metrics['mrr'][3]:.4f} | {aggregate_metrics['mrr'][6]:.4f} |
 
 ## Detailed Statistics
 
@@ -248,6 +270,12 @@ def generate_research_report(
         stats = stats_by_metric['recall'][k]
         report += f"- **@{k}:** Mean={stats['mean']:.4f}, Std={stats['std']:.4f}, Min={stats['min']:.4f}, Max={stats['max']:.4f}\n"
 
+    report += "\n### MRR@K\n\n"
+
+    for k in k_values:
+        stats = stats_by_metric['mrr'][k]
+        report += f"- **@{k}:** Mean={stats['mean']:.4f}, Std={stats['std']:.4f}, Min={stats['min']:.4f}, Max={stats['max']:.4f}\n"
+
     report += f"\n### Hit Rate Distribution (@6)\n\n"
     report += f"- 0 hits: {hit_rate_distribution['0 hits']} ({hit_rate_distribution['0 hits']/total_mentees*100:.1f}%)\n"
     report += f"- Partial hits: {hit_rate_distribution['Partial hits']} ({hit_rate_distribution['Partial hits']/total_mentees*100:.1f}%)\n"
@@ -265,7 +293,8 @@ def generate_research_report(
         report += f"| Precision | {result['metrics']['precision'][1]:.4f} | {result['metrics']['precision'][3]:.4f} | {result['metrics']['precision'][6]:.4f} |\n"
         report += f"| Recall | {result['metrics']['recall'][1]:.4f} | {result['metrics']['recall'][3]:.4f} | {result['metrics']['recall'][6]:.4f} |\n"
         report += f"| Hit Rate | {result['metrics']['hit'][1]:.4f} | {result['metrics']['hit'][3]:.4f} | {result['metrics']['hit'][6]:.4f} |\n"
-        report += f"| NDCG | {result['metrics']['ndcg'][1]:.4f} | {result['metrics']['ndcg'][3]:.4f} | {result['metrics']['ndcg'][6]:.4f} |\n\n"
+        report += f"| NDCG | {result['metrics']['ndcg'][1]:.4f} | {result['metrics']['ndcg'][3]:.4f} | {result['metrics']['ndcg'][6]:.4f} |\n"
+        report += f"| MRR | {result['metrics']['mrr'][1]:.4f} | {result['metrics']['mrr'][3]:.4f} | {result['metrics']['mrr'][6]:.4f} |\n\n"
 
     return report
 
@@ -341,7 +370,8 @@ def main():
                 'precision': {k: 0.0 for k in args.k_values},
                 'recall': {k: 0.0 for k in args.k_values},
                 'hit': {k: 0.0 for k in args.k_values},
-                'ndcg': {k: 0.0 for k in args.k_values}
+                'ndcg': {k: 0.0 for k in args.k_values},
+                'mrr': {k: 0.0 for k in args.k_values}
             }
         })
         continue
@@ -359,7 +389,7 @@ def main():
             'metrics': results
         })
 
-        print(f"  Precision@6: {results['precision'][6]:.4f}, Recall@6: {results['recall'][6]:.4f}, NDCG@6: {results['ndcg'][6]:.4f}\n")
+        print(f"  Precision@6: {results['precision'][6]:.4f}, Recall@6: {results['recall'][6]:.4f}, NDCG@6: {results['ndcg'][6]:.4f}, MRR@6: {results['mrr'][6]:.4f}\n")
 
         time.sleep(args.delay)
 
@@ -368,11 +398,12 @@ def main():
         'precision': {k: 0.0 for k in args.k_values},
         'recall': {k: 0.0 for k in args.k_values},
         'hit': {k: 0.0 for k in args.k_values},
-        'ndcg': {k: 0.0 for k in args.k_values}
+        'ndcg': {k: 0.0 for k in args.k_values},
+        'mrr': {k: 0.0 for k in args.k_values}
    }
 
     for result in all_results:
-        for metric_name in ['precision', 'recall', 'hit', 'ndcg']:
+        for metric_name in ['precision', 'recall', 'hit', 'ndcg', 'mrr']:
             for k in args.k_values:
                 aggregate_metrics[metric_name][k] += result['metrics'][metric_name][k]
evaluation/evaluation_report.md CHANGED
The diff for this file is too large to render. See raw diff
 
evaluation/sample_mentee_evaluation.json CHANGED
The diff for this file is too large to render. See raw diff
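A quick way to sanity-check the new `mrr_at_k` helper is a standalone snippet. This is a hypothetical usage sketch, not part of the commit: it assumes the repository root is on `PYTHONPATH` so the evaluation script is importable, and the mentor IDs are made up:

```python
# Hypothetical sanity check for mrr_at_k (not part of this commit).
from evaluation.evaluate_recommendations import mrr_at_k

recommended = ["m7", "m3", "m9", "m1"]  # ranked mentor IDs (made up)
relevant = {"m9", "m1"}                 # ground-truth mentors for this mentee

assert mrr_at_k(recommended, relevant, k=6) == 1.0 / 3  # first hit at rank 3
assert mrr_at_k(recommended, relevant, k=2) == 0.0      # no hit in the top-2
assert mrr_at_k(recommended, set(), k=6) == 0.0         # empty ground truth
```

Because a mentee's reciprocal rank is positive exactly when that mentee has a hit in the top-K, MRR@K can never exceed Hit Rate@K, which gives an easy cross-check between the MRR and Hit Rate columns in the generated report.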