Rodrigo Ferreira Rodrigues
committed on
Commit
·
440ceb6
1
Parent(s):
6cd9340
adding bert score
Browse files- mcq_eval.py +6 -8
- requirements.txt +6 -1
mcq_eval.py
CHANGED
|
@@ -15,9 +15,7 @@
|
|
| 15 |
|
| 16 |
import evaluate
|
| 17 |
import datasets
|
| 18 |
-
|
| 19 |
-
bleu = evaluate.load('bleu')
|
| 20 |
-
#bert_score = evaluate.load('bertscore')
|
| 21 |
|
| 22 |
|
| 23 |
# TODO: Add BibTeX citation
|
|
@@ -50,7 +48,7 @@ Examples:
|
|
| 50 |
Here is an exemple on how to use the metric:
|
| 51 |
|
| 52 |
>>> metric = evaluate.load("rfr2003/MQC_eval")
|
| 53 |
-
>>> results = metric.compute(
|
| 54 |
>>> print(results)
|
| 55 |
{'accuracy': 0.5, 'bleu-1': 0.5}
|
| 56 |
"""
|
|
@@ -82,8 +80,8 @@ class MCQ_eval(evaluate.Metric):
|
|
| 82 |
|
| 83 |
def _download_and_prepare(self, dl_manager):
|
| 84 |
"""Optional: download external resources useful to compute the scores"""
|
| 85 |
-
|
| 86 |
-
|
| 87 |
|
| 88 |
def _compute(self, generations, golds):
|
| 89 |
"""Returns the scores"""
|
|
@@ -107,10 +105,10 @@ class MCQ_eval(evaluate.Metric):
|
|
| 107 |
references.append(gold)
|
| 108 |
|
| 109 |
metrics = {}
|
| 110 |
-
|
| 111 |
metrics.update({
|
| 112 |
'accuracy': correct/total,
|
| 113 |
-
'bleu-1': bleu.compute(predictions=predictions, references=references, max_order=1)['bleu']
|
| 114 |
})
|
| 115 |
|
| 116 |
return metrics
|
|
|
|
| 15 |
|
| 16 |
import evaluate
|
| 17 |
import datasets
|
| 18 |
+
import numpy as np
|
|
|
|
|
|
|
| 19 |
|
| 20 |
|
| 21 |
# TODO: Add BibTeX citation
|
|
|
|
| 48 |
Here is an exemple on how to use the metric:
|
| 49 |
|
| 50 |
>>> metric = evaluate.load("rfr2003/MQC_eval")
|
| 51 |
+
>>> results = metric.compute(generations=["A", "B"], golds=["A", "D"])
|
| 52 |
>>> print(results)
|
| 53 |
{'accuracy': 0.5, 'bleu-1': 0.5}
|
| 54 |
"""
|
|
|
|
| 80 |
|
| 81 |
def _download_and_prepare(self, dl_manager):
|
| 82 |
"""Optional: download external resources useful to compute the scores"""
|
| 83 |
+
self.bleu = evaluate.load('bleu')
|
| 84 |
+
self.bert_score = evaluate.load('bertscore')
|
| 85 |
|
| 86 |
def _compute(self, generations, golds):
|
| 87 |
"""Returns the scores"""
|
|
|
|
| 105 |
references.append(gold)
|
| 106 |
|
| 107 |
metrics = {}
|
| 108 |
+
metrics = {f"bert_score_{k}": np.mean(v).item() for k,v in self.bert_score.compute(predictions=predictions, references=references, lang="en").items() if k in ['recall', 'precision', 'f1']}
|
| 109 |
metrics.update({
|
| 110 |
'accuracy': correct/total,
|
| 111 |
+
'bleu-1': self.bleu.compute(predictions=predictions, references=references, max_order=1)['bleu']
|
| 112 |
})
|
| 113 |
|
| 114 |
return metrics
|
requirements.txt
CHANGED
|
@@ -1 +1,6 @@
|
|
| 1 |
-
git+https://github.com/huggingface/evaluate@main
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
git+https://github.com/huggingface/evaluate@main
|
| 2 |
+
transformers
|
| 3 |
+
torch
|
| 4 |
+
datasets
|
| 5 |
+
numpy
|
| 6 |
+
bert_score
|