Rodrigo Ferreira Rodrigues committed on
Commit
6cd9340
·
1 Parent(s): 627f929

Adding _compute

Browse files
Files changed (2) hide show
  1. mcq_eval.py +48 -27
  2. tests.py +6 -11
mcq_eval.py CHANGED
@@ -16,6 +16,9 @@
16
  import evaluate
17
  import datasets
18
 
 
 
 
19
 
20
  # TODO: Add BibTeX citation
21
  _CITATION = """\
@@ -28,34 +31,30 @@ year={2020}
28
 
29
  # TODO: Add description of the module here
30
  _DESCRIPTION = """\
31
- This new module is designed to solve this great ML task and is crafted with a lot of care.
32
  """
33
 
34
 
35
  # TODO: Add description of the arguments of the module here
36
  _KWARGS_DESCRIPTION = """
37
- Calculates how good are predictions given some references, using certain scores
38
  Args:
39
- predictions: list of predictions to score. Each predictions
40
- should be a string with tokens separated by spaces.
41
- references: list of reference for each prediction. Each
42
- reference should be a string with tokens separated by spaces.
43
  Returns:
44
- accuracy: description of the first score,
45
- another_score: description of the second score,
46
  Examples:
47
- Examples should be written in doctest format, and should illustrate how
48
- to use the function.
49
 
50
- >>> my_new_module = evaluate.load("my_new_module")
51
- >>> results = my_new_module.compute(references=[0, 1], predictions=[0, 1])
52
  >>> print(results)
53
- {'accuracy': 1.0}
54
  """
55
 
56
- # TODO: Define external resources urls if needed
57
- BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt"
58
-
59
 
60
  @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
61
  class MCQ_eval(evaluate.Metric):
@@ -71,14 +70,14 @@ class MCQ_eval(evaluate.Metric):
71
  inputs_description=_KWARGS_DESCRIPTION,
72
  # This defines the format of each prediction and reference
73
  features=datasets.Features({
74
- 'predictions': datasets.Value('int64'),
75
- 'references': datasets.Value('int64'),
76
  }),
77
  # Homepage of the module for documentation
78
- homepage="http://module.homepage",
79
  # Additional links to the codebase or references
80
- codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
81
- reference_urls=["http://path.to.reference.url/new_module"]
82
  )
83
 
84
  def _download_and_prepare(self, dl_manager):
@@ -86,10 +85,32 @@ class MCQ_eval(evaluate.Metric):
86
  # TODO: Download external resources if needed
87
  pass
88
 
89
- def _compute(self, predictions, references):
90
  """Returns the scores"""
91
- # TODO: Compute the different scores of the module
92
- accuracy = sum(i == j for i, j in zip(predictions, references)) / len(predictions)
93
- return {
94
- "accuracy": accuracy,
95
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  import evaluate
17
  import datasets
18
 
19
+ bleu = evaluate.load('bleu')
20
+ #bert_score = evaluate.load('bertscore')
21
+
22
 
23
  # TODO: Add BibTeX citation
24
  _CITATION = """\
 
31
 
32
  # TODO: Add description of the module here
33
  _DESCRIPTION = """\
34
+ This metric is designed to evaluate MCQ (multiple-choice question) generation tasks.
35
  """
36
 
37
 
38
  # TODO: Add description of the arguments of the module here
39
  _KWARGS_DESCRIPTION = """
40
+ Calculates accuracy and BLEU-1 between generations and gold answers in an MCQ context.
41
  Args:
42
+ generations: list of predictions to score. Each prediction
43
+ should be a string generated by a language model.
44
+ golds: list of references, one per prediction. Each
45
+ reference should be a string only containing one letter (eg. A, B, C...).
46
  Returns:
47
+ accuracy: fraction of generations whose answer letter matches the gold answer,
48
+ bleu-1: unigram BLEU score computed with the `evaluate` bleu metric
49
  Examples:
50
+ Here is an example of how to use the metric:
 
51
 
52
+ >>> metric = evaluate.load("rfr2003/MQC_eval")
53
+ >>> results = metric.compute(golds=["A", "B"], generations=["A", "D"])
54
  >>> print(results)
55
+ {'accuracy': 0.5, 'bleu-1': 0.5}
56
  """
57
 
 
 
 
58
 
59
  @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
60
  class MCQ_eval(evaluate.Metric):
 
70
  inputs_description=_KWARGS_DESCRIPTION,
71
  # This defines the format of each prediction and reference
72
  features=datasets.Features({
73
+ 'generations': datasets.Value('string'),
74
+ 'golds': datasets.Value('string'),
75
  }),
76
  # Homepage of the module for documentation
77
+ #homepage="http://module.homepage",
78
  # Additional links to the codebase or references
79
+ #codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
80
+ #reference_urls=["http://path.to.reference.url/new_module"]
81
  )
82
 
83
  def _download_and_prepare(self, dl_manager):
 
85
  # TODO: Download external resources if needed
86
  pass
87
 
88
def _compute(self, generations, golds):
    """Compute accuracy and BLEU-1 between model generations and gold answers.

    Each generation is normalized to its first non-whitespace character,
    upper-cased, so free-form outputs like "a) because ..." still match a
    one-letter gold answer such as "A".

    Args:
        generations: list of strings produced by a language model.
        golds: list of reference answers, each expected to be a single
            option letter (e.g. "A", "B", "C"); same length as `generations`.

    Returns:
        dict with:
            accuracy: fraction of normalized generations equal to their gold,
            bleu-1: unigram BLEU between normalized generations and golds.

    Raises:
        ValueError: if the inputs differ in length or are empty.
    """
    # Raise explicit errors instead of `assert`, which is stripped under -O
    # and previously let mismatched inputs fall through to zip() truncation.
    if len(generations) != len(golds):
        raise ValueError(
            f"generations ({len(generations)}) and golds ({len(golds)}) "
            "must have the same length"
        )
    if not generations:
        # Guard the division below (the original raised ZeroDivisionError).
        raise ValueError("cannot compute metrics on empty inputs")

    predictions, references = [], []
    correct = 0
    for gen, gold in zip(generations, golds):
        # [:1] keeps only the first character (or "" for a blank generation),
        # matching the previous truncate-when-longer-than-one behavior.
        gen = gen.strip().upper()[:1]
        gold = gold.upper()
        if gen == gold:
            correct += 1
        predictions.append(gen)
        references.append(gold)

    return {
        "accuracy": correct / len(generations),
        # max_order=1 restricts BLEU to unigram precision ("BLEU-1").
        "bleu-1": bleu.compute(
            predictions=predictions, references=references, max_order=1
        )["bleu"],
    }
tests.py CHANGED
@@ -1,17 +1,12 @@
1
# Each case pairs predictions/references with the score the metric must yield.
_CASES = [
    ([0, 0], [1, 1], 0),
    ([1, 1], [1, 1], 1),
    ([1, 0], [1, 1], 0.5),
]

test_cases = [
    {"predictions": preds, "references": refs, "result": {"metric_score": score}}
    for preds, refs, score in _CASES
]
 
1
# Test cases for the MCQ_eval metric. The input keys match the metric's
# declared features ('generations' / 'golds') — the previous
# 'predictions'/'references' keys did not, so metric.compute(**case)
# would have been rejected by the features schema.
test_cases = [
    {
        "generations": ["A", "B"],
        "golds": ["A", "B"],
        "result": {"accuracy": 1, "bleu-1": 1},
    },
    {
        "generations": ["A", "B"],
        "golds": ["A", "C"],
        # One of two answers correct; unigram BLEU is likewise 0.5.
        "result": {"accuracy": 0.5, "bleu-1": 0.5},
    },
]