"""
Code BLEU metric implementation
"""

import datasets
import evaluate
from codebleu import calc_codebleu

CODEBLEU_WEIGHTS = (0.25, 0.25, 0.25, 0.25)

_CITATION = """\
@misc{ren2020codebleu,
      title={CodeBLEU: a Method for Automatic Evaluation of Code Synthesis}, 
      author={Shuo Ren and Daya Guo and Shuai Lu and Long Zhou and Shujie Liu and Duyu Tang and Neel Sundaresan and Ming Zhou and Ambrosio Blanco and Shuai Ma},
      year={2020},
      eprint={2009.10297},
      archivePrefix={arXiv},
      primaryClass={cs.SE}
}
"""

_DESCRIPTION = """
An ideal evaluation metric should consider the grammatical correctness and the logic correctness. 
We propose weighted n-gram match and syntactic AST match to measure grammatical correctness, and introduce semantic data-flow match to calculate logic correctness.
Source: https://pypi.org/project/codebleu/
"""

_KWARGS_DESCRIPTION = """
Computes CodeBLEU score of code segments against a reference.
Args:
    predictions: list of code generations to score.
    references: list of lists of or just a list of references for each code generation task.
Returns:
    'codebleu_score': code bleu score
Examples:

    >>> predictions = ["def add ( a , b ) :\n return a + b"]
    >>> references = ["def sum ( first , second ) :\n return second + first"]
    >>> codebleu = evaluate.load("codebleu_score")
    >>> results = codebleu.compute(predictions=predictions, references=references)
    >>> print(results["codebleu_score"])
    0.5537
"""

@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class CodeBleu(evaluate.Metric):
    """CodeBLEU metric: averages per-example CodeBLEU over (reference, prediction) pairs."""

    def _info(self):
        """Return metric metadata; two feature schemas accept either multi- or single-reference input."""
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=[
                datasets.Features(
                    {
                        "predictions": datasets.Value("string", id="sequence"),
                        "references": datasets.Sequence(datasets.Value("string", id="sequence"), id="references"),
                    }
                ),
                datasets.Features(
                    {
                        "predictions": datasets.Value("string", id="sequence"),
                        "references": datasets.Value("string", id="sequence"),
                    }
                ),
            ],
            codebase_urls=["https://github.com/microsoft/CodeXGLUE/tree/main"],
            reference_urls=[
                "https://pypi.org/project/codebleu/",
            ],
        )

    def compute_codebleu_score(self, ground_truth, generated_answer, lang="python"):
        """
        Compute the CodeBLEU score between one reference and one generated snippet.

        Has keywords for C, C#, C++, Go, Java, JavaScript, PHP, Python, Ruby, and Rust.

        Args:
            ground_truth: reference code (a string; the codebleu package also
                accepts a list of strings for multiple references — TODO confirm
                against the installed codebleu version).
            generated_answer: generated code string to score.
            lang: language keyword passed through to ``calc_codebleu``.

        Returns:
            float: combined CodeBLEU score (the "codebleu" field of the result).
        """
        result = calc_codebleu([ground_truth], [generated_answer], lang=lang, weights=CODEBLEU_WEIGHTS, tokenizer=None)

        return result["codebleu"]

    def _compute(self, references, predictions, lang="python"):
        """
        Average per-example CodeBLEU over all (reference, prediction) pairs.

        Args:
            references: list of references, parallel to ``predictions``.
            predictions: list of generated code strings.
            lang: language keyword forwarded to ``calc_codebleu``
                (new, defaults to the previously hard-coded "python").

        Returns:
            dict: {"codebleu_score": mean per-example CodeBLEU}.

        Raises:
            ValueError: if the inputs are empty or their lengths differ
                (previously: silent zip truncation and ZeroDivisionError).
        """
        if len(references) != len(predictions):
            raise ValueError(
                f"references ({len(references)}) and predictions ({len(predictions)}) must have the same length"
            )
        if not references:
            raise ValueError("cannot compute CodeBLEU on empty inputs")
        # Per-example scoring (not one corpus-level calc_codebleu call) preserves
        # the original averaging semantics.
        total = sum(self.compute_codebleu_score(r, p, lang=lang) for r, p in zip(references, predictions))
        return {"codebleu_score": total / len(references)}