| | """ |
| | Code BLEU metric implementation |
| | """ |
| |
|
| | import datasets |
| | import evaluate |
| | from codebleu import calc_codebleu |
| |
|
| | CODEBLEU_WEIGHTS = (0.25, 0.25, 0.25, 0.25) |
| |
|
_CITATION = """\
@misc{ren2020codebleu,
      title={CodeBLEU: a Method for Automatic Evaluation of Code Synthesis},
      author={Shuo Ren and Daya Guo and Shuai Lu and Long Zhou and Shujie Liu and Duyu Tang and Neel Sundaresan and Ming Zhou and Ambrosio Blanco and Shuai Ma},
      year={2020},
      eprint={2009.10297},
      archivePrefix={arXiv},
      primaryClass={cs.SE}
}
"""
|
_DESCRIPTION = """
An ideal evaluation metric should consider both grammatical correctness and logical correctness.
CodeBLEU uses weighted n-gram match and syntactic AST match to measure grammatical correctness, and introduces semantic data-flow match to assess logical correctness.
Source: https://pypi.org/project/codebleu/
"""

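# Illustrative sketch (not used by the metric itself): how to obtain CodeBLEU's
# per-component scores by calling the codebleu package directly. The component key
# names below follow the codebleu package's return dictionary and are an assumption
# here rather than part of this metric's public API.
def _component_scores_example():
    result = calc_codebleu(
        ["def add(a, b):\n    return a + b"],                    # references
        ["def sum(first, second):\n    return second + first"],  # predictions
        lang="python",
        weights=CODEBLEU_WEIGHTS,
    )
    # ``result`` holds the aggregate "codebleu" score alongside the individual
    # "ngram_match_score", "weighted_ngram_match_score", "syntax_match_score",
    # and "dataflow_match_score" components.
    return result
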
|
_KWARGS_DESCRIPTION = """
Computes the CodeBLEU score of generated code segments against references.
Args:
    predictions: list of generated code segments to score.
    references: list of reference code segments, one per prediction, or a list of
        lists of references per prediction.
Returns:
    'codebleu_score': the CodeBLEU score averaged over all predictions.
Examples:

    >>> predictions = ["def add ( a , b ) :\n return a + b"]
    >>> references = ["def sum ( first , second ) :\n return second + first"]
    >>> codebleu = evaluate.load("codebleu_score")
    >>> results = codebleu.compute(predictions=predictions, references=references)
    >>> print(results["codebleu_score"])
    0.5537
"""

|
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class CodeBleu(evaluate.Metric):
    def _info(self):
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=[
                datasets.Features(
                    {
                        "predictions": datasets.Value("string", id="sequence"),
                        "references": datasets.Sequence(datasets.Value("string", id="sequence"), id="references"),
                    }
                ),
                datasets.Features(
                    {
                        "predictions": datasets.Value("string", id="sequence"),
                        "references": datasets.Value("string", id="sequence"),
                    }
                ),
            ],
            codebase_urls=["https://github.com/microsoft/CodeXGLUE/tree/main"],
            reference_urls=[
                "https://pypi.org/project/codebleu/",
            ],
        )
|
    @staticmethod
    def compute_codebleu_score(ground_truth, generated_answer, lang="python"):
        """
        Compute the CodeBLEU score between a ground-truth code snippet and a generated one.
        Supported language keywords include C, C#, C++, Go, Java, JavaScript, PHP, Python, Ruby, and Rust.
        """
        result = calc_codebleu(
            [ground_truth], [generated_answer], lang=lang, weights=CODEBLEU_WEIGHTS, tokenizer=None
        )
        return result["codebleu"]

    def _compute(self, references, predictions, lang="python"):
        # Average the per-example CodeBLEU scores over the whole batch.
        average_codebleu_score = sum(
            self.compute_codebleu_score(r, p, lang=lang) for r, p in zip(references, predictions)
        ) / len(references)
        return {"codebleu_score": average_codebleu_score}
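
# Minimal usage sketch / smoke test, assuming the metric class can be instantiated
# directly; when registered with ``evaluate``, it would normally be loaded via
# ``evaluate.load(...)`` under whatever name this module is published as.
if __name__ == "__main__":
    predictions = ["def add ( a , b ) :\n return a + b"]
    references = ["def sum ( first , second ) :\n return second + first"]

    metric = CodeBleu()
    results = metric.compute(predictions=predictions, references=references)
    print(results["codebleu_score"])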
|