mutash committed on
Commit
8ee93b2
·
1 Parent(s): f36d3d7

Add code_bleu module

Browse files
Files changed (3) hide show
  1. app.py +5 -0
  2. code_bleu.py +83 -0
  3. requirements.txt +1 -0
app.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
# Gradio entry point for the CodeBLEU metric Space.
import evaluate
from evaluate.utils import launch_gradio_widget

# Load this repository's metric module and expose it through the
# standard `evaluate` Gradio widget UI.
module = evaluate.load("muditash/code_bleu")
launch_gradio_widget(module)
code_bleu.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
Code BLEU metric implementation
"""

import datasets
import evaluate
from codebleu import calc_codebleu

# Equal weights for CodeBLEU's four components:
# (n-gram match, weighted n-gram match, syntactic AST match, semantic data-flow match).
CODEBLEU_WEIGHTS = (0.25, 0.25, 0.25, 0.25)

_CITATION = """\
@misc{ren2020codebleu,
      title={CodeBLEU: a Method for Automatic Evaluation of Code Synthesis},
      author={Shuo Ren and Daya Guo and Shuai Lu and Long Zhou and Shujie Liu and Duyu Tang and Neel Sundaresan and Ming Zhou and Ambrosio Blanco and Shuai Ma},
      year={2020},
      eprint={2009.10297},
      archivePrefix={arXiv},
      primaryClass={cs.SE}
}
"""

_DESCRIPTION = """
An ideal evaluation metric should consider the grammatical correctness and the logic correctness.
We propose weighted n-gram match and syntactic AST match to measure grammatical correctness, and introduce semantic data-flow match to calculate logic correctness.
Source: https://pypi.org/project/codebleu/
"""

# NOTE: the example below previously loaded "codebleu_score", which is not the
# id this module is published under; it now matches the id used in app.py.
_KWARGS_DESCRIPTION = """
Computes CodeBLEU score of code segments against a reference.
Args:
    predictions: list of code generations to score.
    references: list of lists of or just a list of references for each code generation task.
Returns:
    'codebleu_score': code bleu score
Examples:

    >>> predictions = ["def add ( a , b ) :\n return a + b"]
    >>> references = ["def sum ( first , second ) :\n return second + first"]
    >>> codebleu = evaluate.load("muditash/code_bleu")
    >>> results = codebleu.compute(predictions=predictions, references=references)
    >>> print(results["codebleu_score"])
    0.5537
"""
45
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class CodeBleu(evaluate.Metric):
    """CodeBLEU metric: averages per-example CodeBLEU scores over a corpus."""

    def _info(self):
        # Metric metadata. Two input feature specs are accepted: a list of
        # reference strings per prediction, or a single reference string
        # per prediction.
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=[
                datasets.Features(
                    {
                        "predictions": datasets.Value("string", id="sequence"),
                        "references": datasets.Sequence(datasets.Value("string", id="sequence"), id="references"),
                    }
                ),
                datasets.Features(
                    {
                        "predictions": datasets.Value("string", id="sequence"),
                        "references": datasets.Value("string", id="sequence"),
                    }
                ),
            ],
            codebase_urls=["https://github.com/microsoft/CodeXGLUE/tree/main"],
            reference_urls=[
                "https://pypi.org/project/codebleu/",
            ],
        )

    @staticmethod
    def compute_codebleu_score(ground_truth, generated_answer, lang="python"):
        """Compute the CodeBLEU score between one reference and one prediction.

        Function to compute CodeBLEU score between ground truth code and generated code.
        Has keywords for C, C#, C++, Go, Java, JavaScript, PHP, Python, Ruby, and Rust.
        """
        # calc_codebleu takes (references, predictions); wrapping each item in a
        # list handles both a single reference string and a list of references.
        result = calc_codebleu([ground_truth], [generated_answer], lang=lang, weights=CODEBLEU_WEIGHTS, tokenizer=None)
        return result["codebleu"]

    def _compute(self, references, predictions, lang="python"):
        """Return the mean CodeBLEU score over all (reference, prediction) pairs.

        `lang` is forwarded to the per-pair scorer (defaults to "python",
        matching the original behavior).
        """
        # Guard against an empty batch, which would otherwise raise a bare
        # ZeroDivisionError from the averaging step.
        if not references:
            raise ValueError("CodeBleu requires at least one reference/prediction pair.")
        # BUG FIX: the original called compute_codebleu_score as a bare name
        # although it is defined in the class body, which raises NameError at
        # runtime; it is now a staticmethod invoked through self.
        scores = [self.compute_codebleu_score(r, p, lang=lang) for r, p in zip(references, predictions)]
        return {"codebleu_score": sum(scores) / len(scores)}
requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ codebleu