Add code_bleu module
- app.py +5 -0
- code_bleu.py +83 -0
- requirements.txt +1 -0
app.py
ADDED
@@ -0,0 +1,5 @@
+import evaluate
+from evaluate.utils import launch_gradio_widget
+
+module = evaluate.load("muditash/code_bleu")
+launch_gradio_widget(module)
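Usage note: once the Space is published, the metric can also be exercised outside the Gradio widget. A minimal sketch, assuming the Space id "muditash/code_bleu" from app.py above and the module's default Python language setting:

    import evaluate

    codebleu = evaluate.load("muditash/code_bleu")
    results = codebleu.compute(
        predictions=["def add(a, b):\n    return a + b"],
        references=["def sum(first, second):\n    return second + first"],
    )
    print(results["codebleu_score"])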
code_bleu.py
ADDED
@@ -0,0 +1,83 @@
+"""
+Code BLEU metric implementation
+"""
+
+import datasets
+import evaluate
+from codebleu import calc_codebleu
+
+CODEBLEU_WEIGHTS = (0.25, 0.25, 0.25, 0.25)
+
+_CITATION = """\
+@misc{ren2020codebleu,
+      title={CodeBLEU: a Method for Automatic Evaluation of Code Synthesis},
+      author={Shuo Ren and Daya Guo and Shuai Lu and Long Zhou and Shujie Liu and Duyu Tang and Neel Sundaresan and Ming Zhou and Ambrosio Blanco and Shuai Ma},
+      year={2020},
+      eprint={2009.10297},
+      archivePrefix={arXiv},
+      primaryClass={cs.SE}
+}
+"""
+
+_DESCRIPTION = """
+An ideal evaluation metric for code synthesis should consider both grammatical correctness and logical correctness.
+CodeBLEU measures grammatical correctness via weighted n-gram match and syntactic AST match, and logical correctness via semantic data-flow match.
+Source: https://pypi.org/project/codebleu/
+"""
+
+_KWARGS_DESCRIPTION = """
+Computes the CodeBLEU score of code segments against their references.
+Args:
+    predictions: list of generated code snippets to score.
+    references: list of reference snippets, or list of lists of references, one per code generation task.
+Returns:
+    'codebleu_score': the CodeBLEU score averaged over all prediction/reference pairs.
+Examples:
+
+    >>> predictions = ["def add ( a , b ) :\\n return a + b"]
+    >>> references = ["def sum ( first , second ) :\\n return second + first"]
+    >>> codebleu = evaluate.load("muditash/code_bleu")
+    >>> results = codebleu.compute(predictions=predictions, references=references)
+    >>> print(results["codebleu_score"])
+    0.5537
+"""
+
+def compute_codebleu_score(ground_truth, generated_answer, lang="python"):
+    """
+    Compute the CodeBLEU score between ground-truth code and generated code.
+    Supported languages include C, C#, C++, Go, Java, JavaScript, PHP, Python, Ruby, and Rust.
+    """
+    result = calc_codebleu([ground_truth], [generated_answer], lang=lang, weights=CODEBLEU_WEIGHTS, tokenizer=None)
+
+    return result["codebleu"]
+
+@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
+class CodeBleu(evaluate.Metric):
+    def _info(self):
+        return evaluate.MetricInfo(
+            description=_DESCRIPTION,
+            citation=_CITATION,
+            inputs_description=_KWARGS_DESCRIPTION,
+            features=[
+                datasets.Features(
+                    {
+                        "predictions": datasets.Value("string", id="sequence"),
+                        "references": datasets.Sequence(datasets.Value("string", id="sequence"), id="references"),
+                    }
+                ),
+                datasets.Features(
+                    {
+                        "predictions": datasets.Value("string", id="sequence"),
+                        "references": datasets.Value("string", id="sequence"),
+                    }
+                ),
+            ],
+            codebase_urls=["https://github.com/microsoft/CodeXGLUE/tree/main"],
+            reference_urls=[
+                "https://pypi.org/project/codebleu/",
+            ],
+        )
+
+    def _compute(self, references, predictions):
+        scores = [compute_codebleu_score(r, p) for r, p in zip(references, predictions)]
+        return {"codebleu_score": sum(scores) / len(scores)}
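For context, the wrapper above delegates all scoring to calc_codebleu from the codebleu package. A minimal sketch of calling it directly, using only arguments that already appear in the diff; the keyword names follow the package's documented signature, and any component scores beyond the "codebleu" key are not relied on here:

    from codebleu import calc_codebleu

    # The four equal weights correspond to CodeBLEU's four components:
    # n-gram match, weighted n-gram match, syntactic AST match, and
    # semantic data-flow match.
    result = calc_codebleu(
        references=["def sum(first, second):\n    return second + first"],
        predictions=["def add(a, b):\n    return a + b"],
        lang="python",
        weights=(0.25, 0.25, 0.25, 0.25),
        tokenizer=None,
    )
    print(result["codebleu"])  # the aggregate score returned by compute_codebleu_score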
requirements.txt
ADDED
@@ -0,0 +1 @@
+codebleu
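Note: only codebleu is pinned here, which suggests the Space image already provides evaluate, datasets, and gradio. Reproducing the module locally would presumably also need those, e.g. pip install evaluate datasets codebleu.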