Spaces:
Sleeping
Sleeping
Kewen Zhao
committed on
Commit
·
7ddd4d3
1
Parent(s):
5906581
merge inputs into references
Browse files- README.md +9 -15
- code_eval_stdio.py +12 -9
README.md
CHANGED
|
@@ -43,9 +43,7 @@ The Code Eval metric calculates how good are predictions given a set of referenc
|
|
| 43 |
|
| 44 |
`predictions`: a list of candidates to evaluate. Each candidate should be a list of strings with several code candidates to solve the problem.
|
| 45 |
|
| 46 |
-
`references`: a list of expected output for each prediction.
|
| 47 |
-
|
| 48 |
-
`inputs`: a list of inputs for each problem
|
| 49 |
|
| 50 |
`k`: number of code candidates to consider in the evaluation. The default value is `[1, 10, 100]`.
|
| 51 |
|
|
@@ -56,10 +54,9 @@ The Code Eval metric calculates how good are predictions given a set of referenc
|
|
| 56 |
```python
|
| 57 |
from evaluate import load
|
| 58 |
code_eval_stdio = load("hage2000/code_eval_stdio")
|
| 59 |
-
|
| 60 |
-
references = ["5"]
|
| 61 |
candidates = [[ "nums = list(map(int, input().split()))\nprint(sum(nums))"]]
|
| 62 |
-
pass_at_k, results = code_eval_stdio.compute(references=references, predictions=candidates,
|
| 63 |
```
|
| 64 |
|
| 65 |
N.B.
|
|
@@ -89,10 +86,9 @@ Full match at `k=1`:
|
|
| 89 |
```python
|
| 90 |
from evaluate import load
|
| 91 |
code_eval_stdio = load("hage2000/code_eval_stdio")
|
| 92 |
-
|
| 93 |
-
references = ["5"]
|
| 94 |
candidates = [[ "nums = list(map(int, input().split()))\nprint(sum(nums))"]]
|
| 95 |
-
pass_at_k, results = code_eval_stdio.compute(references=references, predictions=candidates,
|
| 96 |
print(pass_at_k)
|
| 97 |
{'pass@1': 1.0}
|
| 98 |
```
|
|
@@ -102,10 +98,9 @@ No match for k = 1:
|
|
| 102 |
```python
|
| 103 |
from evaluate import load
|
| 104 |
code_eval_stdio = load("hage2000/code_eval_stdio")
|
| 105 |
-
|
| 106 |
-
references = ["5"]
|
| 107 |
candidates = [[ "nums = list(map(int, input().split()))\nprint(nums[0]*nums[1])"]]
|
| 108 |
-
pass_at_k, results = code_eval_stdio.compute(references=references, predictions=candidates,
|
| 109 |
print(pass_at_k)
|
| 110 |
{'pass@1': 0.0}
|
| 111 |
```
|
|
@@ -115,10 +110,9 @@ Partial match at k=1, full match at k=2:
|
|
| 115 |
```python
|
| 116 |
from evaluate import load
|
| 117 |
code_eval_stdio = load("hage2000/code_eval_stdio")
|
| 118 |
-
|
| 119 |
-
references = ["5"]
|
| 120 |
candidates = [[ "nums = list(map(int, input().split()))\nprint(sum(nums))", "nums = list(map(int, input().split()))\nprint(nums[0]*nums[1])"]]
|
| 121 |
-
pass_at_k, results = code_eval_stdio.compute(references=references, predictions=candidates,
|
| 122 |
print(pass_at_k)
|
| 123 |
{'pass@1': 0.5, 'pass@2': 1.0}
|
| 124 |
```
|
|
|
|
| 43 |
|
| 44 |
`predictions`: a list of candidates to evaluate. Each candidate should be a list of strings with several code candidates to solve the problem.
|
| 45 |
|
| 46 |
+
`references`: a list of `(input, expected_output)` tuples of strings, one per prediction: the text fed to the program on stdin and the output it is expected to print.
|
|
|
|
|
|
|
| 47 |
|
| 48 |
`k`: number of code candidates to consider in the evaluation. The default value is `[1, 10, 100]`.
|
| 49 |
|
|
|
|
| 54 |
```python
|
| 55 |
from evaluate import load
|
| 56 |
code_eval_stdio = load("hage2000/code_eval_stdio")
|
| 57 |
+
references = [("2 3", "5")]
|
|
|
|
| 58 |
candidates = [[ "nums = list(map(int, input().split()))\nprint(sum(nums))"]]
|
| 59 |
+
pass_at_k, results = code_eval_stdio.compute(references=references, predictions=candidates, k=[1, 2])
|
| 60 |
```
|
| 61 |
|
| 62 |
N.B.
|
|
|
|
| 86 |
```python
|
| 87 |
from evaluate import load
|
| 88 |
code_eval_stdio = load("hage2000/code_eval_stdio")
|
| 89 |
+
references = [("2 3", "5")]
|
|
|
|
| 90 |
candidates = [[ "nums = list(map(int, input().split()))\nprint(sum(nums))"]]
|
| 91 |
+
pass_at_k, results = code_eval_stdio.compute(references=references, predictions=candidates, k=[1, 2])
|
| 92 |
print(pass_at_k)
|
| 93 |
{'pass@1': 1.0}
|
| 94 |
```
|
|
|
|
| 98 |
```python
|
| 99 |
from evaluate import load
|
| 100 |
code_eval_stdio = load("hage2000/code_eval_stdio")
|
| 101 |
+
references = [("2 3", "5")]
|
|
|
|
| 102 |
candidates = [[ "nums = list(map(int, input().split()))\nprint(nums[0]*nums[1])"]]
|
| 103 |
+
pass_at_k, results = code_eval_stdio.compute(references=references, predictions=candidates, k=[1, 2])
|
| 104 |
print(pass_at_k)
|
| 105 |
{'pass@1': 0.0}
|
| 106 |
```
|
|
|
|
| 110 |
```python
|
| 111 |
from evaluate import load
|
| 112 |
code_eval_stdio = load("hage2000/code_eval_stdio")
|
| 113 |
+
references = [("2 3", "5")]
|
|
|
|
| 114 |
candidates = [[ "nums = list(map(int, input().split()))\nprint(sum(nums))", "nums = list(map(int, input().split()))\nprint(nums[0]*nums[1])"]]
|
| 115 |
+
pass_at_k, results = code_eval_stdio.compute(references=references, predictions=candidates, k=[1, 2])
|
| 116 |
print(pass_at_k)
|
| 117 |
{'pass@1': 0.5, 'pass@2': 1.0}
|
| 118 |
```
|
code_eval_stdio.py
CHANGED
|
@@ -77,10 +77,9 @@ Returns:
|
|
| 77 |
results: dict with granular results of each unittest
|
| 78 |
Examples:
|
| 79 |
>>> code_eval_stdio = evaluate.load("hage2000/code_eval_stdio")
|
| 80 |
-
>>>
|
| 81 |
-
>>> references = ["5"]
|
| 82 |
>>> candidates = [[ "nums = list(map(int, input().split()))\nprint(sum(nums))"]]
|
| 83 |
-
>>> pass_at_k, results = code_eval_stdio.compute(references=references, predictions=candidates,
|
| 84 |
>>> print(pass_at_k)
|
| 85 |
{'pass@1': 0.5, 'pass@2': 1.0}
|
| 86 |
"""
|
|
@@ -144,7 +143,12 @@ class CodeEval(evaluate.Metric):
|
|
| 144 |
features=datasets.Features(
|
| 145 |
{
|
| 146 |
"predictions": datasets.Sequence(datasets.Value("string")),
|
| 147 |
-
"references": datasets.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 148 |
}
|
| 149 |
),
|
| 150 |
homepage="https://github.com/openai/human-eval",
|
|
@@ -153,12 +157,11 @@ class CodeEval(evaluate.Metric):
|
|
| 153 |
license=_LICENSE,
|
| 154 |
)
|
| 155 |
|
| 156 |
-
def _compute(self, predictions, references,
|
| 157 |
"""
|
| 158 |
Returns the scores
|
| 159 |
predictions: List[List[str]] the python program
|
| 160 |
-
references: List[str] test
|
| 161 |
-
inputs: List[str] test input
|
| 162 |
"""
|
| 163 |
|
| 164 |
if os.getenv("HF_ALLOW_CODE_EVAL", 0) != "1":
|
|
@@ -173,9 +176,9 @@ class CodeEval(evaluate.Metric):
|
|
| 173 |
n_samples = 0
|
| 174 |
results = defaultdict(list)
|
| 175 |
|
| 176 |
-
for task_id, (candidates,
|
| 177 |
for candidate in candidates:
|
| 178 |
-
args = (candidate,
|
| 179 |
future = executor.submit(check_correctness, *args)
|
| 180 |
futures.append(future)
|
| 181 |
completion_id[task_id] += 1
|
|
|
|
| 77 |
results: dict with granular results of each unittest
|
| 78 |
Examples:
|
| 79 |
>>> code_eval_stdio = evaluate.load("hage2000/code_eval_stdio")
|
| 80 |
+
>>> references = [("2 3", "5")]
|
|
|
|
| 81 |
>>> candidates = [[ "nums = list(map(int, input().split()))\nprint(sum(nums))"]]
|
| 82 |
+
>>> pass_at_k, results = code_eval_stdio.compute(references=references, predictions=candidates, k=[1, 2])
|
| 83 |
>>> print(pass_at_k)
|
| 84 |
{'pass@1': 1.0}
|
| 85 |
"""
|
|
|
|
| 143 |
features=datasets.Features(
|
| 144 |
{
|
| 145 |
"predictions": datasets.Sequence(datasets.Value("string")),
|
| 146 |
+
"references": datasets.Sequence(
|
| 147 |
+
{
|
| 148 |
+
"inputs": datasets.Value("string"),
|
| 149 |
+
"expected_output": datasets.Value("string"),
|
| 150 |
+
}
|
| 151 |
+
),
|
| 152 |
}
|
| 153 |
),
|
| 154 |
homepage="https://github.com/openai/human-eval",
|
|
|
|
| 157 |
license=_LICENSE,
|
| 158 |
)
|
| 159 |
|
| 160 |
+
def _compute(self, predictions, references, k=[1, 10, 100], num_workers=4, timeout=3.0):
|
| 161 |
"""
|
| 162 |
Returns the scores
|
| 163 |
predictions: List[List[str]] the python program
|
| 164 |
+
references: List[Tuple[str, str]] test inputs and expected outputs
|
|
|
|
| 165 |
"""
|
| 166 |
|
| 167 |
if os.getenv("HF_ALLOW_CODE_EVAL", 0) != "1":
|
|
|
|
| 176 |
n_samples = 0
|
| 177 |
results = defaultdict(list)
|
| 178 |
|
| 179 |
+
for task_id, (candidates, (input_data, expected_output)) in enumerate(zip(predictions, references)):
|
| 180 |
for candidate in candidates:
|
| 181 |
+
args = (candidate, input_data, expected_output, timeout, task_id, completion_id[task_id])
|
| 182 |
future = executor.submit(check_correctness, *args)
|
| 183 |
futures.append(future)
|
| 184 |
completion_id[task_id] += 1
|