Commit 0fb6838 (parent: a7f3790)
Update bc_eval.py

1 changed file: bc_eval.py (+10 -8)

bc_eval.py CHANGED
@@ -3,7 +3,7 @@ import itertools
 import os
 import re
 import tempfile
-from collections import defaultdict
+from collections import defaultdict, Counter
 from pathlib import Path
 
 import datasets
@@ -204,13 +204,13 @@ class BabelCodeEval(evaluate.Metric):
            garbage_collection_freq=gc_freq,
        )
 
-       all_results, q_passes, q_pct = _eval_predictions(results, question_map)
+       all_results, q_passes, q_pct, o_count = _eval_predictions(results, question_map)
 
        assert len(q_passes) == len(q_pct)
        metrics = {}
        for lang in q_passes:
            metrics.update(
-               _calculate_metrics(lang, q_passes[lang], q_pct[lang], k_vals=k)
+               _calculate_metrics(lang, q_passes[lang], q_pct[lang], o_count[lang], k_vals=k)
            )
        return metrics, all_results
 
@@ -258,7 +258,7 @@ def _eval_predictions(pred_results, question_map):
    out = []
    question_results = defaultdict(lambda: defaultdict(list))
    question_pct_pass = defaultdict(lambda: defaultdict(list))
-
+   outcome_counts = defaultdict(Counter)
    for p in pred_results:
        question = question_map[p["qid"]]
        test_cases = question["test_case_ids"]
@@ -275,13 +275,13 @@ def _eval_predictions(pred_results, question_map):
        lang = question["language"]
        question_results[lang][p["qid"]].append(num_passed == len(test_case_results))
        question_pct_pass[lang][p["qid"]].append(num_passed / len(test_case_results))
-
+       outcome_counts[lang][outcome] += 1
        out.append(p)
 
-   return out, question_results, question_pct_pass
+   return out, question_results, question_pct_pass, outcome_counts
 
 
-def _calculate_metrics(lang, q_passed, q_pcts, k_vals):
+def _calculate_metrics(lang, q_passed, q_pcts, o_count, k_vals):
    assert len(q_passed) == len(q_pcts)
 
    num_samples = np.zeros(len(q_passed))
@@ -298,7 +298,9 @@ def _calculate_metrics(lang, q_passed, q_pcts, k_vals):
    }
    out[f"{lang}/mean_pct_pass"] = np.mean(pcts_passed)
    out[f"{lang}/median_pct_pass"] = np.median(pcts_passed)
-
+
+   for outcome, val in o_count.items():
+       out[f"{lang}/pct_{outcome}"] = val/len(q_passed)
 
    return out
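This commit threads a per-language tally of execution outcomes through _eval_predictions and into _calculate_metrics, which then reports each outcome's share as a {lang}/pct_{outcome} metric. Below is a minimal sketch of that aggregation pattern. The prediction records and outcome labels are illustrative assumptions; in bc_eval.py the outcome variable is set earlier in _eval_predictions, outside the hunks shown above.

from collections import Counter, defaultdict

# Hypothetical prediction records; in bc_eval.py each p comes from the
# execution results, and outcome strings like "Passed" or "Timed Out"
# are assumptions made for this sketch.
predictions = [
    {"qid": "q1", "language": "Python", "outcome": "Passed"},
    {"qid": "q1", "language": "Python", "outcome": "Timed Out"},
    {"qid": "q2", "language": "Python", "outcome": "Passed"},
    {"qid": "q3", "language": "Go", "outcome": "Had Error"},
]

# Per-language outcome tally, as added to _eval_predictions.
outcome_counts = defaultdict(Counter)
for p in predictions:
    outcome_counts[p["language"]][p["outcome"]] += 1

# Convert counts to pct_{outcome} metrics, as added to _calculate_metrics.
# There the denominator is len(q_passed), the number of distinct questions
# for the language; this sketch counts distinct qids the same way.
metrics = {}
for lang, counts in outcome_counts.items():
    num_questions = len({p["qid"] for p in predictions if p["language"] == lang})
    for outcome, val in counts.items():
        metrics[f"{lang}/pct_{outcome}"] = val / num_questions

print(metrics)
# {'Python/pct_Passed': 1.0, 'Python/pct_Timed Out': 0.5, 'Go/pct_Had Error': 1.0}

Note the denominator: the numerator counts individual predictions while len(q_passed) counts questions, so with multiple samples per question these percentages are relative to questions, not samples.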