"""Pass@k evaluation of model-generated C++ code across the base, SFINAE, and concepts variants."""
import collections
import os

from typing import Literal

import concurrent.futures
import datasets
import evaluate
import itertools
import numpy as np

from .execute import check_correctness

_CITATION = """\
@InProceedings{huggingface:module,
title = {A great new module},
authors={huggingface, Inc.},
year={2020}
}
"""

_DESCRIPTION = """\
Evaluates model-generated C++ code by compiling each candidate and running it
against the reference test suite, then reports pass@k for the "base", "sfinae"
and "concepts" variants of the task.
"""

_KWARGS_DESCRIPTION = """
Computes pass@k for model-generated C++ code.
Args:
    predictions: list of candidate lists, one list per problem. Each candidate
        is a string containing C++ source code.
    references: list of references, one per problem. Each reference is a dict
        with a "tests" string and an "invalids" string used to check the
        candidates.
    cpp_type: which task variant to evaluate, one of "base", "sfinae" or
        "concepts".
    k: list of k values to compute pass@k for (default: [1, 10, 100]).
Returns:
    A tuple (pass_at_k, results): pass_at_k maps keys such as
    "base_run_passed@1" to scores, and results maps each task_id to the raw
    per-candidate outcomes returned by check_correctness.
"""

_WARNING = """
################################################################################
                                  !!!WARNING!!!
################################################################################
This metric executes untrusted model-generated code. Although it is highly
unlikely that model-generated code will do something overtly malicious in
response to this test suite, model-generated code may act destructively due to
a lack of model capability or alignment.
Users are strongly encouraged to sandbox this evaluation suite so that it
does not perform destructive actions on their host or network. For more
information on how OpenAI sandboxes its code, see the paper "Evaluating Large
Language Models Trained on Code" (https://arxiv.org/abs/2107.03374).
Once you have read this disclaimer and taken appropriate precautions, set the
environment variable HF_ALLOW_CODE_EVAL="1". Within Python you can do this
with:
>>> import os
>>> os.environ["HF_ALLOW_CODE_EVAL"] = "1"
################################################################################\
"""

_CLANG_WARNING = """
Please provide the environment variable 'GENERICIFY_CLANG' with the path of the
clang++ compiler. Version 15+ is required. Within Python you can do this
with:
>>> import os
>>> os.environ["GENERICIFY_CLANG"] = "/path/to/clang++"
"""


@evaluate.utils.file_utils.add_start_docstrings(
    _DESCRIPTION, _KWARGS_DESCRIPTION
)
class EvaluateGenericifyCpp(evaluate.Metric):
    """Compiles and runs generated C++ candidates and reports pass@k scores."""

    def _info(self):
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features(
                {
                    "predictions": datasets.Sequence(datasets.Value("string")),
                    "references": datasets.Features(
                        {
                            "tests": datasets.Value("string"),
                            "invalids": datasets.Value("string"),
                        }
                    ),
                }
            ),
            homepage="http://module.homepage",
            codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
            reference_urls=["http://path.to.reference.url/new_module"],
        )

    def _compute(
        self,
        *,
        predictions,
        references,
        cpp_type: Literal["base", "sfinae", "concepts"],
        k=[1, 10, 100],
    ):
        """Returns the scores"""
        num_workers = 4

        if os.getenv("HF_ALLOW_CODE_EVAL", default=0) != "1":
            raise ValueError(_WARNING)

        if os.getenv("GENERICIFY_CLANG", default=0) == 0:
            raise ValueError(_CLANG_WARNING)

        if os.name == "nt":
            raise NotImplementedError(
                "This metric is currently not supported on Windows."
            )
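
        # Fan each candidate out to a worker pool: one check_correctness call
        # per (task, candidate) pair, with completion_id distinguishing
        # candidates that belong to the same task.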
        with concurrent.futures.ThreadPoolExecutor(
            max_workers=num_workers
        ) as executor:
            futures = []
            completion_id = collections.Counter()
            results = collections.defaultdict(list)

            for task_id, (candidates, reference) in enumerate(
                zip(predictions, references)
            ):
                for candidate in candidates:
                    args = (
                        candidate,
                        reference,
                        cpp_type,
                        task_id,
                        completion_id[task_id],
                    )
                    future = executor.submit(check_correctness, *args)
                    futures.append(future)
                    completion_id[task_id] += 1

            for future in concurrent.futures.as_completed(futures):
                result = future.result()
                results[result["task_id"]].append(
                    (result["completion_id"], result)
                )

        totals = collections.defaultdict(list)
        corrects = collections.defaultdict(list)
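
        # Outcome fields recorded by check_correctness for the chosen C++ variant.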
        keys = {
            "base": [
                "base_run_passed",
                "base_run_compiled",
            ],
            "sfinae": [
                "sfinae_run_passed",
                "sfinae_run_compiled",
                "sfinae_constrain_passed",
            ],
            "concepts": [
                "concepts_run_passed",
                "concepts_run_compiled",
                "concepts_constrain_passed",
            ],
        }[cpp_type]
        for result in results.values():
            result.sort()
            for pt in keys:
                passed = [r[1][pt] for r in result]
                totals[pt].append(len(passed))
                corrects[pt].append(sum(passed))

        totals = {k: np.array(v) for k, v in totals.items()}
        corrects = {k: np.array(v) for k, v in corrects.items()}
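
        # Report pass@k only for k values that every problem has enough samples for.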
        ks = k
        pass_at_k = {
            f"{key}@{k}": estimate_pass_at_k(
                totals[key],
                corrects[key],
                k,
            ).mean()
            for key in totals.keys()
            for k in ks
            if (totals[key] >= k).all()
        }

        return pass_at_k, results
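

# pass@k below is the unbiased estimator from "Evaluating Large Language Models
# Trained on Code" (https://arxiv.org/abs/2107.03374): for a problem with n
# samples of which c pass, pass@k = 1 - C(n - c, k) / C(n, k).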
def estimate_pass_at_k(num_samples, num_correct, k) -> np.ndarray:
    """Estimates pass@k of each problem and returns them in an array."""

    def estimator(n: int, c: int) -> float:
        """Calculates 1 - comb(n - c, k) / comb(n, k)."""
        if n - c < k:
            return 1.0
        return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))
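
    # num_samples may be a single int (the same n for every problem) or a
    # per-problem sequence of sample counts.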
    if isinstance(num_samples, int):
        num_samples_it = itertools.repeat(num_samples, len(num_correct))
    else:
        assert len(num_samples) == len(num_correct)
        num_samples_it = iter(num_samples)

    return np.array(
        [estimator(int(n), int(c)) for n, c in zip(num_samples_it, num_correct)]
    )
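

# A minimal usage sketch (assumptions: this file is loaded as a local evaluate
# module via evaluate.load("<path to this module>"), and the HF_ALLOW_CODE_EVAL
# and GENERICIFY_CLANG environment variables are set as described in the
# warnings above; the candidate/test source variables are placeholders):
#
#     metric = evaluate.load("<path to this module>")
#     pass_at_k, results = metric.compute(
#         predictions=[[candidate_source_a, candidate_source_b]],
#         references=[{"tests": tests_source, "invalids": invalids_source}],
#         cpp_type="base",
#         k=[1],
#     )
#     print(pass_at_k)  # e.g. {"base_run_passed@1": ..., "base_run_compiled@1": ...}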