Upload folder using huggingface_hub
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +3 -0
- __pycache__/cloudgpt_aoai.cpython-310.pyc +0 -0
- __pycache__/rouge.cpython-310.pyc +0 -0
- ans.sh +26 -0
- arenaans.sh +6 -0
- battle.py +55 -0
- battle.sh +21 -0
- battlescore.py +105 -0
- bertencode.py +40 -0
- cleanans.py +10 -0
- cloudgpt-apim-token-cache.bin +3 -0
- cloudgpt_aoai.py +619 -0
- combine.py +11 -0
- config_sft_fhw.yaml +41 -0
- crux.sh +87 -0
- evalplus/.dockerignore +182 -0
- evalplus/.github/ISSUE_TEMPLATE/buggy_contract.yml +48 -0
- evalplus/.github/ISSUE_TEMPLATE/buggy_test.yml +49 -0
- evalplus/.github/ISSUE_TEMPLATE/config.yml +1 -0
- evalplus/.github/ISSUE_TEMPLATE/model_eval_request.yml +73 -0
- evalplus/.gitignore +182 -0
- evalplus/.pre-commit-config.yaml +20 -0
- evalplus/CITATION.cff +25 -0
- evalplus/Dockerfile +19 -0
- evalplus/LICENSE +205 -0
- evalplus/MANIFEST.in +1 -0
- evalplus/README.md +325 -0
- evalplus/build/lib/evalplus/__init__.py +4 -0
- evalplus/build/lib/evalplus/_version.py +16 -0
- evalplus/build/lib/evalplus/codegen.py +272 -0
- evalplus/build/lib/evalplus/config.py +16 -0
- evalplus/build/lib/evalplus/data/__init__.py +14 -0
- evalplus/build/lib/evalplus/data/humaneval.py +96 -0
- evalplus/build/lib/evalplus/data/mbpp.py +203 -0
- evalplus/build/lib/evalplus/data/utils.py +166 -0
- evalplus/build/lib/evalplus/eval/__init__.py +316 -0
- evalplus/build/lib/evalplus/eval/_special_oracle.py +55 -0
- evalplus/build/lib/evalplus/eval/utils.py +187 -0
- evalplus/build/lib/evalplus/evalperf.py +558 -0
- evalplus/build/lib/evalplus/evaluate.py +375 -0
- evalplus/build/lib/evalplus/gen/__init__.py +21 -0
- evalplus/build/lib/evalplus/gen/chatgpt_gen.py +78 -0
- evalplus/build/lib/evalplus/gen/mut_gen.py +30 -0
- evalplus/build/lib/evalplus/gen/type_mut.py +340 -0
- evalplus/build/lib/evalplus/gen/util/__init__.py +40 -0
- evalplus/build/lib/evalplus/gen/util/anthropic_request.py +47 -0
- evalplus/build/lib/evalplus/gen/util/openai_request.py +51 -0
- evalplus/build/lib/evalplus/inputgen.py +108 -0
- evalplus/build/lib/evalplus/lecacy_sanitize.py +201 -0
- evalplus/build/lib/evalplus/perf/__init__.py +0 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
evalplus/gallary/overview.png filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
evalplus/gallary/render.gif filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
nohup.out filter=lfs diff=lfs merge=lfs -text
|
__pycache__/cloudgpt_aoai.cpython-310.pyc
ADDED
|
Binary file (17.7 kB). View file
|
|
|
__pycache__/rouge.cpython-310.pyc
ADDED
|
Binary file (630 Bytes). View file
|
|
|
ans.sh
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#python vllmans.py --path /home/aiscuser/fhw/model_weights/llama --datapath /home/aiscuser/fhw/data/athene_python_7w.json
|
| 2 |
+
#python vllmans.py --path /home/aiscuser/fhw/model_weights/llama --datapath /home/aiscuser/fhw/data/deepseekcoder_python_7w.json
|
| 3 |
+
#python vllmans.py --path /home/aiscuser/fhw/model_weights/llama --datapath /home/aiscuser/fhw/data/llama_python_7w.json
|
| 4 |
+
#python vllmans.py --path /home/aiscuser/fhw/model_weights/llama --datapath /home/aiscuser/fhw/data/qwen_python_7w.json
|
| 5 |
+
#python vllmans.py --path /home/aiscuser/fhw/model_weights/llama --datapath /home/aiscuser/fhw/data/qwq_python_7w.json
|
| 6 |
+
#python vllmans.py --path /home/aiscuser/fhw/model_weights/deepseekcoder --datapath /home/aiscuser/fhw/data/athene_python_7w.json
|
| 7 |
+
#python vllmans.py --path /home/aiscuser/fhw/model_weights/deepseekcoder --datapath /home/aiscuser/fhw/data/deepseekcoder_python_7w.json
|
| 8 |
+
#python vllmans.py --path /home/aiscuser/fhw/model_weights/deepseekcoder --datapath /home/aiscuser/fhw/data/llama_python_7w.json
|
| 9 |
+
#python vllmans.py --path /home/aiscuser/fhw/model_weights/deepseekcoder --datapath /home/aiscuser/fhw/data/qwen_python_7w.json
|
| 10 |
+
#python vllmans.py --path /home/aiscuser/fhw/model_weights/deepseekcoder --datapath /home/aiscuser/fhw/data/qwq_python_7w.json
|
| 11 |
+
#python vllmans.py --path /home/aiscuser/fhw/model_weights/athene --datapath /home/aiscuser/fhw/data/athene_python_7w.json
|
| 12 |
+
#python vllmans.py --path /home/aiscuser/fhw/model_weights/athene --datapath /home/aiscuser/fhw/data/deepseekcoder_python_7w.json
|
| 13 |
+
#python vllmans.py --path /home/aiscuser/fhw/model_weights/athene --datapath /home/aiscuser/fhw/data/llama_python_7w.json
|
| 14 |
+
#python vllmans.py --path /home/aiscuser/fhw/model_weights/athene --datapath /home/aiscuser/fhw/data/qwen_python_7w.json
|
| 15 |
+
#python vllmans.py --path /home/aiscuser/fhw/model_weights/athene --datapath /home/aiscuser/fhw/data/qwq_python_7w.json
|
| 16 |
+
python vllmans.py --path /home/aiscuser/fhw/model_weights/qwen --datapath /home/aiscuser/fhw/data/athene_python_7w.json
|
| 17 |
+
#python vllmans.py --path /home/aiscuser/fhw/model_weights/qwen --datapath /home/aiscuser/fhw/data/deepseekcoder_python_7w.json
|
| 18 |
+
#python vllmans.py --path /home/aiscuser/fhw/model_weights/qwen --datapath /home/aiscuser/fhw/data/llama_python_7w.json
|
| 19 |
+
#python vllmans.py --path /home/aiscuser/fhw/model_weights/qwen --datapath /home/aiscuser/fhw/data/qwen_python_7w.json
|
| 20 |
+
#python vllmans.py --path /home/aiscuser/fhw/model_weights/qwen --datapath /home/aiscuser/fhw/data/qwq_python_7w.json
|
| 21 |
+
#python vllmans.py --path /home/aiscuser/fhw/model_weights/qwq --datapath /home/aiscuser/fhw/data/athene_python_7w.json
|
| 22 |
+
#python vllmans.py --path /home/aiscuser/fhw/model_weights/qwq --datapath /home/aiscuser/fhw/data/deepseekcoder_python_7w.json
|
| 23 |
+
#python vllmans.py --path /home/aiscuser/fhw/model_weights/qwq --datapath /home/aiscuser/fhw/data/llama_python_7w.json
|
| 24 |
+
#python vllmans.py --path /home/aiscuser/fhw/model_weights/qwq --datapath /home/aiscuser/fhw/data/qwen_python_7w.json
|
| 25 |
+
#python vllmans.py --path /home/aiscuser/fhw/model_weights/qwq --datapath /home/aiscuser/fhw/data/qwq_python_7w.json
|
| 26 |
+
python /data/local/zhangdi/DPO/DPO_train.py
|
arenaans.sh
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
python vllmarenaans.py --model target --judge athene --split 0
|
| 2 |
+
python vllmarenaans.py --model target --judge deepseekcoder --split 0
|
| 3 |
+
python vllmarenaans.py --model target --judge llama --split 0
|
| 4 |
+
python vllmarenaans.py --model target --judge qwen --split 0
|
| 5 |
+
python vllmarenaans.py --model target --judge qwq --split 0
|
| 6 |
+
python /data/local/zhangdi/DPO/DPO_train.py
|
battle.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from transformers import AutoTokenizer
|
| 2 |
+
from vllm import LLM, SamplingParams
|
| 3 |
+
import argparse
|
| 4 |
+
import json
|
| 5 |
+
from tqdm import tqdm
|
| 6 |
+
|
| 7 |
+
parser = argparse.ArgumentParser()
|
| 8 |
+
parser.add_argument('--path', type=str,help='模型路径')
|
| 9 |
+
parser.add_argument('--start', type=int,help='开始')
|
| 10 |
+
parser.add_argument('--end', type=int,help='终止')
|
| 11 |
+
args = parser.parse_args()
|
| 12 |
+
|
| 13 |
+
name = args.path[args.path.rfind('/')+1:]
|
| 14 |
+
|
| 15 |
+
fw = open(f"alignment-handbook/data/llama_battle_mistral_qwen_{args.start}_{args.end}.json", 'w+')
|
| 16 |
+
|
| 17 |
+
prompts = []
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
# Initialize the tokenizer
|
| 21 |
+
tokenizer = AutoTokenizer.from_pretrained(args.path, trust_remote_code=True)
|
| 22 |
+
f1 = open("alignment-handbook/data/llama_python_mistral_answer_0_70000_sft.json", "r+")
|
| 23 |
+
f2 = open("alignment-handbook/data/llama_python_qwen_answer_0_70000_sft.json", "r+")
|
| 24 |
+
lines1 = f1.readlines()[args.start:args.end]
|
| 25 |
+
lines2 = f2.readlines()[args.start:args.end]
|
| 26 |
+
t = 0
|
| 27 |
+
for line1, line2 in zip(lines1, lines2):
|
| 28 |
+
d1 = json.loads(line1)
|
| 29 |
+
d2 = json.loads(line2)
|
| 30 |
+
instruction = d1["messages"][0]["content"]
|
| 31 |
+
answer1 = d1["messages"][1]["content"]
|
| 32 |
+
answer2 = d2["messages"][1]["content"]
|
| 33 |
+
#print(answer1)
|
| 34 |
+
#print(answer2)
|
| 35 |
+
if t%2 == 0:
|
| 36 |
+
prompt = f"This is a chatbot arena. You will be given assistant A’s answer, and assistant B’s answer. Please act as an impartial judge and evaluate the capability of two AI assistants. You should choose the assistant that follows instructions and answers questions better. Your evaluation should consider factors such as helpfulness, relevance, and accuracy. Begin your evaluation by comparing the responses of the two assistants and provide a short explanation. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. DO NOT allow the LENGTH of the responses to influence your evaluation, choose the one that is straight-to-the-point instead of unnecessarily verbose. When the two candidates perform equally well, choose the SHORTER answer. Do not favor certain names of the assistants. Be as objective as possible. After providing your explanation concisely within 200 words, output your final verdict by strictly following this format: “[[A]]” if assistant A is better, “[[B]]” if assistant B is better, and “[[Tie]]” for a tie. Finish your judgement within 300 words.\n\n[User Question]\n{instruction}\n\n[The Start of Assistant A’s Answer]\n{answer1}\n[The End of Assistant A’s Answer]\n\n[The Start of Assistant B’s Answer]\n{answer2}\n[The End of Assistant B’s Answer]"
|
| 37 |
+
else:
|
| 38 |
+
prompt = f"This is a chatbot arena. You will be given assistant A’s answer, and assistant B’s answer. Please act as an impartial judge and evaluate the capability of two AI assistants. You should choose the assistant that follows instructions and answers questions better. Your evaluation should consider factors such as helpfulness, relevance, and accuracy. Begin your evaluation by comparing the responses of the two assistants and provide a short explanation. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. DO NOT allow the LENGTH of the responses to influence your evaluation, choose the one that is straight-to-the-point instead of unnecessarily verbose. When the two candidates perform equally well, choose the SHORTER answer. Do not favor certain names of the assistants. Be as objective as possible. After providing your explanation concisely within 200 words, output your final verdict by strictly following this format: “[[A]]” if assistant A is better, “[[B]]” if assistant B is better, and “[[Tie]]” for a tie. Finish your judgement within 300 words.\n\n[User Question]\n{instruction}\n\n[The Start of Assistant A’s Answer]\n{answer2}\n[The End of Assistant A’s Answer]\n\n[The Start of Assistant B’s Answer]\n{answer1}\n[The End of Assistant B’s Answer]"
|
| 39 |
+
messages = [{"role": "user", "content": prompt}]
|
| 40 |
+
text = tokenizer.apply_chat_template(
|
| 41 |
+
messages,
|
| 42 |
+
tokenize=False
|
| 43 |
+
)
|
| 44 |
+
prompts.append(text)
|
| 45 |
+
t = t + 1
|
| 46 |
+
|
| 47 |
+
# Input the model name or path. Can be GPTQ or AWQ models.
|
| 48 |
+
llm = LLM(args.path, dtype="float16", tensor_parallel_size=8, trust_remote_code=True, max_model_len=8192, enforce_eager=True)
|
| 49 |
+
sampling_params = SamplingParams(temperature=1.0, top_p=0.995, max_tokens=8192)
|
| 50 |
+
outputs = llm.generate(prompts=prompts, sampling_params=sampling_params)
|
| 51 |
+
t = 0
|
| 52 |
+
for output in outputs:
|
| 53 |
+
d = {"arena": output.outputs[0].text, "t": t}
|
| 54 |
+
t = t + 1
|
| 55 |
+
fw.write(json.dumps(d)+"\n")
|
battle.sh
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#python vllmbattle.py --judge /home/aiscuser/fhw/model_weights/athene --model /home/aiscuser/fhw/model_weights/deepseekcoder
|
| 2 |
+
#python vllmbattle.py --judge /home/aiscuser/fhw/model_weights/athene --model /home/aiscuser/fhw/model_weights/llama
|
| 3 |
+
#python vllmbattle.py --judge /home/aiscuser/fhw/model_weights/athene --model /home/aiscuser/fhw/model_weights/qwen
|
| 4 |
+
#python vllmbattle.py --judge /home/aiscuser/fhw/model_weights/athene --model /home/aiscuser/fhw/model_weights/qwq
|
| 5 |
+
#python vllmbattle.py --judge /home/aiscuser/fhw/model_weights/deepseekcoder --model /home/aiscuser/fhw/model_weights/athene
|
| 6 |
+
python vllmbattle.py --judge /home/aiscuser/fhw/model_weights/deepseekcoder --model /home/aiscuser/fhw/model_weights/llama
|
| 7 |
+
python vllmbattle.py --judge /home/aiscuser/fhw/model_weights/deepseekcoder --model /home/aiscuser/fhw/model_weights/qwen
|
| 8 |
+
python vllmbattle.py --judge /home/aiscuser/fhw/model_weights/deepseekcoder --model /home/aiscuser/fhw/model_weights/qwq
|
| 9 |
+
python vllmbattle.py --judge /home/aiscuser/fhw/model_weights/llama --model /home/aiscuser/fhw/model_weights/athene
|
| 10 |
+
python vllmbattle.py --judge /home/aiscuser/fhw/model_weights/llama --model /home/aiscuser/fhw/model_weights/deepseekcoder
|
| 11 |
+
#python vllmbattle.py --judge /home/aiscuser/fhw/model_weights/llama --model /home/aiscuser/fhw/model_weights/qwen
|
| 12 |
+
#python vllmbattle.py --judge /home/aiscuser/fhw/model_weights/llama --model /home/aiscuser/fhw/model_weights/qwq
|
| 13 |
+
#python vllmbattle.py --judge /home/aiscuser/fhw/model_weights/qwen --model /home/aiscuser/fhw/model_weights/athene
|
| 14 |
+
#python vllmbattle.py --judge /home/aiscuser/fhw/model_weights/qwen --model /home/aiscuser/fhw/model_weights/deepseekcoder
|
| 15 |
+
#python vllmbattle.py --judge /home/aiscuser/fhw/model_weights/qwen --model /home/aiscuser/fhw/model_weights/llama
|
| 16 |
+
#python vllmbattle.py --judge /home/aiscuser/fhw/model_weights/qwen --model /home/aiscuser/fhw/model_weights/qwq
|
| 17 |
+
#python vllmbattle.py --judge /home/aiscuser/fhw/model_weights/qwq --model /home/aiscuser/fhw/model_weights/athene
|
| 18 |
+
#python vllmbattle.py --judge /home/aiscuser/fhw/model_weights/qwq --model /home/aiscuser/fhw/model_weights/deepseekcoder
|
| 19 |
+
#python vllmbattle.py --judge /home/aiscuser/fhw/model_weights/qwq --model /home/aiscuser/fhw/model_weights/llama
|
| 20 |
+
#python vllmbattle.py --judge /home/aiscuser/fhw/model_weights/qwq --model /home/aiscuser/fhw/model_weights/qwen
|
| 21 |
+
python /data/local/zhangdi/DPO/DPO_train.py
|
battlescore.py
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from transformers import AutoTokenizer
|
| 2 |
+
from vllm import LLM, SamplingParams
|
| 3 |
+
import argparse
|
| 4 |
+
import json
|
| 5 |
+
from tqdm import tqdm
|
| 6 |
+
import re
|
| 7 |
+
def extract_score(judgement):
|
| 8 |
+
d = {}
|
| 9 |
+
extracted = re.findall(r"\[\[(\d*\.\d+|\d+)/10\]\]", judgement, re.S)
|
| 10 |
+
if len(extracted) > 0:
|
| 11 |
+
d["score"] = float(extracted[-1])
|
| 12 |
+
return int(d["score"])
|
| 13 |
+
extracted = re.findall(r"\[\[(\d*\.\d+|\d+)\]\]", judgement, re.S)
|
| 14 |
+
if len(extracted) > 0:
|
| 15 |
+
d["score"] = float(extracted[-1])
|
| 16 |
+
return int(d["score"])
|
| 17 |
+
extracted = re.findall(r"\*\*Score: \[(\d*\.\d+|\d+)/10\]\*\*", judgement, re.S)
|
| 18 |
+
if len(extracted) > 0:
|
| 19 |
+
d["score"] = float(extracted[-1])
|
| 20 |
+
return int(d["score"])
|
| 21 |
+
extracted = re.findall(r"\*\*Score: \[(\d*\.\d+|\d+)\]\*\*", judgement, re.S)
|
| 22 |
+
if len(extracted) > 0:
|
| 23 |
+
d["score"] = float(extracted[-1])
|
| 24 |
+
return int(d["score"])
|
| 25 |
+
extracted = re.findall(r"\*\*Score: (\d*\.\d+|\d+)/10\*\*", judgement, re.S)
|
| 26 |
+
if len(extracted) > 0:
|
| 27 |
+
d["score"] = float(extracted[-1])
|
| 28 |
+
return int(d["score"])
|
| 29 |
+
extracted = re.findall(r"\*\*Score: (\d*\.\d+|\d+)\*\*", judgement, re.S)
|
| 30 |
+
if len(extracted) > 0:
|
| 31 |
+
d["score"] = float(extracted[-1])
|
| 32 |
+
return int(d["score"])
|
| 33 |
+
extracted = re.findall(r"\*\*Score:\*\* (\d*\.\d+|\d+)/10", judgement, re.S)
|
| 34 |
+
if len(extracted) > 0:
|
| 35 |
+
d["score"] = float(extracted[-1])
|
| 36 |
+
return int(d["score"])
|
| 37 |
+
extracted = re.findall(r"\*\*Score:\*\* (\d*\.\d+|\d+)", judgement, re.S)
|
| 38 |
+
if len(extracted) > 0:
|
| 39 |
+
d["score"] = float(extracted[-1])
|
| 40 |
+
return int(d["score"])
|
| 41 |
+
extracted = re.findall(r"Score(.*?)", judgement, re.S)
|
| 42 |
+
if len(extracted) > 0:
|
| 43 |
+
judgement = extracted[-1]
|
| 44 |
+
extracted = re.findall(r"\d*\.\d+|\d+", judgement, re.S)
|
| 45 |
+
if len(extracted) > 0:
|
| 46 |
+
d["score"] = float(extracted[-1])
|
| 47 |
+
return int(d["score"])
|
| 48 |
+
return -1
|
| 49 |
+
parser = argparse.ArgumentParser()
|
| 50 |
+
parser.add_argument('--judgename', type=str,help='模型路径')
|
| 51 |
+
parser.add_argument('--modelnames', nargs='+')
|
| 52 |
+
args = parser.parse_args()
|
| 53 |
+
|
| 54 |
+
f = open(f"/home/aiscuser/fhw/data/{args.judgename}_filtered_by_answer.json", "r+")
|
| 55 |
+
ddd = json.loads(f.readlines()[0])
|
| 56 |
+
|
| 57 |
+
fr = open(f"/home/aiscuser/fhw/data/{args.judgename}_answerby_{args.judgename}.json", 'r+')
|
| 58 |
+
linesr = fr.readlines()
|
| 59 |
+
|
| 60 |
+
all_lines = []
|
| 61 |
+
for modelname in args.modelnames:
|
| 62 |
+
f = open(f"/home/aiscuser/fhw/data/{args.judgename}_judge_{modelname}.json", 'r+')
|
| 63 |
+
all_lines.append(f.readlines())
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
a, b, c, d = 0, 0, 0, 0
|
| 67 |
+
|
| 68 |
+
fw = open(f"/home/aiscuser/fhw/data/{args.judgename}_with_best_answer.json", "w+")
|
| 69 |
+
|
| 70 |
+
for i in tqdm(ddd[args.judgename]):
|
| 71 |
+
reference = json.loads(linesr[i])
|
| 72 |
+
da = json.loads(all_lines[0][a]) if a<len(all_lines[0]) else json.loads(all_lines[0][0])
|
| 73 |
+
db = json.loads(all_lines[1][b]) if b<len(all_lines[1]) else json.loads(all_lines[1][0])
|
| 74 |
+
dc = json.loads(all_lines[2][c]) if c<len(all_lines[2]) else json.loads(all_lines[2][0])
|
| 75 |
+
dd = json.loads(all_lines[3][d]) if d<len(all_lines[3]) else json.loads(all_lines[3][0])
|
| 76 |
+
|
| 77 |
+
da["battlescore"], db["battlescore"], dc["battlescore"], dd["battlescore"] = -1, -1, -1, -1
|
| 78 |
+
|
| 79 |
+
if da["index"] == i:
|
| 80 |
+
da["battlescore"] = extract_score(da["battle"])
|
| 81 |
+
a = a + 1
|
| 82 |
+
if db["index"] == i:
|
| 83 |
+
db["battlescore"] = extract_score(db["battle"])
|
| 84 |
+
b = b + 1
|
| 85 |
+
if dc["index"] == i:
|
| 86 |
+
dc["battlescore"] = extract_score(dc["battle"])
|
| 87 |
+
c = c + 1
|
| 88 |
+
if dd["index"] == i:
|
| 89 |
+
dd["battlescore"] = extract_score(dd["battle"])
|
| 90 |
+
d = d + 1
|
| 91 |
+
|
| 92 |
+
instruction = reference["instruction"]
|
| 93 |
+
scorelist = [da["battlescore"], db["battlescore"], dc["battlescore"], dd["battlescore"]]
|
| 94 |
+
maxscore = max(scorelist)
|
| 95 |
+
maxindex = scorelist.index(maxscore)
|
| 96 |
+
|
| 97 |
+
if maxscore>6:
|
| 98 |
+
bestname = args.modelnames[maxindex]
|
| 99 |
+
bestanswer = [da, db, dc, dd][maxindex]["response"]
|
| 100 |
+
else:
|
| 101 |
+
bestname = args.judgename
|
| 102 |
+
bestanswer = reference["response"]
|
| 103 |
+
fw.write(json.dumps({"instruction": instruction, "scorelist": scorelist, "bestname": bestname, "bestanswer": bestanswer, "modelnames": args.modelnames, "judgename": args.judgename})+"\n")
|
| 104 |
+
|
| 105 |
+
|
bertencode.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
This example starts multiple processes (1 per GPU), which encode
|
| 3 |
+
sentences in parallel. This gives a near linear speed-up
|
| 4 |
+
when encoding large text collections.
|
| 5 |
+
"""
|
| 6 |
+
from tqdm import tqdm
|
| 7 |
+
import logging
|
| 8 |
+
import json
|
| 9 |
+
import torch
|
| 10 |
+
from sentence_transformers import LoggingHandler, SentenceTransformer
|
| 11 |
+
|
| 12 |
+
logging.basicConfig(
|
| 13 |
+
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
|
| 14 |
+
)
|
| 15 |
+
|
| 16 |
+
# Important, you need to shield your code with if __name__. Otherwise, CUDA runs into issues when spawning new processes.
|
| 17 |
+
if __name__ == "__main__":
|
| 18 |
+
# Create a large list of 100k sentences
|
| 19 |
+
f = open("/home/aiscuser/fhw/data/qwq_python_selected.json","r+")
|
| 20 |
+
lines = f.readlines()
|
| 21 |
+
sentences = []
|
| 22 |
+
for line in tqdm(lines):
|
| 23 |
+
d= json.loads(line)
|
| 24 |
+
sentences.append(d["instruction"])
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
# Define the model
|
| 28 |
+
model = SentenceTransformer("/home/aiscuser/fhw/model_weights/all-roberta-large-v1")
|
| 29 |
+
|
| 30 |
+
# Start the multi-process pool on all available CUDA devices
|
| 31 |
+
pool = model.start_multi_process_pool(["cuda:0", "cuda:0", "cuda:0", "cuda:0", "cuda:0", "cuda:0", "cuda:0", "cuda:0", "cuda:0", "cuda:0", "cuda:1", "cuda:1", "cuda:1", "cuda:1", "cuda:1", "cuda:1", "cuda:1", "cuda:1", "cuda:1", "cuda:1", "cuda:2", "cuda:2", "cuda:2", "cuda:2", "cuda:2", "cuda:2", "cuda:2", "cuda:2", "cuda:2", "cuda:2", "cuda:3", "cuda:3", "cuda:3", "cuda:3", "cuda:3", "cuda:3", "cuda:3", "cuda:3", "cuda:3", "cuda:3", "cuda:4", "cuda:4", "cuda:4", "cuda:4", "cuda:4", "cuda:4", "cuda:4", "cuda:4", "cuda:4", "cuda:4", "cuda:5", "cuda:5", "cuda:5", "cuda:5", "cuda:5", "cuda:5", "cuda:5", "cuda:5", "cuda:5", "cuda:5", "cuda:6", "cuda:6", "cuda:6", "cuda:6", "cuda:6", "cuda:6", "cuda:6", "cuda:6", "cuda:6", "cuda:6", "cuda:7", "cuda:7", "cuda:7", "cuda:7", "cuda:7", "cuda:7", "cuda:7", "cuda:7", "cuda:7", "cuda:7", ])
|
| 32 |
+
|
| 33 |
+
# Compute the embeddings using the multi-process pool
|
| 34 |
+
emb = model.encode_multi_process(sentences, pool)
|
| 35 |
+
|
| 36 |
+
print("Embeddings computed. Shape:", emb.shape)
|
| 37 |
+
|
| 38 |
+
# Optional: Stop the processes in the pool
|
| 39 |
+
model.stop_multi_process_pool(pool)
|
| 40 |
+
torch.save(emb, "/home/aiscuser/fhw/embeddings/qwq_ins_embeddings.pt", pickle_protocol=4)
|
cleanans.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from tqdm import tqdm
|
| 2 |
+
import json
|
| 3 |
+
f = open("/home/aiscuser/fhw/data/all_instruct_with_answers.json", "r+")
|
| 4 |
+
fw = open("/home/aiscuser/fhw/data/all_instruct_with_answers_cleaned.json", "w+")
|
| 5 |
+
|
| 6 |
+
lines = f.readlines()
|
| 7 |
+
for line in lines:
|
| 8 |
+
d = json.loads(line)
|
| 9 |
+
d["bestanswer"] = d["bestanswer"].strip("<|start_header_id|>assistant<|end_header_id|>").strip("\n")
|
| 10 |
+
fw.write(json.dumps(d)+"\n")
|
cloudgpt-apim-token-cache.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fc1dedc3209b111ca9ac7676ffad159ce9bff625b0980c4897653414b796f3aa
|
| 3 |
+
size 300
|
cloudgpt_aoai.py
ADDED
|
@@ -0,0 +1,619 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
from typing import (
|
| 3 |
+
Any,
|
| 4 |
+
AsyncGenerator,
|
| 5 |
+
Callable,
|
| 6 |
+
Coroutine,
|
| 7 |
+
Literal,
|
| 8 |
+
Optional,
|
| 9 |
+
ParamSpec,
|
| 10 |
+
TypeVar,
|
| 11 |
+
cast,
|
| 12 |
+
Dict,
|
| 13 |
+
TYPE_CHECKING,
|
| 14 |
+
)
|
| 15 |
+
import sys, os
|
| 16 |
+
import contextlib
|
| 17 |
+
import functools
|
| 18 |
+
|
| 19 |
+
__all__ = [
|
| 20 |
+
"get_openai_token_provider",
|
| 21 |
+
"get_openai_token",
|
| 22 |
+
"get_openai_client",
|
| 23 |
+
"get_chat_completion",
|
| 24 |
+
"encode_image",
|
| 25 |
+
"cloudgpt_available_models",
|
| 26 |
+
]
|
| 27 |
+
|
| 28 |
+
TokenProvider = Callable[[], str]
|
| 29 |
+
AsyncTokenProvider = Callable[[], Coroutine[Any, Any, str]]
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def check_module():
|
| 33 |
+
try:
|
| 34 |
+
import openai, azure.identity.broker # type: ignore
|
| 35 |
+
|
| 36 |
+
del openai, azure.identity.broker
|
| 37 |
+
except ImportError:
|
| 38 |
+
print("Please install the required packages by running the following command:")
|
| 39 |
+
print("pip install openai azure-identity-broker --upgrade")
|
| 40 |
+
exit(1)
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
check_module()
|
| 44 |
+
|
| 45 |
+
import openai
|
| 46 |
+
from openai import OpenAI
|
| 47 |
+
|
| 48 |
+
_depRt = TypeVar("_depRt")
|
| 49 |
+
_depParam = ParamSpec("_depParam")
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def _deprecated(message: str):
|
| 53 |
+
def deprecated_decorator(
|
| 54 |
+
func: Callable[_depParam, _depRt]
|
| 55 |
+
) -> Callable[_depParam, _depRt]:
|
| 56 |
+
def deprecated_func(
|
| 57 |
+
*args: _depParam.args, **kwargs: _depParam.kwargs
|
| 58 |
+
) -> _depRt:
|
| 59 |
+
import traceback
|
| 60 |
+
|
| 61 |
+
print(
|
| 62 |
+
"\n ⚠️ \x1b[31m{} is a deprecated function. {}".format(
|
| 63 |
+
func.__name__, message
|
| 64 |
+
)
|
| 65 |
+
)
|
| 66 |
+
traceback.print_stack()
|
| 67 |
+
print("\x1b[0m")
|
| 68 |
+
return func(*args, **kwargs)
|
| 69 |
+
|
| 70 |
+
return deprecated_func
|
| 71 |
+
|
| 72 |
+
return deprecated_decorator
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def _validate_token(token: str) -> bool:
|
| 76 |
+
import requests
|
| 77 |
+
|
| 78 |
+
url = "https://cloudgpt-openai.azure-api.net/openai/ping"
|
| 79 |
+
|
| 80 |
+
headers = {
|
| 81 |
+
"Authorization": f"Bearer {token}",
|
| 82 |
+
}
|
| 83 |
+
try:
|
| 84 |
+
response = requests.get(url, headers=headers)
|
| 85 |
+
assert response.status_code == 200 and response.text == "OK", response.text
|
| 86 |
+
return True
|
| 87 |
+
except Exception as e:
|
| 88 |
+
print("Failed to validate token", e)
|
| 89 |
+
return False
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
@functools.lru_cache(maxsize=3)
|
| 93 |
+
def get_openai_token_provider(
|
| 94 |
+
token_cache_file: str = "cloudgpt-apim-token-cache.bin",
|
| 95 |
+
client_id: Optional[str] = None,
|
| 96 |
+
client_secret: Optional[str] = None,
|
| 97 |
+
use_azure_cli: Optional[bool] = None,
|
| 98 |
+
use_broker_login: Optional[bool] = None,
|
| 99 |
+
use_managed_identity: Optional[bool] = None,
|
| 100 |
+
use_device_code: Optional[bool] = None,
|
| 101 |
+
skip_access_validation: Optional[bool] = False,
|
| 102 |
+
**kwargs: Any,
|
| 103 |
+
) -> TokenProvider:
|
| 104 |
+
"""
|
| 105 |
+
Get a token provider function that could return a valid access token for CloudGPT OpenAI.
|
| 106 |
+
|
| 107 |
+
The return value is a function that should be used with AzureOpenAIClient constructor as azure_ad_token_provider parameter.
|
| 108 |
+
The following code snippet shows how to use it with AzureOpenAIClient:
|
| 109 |
+
|
| 110 |
+
```python
|
| 111 |
+
token_provider = get_openai_token_provider()
|
| 112 |
+
client = openai.AzureOpenAI(
|
| 113 |
+
api_version="2024-06-01",
|
| 114 |
+
azure_endpoint="https://cloudgpt-openai.azure-api.net/",
|
| 115 |
+
azure_ad_token_provider=token_provider,
|
| 116 |
+
)
|
| 117 |
+
```
|
| 118 |
+
|
| 119 |
+
Parameters
|
| 120 |
+
----------
|
| 121 |
+
token_cache_file : str, optional
|
| 122 |
+
path to the token cache file, by default 'cloudgpt-apim-token-cache.bin' in the current directory
|
| 123 |
+
client_id : Optional[str], optional
|
| 124 |
+
client id for AAD app, by default None
|
| 125 |
+
client_secret : Optional[str], optional
|
| 126 |
+
client secret for AAD app, by default None
|
| 127 |
+
use_azure_cli : Optional[bool], optional
|
| 128 |
+
use Azure CLI for authentication, by default None. If AzCli has been installed and logged in,
|
| 129 |
+
it will be used for authentication. This is recommended for headless environments and AzCLI takes
|
| 130 |
+
care of token cache and token refresh.
|
| 131 |
+
use_broker_login : Optional[bool], optional
|
| 132 |
+
use broker login for authentication, by default None.
|
| 133 |
+
If not specified, it will be enabled for known supported environments (e.g. Windows, macOS, WSL, VSCode),
|
| 134 |
+
but sometimes it may not always could cache the token for long-term usage.
|
| 135 |
+
In such cases, you can disable it by setting it to False.
|
| 136 |
+
use_managed_identity : Optional[bool], optional
|
| 137 |
+
use managed identity for authentication, by default None.
|
| 138 |
+
If not specified, it will use user assigned managed identity if client_id is specified,
|
| 139 |
+
For use system assigned managed identity, client_id could be None but need to set use_managed_identity to True.
|
| 140 |
+
use_device_code : Optional[bool], optional
|
| 141 |
+
use device code for authentication, by default None. If not specified, it will use interactive login on supported platform.
|
| 142 |
+
skip_access_validation : Optional[bool], optional
|
| 143 |
+
skip access token validation, by default False.
|
| 144 |
+
|
| 145 |
+
Returns
|
| 146 |
+
-------
|
| 147 |
+
TokenProvider
|
| 148 |
+
the token provider function that could return a valid access token for CloudGPT OpenAI
|
| 149 |
+
"""
|
| 150 |
+
import shutil
|
| 151 |
+
from azure.identity.broker import InteractiveBrowserBrokerCredential
|
| 152 |
+
from azure.identity import (
|
| 153 |
+
ManagedIdentityCredential,
|
| 154 |
+
ClientSecretCredential,
|
| 155 |
+
DeviceCodeCredential,
|
| 156 |
+
AuthenticationRecord,
|
| 157 |
+
AzureCliCredential,
|
| 158 |
+
)
|
| 159 |
+
from azure.identity import TokenCachePersistenceOptions
|
| 160 |
+
import msal # type: ignore
|
| 161 |
+
|
| 162 |
+
api_scope_base = "api://feb7b661-cac7-44a8-8dc1-163b63c23df2"
|
| 163 |
+
tenant_id = "72f988bf-86f1-41af-91ab-2d7cd011db47"
|
| 164 |
+
scope = api_scope_base + "/.default"
|
| 165 |
+
|
| 166 |
+
token_cache_option = TokenCachePersistenceOptions(
|
| 167 |
+
name=token_cache_file,
|
| 168 |
+
enable_persistence=True,
|
| 169 |
+
allow_unencrypted_storage=True,
|
| 170 |
+
)
|
| 171 |
+
|
| 172 |
+
def save_auth_record(auth_record: AuthenticationRecord):
|
| 173 |
+
try:
|
| 174 |
+
with open(token_cache_file, "w") as cache_file:
|
| 175 |
+
cache_file.write(auth_record.serialize())
|
| 176 |
+
except Exception as e:
|
| 177 |
+
print("failed to save auth record", e)
|
| 178 |
+
|
| 179 |
+
def load_auth_record() -> Optional[AuthenticationRecord]:
|
| 180 |
+
try:
|
| 181 |
+
if not os.path.exists(token_cache_file):
|
| 182 |
+
return None
|
| 183 |
+
with open(token_cache_file, "r") as cache_file:
|
| 184 |
+
return AuthenticationRecord.deserialize(cache_file.read())
|
| 185 |
+
except Exception as e:
|
| 186 |
+
print("failed to load auth record", e)
|
| 187 |
+
return None
|
| 188 |
+
|
| 189 |
+
auth_record: Optional[AuthenticationRecord] = load_auth_record()
|
| 190 |
+
|
| 191 |
+
current_auth_mode: Literal[
|
| 192 |
+
"client_secret",
|
| 193 |
+
"managed_identity",
|
| 194 |
+
"az_cli",
|
| 195 |
+
"interactive",
|
| 196 |
+
"device_code",
|
| 197 |
+
"none",
|
| 198 |
+
] = "none"
|
| 199 |
+
|
| 200 |
+
implicit_mode = not (
|
| 201 |
+
use_managed_identity or use_azure_cli or use_broker_login or use_device_code
|
| 202 |
+
)
|
| 203 |
+
|
| 204 |
+
if use_managed_identity or (implicit_mode and client_id is not None):
|
| 205 |
+
if not use_managed_identity and client_secret is not None:
|
| 206 |
+
assert (
|
| 207 |
+
client_id is not None
|
| 208 |
+
), "client_id must be specified with client_secret"
|
| 209 |
+
current_auth_mode = "client_secret"
|
| 210 |
+
identity = ClientSecretCredential(
|
| 211 |
+
client_id=client_id,
|
| 212 |
+
client_secret=client_secret,
|
| 213 |
+
tenant_id=tenant_id,
|
| 214 |
+
cache_persistence_options=token_cache_option,
|
| 215 |
+
authentication_record=auth_record,
|
| 216 |
+
)
|
| 217 |
+
else:
|
| 218 |
+
current_auth_mode = "managed_identity"
|
| 219 |
+
if client_id is None:
|
| 220 |
+
# using default managed identity
|
| 221 |
+
identity = ManagedIdentityCredential(
|
| 222 |
+
cache_persistence_options=token_cache_option,
|
| 223 |
+
)
|
| 224 |
+
else:
|
| 225 |
+
identity = ManagedIdentityCredential(
|
| 226 |
+
client_id=client_id,
|
| 227 |
+
cache_persistence_options=token_cache_option,
|
| 228 |
+
)
|
| 229 |
+
elif use_azure_cli or (implicit_mode and shutil.which("az") is not None):
|
| 230 |
+
current_auth_mode = "az_cli"
|
| 231 |
+
identity = AzureCliCredential(tenant_id=tenant_id)
|
| 232 |
+
else:
|
| 233 |
+
if implicit_mode:
|
| 234 |
+
# enable broker login for known supported envs if not specified using use_device_code
|
| 235 |
+
if sys.platform.startswith("darwin") or sys.platform.startswith("win32"):
|
| 236 |
+
use_broker_login = True
|
| 237 |
+
elif os.environ.get("WSL_DISTRO_NAME", "") != "":
|
| 238 |
+
use_broker_login = True
|
| 239 |
+
elif os.environ.get("TERM_PROGRAM", "") == "vscode":
|
| 240 |
+
use_broker_login = True
|
| 241 |
+
else:
|
| 242 |
+
use_broker_login = False
|
| 243 |
+
if use_broker_login:
|
| 244 |
+
current_auth_mode = "interactive"
|
| 245 |
+
identity = InteractiveBrowserBrokerCredential(
|
| 246 |
+
tenant_id="72f988bf-86f1-41af-91ab-2d7cd011db47",
|
| 247 |
+
cache_persistence_options=token_cache_option,
|
| 248 |
+
use_default_broker_account=True,
|
| 249 |
+
parent_window_handle=msal.PublicClientApplication.CONSOLE_WINDOW_HANDLE,
|
| 250 |
+
authentication_record=auth_record,
|
| 251 |
+
)
|
| 252 |
+
else:
|
| 253 |
+
current_auth_mode = "device_code"
|
| 254 |
+
identity = DeviceCodeCredential(
|
| 255 |
+
tenant_id="72f988bf-86f1-41af-91ab-2d7cd011db47",
|
| 256 |
+
cache_persistence_options=token_cache_option,
|
| 257 |
+
authentication_record=auth_record,
|
| 258 |
+
)
|
| 259 |
+
|
| 260 |
+
try:
|
| 261 |
+
auth_record = identity.authenticate(scopes=[scope])
|
| 262 |
+
if auth_record:
|
| 263 |
+
save_auth_record(auth_record)
|
| 264 |
+
|
| 265 |
+
except Exception as e:
|
| 266 |
+
print(
|
| 267 |
+
f"failed to acquire token from AAD for CloudGPT OpenAI using {current_auth_mode}",
|
| 268 |
+
e,
|
| 269 |
+
)
|
| 270 |
+
raise e
|
| 271 |
+
|
| 272 |
+
try:
|
| 273 |
+
from azure.identity import get_bearer_token_provider
|
| 274 |
+
|
| 275 |
+
token_provider = get_bearer_token_provider(identity, scope)
|
| 276 |
+
token_verified_cache: str = ""
|
| 277 |
+
|
| 278 |
+
def token_provider_wrapper():
|
| 279 |
+
nonlocal token_verified_cache
|
| 280 |
+
token = token_provider()
|
| 281 |
+
if token != token_verified_cache:
|
| 282 |
+
if not skip_access_validation:
|
| 283 |
+
assert _validate_token(token), "failed to validate token"
|
| 284 |
+
token_verified_cache = token
|
| 285 |
+
return token
|
| 286 |
+
|
| 287 |
+
return token_provider_wrapper
|
| 288 |
+
except Exception as e:
|
| 289 |
+
print("failed to acquire token from AAD for CloudGPT OpenAI", e)
|
| 290 |
+
raise e
|
| 291 |
+
|
| 292 |
+
|
| 293 |
+
@functools.lru_cache(maxsize=3)
|
| 294 |
+
async def async_get_openai_token_provider(
|
| 295 |
+
**kwargs: Any,
|
| 296 |
+
) -> AsyncTokenProvider:
|
| 297 |
+
# TODO: implement async version of get_openai_token_provider
|
| 298 |
+
token_provider = get_openai_token_provider(
|
| 299 |
+
**kwargs,
|
| 300 |
+
)
|
| 301 |
+
|
| 302 |
+
async def async_token_provider() -> str:
|
| 303 |
+
return token_provider()
|
| 304 |
+
|
| 305 |
+
return async_token_provider
|
| 306 |
+
|
| 307 |
+
|
| 308 |
+
@_deprecated(
|
| 309 |
+
"use get_openai_token_provider instead whenever possible "
|
| 310 |
+
"and use it as the azure_ad_token_provider parameter in AzureOpenAIClient constructor. "
|
| 311 |
+
"Please do not acquire token directly or use it elsewhere."
|
| 312 |
+
)
|
| 313 |
+
def get_openai_token(
|
| 314 |
+
token_cache_file: str = "cloudgpt-apim-token-cache.bin",
|
| 315 |
+
client_id: Optional[str] = None,
|
| 316 |
+
client_secret: Optional[str] = None,
|
| 317 |
+
use_azure_cli: Optional[bool] = None,
|
| 318 |
+
use_broker_login: Optional[bool] = None,
|
| 319 |
+
use_managed_identity: Optional[bool] = None,
|
| 320 |
+
use_device_code: Optional[bool] = None,
|
| 321 |
+
skip_access_validation: Optional[bool] = False,
|
| 322 |
+
**kwargs: Any,
|
| 323 |
+
) -> str:
|
| 324 |
+
"""
|
| 325 |
+
get access token for CloudGPT OpenAI
|
| 326 |
+
"""
|
| 327 |
+
return get_openai_token_provider(
|
| 328 |
+
token_cache_file=token_cache_file,
|
| 329 |
+
client_id=client_id,
|
| 330 |
+
client_secret=client_secret,
|
| 331 |
+
use_azure_cli=use_azure_cli,
|
| 332 |
+
use_broker_login=use_broker_login,
|
| 333 |
+
use_managed_identity=use_managed_identity,
|
| 334 |
+
use_device_code=use_device_code,
|
| 335 |
+
skip_access_validation=skip_access_validation,
|
| 336 |
+
**kwargs,
|
| 337 |
+
)()
|
| 338 |
+
|
| 339 |
+
|
| 340 |
+
"""
|
| 341 |
+
Available models for CloudGPT OpenAI
|
| 342 |
+
"""
|
| 343 |
+
cloudgpt_available_models = Literal[
|
| 344 |
+
"gpt-35-turbo-20220309",
|
| 345 |
+
"gpt-35-turbo-16k-20230613",
|
| 346 |
+
"gpt-35-turbo-20230613",
|
| 347 |
+
"gpt-35-turbo-1106",
|
| 348 |
+
"gpt-4-20230321",
|
| 349 |
+
"gpt-4-20230613",
|
| 350 |
+
"gpt-4-32k-20230321",
|
| 351 |
+
"gpt-4-32k-20230613",
|
| 352 |
+
"gpt-4-1106-preview",
|
| 353 |
+
"gpt-4-0125-preview",
|
| 354 |
+
"gpt-4-visual-preview",
|
| 355 |
+
"gpt-4-turbo-20240409",
|
| 356 |
+
"gpt-4o-20240513",
|
| 357 |
+
"gpt-4o-20240806",
|
| 358 |
+
"gpt-4o-mini-20240718",
|
| 359 |
+
]
|
| 360 |
+
|
| 361 |
+
cloudgpt_available_realtime_models = Literal["gpt-4o-realtime-preview-20241001"]
|
| 362 |
+
|
| 363 |
+
|
| 364 |
+
def encode_image(image_path: str, mime_type: Optional[str] = None) -> str:
|
| 365 |
+
"""
|
| 366 |
+
Utility function to encode image to base64 for using in OpenAI API
|
| 367 |
+
|
| 368 |
+
Parameters
|
| 369 |
+
----------
|
| 370 |
+
image_path : str
|
| 371 |
+
path to the image file
|
| 372 |
+
|
| 373 |
+
mime_type : Optional[str], optional
|
| 374 |
+
mime type of the image, by default None and will infer from the file extension if possible
|
| 375 |
+
|
| 376 |
+
Returns
|
| 377 |
+
-------
|
| 378 |
+
str
|
| 379 |
+
base64 encoded image url
|
| 380 |
+
"""
|
| 381 |
+
import base64
|
| 382 |
+
import mimetypes
|
| 383 |
+
|
| 384 |
+
file_name = os.path.basename(image_path)
|
| 385 |
+
mime_type = cast(
|
| 386 |
+
Optional[str],
|
| 387 |
+
mime_type if mime_type is not None else mimetypes.guess_type(file_name)[0], # type: ignore
|
| 388 |
+
)
|
| 389 |
+
with open(image_path, "rb") as image_file:
|
| 390 |
+
encoded_image = base64.b64encode(image_file.read()).decode("ascii")
|
| 391 |
+
|
| 392 |
+
if mime_type is None or not mime_type.startswith("image/"):
|
| 393 |
+
print(
|
| 394 |
+
"Warning: mime_type is not specified or not an image mime type. Defaulting to png."
|
| 395 |
+
)
|
| 396 |
+
mime_type = "image/png"
|
| 397 |
+
|
| 398 |
+
image_url = f"data:{mime_type};base64," + encoded_image
|
| 399 |
+
return image_url
|
| 400 |
+
|
| 401 |
+
|
| 402 |
+
@functools.lru_cache(maxsize=3)
|
| 403 |
+
def get_openai_client(
|
| 404 |
+
client_id: Optional[str] = None,
|
| 405 |
+
client_secret: Optional[str] = None,
|
| 406 |
+
use_azure_cli: Optional[bool] = None,
|
| 407 |
+
use_broker_login: Optional[bool] = None,
|
| 408 |
+
use_managed_identity: Optional[bool] = None,
|
| 409 |
+
use_device_code: Optional[bool] = None,
|
| 410 |
+
) -> OpenAI:
|
| 411 |
+
"""
|
| 412 |
+
Initialize OpenAI client for CloudGPT OpenAI.
|
| 413 |
+
|
| 414 |
+
All parameters are optional and will use the default authentication method if not specified.
|
| 415 |
+
|
| 416 |
+
Parameters
|
| 417 |
+
----------
|
| 418 |
+
client_id : Optional[str], optional
|
| 419 |
+
client id for AAD app, by default None
|
| 420 |
+
client_secret : Optional[str], optional
|
| 421 |
+
client secret for AAD app, by default None
|
| 422 |
+
use_azure_cli : Optional[bool], optional
|
| 423 |
+
use Azure CLI for authentication, by default None. If AzCli has been installed and logged in,
|
| 424 |
+
it will be used for authentication. This is recommended for headless environments and AzCLI takes
|
| 425 |
+
care of token cache and token refresh.
|
| 426 |
+
use_broker_login : Optional[bool], optional
|
| 427 |
+
use broker login for authentication, by default None.
|
| 428 |
+
If not specified, it will be enabled for known supported environments (e.g. Windows, macOS, WSL, VSCode),
|
| 429 |
+
but sometimes it may not always could cache the token for long-term usage.
|
| 430 |
+
In such cases, you can disable it by setting it to False.
|
| 431 |
+
use_managed_identity : Optional[bool], optional
|
| 432 |
+
use managed identity for authentication, by default None.
|
| 433 |
+
If not specified, it will use user assigned managed identity if client_id is specified,
|
| 434 |
+
For use system assigned managed identity, client_id could be None but need to set use_managed_identity to True.
|
| 435 |
+
use_device_code : Optional[bool], optional
|
| 436 |
+
use device code for authentication, by default None. If not specified, it will use interactive login on supported platform.
|
| 437 |
+
|
| 438 |
+
Returns
|
| 439 |
+
-------
|
| 440 |
+
OpenAI
|
| 441 |
+
OpenAI client for CloudGPT OpenAI. Check https://github.com/openai/openai-python for more details.
|
| 442 |
+
"""
|
| 443 |
+
token_provider = get_openai_token_provider(
|
| 444 |
+
client_id=client_id,
|
| 445 |
+
client_secret=client_secret,
|
| 446 |
+
use_azure_cli=use_azure_cli,
|
| 447 |
+
use_broker_login=use_broker_login,
|
| 448 |
+
use_managed_identity=use_managed_identity,
|
| 449 |
+
use_device_code=use_device_code,
|
| 450 |
+
)
|
| 451 |
+
print(token_provider())
|
| 452 |
+
client = openai.AzureOpenAI(
|
| 453 |
+
api_version="2024-06-01",
|
| 454 |
+
azure_endpoint="https://cloudgpt-openai.azure-api.net/",
|
| 455 |
+
azure_ad_token_provider=token_provider,
|
| 456 |
+
)
|
| 457 |
+
return client
|
| 458 |
+
|
| 459 |
+
|
| 460 |
+
def get_chat_completion(
|
| 461 |
+
model: Optional[cloudgpt_available_models] = None,
|
| 462 |
+
client_id: Optional[str] = None,
|
| 463 |
+
client_secret: Optional[str] = None,
|
| 464 |
+
use_azure_cli: Optional[bool] = None,
|
| 465 |
+
use_broker_login: Optional[bool] = None,
|
| 466 |
+
use_managed_identity: Optional[bool] = None,
|
| 467 |
+
use_device_code: Optional[bool] = None,
|
| 468 |
+
**kwargs: Any,
|
| 469 |
+
):
|
| 470 |
+
"""
|
| 471 |
+
Helper function to get chat completion from OpenAI API
|
| 472 |
+
"""
|
| 473 |
+
|
| 474 |
+
engine: Optional[str] = kwargs.get("engine")
|
| 475 |
+
|
| 476 |
+
model_name: Any = model
|
| 477 |
+
if model_name is None:
|
| 478 |
+
if engine is None:
|
| 479 |
+
raise ValueError("model name must be specified by 'model' parameter")
|
| 480 |
+
model_name = engine
|
| 481 |
+
|
| 482 |
+
if "engine" in kwargs:
|
| 483 |
+
del kwargs["engine"]
|
| 484 |
+
|
| 485 |
+
client = get_openai_client(
|
| 486 |
+
client_id=client_id,
|
| 487 |
+
client_secret=client_secret,
|
| 488 |
+
use_azure_cli=use_azure_cli,
|
| 489 |
+
use_broker_login=use_broker_login,
|
| 490 |
+
use_managed_identity=use_managed_identity,
|
| 491 |
+
use_device_code=use_device_code,
|
| 492 |
+
)
|
| 493 |
+
|
| 494 |
+
response: Any = client.completions.create(model=model_name, **kwargs)
|
| 495 |
+
|
| 496 |
+
return response
|
| 497 |
+
|
| 498 |
+
|
| 499 |
+
def _check_rtclient():
|
| 500 |
+
try:
|
| 501 |
+
import rtclient # type: ignore
|
| 502 |
+
|
| 503 |
+
del rtclient
|
| 504 |
+
except ImportError:
|
| 505 |
+
raise ImportError(
|
| 506 |
+
f"rtclient package is required when using realtime API`. Please install it by running \n"
|
| 507 |
+
"pip install https://github.com/Azure-Samples/aoai-realtime-audio-sdk/releases/download/py%2Fv0.5.1/rtclient-0.5.1-py3-none-any.whl"
|
| 508 |
+
)
|
| 509 |
+
return True
|
| 510 |
+
|
| 511 |
+
|
| 512 |
+
if TYPE_CHECKING:
|
| 513 |
+
from rtclient import RTClient, RTLowLevelClient
|
| 514 |
+
|
| 515 |
+
|
| 516 |
+
async def get_realtime_low_level_client(
|
| 517 |
+
model: cloudgpt_available_realtime_models = "gpt-4o-realtime-preview-20241001",
|
| 518 |
+
**kwargs: Any,
|
| 519 |
+
) -> RTLowLevelClient:
|
| 520 |
+
"""
|
| 521 |
+
Get realtime client with low level API for fined grained control
|
| 522 |
+
|
| 523 |
+
Usage:
|
| 524 |
+
```python
|
| 525 |
+
async with await get_realtime_low_level_client() as client:
|
| 526 |
+
# use client
|
| 527 |
+
pass
|
| 528 |
+
```
|
| 529 |
+
"""
|
| 530 |
+
assert _check_rtclient()
|
| 531 |
+
from rtclient import RTLowLevelClient
|
| 532 |
+
|
| 533 |
+
class CloudGPT_AOAI_RTLowLevelClient(RTLowLevelClient):
|
| 534 |
+
def __init__(
|
| 535 |
+
self,
|
| 536 |
+
token_provider: AsyncTokenProvider,
|
| 537 |
+
url: str = "https://cloudgpt-openai.azure-api.net/",
|
| 538 |
+
azure_deployment: cloudgpt_available_realtime_models | None = None,
|
| 539 |
+
):
|
| 540 |
+
self._async_token_provider = token_provider
|
| 541 |
+
|
| 542 |
+
from azure.core.credentials import AzureKeyCredential
|
| 543 |
+
|
| 544 |
+
key_credential = AzureKeyCredential("placeholder")
|
| 545 |
+
|
| 546 |
+
super().__init__(
|
| 547 |
+
url=url,
|
| 548 |
+
key_credential=key_credential,
|
| 549 |
+
azure_deployment=azure_deployment,
|
| 550 |
+
)
|
| 551 |
+
|
| 552 |
+
async def _get_auth(self) -> Dict[str, str]:
|
| 553 |
+
token = await self._async_token_provider()
|
| 554 |
+
return {"Authorization": f"Bearer {token}"}
|
| 555 |
+
|
| 556 |
+
token_provider = await async_get_openai_token_provider(**kwargs)
|
| 557 |
+
return CloudGPT_AOAI_RTLowLevelClient(
|
| 558 |
+
token_provider=token_provider,
|
| 559 |
+
azure_deployment=model,
|
| 560 |
+
)
|
| 561 |
+
|
| 562 |
+
|
| 563 |
+
async def get_realtime_client(
|
| 564 |
+
model: cloudgpt_available_realtime_models = "gpt-4o-realtime-preview-20241001",
|
| 565 |
+
**kwargs: Any,
|
| 566 |
+
) -> RTClient:
|
| 567 |
+
"""
|
| 568 |
+
Get realtime client with high level API for simplified usage
|
| 569 |
+
|
| 570 |
+
Usage:
|
| 571 |
+
```python
|
| 572 |
+
async with await get_realtime_client() as client:
|
| 573 |
+
# use client
|
| 574 |
+
pass
|
| 575 |
+
```
|
| 576 |
+
"""
|
| 577 |
+
assert _check_rtclient()
|
| 578 |
+
from rtclient import RTClient, MessageQueueWithError, Session
|
| 579 |
+
|
| 580 |
+
class CloudGPT_AOAI_RTClient(RTClient):
|
| 581 |
+
def __init__(
|
| 582 |
+
self,
|
| 583 |
+
low_level_client: Optional[RTLowLevelClient] = None,
|
| 584 |
+
):
|
| 585 |
+
self._client = low_level_client
|
| 586 |
+
|
| 587 |
+
self._message_queue = MessageQueueWithError(
|
| 588 |
+
receive_delegate=self._receive_message,
|
| 589 |
+
error_predicate=lambda m: m is not None and (m.type == "error"),
|
| 590 |
+
)
|
| 591 |
+
|
| 592 |
+
self.session: Optional[Session] = None
|
| 593 |
+
|
| 594 |
+
self._response_map: dict[str, str] = {}
|
| 595 |
+
|
| 596 |
+
low_level_client = await get_realtime_low_level_client(model=model, **kwargs)
|
| 597 |
+
return CloudGPT_AOAI_RTClient(low_level_client=low_level_client)
|
| 598 |
+
|
| 599 |
+
|
| 600 |
+
def _test_call(**kwargs: Any):
|
| 601 |
+
test_message = "What is the content?"
|
| 602 |
+
|
| 603 |
+
client = get_openai_client(**kwargs)
|
| 604 |
+
|
| 605 |
+
response = client.chat.completions.create(
|
| 606 |
+
model="gpt-4o-mini-20240718",
|
| 607 |
+
messages=[{"role": "user", "content": test_message}],
|
| 608 |
+
temperature=0.7,
|
| 609 |
+
max_tokens=100,
|
| 610 |
+
top_p=0.95,
|
| 611 |
+
frequency_penalty=0,
|
| 612 |
+
presence_penalty=0,
|
| 613 |
+
)
|
| 614 |
+
|
| 615 |
+
print(response.choices[0].message)
|
| 616 |
+
|
| 617 |
+
|
| 618 |
+
if __name__ == "__main__":
|
| 619 |
+
_test_call(use_broker_login=True)
|
combine.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import os
|
| 3 |
+
names = os.listdir("/home/aiscuser/fhw/data")
|
| 4 |
+
lines = []
|
| 5 |
+
for name in names:
|
| 6 |
+
if "_with_best_answer" in name:
|
| 7 |
+
f = open(f"/home/aiscuser/fhw/data/{name}", "r+")
|
| 8 |
+
lines.extend(f.readlines())
|
| 9 |
+
fw = open("/home/aiscuser/fhw/data/all_instruct_with_answers.json", "w+")
|
| 10 |
+
for line in lines:
|
| 11 |
+
fw.write(line)
|
config_sft_fhw.yaml
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Model arguments
|
| 2 |
+
model_name_or_path: mistralai/Mistral-7B-v0.1
|
| 3 |
+
model_revision: main
|
| 4 |
+
torch_dtype: bfloat16
|
| 5 |
+
attn_implementation: flash_attention_2
|
| 6 |
+
|
| 7 |
+
# Data training arguments
|
| 8 |
+
chat_template: "{% if not add_generation_prompt is defined %}\n{% set add_generation_prompt = false %}\n{% endif %}\n{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set ns.found = true -%}\n {%- endif -%}\n{%- endfor -%}\n{{bos_token}}{%- if not ns.found -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\\n'}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n {%- else %}\n {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{% if add_generation_prompt %}\n{{'### Response:'}}\n{% endif %}"
|
| 9 |
+
dataset_mixer:
|
| 10 |
+
data/my: 1.0
|
| 11 |
+
dataset_splits:
|
| 12 |
+
- train
|
| 13 |
+
preprocessing_num_workers: 128
|
| 14 |
+
|
| 15 |
+
# SFT trainer config
|
| 16 |
+
bf16: true
|
| 17 |
+
do_eval: False
|
| 18 |
+
eval_strategy: epoch
|
| 19 |
+
gradient_accumulation_steps: 1
|
| 20 |
+
gradient_checkpointing: true
|
| 21 |
+
gradient_checkpointing_kwargs:
|
| 22 |
+
use_reentrant: False
|
| 23 |
+
learning_rate: 1.0e-05
|
| 24 |
+
log_level: info
|
| 25 |
+
logging_steps: 5
|
| 26 |
+
logging_strategy: steps
|
| 27 |
+
lr_scheduler_type: cosine
|
| 28 |
+
max_seq_length: 4096
|
| 29 |
+
num_train_epochs: 3
|
| 30 |
+
output_dir: trained_models/deepseekcoder
|
| 31 |
+
overwrite_output_dir: true
|
| 32 |
+
per_device_eval_batch_size: 8
|
| 33 |
+
per_device_train_batch_size: 8
|
| 34 |
+
push_to_hub: true
|
| 35 |
+
remove_unused_columns: true
|
| 36 |
+
report_to:
|
| 37 |
+
- tensorboard
|
| 38 |
+
save_strategy: "epoch"
|
| 39 |
+
save_total_limit: 5
|
| 40 |
+
seed: 42
|
| 41 |
+
warmup_ratio: 0.1
|
crux.sh
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
python main.py \
|
| 2 |
+
--model /home/aiscuser/fhw/model_weights/warriordeep448/ \
|
| 3 |
+
--trust_remote_code \
|
| 4 |
+
--tasks output_prediction \
|
| 5 |
+
--batch_size 20 \
|
| 6 |
+
--n_samples 10 \
|
| 7 |
+
--max_length_generation 4096 \
|
| 8 |
+
--precision fp16 \
|
| 9 |
+
--temperature 0.8 \
|
| 10 |
+
--save_generations \
|
| 11 |
+
--save_generations_path model_generations_raw/warriordeep448_temp0.8_output/shard_0.json \
|
| 12 |
+
--shuffle \
|
| 13 |
+
--tensor_parallel_size 8
|
| 14 |
+
|
| 15 |
+
python main.py \
|
| 16 |
+
--model /home/aiscuser/fhw/model_weights/warriordeep448/ \
|
| 17 |
+
--trust_remote_code \
|
| 18 |
+
--tasks output_prediction \
|
| 19 |
+
--batch_size 20 \
|
| 20 |
+
--n_samples 10 \
|
| 21 |
+
--max_length_generation 4096 \
|
| 22 |
+
--precision fp16 \
|
| 23 |
+
--temperature 0.2 \
|
| 24 |
+
--save_generations \
|
| 25 |
+
--save_generations_path model_generations_raw/warriordeep448_temp0.2_output/shard_0.json \
|
| 26 |
+
--shuffle \
|
| 27 |
+
--tensor_parallel_size 8
|
| 28 |
+
|
| 29 |
+
python main.py \
|
| 30 |
+
--model /home/aiscuser/fhw/model_weights/warriordeep448/ \
|
| 31 |
+
--trust_remote_code \
|
| 32 |
+
--tasks output_prediction \
|
| 33 |
+
--batch_size 20 \
|
| 34 |
+
--n_samples 10 \
|
| 35 |
+
--max_length_generation 4096 \
|
| 36 |
+
--precision fp16 \
|
| 37 |
+
--temperature 0.8 \
|
| 38 |
+
--save_generations \
|
| 39 |
+
--save_generations_path model_generations_raw/warriordeep448+cot_temp0.8_output/shard_0.json \
|
| 40 |
+
--cot \
|
| 41 |
+
--shuffle \
|
| 42 |
+
--tensor_parallel_size 8
|
| 43 |
+
|
| 44 |
+
python main.py \
|
| 45 |
+
--model /home/aiscuser/fhw/model_weights/warriordeep448/ \
|
| 46 |
+
--trust_remote_code \
|
| 47 |
+
--tasks output_prediction \
|
| 48 |
+
--batch_size 20 \
|
| 49 |
+
--n_samples 10 \
|
| 50 |
+
--max_length_generation 4096 \
|
| 51 |
+
--precision fp16 \
|
| 52 |
+
--temperature 0.2 \
|
| 53 |
+
--save_generations \
|
| 54 |
+
--save_generations_path model_generations_raw/warriordeep448+cot_temp0.2_output/shard_0.json \
|
| 55 |
+
--cot \
|
| 56 |
+
--shuffle \
|
| 57 |
+
--tensor_parallel_size 8
|
| 58 |
+
|
| 59 |
+
python main.py \
|
| 60 |
+
--model /home/aiscuser/fhw/model_weights/warriordeep448/ \
|
| 61 |
+
--trust_remote_code \
|
| 62 |
+
--tasks input_prediction \
|
| 63 |
+
--batch_size 20 \
|
| 64 |
+
--n_samples 10 \
|
| 65 |
+
--max_length_generation 4096 \
|
| 66 |
+
--precision fp16 \
|
| 67 |
+
--temperature 0.8 \
|
| 68 |
+
--save_generations \
|
| 69 |
+
--save_generations_path model_generations_raw/warriordeep448+cot_temp0.8_input/shard_0.json \
|
| 70 |
+
--cot \
|
| 71 |
+
--shuffle \
|
| 72 |
+
--tensor_parallel_size 8
|
| 73 |
+
|
| 74 |
+
python main.py \
|
| 75 |
+
--model /home/aiscuser/fhw/model_weights/warriordeep448/ \
|
| 76 |
+
--trust_remote_code \
|
| 77 |
+
--tasks input_prediction \
|
| 78 |
+
--batch_size 20 \
|
| 79 |
+
--n_samples 10 \
|
| 80 |
+
--max_length_generation 4096 \
|
| 81 |
+
--precision fp16 \
|
| 82 |
+
--temperature 0.2 \
|
| 83 |
+
--save_generations \
|
| 84 |
+
--save_generations_path model_generations_raw/warriordeep448+cot_temp0.2_input/shard_0.json \
|
| 85 |
+
--cot \
|
| 86 |
+
--shuffle \
|
| 87 |
+
--tensor_parallel_size 8
|
evalplus/.dockerignore
ADDED
|
@@ -0,0 +1,182 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Byte-compiled / optimized / DLL files
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
|
| 6 |
+
# C extensions
|
| 7 |
+
*.so
|
| 8 |
+
|
| 9 |
+
# Distribution / packaging
|
| 10 |
+
.Python
|
| 11 |
+
build/
|
| 12 |
+
develop-eggs/
|
| 13 |
+
dist/
|
| 14 |
+
downloads/
|
| 15 |
+
eggs/
|
| 16 |
+
.eggs/
|
| 17 |
+
lib/
|
| 18 |
+
lib64/
|
| 19 |
+
parts/
|
| 20 |
+
sdist/
|
| 21 |
+
var/
|
| 22 |
+
wheels/
|
| 23 |
+
share/python-wheels/
|
| 24 |
+
*.egg-info/
|
| 25 |
+
.installed.cfg
|
| 26 |
+
*.egg
|
| 27 |
+
MANIFEST
|
| 28 |
+
|
| 29 |
+
# PyInstaller
|
| 30 |
+
# Usually these files are written by a python script from a template
|
| 31 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
| 32 |
+
*.manifest
|
| 33 |
+
*.spec
|
| 34 |
+
|
| 35 |
+
# Installer logs
|
| 36 |
+
pip-log.txt
|
| 37 |
+
pip-delete-this-directory.txt
|
| 38 |
+
|
| 39 |
+
# Unit test / coverage reports
|
| 40 |
+
htmlcov/
|
| 41 |
+
.tox/
|
| 42 |
+
.nox/
|
| 43 |
+
.coverage
|
| 44 |
+
.coverage.*
|
| 45 |
+
.cache
|
| 46 |
+
nosetests.xml
|
| 47 |
+
coverage.xml
|
| 48 |
+
*.cover
|
| 49 |
+
*.py,cover
|
| 50 |
+
.hypothesis/
|
| 51 |
+
.pytest_cache/
|
| 52 |
+
cover/
|
| 53 |
+
|
| 54 |
+
# Translations
|
| 55 |
+
*.mo
|
| 56 |
+
*.pot
|
| 57 |
+
|
| 58 |
+
# Django stuff:
|
| 59 |
+
*.log
|
| 60 |
+
local_settings.py
|
| 61 |
+
db.sqlite3
|
| 62 |
+
db.sqlite3-journal
|
| 63 |
+
|
| 64 |
+
# Flask stuff:
|
| 65 |
+
instance/
|
| 66 |
+
.webassets-cache
|
| 67 |
+
|
| 68 |
+
# Scrapy stuff:
|
| 69 |
+
.scrapy
|
| 70 |
+
|
| 71 |
+
# Sphinx documentation
|
| 72 |
+
docs/_build/
|
| 73 |
+
|
| 74 |
+
# PyBuilder
|
| 75 |
+
.pybuilder/
|
| 76 |
+
target/
|
| 77 |
+
|
| 78 |
+
# Jupyter Notebook
|
| 79 |
+
.ipynb_checkpoints
|
| 80 |
+
|
| 81 |
+
# IPython
|
| 82 |
+
profile_default/
|
| 83 |
+
ipython_config.py
|
| 84 |
+
|
| 85 |
+
# pyenv
|
| 86 |
+
# For a library or package, you might want to ignore these files since the code is
|
| 87 |
+
# intended to run in multiple environments; otherwise, check them in:
|
| 88 |
+
# .python-version
|
| 89 |
+
|
| 90 |
+
# pipenv
|
| 91 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
| 92 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
| 93 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
| 94 |
+
# install all needed dependencies.
|
| 95 |
+
#Pipfile.lock
|
| 96 |
+
|
| 97 |
+
# poetry
|
| 98 |
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
| 99 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
| 100 |
+
# commonly ignored for libraries.
|
| 101 |
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
| 102 |
+
#poetry.lock
|
| 103 |
+
|
| 104 |
+
# pdm
|
| 105 |
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
| 106 |
+
#pdm.lock
|
| 107 |
+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
| 108 |
+
# in version control.
|
| 109 |
+
# https://pdm.fming.dev/#use-with-ide
|
| 110 |
+
.pdm.toml
|
| 111 |
+
|
| 112 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
| 113 |
+
__pypackages__/
|
| 114 |
+
|
| 115 |
+
# Celery stuff
|
| 116 |
+
celerybeat-schedule
|
| 117 |
+
celerybeat.pid
|
| 118 |
+
|
| 119 |
+
# SageMath parsed files
|
| 120 |
+
*.sage.py
|
| 121 |
+
|
| 122 |
+
# Environments
|
| 123 |
+
.env
|
| 124 |
+
.venv
|
| 125 |
+
env/
|
| 126 |
+
venv/
|
| 127 |
+
ENV/
|
| 128 |
+
env.bak/
|
| 129 |
+
venv.bak/
|
| 130 |
+
|
| 131 |
+
# Spyder project settings
|
| 132 |
+
.spyderproject
|
| 133 |
+
.spyproject
|
| 134 |
+
|
| 135 |
+
# Rope project settings
|
| 136 |
+
.ropeproject
|
| 137 |
+
|
| 138 |
+
# mkdocs documentation
|
| 139 |
+
/site
|
| 140 |
+
|
| 141 |
+
# mypy
|
| 142 |
+
.mypy_cache/
|
| 143 |
+
.dmypy.json
|
| 144 |
+
dmypy.json
|
| 145 |
+
|
| 146 |
+
# Pyre type checker
|
| 147 |
+
.pyre/
|
| 148 |
+
|
| 149 |
+
# pytype static type analyzer
|
| 150 |
+
.pytype/
|
| 151 |
+
|
| 152 |
+
# Cython debug symbols
|
| 153 |
+
cython_debug/
|
| 154 |
+
|
| 155 |
+
# PyCharm
|
| 156 |
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
| 157 |
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
| 158 |
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
| 159 |
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
| 160 |
+
# nuclear option because steven uses PyCharm.
|
| 161 |
+
.idea/
|
| 162 |
+
|
| 163 |
+
# VSCode
|
| 164 |
+
.vscode/
|
| 165 |
+
|
| 166 |
+
# EvalPlus specific
|
| 167 |
+
EvalPlus/
|
| 168 |
+
backup/
|
| 169 |
+
passrate.p*
|
| 170 |
+
min_cov_dir/
|
| 171 |
+
HumanEvalPlus*.gz
|
| 172 |
+
MbppPlus*.gz
|
| 173 |
+
evalplus/_version.py
|
| 174 |
+
*mbpp.json
|
| 175 |
+
*.jsonl
|
| 176 |
+
*.json
|
| 177 |
+
*.png
|
| 178 |
+
*.pdf
|
| 179 |
+
trash-bin
|
| 180 |
+
.bak
|
| 181 |
+
evalplus.github.io
|
| 182 |
+
evalplus_results/
|
evalplus/.github/ISSUE_TEMPLATE/buggy_contract.yml
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: "🐛 Report Bad Contract"
|
| 2 |
+
description: Report to us that certain program contract should be repaired.
|
| 3 |
+
title: "🐛 [TestRemoval] - <TASK_ID> <WHY>"
|
| 4 |
+
labels: ["program contract"]
|
| 5 |
+
body:
|
| 6 |
+
- type: input
|
| 7 |
+
id: version
|
| 8 |
+
attributes:
|
| 9 |
+
label: "EvalPlus version"
|
| 10 |
+
description: What is the version of EvalPlus? You can find it by running `pip show evalplus`.
|
| 11 |
+
placeholder: For example, 0.1.0
|
| 12 |
+
validations:
|
| 13 |
+
required: true
|
| 14 |
+
- type: input
|
| 15 |
+
id: cache
|
| 16 |
+
attributes:
|
| 17 |
+
label: "Output of running `ls ~/.cache/evalplus`"
|
| 18 |
+
validations:
|
| 19 |
+
required: true
|
| 20 |
+
- type: input
|
| 21 |
+
id: task_id
|
| 22 |
+
attributes:
|
| 23 |
+
label: "Task ID of the programming task"
|
| 24 |
+
placeholder: HumanEval/[??]
|
| 25 |
+
validations:
|
| 26 |
+
required: true
|
| 27 |
+
- type: textarea
|
| 28 |
+
id: original
|
| 29 |
+
attributes:
|
| 30 |
+
label: "The original wrong contract"
|
| 31 |
+
description: You can run `python -c "from evalplus.data import get_human_eval_plus; print(get_human_eval_plus()['HumanEval/❓']['contract'])"`
|
| 32 |
+
render: python
|
| 33 |
+
validations:
|
| 34 |
+
required: true
|
| 35 |
+
- type: textarea
|
| 36 |
+
id: new
|
| 37 |
+
attributes:
|
| 38 |
+
label: "Your proposed new contract"
|
| 39 |
+
render: python
|
| 40 |
+
validations:
|
| 41 |
+
required: true
|
| 42 |
+
- type: textarea
|
| 43 |
+
id: other
|
| 44 |
+
attributes:
|
| 45 |
+
label: "Other context"
|
| 46 |
+
description: (Optional) Anything else the maintainer should notice?
|
| 47 |
+
validations:
|
| 48 |
+
required: false
|
evalplus/.github/ISSUE_TEMPLATE/buggy_test.yml
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: "🐛 Report Bad Test Inputs"
|
| 2 |
+
description: Report to us that certain test inputs should be removed.
|
| 3 |
+
title: "🐛 [TestRemoval] - <TASK_ID> <WHY>"
|
| 4 |
+
labels: ["bug"]
|
| 5 |
+
body:
|
| 6 |
+
- type: input
|
| 7 |
+
id: version
|
| 8 |
+
attributes:
|
| 9 |
+
label: "EvalPlus version"
|
| 10 |
+
description: What is the version of EvalPlus? You can find it by running `pip show evalplus`.
|
| 11 |
+
placeholder: For example, 0.1.0
|
| 12 |
+
validations:
|
| 13 |
+
required: true
|
| 14 |
+
- type: input
|
| 15 |
+
id: cache
|
| 16 |
+
attributes:
|
| 17 |
+
label: "Output of running `ls ~/.cache/evalplus`"
|
| 18 |
+
validations:
|
| 19 |
+
required: true
|
| 20 |
+
- type: input
|
| 21 |
+
id: task_id
|
| 22 |
+
attributes:
|
| 23 |
+
label: "Task ID of the programming task"
|
| 24 |
+
placeholder: HumanEval/[??]
|
| 25 |
+
validations:
|
| 26 |
+
required: true
|
| 27 |
+
- type: textarea
|
| 28 |
+
id: test_input
|
| 29 |
+
attributes:
|
| 30 |
+
label: "Test input"
|
| 31 |
+
description: The text form of the test input that you think should be removed
|
| 32 |
+
render: python
|
| 33 |
+
validations:
|
| 34 |
+
required: true
|
| 35 |
+
- type: textarea
|
| 36 |
+
id: description
|
| 37 |
+
attributes:
|
| 38 |
+
label: "Description"
|
| 39 |
+
description: An explicit description of why you think this test should be removed
|
| 40 |
+
placeholder: Here is a correct solution but it is incorrectly falsified by the test because ...
|
| 41 |
+
validations:
|
| 42 |
+
required: true
|
| 43 |
+
- type: textarea
|
| 44 |
+
id: other
|
| 45 |
+
attributes:
|
| 46 |
+
label: "Other context"
|
| 47 |
+
description: (Optional) Anything else the maintainer should notice?
|
| 48 |
+
validations:
|
| 49 |
+
required: false
|
evalplus/.github/ISSUE_TEMPLATE/config.yml
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
blank_issues_enabled: true
|
evalplus/.github/ISSUE_TEMPLATE/model_eval_request.yml
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: "🤗 Model Evaluation Request"
|
| 2 |
+
description: Request EvalPlus maintainers to evaluate your model independently and update it on our leaderboard.
|
| 3 |
+
title: "🤗 [REQUEST] - FILL_THE_MODEL_NAME_HERE"
|
| 4 |
+
labels: ["model eval"]
|
| 5 |
+
body:
|
| 6 |
+
- type: textarea
|
| 7 |
+
id: about
|
| 8 |
+
attributes:
|
| 9 |
+
label: "Model introduction"
|
| 10 |
+
description: Provide a brief introduction to the model.
|
| 11 |
+
placeholder: The models is created by ... and is used for ...
|
| 12 |
+
validations:
|
| 13 |
+
required: true
|
| 14 |
+
- type: input
|
| 15 |
+
id: url
|
| 16 |
+
attributes:
|
| 17 |
+
label: "Model URL"
|
| 18 |
+
description: Indicate the URL (e.g., huggingface or other release pages) of the model
|
| 19 |
+
placeholder: https://huggingface.co/[???]/[???]
|
| 20 |
+
validations:
|
| 21 |
+
required: true
|
| 22 |
+
- type: textarea
|
| 23 |
+
id: other
|
| 24 |
+
attributes:
|
| 25 |
+
label: "Additional information (Optional)"
|
| 26 |
+
description: Special steps indicating how to run the model with preferably scripts/codes.
|
| 27 |
+
placeholder: What data type precision should be used? What is the minimal hardware requirement? Can it be accelerated by tools such as vLLM?
|
| 28 |
+
validations:
|
| 29 |
+
required: false
|
| 30 |
+
- type: textarea
|
| 31 |
+
id: decomtamination
|
| 32 |
+
attributes:
|
| 33 |
+
label: "Decontamination"
|
| 34 |
+
description: How does the authors avoid contamination for their training data?
|
| 35 |
+
placeholder: Please clarify the decontamination steps and quantify it, e.g., N-gram match of ground-truth code in the training dataset.
|
| 36 |
+
validations:
|
| 37 |
+
required: true
|
| 38 |
+
- type: dropdown
|
| 39 |
+
id: author
|
| 40 |
+
attributes:
|
| 41 |
+
label: "Author"
|
| 42 |
+
description: "Are you (one of) the author(s) of the model?"
|
| 43 |
+
multiple: false
|
| 44 |
+
options:
|
| 45 |
+
- "Yes"
|
| 46 |
+
- "No"
|
| 47 |
+
validations:
|
| 48 |
+
required: true
|
| 49 |
+
- type: dropdown
|
| 50 |
+
id: data
|
| 51 |
+
attributes:
|
| 52 |
+
label: "Data"
|
| 53 |
+
description: "Is the training/fine-tuning data available in public?"
|
| 54 |
+
multiple: false
|
| 55 |
+
options:
|
| 56 |
+
- "Yes (If so please specify in 'Additional information')"
|
| 57 |
+
- "No"
|
| 58 |
+
validations:
|
| 59 |
+
required: true
|
| 60 |
+
- type: checkboxes
|
| 61 |
+
id: security
|
| 62 |
+
attributes:
|
| 63 |
+
label: "Security"
|
| 64 |
+
options:
|
| 65 |
+
- label: "I confirm that the model is safe to run which is not designed to produce malicious code or content."
|
| 66 |
+
required: true
|
| 67 |
+
- type: checkboxes
|
| 68 |
+
id: integrity
|
| 69 |
+
attributes:
|
| 70 |
+
label: "Integrity"
|
| 71 |
+
options:
|
| 72 |
+
- label: "I confirm that the model comes from unique and original work and does not contain any plagiarism."
|
| 73 |
+
required: true
|
evalplus/.gitignore
ADDED
|
@@ -0,0 +1,182 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Byte-compiled / optimized / DLL files
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
|
| 6 |
+
# C extensions
|
| 7 |
+
*.so
|
| 8 |
+
|
| 9 |
+
# Distribution / packaging
|
| 10 |
+
.Python
|
| 11 |
+
build/
|
| 12 |
+
develop-eggs/
|
| 13 |
+
dist/
|
| 14 |
+
downloads/
|
| 15 |
+
eggs/
|
| 16 |
+
.eggs/
|
| 17 |
+
lib/
|
| 18 |
+
lib64/
|
| 19 |
+
parts/
|
| 20 |
+
sdist/
|
| 21 |
+
var/
|
| 22 |
+
wheels/
|
| 23 |
+
share/python-wheels/
|
| 24 |
+
*.egg-info/
|
| 25 |
+
.installed.cfg
|
| 26 |
+
*.egg
|
| 27 |
+
MANIFEST
|
| 28 |
+
|
| 29 |
+
# PyInstaller
|
| 30 |
+
# Usually these files are written by a python script from a template
|
| 31 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
| 32 |
+
*.manifest
|
| 33 |
+
*.spec
|
| 34 |
+
|
| 35 |
+
# Installer logs
|
| 36 |
+
pip-log.txt
|
| 37 |
+
pip-delete-this-directory.txt
|
| 38 |
+
|
| 39 |
+
# Unit test / coverage reports
|
| 40 |
+
htmlcov/
|
| 41 |
+
.tox/
|
| 42 |
+
.nox/
|
| 43 |
+
.coverage
|
| 44 |
+
.coverage.*
|
| 45 |
+
.cache
|
| 46 |
+
nosetests.xml
|
| 47 |
+
coverage.xml
|
| 48 |
+
*.cover
|
| 49 |
+
*.py,cover
|
| 50 |
+
.hypothesis/
|
| 51 |
+
.pytest_cache/
|
| 52 |
+
cover/
|
| 53 |
+
|
| 54 |
+
# Translations
|
| 55 |
+
*.mo
|
| 56 |
+
*.pot
|
| 57 |
+
|
| 58 |
+
# Django stuff:
|
| 59 |
+
*.log
|
| 60 |
+
local_settings.py
|
| 61 |
+
db.sqlite3
|
| 62 |
+
db.sqlite3-journal
|
| 63 |
+
|
| 64 |
+
# Flask stuff:
|
| 65 |
+
instance/
|
| 66 |
+
.webassets-cache
|
| 67 |
+
|
| 68 |
+
# Scrapy stuff:
|
| 69 |
+
.scrapy
|
| 70 |
+
|
| 71 |
+
# Sphinx documentation
|
| 72 |
+
docs/_build/
|
| 73 |
+
|
| 74 |
+
# PyBuilder
|
| 75 |
+
.pybuilder/
|
| 76 |
+
target/
|
| 77 |
+
|
| 78 |
+
# Jupyter Notebook
|
| 79 |
+
.ipynb_checkpoints
|
| 80 |
+
|
| 81 |
+
# IPython
|
| 82 |
+
profile_default/
|
| 83 |
+
ipython_config.py
|
| 84 |
+
|
| 85 |
+
# pyenv
|
| 86 |
+
# For a library or package, you might want to ignore these files since the code is
|
| 87 |
+
# intended to run in multiple environments; otherwise, check them in:
|
| 88 |
+
# .python-version
|
| 89 |
+
|
| 90 |
+
# pipenv
|
| 91 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
| 92 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
| 93 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
| 94 |
+
# install all needed dependencies.
|
| 95 |
+
#Pipfile.lock
|
| 96 |
+
|
| 97 |
+
# poetry
|
| 98 |
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
| 99 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
| 100 |
+
# commonly ignored for libraries.
|
| 101 |
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
| 102 |
+
#poetry.lock
|
| 103 |
+
|
| 104 |
+
# pdm
|
| 105 |
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
| 106 |
+
#pdm.lock
|
| 107 |
+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
| 108 |
+
# in version control.
|
| 109 |
+
# https://pdm.fming.dev/#use-with-ide
|
| 110 |
+
.pdm.toml
|
| 111 |
+
|
| 112 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
| 113 |
+
__pypackages__/
|
| 114 |
+
|
| 115 |
+
# Celery stuff
|
| 116 |
+
celerybeat-schedule
|
| 117 |
+
celerybeat.pid
|
| 118 |
+
|
| 119 |
+
# SageMath parsed files
|
| 120 |
+
*.sage.py
|
| 121 |
+
|
| 122 |
+
# Environments
|
| 123 |
+
.env
|
| 124 |
+
.venv
|
| 125 |
+
env/
|
| 126 |
+
venv/
|
| 127 |
+
ENV/
|
| 128 |
+
env.bak/
|
| 129 |
+
venv.bak/
|
| 130 |
+
|
| 131 |
+
# Spyder project settings
|
| 132 |
+
.spyderproject
|
| 133 |
+
.spyproject
|
| 134 |
+
|
| 135 |
+
# Rope project settings
|
| 136 |
+
.ropeproject
|
| 137 |
+
|
| 138 |
+
# mkdocs documentation
|
| 139 |
+
/site
|
| 140 |
+
|
| 141 |
+
# mypy
|
| 142 |
+
.mypy_cache/
|
| 143 |
+
.dmypy.json
|
| 144 |
+
dmypy.json
|
| 145 |
+
|
| 146 |
+
# Pyre type checker
|
| 147 |
+
.pyre/
|
| 148 |
+
|
| 149 |
+
# pytype static type analyzer
|
| 150 |
+
.pytype/
|
| 151 |
+
|
| 152 |
+
# Cython debug symbols
|
| 153 |
+
cython_debug/
|
| 154 |
+
|
| 155 |
+
# PyCharm
|
| 156 |
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
| 157 |
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
| 158 |
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
| 159 |
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
| 160 |
+
# nuclear option because steven uses PyCharm.
|
| 161 |
+
.idea/
|
| 162 |
+
|
| 163 |
+
# VSCode
|
| 164 |
+
.vscode/
|
| 165 |
+
|
| 166 |
+
# EvalPlus specific
|
| 167 |
+
EvalPlus/
|
| 168 |
+
backup/
|
| 169 |
+
passrate.p*
|
| 170 |
+
min_cov_dir/
|
| 171 |
+
HumanEvalPlus*.gz
|
| 172 |
+
MbppPlus*.gz
|
| 173 |
+
evalplus/_version.py
|
| 174 |
+
*mbpp.json
|
| 175 |
+
*.jsonl
|
| 176 |
+
*.json
|
| 177 |
+
*.png
|
| 178 |
+
*.pdf
|
| 179 |
+
trash-bin
|
| 180 |
+
.bak
|
| 181 |
+
evalplus.github.io
|
| 182 |
+
evalplus_results/
|
evalplus/.pre-commit-config.yaml
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
repos:
|
| 2 |
+
- repo: https://github.com/pycqa/isort
|
| 3 |
+
rev: 5.12.0
|
| 4 |
+
hooks:
|
| 5 |
+
- id: isort
|
| 6 |
+
name: isort (python)
|
| 7 |
+
args: ["--profile", "black"]
|
| 8 |
+
- repo: https://github.com/psf/black
|
| 9 |
+
rev: 22.6.0
|
| 10 |
+
hooks:
|
| 11 |
+
- id: black
|
| 12 |
+
- repo: https://github.com/pre-commit/pre-commit-hooks
|
| 13 |
+
rev: v4.3.0
|
| 14 |
+
hooks:
|
| 15 |
+
- id: check-yaml
|
| 16 |
+
- id: end-of-file-fixer
|
| 17 |
+
- id: trailing-whitespace
|
| 18 |
+
exclude: (?x)^(
|
| 19 |
+
groundtruth/.*
|
| 20 |
+
)$
|
evalplus/CITATION.cff
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
cff-version: 1.2.0
|
| 2 |
+
message: "If you use this work and love it, consider citing it as below \U0001F917"
|
| 3 |
+
title: EvalPlus
|
| 4 |
+
authors:
|
| 5 |
+
- family-names: EvalPlus Team
|
| 6 |
+
url: https://github.com/evalplus/evalplus
|
| 7 |
+
doi: https://doi.org/10.48550/arXiv.2305.01210
|
| 8 |
+
date-released: 2023-05-01
|
| 9 |
+
license: Apache-2.0
|
| 10 |
+
preferred-citation:
|
| 11 |
+
type: article
|
| 12 |
+
title: "Is Your Code Generated by ChatGPT Really Correct? Rigorous Evaluation of Large Language Models for Code Generation"
|
| 13 |
+
authors:
|
| 14 |
+
- family-names: Liu
|
| 15 |
+
given-names: Jiawei
|
| 16 |
+
- family-names: Xia
|
| 17 |
+
given-names: Chunqiu Steven
|
| 18 |
+
- family-names: Wang
|
| 19 |
+
given-names: Yuyao
|
| 20 |
+
- family-names: Zhang
|
| 21 |
+
given-names: Lingming
|
| 22 |
+
year: 2023
|
| 23 |
+
journal: "arXiv preprint arXiv:2305.01210"
|
| 24 |
+
doi: https://doi.org/10.48550/arXiv.2305.01210
|
| 25 |
+
url: https://arxiv.org/abs/2305.01210
|
evalplus/Dockerfile
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Better use newer Python as generated code can use new features
|
| 2 |
+
FROM python:3.11-slim
|
| 3 |
+
|
| 4 |
+
# install git and c++ (required by cirronlib.cpp)
|
| 5 |
+
RUN apt-get update && apt-get install -y git g++
|
| 6 |
+
|
| 7 |
+
# upgrade to latest pip
|
| 8 |
+
RUN pip install --upgrade pip
|
| 9 |
+
|
| 10 |
+
COPY . /evalplus
|
| 11 |
+
|
| 12 |
+
RUN cd /evalplus && pip install ".[perf]"
|
| 13 |
+
|
| 14 |
+
# Pre-install the dataset
|
| 15 |
+
RUN python3 -c "from evalplus.data import *; get_human_eval_plus(); get_mbpp_plus(); get_evalperf_data()"
|
| 16 |
+
|
| 17 |
+
WORKDIR /app
|
| 18 |
+
|
| 19 |
+
CMD ["bash"]
|
evalplus/LICENSE
ADDED
|
@@ -0,0 +1,205 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Apache License
|
| 2 |
+
Version 2.0, January 2004
|
| 3 |
+
http://www.apache.org/licenses/
|
| 4 |
+
|
| 5 |
+
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
| 6 |
+
|
| 7 |
+
1. Definitions.
|
| 8 |
+
|
| 9 |
+
"License" shall mean the terms and conditions for use, reproduction,
|
| 10 |
+
and distribution as defined by Sections 1 through 9 of this document.
|
| 11 |
+
|
| 12 |
+
"Licensor" shall mean the copyright owner or entity authorized by
|
| 13 |
+
the copyright owner that is granting the License.
|
| 14 |
+
|
| 15 |
+
"Legal Entity" shall mean the union of the acting entity and all
|
| 16 |
+
other entities that control, are controlled by, or are under common
|
| 17 |
+
control with that entity. For the purposes of this definition,
|
| 18 |
+
"control" means (i) the power, direct or indirect, to cause the
|
| 19 |
+
direction or management of such entity, whether by contract or
|
| 20 |
+
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
| 21 |
+
outstanding shares, or (iii) beneficial ownership of such entity.
|
| 22 |
+
|
| 23 |
+
"You" (or "Your") shall mean an individual or Legal Entity
|
| 24 |
+
exercising permissions granted by this License.
|
| 25 |
+
|
| 26 |
+
"Source" form shall mean the preferred form for making modifications,
|
| 27 |
+
including but not limited to software source code, documentation
|
| 28 |
+
source, and configuration files.
|
| 29 |
+
|
| 30 |
+
"Object" form shall mean any form resulting from mechanical
|
| 31 |
+
transformation or translation of a Source form, including but
|
| 32 |
+
not limited to compiled object code, generated documentation,
|
| 33 |
+
and conversions to other media types.
|
| 34 |
+
|
| 35 |
+
"Work" shall mean the work of authorship, whether in Source or
|
| 36 |
+
Object form, made available under the License, as indicated by a
|
| 37 |
+
copyright notice that is included in or attached to the work
|
| 38 |
+
(an example is provided in the Appendix below).
|
| 39 |
+
|
| 40 |
+
"Derivative Works" shall mean any work, whether in Source or Object
|
| 41 |
+
form, that is based on (or derived from) the Work and for which the
|
| 42 |
+
editorial revisions, annotations, elaborations, or other modifications
|
| 43 |
+
represent, as a whole, an original work of authorship. For the purposes
|
| 44 |
+
of this License, Derivative Works shall not include works that remain
|
| 45 |
+
separable from, or merely link (or bind by name) to the interfaces of,
|
| 46 |
+
the Work and Derivative Works thereof.
|
| 47 |
+
|
| 48 |
+
"Contribution" shall mean any work of authorship, including
|
| 49 |
+
the original version of the Work and any modifications or additions
|
| 50 |
+
to that Work or Derivative Works thereof, that is intentionally
|
| 51 |
+
submitted to Licensor for inclusion in the Work by the copyright owner
|
| 52 |
+
or by an individual or Legal Entity authorized to submit on behalf of
|
| 53 |
+
the copyright owner. For the purposes of this definition, "submitted"
|
| 54 |
+
means any form of electronic, verbal, or written communication sent
|
| 55 |
+
to the Licensor or its representatives, including but not limited to
|
| 56 |
+
communication on electronic mailing lists, source code control systems,
|
| 57 |
+
and issue tracking systems that are managed by, or on behalf of, the
|
| 58 |
+
Licensor for the purpose of discussing and improving the Work, but
|
| 59 |
+
excluding communication that is conspicuously marked or otherwise
|
| 60 |
+
designated in writing by the copyright owner as "Not a Contribution."
|
| 61 |
+
|
| 62 |
+
"Contributor" shall mean Licensor and any individual or Legal Entity
|
| 63 |
+
on behalf of whom a Contribution has been received by Licensor and
|
| 64 |
+
subsequently incorporated within the Work.
|
| 65 |
+
|
| 66 |
+
2. Grant of Copyright License. Subject to the terms and conditions of
|
| 67 |
+
this License, each Contributor hereby grants to You a perpetual,
|
| 68 |
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
| 69 |
+
copyright license to reproduce, prepare Derivative Works of,
|
| 70 |
+
publicly display, publicly perform, sublicense, and distribute the
|
| 71 |
+
Work and such Derivative Works in Source or Object form.
|
| 72 |
+
|
| 73 |
+
3. Grant of Patent License. Subject to the terms and conditions of
|
| 74 |
+
this License, each Contributor hereby grants to You a perpetual,
|
| 75 |
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
| 76 |
+
(except as stated in this section) patent license to make, have made,
|
| 77 |
+
use, offer to sell, sell, import, and otherwise transfer the Work,
|
| 78 |
+
where such license applies only to those patent claims licensable
|
| 79 |
+
by such Contributor that are necessarily infringed by their
|
| 80 |
+
Contribution(s) alone or by combination of their Contribution(s)
|
| 81 |
+
with the Work to which such Contribution(s) was submitted. If You
|
| 82 |
+
institute patent litigation against any entity (including a
|
| 83 |
+
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
| 84 |
+
or a Contribution incorporated within the Work constitutes direct
|
| 85 |
+
or contributory patent infringement, then any patent licenses
|
| 86 |
+
granted to You under this License for that Work shall terminate
|
| 87 |
+
as of the date such litigation is filed.
|
| 88 |
+
|
| 89 |
+
4. Redistribution. You may reproduce and distribute copies of the
|
| 90 |
+
Work or Derivative Works thereof in any medium, with or without
|
| 91 |
+
modifications, and in Source or Object form, provided that You
|
| 92 |
+
meet the following conditions:
|
| 93 |
+
|
| 94 |
+
(a) You must give any other recipients of the Work or
|
| 95 |
+
Derivative Works a copy of this License; and
|
| 96 |
+
|
| 97 |
+
(b) You must cause any modified files to carry prominent notices
|
| 98 |
+
stating that You changed the files; and
|
| 99 |
+
|
| 100 |
+
(c) You must retain, in the Source form of any Derivative Works
|
| 101 |
+
that You distribute, all copyright, patent, trademark, and
|
| 102 |
+
attribution notices from the Source form of the Work,
|
| 103 |
+
excluding those notices that do not pertain to any part of
|
| 104 |
+
the Derivative Works; and
|
| 105 |
+
|
| 106 |
+
(d) If the Work includes a "NOTICE" text file as part of its
|
| 107 |
+
distribution, then any Derivative Works that You distribute must
|
| 108 |
+
include a readable copy of the attribution notices contained
|
| 109 |
+
within such NOTICE file, excluding those notices that do not
|
| 110 |
+
pertain to any part of the Derivative Works, in at least one
|
| 111 |
+
of the following places: within a NOTICE text file distributed
|
| 112 |
+
as part of the Derivative Works; within the Source form or
|
| 113 |
+
documentation, if provided along with the Derivative Works; or,
|
| 114 |
+
within a display generated by the Derivative Works, if and
|
| 115 |
+
wherever such third-party notices normally appear. The contents
|
| 116 |
+
of the NOTICE file are for informational purposes only and
|
| 117 |
+
do not modify the License. You may add Your own attribution
|
| 118 |
+
notices within Derivative Works that You distribute, alongside
|
| 119 |
+
or as an addendum to the NOTICE text from the Work, provided
|
| 120 |
+
that such additional attribution notices cannot be construed
|
| 121 |
+
as modifying the License.
|
| 122 |
+
|
| 123 |
+
You may add Your own copyright statement to Your modifications and
|
| 124 |
+
may provide additional or different license terms and conditions
|
| 125 |
+
for use, reproduction, or distribution of Your modifications, or
|
| 126 |
+
for any such Derivative Works as a whole, provided Your use,
|
| 127 |
+
reproduction, and distribution of the Work otherwise complies with
|
| 128 |
+
the conditions stated in this License.
|
| 129 |
+
|
| 130 |
+
5. Submission of Contributions. Unless You explicitly state otherwise,
|
| 131 |
+
any Contribution intentionally submitted for inclusion in the Work
|
| 132 |
+
by You to the Licensor shall be under the terms and conditions of
|
| 133 |
+
this License, without any additional terms or conditions.
|
| 134 |
+
Notwithstanding the above, nothing herein shall supersede or modify
|
| 135 |
+
the terms of any separate license agreement you may have executed
|
| 136 |
+
with Licensor regarding such Contributions.
|
| 137 |
+
|
| 138 |
+
6. Trademarks. This License does not grant permission to use the trade
|
| 139 |
+
names, trademarks, service marks, or product names of the Licensor,
|
| 140 |
+
except as required for reasonable and customary use in describing the
|
| 141 |
+
origin of the Work and reproducing the content of the NOTICE file.
|
| 142 |
+
|
| 143 |
+
7. Disclaimer of Warranty. Unless required by applicable law or
|
| 144 |
+
agreed to in writing, Licensor provides the Work (and each
|
| 145 |
+
Contributor provides its Contributions) on an "AS IS" BASIS,
|
| 146 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
| 147 |
+
implied, including, without limitation, any warranties or conditions
|
| 148 |
+
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
| 149 |
+
PARTICULAR PURPOSE. You are solely responsible for determining the
|
| 150 |
+
appropriateness of using or redistributing the Work and assume any
|
| 151 |
+
risks associated with Your exercise of permissions under this License.
|
| 152 |
+
|
| 153 |
+
8. Limitation of Liability. In no event and under no legal theory,
|
| 154 |
+
whether in tort (including negligence), contract, or otherwise,
|
| 155 |
+
unless required by applicable law (such as deliberate and grossly
|
| 156 |
+
negligent acts) or agreed to in writing, shall any Contributor be
|
| 157 |
+
liable to You for damages, including any direct, indirect, special,
|
| 158 |
+
incidental, or consequential damages of any character arising as a
|
| 159 |
+
result of this License or out of the use or inability to use the
|
| 160 |
+
Work (including but not limited to damages for loss of goodwill,
|
| 161 |
+
work stoppage, computer failure or malfunction, or any and all
|
| 162 |
+
other commercial damages or losses), even if such Contributor
|
| 163 |
+
has been advised of the possibility of such damages.
|
| 164 |
+
|
| 165 |
+
9. Accepting Warranty or Additional Liability. While redistributing
|
| 166 |
+
the Work or Derivative Works thereof, You may choose to offer,
|
| 167 |
+
and charge a fee for, acceptance of support, warranty, indemnity,
|
| 168 |
+
or other liability obligations and/or rights consistent with this
|
| 169 |
+
License. However, in accepting such obligations, You may act only
|
| 170 |
+
on Your own behalf and on Your sole responsibility, not on behalf
|
| 171 |
+
of any other Contributor, and only if You agree to indemnify,
|
| 172 |
+
defend, and hold each Contributor harmless for any liability
|
| 173 |
+
incurred by, or claims asserted against, such Contributor by reason
|
| 174 |
+
of your accepting any such warranty or additional liability.
|
| 175 |
+
|
| 176 |
+
END OF TERMS AND CONDITIONS
|
| 177 |
+
|
| 178 |
+
APPENDIX: How to apply the Apache License to your work.
|
| 179 |
+
|
| 180 |
+
To apply the Apache License to your work, attach the following
|
| 181 |
+
boilerplate notice, with the fields enclosed by brackets "[]"
|
| 182 |
+
replaced with your own identifying information. (Don't include
|
| 183 |
+
the brackets!) The text should be enclosed in the appropriate
|
| 184 |
+
comment syntax for the file format. We also recommend that a
|
| 185 |
+
file or class name and description of purpose be included on the
|
| 186 |
+
same "printed page" as the copyright notice for easier
|
| 187 |
+
identification within third-party archives.
|
| 188 |
+
|
| 189 |
+
Copyright [yyyy] [name of copyright owner]
|
| 190 |
+
|
| 191 |
+
Licensed under the Apache License, Version 2.0 (the "License");
|
| 192 |
+
you may not use this file except in compliance with the License.
|
| 193 |
+
You may obtain a copy of the License at
|
| 194 |
+
|
| 195 |
+
http://www.apache.org/licenses/LICENSE-2.0
|
| 196 |
+
|
| 197 |
+
Unless required by applicable law or agreed to in writing, software
|
| 198 |
+
distributed under the License is distributed on an "AS IS" BASIS,
|
| 199 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 200 |
+
See the License for the specific language governing permissions and
|
| 201 |
+
limitations under the License.
|
| 202 |
+
|
| 203 |
+
-------------------------------------------------------------------------------
|
| 204 |
+
The files under "evalplus/eval/" additionally complies with the MIT License for
|
| 205 |
+
being built on OpenAI's HumanEval work.
|
evalplus/MANIFEST.in
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
exclude evalplus/_experimental/**/*.py
|
evalplus/README.md
ADDED
|
@@ -0,0 +1,325 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# `EvalPlus(📖) => 📚`
|
| 2 |
+
|
| 3 |
+
<p align="center">
|
| 4 |
+
<a href="https://evalplus.github.io"><img src="https://img.shields.io/badge/%F0%9F%8F%86-leaderboard-8A2BE2"></a>
|
| 5 |
+
<a href="https://openreview.net/forum?id=1qvx610Cu7"><img src="https://img.shields.io/badge/EvalPlus-NeurIPS'23-a55fed.svg"></a>
|
| 6 |
+
<a href="https://openreview.net/forum?id=IBCBMeAhmC"><img src="https://img.shields.io/badge/EvalPerf-COLM'24-a55fed.svg"></a>
|
| 7 |
+
<a href="https://huggingface.co/evalplus/"><img src="https://img.shields.io/badge/🤗%20Hugging%20Face-evalplus-%23ff8811.svg"></a>
|
| 8 |
+
<a href="https://pypi.org/project/evalplus/"><img src="https://img.shields.io/pypi/v/evalplus?color=g"></a>
|
| 9 |
+
<a href="https://hub.docker.com/r/ganler/evalplus" title="Docker"><img src="https://img.shields.io/docker/image-size/ganler/evalplus"></a>
|
| 10 |
+
</p>
|
| 11 |
+
|
| 12 |
+
<p align="center">
|
| 13 |
+
<a href="#-about">📙About</a> •
|
| 14 |
+
<a href="#-quick-start">🔥Quick Start</a> •
|
| 15 |
+
<a href="#-llm-backends">🚀LLM Backends</a> •
|
| 16 |
+
<a href="#-documents">📚Documents</a> •
|
| 17 |
+
<a href="#-citation">📜Citation</a> •
|
| 18 |
+
<a href="#-acknowledgement">🙏Acknowledgement</a>
|
| 19 |
+
</p>
|
| 20 |
+
|
| 21 |
+
## 📢 News
|
| 22 |
+
|
| 23 |
+
Who's using EvalPlus datasets? EvalPlus has been used by various LLM teams, including:
|
| 24 |
+
|
| 25 |
+
* [Meta Llama 3.1 and 3.3](https://ai.meta.com/blog/meta-llama-3-1/)
|
| 26 |
+
* [Allen AI TÜLU 1/2/3](https://github.com/allenai/open-instruct/blob/main/docs/tulu1_tulu2.md#benchmark-based-eval)
|
| 27 |
+
* [Qwen2.5-Coder](https://qwenlm.github.io/blog/qwen2.5-coder-family/)
|
| 28 |
+
* [CodeQwen 1.5](https://qwenlm.github.io/blog/codeqwen1.5/)
|
| 29 |
+
* [DeepSeek-Coder V2](https://arxiv.org/pdf/2406.11931)
|
| 30 |
+
* [Qwen2](https://arxiv.org/pdf/2407.10671)
|
| 31 |
+
* [Snowflake Arctic](https://www.snowflake.com/en/data-cloud/arctic/)
|
| 32 |
+
* [StarCoder2](https://arxiv.org/pdf/2402.19173)
|
| 33 |
+
* [Magicoder](https://arxiv.org/pdf/2312.02120)
|
| 34 |
+
* [WizardCoder](https://arxiv.org/pdf/2306.08568)
|
| 35 |
+
|
| 36 |
+
Below tracks the notable updates of EvalPlus:
|
| 37 |
+
|
| 38 |
+
- **[2024-10-20 `v0.3.1`]**: EvalPlus `v0.3.1` is officially released! Highlights: *(i)* Code efficiency evaluation via EvalPerf, *(ii)* one command to run all: generation + post-processing + evaluation, *(iii)* support for more inference backends such as Google Gemini & Anthropic, etc.
|
| 39 |
+
- **[2024-06-09 pre `v0.3.0`]**: Improved ground-truth solutions for MBPP+ tasks (IDs: 459, 102, 559). Thanks to [EvalArena](https://github.com/crux-eval/eval-arena).
|
| 40 |
+
- **[2024-04-17 pre `v0.3.0`]**: MBPP+ is upgraded to `v0.2.0` by removing some broken tasks (399 -> 378 tasks). ~4pp pass@1 improvement could be expected.
|
| 41 |
+
|
| 42 |
+
<details><summary>Earlier news <i>:: click to expand ::</i></summary>
|
| 43 |
+
<div>
|
| 44 |
+
|
| 45 |
+
- ([`v0.2.1`](https://github.com/evalplus/evalplus/releases/tag/v0.2.1)) You can use EvalPlus datasets via [bigcode-evaluation-harness](https://github.com/bigcode-project/bigcode-evaluation-harness)! HumanEval+ oracle fixes (32).
|
| 46 |
+
- ([`v0.2.0`](https://github.com/evalplus/evalplus/releases/tag/v0.2.0)) MBPP+ is released! HumanEval contract & input fixes (0/3/9/148/114/1/2/99/28/32/35/160).
|
| 47 |
+
- ([`v0.1.7`](https://github.com/evalplus/evalplus/releases/tag/v0.1.7)) [Leaderboard](https://evalplus.github.io/leaderboard.html) release; HumanEval+ contract and input fixes (32/166/126/6)
|
| 48 |
+
- ([`v0.1.6`](https://github.com/evalplus/evalplus/releases/tag/v0.1.6)) Configurable and by-default-conservative timeout settings; HumanEval+ contract & ground-truth fixes (129/148/75/53/0/3/9/140)
|
| 49 |
+
- ([`v0.1.5`](https://github.com/evalplus/evalplus/releases/tag/v0.1.5)) HumanEval+ mini is released for ultra-fast evaluation when you have too many samples!
|
| 50 |
+
- ([`v0.1.1`](https://github.com/evalplus/evalplus/releases/tag/v0.1.1)) Optimizing user experiences: evaluation speed, PyPI package, Docker, etc.
|
| 51 |
+
- ([`v0.1.0`](https://github.com/evalplus/evalplus/releases/tag/v0.1.0)) HumanEval+ is released!
|
| 52 |
+
|
| 53 |
+
</div>
|
| 54 |
+
</details>
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
## 📙 About
|
| 58 |
+
|
| 59 |
+
EvalPlus is a rigorous evaluation framework for LLM4Code, with:
|
| 60 |
+
|
| 61 |
+
- ✨ **HumanEval+**: 80x more tests than the original HumanEval!
|
| 62 |
+
- ✨ **MBPP+**: 35x more tests than the original MBPP!
|
| 63 |
+
- ✨ **EvalPerf**: evaluating the efficiency of LLM-generated code!
|
| 64 |
+
- ✨ **Framework**: our packages/images/tools can easily and safely evaluate LLMs on above benchmarks.
|
| 65 |
+
|
| 66 |
+
Why EvalPlus?
|
| 67 |
+
|
| 68 |
+
- ✨ **Precise evaluation**: See [our leaderboard](https://evalplus.github.io/leaderboard.html) for latest LLM rankings before & after rigorous evaluation.
|
| 69 |
+
- ✨ **Coding rigorousness**: Look at the score differences! esp. before & after using EvalPlus tests! Less drop means more rigorousness in code generation; while a bigger drop means the generated code tends to be fragile.
|
| 70 |
+
- ✨ **Code efficiency**: Beyond correctness, our EvalPerf dataset evaluates the efficiency of LLM-generated code via performance-exercising coding tasks and test inputs.
|
| 71 |
+
|
| 72 |
+
Want to know more details? Read our papers & materials!
|
| 73 |
+
|
| 74 |
+
- **EvalPlus**: [NeurIPS'23 paper](https://openreview.net/forum?id=1qvx610Cu7), [Slides](https://docs.google.com/presentation/d/1eTxzUQG9uHaU13BGhrqm4wH5NmMZiM3nI0ezKlODxKs), [Poster](https://jw-liu.xyz/assets/pdf/EvalPlus_Poster.pdf), [Leaderboard](https://evalplus.github.io/leaderboard.html)
|
| 75 |
+
- **EvalPerf**: [COLM'24 paper](https://openreview.net/forum?id=IBCBMeAhmC), [Poster](https://jw-liu.xyz/assets/pdf/jiawei-colm-evalperf-poster.pdf), [Documentation](./docs/evalperf.md), [Leaderboard](https://evalplus.github.io/evalperf.html)
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
## 🔥 Quick Start
|
| 79 |
+
|
| 80 |
+
### Code Correctness Evaluation: HumanEval(+) or MBPP(+)
|
| 81 |
+
|
| 82 |
+
```bash
|
| 83 |
+
pip install --upgrade "evalplus[vllm] @ git+https://github.com/evalplus/evalplus"
|
| 84 |
+
# Or `pip install "evalplus[vllm]" --upgrade` for the latest stable release
|
| 85 |
+
|
| 86 |
+
evalplus.evaluate --model "ise-uiuc/Magicoder-S-DS-6.7B" \
|
| 87 |
+
--dataset [humaneval|mbpp] \
|
| 88 |
+
--backend vllm \
|
| 89 |
+
--greedy
|
| 90 |
+
```
|
| 91 |
+
|
| 92 |
+
<details><summary>🛡️ Safe code execution within Docker <i>:: click to expand ::</i></summary>
|
| 93 |
+
<div>
|
| 94 |
+
|
| 95 |
+
```bash
|
| 96 |
+
# Local generation
|
| 97 |
+
evalplus.codegen --model "ise-uiuc/Magicoder-S-DS-6.7B" \
|
| 98 |
+
--dataset humaneval \
|
| 99 |
+
--backend vllm \
|
| 100 |
+
--greedy
|
| 101 |
+
|
| 102 |
+
# Code execution within Docker
|
| 103 |
+
docker run --rm --pull=always -v $(pwd)/evalplus_results:/app ganler/evalplus:latest \
|
| 104 |
+
evalplus.evaluate --dataset humaneval \
|
| 105 |
+
--samples /app/humaneval/ise-uiuc--Magicoder-S-DS-6.7B_vllm_temp_0.0.jsonl
|
| 106 |
+
```
|
| 107 |
+
|
| 108 |
+
</div>
|
| 109 |
+
</details>
|
| 110 |
+
|
| 111 |
+
### Code Efficiency Evaluation: EvalPerf (*nix only)
|
| 112 |
+
|
| 113 |
+
```bash
|
| 114 |
+
pip install --upgrade "evalplus[perf,vllm] @ git+https://github.com/evalplus/evalplus"
|
| 115 |
+
# Or `pip install "evalplus[perf,vllm]" --upgrade` for the latest stable release
|
| 116 |
+
|
| 117 |
+
sudo sh -c 'echo 0 > /proc/sys/kernel/perf_event_paranoid' # Enable perf
|
| 118 |
+
evalplus.evalperf --model "ise-uiuc/Magicoder-S-DS-6.7B" --backend vllm
|
| 119 |
+
```
|
| 120 |
+
|
| 121 |
+
<details><summary>🛡️ Safe code execution within Docker <i>:: click to expand ::</i></summary>
|
| 122 |
+
<div>
|
| 123 |
+
|
| 124 |
+
```bash
|
| 125 |
+
# Local generation
|
| 126 |
+
evalplus.codegen --model "ise-uiuc/Magicoder-S-DS-6.7B" \
|
| 127 |
+
--dataset evalperf \
|
| 128 |
+
--backend vllm \
|
| 129 |
+
--temperature 1.0 \
|
| 130 |
+
--n-samples 100
|
| 131 |
+
|
| 132 |
+
# Code execution within Docker
|
| 133 |
+
sudo sh -c 'echo 0 > /proc/sys/kernel/perf_event_paranoid' # Enable perf
|
| 134 |
+
docker run --cap-add PERFMON --rm --pull=always -v $(pwd)/evalplus_results:/app ganler/evalplus:latest \
|
| 135 |
+
evalplus.evalperf --samples /app/evalperf/ise-uiuc--Magicoder-S-DS-6.7B_vllm_temp_1.0.jsonl
|
| 136 |
+
```
|
| 137 |
+
|
| 138 |
+
</div>
|
| 139 |
+
</details>
|
| 140 |
+
|
| 141 |
+
## 🚀 LLM Backends
|
| 142 |
+
|
| 143 |
+
### HuggingFace models
|
| 144 |
+
|
| 145 |
+
- `transformers` backend:
|
| 146 |
+
|
| 147 |
+
```bash
|
| 148 |
+
evalplus.evaluate --model "ise-uiuc/Magicoder-S-DS-6.7B" \
|
| 149 |
+
--dataset [humaneval|mbpp] \
|
| 150 |
+
--backend hf \
|
| 151 |
+
--greedy
|
| 152 |
+
```
|
| 153 |
+
|
| 154 |
+
> [!Note]
|
| 155 |
+
>
|
| 156 |
+
> EvalPlus uses different prompts for base and chat models.
|
| 157 |
+
> By default it is detected by `tokenizer.chat_template` when using `hf`/`vllm` as backend.
|
| 158 |
+
> For other backends, only chat mode is allowed.
|
| 159 |
+
>
|
| 160 |
+
> Therefore, if your base models come with a `tokenizer.chat_template`,
|
| 161 |
+
> please add `--force-base-prompt` to avoid being evaluated
|
| 162 |
+
> in a chat mode.
|
| 163 |
+
|
| 164 |
+
<details><summary>Enable Flash Attention 2 <i>:: click to expand ::</i></summary>
|
| 165 |
+
<div>
|
| 166 |
+
|
| 167 |
+
```bash
|
| 168 |
+
# Install Flash Attention 2
|
| 169 |
+
pip install packaging ninja
|
| 170 |
+
pip install flash-attn --no-build-isolation
|
| 171 |
+
# Note: if you have installation problem, consider using pre-built
|
| 172 |
+
# wheels from https://github.com/Dao-AILab/flash-attention/releases
|
| 173 |
+
|
| 174 |
+
# Run evaluation with FA2
|
| 175 |
+
evalplus.evaluate --model "ise-uiuc/Magicoder-S-DS-6.7B" \
|
| 176 |
+
--dataset [humaneval|mbpp] \
|
| 177 |
+
--backend hf \
|
| 178 |
+
--attn-implementation [flash_attention_2|sdpa] \
|
| 179 |
+
--greedy
|
| 180 |
+
```
|
| 181 |
+
|
| 182 |
+
</div>
|
| 183 |
+
</details>
|
| 184 |
+
|
| 185 |
+
- `vllm` backend:
|
| 186 |
+
|
| 187 |
+
```bash
|
| 188 |
+
evalplus.evaluate --model "ise-uiuc/Magicoder-S-DS-6.7B" \
|
| 189 |
+
--dataset [humaneval|mbpp] \
|
| 190 |
+
--backend vllm \
|
| 191 |
+
--tp [TENSOR_PARALLEL_SIZE] \
|
| 192 |
+
--greedy
|
| 193 |
+
```
|
| 194 |
+
|
| 195 |
+
- `openai` compatible servers (e.g., [vLLM](https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html)):
|
| 196 |
+
|
| 197 |
+
```bash
|
| 198 |
+
# OpenAI models
|
| 199 |
+
export OPENAI_API_KEY="{KEY}" # https://platform.openai.com/settings/organization/api-keys
|
| 200 |
+
evalplus.evaluate --model "gpt-4o-2024-08-06" \
|
| 201 |
+
--dataset [humaneval|mbpp] \
|
| 202 |
+
--backend openai --greedy
|
| 203 |
+
|
| 204 |
+
# DeepSeek
|
| 205 |
+
export OPENAI_API_KEY="{KEY}" # https://platform.deepseek.com/api_keys
|
| 206 |
+
evalplus.evaluate --model "deepseek-chat" \
|
| 207 |
+
--dataset [humaneval|mbpp] \
|
| 208 |
+
--base-url https://api.deepseek.com \
|
| 209 |
+
--backend openai --greedy
|
| 210 |
+
|
| 211 |
+
# Grok
|
| 212 |
+
export OPENAI_API_KEY="{KEY}" # https://console.x.ai/
|
| 213 |
+
evalplus.evaluate --model "grok-beta" \
|
| 214 |
+
--dataset [humaneval|mbpp] \
|
| 215 |
+
--base-url https://api.x.ai/v1 \
|
| 216 |
+
--backend openai --greedy
|
| 217 |
+
|
| 218 |
+
# vLLM server
|
| 219 |
+
# First, launch a vLLM server: https://docs.vllm.ai/en/latest/serving/deploying_with_docker.html
|
| 220 |
+
evalplus.evaluate --model "ise-uiuc/Magicoder-S-DS-6.7B" \
|
| 221 |
+
--dataset [humaneval|mbpp] \
|
| 222 |
+
--base-url http://localhost:8000/v1 \
|
| 223 |
+
--backend openai --greedy
|
| 224 |
+
|
| 225 |
+
# GPTQModel
|
| 226 |
+
evalplus.evaluate --model "ModelCloud/Llama-3.2-1B-Instruct-gptqmodel-4bit-vortex-v1" \
|
| 227 |
+
--dataset [humaneval|mbpp] \
|
| 228 |
+
--backend gptqmodel --greedy
|
| 229 |
+
```
|
| 230 |
+
|
| 231 |
+
### OpenAI models
|
| 232 |
+
|
| 233 |
+
- Access OpenAI APIs from [OpenAI Console](https://platform.openai.com/)
|
| 234 |
+
|
| 235 |
+
```bash
|
| 236 |
+
export OPENAI_API_KEY="[YOUR_API_KEY]"
|
| 237 |
+
evalplus.evaluate --model "gpt-4o" \
|
| 238 |
+
--dataset [humaneval|mbpp] \
|
| 239 |
+
--backend openai \
|
| 240 |
+
--greedy
|
| 241 |
+
```
|
| 242 |
+
|
| 243 |
+
### Anthropic models
|
| 244 |
+
|
| 245 |
+
- Access Anthropic APIs from [Anthropic Console](https://console.anthropic.com/)
|
| 246 |
+
|
| 247 |
+
```bash
|
| 248 |
+
export ANTHROPIC_API_KEY="[YOUR_API_KEY]"
|
| 249 |
+
evalplus.evaluate --model "claude-3-haiku-20240307" \
|
| 250 |
+
--dataset [humaneval|mbpp] \
|
| 251 |
+
--backend anthropic \
|
| 252 |
+
--greedy
|
| 253 |
+
```
|
| 254 |
+
|
| 255 |
+
### Google Gemini models
|
| 256 |
+
|
| 257 |
+
- Access Gemini APIs from [Google AI Studio](https://aistudio.google.com/)
|
| 258 |
+
|
| 259 |
+
```bash
|
| 260 |
+
export GOOGLE_API_KEY="[YOUR_API_KEY]"
|
| 261 |
+
evalplus.evaluate --model "gemini-1.5-pro" \
|
| 262 |
+
--dataset [humaneval|mbpp] \
|
| 263 |
+
--backend google \
|
| 264 |
+
--greedy
|
| 265 |
+
```
|
| 266 |
+
|
| 267 |
+
### Amazon Bedrock models
|
| 268 |
+
|
| 269 |
+
- [Amazon Bedrock](https://aws.amazon.com/bedrock/)
|
| 270 |
+
|
| 271 |
+
```bash
|
| 272 |
+
export BEDROCK_ROLE_ARN="[BEDROCK_ROLE_ARN]"
|
| 273 |
+
evalplus.evaluate --model "anthropic.claude-3-5-sonnet-20241022-v2:0" \
|
| 274 |
+
--dataset [humaneval|mbpp] \
|
| 275 |
+
--backend bedrock \
|
| 276 |
+
--greedy
|
| 277 |
+
```
|
| 278 |
+
|
| 279 |
+
You can checkout the generation and results at `evalplus_results/[humaneval|mbpp]/`
|
| 280 |
+
|
| 281 |
+
<details><summary>⏬ Using EvalPlus as a local repo? <i>:: click to expand ::</i></summary>
|
| 282 |
+
<div>
|
| 283 |
+
|
| 284 |
+
```bash
|
| 285 |
+
git clone https://github.com/evalplus/evalplus.git
|
| 286 |
+
cd evalplus
|
| 287 |
+
export PYTHONPATH=$PYTHONPATH:$(pwd)
|
| 288 |
+
pip install -r requirements.txt
|
| 289 |
+
```
|
| 290 |
+
|
| 291 |
+
</div>
|
| 292 |
+
</details>
|
| 293 |
+
|
| 294 |
+
## 📚 Documents
|
| 295 |
+
|
| 296 |
+
To learn more about how to use EvalPlus, please refer to:
|
| 297 |
+
|
| 298 |
+
- [EvalPlus Commands](./docs/cli.md)
|
| 299 |
+
- [EvalPerf](./docs/evalperf.md)
|
| 300 |
+
- [Program Execution](./docs/execution.md)
|
| 301 |
+
|
| 302 |
+
## 📜 Citation
|
| 303 |
+
|
| 304 |
+
```bibtex
|
| 305 |
+
@inproceedings{evalplus,
|
| 306 |
+
title = {Is Your Code Generated by Chat{GPT} Really Correct? Rigorous Evaluation of Large Language Models for Code Generation},
|
| 307 |
+
author = {Liu, Jiawei and Xia, Chunqiu Steven and Wang, Yuyao and Zhang, Lingming},
|
| 308 |
+
booktitle = {Thirty-seventh Conference on Neural Information Processing Systems},
|
| 309 |
+
year = {2023},
|
| 310 |
+
url = {https://openreview.net/forum?id=1qvx610Cu7},
|
| 311 |
+
}
|
| 312 |
+
|
| 313 |
+
@inproceedings{evalperf,
|
| 314 |
+
title = {Evaluating Language Models for Efficient Code Generation},
|
| 315 |
+
author = {Liu, Jiawei and Xie, Songrun and Wang, Junhao and Wei, Yuxiang and Ding, Yifeng and Zhang, Lingming},
|
| 316 |
+
booktitle = {First Conference on Language Modeling},
|
| 317 |
+
year = {2024},
|
| 318 |
+
url = {https://openreview.net/forum?id=IBCBMeAhmC},
|
| 319 |
+
}
|
| 320 |
+
```
|
| 321 |
+
|
| 322 |
+
## 🙏 Acknowledgement
|
| 323 |
+
|
| 324 |
+
- [HumanEval](https://github.com/openai/human-eval)
|
| 325 |
+
- [MBPP](https://github.com/google-research/google-research/tree/master/mbpp)
|
evalplus/build/lib/evalplus/__init__.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
try:
|
| 2 |
+
from evalplus._version import __version__, __version_tuple__
|
| 3 |
+
except ImportError:
|
| 4 |
+
__version__ = "local-dev"
|
evalplus/build/lib/evalplus/_version.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# file generated by setuptools_scm
|
| 2 |
+
# don't change, don't track in version control
|
| 3 |
+
TYPE_CHECKING = False
|
| 4 |
+
if TYPE_CHECKING:
|
| 5 |
+
from typing import Tuple, Union
|
| 6 |
+
VERSION_TUPLE = Tuple[Union[int, str], ...]
|
| 7 |
+
else:
|
| 8 |
+
VERSION_TUPLE = object
|
| 9 |
+
|
| 10 |
+
version: str
|
| 11 |
+
__version__: str
|
| 12 |
+
__version_tuple__: VERSION_TUPLE
|
| 13 |
+
version_tuple: VERSION_TUPLE
|
| 14 |
+
|
| 15 |
+
__version__ = version = '0.4.0.dev33'
|
| 16 |
+
__version_tuple__ = version_tuple = (0, 4, 0, 'dev33')
|
evalplus/build/lib/evalplus/codegen.py
ADDED
|
@@ -0,0 +1,272 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import os
|
| 3 |
+
from typing import Dict, List, Optional
|
| 4 |
+
|
| 5 |
+
from evalplus.data import get_evalperf_data, get_human_eval_plus, get_mbpp_plus
|
| 6 |
+
from evalplus.provider import DecoderBase, make_model
|
| 7 |
+
from evalplus.sanitize import sanitize
|
| 8 |
+
from evalplus.utils import progress
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def codegen(
|
| 12 |
+
target_path: str,
|
| 13 |
+
model: DecoderBase,
|
| 14 |
+
dataset: Dict,
|
| 15 |
+
greedy=False,
|
| 16 |
+
n_samples=1,
|
| 17 |
+
id_range=None,
|
| 18 |
+
resume=True,
|
| 19 |
+
):
|
| 20 |
+
task2nexist = {}
|
| 21 |
+
if resume and target_path.endswith(".jsonl") and os.path.isfile(target_path):
|
| 22 |
+
with open(target_path, "r") as f:
|
| 23 |
+
for line in f:
|
| 24 |
+
if not line.strip():
|
| 25 |
+
continue
|
| 26 |
+
task_id = json.loads(line)["task_id"]
|
| 27 |
+
task2nexist[task_id] = task2nexist.get(task_id, 0) + 1
|
| 28 |
+
|
| 29 |
+
if target_path.endswith(".jsonl"):
|
| 30 |
+
raw_target_path = target_path.replace(".jsonl", ".raw.jsonl")
|
| 31 |
+
else:
|
| 32 |
+
raw_target_path = target_path + ".raw"
|
| 33 |
+
os.makedirs(target_path, exist_ok=True)
|
| 34 |
+
|
| 35 |
+
print(f"Sanitized code outputs will be saved to {target_path}")
|
| 36 |
+
print(f"Raw outputs will be saved to {raw_target_path}")
|
| 37 |
+
|
| 38 |
+
backend_type: str = type(model).__name__
|
| 39 |
+
with progress(backend_type) as p:
|
| 40 |
+
for task_id, task in p.track(dataset.items()):
|
| 41 |
+
if id_range is not None:
|
| 42 |
+
id_num = int(task_id.split("/")[1])
|
| 43 |
+
low, high = id_range
|
| 44 |
+
if id_num < low or id_num >= high:
|
| 45 |
+
p.console.print(f"Skipping {task_id} as it is not in {id_range}")
|
| 46 |
+
continue
|
| 47 |
+
|
| 48 |
+
if not target_path.endswith(".jsonl"):
|
| 49 |
+
p_name = task_id.replace("/", "_")
|
| 50 |
+
os.makedirs(os.path.join(target_path, p_name), exist_ok=True)
|
| 51 |
+
task2nexist[task_id] = len(
|
| 52 |
+
[
|
| 53 |
+
f
|
| 54 |
+
for f in os.listdir(os.path.join(target_path, p_name))
|
| 55 |
+
if f.endswith(".py")
|
| 56 |
+
]
|
| 57 |
+
)
|
| 58 |
+
|
| 59 |
+
n_more_samples = n_samples
|
| 60 |
+
log = f"Codegen: {task_id} @ {model}"
|
| 61 |
+
if resume and task2nexist.get(task_id, 0) > 0:
|
| 62 |
+
log += f" (resuming from {task2nexist[task_id]})"
|
| 63 |
+
n_more_samples -= task2nexist[task_id]
|
| 64 |
+
|
| 65 |
+
p.console.print(log)
|
| 66 |
+
|
| 67 |
+
sidx = n_samples - n_more_samples
|
| 68 |
+
while sidx < n_samples:
|
| 69 |
+
prompt = task["prompt"].strip() + "\n"
|
| 70 |
+
outputs = model.codegen(
|
| 71 |
+
prompt,
|
| 72 |
+
do_sample=not greedy,
|
| 73 |
+
num_samples=n_samples - sidx,
|
| 74 |
+
)
|
| 75 |
+
assert outputs, "No outputs from model!"
|
| 76 |
+
for impl in outputs:
|
| 77 |
+
solution = prompt + impl if model.is_direct_completion() else impl
|
| 78 |
+
sanitized_solution = sanitize(
|
| 79 |
+
solution, entrypoint=task["entry_point"]
|
| 80 |
+
)
|
| 81 |
+
if target_path.endswith(".jsonl"):
|
| 82 |
+
# Writing the sanitized version
|
| 83 |
+
with open(target_path, "a") as f:
|
| 84 |
+
f.write(
|
| 85 |
+
json.dumps(
|
| 86 |
+
{"task_id": task_id, "solution": sanitized_solution}
|
| 87 |
+
)
|
| 88 |
+
+ "\n"
|
| 89 |
+
)
|
| 90 |
+
|
| 91 |
+
# Writing the raw version
|
| 92 |
+
with open(raw_target_path, "a") as f:
|
| 93 |
+
f.write(
|
| 94 |
+
json.dumps({"task_id": task_id, "solution": solution})
|
| 95 |
+
+ "\n"
|
| 96 |
+
)
|
| 97 |
+
else:
|
| 98 |
+
# Writing the sanitized version
|
| 99 |
+
with open(
|
| 100 |
+
os.path.join(target_path, p_name, f"{sidx}.py"),
|
| 101 |
+
"w",
|
| 102 |
+
encoding="utf-8",
|
| 103 |
+
) as f:
|
| 104 |
+
f.write(sanitized_solution)
|
| 105 |
+
|
| 106 |
+
# Writing the raw version
|
| 107 |
+
with open(
|
| 108 |
+
os.path.join(raw_target_path, p_name, f"{sidx}.py"),
|
| 109 |
+
"w",
|
| 110 |
+
encoding="utf-8",
|
| 111 |
+
) as f:
|
| 112 |
+
f.write(solution)
|
| 113 |
+
sidx += 1
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
def run_codegen(
|
| 117 |
+
model: str,
|
| 118 |
+
dataset: str,
|
| 119 |
+
root: str = "evalplus_results",
|
| 120 |
+
bs: Optional[int] = None,
|
| 121 |
+
n_samples: int = 1,
|
| 122 |
+
temperature: float = 0.0,
|
| 123 |
+
resume: bool = True,
|
| 124 |
+
greedy: bool = False,
|
| 125 |
+
id_range: List = None,
|
| 126 |
+
version: str = "default",
|
| 127 |
+
backend: str = "vllm",
|
| 128 |
+
force_base_prompt: bool = False,
|
| 129 |
+
base_url: str = None,
|
| 130 |
+
tp: int = 1,
|
| 131 |
+
evalperf_type: str = None, # For EvalPerf
|
| 132 |
+
jsonl_fmt: bool = True,
|
| 133 |
+
attn_implementation: str = "eager",
|
| 134 |
+
device_map: Optional[str] = None,
|
| 135 |
+
trust_remote_code: bool = False,
|
| 136 |
+
enable_prefix_caching: bool = False,
|
| 137 |
+
enable_chunked_prefill: bool = False,
|
| 138 |
+
dtype: str = "bfloat16",
|
| 139 |
+
gptqmodel_backend: str = "auto", # For GPTQModel
|
| 140 |
+
gguf_file: Optional[str] = None
|
| 141 |
+
):
|
| 142 |
+
assert dataset in ["humaneval", "mbpp", "evalperf"], f"Invalid dataset {dataset}"
|
| 143 |
+
assert evalperf_type is None or evalperf_type in [
|
| 144 |
+
"instruct",
|
| 145 |
+
"perf-instruct",
|
| 146 |
+
"perf-CoT",
|
| 147 |
+
]
|
| 148 |
+
|
| 149 |
+
# Make dir for codes generated by each model
|
| 150 |
+
identifier = model.strip("./").replace("/", "--") + f"_{backend}_temp_{temperature}"
|
| 151 |
+
if evalperf_type:
|
| 152 |
+
identifier += f"-{evalperf_type}"
|
| 153 |
+
|
| 154 |
+
target_path = os.path.join(root, dataset, identifier)
|
| 155 |
+
if jsonl_fmt:
|
| 156 |
+
target_path += ".jsonl"
|
| 157 |
+
else:
|
| 158 |
+
os.makedirs(target_path, exist_ok=True)
|
| 159 |
+
|
| 160 |
+
if dataset == "humaneval":
|
| 161 |
+
dataset_dict = get_human_eval_plus(version=version)
|
| 162 |
+
elif dataset == "mbpp":
|
| 163 |
+
dataset_dict = get_mbpp_plus(version=version)
|
| 164 |
+
elif dataset == "evalperf":
|
| 165 |
+
original_dataset = {**get_human_eval_plus(), **get_mbpp_plus()}
|
| 166 |
+
dataset_dict = {k: original_dataset[k] for k in get_evalperf_data()}
|
| 167 |
+
assert id_range is None, "id_range not supported for evalperf"
|
| 168 |
+
else:
|
| 169 |
+
raise ValueError(f"Invalid dataset {dataset}")
|
| 170 |
+
|
| 171 |
+
all_tasks_complete = False
|
| 172 |
+
if jsonl_fmt and os.path.isfile(target_path):
|
| 173 |
+
task_counts = {}
|
| 174 |
+
with open(target_path, "r") as f:
|
| 175 |
+
for line in f:
|
| 176 |
+
if not line.strip():
|
| 177 |
+
continue
|
| 178 |
+
data = json.loads(line)
|
| 179 |
+
task_id = data["task_id"]
|
| 180 |
+
task_counts[task_id] = task_counts.get(task_id, 0) + 1
|
| 181 |
+
|
| 182 |
+
all_tasks_complete = all(
|
| 183 |
+
task_counts.get(task_id, 0) >= n_samples
|
| 184 |
+
for task_id in dataset_dict.keys()
|
| 185 |
+
)
|
| 186 |
+
|
| 187 |
+
if all_tasks_complete:
|
| 188 |
+
print("All samples are already cached. Skipping codegen.")
|
| 189 |
+
return target_path
|
| 190 |
+
|
| 191 |
+
if greedy and (temperature != 0 or bs != 1 or n_samples != 1):
|
| 192 |
+
temperature = 0.0
|
| 193 |
+
bs = 1
|
| 194 |
+
n_samples = 1
|
| 195 |
+
print("Greedy decoding ON (--greedy): setting bs=1, n_samples=1, temperature=0")
|
| 196 |
+
|
| 197 |
+
if id_range is not None:
|
| 198 |
+
assert len(id_range) == 2, "id_range must be a list of length 2"
|
| 199 |
+
assert id_range[0] < id_range[1], "id_range must be increasing"
|
| 200 |
+
id_range = tuple(id_range)
|
| 201 |
+
|
| 202 |
+
if bs is None:
|
| 203 |
+
bs = min(n_samples, 32)
|
| 204 |
+
print(f"Setting batch size to {bs}")
|
| 205 |
+
|
| 206 |
+
# Make project dir
|
| 207 |
+
os.makedirs(root, exist_ok=True)
|
| 208 |
+
# Make dataset dir
|
| 209 |
+
os.makedirs(os.path.join(root, dataset), exist_ok=True)
|
| 210 |
+
|
| 211 |
+
# Model instructions
|
| 212 |
+
instruction_prefix = "Please provide a self-contained Python script that solves the following problem in a markdown code block:"
|
| 213 |
+
response_prefix = "Below is a Python script with a self-contained function that solves the problem and passes corresponding tests:"
|
| 214 |
+
|
| 215 |
+
if evalperf_type == "perf-instruct":
|
| 216 |
+
instruction_prefix = "Please provide an efficient and self-contained Python script that solves the following problem in a markdown code block:"
|
| 217 |
+
response_prefix = "Below is a Python script with a self-contained function that efficiently solves the problem and passes corresponding tests:"
|
| 218 |
+
elif evalperf_type == "perf-CoT":
|
| 219 |
+
instruction_prefix = "Think step by step: please provide an efficient and self-contained Python script that solves the following problem in a markdown code block:"
|
| 220 |
+
response_prefix = "Below is a Python script with a self-contained function that efficiently solves the problem and passes corresponding tests:"
|
| 221 |
+
elif evalperf_type is not None and evalperf_type != "instruct":
|
| 222 |
+
raise ValueError(f"Invalid evalperf_type: {evalperf_type}")
|
| 223 |
+
|
| 224 |
+
# Model creation
|
| 225 |
+
model_runner = make_model(
|
| 226 |
+
model=model,
|
| 227 |
+
backend=backend,
|
| 228 |
+
batch_size=bs,
|
| 229 |
+
temperature=temperature,
|
| 230 |
+
force_base_prompt=force_base_prompt,
|
| 231 |
+
dataset=dataset,
|
| 232 |
+
base_url=base_url,
|
| 233 |
+
tp=tp,
|
| 234 |
+
instruction_prefix=instruction_prefix,
|
| 235 |
+
response_prefix=response_prefix,
|
| 236 |
+
device_map=device_map,
|
| 237 |
+
attn_implementation=attn_implementation,
|
| 238 |
+
trust_remote_code=trust_remote_code,
|
| 239 |
+
enable_prefix_caching=enable_prefix_caching,
|
| 240 |
+
enable_chunked_prefill=enable_chunked_prefill,
|
| 241 |
+
dtype=dtype,
|
| 242 |
+
gptqmodel_backend=gptqmodel_backend,
|
| 243 |
+
gguf_file=gguf_file,
|
| 244 |
+
)
|
| 245 |
+
|
| 246 |
+
codegen(
|
| 247 |
+
target_path=target_path,
|
| 248 |
+
dataset=dataset_dict,
|
| 249 |
+
greedy=greedy,
|
| 250 |
+
model=model_runner,
|
| 251 |
+
n_samples=n_samples,
|
| 252 |
+
resume=resume,
|
| 253 |
+
id_range=id_range,
|
| 254 |
+
)
|
| 255 |
+
|
| 256 |
+
# force shutdown the model runner
|
| 257 |
+
del model_runner
|
| 258 |
+
import gc
|
| 259 |
+
|
| 260 |
+
gc.collect()
|
| 261 |
+
|
| 262 |
+
return target_path
|
| 263 |
+
|
| 264 |
+
|
| 265 |
+
def main():
|
| 266 |
+
from fire import Fire
|
| 267 |
+
|
| 268 |
+
Fire(run_codegen)
|
| 269 |
+
|
| 270 |
+
|
| 271 |
+
if __name__ == "__main__":
|
| 272 |
+
main()
|
evalplus/build/lib/evalplus/config.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
## EvalPlus
|
| 2 |
+
DEFAULT_GT_TIME_LIMIT_FACTOR = 4.0
|
| 3 |
+
DEFAULT_MIN_TIME_LIMIT = 1.0
|
| 4 |
+
|
| 5 |
+
## EvalPerf
|
| 6 |
+
|
| 7 |
+
### General
|
| 8 |
+
PERF_PROFILE_ROUNDS = 1
|
| 9 |
+
PERF_RAM_GB_PER_PROC = 12
|
| 10 |
+
|
| 11 |
+
### Evaluation Phase
|
| 12 |
+
PERF_EVAL_TIMEOUT_SECOND = 45
|
| 13 |
+
|
| 14 |
+
### Curation Phase
|
| 15 |
+
PERF_CURATE_TIMEOUT_SECOND = 20
|
| 16 |
+
PREF_CURATE_MIN_INSTRUCTION = 10000
|
evalplus/build/lib/evalplus/data/__init__.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
|
| 3 |
+
from datasets import load_dataset
|
| 4 |
+
|
| 5 |
+
from evalplus.data.humaneval import get_human_eval_plus, get_human_eval_plus_hash
|
| 6 |
+
from evalplus.data.mbpp import get_mbpp_plus, get_mbpp_plus_hash
|
| 7 |
+
from evalplus.data.utils import load_solutions, write_directory, write_jsonl
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def get_evalperf_data():
    """Fetch the EvalPerf dataset from the Hub and return it keyed by task_id.

    The ``pe_input`` field is stored as a JSON string and is decoded into a
    Python object here.
    """
    records = load_dataset("evalplus/evalperf", split="test").to_list()
    for record in records:
        record["pe_input"] = json.loads(record["pe_input"])
    return {record["task_id"]: record for record in records}
|
evalplus/build/lib/evalplus/data/humaneval.py
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import hashlib
|
| 2 |
+
import json
|
| 3 |
+
import os
|
| 4 |
+
from typing import Dict
|
| 5 |
+
|
| 6 |
+
from evalplus.data.utils import (
|
| 7 |
+
CACHE_DIR,
|
| 8 |
+
completeness_check,
|
| 9 |
+
get_dataset_metadata,
|
| 10 |
+
make_cache,
|
| 11 |
+
stream_jsonl,
|
| 12 |
+
)
|
| 13 |
+
|
| 14 |
+
HUMANEVAL_PLUS_VERSION = "v0.1.10"
|
| 15 |
+
HUMANEVAL_OVERRIDE_PATH = os.environ.get("HUMANEVAL_OVERRIDE_PATH", None)
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def _ready_human_eval_plus_path(mini=False, noextreme=False, version="default") -> str:
    """Ensure the HumanEvalPlus jsonl is cached locally and return its path.

    Args:
        mini: use the mini subset.
        noextreme: use the variant without extreme inputs.
        version: dataset release tag, or "default" for HUMANEVAL_PLUS_VERSION.

    Returns:
        Path to the cached (or explicitly overridden) jsonl file.
    """
    # An explicit override short-circuits all download/caching logic.
    if HUMANEVAL_OVERRIDE_PATH:
        return HUMANEVAL_OVERRIDE_PATH

    version = HUMANEVAL_PLUS_VERSION if version == "default" else version
    # BUG FIX: the resolved `version` was previously ignored and
    # HUMANEVAL_PLUS_VERSION was always passed, so requesting a pinned
    # version silently fetched the default release.
    url, plus_path = get_dataset_metadata("HumanEvalPlus", version, mini, noextreme)
    make_cache(url, plus_path)

    return plus_path
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def get_human_eval_plus_hash(mini=False, noextreme=False, version="default") -> str:
    """Get the MD5 hash of the locally cached HumanEvalPlus file.

    Returns:
        str: hex MD5 digest of the requested dataset file.
    """
    # BUG FIX: `version` was hard-coded to "default" here, so hashing a
    # pinned dataset version actually hashed the default release instead.
    plus_path = _ready_human_eval_plus_path(mini, noextreme, version=version)
    with open(plus_path, "rb") as f:
        plus = f.read()
    return hashlib.md5(plus).hexdigest()
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def get_human_eval_plus(
    err_incomplete=True, mini=False, noextreme=False, version="default"
) -> Dict[str, Dict]:
    """Load HumanEvalPlus from the local cache, keyed by task_id.

    Args:
        err_incomplete: raise if any task is missing a required field.
        mini: use the mini subset of HumanEvalPlus.
        noextreme: use the variant without extreme inputs.
        version: dataset release tag, or "default" for the pinned version.

    Returns:
        Mapping of task_id to task dicts carrying "prompt", "contract",
        "canonical_solution", "base_input", "plus_input", and "atol".
    """
    path = _ready_human_eval_plus_path(mini=mini, noextreme=noextreme, version=version)
    tasks = {task["task_id"]: task for task in stream_jsonl(path)}
    if err_incomplete:
        completeness_check("HumanEval+", tasks)
    return tasks
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def get_human_eval() -> Dict[str, Dict]:
    """Get HumanEval from OpenAI's github repo and return it keyed by task_id.

    Returns:
        Dict[str, Dict]: task_id -> dict with keys "task_id", "prompt",
        "test" (test cases wrapped in a `check` function) and "entry_point".
    """
    # Download (and decompress) into the cache on first use.
    human_eval_path = os.path.join(CACHE_DIR, "HumanEval.jsonl")
    make_cache(
        "https://github.com/openai/human-eval/raw/master/data/HumanEval.jsonl.gz",
        human_eval_path,
    )

    # FIX: close the file deterministically with a context manager; the
    # original leaked the handle from a bare `open(...).read()`.
    with open(human_eval_path, "r") as f:
        human_eval = [json.loads(line) for line in f.read().split("\n") if line]

    # Handle 115_max_fill.py to make its docstring well-formed
    human_eval[115]["prompt"] = "import math\n" + human_eval[115]["prompt"].replace(
        "import math\n", ""
    )

    return {task["task_id"]: task for task in human_eval}
|
evalplus/build/lib/evalplus/data/mbpp.py
ADDED
|
@@ -0,0 +1,203 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import hashlib
|
| 2 |
+
import json
|
| 3 |
+
import os
|
| 4 |
+
from typing import Dict
|
| 5 |
+
|
| 6 |
+
import wget
|
| 7 |
+
|
| 8 |
+
from evalplus.data.utils import (
|
| 9 |
+
CACHE_DIR,
|
| 10 |
+
completeness_check,
|
| 11 |
+
get_dataset_metadata,
|
| 12 |
+
make_cache,
|
| 13 |
+
stream_jsonl,
|
| 14 |
+
)
|
| 15 |
+
|
| 16 |
+
MBPP_PLUS_VERSION = "v0.2.0"
|
| 17 |
+
MBPP_OVERRIDE_PATH = os.environ.get("MBPP_OVERRIDE_PATH", None)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def _ready_mbpp_plus_path(mini=False, noextreme=False, version="default") -> str:
    """Ensure the MbppPlus jsonl is cached locally and return its path."""
    assert mini is False, "Mini version of MBPP+ is not available yet."

    # An explicit override short-circuits all download/caching logic.
    if MBPP_OVERRIDE_PATH:
        return MBPP_OVERRIDE_PATH

    resolved = MBPP_PLUS_VERSION if version == "default" else version
    url, cache_path = get_dataset_metadata("MbppPlus", resolved, mini, noextreme)
    make_cache(url, cache_path)
    return cache_path
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def mbpp_serialize_inputs(task_id: str, inputs: list) -> list:
    """Convert test inputs of a few special MBPP tasks into JSON-friendly forms.

    Sets become lists (task 115) and complex numbers become strings
    (tasks 124 and 252); all other tasks pass through unchanged.
    """
    numeric_id = int(task_id.split("/")[-1])

    if numeric_id == 115:
        # First argument is a list of sets -> list of lists.
        return [[[list(item) for item in inp[0]]] for inp in inputs]
    if numeric_id == 124:
        # (float, complex) pair -> pair of strings.
        return [(str(inp[0]), str(inp[1])) for inp in inputs]
    if numeric_id == 252:
        # Single complex argument -> string.
        return [[str(inp[0])] for inp in inputs]

    return inputs
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def mbpp_deserialize_inputs(task_id: str, inputs: list) -> list:
    """Rebuild MBPP+ test inputs into the exact Python types each task expects.

    JSON serialization flattens tuples/sets/complex numbers into lists or
    strings; this reverses that, dispatching on the numeric task id.
    """
    task_id = int(task_id.split("/")[-1])
    # Tasks whose argument lists contain tuples (serialized as lists).
    if task_id in [
        2,
        116,
        132,
        143,
        222,
        261,
        273,
        394,
        399,
        421,
        424,
        429,
        470,
        560,
        579,
        596,
        616,
        630,
        726,
        740,
        744,
        809,
    ]:
        modified_inputs = [[tuple(lst) for lst in inp] for inp in inputs]

    # Tasks whose arguments are lists *of* tuples.
    elif task_id in [
        63,
        64,
        70,
        94,
        120,
        237,
        272,
        299,
        400,
        409,
        417,
        438,
        473,
        614,
        780,
    ]:
        modified_inputs = [
            [[tuple(lst) for lst in lst_lst] for lst_lst in inp] for inp in inputs
        ]

    # First argument is a list of tuples; second argument is left as-is.
    elif task_id in [75, 413, 444, 753]:
        modified_inputs = [
            [[tuple(lst) for lst in inp[0]]] + [inp[1]] for inp in inputs
        ]

    # Second argument becomes a tuple.
    elif task_id == 106 or task_id == 750:
        modified_inputs = [[inp[0]] + [tuple(inp[1])] for inp in inputs]

    # First argument is a list of sets.
    elif task_id == 115:
        modified_inputs = [
            [
                [
                    # NOTE(review): `{}` is an empty *dict*, not an empty set;
                    # this looks unintended but is preserved as-is.
                    set(item) if isinstance(item, list) and len(item) else {}
                    for item in inp[0]
                ]
            ]
            for inp in inputs
        ]

    # (float, complex) pair; the complex value was serialized via str().
    elif task_id == 124:
        modified_inputs = [(float(inp[0]), complex(inp[1])) for inp in inputs]

    # First argument becomes a tuple; the rest stay unchanged.
    elif task_id in [250, 405, 446, 617, 720, 763, 808]:
        modified_inputs = [[tuple(inp[0])] + [inp[1]] for inp in inputs]

    # Tuples of tuples: convert inner lists first, then the outer ones.
    elif task_id in [259, 401, 445]:
        modified_inputs = [
            [[tuple(lst) for lst in lst_lst] for lst_lst in inp] for inp in inputs
        ]
        modified_inputs = [[tuple(lst) for lst in inp] for inp in modified_inputs]

    # Mixed items: inner lists become tuples, then the outer list does too.
    elif task_id == 278:
        modified_inputs = [
            [[tuple(item) if isinstance(item, list) else item for item in inp[0]]]
            for inp in inputs
        ]
        modified_inputs = [[tuple(lst) for lst in inp] for inp in modified_inputs]

    # First argument becomes a tuple; two trailing scalars stay unchanged.
    elif task_id == 307:
        modified_inputs = [[tuple(inp[0])] + [inp[1], inp[2]] for inp in inputs]

    # Dict argument whose values become tuples.
    elif task_id == 722:
        modified_inputs = [
            [{key: tuple(value) for key, value in inp[0].items()}] + inp[1:]
            for inp in inputs
        ]

    # Single complex argument serialized via str().
    elif task_id == 252:
        modified_inputs = [[complex(inp[0])] for inp in inputs]

    # Arbitrarily nested lists become nested tuples.
    elif task_id in [580, 615, 791]:

        def turn_all_list_into_tuple(inp):
            if isinstance(inp, list):
                return tuple([turn_all_list_into_tuple(item) for item in inp])
            return inp

        modified_inputs = [turn_all_list_into_tuple(inp) for inp in inputs]

    # All other tasks round-trip through JSON unchanged.
    else:
        modified_inputs = inputs

    return modified_inputs
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
def get_mbpp() -> Dict[str, Dict]:
    """Get sanitized MBPP from Google's Github repo, keyed by task_id string."""
    mbpp_path = os.path.join(CACHE_DIR, "sanitized-mbpp.json")

    if not os.path.exists(mbpp_path):
        os.makedirs(CACHE_DIR, exist_ok=True)
        # Install MBPP-sanitized from scratch on first use.
        print("Downloading original MBPP dataset...")
        wget.download(
            "https://github.com/google-research/google-research/raw/master/mbpp/sanitized-mbpp.json",
            mbpp_path,
        )

    with open(mbpp_path, "r") as f:
        tasks = json.load(f)

    return {str(t["task_id"]): t for t in tasks}
|
| 179 |
+
|
| 180 |
+
|
| 181 |
+
def get_mbpp_plus(
    err_incomplete=True, mini=False, noextreme=False, version="default"
) -> Dict[str, Dict]:
    """Load MBPP+ from the local cache with test inputs restored to true types."""
    path = _ready_mbpp_plus_path(mini=mini, noextreme=noextreme, version=version)
    tasks = {task["task_id"]: task for task in stream_jsonl(path)}

    # JSON round-tripping loses tuples/sets/complex values; restore them.
    for tid, task in tasks.items():
        task["base_input"] = mbpp_deserialize_inputs(tid, task["base_input"])
        task["plus_input"] = mbpp_deserialize_inputs(tid, task["plus_input"])

    if err_incomplete:
        completeness_check("MBPP+", tasks)
    return tasks
|
| 193 |
+
|
| 194 |
+
|
| 195 |
+
def get_mbpp_plus_hash(mini=False, noextreme=False, version="default") -> str:
    """Get the hash of MbppPlus.

    Returns:
        str: hex MD5 digest of the locally cached MbppPlus file.
    """
    path = _ready_mbpp_plus_path(mini=mini, noextreme=noextreme, version=version)
    with open(path, "rb") as f:
        raw = f.read()
    return hashlib.md5(raw).hexdigest()
|
evalplus/build/lib/evalplus/data/utils.py
ADDED
|
@@ -0,0 +1,166 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gzip
|
| 2 |
+
import json
|
| 3 |
+
import os
|
| 4 |
+
from os import PathLike
|
| 5 |
+
from typing import Dict, Iterable
|
| 6 |
+
|
| 7 |
+
import tempdir
|
| 8 |
+
import wget
|
| 9 |
+
from appdirs import user_cache_dir
|
| 10 |
+
|
| 11 |
+
CACHE_DIR = user_cache_dir("evalplus")
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def get_dataset_metadata(name: str, version: str, mini: bool, noextreme: bool = False):
    """Return the (release URL, local cache path) pair for a dataset variant."""
    assert name in ["HumanEvalPlus", "MbppPlus"], f"Unknown/unsupported dataset: {name}"
    assert not (mini and noextreme), "Cannot have both mini and noextreme"

    # The variant suffix selects which release artifact is downloaded.
    if mini:
        extra = "-Mini"
    elif noextreme:
        extra = "-NoExtreme"
    else:
        extra = ""

    url = f"https://github.com/evalplus/{name.lower()}_release/releases/download/{version}/{name}{extra}.jsonl.gz"
    cache_path = os.path.join(CACHE_DIR, f"{name}{extra}-{version}.jsonl")
    return url, cache_path
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def make_cache(gzip_url, cache_path):
    """Download a gzipped jsonl from `gzip_url` and cache the decompressed
    text at `cache_path`. No-op if the cache file already exists.
    """
    # IMPROVEMENT: use the stdlib tempfile instead of the third-party
    # `tempdir` package -- identical semantics (context manager yielding a
    # temp directory path that is removed on exit), one fewer dependency.
    import tempfile

    if os.path.exists(cache_path):
        return

    print(f"Downloading dataset from {gzip_url}")
    with tempfile.TemporaryDirectory() as tmpdir:
        gz_path = os.path.join(tmpdir, "data.jsonl.gz")
        wget.download(gzip_url, gz_path)

        with gzip.open(gz_path, "rb") as f:
            content = f.read().decode("utf-8")

    # Create CACHE_DIR if it does not exist, then persist the decompressed text.
    os.makedirs(CACHE_DIR, exist_ok=True)
    with open(cache_path, "w") as f:
        f.write(content)
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def write_jsonl(
    filename: str, data: Iterable[Dict], append: bool = False, drop_builtin: bool = True
):
    """Write an iterable of dicts to a jsonl file.

    Args:
        filename: output path; a ".gz" suffix triggers gzip compression.
        append: append to the file instead of truncating it.
        drop_builtin: drop keys starting with "_" before serialization.
    """
    mode = "ab" if append else "wb"
    path = os.path.expanduser(filename)

    def encoded_lines():
        # One UTF-8-encoded JSON line per record, filtered if requested.
        for record in data:
            if drop_builtin:
                record = {k: v for k, v in record.items() if not k.startswith("_")}
            yield (json.dumps(record) + "\n").encode("utf-8")

    if path.endswith(".gz"):
        with open(path, mode) as fp, gzip.GzipFile(fileobj=fp, mode="wb") as gzfp:
            for line in encoded_lines():
                gzfp.write(line)
    else:
        with open(path, mode) as fp:
            for line in encoded_lines():
                fp.write(line)
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
def stream_jsonl(filename: str) -> Iterable[Dict]:
    """Yield each non-blank line of a (possibly gzipped) jsonl file as a dict."""
    opener = gzip.open if filename.endswith(".gz") else open
    with opener(filename, "rt") as fp:
        for line in fp:
            # Skip blank / whitespace-only lines.
            if line.strip():
                yield json.loads(line)
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def load_solutions(sample_path: PathLike) -> Iterable[Dict]:
    """We accept two formats of inputs.
    + `sample.jsonl` which is the format from HumanEval, i.e., {task_id, completion or solution}.
    + A folder which contains sub-folders named after the task_id. Each sub-folder
      contains samples named in `[?].py` where `?` is the solution id starting with 0.
      Different from `sample.jsonl`, the solutions must be complete (with prompt prefix).
    """
    if os.path.isfile(sample_path):
        # jsonl format: validate each record and tag it with an identifier.
        for i, sample in enumerate(stream_jsonl(sample_path)):
            assert (
                "completion" in sample or "solution" in sample
            ), "No completion or solution found in sample!"
            assert "solution" not in sample or isinstance(
                sample["solution"], str
            ), "Solution must be a string! If you have multiple solutions, please repeat the task_id."
            assert "completion" not in sample or isinstance(
                sample["completion"], str
            ), "Completion must be a string! If you have multiple solutions, please repeat the task_id."

            sample["_identifier"] = (
                sample["task_id"] + f" (line {i+1} in {sample_path})"
            )
            yield sample
        return

    # Folder format: one sub-directory per task, one ".py" file per solution.
    for task_folder in os.listdir(sample_path):
        task_dir = os.path.join(sample_path, task_folder)
        if not os.path.isdir(task_dir):
            continue

        for entry in os.listdir(task_dir):
            entry_path = os.path.join(task_dir, entry)
            if os.path.isfile(entry_path) and entry_path.endswith(".py"):
                with open(entry_path, "r") as f:
                    code = f.read()
                yield {
                    "_identifier": entry_path,
                    "_path": entry_path,
                    "task_id": task_folder.replace("_", "/"),
                    "solution": code,
                }
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
def write_directory(directory: PathLike, data: Iterable[Dict]):
    """Write each sample's solution to `directory/<task_id>/<k>.py`,
    numbering repeated task_ids sequentially from 0."""
    os.makedirs(directory, exist_ok=True)
    sample_counts = {}
    for sample in data:
        assert "solution" in sample, "Samples must come with `solution` field!"
        folder_name = sample["task_id"].replace("/", "_")
        folder = os.path.join(directory, folder_name)
        os.makedirs(folder, exist_ok=True)
        index = sample_counts.get(folder_name, 0)
        with open(os.path.join(folder, f"{index}.py"), "w") as f:
            f.write(sample["solution"])
        sample_counts[folder_name] = index + 1
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
def completeness_check(name, plus):
    """Assert every task in `plus` carries all fields EvalPlus evaluation needs."""
    required = (
        "prompt",
        "contract",
        "canonical_solution",
        "base_input",
        "plus_input",
        "atol",
    )
    for task_id, task in plus.items():
        for key in required:
            assert key in task, f"{key} not found in {name} #{task_id}!"
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
def to_raw(string):
    """Return `string` with control characters rendered as escape sequences
    (e.g. a newline becomes the two characters backslash + "n")."""
    escaped = string.encode("unicode-escape").decode()
    # unicode-escape doubles literal backslashes; collapse them back.
    return escaped.replace("\\\\", "\\")
|
evalplus/build/lib/evalplus/eval/__init__.py
ADDED
|
@@ -0,0 +1,316 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# The MIT License
|
| 2 |
+
#
|
| 3 |
+
# Copyright (c) OpenAI (https://openai.com)
|
| 4 |
+
#
|
| 5 |
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
+
# of this software and associated documentation files (the "Software"), to deal
|
| 7 |
+
# in the Software without restriction, including without limitation the rights
|
| 8 |
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
+
# copies of the Software, and to permit persons to whom the Software is
|
| 10 |
+
# furnished to do so, subject to the following conditions:
|
| 11 |
+
#
|
| 12 |
+
# The above copyright notice and this permission notice shall be included in
|
| 13 |
+
# all copies or substantial portions of the Software.
|
| 14 |
+
#
|
| 15 |
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 18 |
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 19 |
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 20 |
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
| 21 |
+
# THE SOFTWARE.
|
| 22 |
+
|
| 23 |
+
import itertools
|
| 24 |
+
import multiprocessing
|
| 25 |
+
import os
|
| 26 |
+
import time
|
| 27 |
+
from multiprocessing import Array, Value
|
| 28 |
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
| 29 |
+
|
| 30 |
+
import numpy as np
|
| 31 |
+
import psutil
|
| 32 |
+
|
| 33 |
+
from evalplus.config import *
|
| 34 |
+
from evalplus.eval._special_oracle import (
|
| 35 |
+
MBPP_OUTPUT_NOT_NONE_TASKS,
|
| 36 |
+
MBPP_OUTPUT_SET_EQ_TASKS,
|
| 37 |
+
_digit_distance_nums,
|
| 38 |
+
_poly,
|
| 39 |
+
_surface_Area,
|
| 40 |
+
)
|
| 41 |
+
from evalplus.eval.utils import (
|
| 42 |
+
create_tempdir,
|
| 43 |
+
reliability_guard,
|
| 44 |
+
swallow_io,
|
| 45 |
+
time_limit,
|
| 46 |
+
)
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def compatible_eval_result(results: Dict) -> Dict:
    """Upgrade legacy eval results in place: the old `files` list becomes an
    `nfiles` count. Returns the (mutated) input dict."""
    for record in results["eval"].values():
        if "files" in record and "nfiles" not in record:
            record["nfiles"] = len(record.pop("files"))
    return results
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
# unbiased estimator from https://github.com/openai/human-eval
|
| 59 |
+
def estimate_pass_at_k(
    num_samples: Union[int, List[int], np.ndarray],
    num_correct: Union[List[int], np.ndarray],
    k: int,
) -> np.ndarray:
    """Unbiased pass@k estimator (from openai/human-eval), one value per problem."""

    def single_estimate(n: int, c: int, k: int) -> float:
        # 1 - C(n - c, k) / C(n, k), computed as a stable telescoping product.
        if n - c < k:
            return 1.0
        return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))

    if isinstance(num_samples, int):
        # One shared sample count for every problem.
        samples_iter = itertools.repeat(num_samples, len(num_correct))
    else:
        assert len(num_samples) == len(num_correct)
        samples_iter = iter(num_samples)

    return np.array(
        [single_estimate(int(n), int(c), k) for n, c in zip(samples_iter, num_correct)]
    )
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
# Human-readable status strings returned by untrusted_check().
PASS = "pass"
FAIL = "fail"
TIMEOUT = "timeout"

# Integer codes used for the cross-process shared `stat` Value.
_SUCCESS = 0
_FAILED = 1
_TIMEOUT = 2
_UNKNOWN = 3  # initial state: the child process never reported back

# Child status code -> status string; _UNKNOWN maps to None, which
# untrusted_check() treats as a timeout.
_mapping = {_SUCCESS: PASS, _FAILED: FAIL, _TIMEOUT: TIMEOUT, _UNKNOWN: None}
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
def query_maximum_memory_bytes() -> Optional[int]:
    """Resolve the memory cap (bytes) for sandboxed execution.

    Defaults to 4 GiB, overridable via EVALPLUS_MAX_MEMORY_BYTES; the value
    is clamped to the machine's total RAM. A resolved value of -1 disables
    the cap (returns None).
    """
    requested = int(os.getenv("EVALPLUS_MAX_MEMORY_BYTES", 4 * 1024 * 1024 * 1024))
    capped = min(requested, psutil.virtual_memory().total)
    return None if capped == -1 else capped
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
def is_floats(x) -> bool:
    """True for a float, a non-empty list/tuple of floats, or a float ndarray."""
    if isinstance(x, float):
        return True
    if isinstance(x, (list, tuple)):
        return bool(x) and all(isinstance(item, float) for item in x)
    if isinstance(x, np.ndarray):
        return x.dtype in (np.float64, np.float32)
    return False
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
def unsafe_execute(
    dataset: str,
    entry_point: str,
    code: str,
    inputs,
    expected: List,
    time_limits,
    atol,
    fast_check,
    stat,  # Value
    details,  # Array
    progress,  # Value
):
    """Execute untrusted `code` and check it against `expected` outputs.

    Runs inside a child process (see `untrusted_check`); results are reported
    back through the shared-memory objects `stat` (overall status code),
    `details` (per-test pass flags) and `progress` (tests finished so far).
    """
    with create_tempdir():
        # These system calls are needed when cleaning up tempdir.
        import os
        import shutil

        # Save originals so cleanup still works even if the executed code
        # tampers with (or a guard disables) these functions.
        rmtree = shutil.rmtree
        rmdir = os.rmdir
        chdir = os.chdir
        #reliability_guard(maximum_memory_bytes=query_maximum_memory_bytes())
        exec_globals = {}
        try:
            # Define the candidate solution and look up its entry point.
            with swallow_io():
                exec(code, exec_globals)
                fn = exec_globals[entry_point]

            for i, inp in enumerate(inputs):
                try:
                    # Each test gets its own wall-clock limit; stdout/stderr
                    # of the candidate code is suppressed.
                    with time_limit(time_limits[i]):
                        with swallow_io():
                            out = fn(*inp)

                    exp = expected[i]
                    exact_match = out == exp

                    # ================================================ #
                    # ============== special oracles ================= #
                    if dataset == "mbpp":
                        if "are_equivalent" == entry_point:  # Mbpp/164 special oracle
                            exact_match = exact_match or True
                        elif "sum_div" == entry_point:  # Mbpp/295 special oracle
                            exact_match = exact_match or out == 0
                        elif "surface_Area" == entry_point:  # Mbpp/581 special oracle
                            exact_match = (
                                exact_match or abs(out - _surface_Area(*inp)) <= atol
                            )
                        elif (
                            "digit_distance_nums" == entry_point
                        ):  # Mbpp/558 special oracle
                            exact_match = exact_match or out == _digit_distance_nums(
                                *inp
                            )
                        elif entry_point in MBPP_OUTPUT_SET_EQ_TASKS:
                            # Order-insensitive comparison.
                            exact_match = set(out) == set(exp)
                        elif entry_point in MBPP_OUTPUT_NOT_NONE_TASKS:
                            # exp is True if not None
                            # False if None
                            if isinstance(out, bool):
                                exact_match = out == exp
                            else:
                                exact_match = exp == (out is not None)

                    if dataset == "humaneval":
                        if "find_zero" == entry_point:
                            # Accept any root whose polynomial value is ~0.
                            assert abs(_poly(*inp, out)) <= atol
                            details[i] = True
                            progress.value += 1
                            continue
                    # ============== special oracles ================= #
                    # ================================================ #

                    if atol == 0 and is_floats(exp):
                        atol = 1e-6  # enforce atol for float comparison
                    if not exact_match and atol != 0:
                        # explicitly set rtol=1e-07
                        # to match `np.testing.assert_allclose`'s default values
                        assert type(out) == type(exp)
                        if isinstance(exp, (list, tuple)):
                            assert len(out) == len(exp)
                        assert np.allclose(out, exp, rtol=1e-07, atol=atol)
                    else:
                        assert exact_match
                except BaseException:
                    # Any failure (wrong answer, timeout, crash) marks this
                    # test failed; fast_check aborts the whole run.
                    details[i] = False
                    progress.value += 1
                    if fast_check:
                        raise
                    continue

                details[i] = True
                progress.value += 1

            stat.value = _SUCCESS
        except BaseException:
            stat.value = _FAILED
        # Needed for cleaning up.
        shutil.rmtree = rmtree
        os.rmdir = rmdir
        os.chdir = chdir
|
| 223 |
+
|
| 224 |
+
|
| 225 |
+
def untrusted_check(
    dataset: str,
    code: str,
    inputs: List[Any],
    entry_point: str,
    expected,
    atol,
    ref_time: List[float],
    fast_check: bool = False,
    min_time_limit: float = DEFAULT_MIN_TIME_LIMIT,
    gt_time_limit_factor: float = DEFAULT_GT_TIME_LIMIT_FACTOR,
) -> Tuple[str, np.ndarray]:
    """Run untrusted `code` on `inputs` in a killable subprocess and grade it.

    Args:
        dataset: "humaneval" or "mbpp"; selects the comparison oracle.
        code: candidate solution source to execute.
        inputs: list of per-test argument collections.
        entry_point: name of the function to call inside `code`.
        expected: ground-truth outputs, aligned with `inputs`.
        atol: absolute tolerance for float comparison (0 = exact).
        ref_time: per-test ground-truth runtimes used to derive time limits.
        fast_check: stop at the first failing test if True.
        min_time_limit: floor for each per-test time limit (seconds).
        gt_time_limit_factor: multiplier over the ground-truth runtime.

    Returns:
        (status, details): status is PASS/FAIL/TIMEOUT; details holds the
        per-test pass flags for the tests that actually ran.
    """
    time_limits = [max(min_time_limit, gt_time_limit_factor * t) for t in ref_time]
    # os.getenv returns a *string* whenever the variable is set; cast before
    # comparing, otherwise `min(str, float)` raises TypeError.
    timeout = min(float(os.getenv("EVALPLUS_TIMEOUT_PER_TASK", 60)), sum(time_limits)) + 1
    if not fast_check:
        timeout += 1  # extra time for data collection

    # shared memory objects through which the child process reports back
    progress = Value("i", 0)
    stat = Value("i", _UNKNOWN)
    details = Array("b", [False for _ in range(len(inputs))])

    p = multiprocessing.Process(
        target=unsafe_execute,
        args=(
            dataset,
            entry_point,
            code,
            inputs,
            expected,
            time_limits,
            atol,
            fast_check,
            # return values
            stat,
            details,
            progress,
        ),
    )
    p.start()
    p.join(timeout=timeout + 1)
    if p.is_alive():
        # escalate: graceful terminate first, then a hard kill
        p.terminate()
        time.sleep(0.1)
    if p.is_alive():
        p.kill()
        time.sleep(0.1)

    stat = _mapping[stat.value]
    # only the tests the child actually reached are meaningful
    details = details[: progress.value]

    if not stat:
        stat = TIMEOUT

    if stat == PASS:
        # a "pass" only counts if every single test ran and succeeded
        if len(details) != len(inputs) or not all(details):
            stat = FAIL

    return stat, details
|
| 284 |
+
|
| 285 |
+
|
| 286 |
+
def evaluate_files(
    dataset: str,
    files: List[str],
    inputs: List,
    expected: List,
    entry_point: str,
    atol: float,
    ref_time: List[float],
    fast_check: bool = False,
    min_time_limit: float = DEFAULT_MIN_TIME_LIMIT,
    gt_time_limit_factor: float = DEFAULT_GT_TIME_LIMIT_FACTOR,
) -> List[Tuple[str, List[bool]]]:
    """Evaluate multiple candidate-solution files against the same task.

    Files are processed in ascending order of their numeric stem
    (".../<n>.py"). Returns one (status, per-test-details) pair per file.
    """
    ret = []
    # sort files by the id in name (i.e., "../n.py")
    files = sorted(files, key=lambda x: int(x.split("/")[-1].split(".")[0]))
    for file in files:
        # context manager closes the handle deterministically
        # (the original leaked one open file object per iteration)
        with open(file, "r") as f:
            code = f.read()
        stat, det = untrusted_check(
            dataset,
            code,
            inputs,
            entry_point,
            expected=expected,
            atol=atol,
            ref_time=ref_time,
            fast_check=fast_check,
            min_time_limit=min_time_limit,
            gt_time_limit_factor=gt_time_limit_factor,
        )
        # untrusted_check returns a plain list slice of a ctypes Array,
        # which has no `.tolist()`; list() accepts lists and array-likes.
        ret.append((stat, list(det)))
    return ret
|
evalplus/build/lib/evalplus/eval/_special_oracle.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Special oracle handlings for problems where direct differential testing is not applicable."""
|
| 2 |
+
|
| 3 |
+
import math
|
| 4 |
+
|
| 5 |
+
# For tasks whose output are not serializable, we only check the output is not None, which
# is also consistent with the original dataset.
MBPP_OUTPUT_NOT_NONE_TASKS = ["check_str", "text_match_three", "text_starta_endb"]

# Tasks whose list outputs must be compared as *sets* (element order and
# multiplicity are not part of the contract).
MBPP_OUTPUT_SET_EQ_TASKS = [
    "similar_elements",  # Mbpp/2
    "find_char_long",  # Mbpp/7
    "common_in_nested_lists",  # Mbpp/111
    "extract_singly",  # Mbpp/140
    "larg_nnum",  # Mbpp/232
    "intersection_array",  # Mbpp/249
    "find_dissimilar",  # Mbpp/579
    "Diff",  # Mbpp/769
]
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
# oracle for Mbpp/581
|
| 23 |
+
def _surface_Area(base_edge, height):
|
| 24 |
+
"""
|
| 25 |
+
Recognizes the "height" as the perpendicular distance from the base to the apex of the pyramid
|
| 26 |
+
"""
|
| 27 |
+
slant_height = math.sqrt((base_edge / 2) ** 2 + height**2)
|
| 28 |
+
base_area = base_edge**2
|
| 29 |
+
lateral_area = 4 * (base_edge * slant_height) / 2
|
| 30 |
+
total_surface_area = base_area + lateral_area
|
| 31 |
+
return round(total_surface_area)
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
# oracle for Mbpp/558
|
| 35 |
+
def _digit_distance_nums(num1, num2):
|
| 36 |
+
"""
|
| 37 |
+
Preprocesses the two numbers to have the same length by padding with zeros
|
| 38 |
+
"""
|
| 39 |
+
str_num1, str_num2 = str(num1), str(num2)
|
| 40 |
+
max_length = max(len(str_num1), len(str_num2))
|
| 41 |
+
str_num1, str_num2 = str_num1.zfill(max_length), str_num2.zfill(max_length)
|
| 42 |
+
total_difference = 0
|
| 43 |
+
for digit1, digit2 in zip(str_num1, str_num2):
|
| 44 |
+
difference = abs(int(digit1) - int(digit2))
|
| 45 |
+
total_difference += difference
|
| 46 |
+
return total_difference
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
# oracle for HumaneEval/032
|
| 50 |
+
def _poly(xs: list, x: float):
|
| 51 |
+
"""
|
| 52 |
+
Evaluates polynomial with coefficients xs at point x.
|
| 53 |
+
return xs[0] + xs[1] * x + xs[1] * x^2 + .... xs[n] * x^n
|
| 54 |
+
"""
|
| 55 |
+
return sum([coeff * math.pow(x, i) for i, coeff in enumerate(xs)])
|
evalplus/build/lib/evalplus/eval/utils.py
ADDED
|
@@ -0,0 +1,187 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# The MIT License
|
| 2 |
+
#
|
| 3 |
+
# Copyright (c) OpenAI (https://openai.com)
|
| 4 |
+
#
|
| 5 |
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
+
# of this software and associated documentation files (the "Software"), to deal
|
| 7 |
+
# in the Software without restriction, including without limitation the rights
|
| 8 |
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
+
# copies of the Software, and to permit persons to whom the Software is
|
| 10 |
+
# furnished to do so, subject to the following conditions:
|
| 11 |
+
#
|
| 12 |
+
# The above copyright notice and this permission notice shall be included in
|
| 13 |
+
# all copies or substantial portions of the Software.
|
| 14 |
+
#
|
| 15 |
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 18 |
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 19 |
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 20 |
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
| 21 |
+
# THE SOFTWARE.
|
| 22 |
+
|
| 23 |
+
import contextlib
|
| 24 |
+
import faulthandler
|
| 25 |
+
import io
|
| 26 |
+
import os
|
| 27 |
+
import platform
|
| 28 |
+
import signal
|
| 29 |
+
import tempfile
|
| 30 |
+
from typing import Optional
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
@contextlib.contextmanager
def swallow_io():
    """Discard all stdout/stderr output and deny stdin reads inside the context."""
    sink = WriteOnlyStringIO()
    with contextlib.redirect_stdout(sink), contextlib.redirect_stderr(
        sink
    ), redirect_stdin(sink):
        yield
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
@contextlib.contextmanager
def time_limit(seconds: float):
    """Raise TimeoutException in the calling thread after *seconds* (SIGALRM-based)."""

    def _on_alarm(signum, frame):
        raise TimeoutException("Timed out!")

    signal.setitimer(signal.ITIMER_REAL, seconds)
    signal.signal(signal.SIGALRM, _on_alarm)
    try:
        yield
    finally:
        # always disarm the timer, even if the body raised
        signal.setitimer(signal.ITIMER_REAL, 0)
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
@contextlib.contextmanager
def create_tempdir():
    """Yield a fresh temporary directory, cd'ing into it for the duration."""
    with tempfile.TemporaryDirectory() as tmp_root, chdir(tmp_root):
        yield tmp_root
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
@contextlib.contextmanager
def chdir(root):
    """Temporarily switch the working directory to *root* ("." is a no-op)."""
    if root == ".":
        yield
        return
    previous = os.getcwd()
    os.chdir(root)
    try:
        yield
    except BaseException as exc:
        raise exc
    finally:
        # restore the caller's working directory no matter what happened
        os.chdir(previous)
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
class TimeoutException(Exception):
    """Raised when a `time_limit` context exceeds its wall-clock budget."""

    pass
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
class WriteOnlyStringIO(io.StringIO):
    """StringIO that throws an exception when it's read from.

    Used as a sink for redirected stdout/stderr/stdin so untrusted code can
    write freely but can never read anything back.
    """

    def read(self, *args, **kwargs):
        raise IOError

    def readline(self, *args, **kwargs):
        raise IOError

    def readlines(self, *args, **kwargs):
        raise IOError

    def readable(self, *args, **kwargs):
        """Returns True if the IO object can be read."""
        return False
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
class redirect_stdin(contextlib._RedirectStream):  # type: ignore
    """Context manager redirecting sys.stdin to another stream.

    Mirrors contextlib.redirect_stdout/redirect_stderr via the private
    contextlib._RedirectStream helper.
    """

    _stream = "stdin"
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
def reliability_guard(maximum_memory_bytes: Optional[int] = None):
    """
    This disables various destructive functions and prevents the generated code
    from interfering with the test (e.g. fork bomb, killing other processes,
    removing filesystem files, etc.)

    WARNING
    This function is NOT a security sandbox. Untrusted code, including, model-
    generated code, should not be blindly executed outside of one. See the
    Codex paper for more information about OpenAI's code sandbox, and proceed
    with caution.
    """

    # Optionally cap the process's memory via rlimits.
    if maximum_memory_bytes is not None:
        import resource

        resource.setrlimit(
            resource.RLIMIT_AS, (maximum_memory_bytes, maximum_memory_bytes)
        )
        resource.setrlimit(
            resource.RLIMIT_DATA, (maximum_memory_bytes, maximum_memory_bytes)
        )
        # macOS does not allow lowering RLIMIT_STACK this way.
        if not platform.uname().system == "Darwin":
            resource.setrlimit(
                resource.RLIMIT_STACK, (maximum_memory_bytes, maximum_memory_bytes)
            )

    faulthandler.disable()

    import builtins

    # prevent tested code from exiting the harness process
    builtins.exit = None
    builtins.quit = None

    import os

    os.environ["OMP_NUM_THREADS"] = "1"

    # Neuter process-control and filesystem-mutation entry points by
    # replacing them with None so any call raises TypeError.
    os.kill = None
    os.system = None
    os.putenv = None
    os.remove = None
    os.removedirs = None
    os.rmdir = None
    os.fchdir = None
    os.setuid = None
    os.fork = None
    os.forkpty = None
    os.killpg = None
    os.rename = None
    os.renames = None
    os.truncate = None
    os.replace = None
    os.unlink = None
    os.fchmod = None
    os.fchown = None
    os.chmod = None
    os.chown = None
    os.chroot = None
    os.fchdir = None
    os.lchflags = None
    os.lchmod = None
    os.lchown = None
    os.getcwd = None
    os.chdir = None
    builtins.open = None

    import shutil

    shutil.rmtree = None
    shutil.move = None
    shutil.chown = None

    import subprocess

    subprocess.Popen = None  # type: ignore

    __builtins__["help"] = None

    import sys

    # Block modules that could spawn processes, re-enable limits, or open UIs.
    sys.modules["ipdb"] = None
    sys.modules["joblib"] = None
    sys.modules["resource"] = None
    sys.modules["psutil"] = None
    sys.modules["tkinter"] = None
|
evalplus/build/lib/evalplus/evalperf.py
ADDED
|
@@ -0,0 +1,558 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Compute the Differential Performance Scores (DPS) and DPS_{norm} of given samples from a model.
|
| 2 |
+
|
| 3 |
+
Check our COLM paper for more details: https://www.arxiv.org/abs/2408.06450
|
| 4 |
+
|
| 5 |
+
^Updates from the COLM paper:
|
| 6 |
+
* Condition to activate efficiency evaluation for a task:
|
| 7 |
+
* Paper: as long as you have at least one correct solution, and we select up to 10 correct solutions for efficiency sampling
|
| 8 |
+
* Here: you need to have at least `min_correct` correct solutions, and we evaluate the efficiency of all correct solutions
|
| 9 |
+
* Updating rationale: to make the evaluation more statistically robust
|
| 10 |
+
|
| 11 |
+
@inproceedings{liu2024evaluating,
|
| 12 |
+
title = {Evaluating Language Models for Efficient Code Generation},
|
| 13 |
+
author = {Liu, Jiawei and Xie, Songrun and Wang, Junhao and Wei, Yuxiang and Ding, Yifeng and Zhang, Lingming},
|
| 14 |
+
booktitle = {First Conference on Language Modeling},
|
| 15 |
+
year = {2024},
|
| 16 |
+
url = {https://openreview.net/forum?id=IBCBMeAhmC},
|
| 17 |
+
}
|
| 18 |
+
"""
|
| 19 |
+
|
| 20 |
+
import json
|
| 21 |
+
import multiprocessing
|
| 22 |
+
import os
|
| 23 |
+
import socket
|
| 24 |
+
import time
|
| 25 |
+
from collections import defaultdict
|
| 26 |
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 27 |
+
from contextlib import closing
|
| 28 |
+
from datetime import datetime
|
| 29 |
+
from statistics import mean
|
| 30 |
+
from typing import Dict, List, Optional, Tuple
|
| 31 |
+
|
| 32 |
+
import rich
|
| 33 |
+
from rich.rule import Rule
|
| 34 |
+
from rich.syntax import Syntax
|
| 35 |
+
from rich.table import Table
|
| 36 |
+
|
| 37 |
+
from evalplus.codegen import run_codegen
|
| 38 |
+
from evalplus.config import *
|
| 39 |
+
from evalplus.config import PERF_EVAL_TIMEOUT_SECOND
|
| 40 |
+
from evalplus.data import (
|
| 41 |
+
get_evalperf_data,
|
| 42 |
+
get_human_eval_plus,
|
| 43 |
+
get_human_eval_plus_hash,
|
| 44 |
+
get_mbpp_plus,
|
| 45 |
+
get_mbpp_plus_hash,
|
| 46 |
+
)
|
| 47 |
+
from evalplus.data.mbpp import mbpp_deserialize_inputs
|
| 48 |
+
from evalplus.data.utils import stream_jsonl
|
| 49 |
+
from evalplus.eval import PASS, untrusted_check
|
| 50 |
+
from evalplus.eval._special_oracle import MBPP_OUTPUT_NOT_NONE_TASKS
|
| 51 |
+
from evalplus.evaluate import get_groundtruth
|
| 52 |
+
from evalplus.perf.profile import (
|
| 53 |
+
are_profiles_broken,
|
| 54 |
+
default_parallelism,
|
| 55 |
+
profile,
|
| 56 |
+
simple_test_profiler,
|
| 57 |
+
)
|
| 58 |
+
from evalplus.utils import progress
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
def rule(msg: str):
    """Print a full-width horizontal rule titled *msg* via rich."""
    rich.print(Rule(msg))
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def not_none(l: list) -> list:
    """Return the elements of *l* that are not None, preserving order."""
    return list(filter(lambda item: item is not None, l))
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def get_free_port():
    """Bind an ephemeral TCP socket to discover a currently free port number."""
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        sock.bind(("", 0))
        sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        return sock.getsockname()[1]
    finally:
        sock.close()
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def correctness_check(
    solution: str, dataset: str, task: Dict, expected_output: Dict
) -> Tuple:
    """Sandbox-run *solution* on all base+plus inputs of *task*.

    `expected_output` is the ground-truth dict with "base"/"plus" outputs and
    "base_time"/"plus_time" runtimes (the original `List` annotation was
    wrong).  Returns (untrusted_check result, solution) so callers can keep
    the pairing when fanning out over many samples.
    """
    assert isinstance(solution, str)
    result = untrusted_check(
        dataset,
        solution,
        task["base_input"] + list(task["plus_input"]),
        task["entry_point"],
        expected_output["base"] + expected_output["plus"],
        task["atol"],
        expected_output["base_time"] + expected_output["plus_time"],
        fast_check=True,
        min_time_limit=DEFAULT_MIN_TIME_LIMIT,
        gt_time_limit_factor=DEFAULT_GT_TIME_LIMIT_FACTOR,
    )
    return result, solution
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
def get_evalplus_data():
    """Load HumanEval+ and MBPP+ problems (noextreme) plus their ground truths.

    Returns (problems, expected_output): both dicts merge the HumanEval and
    MBPP entries, keyed by task id.
    """
    he_problems = get_human_eval_plus(noextreme=True)
    he_hash = get_human_eval_plus_hash(noextreme=True)
    he_expected = get_groundtruth(he_problems, he_hash, [])

    mbpp_problems = get_mbpp_plus(noextreme=True)
    mbpp_hash = get_mbpp_plus_hash(noextreme=True)
    mbpp_expected = get_groundtruth(
        mbpp_problems,
        mbpp_hash,
        MBPP_OUTPUT_NOT_NONE_TASKS,
    )

    return (
        {**he_problems, **mbpp_problems},
        {**he_expected, **mbpp_expected},
    )
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
def table_print(table_name: str, kv: Dict):
    """Render *kv* as a one-row rich table titled *table_name* (keys become columns)."""
    table = Table(title=table_name, show_header=True, header_style="bold")
    for column_name in kv:
        table.add_column(column_name)
    table.add_row(*(str(value) for value in kv.values()))
    rich.print(table)
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
def correctness_worker(task_id: str, samples: list, ctask: Dict, expected_output: Dict):
    """Functionally check every sample of *task_id*.

    Returns (task_id, records) where each record carries the solution, its
    pass/fail verdict, and placeholder fields later filled by profiling.
    """
    assert isinstance(
        samples, list
    ), f"{task_id}: samples is not a list but {type(samples)}"

    # dataset name is the task-id prefix, e.g. "HumanEval/0" -> "humaneval"
    dataset_name = task_id.split("/")[0].lower()

    records = []
    for candidate in samples:
        outcome, candidate = correctness_check(
            candidate, dataset_name, ctask, expected_output
        )
        records.append(
            {
                "solution": candidate,
                "pass": outcome[0] == PASS,
                "profiled": False,
                "matching_cluster_idx": None,
                "dps": None,
                "dps_norm": None,
            }
        )

    return task_id, records
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
def perf_worker(
    task_id: str,
    ptask: Dict,  # EvalPerf data
    ret_dict: Dict,
    lazy_evaluation: bool,
    max_profile: int,
):
    """Profile the passing samples of one task and assign DPS / DPS_norm.

    Mutates `ret_dict` in place (per-result "dps", "dps_norm", "profiled",
    "matching_cluster_idx", plus task-level aggregates) and also returns it.
    References in `ptask["reference"]` are assumed ordered slow -> fast with
    matching `ptask["scores"]` — TODO confirm against the dataset builder.
    """
    rich.print(f"{task_id}: Started")
    start_time = time.time()

    ######################### Profiling Setup #########################
    n_reference = len(ptask["reference"])
    entry_point = ptask["entry_point"]
    # MBPP inputs are stored serialized; only the first perf input is used
    pe_input = (
        mbpp_deserialize_inputs(task_id, ptask["pe_input"])[0]
        if task_id.startswith("Mbpp/")
        else ptask["pe_input"][0]
    )
    ####################################################################

    ####################################################################
    ############### Lazily profile reference solutions #################
    ####################################################################
    # cache_ref_num_inst[i] holds the mean #CPU-instructions of reference i
    # once profiled; references must be profiled back-to-front (fast first).
    cache_ref_num_inst = [None] * n_reference

    def get_avg_ref_profile(idx, check_order=True) -> Optional[Tuple]:
        """Profile reference `idx` (cached); returns (#instructions, score) or None on order violation."""
        nonlocal cache_ref_num_inst

        assert (
            idx < n_reference - 1
            and cache_ref_num_inst[idx + 1] is not None
            or idx == n_reference - 1
        ), f"Calling get_avg_ref_profile({idx}) before get_avg_ref_profile({idx+1}) is called, is not allowed! {n_reference = }"

        if cache_ref_num_inst[idx] is not None:
            return cache_ref_num_inst[idx], ptask["scores"][idx]

        evaluation_time = PERF_EVAL_TIMEOUT_SECOND
        ref_solution = ptask["reference"][idx]
        for _ in range(2):  # at most retry twice
            profiles = profile(
                ref_solution,
                entry_point,
                [pe_input],
                timeout_second_per_test=evaluation_time,
            )

            # Bad thing#1: timeout / failure happens
            if are_profiles_broken(profiles):
                print(f"{task_id}: [WARNING] Error in ref: {profiles}")
                rich.print(Syntax(ref_solution, "python"))
                print(f"{task_id}: Retrying w/ +10s timeout...")
                evaluation_time += 10
            else:
                break

        # NOTE(review): if both attempts failed, `profiles` may still be
        # broken here and mean() could raise — verify upstream guarantees.
        avg_profile = mean(profiles)
        # Bad thing#2: if the current #instruction is faster than that of i+1
        if idx < n_reference - 1 and avg_profile < cache_ref_num_inst[idx + 1]:
            print(f"{task_id}: [WARNING] #{idx} ref faster than #{idx + 1}")
            print(f"ref {idx}: #inst {avg_profile}\tscore {ptask['scores'][idx]:.1f}")
            print(
                f"ref {idx+1}: #inst {cache_ref_num_inst[idx+1]}\tscore {ptask['scores'][idx+1]:.1f}"
            )
            rich.print(Syntax(ref_solution, "python"))
            if check_order:
                return None

        cache_ref_num_inst[idx] = avg_profile
        ret_dict["ref"][idx]["_num_cpu_instructions"] = avg_profile
        return cache_ref_num_inst[idx], ptask["scores"][idx]

    ####################################################################
    ############################## END #################################
    ####################################################################

    if not lazy_evaluation:  # compute everything ahead of time
        for i in range(n_reference - 1, -1, -1):
            if get_avg_ref_profile(i) is None:
                break

        assert (
            None not in cache_ref_num_inst
        ), f"{task_id}: Failed to profile certain reference: {cache_ref_num_inst = }"

    # identical solutions are profiled only once
    profile_cache = {}

    cur_profiled = 0
    for result in ret_dict["results"]:
        if cur_profiled >= max_profile:
            rich.print(f"{task_id}: Reached max_profile limit {max_profile}, stopped")
            break
        if not result["pass"]:
            continue

        solution = result["solution"]

        if solution in profile_cache:  # reuse cache
            sample_profiles = profile_cache[solution]
        else:
            sample_profiles = profile(
                solution,
                entry_point,
                [pe_input],
                timeout_second_per_test=PERF_EVAL_TIMEOUT_SECOND,
            )
            profile_cache[solution] = sample_profiles  # store cache

        score = 0
        norm_score = 0
        result["matching_cluster_idx"] = -1  # -1 means even slower than the slowest ref
        # if the solution results in a timeout, score is 0
        if are_profiles_broken(sample_profiles):
            print(
                f"{task_id}: Tested solution error'ed out: {sample_profiles} ... regarded as 0 score"
            )
            rich.print(Syntax(solution, "python"))
        else:
            avg_sample_profile = result["_num_cpu_instructions"] = mean(sample_profiles)
            # Get profiles from fast to slow (back to front):
            for j in range(n_reference - 1, -1, -1):
                avg_ref_profile, ref_score = get_avg_ref_profile(j, check_order=False)
                if avg_sample_profile <= avg_ref_profile:
                    result["matching_cluster_idx"] = j
                    score = ref_score
                    norm_score = 100 * (j + 1) / n_reference
                    break

        result["dps"] = score
        result["dps_norm"] = norm_score
        result["profiled"] = True
        cur_profiled += 1

    # aggregates are averaged over profiled (non-None) results only
    ret_dict["dps"] = mean(not_none([r["dps"] for r in ret_dict["results"]]))
    ret_dict["dps_norm"] = mean(not_none([r["dps_norm"] for r in ret_dict["results"]]))
    ret_dict["n_profiled"] = cur_profiled

    table_print(
        f"[bold green]{task_id} Completed[/]",
        {
            "Duration": f"{time.time() - start_time:.1f}s",
            "DPS": f"[green]{ret_dict['dps']:.1f}[/]",
            "DPS_norm": f"[green]{ret_dict['dps_norm']:.1f}[/]",
            "# Profiled": f"{cur_profiled} / {len(ret_dict['results'])}",
            "Pass@1": f"{ret_dict['pass@1']:.1f}%",
        },
    )

    return ret_dict
|
| 298 |
+
|
| 299 |
+
|
| 300 |
+
# TODO(@ganler): OPTIMIZATION: reuse the samples from the generations of other datasets
|
| 301 |
+
def script(
    samples: Optional[str] = None,
    min_correct: int = 10,
    max_profile: Optional[int] = None,
    n_samples: int = 100,
    temperature: float = 1.0,
    parallel: Optional[int] = None,
    lazy_evaluation: bool = True,
    i_just_wanna_run: bool = False,
    **model_kwargs,
):
    """Run the EvalPerf pipeline end to end.

    Steps: (optionally) generate samples via ``run_codegen``, check their
    correctness, profile the correct ones against the reference clusters,
    then print a summary and persist full + brief JSON result files.

    Args:
        samples: Path to a sample directory or ``.jsonl`` file; may be None
            only when ``model_kwargs`` triggers in-process code generation.
        min_correct: Minimum number of correct solutions required before a
            task is profiled.
        max_profile: Cap on solutions profiled per task; defaults to
            ``min(min_correct * 2, n_samples)``.
        n_samples: Expected number of samples per task.
        temperature: Sampling temperature (also used as a resume-cache key).
        parallel: Worker-thread count; defaults to a fraction of the CPUs.
        lazy_evaluation: Skip the enumerative reference-order consistency check.
        i_just_wanna_run: Ignore any previously saved results and rerun.
        **model_kwargs: When non-empty, forwarded to ``run_codegen`` to
            produce the samples first.
    """
    max_profile = max_profile or min(min_correct * 2, n_samples)
    assert min_correct <= max_profile <= n_samples
    simple_test_profiler()  # test linux perf setup

    if model_kwargs:
        # To suppress the warning of tokenizers
        os.environ["TOKENIZERS_PARALLELISM"] = os.environ.get(
            "TOKENIZERS_PARALLELISM", "false"
        )
        # overwrite parameters
        samples = run_codegen(
            dataset="evalperf",
            n_samples=n_samples,
            temperature=temperature,
            **model_kwargs,
        )

    assert samples is not None, "Please provide the path to the samples"

    # Data loading
    problems, expected_output = get_evalplus_data()
    ptasks = get_evalperf_data()

    # Parallelism
    max_workers = parallel or max(1, default_parallelism(divisor=4))
    assert 0 < max_workers < multiprocessing.cpu_count(), "Invalid max CPU workers"

    if os.path.isdir(samples):
        result_path = os.path.join(samples, "evalperf_results.json")
    else:
        assert samples.endswith(".jsonl")
        result_path = samples.replace(".jsonl", "_evalperf_results.json")
    brief_result_path = result_path.replace(
        "evalperf_results.json", "evalperf_results.brief.json"
    )

    # resume results
    eval_results = {}
    if not i_just_wanna_run and os.path.exists(result_path):
        # Use a context manager so the handle is closed deterministically
        # (the previous open(...) without close leaked the descriptor).
        with open(result_path, "r") as f:
            resumed_result = json.load(f)
        # Only resume when the run configuration matches exactly; otherwise
        # stale results would be mixed with the new configuration.
        if (
            resumed_result["n_samples"] == n_samples
            and resumed_result["temperature"] == temperature
            and resumed_result["min_correct"] == min_correct
            and resumed_result["max_profile"] == max_profile
        ):
            eval_results = resumed_result["eval"]
            for etask in eval_results:
                ptasks.pop(etask, None)

            rich.print(f"Resumed {len(eval_results)} results from {result_path}")

    # Load model's samples: task_id -> a list of samples
    sample_iter = stream_jsonl(samples)
    samples = defaultdict(list)
    for task in sample_iter:
        samples[task["task_id"].replace("_", "/")].append(task["solution"])
    samples = {k: v[:n_samples] for k, v in samples.items()}

    # assert each task has n_samples
    for task_id, s in samples.items():
        assert len(s) == n_samples, f"{task_id} has {len(s)} samples != {n_samples}"

    # Initialize eval_results
    for task_id, ptask in ptasks.items():
        eval_results[task_id] = {
            "task_id": task_id,
            "results": [],
            "ref": [
                {"solution": s, "score": r, "_num_cpu_instructions": None}
                for s, r in zip(ptask["reference"], ptask["scores"])
            ],
            "dps": None,
            "dps_norm": None,
            "pass@1": None,
            "n_profiled": None,
        }

    rule("Correctness Checking...")
    with progress("Correctness") as p:
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = [
                executor.submit(
                    correctness_worker,
                    task_id,
                    samples[task_id],
                    problems[task_id],
                    expected_output[task_id],
                )
                for task_id in ptasks
            ]

            for future in p.track(as_completed(futures), total=len(futures)):
                task_id, results = future.result()
                eval_results[task_id]["results"] = results
                eval_results[task_id]["pass@1"] = (
                    100 * len([r for r in results if r["pass"]]) / n_samples
                )

    rule("EvalPerf Configurations")
    if lazy_evaluation:
        rich.print(
            "[bold yellow]Lazy evaluation is enabled[/]: "
            "Fast evaluation without enumeratively checking reference order consistency."
        )

    table_print(
        "Configurations",
        {
            "Max CPU": max_workers,
            "#Tasks": len(ptasks),
            "#Samples per task": n_samples,
            "Min correct": min_correct,
            "Max profile": max_profile,
            "Result path": result_path,
        },
    )

    rich.print(f"IDs of tasks to evaluate: {list(ptasks.keys())}")
    rule("Evaluation Start")
    undone = []
    with progress("Profiling") as p:
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = []
            for task_id, ptask in ptasks.items():
                # Profiling is only meaningful with enough correct solutions.
                n_pass = len([r for r in eval_results[task_id]["results"] if r["pass"]])
                if n_pass < min_correct:
                    rich.print(
                        f"{task_id}: [bold yellow]{n_pass} < {min_correct} correct solutions, skipped[/]"
                    )
                    continue
                futures.append(
                    executor.submit(
                        perf_worker,
                        task_id,
                        ptask,
                        eval_results[task_id],
                        lazy_evaluation,
                        max_profile,
                    )
                )
                undone.append(task_id)
                rich.print(f"{task_id}: Queued")

            for future in p.track(as_completed(futures), total=len(futures)):
                result = future.result()
                eval_results[result["task_id"]] = result
                undone.remove(result["task_id"])
                # Near the end of the run, show which stragglers remain.
                if undone and len(undone) < max_workers:
                    print(f"Still running: {undone}")

    rule("Evaluation Summary")
    dps = mean(not_none([res["dps"] for res in eval_results.values()]))
    dps_norm = mean(not_none([res["dps_norm"] for res in eval_results.values()]))
    pass_1 = mean(not_none([res["pass@1"] for res in eval_results.values()]))
    n_evalperfed = len(not_none([res["dps"] for res in eval_results.values()]))

    table_print(
        "EvalPerf Summary",
        {
            "DPS": f"{dps:.1f}",
            "DPS_norm": f"{dps_norm:.1f}",
            "Pass@1": f"{pass_1:.1f}%",
            "#EvalPerf-ed tasks": f"{n_evalperfed} / {len(eval_results)}",
            "min_correct": min_correct,
            "n_samples": n_samples,
            "temperature": temperature,
        },
    )

    # Save full results
    with open(result_path, "w") as f:
        f.write(
            json.dumps(
                {
                    "date": datetime.now().strftime("%Y-%m-%d %H:%M"),
                    "n_samples": n_samples,
                    "temperature": temperature,
                    "min_correct": min_correct,
                    "max_profile": max_profile,
                    "eval": eval_results,
                }
            )
        )
    rich.print(f"Full results have been saved to {result_path}")

    # Save brief results (config + summary + per-task profiled solutions only)
    with open(brief_result_path, "w") as f:
        f.write(
            json.dumps(
                {
                    "date": datetime.now().strftime("%Y-%m-%d %H:%M"),
                    "config": {
                        "n_samples": n_samples,
                        "temperature": temperature,
                        "min_correct": min_correct,
                        "max_profile": max_profile,
                    },
                    "summary": {
                        "dps": dps,
                        "dps_norm": dps_norm,
                        "pass@1": pass_1,
                    },
                    "eval": {
                        task_id: {
                            "dps": res["dps"],
                            "dps_norm": res["dps_norm"],
                            "pass@1": res["pass@1"],
                            "profiled": [
                                {
                                    "solution": r["solution"],
                                    "matching_cluster_idx": r["matching_cluster_idx"],
                                }
                                for r in res["results"]
                                if r["profiled"]
                            ],
                        }
                        for task_id, res in eval_results.items()
                    },
                }
            )
        )

    rich.print(f"Brief results have been saved to {brief_result_path}")

    rule("To visualize win-rates and pair-wise DPS, run:")
    rich.print(
        Syntax(
            f"""\
git clone git@github.com:evalplus/evalplus.github.io.git
git --git-dir=evalplus.github.io/.git pull
cp {brief_result_path} evalplus.github.io/results/evalperf
python evalplus.github.io/results/evalperf/stats.py
python -m http.server -d evalplus.github.io {get_free_port()}""",
            "bash",
        )
    )
|
| 549 |
+
|
| 550 |
+
|
| 551 |
+
def main():
    """CLI entry point: expose ``script`` through python-fire."""
    from fire import Fire as _fire

    _fire(script)
|
| 555 |
+
|
| 556 |
+
|
| 557 |
+
# Allow running this module directly as a script.
if __name__ == "__main__":
    main()
|
evalplus/build/lib/evalplus/evaluate.py
ADDED
|
@@ -0,0 +1,375 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import multiprocessing
|
| 3 |
+
import os
|
| 4 |
+
import pickle
|
| 5 |
+
import threading
|
| 6 |
+
import time
|
| 7 |
+
from collections import Counter, defaultdict
|
| 8 |
+
from concurrent.futures import ProcessPoolExecutor, as_completed
|
| 9 |
+
from datetime import datetime
|
| 10 |
+
from typing import Any, Dict, List, Optional, Tuple
|
| 11 |
+
from warnings import warn
|
| 12 |
+
|
| 13 |
+
import numpy as np
|
| 14 |
+
from termcolor import cprint
|
| 15 |
+
from tqdm import tqdm
|
| 16 |
+
|
| 17 |
+
from evalplus.codegen import run_codegen
|
| 18 |
+
from evalplus.config import *
|
| 19 |
+
from evalplus.data import (
|
| 20 |
+
get_human_eval_plus,
|
| 21 |
+
get_human_eval_plus_hash,
|
| 22 |
+
get_mbpp_plus,
|
| 23 |
+
get_mbpp_plus_hash,
|
| 24 |
+
load_solutions,
|
| 25 |
+
)
|
| 26 |
+
from evalplus.data.mbpp import mbpp_serialize_inputs
|
| 27 |
+
from evalplus.data.utils import CACHE_DIR
|
| 28 |
+
from evalplus.eval import (
|
| 29 |
+
PASS,
|
| 30 |
+
compatible_eval_result,
|
| 31 |
+
estimate_pass_at_k,
|
| 32 |
+
untrusted_check,
|
| 33 |
+
)
|
| 34 |
+
from evalplus.eval._special_oracle import MBPP_OUTPUT_NOT_NONE_TASKS
|
| 35 |
+
from evalplus.gen.util import trusted_exec
|
| 36 |
+
|
| 37 |
+
# 1st item: the status
|
| 38 |
+
# 2nd item (optional): the detailed pass/fail boolean for each input
|
| 39 |
+
Result = Tuple[str, List[bool]]
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def get_groundtruth(problems, hashcode, tasks_only_output_not_none):
    """Return the expected base/plus outputs (and timings) for every problem.

    Results are cached on disk under ``CACHE_DIR`` keyed by the dataset hash,
    so repeated runs skip re-executing the canonical solutions.
    """
    cache_file = os.path.join(CACHE_DIR, f"{hashcode}.pkl")
    if os.path.exists(cache_file):
        print(f"Load from ground-truth from {cache_file}")
        with open(cache_file, "rb") as f:
            return pickle.load(f)

    os.makedirs(CACHE_DIR, exist_ok=True)
    print("Computing expected output...")
    tbegin = time.time()
    expected_output = {}
    for task_id, problem in problems.items():
        code = problem["prompt"] + problem["canonical_solution"]
        entry = problem["entry_point"]
        relax_none = entry in tasks_only_output_not_none
        oracle = {}
        # Execute the canonical solution on the base inputs first, then the
        # plus inputs, recording both outputs and wall-clock timings.
        for split in ("base", "plus"):
            oracle[split], oracle[f"{split}_time"] = trusted_exec(
                code,
                problem[f"{split}_input"],
                entry,
                record_time=True,
                output_not_none=relax_none,
            )
        expected_output[task_id] = oracle
    print(f"Expected outputs computed in {time.time() - tbegin:.2f}s")

    with open(cache_file, "wb") as f:
        pickle.dump(expected_output, f)

    return expected_output
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def check_correctness(
    dataset: str,
    completion_id: int,
    problem: Dict[str, Any],
    solution: str,
    expected_output: Dict[str, List],
    base_only=False,
    fast_check=False,
    identifier=None,
    min_time_limit: float = DEFAULT_MIN_TIME_LIMIT,
    gt_time_limit_factor: float = DEFAULT_GT_TIME_LIMIT_FACTOR,
) -> Dict[str, Result]:  # {...}, "base" | "plus" -> (status, details)
    """Check one solution against the base (and optionally plus) inputs.

    Returns a dict with bookkeeping fields plus a ``Result`` under "base"
    and, unless ``base_only``, under "plus".
    """

    def _run_split(split: str) -> Result:
        # Both splits use the exact same untrusted_check invocation, only
        # the input set, expected outputs, and reference timings differ.
        return untrusted_check(
            dataset,
            solution,
            problem[f"{split}_input"],
            problem["entry_point"],
            expected=expected_output[split],
            atol=problem["atol"],
            ref_time=expected_output[f"{split}_time"],
            fast_check=fast_check,
            min_time_limit=min_time_limit,
            gt_time_limit_factor=gt_time_limit_factor,
        )

    ret = {
        "completion_id": completion_id,
        "task_id": problem["task_id"],
        "_identifier": identifier,
        "solution": solution,
    }
    ret["base"] = _run_split("base")
    if not base_only:
        ret["plus"] = _run_split("plus")
    return ret
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
def evaluate(
    dataset: str,
    samples: Optional[str] = None,
    base_only: bool = False,
    parallel: Optional[int] = None,
    i_just_wanna_run: bool = False,
    test_details: bool = False,
    min_time_limit: float = DEFAULT_MIN_TIME_LIMIT,
    gt_time_limit_factor: float = DEFAULT_GT_TIME_LIMIT_FACTOR,
    mini: bool = False,
    noextreme: bool = False,
    version: str = "default",
    output_file: Optional[str] = None,
    gguf_file: Optional[str] = None,
    **model_kwargs,
):
    """Evaluate model samples on HumanEval+/MBPP+ and report pass@k.

    Pipeline: optionally generate samples via ``run_codegen``; load or
    compute the dataset ground truth; run each sample in a process pool
    through ``check_correctness``; aggregate per-task results; print
    pass@{1,10,100} for base and base+plus tests; save a JSON result file.

    Args:
        dataset: "humaneval" or "mbpp" (other values leave ``problems``
            undefined and will fail below — callers must pass one of these).
        samples: Directory or ``.jsonl`` path of solutions; may be None only
            when ``model_kwargs`` triggers generation.
        base_only: Skip the extra ("plus") test inputs.
        parallel: Process-pool size; defaults to half the CPU count.
        i_just_wanna_run: Re-run even if a result file exists (prompts
            interactively before overwriting).
        test_details: Record every failing input instead of just the last one.
        min_time_limit / gt_time_limit_factor: Timeout knobs forwarded to
            ``check_correctness``.
        mini / noextreme / version: Dataset variant selectors.
        output_file: Overrides the derived result path.
        gguf_file / **model_kwargs: Forwarded to ``run_codegen`` when set.
    """
    if model_kwargs:
        # To suppress the warning of tokenizers
        os.environ["TOKENIZERS_PARALLELISM"] = os.environ.get(
            "TOKENIZERS_PARALLELISM", "false"
        )
        samples = run_codegen(
            dataset=dataset,
            gguf_file=gguf_file,
            **model_kwargs,
        )
    assert samples is not None, "No samples provided"

    n_workers = parallel or max(1, multiprocessing.cpu_count() // 2)

    # Derive the result-file path from the samples location.
    if os.path.isdir(samples):
        result_path = os.path.join(samples, "eval_results.json")
    else:
        assert samples.endswith(".jsonl")
        # legacy compatibility
        if os.path.exists(samples.replace(".jsonl", "_eval_results.json")):
            result_path = samples.replace(".jsonl", "_eval_results.json")
        else:
            result_path = samples.replace(".jsonl", ".eval_results.json")

    if output_file is not None:
        result_path = output_file

    if os.path.isfile(result_path) and not i_just_wanna_run:
        # Fast path: reuse previously computed results.
        print(f"Load from previous results from {result_path}")
        with open(result_path, "r") as f:
            results = json.load(f)

        results = compatible_eval_result(results)
    else:
        if dataset == "humaneval":
            problems = get_human_eval_plus(
                mini=mini, noextreme=noextreme, version=version
            )
            dataset_hash = get_human_eval_plus_hash(
                mini=mini, noextreme=noextreme, version=version
            )
            expected_output = get_groundtruth(problems, dataset_hash, [])
        elif dataset == "mbpp":
            problems = get_mbpp_plus(mini=mini, noextreme=noextreme, version=version)
            dataset_hash = get_mbpp_plus_hash(
                mini=mini, noextreme=noextreme, version=version
            )
            expected_output = get_groundtruth(
                problems,
                dataset_hash,
                MBPP_OUTPUT_NOT_NONE_TASKS,
            )

        results = {
            "date": datetime.now().strftime("%Y-%m-%d %H:%M"),
            "hash": dataset_hash,
            "eval": {},
        }

        with ProcessPoolExecutor(max_workers=n_workers) as executor:
            futures = []
            completion_id = Counter()
            n_samples = 0
            eval_results = defaultdict(list)  # task_id ->
            remainings = set()

            print("Reading samples...")
            for sample in tqdm(load_solutions(samples)):
                task_id = sample["task_id"]
                if task_id not in problems:
                    warn(
                        f"Task {task_id} is found in the samples but not found in the dataset"
                    )
                    continue
                # Samples may carry a full "solution" or just a "completion"
                # that must be prepended with the problem prompt.
                solution = (
                    sample["solution"]
                    if "solution" in sample
                    else problems[task_id]["prompt"] + sample["completion"]
                )
                remainings.add(sample["_identifier"])
                args = (
                    dataset,
                    completion_id[task_id],
                    problems[task_id],
                    solution,
                    expected_output[task_id],
                    base_only,
                    not test_details,  # fast_check
                    sample["_identifier"],
                    min_time_limit,
                    gt_time_limit_factor,
                )
                futures.append(executor.submit(check_correctness, *args))
                completion_id[task_id] += 1
                n_samples += 1

            assert n_samples == len(remainings), "Missing problems in unfinished"
            assert len(completion_id) == len(problems), "Missing problems in samples"

            # Watchdog: warn when no sample finishes for 20 seconds, which
            # usually indicates a stuck worker process.
            def stucking_checker():
                while remainings:
                    last_size = len(remainings)
                    time.sleep(20)
                    if last_size != len(remainings) or len(remainings) == 0:
                        continue
                    # Potential stucking
                    warn("No samples had finished testing in the last 20s")
                    warn(f"{len(remainings)} samples to be tested: {remainings}")

            threading.Thread(target=stucking_checker).start()

            for future in tqdm(as_completed(futures), total=n_samples):
                result = future.result()
                remainings.remove(result["_identifier"])
                eval_results[result["task_id"]].append(result)

        # sort the results for each problem by completion_id
        for task_id, task_results in eval_results.items():
            task_results.sort(key=lambda x: x["completion_id"])
            results["eval"][task_id] = []
            for res in task_results:

                def get_failed_tests(stat, details, inputs) -> List[Any]:
                    # Map a (status, per-input booleans) pair back to the
                    # concrete failing inputs for reporting.
                    if stat == PASS or not details:
                        return []

                    if test_details:
                        return [
                            inputs[i] for i in range(len(details)) if not details[i]
                        ]

                    # else => simply return the only and the last fail test
                    return [inputs[len(details) - 1]]

                base_stat, base_details = res["base"]
                base_fail_tests = get_failed_tests(
                    base_stat, base_details, problems[task_id]["base_input"]
                )

                # initialize plus tests
                plus_stat = None
                plus_fail_tests = []

                # with plus tests
                if not base_only:
                    plus_stat, plus_details = res["plus"]
                    plus_fail_tests = get_failed_tests(
                        plus_stat, plus_details, problems[task_id]["plus_input"]
                    )

                if dataset == "mbpp":
                    base_fail_tests = mbpp_serialize_inputs(task_id, base_fail_tests)
                    plus_fail_tests = mbpp_serialize_inputs(task_id, plus_fail_tests)

                results["eval"][task_id].append(
                    {
                        "task_id": task_id,
                        "solution": res["solution"],
                        "base_status": base_stat,
                        "plus_status": plus_stat,
                        "base_fail_tests": base_fail_tests,
                        "plus_fail_tests": plus_fail_tests,
                    }
                )

    # Calculate pass@k.
    total = np.array([len(r) for r in results["eval"].values()])
    base_correct = []
    new_correct = []

    for res in results["eval"].values():
        bc = sum([r["base_status"] == PASS for r in res])
        base_correct.append(bc)
        if not base_only:
            # "plus" correctness requires passing BOTH base and plus tests.
            new_correct.append(
                sum(
                    [
                        res[i]["base_status"] == res[i]["plus_status"] == PASS
                        for i in range(len(res))
                    ]
                )
            )
    base_correct = np.array(base_correct)

    pass_at_k = {
        f"pass@{k}": estimate_pass_at_k(total, base_correct, k).mean()
        for k in [1, 10, 100]
        if total.min() >= k
    }
    cprint(f"{dataset} (base tests)", "red")
    for k, v in pass_at_k.items():
        cprint(f"{k}:\t{v:.3f}", "red")
    results["pass_at_k"] = {"base": pass_at_k}

    if new_correct:
        cprint(f"{dataset}+ (base + extra tests)", "green")
        pass_at_k = {
            f"pass@{k}": estimate_pass_at_k(total, np.array(new_correct), k).mean()
            for k in [1, 10, 100]
            if (total >= k).all()
        }
        for k, v in pass_at_k.items():
            cprint(f"{k}:\t{v:.3f}", "green")
        results["pass_at_k"]["plus"] = pass_at_k

    # save results
    if os.path.isfile(result_path) and i_just_wanna_run:
        # Interactively confirm before clobbering an existing result file;
        # the old file is renamed to a .bak (chained if .bak already exists).
        decision = ""
        while decision.lower() not in ["y", "n"]:
            print(f"{result_path} already exists. Press [Y/N] to overwrite or exit...")
            decision = input()

        if decision.lower() == "y":
            # mv the file to a backup
            new_path = result_path + ".bak"
            while os.path.isfile(new_path):
                new_path += ".bak"
            os.rename(result_path, new_path)
            print(f"Backup {result_path} to {new_path}")

    if not os.path.isfile(result_path):
        with open(result_path, "w") as f:
            json.dump(results, f)
|
| 366 |
+
|
| 367 |
+
|
| 368 |
+
def main():
    """CLI entry point: expose ``evaluate`` through python-fire."""
    from fire import Fire as _fire

    _fire(evaluate)
|
| 372 |
+
|
| 373 |
+
|
| 374 |
+
# Allow running this module directly as a script.
if __name__ == "__main__":
    main()
|
evalplus/build/lib/evalplus/gen/__init__.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import copy
|
| 2 |
+
from typing import Any, List
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
class BaseGen(object):
    """Base class for test-input generators (seed pool + dedup bookkeeping)."""

    def __init__(self, inputs: List[Any], entry_point: str, contract: str):
        """Initializing a input mutator.

        Args:
            inputs (List[Any]): The set of initial inputs (i.e., seeds)
            entry_point (str): The function name to invoke with the input
            contract (str): The contract to verify input validity
        """
        self.contract = contract
        self.entry_point = entry_point
        # Deep-copy so mutations of generated inputs never alias the caller's seeds.
        self.seed_pool: List[Any] = copy.deepcopy(inputs)
        # Inputs produced by generate() that were not among the seeds.
        self.new_inputs: List[Any] = []
        # Dedup via hash of the str() form; set comprehension instead of set([...]).
        self.seed_hash = {hash(str(x)) for x in self.seed_pool}

    def generate(self, num: int) -> List[Any]:
        """Produce up to ``num`` new inputs; must be overridden by subclasses."""
        raise NotImplementedError
|
evalplus/build/lib/evalplus/gen/chatgpt_gen.py
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import ast
|
| 2 |
+
import random
|
| 3 |
+
from typing import List
|
| 4 |
+
|
| 5 |
+
import openai
|
| 6 |
+
from openai.types.chat import ChatCompletion
|
| 7 |
+
|
| 8 |
+
from evalplus.data.utils import to_raw
|
| 9 |
+
from evalplus.gen import BaseGen
|
| 10 |
+
from evalplus.gen.util import trusted_check_exec
|
| 11 |
+
from evalplus.gen.util.openai_request import make_auto_request
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class ChatGPTGen(BaseGen):
    """Input generator that asks an OpenAI chat model for new test inputs.

    Seeds are sampled from the pool, shown to the model next to the
    ground-truth code, and the reply's first code fence is parsed back
    into candidate input tuples, which are contract-checked before use.
    """

    def __init__(self, inputs: List, signature: str, contract_code: str, gd_code: str):
        super().__init__(inputs, signature, contract_code)
        self.gd_code = gd_code
        self.prompt_messages = [
            "Please generate complex inputs to test the function.",
            "Please generate corner case inputs to test the function.",
            "Please generate difficult inputs to test the function.",
        ]
        # Budget of LLM round-trips per generate() call.
        self.iteration = 20
        self.client = openai.Client()

    def seed_selection(self) -> List:
        # get 5 for now.
        return random.sample(self.seed_pool, k=min(len(self.seed_pool), 5))

    @staticmethod
    def _parse_ret(ret: ChatCompletion) -> List:
        """Parse input lists out of the first ``` code fence in the reply."""
        rets = []
        output = ret.choices[0].message.content
        if "```" in output:
            for x in output.split("```")[1].splitlines():
                if x.strip() == "":
                    continue
                try:
                    # remove comments; wrap in [...] so "a, b" parses as a list
                    # (renamed from `input` to avoid shadowing the builtin)
                    candidate = ast.literal_eval(f"[{x.split('#')[0].strip()}]")
                except Exception:  # malformed LLM output line -- skip it
                    # was a bare `except:`, which also swallowed
                    # KeyboardInterrupt/SystemExit
                    continue
                rets.append(candidate)
        return rets

    def chatgpt_generate(self, selected_inputs: List) -> List:
        """One LLM round-trip: prompt with code + example inputs, parse reply."""
        # append the groundtruth function
        # actually it can be any function (maybe we can generate inputs for each llm generated code individually)
        message = f"Here is a function that we want to test:\n```\n{self.gd_code}\n```"
        str_inputs = "\n".join(
            [
                # isinstance instead of type(i) == str
                ", ".join([f"'{to_raw(i)}'" if isinstance(i, str) else str(i) for i in x])
                for x in selected_inputs
            ]
        )
        message += f"\nThese are some example inputs used to test the function:\n```\n{str_inputs}\n```"
        message += f"\n{random.choice(self.prompt_messages)}"
        ret = make_auto_request(
            self.client,
            message=message,
            model="gpt-3.5-turbo",
            max_tokens=256,
            response_format={"type": "text"},
        )
        return self._parse_ret(ret)

    def generate(self, num: int):
        """Collect up to ``num`` new contract-valid inputs within the budget."""
        while len(self.new_inputs) < num and self.iteration >= 0:
            seeds = self.seed_selection()
            new_inputs = self.chatgpt_generate(seeds)
            for new_input in new_inputs:
                # Keep only unseen inputs that satisfy the contract.
                if hash(str(new_input)) not in self.seed_hash:
                    if trusted_check_exec(self.contract, [new_input], self.entry_point):
                        self.seed_pool.append(new_input)
                        self.seed_hash.add(hash(str(new_input)))
                        self.new_inputs.append(new_input)
            self.iteration -= 1
        return self.new_inputs[:num]
|
evalplus/build/lib/evalplus/gen/mut_gen.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import random
|
| 2 |
+
from abc import abstractmethod
|
| 3 |
+
from typing import Any, List
|
| 4 |
+
|
| 5 |
+
from evalplus.gen import BaseGen
|
| 6 |
+
from evalplus.gen.util import trusted_check_exec
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class MutateGen(BaseGen):
    """Input generator driven by an abstract per-seed mutation operator."""

    def __init__(self, inputs: List, signature: str, contract_code: str):
        super().__init__(inputs, signature, contract_code)

    def seed_selection(self):
        # random for now.
        return random.choice(self.seed_pool)

    @abstractmethod
    def mutate(self, seed_input: Any) -> Any:
        pass

    def generate(self, num: int) -> List[Any]:
        """Mutate random seeds until ``num`` new contract-valid inputs exist."""
        while len(self.new_inputs) < num:
            candidate = self.mutate(self.seed_selection())
            fingerprint = hash(str(candidate))
            # Guard clauses: skip duplicates and contract violations.
            if fingerprint in self.seed_hash:
                continue
            if not trusted_check_exec(self.contract, [candidate], self.entry_point):
                continue
            self.seed_pool.append(candidate)
            self.seed_hash.add(fingerprint)
            self.new_inputs.append(candidate)
        return self.new_inputs[:num]
|
evalplus/build/lib/evalplus/gen/type_mut.py
ADDED
|
@@ -0,0 +1,340 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import copy
|
| 2 |
+
import random
|
| 3 |
+
import string
|
| 4 |
+
import time
|
| 5 |
+
from typing import Any, Dict, List, Set, Tuple
|
| 6 |
+
|
| 7 |
+
from multipledispatch import dispatch
|
| 8 |
+
|
| 9 |
+
from evalplus.gen.mut_gen import MutateGen
|
| 10 |
+
from evalplus.gen.util import trusted_check_exec
|
| 11 |
+
|
| 12 |
+
MAX_MULTI_STEP_SIZE = 5
|
| 13 |
+
MUTATE_BOUND_SIZE = 8
|
| 14 |
+
|
| 15 |
+
NoneType = type(None)
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
# decorator to use ingredients
|
| 19 |
+
class use_ingredient:
    """Decorator factory: with probability ``prob``, short-circuit the wrapped
    generator/mutator and return a previously recorded "ingredient" whose type
    matches the seed input instead of calling the function.
    """

    def __init__(self, prob: float):
        # Capped below 1.0 so the wrapped function stays reachable.
        assert 0 <= prob <= 0.95
        self.prob = prob

    def __call__(deco, func):
        def wrapper(self, seed_input):
            # `self` here is the generator instance that owns `ingredients`.
            pool = self.ingredients[type(seed_input)]
            if random.random() < deco.prob and pool:
                return random.choice(list(pool))
            return func(self, seed_input)

        return wrapper
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
class TypedMutGen(MutateGen):
    """Type-aware mutation-based input generator.

    Starting from the seed inputs, repeatedly applies type-preserving random
    mutations (selected at runtime on the value's type via ``multipledispatch``)
    and keeps mutants that pass the task's contract. Primitive values seen in
    inputs are cached as "ingredients" and occasionally reused verbatim by the
    ``use_ingredient`` decorator.
    """

    def __init__(self, inputs: List, signature: str, contract_code: str):
        super().__init__(inputs, signature, contract_code)
        self.timeout = 60 * 60  # 1 hour
        # Pools of previously-seen primitive values, keyed by type; reused
        # probabilistically by use_ingredient-wrapped generators/mutators.
        self.ingredients = {
            int: set(),
            float: set(),
            str: set(),
            complex: set(),
        }
        for x in inputs:
            self.fetch_ingredient(x)

    def seed_selection(self):
        # random for now.
        return random.choice(self.seed_pool)

    def mutate(self, seed_input: Any) -> Any:
        """Mutate a deep copy of ``seed_input`` until it differs from the seed."""
        new_input = copy.deepcopy(seed_input)

        patience = MUTATE_BOUND_SIZE
        # NOTE(review): `or patience == 0` looks inverted -- it lets patience
        # go negative and never actually bounds the loop (and can spin forever
        # if mutations keep returning an equal value); presumably
        # `and patience > 0` was intended. Confirm before changing.
        while new_input == seed_input or patience == 0:
            new_input = self.typed_mutate(new_input)
            patience -= 1

        return new_input

    #########################
    # Type-aware generation #
    #########################
    @dispatch(NoneType)
    def typed_gen(self, _):
        # None has a single value; nothing to generate.
        return None

    @dispatch(int)
    def typed_gen(self, _):
        # 50%: reuse a cached int ingredient; otherwise a small random int.
        @use_ingredient(0.5)
        def _impl(*_):
            return random.randint(-100, 100)

        return _impl(self, _)

    @dispatch(float)
    def typed_gen(self, _):
        # 50%: reuse a cached float ingredient; otherwise a random float.
        @use_ingredient(0.5)
        def _impl(*_):
            return random.uniform(-100, 100)

        return _impl(self, _)

    @dispatch(bool)
    def typed_gen(self, _):
        # Booleans have no ingredient pool; fair coin flip.
        return random.choice([True, False])

    @dispatch(str)
    def typed_gen(self, _):
        # 50%: reuse a cached string; otherwise a short random ASCII string.
        @use_ingredient(0.5)
        def _impl(*_):
            return "".join(
                random.choice(string.ascii_letters)
                for _ in range(random.randint(0, 10))
            )

        return _impl(self, _)

    def any_gen(self):
        # weighted choose: scalars are 4x more likely than containers/None.
        # The chosen *value* only matters for its type -- typed_gen dispatches
        # on it to produce a fresh random value of that type.
        choice = random.choices(
            [
                True,
                1,
                1.1,
                "str",
                [],  # list
                tuple(),  # tuple
                dict(),  # dict
                None,  # None
            ],
            [0.2, 0.2, 0.2, 0.2, 0.05, 0.05, 0.05, 0.05],
        )[0]
        return self.typed_gen(choice)

    @dispatch(list)
    def typed_gen(self, _):
        # 1-in-5 lists are heterogeneous; otherwise one element type is fixed.
        ret = []
        size = random.randint(0, 10)
        if random.randint(0, 4) == 0:  # heterogeneous
            for _ in range(size):
                ret.append(self.any_gen())
        else:  # homogeneous
            t = random.choice([bool(), int(), float(), str()])
            for _ in range(size):
                ret.append(self.typed_gen(t))
        return ret

    @dispatch(tuple)
    def typed_gen(self, _):
        # Tuples are generated as random lists, then frozen.
        return tuple(self.typed_gen([]))

    # NOTE: disable set for now as Steven is too weak in Python (/s)
    # @dispatch(set)
    # def typed_gen(self, _):
    #     return set(self.typed_gen([]))

    @dispatch(dict)
    def typed_gen(self, _):
        ret = dict()
        values = self.typed_gen([])
        # NOTE: Assumption: nobody uses dict with heterogeneous keys
        # NOTE: Assumption: nobody uses dict with boolean keys
        key_type = random.choice([int(), float(), str()])
        for v in values:
            ret[self.typed_gen(key_type)] = self.typed_gen(v)
        return ret

    ########################
    # Type-aware mutation  #
    ########################
    # Simple primitives
    @dispatch(int)
    def typed_mutate(self, seed_input: int):
        # 50%: swap in a cached int; otherwise nudge by -1/0/+1.
        @use_ingredient(0.5)
        def _impl(_, seed_input: int):
            return seed_input + random.randint(-1, 1)

        return _impl(self, seed_input)

    @dispatch(float)
    def typed_mutate(self, seed_input: float):
        # 50%: swap in a cached float; otherwise additive or relative jitter.
        @use_ingredient(0.5)
        def _impl(_, seed_input: float):
            if random.randint(0, 1):
                return seed_input + random.uniform(-1, 1)
            return seed_input * (1 + random.uniform(-0.5, 0.5))

        return _impl(self, seed_input)

    @dispatch(complex)
    def typed_mutate(self, seed_input: complex):
        # 50%: swap in a cached complex; otherwise jitter the imaginary part.
        # Note: the real part is dropped (set to 0) by construction.
        @use_ingredient(0.5)
        def _impl(_, seed_input: complex):
            imag = seed_input.imag + random.uniform(-1, 1)
            return complex(0, imag)

        return _impl(self, seed_input)

    @dispatch(bool)
    def typed_mutate(self, seed_input: bool):
        # Mutating a bool is just re-flipping the coin.
        return random.choice([True, False])

    @dispatch(NoneType)
    def typed_mutate(self, seed_input: NoneType):
        # None is immutable in every sense.
        return None

    # List-like
    @dispatch(list)
    def typed_mutate(self, seed_input: List):
        if len(seed_input) == 0:
            # Nothing to mutate -- generate a fresh random list instead.
            return self.typed_gen([])

        # 0: remove, 1: insert mutated copy, 2: duplicate, 3: mutate in place.
        choice = random.randint(0, 3)
        idx = random.randint(0, len(seed_input) - 1)
        if choice == 0:  # remove one element
            seed_input.pop(random.randint(0, len(seed_input) - 1))
        elif choice == 1 and len(seed_input) > 0:  # add one mutated element
            seed_input.insert(
                random.randint(0, len(seed_input) - 1),
                self.typed_mutate(seed_input[idx]),
            )
        elif choice == 2 and len(seed_input) > 0:  # repeat one element
            seed_input.append(seed_input[idx])
        else:  # inplace element change
            seed_input[idx] = self.typed_mutate(seed_input[idx])
        return seed_input

    @dispatch(tuple)
    def typed_mutate(self, seed_input: Tuple):
        # Mutate through the list logic, then freeze again.
        return tuple(self.typed_mutate(list(seed_input)))

    # String
    @dispatch(str)
    def typed_mutate(self, seed_input: str):
        # 40%: replace wholesale with a cached string; otherwise edit locally.
        @use_ingredient(0.4)
        def _impl(_, seed_input: str):
            choice = random.randint(0, 2) if seed_input else 0
            if choice == 0 and self.ingredients[str]:  # insert an ingredient
                idx = random.randint(0, len(seed_input))
                return (
                    seed_input[:idx]
                    + random.choice(list(self.ingredients[str]))
                    + seed_input[idx:]
                )
            # other choices assume len(seed_input) > 0
            elif choice == 1:  # replace a substring with empty or mutated string
                start = random.randint(0, len(seed_input) - 1)
                end = random.randint(start + 1, len(seed_input))
                mid = (
                    ""
                    if random.randint(0, 1)
                    else self.typed_mutate(seed_input[start:end])
                )
                return seed_input[:start] + mid + seed_input[end:]
            elif choice == 2:  # repeat one element
                idx = random.randint(0, len(seed_input) - 1)
                return (
                    seed_input[:idx]
                    + seed_input[random.randint(0, len(seed_input) - 1)]
                    + seed_input[idx:]
                )

            # random char
            return self.typed_gen(str())

        return _impl(self, seed_input)

    # Set
    @dispatch(set)
    def typed_mutate(self, seed_input: Set):
        # Mutate as a list, then deduplicate back into a set.
        return set(self.typed_mutate(list(seed_input)))

    # Dict
    @dispatch(dict)
    def typed_mutate(self, seed_input: Dict):
        if len(seed_input) == 0:
            # Nothing to mutate -- generate a fresh random dict instead.
            return self.typed_gen(dict())

        # 0: remove a pair, 1: add a mutated pair, 2: mutate one value.
        choice = random.randint(0, 2)
        if choice == 0:  # remove a kv
            del seed_input[random.choice(list(seed_input.keys()))]
        elif choice == 1:  # add a kv
            k = self.typed_mutate(random.choice(list(seed_input.keys())))
            v = self.typed_mutate(random.choice(list(seed_input.values())))
            seed_input[k] = v
        elif choice == 2:  # inplace value change
            k0, v0 = random.choice(list(seed_input.items()))
            seed_input[k0] = self.typed_mutate(v0)
        return seed_input

    ############################################
    # Fetching ingredients to self.ingredients #
    ############################################
    def fetch_ingredient(self, seed_input):
        """Recursively record primitive values found in ``seed_input``."""
        self.typed_fetch(seed_input)

    @dispatch(int)
    def typed_fetch(self, seed_input: int):
        self.ingredients[int].add(seed_input)

    @dispatch(float)
    def typed_fetch(self, seed_input: float):
        self.ingredients[float].add(seed_input)

    @dispatch(complex)
    def typed_fetch(self, seed_input: complex):
        self.ingredients[complex].add(seed_input)

    @dispatch(str)
    def typed_fetch(self, seed_input: str):
        # Record the whole string and each whitespace-separated token.
        self.ingredients[str].add(seed_input)
        for token in seed_input.strip().split():
            self.ingredients[str].add(token)

    # List-like
    def _fetch_list_like(self, seed_input):
        # Recurse only into element types that have a registered typed_fetch.
        for x in seed_input:
            if self.typed_fetch.dispatch(type(x)):
                self.fetch_ingredient(x)

    @dispatch(list)
    def typed_fetch(self, seed_input: List):
        self._fetch_list_like(seed_input)

    @dispatch(tuple)
    def typed_fetch(self, seed_input: Tuple):
        self._fetch_list_like(seed_input)

    # NOTE: disable set for now as Steven is too weak in Python (/s)
    # @dispatch(set)
    # def typed_fetch(self, seed_input: Set):
    #     self._fetch_list_like(seed_input)

    # Dict
    @dispatch(dict)
    def typed_fetch(self, seed_input: Dict):
        # Harvest ingredients from both keys and values.
        self._fetch_list_like(seed_input.keys())
        self._fetch_list_like(seed_input.values())

    def generate(self, num: int):
        """Produce up to ``num`` new contract-valid inputs within ``self.timeout``.

        Each attempt applies 1..MAX_MULTI_STEP_SIZE mutation steps to a random
        seed; novel mutants that pass the contract also feed the ingredient
        cache and the seed pool.
        """
        start = time.time()
        num_generated = 1
        while len(self.new_inputs) < num and time.time() - start < self.timeout:
            if num_generated % 1000 == 0:
                print(
                    f"generated {num_generated} already with {len(self.new_inputs)} new inputs ... "
                )
            new_input = self.seed_selection()
            # Multi-step instead of single-step
            for _ in range(random.randint(1, MAX_MULTI_STEP_SIZE)):
                new_input = self.mutate(new_input)
            num_generated += 1
            if hash(str(new_input)) not in self.seed_hash:
                if trusted_check_exec(self.contract, [new_input], self.entry_point):
                    self.typed_fetch(new_input)
                    self.seed_pool.append(new_input)
                    self.new_inputs.append(new_input)
                    self.seed_hash.add(hash(str(new_input)))
        return self.new_inputs[:num]
|
evalplus/build/lib/evalplus/gen/util/__init__.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
from copy import deepcopy
|
| 3 |
+
|
| 4 |
+
from evalplus.eval.utils import time_limit
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def trusted_exec(code, inputs, entry_point, record_time=False, output_not_none=False):
    """Execute trusted ``code`` and call ``entry_point`` on each input.

    Args:
        code: Source text defining (at least) ``entry_point``.
        inputs: Iterable of argument lists; each is unpacked into the call.
        entry_point: Name of the function to invoke from the executed code.
        record_time: If True, also return per-call wall-clock durations.
        output_not_none: If True, replace each result with ``result is not None``.

    Returns:
        The list of results, or ``(results, durations)`` when ``record_time``.
    """
    namespace = {}
    exec(code, namespace)
    target = namespace[entry_point]

    outputs = []
    timings = []
    for raw_args in inputs:
        # Deep-copy so a mutating solution cannot corrupt the caller's inputs.
        args = deepcopy(raw_args)
        if record_time:
            t0 = time.time()
            outputs.append(target(*args))
            timings.append(time.time() - t0)
        else:
            outputs.append(target(*args))

    if output_not_none:
        outputs = [result is not None for result in outputs]

    return (outputs, timings) if record_time else outputs
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def trusted_check_exec(code, inputs, entry_point):
    """Return True iff ``trusted_exec`` finishes within 1s without raising."""
    try:
        with time_limit(seconds=1.0):
            trusted_exec(code, inputs, entry_point)
        return True
    except Exception:
        # Any failure (timeout, contract violation, runtime error) = invalid.
        return False
|
evalplus/build/lib/evalplus/gen/util/anthropic_request.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import signal
|
| 2 |
+
import time
|
| 3 |
+
|
| 4 |
+
import anthropic
|
| 5 |
+
from anthropic.types import Message
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def handler(signum, frame):
    """SIGALRM handler: abort the in-flight API call by raising.

    The signum/frame arguments are required by the signal API but unused.
    """
    raise Exception("end of time")
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def make_auto_request(client: anthropic.Client, *args, **kwargs) -> Message:
    """Call the Anthropic messages API, retrying until a response is obtained.

    A SIGALRM-based 100s watchdog aborts hung calls; transient failures
    (rate limits, connection errors, 5xx) trigger a short sleep and retry.
    Only a content-filtering rejection is escalated to the caller, since
    retrying it would loop forever.

    NOTE(review): signal.alarm/SIGALRM only works on the main thread of a
    Unix process -- confirm this is never called from worker threads.
    """
    ret = None
    while ret is None:
        try:
            signal.signal(signal.SIGALRM, handler)
            # Watchdog: abort the call if it takes longer than 100 seconds.
            signal.alarm(100)
            # top_p is pinned to 0.95 for all requests.
            ret = client.messages.create(*args, top_p=0.95, **kwargs)
            signal.alarm(0)
        except anthropic.RateLimitError:
            print("Rate limit exceeded. Waiting...")
            signal.alarm(0)
            time.sleep(5)
        except anthropic.APIConnectionError:
            print("API connection error. Waiting...")
            signal.alarm(0)
            time.sleep(5)
        except anthropic.InternalServerError:
            print("Internal server error. Waiting...")
            signal.alarm(0)
            time.sleep(5)
        except anthropic.APIError as e:
            print("Unknown API error")
            print(e)
            # Content filtering is permanent -- surface it instead of retrying.
            if (
                e.body["error"]["message"]
                == "Output blocked by content filtering policy"
            ):
                raise Exception("Content filtering policy blocked output")
            signal.alarm(0)
        except Exception as e:
            # Includes the watchdog's "end of time" exception.
            print("Unknown error. Waiting...")
            print(e)
            signal.alarm(0)
            time.sleep(1)
    return ret
|
evalplus/build/lib/evalplus/gen/util/openai_request.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
|
| 3 |
+
import openai
|
| 4 |
+
from openai.types.chat import ChatCompletion
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def make_request(
    client: openai.Client,
    message: str,
    model: str,
    max_tokens: int = 512,
    temperature: float = 1,
    n: int = 1,
    **kwargs
) -> ChatCompletion:
    """Issue one chat-completion request for a single user message.

    ``top_p`` is pinned to 0.95 and ``max_completion_tokens`` to ``max_tokens``
    for regular models; o1-family models reject those sampling knobs (and any
    non-default temperature), so they are stripped for them.
    """
    kwargs["top_p"] = 0.95
    kwargs["max_completion_tokens"] = max_tokens
    if model.startswith("o1-"):
        # o1 models reject these sampling parameters entirely.
        del kwargs["top_p"]
        del kwargs["max_completion_tokens"]
        temperature = 1.0  # o1 models do not support temperature
    return client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": message}],
        temperature=temperature,
        n=n,
        **kwargs
    )
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def make_auto_request(*args, **kwargs) -> ChatCompletion:
    """Call ``make_request`` until it succeeds, backing off on transient errors."""

    def _wait(message: str, seconds: int) -> None:
        # Log and pause before the next retry attempt.
        print(message)
        time.sleep(seconds)

    response = None
    while response is None:
        try:
            response = make_request(*args, **kwargs)
        except openai.RateLimitError:
            _wait("Rate limit exceeded. Waiting...", 5)
        except openai.APIConnectionError:
            _wait("API connection error. Waiting...", 5)
        except openai.APIError as err:
            # Other API errors: log and retry immediately.
            print(err)
        except Exception as err:
            print("Unknown error. Waiting...")
            print(err)
            time.sleep(1)
    return response
|
evalplus/build/lib/evalplus/inputgen.py
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Generate a .jsonl file where each line is a json object
|
| 2 |
+
representing a programming problem with a task ID ("task_id")
|
| 3 |
+
and a list of enhanced inputs ("inputs") for that task.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import argparse
|
| 7 |
+
import json
|
| 8 |
+
import os
|
| 9 |
+
|
| 10 |
+
from evalplus.data.mbpp import mbpp_serialize_inputs
|
| 11 |
+
from evalplus.gen.chatgpt_gen import ChatGPTGen
|
| 12 |
+
from evalplus.gen.type_mut import TypedMutGen
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class SetEncoder(json.JSONEncoder):
    """JSON encoder that serializes Python sets as JSON arrays."""

    def default(self, obj):
        # Sets are not JSON-native; emit them as lists.
        if isinstance(obj, set):
            return list(obj)
        # Defer to the base class, which raises TypeError for unknown types.
        return super().default(obj)
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
# Used for MBPP as MBPP's prompt is not a formal function signature
|
| 23 |
+
def insert_contract_into_code(entry_point, code, contract):
    """Insert ``contract`` right after the ``def <entry_point>`` line of ``code``.

    Used for MBPP, whose prompt is not a formal function signature. Raises
    StopIteration if no matching ``def`` line exists.
    """
    lines = code.split("\n")
    signature_idx = next(
        idx for idx, line in enumerate(lines) if line.startswith(f"def {entry_point}")
    )
    lines.insert(signature_idx + 1, contract)
    return "\n".join(lines)
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def input_generation(args, problems):
    """Generate enhanced inputs for every problem and stream them to a .jsonl file.

    Each output line is ``{"task_id": ..., "inputs": ...}``. Inputs come from
    ChatGPT-based generation first; those seeds then drive type-aware mutation.

    Args:
        args: Parsed CLI namespace with ``dataset``, ``output``, ``chatgpt_len``
            and ``mut_len`` attributes.
        problems: Mapping of task_id -> problem dict (prompt, contract,
            canonical_solution, entry_point, base_input, ...).
    """
    with open(args.output, "w") as file:
        for problem in problems.values():
            new_input = {}
            task_id = problem["task_id"]
            print(f"generating inputs for {task_id} ...")
            # by default we do not include constraints in the prompt (code)
            code = problem["prompt"] + problem["canonical_solution"]
            # but we use c_code to include contract which checks input validity at execution time
            if args.dataset == "humaneval":
                c_code = (
                    problem["prompt"]
                    + problem["contract"]
                    + problem["canonical_solution"]
                )
            elif args.dataset == "mbpp":
                c_code = problem["prompt"] + insert_contract_into_code(
                    entry_point=problem["entry_point"],
                    code=problem["canonical_solution"],
                    contract=problem["contract"],
                )

            # first generate chatgpt
            input_gen = ChatGPTGen(
                problem["base_input"], problem["entry_point"], c_code, code
            ).generate(args.chatgpt_len)
            # generate mutation next

            # If ChatGPT produced nothing, still emit a record so downstream
            # consumers see every task_id.
            if input_gen is None or len(input_gen) == 0:
                new_input["task_id"] = task_id
                new_input["inputs"] = {}
                file.write(json.dumps(new_input, cls=SetEncoder) + "\n")
                continue

            input_gen.extend(
                TypedMutGen(input_gen, problem["entry_point"], c_code).generate(
                    args.mut_len
                )
            )
            print(f"generated {len(input_gen)} inputs")
            new_input["task_id"] = task_id
            if args.dataset == "mbpp":
                # BUG FIX: the serialized MBPP inputs were previously
                # overwritten unconditionally by the raw inputs on the next
                # line, making mbpp_serialize_inputs dead code.
                new_input["inputs"] = mbpp_serialize_inputs(task_id, input_gen)
            else:
                new_input["inputs"] = input_gen
            file.write(json.dumps(new_input, cls=SetEncoder) + "\n")
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def main():
    """CLI entry: parse flags, load the chosen dataset, run input generation."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--dataset", required=True, type=str, choices=["humaneval", "mbpp"]
    )
    parser.add_argument("--chatgpt_len", required=True, type=int)
    parser.add_argument("--mut_len", required=True, type=int)
    parser.add_argument("--output", type=str, help="Output .jsonl path")
    args = parser.parse_args()

    problems = None
    if args.dataset == "humaneval":
        from evalplus.data import get_human_eval_plus

        # err_incomplete=False: tolerate a partially-built dataset.
        problems = get_human_eval_plus(err_incomplete=False)
        if not args.output:
            args.output = "HumanEvalPlusInputs.jsonl"

    if args.dataset == "mbpp":
        from evalplus.data import get_mbpp_plus

        problems = get_mbpp_plus(err_incomplete=False)
        if not args.output:
            args.output = "MbppPlusInput.jsonl"

    # Refuse to clobber an existing output file.
    assert not os.path.isfile(args.output), f"{args.output} already exists!"
    input_generation(args, problems)
|
evalplus/build/lib/evalplus/lecacy_sanitize.py
ADDED
|
@@ -0,0 +1,201 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Legacy version of post-processing LLM-generated Python code.
|
| 2 |
+
This sanitizer is implemented using regex and string manipulation.
|
| 3 |
+
You might want to use the latest tree-sitter-based sanitizer (evalplus.sanitize) instead.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
import pathlib
|
| 8 |
+
import re
|
| 9 |
+
from typing import List, Optional
|
| 10 |
+
|
| 11 |
+
from tqdm import tqdm
|
| 12 |
+
|
| 13 |
+
from evalplus.data import (
|
| 14 |
+
get_human_eval_plus,
|
| 15 |
+
get_mbpp_plus,
|
| 16 |
+
load_solutions,
|
| 17 |
+
write_directory,
|
| 18 |
+
write_jsonl,
|
| 19 |
+
)
|
| 20 |
+
from evalplus.syncheck import syntax_check
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def remove_unindented_lines(
    code: str, protect_before: str, execeptions: List[str], trim_tails: List[str]
) -> str:
    """Drop unindented (column-0) lines from ``code``.

    Scanning starts immediately; the first line starting with
    ``protect_before`` is always kept. Blank lines and lines starting with any
    prefix in ``execeptions`` survive. The first surviving-scan line whose
    rstripped text starts with a ``trim_tails`` prefix cuts everything from
    that line to the end.
    """
    lines = code.splitlines()
    dropped = set()
    protected = False
    for idx, line in enumerate(lines):
        if not protected and line.startswith(protect_before):
            protected = True
            continue
        if not line.strip():
            continue
        if any(line.startswith(keep) for keep in execeptions):
            continue

        indent = len(line) - len(line.lstrip())
        if indent == 0:
            dropped.add(idx)

        if any(line.rstrip().startswith(tail) for tail in trim_tails):
            # cut off everything from here to the end
            dropped.update(range(idx, len(lines)))
            break

    return "\n".join(line for idx, line in enumerate(lines) if idx not in dropped)
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def to_four_space_indents(old_code):
    """Pad lines indented by exactly three spaces to four spaces.

    Every processed line gains a trailing newline; other indentation levels
    are left untouched.
    """
    fixed_lines = []
    for line in old_code.splitlines():
        leading = len(line) - len(line.lstrip())
        if leading == 3:
            line = " " + line
        fixed_lines.append(line + "\n")
    return "".join(fixed_lines)
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def sanitize(
    old_code: str,
    entry_point: str,
    rm_prefix_lines: Optional[str] = None,
    eofs: Optional[List] = None,
):
    """Heuristically extract a clean implementation of ``entry_point`` from raw
    LLM output (regex/string based; see evalplus.sanitize for the modern one).

    Args:
        old_code: Raw model output, possibly with chat fences and trailing junk.
        entry_point: Name of the function whose implementation must survive.
        rm_prefix_lines: If given, drop every line starting with this prefix.
        eofs: Optional end-of-output markers; text after any of them is cut.
    """
    new_code = old_code
    if rm_prefix_lines is not None:
        new_code = "\n".join(
            [
                line
                for line in old_code.splitlines()
                if not line.startswith(rm_prefix_lines)
            ]
        )

    # Leading newline so "\ndef "-style splits also match a def on line 1.
    new_code = "\n" + new_code
    def_left = "def " + entry_point

    # basic handling of chat output: normalize ```python fences, then keep
    # the first fenced chunk that contains the target definition.
    new_code = new_code.replace("\n```python\n", "\n```\n")
    for chunk in new_code.split("\n```\n"):
        if def_left in chunk:
            new_code = chunk
            break

    # Split at every "def <entry_point>(" occurrence; chunks[0] is the prefix
    # (imports/helpers), chunks[1:] are candidate bodies.
    chunks = [chunk for chunk in re.split(f"{def_left}\\s*\\(", new_code)]
    # TODO: having return does not mean this is complete
    bodies = [chunk for chunk in chunks[1:] if " return " in chunk.split("\ndef")[0]]
    def_left = def_left + "("
    new_code = def_left + def_left.join(bodies) if len(bodies) > 0 else ""  # fn + impl
    new_code = to_four_space_indents(new_code)

    # Cut at any caller-supplied end-of-output marker.
    for eof in eofs or []:
        new_code = new_code.split(eof)[0]

    # remove lines starting from the first unindented line after def_left
    new_code = remove_unindented_lines(
        new_code,
        protect_before=def_left,
        execeptions=["def ", "import ", "from "],
        trim_tails=['"""', "if", "print"],
    )
    # Re-attach the prefix (imports/helpers) that preceded the first def.
    new_code = chunks[0] + new_code

    # cut all functions that are not syntactically correct && not the entry point
    parts = new_code.split("\ndef ")
    includes = [parts[0]]
    for fn in new_code.split("\ndef ")[1:]:
        if (
            fn.strip().startswith(entry_point + " ")
            or fn.strip().startswith(entry_point + "(")
            or syntax_check("\ndef " + fn)
        ):
            includes.append(fn)
    new_code = "\ndef ".join(includes)
    return new_code.strip()
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
def script(
    samples: str,
    eofs: List[str] = [],  # NOTE(review): mutable default; safe only because it is never mutated here.
    inplace: bool = False,
    rm_prefix_lines: str = None,
    debug_task: str = None,
    mbpp_version: str = "default",
):
    """Sanitize every solution in ``samples`` (a directory or .jsonl file).

    Writes results to a sibling path with a "-sanitized" suffix unless
    ``inplace`` is set. ``debug_task`` restricts processing to one task_id.
    """
    # task_id -> entry_point
    entry_point = {}
    dataset = {**get_human_eval_plus(), **get_mbpp_plus(version=mbpp_version)}

    for task_id, problem in dataset.items():
        entry_point[task_id] = problem["entry_point"]

    # make a new folder with "-sanitized" suffix
    is_folder = os.path.isdir(samples)
    target_path = pathlib.Path(samples)
    if not inplace:
        if is_folder:
            new_name = target_path.name + "-sanitized"
        else:
            new_name = target_path.name.replace(".jsonl", "-sanitized.jsonl")
        target_path = target_path.parent / new_name
    target_path = str(target_path)

    nsan = 0  # number of solutions actually changed
    ntotal = 0  # number of solutions processed

    new_solutions = []

    for solution in tqdm(load_solutions(samples)):
        task_id = solution["task_id"]
        dbg_identifier = solution["_identifier"]
        if debug_task is not None and task_id != debug_task:
            continue

        ntotal += 1
        # Solutions may be full ("solution") or prompt-relative ("completion").
        if "solution" in solution:
            old_code = solution["solution"]
        else:
            assert "completion" in solution
            old_code = dataset[task_id]["prompt"] + "\n" + solution["completion"]

        old_code = old_code.strip()

        new_code = sanitize(
            old_code=old_code,
            entry_point=entry_point[task_id],
            rm_prefix_lines=rm_prefix_lines,
            eofs=eofs,
        ).strip()

        # if changed, print the message
        if new_code != old_code:
            msg = "Sanitized: " + dbg_identifier
            if is_folder:
                msg += " -> " + dbg_identifier.replace(samples, target_path)
            print(msg)
            nsan += 1

        new_solutions.append({"task_id": task_id, "solution": new_code})

    # Persist in the same shape as the input (directory or .jsonl).
    if is_folder:
        write_directory(target_path, new_solutions)
    else:
        write_jsonl(target_path, new_solutions)

    if nsan > 0:
        print(f"Sanitized {nsan} out of {ntotal} files.")
    else:
        print(f"All files seems valid -- no files are sanitized.")
    print(f"Check the sanitized files at {target_path}")
| 194 |
+
def main():
    """Console entry point: expose ``script`` through the ``fire`` CLI."""
    from fire import Fire

    Fire(script)
|
evalplus/build/lib/evalplus/perf/__init__.py
ADDED
|
File without changes
|