f541119578 committed on
Commit
fdf190d
·
verified ·
1 Parent(s): e6933d6

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +3 -0
  2. __pycache__/cloudgpt_aoai.cpython-310.pyc +0 -0
  3. __pycache__/rouge.cpython-310.pyc +0 -0
  4. ans.sh +26 -0
  5. arenaans.sh +6 -0
  6. battle.py +55 -0
  7. battle.sh +21 -0
  8. battlescore.py +105 -0
  9. bertencode.py +40 -0
  10. cleanans.py +10 -0
  11. cloudgpt-apim-token-cache.bin +3 -0
  12. cloudgpt_aoai.py +619 -0
  13. combine.py +11 -0
  14. config_sft_fhw.yaml +41 -0
  15. crux.sh +87 -0
  16. evalplus/.dockerignore +182 -0
  17. evalplus/.github/ISSUE_TEMPLATE/buggy_contract.yml +48 -0
  18. evalplus/.github/ISSUE_TEMPLATE/buggy_test.yml +49 -0
  19. evalplus/.github/ISSUE_TEMPLATE/config.yml +1 -0
  20. evalplus/.github/ISSUE_TEMPLATE/model_eval_request.yml +73 -0
  21. evalplus/.gitignore +182 -0
  22. evalplus/.pre-commit-config.yaml +20 -0
  23. evalplus/CITATION.cff +25 -0
  24. evalplus/Dockerfile +19 -0
  25. evalplus/LICENSE +205 -0
  26. evalplus/MANIFEST.in +1 -0
  27. evalplus/README.md +325 -0
  28. evalplus/build/lib/evalplus/__init__.py +4 -0
  29. evalplus/build/lib/evalplus/_version.py +16 -0
  30. evalplus/build/lib/evalplus/codegen.py +272 -0
  31. evalplus/build/lib/evalplus/config.py +16 -0
  32. evalplus/build/lib/evalplus/data/__init__.py +14 -0
  33. evalplus/build/lib/evalplus/data/humaneval.py +96 -0
  34. evalplus/build/lib/evalplus/data/mbpp.py +203 -0
  35. evalplus/build/lib/evalplus/data/utils.py +166 -0
  36. evalplus/build/lib/evalplus/eval/__init__.py +316 -0
  37. evalplus/build/lib/evalplus/eval/_special_oracle.py +55 -0
  38. evalplus/build/lib/evalplus/eval/utils.py +187 -0
  39. evalplus/build/lib/evalplus/evalperf.py +558 -0
  40. evalplus/build/lib/evalplus/evaluate.py +375 -0
  41. evalplus/build/lib/evalplus/gen/__init__.py +21 -0
  42. evalplus/build/lib/evalplus/gen/chatgpt_gen.py +78 -0
  43. evalplus/build/lib/evalplus/gen/mut_gen.py +30 -0
  44. evalplus/build/lib/evalplus/gen/type_mut.py +340 -0
  45. evalplus/build/lib/evalplus/gen/util/__init__.py +40 -0
  46. evalplus/build/lib/evalplus/gen/util/anthropic_request.py +47 -0
  47. evalplus/build/lib/evalplus/gen/util/openai_request.py +51 -0
  48. evalplus/build/lib/evalplus/inputgen.py +108 -0
  49. evalplus/build/lib/evalplus/lecacy_sanitize.py +201 -0
  50. evalplus/build/lib/evalplus/perf/__init__.py +0 -0
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ evalplus/gallary/overview.png filter=lfs diff=lfs merge=lfs -text
37
+ evalplus/gallary/render.gif filter=lfs diff=lfs merge=lfs -text
38
+ nohup.out filter=lfs diff=lfs merge=lfs -text
__pycache__/cloudgpt_aoai.cpython-310.pyc ADDED
Binary file (17.7 kB). View file
 
__pycache__/rouge.cpython-310.pyc ADDED
Binary file (630 Bytes). View file
 
ans.sh ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #python vllmans.py --path /home/aiscuser/fhw/model_weights/llama --datapath /home/aiscuser/fhw/data/athene_python_7w.json
2
+ #python vllmans.py --path /home/aiscuser/fhw/model_weights/llama --datapath /home/aiscuser/fhw/data/deepseekcoder_python_7w.json
3
+ #python vllmans.py --path /home/aiscuser/fhw/model_weights/llama --datapath /home/aiscuser/fhw/data/llama_python_7w.json
4
+ #python vllmans.py --path /home/aiscuser/fhw/model_weights/llama --datapath /home/aiscuser/fhw/data/qwen_python_7w.json
5
+ #python vllmans.py --path /home/aiscuser/fhw/model_weights/llama --datapath /home/aiscuser/fhw/data/qwq_python_7w.json
6
+ #python vllmans.py --path /home/aiscuser/fhw/model_weights/deepseekcoder --datapath /home/aiscuser/fhw/data/athene_python_7w.json
7
+ #python vllmans.py --path /home/aiscuser/fhw/model_weights/deepseekcoder --datapath /home/aiscuser/fhw/data/deepseekcoder_python_7w.json
8
+ #python vllmans.py --path /home/aiscuser/fhw/model_weights/deepseekcoder --datapath /home/aiscuser/fhw/data/llama_python_7w.json
9
+ #python vllmans.py --path /home/aiscuser/fhw/model_weights/deepseekcoder --datapath /home/aiscuser/fhw/data/qwen_python_7w.json
10
+ #python vllmans.py --path /home/aiscuser/fhw/model_weights/deepseekcoder --datapath /home/aiscuser/fhw/data/qwq_python_7w.json
11
+ #python vllmans.py --path /home/aiscuser/fhw/model_weights/athene --datapath /home/aiscuser/fhw/data/athene_python_7w.json
12
+ #python vllmans.py --path /home/aiscuser/fhw/model_weights/athene --datapath /home/aiscuser/fhw/data/deepseekcoder_python_7w.json
13
+ #python vllmans.py --path /home/aiscuser/fhw/model_weights/athene --datapath /home/aiscuser/fhw/data/llama_python_7w.json
14
+ #python vllmans.py --path /home/aiscuser/fhw/model_weights/athene --datapath /home/aiscuser/fhw/data/qwen_python_7w.json
15
+ #python vllmans.py --path /home/aiscuser/fhw/model_weights/athene --datapath /home/aiscuser/fhw/data/qwq_python_7w.json
16
+ python vllmans.py --path /home/aiscuser/fhw/model_weights/qwen --datapath /home/aiscuser/fhw/data/athene_python_7w.json
17
+ #python vllmans.py --path /home/aiscuser/fhw/model_weights/qwen --datapath /home/aiscuser/fhw/data/deepseekcoder_python_7w.json
18
+ #python vllmans.py --path /home/aiscuser/fhw/model_weights/qwen --datapath /home/aiscuser/fhw/data/llama_python_7w.json
19
+ #python vllmans.py --path /home/aiscuser/fhw/model_weights/qwen --datapath /home/aiscuser/fhw/data/qwen_python_7w.json
20
+ #python vllmans.py --path /home/aiscuser/fhw/model_weights/qwen --datapath /home/aiscuser/fhw/data/qwq_python_7w.json
21
+ #python vllmans.py --path /home/aiscuser/fhw/model_weights/qwq --datapath /home/aiscuser/fhw/data/athene_python_7w.json
22
+ #python vllmans.py --path /home/aiscuser/fhw/model_weights/qwq --datapath /home/aiscuser/fhw/data/deepseekcoder_python_7w.json
23
+ #python vllmans.py --path /home/aiscuser/fhw/model_weights/qwq --datapath /home/aiscuser/fhw/data/llama_python_7w.json
24
+ #python vllmans.py --path /home/aiscuser/fhw/model_weights/qwq --datapath /home/aiscuser/fhw/data/qwen_python_7w.json
25
+ #python vllmans.py --path /home/aiscuser/fhw/model_weights/qwq --datapath /home/aiscuser/fhw/data/qwq_python_7w.json
26
+ python /data/local/zhangdi/DPO/DPO_train.py
arenaans.sh ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
# Collect arena-style judgements of the target model's answers from each of
# the five judge models, then kick off DPO training on the results.
for judge in athene deepseekcoder llama qwen qwq; do
    python vllmarenaans.py --model target --judge "$judge" --split 0
done
python /data/local/zhangdi/DPO/DPO_train.py
battle.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Pairwise judging of two models' answers with a vLLM-served judge model.

Reads two answer files (mistral vs qwen answers to the same instructions),
builds a chatbot-arena judging prompt for each pair, and writes the judge's
verdicts to a JSONL file.  Candidate positions alternate every other item to
mitigate position bias.
"""
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
import argparse
import json
from tqdm import tqdm

parser = argparse.ArgumentParser()
parser.add_argument('--path', type=str, help='模型路径')
parser.add_argument('--start', type=int, help='开始')
parser.add_argument('--end', type=int, help='终止')
args = parser.parse_args()

# Single judge-prompt template.  The original duplicated this ~1000-character
# literal twice, differing only in which answer filled the A/B slots.
JUDGE_TEMPLATE = "This is a chatbot arena. You will be given assistant A’s answer, and assistant B’s answer. Please act as an impartial judge and evaluate the capability of two AI assistants. You should choose the assistant that follows instructions and answers questions better. Your evaluation should consider factors such as helpfulness, relevance, and accuracy. Begin your evaluation by comparing the responses of the two assistants and provide a short explanation. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. DO NOT allow the LENGTH of the responses to influence your evaluation, choose the one that is straight-to-the-point instead of unnecessarily verbose. When the two candidates perform equally well, choose the SHORTER answer. Do not favor certain names of the assistants. Be as objective as possible. After providing your explanation concisely within 200 words, output your final verdict by strictly following this format: “[[A]]” if assistant A is better, “[[B]]” if assistant B is better, and “[[Tie]]” for a tie. Finish your judgement within 300 words.\n\n[User Question]\n{instruction}\n\n[The Start of Assistant A’s Answer]\n{answer_a}\n[The End of Assistant A’s Answer]\n\n[The Start of Assistant B’s Answer]\n{answer_b}\n[The End of Assistant B’s Answer]"

# Initialize the tokenizer of the judge model.
tokenizer = AutoTokenizer.from_pretrained(args.path, trust_remote_code=True)

# Load the two candidates' answers for the [start, end) slice of instructions.
with open("alignment-handbook/data/llama_python_mistral_answer_0_70000_sft.json", "r+") as f1, \
     open("alignment-handbook/data/llama_python_qwen_answer_0_70000_sft.json", "r+") as f2:
    lines1 = f1.readlines()[args.start:args.end]
    lines2 = f2.readlines()[args.start:args.end]

prompts = []
for t, (line1, line2) in enumerate(zip(lines1, lines2)):
    d1 = json.loads(line1)
    d2 = json.loads(line2)
    instruction = d1["messages"][0]["content"]
    answer1 = d1["messages"][1]["content"]
    answer2 = d2["messages"][1]["content"]
    # Alternate which candidate occupies slot A to counter position bias.
    if t % 2 == 0:
        answer_a, answer_b = answer1, answer2
    else:
        answer_a, answer_b = answer2, answer1
    prompt = JUDGE_TEMPLATE.format(
        instruction=instruction, answer_a=answer_a, answer_b=answer_b
    )
    messages = [{"role": "user", "content": prompt}]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False
    )
    prompts.append(text)

# Input the model name or path. Can be GPTQ or AWQ models.
llm = LLM(args.path, dtype="float16", tensor_parallel_size=8, trust_remote_code=True, max_model_len=8192, enforce_eager=True)
sampling_params = SamplingParams(temperature=1.0, top_p=0.995, max_tokens=8192)
outputs = llm.generate(prompts=prompts, sampling_params=sampling_params)

# Write one JSON object per verdict; "t" is the item's position in the slice.
with open(f"alignment-handbook/data/llama_battle_mistral_qwen_{args.start}_{args.end}.json", 'w+') as fw:
    for t, output in enumerate(outputs):
        d = {"arena": output.outputs[0].text, "t": t}
        fw.write(json.dumps(d) + "\n")
battle.sh ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #python vllmbattle.py --judge /home/aiscuser/fhw/model_weights/athene --model /home/aiscuser/fhw/model_weights/deepseekcoder
2
+ #python vllmbattle.py --judge /home/aiscuser/fhw/model_weights/athene --model /home/aiscuser/fhw/model_weights/llama
3
+ #python vllmbattle.py --judge /home/aiscuser/fhw/model_weights/athene --model /home/aiscuser/fhw/model_weights/qwen
4
+ #python vllmbattle.py --judge /home/aiscuser/fhw/model_weights/athene --model /home/aiscuser/fhw/model_weights/qwq
5
+ #python vllmbattle.py --judge /home/aiscuser/fhw/model_weights/deepseekcoder --model /home/aiscuser/fhw/model_weights/athene
6
+ python vllmbattle.py --judge /home/aiscuser/fhw/model_weights/deepseekcoder --model /home/aiscuser/fhw/model_weights/llama
7
+ python vllmbattle.py --judge /home/aiscuser/fhw/model_weights/deepseekcoder --model /home/aiscuser/fhw/model_weights/qwen
8
+ python vllmbattle.py --judge /home/aiscuser/fhw/model_weights/deepseekcoder --model /home/aiscuser/fhw/model_weights/qwq
9
+ python vllmbattle.py --judge /home/aiscuser/fhw/model_weights/llama --model /home/aiscuser/fhw/model_weights/athene
10
+ python vllmbattle.py --judge /home/aiscuser/fhw/model_weights/llama --model /home/aiscuser/fhw/model_weights/deepseekcoder
11
+ #python vllmbattle.py --judge /home/aiscuser/fhw/model_weights/llama --model /home/aiscuser/fhw/model_weights/qwen
12
+ #python vllmbattle.py --judge /home/aiscuser/fhw/model_weights/llama --model /home/aiscuser/fhw/model_weights/qwq
13
+ #python vllmbattle.py --judge /home/aiscuser/fhw/model_weights/qwen --model /home/aiscuser/fhw/model_weights/athene
14
+ #python vllmbattle.py --judge /home/aiscuser/fhw/model_weights/qwen --model /home/aiscuser/fhw/model_weights/deepseekcoder
15
+ #python vllmbattle.py --judge /home/aiscuser/fhw/model_weights/qwen --model /home/aiscuser/fhw/model_weights/llama
16
+ #python vllmbattle.py --judge /home/aiscuser/fhw/model_weights/qwen --model /home/aiscuser/fhw/model_weights/qwq
17
+ #python vllmbattle.py --judge /home/aiscuser/fhw/model_weights/qwq --model /home/aiscuser/fhw/model_weights/athene
18
+ #python vllmbattle.py --judge /home/aiscuser/fhw/model_weights/qwq --model /home/aiscuser/fhw/model_weights/deepseekcoder
19
+ #python vllmbattle.py --judge /home/aiscuser/fhw/model_weights/qwq --model /home/aiscuser/fhw/model_weights/llama
20
+ #python vllmbattle.py --judge /home/aiscuser/fhw/model_weights/qwq --model /home/aiscuser/fhw/model_weights/qwen
21
+ python /data/local/zhangdi/DPO/DPO_train.py
battlescore.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoTokenizer
2
+ from vllm import LLM, SamplingParams
3
+ import argparse
4
+ import json
5
+ from tqdm import tqdm
6
+ import re
7
def extract_score(judgement):
    """Extract an integer 0-10 score from a judge model's free-text verdict.

    Tries a sequence of increasingly loose patterns ("[[8/10]]", "[[8]]",
    "**Score: [8/10]**", "**Score:** 8", ...), and finally falls back to the
    last number appearing after the last "Score" mention.  Fractional scores
    are truncated toward zero.  Returns -1 when no score can be found.
    """
    # Ordered from most to least specific; first match wins.
    patterns = [
        r"\[\[(\d*\.\d+|\d+)/10\]\]",
        r"\[\[(\d*\.\d+|\d+)\]\]",
        r"\*\*Score: \[(\d*\.\d+|\d+)/10\]\*\*",
        r"\*\*Score: \[(\d*\.\d+|\d+)\]\*\*",
        r"\*\*Score: (\d*\.\d+|\d+)/10\*\*",
        r"\*\*Score: (\d*\.\d+|\d+)\*\*",
        r"\*\*Score:\*\* (\d*\.\d+|\d+)/10",
        r"\*\*Score:\*\* (\d*\.\d+|\d+)",
    ]
    for pattern in patterns:
        extracted = re.findall(pattern, judgement, re.S)
        if extracted:
            return int(float(extracted[-1]))
    # Fallback: take the last number after the "Score" mention.  The original
    # used the lazy pattern r"Score(.*?)", whose capture always matched the
    # empty string, making this branch dead (it always returned -1); the
    # greedy capture fixes that.
    extracted = re.findall(r"Score(.*)", judgement, re.S)
    if extracted:
        numbers = re.findall(r"\d*\.\d+|\d+", extracted[-1], re.S)
        if numbers:
            return int(float(numbers[-1]))
    return -1
49
# Align each candidate model's judgement stream with the judge's filtered
# instruction indices, pick the best-scoring answer per instruction (falling
# back to the judge's own answer), and write a merged "best answer" file.
parser = argparse.ArgumentParser()
parser.add_argument('--judgename', type=str,help='模型路径')
parser.add_argument('--modelnames', nargs='+')  # exactly four candidate models are consumed below
args = parser.parse_args()

# First line of the filter file maps judgename -> list of instruction indices.
f = open(f"/home/aiscuser/fhw/data/{args.judgename}_filtered_by_answer.json", "r+")
ddd = json.loads(f.readlines()[0])

# The judge model's own answers, used as the fallback "best" answer.
fr = open(f"/home/aiscuser/fhw/data/{args.judgename}_answerby_{args.judgename}.json", 'r+')
linesr = fr.readlines()

# One judgement file per candidate model, in args.modelnames order.
all_lines = []
for modelname in args.modelnames:
    f = open(f"/home/aiscuser/fhw/data/{args.judgename}_judge_{modelname}.json", 'r+')
    all_lines.append(f.readlines())


# a/b/c/d are independent read cursors into the four judgement streams; each
# advances only when its current record refers to the current instruction i.
a, b, c, d = 0, 0, 0, 0

fw = open(f"/home/aiscuser/fhw/data/{args.judgename}_with_best_answer.json", "w+")

for i in tqdm(ddd[args.judgename]):
    reference = json.loads(linesr[i])
    # NOTE(review): when a cursor runs past the end of its stream, the first
    # record is re-parsed as a placeholder; its "index" will not equal i, so
    # its score stays -1 below — presumably intentional, verify with writer.
    da = json.loads(all_lines[0][a]) if a<len(all_lines[0]) else json.loads(all_lines[0][0])
    db = json.loads(all_lines[1][b]) if b<len(all_lines[1]) else json.loads(all_lines[1][0])
    dc = json.loads(all_lines[2][c]) if c<len(all_lines[2]) else json.loads(all_lines[2][0])
    dd = json.loads(all_lines[3][d]) if d<len(all_lines[3]) else json.loads(all_lines[3][0])

    # Default score: -1 means "no judgement for this instruction".
    da["battlescore"], db["battlescore"], dc["battlescore"], dd["battlescore"] = -1, -1, -1, -1

    # A judgement counts only when it refers to the current instruction i;
    # consume it and advance that stream's cursor.
    if da["index"] == i:
        da["battlescore"] = extract_score(da["battle"])
        a = a + 1
    if db["index"] == i:
        db["battlescore"] = extract_score(db["battle"])
        b = b + 1
    if dc["index"] == i:
        dc["battlescore"] = extract_score(dc["battle"])
        c = c + 1
    if dd["index"] == i:
        dd["battlescore"] = extract_score(dd["battle"])
        d = d + 1

    instruction = reference["instruction"]
    scorelist = [da["battlescore"], db["battlescore"], dc["battlescore"], dd["battlescore"]]
    maxscore = max(scorelist)
    maxindex = scorelist.index(maxscore)  # first candidate wins ties

    # Keep a candidate's answer only when it clearly passes the threshold;
    # otherwise fall back to the judge's own reference answer.
    if maxscore>6:
        bestname = args.modelnames[maxindex]
        bestanswer = [da, db, dc, dd][maxindex]["response"]
    else:
        bestname = args.judgename
        bestanswer = reference["response"]
    fw.write(json.dumps({"instruction": instruction, "scorelist": scorelist, "bestname": bestname, "bestanswer": bestanswer, "modelnames": args.modelnames, "judgename": args.judgename})+"\n")
104
+
105
+
bertencode.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ This example starts multiple processes (1 per GPU), which encode
3
+ sentences in parallel. This gives a near linear speed-up
4
+ when encoding large text collections.
5
+ """
6
+ from tqdm import tqdm
7
+ import logging
8
+ import json
9
+ import torch
10
+ from sentence_transformers import LoggingHandler, SentenceTransformer
11
+
12
+ logging.basicConfig(
13
+ format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
14
+ )
15
+
16
+ # Important, you need to shield your code with if __name__. Otherwise, CUDA runs into issues when spawning new processes.
17
+ if __name__ == "__main__":
18
+ # Create a large list of 100k sentences
19
+ f = open("/home/aiscuser/fhw/data/qwq_python_selected.json","r+")
20
+ lines = f.readlines()
21
+ sentences = []
22
+ for line in tqdm(lines):
23
+ d= json.loads(line)
24
+ sentences.append(d["instruction"])
25
+
26
+
27
+ # Define the model
28
+ model = SentenceTransformer("/home/aiscuser/fhw/model_weights/all-roberta-large-v1")
29
+
30
+ # Start the multi-process pool on all available CUDA devices
31
+ pool = model.start_multi_process_pool(["cuda:0", "cuda:0", "cuda:0", "cuda:0", "cuda:0", "cuda:0", "cuda:0", "cuda:0", "cuda:0", "cuda:0", "cuda:1", "cuda:1", "cuda:1", "cuda:1", "cuda:1", "cuda:1", "cuda:1", "cuda:1", "cuda:1", "cuda:1", "cuda:2", "cuda:2", "cuda:2", "cuda:2", "cuda:2", "cuda:2", "cuda:2", "cuda:2", "cuda:2", "cuda:2", "cuda:3", "cuda:3", "cuda:3", "cuda:3", "cuda:3", "cuda:3", "cuda:3", "cuda:3", "cuda:3", "cuda:3", "cuda:4", "cuda:4", "cuda:4", "cuda:4", "cuda:4", "cuda:4", "cuda:4", "cuda:4", "cuda:4", "cuda:4", "cuda:5", "cuda:5", "cuda:5", "cuda:5", "cuda:5", "cuda:5", "cuda:5", "cuda:5", "cuda:5", "cuda:5", "cuda:6", "cuda:6", "cuda:6", "cuda:6", "cuda:6", "cuda:6", "cuda:6", "cuda:6", "cuda:6", "cuda:6", "cuda:7", "cuda:7", "cuda:7", "cuda:7", "cuda:7", "cuda:7", "cuda:7", "cuda:7", "cuda:7", "cuda:7", ])
32
+
33
+ # Compute the embeddings using the multi-process pool
34
+ emb = model.encode_multi_process(sentences, pool)
35
+
36
+ print("Embeddings computed. Shape:", emb.shape)
37
+
38
+ # Optional: Stop the processes in the pool
39
+ model.stop_multi_process_pool(pool)
40
+ torch.save(emb, "/home/aiscuser/fhw/embeddings/qwq_ins_embeddings.pt", pickle_protocol=4)
cleanans.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
"""Strip the chat-template assistant header from each saved best answer."""
from tqdm import tqdm
import json

# BUG FIX: the original called str.strip("<|start_header_id|>assistant<|end_header_id|>"),
# which treats the argument as a *character set* and strips any of those
# characters (a, s, t, d, n, e, r, h, i, '_', '<', '>', '|', ...) from BOTH
# ends — corrupting answers that legitimately start or end with them.
# removeprefix() removes the literal header token only.
HEADER = "<|start_header_id|>assistant<|end_header_id|>"

f = open("/home/aiscuser/fhw/data/all_instruct_with_answers.json", "r+")
fw = open("/home/aiscuser/fhw/data/all_instruct_with_answers_cleaned.json", "w+")

lines = f.readlines()
for line in lines:
    d = json.loads(line)
    d["bestanswer"] = d["bestanswer"].removeprefix(HEADER).strip("\n")
    fw.write(json.dumps(d) + "\n")

f.close()
fw.close()
cloudgpt-apim-token-cache.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fc1dedc3209b111ca9ac7676ffad159ce9bff625b0980c4897653414b796f3aa
3
+ size 300
cloudgpt_aoai.py ADDED
@@ -0,0 +1,619 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+ from typing import (
3
+ Any,
4
+ AsyncGenerator,
5
+ Callable,
6
+ Coroutine,
7
+ Literal,
8
+ Optional,
9
+ ParamSpec,
10
+ TypeVar,
11
+ cast,
12
+ Dict,
13
+ TYPE_CHECKING,
14
+ )
15
+ import sys, os
16
+ import contextlib
17
+ import functools
18
+
19
+ __all__ = [
20
+ "get_openai_token_provider",
21
+ "get_openai_token",
22
+ "get_openai_client",
23
+ "get_chat_completion",
24
+ "encode_image",
25
+ "cloudgpt_available_models",
26
+ ]
27
+
28
+ TokenProvider = Callable[[], str]
29
+ AsyncTokenProvider = Callable[[], Coroutine[Any, Any, str]]
30
+
31
+
32
def check_module():
    """Probe that the OpenAI SDK and the Azure identity broker are installed.

    Prints installation instructions and terminates the process with exit
    code 1 when either package is missing.
    """
    try:
        import openai, azure.identity.broker  # type: ignore
    except ImportError:
        print("Please install the required packages by running the following command:")
        print("pip install openai azure-identity-broker --upgrade")
        exit(1)
    else:
        # The imports were only needed as an availability probe.
        del openai, azure.identity.broker
41
+
42
+
43
+ check_module()
44
+
45
+ import openai
46
+ from openai import OpenAI
47
+
48
+ _depRt = TypeVar("_depRt")
49
+ _depParam = ParamSpec("_depParam")
50
+
51
+
52
+ def _deprecated(message: str):
53
+ def deprecated_decorator(
54
+ func: Callable[_depParam, _depRt]
55
+ ) -> Callable[_depParam, _depRt]:
56
+ def deprecated_func(
57
+ *args: _depParam.args, **kwargs: _depParam.kwargs
58
+ ) -> _depRt:
59
+ import traceback
60
+
61
+ print(
62
+ "\n ⚠️ \x1b[31m{} is a deprecated function. {}".format(
63
+ func.__name__, message
64
+ )
65
+ )
66
+ traceback.print_stack()
67
+ print("\x1b[0m")
68
+ return func(*args, **kwargs)
69
+
70
+ return deprecated_func
71
+
72
+ return deprecated_decorator
73
+
74
+
75
def _validate_token(token: str) -> bool:
    """Return True when *token* is accepted by the CloudGPT ping endpoint.

    Performs a GET against the service's /ping route with the token as a
    Bearer credential; any failure (network error, non-200 status, body
    other than "OK") is reported on stdout and yields False.
    """
    import requests

    ping_url = "https://cloudgpt-openai.azure-api.net/openai/ping"
    auth_headers = {
        "Authorization": f"Bearer {token}",
    }
    try:
        response = requests.get(ping_url, headers=auth_headers)
        assert response.status_code == 200 and response.text == "OK", response.text
    except Exception as e:
        print("Failed to validate token", e)
        return False
    return True
90
+
91
+
92
@functools.lru_cache(maxsize=3)
def get_openai_token_provider(
    token_cache_file: str = "cloudgpt-apim-token-cache.bin",
    client_id: Optional[str] = None,
    client_secret: Optional[str] = None,
    use_azure_cli: Optional[bool] = None,
    use_broker_login: Optional[bool] = None,
    use_managed_identity: Optional[bool] = None,
    use_device_code: Optional[bool] = None,
    skip_access_validation: Optional[bool] = False,
    **kwargs: Any,
) -> TokenProvider:
    """
    Get a token provider function that could return a valid access token for CloudGPT OpenAI.

    The return value is a function that should be used with AzureOpenAIClient constructor as azure_ad_token_provider parameter.
    The following code snippet shows how to use it with AzureOpenAIClient:

    ```python
    token_provider = get_openai_token_provider()
    client = openai.AzureOpenAI(
        api_version="2024-06-01",
        azure_endpoint="https://cloudgpt-openai.azure-api.net/",
        azure_ad_token_provider=token_provider,
    )
    ```

    Parameters
    ----------
    token_cache_file : str, optional
        path to the token cache file, by default 'cloudgpt-apim-token-cache.bin' in the current directory
    client_id : Optional[str], optional
        client id for AAD app, by default None
    client_secret : Optional[str], optional
        client secret for AAD app, by default None
    use_azure_cli : Optional[bool], optional
        use Azure CLI for authentication, by default None. If AzCli has been installed and logged in,
        it will be used for authentication. This is recommended for headless environments and AzCLI takes
        care of token cache and token refresh.
    use_broker_login : Optional[bool], optional
        use broker login for authentication, by default None.
        If not specified, it will be enabled for known supported environments (e.g. Windows, macOS, WSL, VSCode),
        but sometimes it may not always could cache the token for long-term usage.
        In such cases, you can disable it by setting it to False.
    use_managed_identity : Optional[bool], optional
        use managed identity for authentication, by default None.
        If not specified, it will use user assigned managed identity if client_id is specified,
        For use system assigned managed identity, client_id could be None but need to set use_managed_identity to True.
    use_device_code : Optional[bool], optional
        use device code for authentication, by default None. If not specified, it will use interactive login on supported platform.
    skip_access_validation : Optional[bool], optional
        skip access token validation, by default False.

    Returns
    -------
    TokenProvider
        the token provider function that could return a valid access token for CloudGPT OpenAI
    """
    # NOTE: extra **kwargs are accepted but not used anywhere in this body.
    import shutil
    from azure.identity.broker import InteractiveBrowserBrokerCredential
    from azure.identity import (
        ManagedIdentityCredential,
        ClientSecretCredential,
        DeviceCodeCredential,
        AuthenticationRecord,
        AzureCliCredential,
    )
    from azure.identity import TokenCachePersistenceOptions
    import msal  # type: ignore

    api_scope_base = "api://feb7b661-cac7-44a8-8dc1-163b63c23df2"
    tenant_id = "72f988bf-86f1-41af-91ab-2d7cd011db47"
    scope = api_scope_base + "/.default"

    # Persist acquired tokens on disk so repeated runs can reuse them.
    token_cache_option = TokenCachePersistenceOptions(
        name=token_cache_file,
        enable_persistence=True,
        allow_unencrypted_storage=True,
    )

    def save_auth_record(auth_record: AuthenticationRecord):
        # Best-effort: failure to persist the record is logged, not raised.
        try:
            with open(token_cache_file, "w") as cache_file:
                cache_file.write(auth_record.serialize())
        except Exception as e:
            print("failed to save auth record", e)

    def load_auth_record() -> Optional[AuthenticationRecord]:
        # Best-effort: a missing or unreadable cache simply means
        # re-authentication.
        try:
            if not os.path.exists(token_cache_file):
                return None
            with open(token_cache_file, "r") as cache_file:
                return AuthenticationRecord.deserialize(cache_file.read())
        except Exception as e:
            print("failed to load auth record", e)
            return None

    auth_record: Optional[AuthenticationRecord] = load_auth_record()

    # Tracks which credential path was selected, for error reporting below.
    current_auth_mode: Literal[
        "client_secret",
        "managed_identity",
        "az_cli",
        "interactive",
        "device_code",
        "none",
    ] = "none"

    # "Implicit" mode: no auth mechanism was explicitly requested, so one is
    # inferred from the environment (client_id, az CLI presence, platform).
    implicit_mode = not (
        use_managed_identity or use_azure_cli or use_broker_login or use_device_code
    )

    if use_managed_identity or (implicit_mode and client_id is not None):
        if not use_managed_identity and client_secret is not None:
            assert (
                client_id is not None
            ), "client_id must be specified with client_secret"
            current_auth_mode = "client_secret"
            identity = ClientSecretCredential(
                client_id=client_id,
                client_secret=client_secret,
                tenant_id=tenant_id,
                cache_persistence_options=token_cache_option,
                authentication_record=auth_record,
            )
        else:
            current_auth_mode = "managed_identity"
            if client_id is None:
                # using default managed identity
                identity = ManagedIdentityCredential(
                    cache_persistence_options=token_cache_option,
                )
            else:
                identity = ManagedIdentityCredential(
                    client_id=client_id,
                    cache_persistence_options=token_cache_option,
                )
    elif use_azure_cli or (implicit_mode and shutil.which("az") is not None):
        current_auth_mode = "az_cli"
        identity = AzureCliCredential(tenant_id=tenant_id)
    else:
        if implicit_mode:
            # enable broker login for known supported envs if not specified using use_device_code
            if sys.platform.startswith("darwin") or sys.platform.startswith("win32"):
                use_broker_login = True
            elif os.environ.get("WSL_DISTRO_NAME", "") != "":
                use_broker_login = True
            elif os.environ.get("TERM_PROGRAM", "") == "vscode":
                use_broker_login = True
            else:
                use_broker_login = False
        if use_broker_login:
            current_auth_mode = "interactive"
            identity = InteractiveBrowserBrokerCredential(
                tenant_id="72f988bf-86f1-41af-91ab-2d7cd011db47",
                cache_persistence_options=token_cache_option,
                use_default_broker_account=True,
                parent_window_handle=msal.PublicClientApplication.CONSOLE_WINDOW_HANDLE,
                authentication_record=auth_record,
            )
        else:
            current_auth_mode = "device_code"
            identity = DeviceCodeCredential(
                tenant_id="72f988bf-86f1-41af-91ab-2d7cd011db47",
                cache_persistence_options=token_cache_option,
                authentication_record=auth_record,
            )

    # Authenticate eagerly so failures surface here rather than on first use,
    # and refresh the on-disk auth record when a new one is issued.
    try:
        auth_record = identity.authenticate(scopes=[scope])
        if auth_record:
            save_auth_record(auth_record)

    except Exception as e:
        print(
            f"failed to acquire token from AAD for CloudGPT OpenAI using {current_auth_mode}",
            e,
        )
        raise e

    try:
        from azure.identity import get_bearer_token_provider

        token_provider = get_bearer_token_provider(identity, scope)
        token_verified_cache: str = ""

        def token_provider_wrapper():
            # Validate each *new* token once against the service ping
            # endpoint (unless skipped); identical tokens are not re-checked.
            nonlocal token_verified_cache
            token = token_provider()
            if token != token_verified_cache:
                if not skip_access_validation:
                    assert _validate_token(token), "failed to validate token"
                token_verified_cache = token
            return token

        return token_provider_wrapper
    except Exception as e:
        print("failed to acquire token from AAD for CloudGPT OpenAI", e)
        raise e
291
+
292
+
293
@functools.lru_cache(maxsize=3)
def _cached_async_token_provider_factory(**kwargs: Any) -> "AsyncTokenProvider":
    """Build (and cache) one async token-provider wrapper per distinct kwargs set.

    Caching is done here, on a regular function, on purpose: applying
    ``functools.lru_cache`` directly to an ``async def`` caches the *coroutine
    object* returned by each call, and a coroutine can only be awaited once —
    every cache hit after the first ``await`` would raise
    ``RuntimeError: cannot reuse already awaited coroutine``.
    """
    token_provider = get_openai_token_provider(
        **kwargs,
    )

    async def async_token_provider() -> str:
        # Delegate to the synchronous provider; token refresh/validation is
        # handled inside get_openai_token_provider's wrapper.
        return token_provider()

    return async_token_provider


async def async_get_openai_token_provider(
    **kwargs: Any,
) -> AsyncTokenProvider:
    """
    Async counterpart of ``get_openai_token_provider``.

    Currently wraps the synchronous provider in a coroutine; the token is still
    acquired synchronously inside the returned callable.
    """
    # TODO: implement a natively async version of get_openai_token_provider
    return _cached_async_token_provider_factory(**kwargs)
306
+
307
+
308
@_deprecated(
    "use get_openai_token_provider instead whenever possible "
    "and use it as the azure_ad_token_provider parameter in AzureOpenAIClient constructor. "
    "Please do not acquire token directly or use it elsewhere."
)
def get_openai_token(
    token_cache_file: str = "cloudgpt-apim-token-cache.bin",
    client_id: Optional[str] = None,
    client_secret: Optional[str] = None,
    use_azure_cli: Optional[bool] = None,
    use_broker_login: Optional[bool] = None,
    use_managed_identity: Optional[bool] = None,
    use_device_code: Optional[bool] = None,
    skip_access_validation: Optional[bool] = False,
    **kwargs: Any,
) -> str:
    """
    Acquire a single access token for CloudGPT OpenAI.

    Deprecated thin wrapper: it builds a token provider via
    ``get_openai_token_provider`` with the same arguments and immediately
    invokes it once, returning the resulting token string.
    """
    provider = get_openai_token_provider(
        token_cache_file=token_cache_file,
        client_id=client_id,
        client_secret=client_secret,
        use_azure_cli=use_azure_cli,
        use_broker_login=use_broker_login,
        use_managed_identity=use_managed_identity,
        use_device_code=use_device_code,
        skip_access_validation=skip_access_validation,
        **kwargs,
    )
    return provider()
338
+
339
+
340
+ """
341
+ Available models for CloudGPT OpenAI
342
+ """
343
+ cloudgpt_available_models = Literal[
344
+ "gpt-35-turbo-20220309",
345
+ "gpt-35-turbo-16k-20230613",
346
+ "gpt-35-turbo-20230613",
347
+ "gpt-35-turbo-1106",
348
+ "gpt-4-20230321",
349
+ "gpt-4-20230613",
350
+ "gpt-4-32k-20230321",
351
+ "gpt-4-32k-20230613",
352
+ "gpt-4-1106-preview",
353
+ "gpt-4-0125-preview",
354
+ "gpt-4-visual-preview",
355
+ "gpt-4-turbo-20240409",
356
+ "gpt-4o-20240513",
357
+ "gpt-4o-20240806",
358
+ "gpt-4o-mini-20240718",
359
+ ]
360
+
361
+ cloudgpt_available_realtime_models = Literal["gpt-4o-realtime-preview-20241001"]
362
+
363
+
364
def encode_image(image_path: str, mime_type: Optional[str] = None) -> str:
    """
    Encode an image file as a base64 data URL suitable for the OpenAI API.

    Parameters
    ----------
    image_path : str
        Path to the image file.
    mime_type : Optional[str], optional
        MIME type of the image. When omitted, it is inferred from the file
        extension; if inference fails or yields a non-image type, the MIME
        type falls back to ``image/png`` with a warning.

    Returns
    -------
    str
        A ``data:<mime>;base64,<payload>`` URL for the image.
    """
    import base64
    import mimetypes

    if mime_type is None:
        # Infer from the extension of the file name only.
        mime_type = mimetypes.guess_type(os.path.basename(image_path))[0]

    with open(image_path, "rb") as image_file:
        payload = base64.b64encode(image_file.read()).decode("ascii")

    if not (mime_type or "").startswith("image/"):
        print(
            "Warning: mime_type is not specified or not an image mime type. Defaulting to png."
        )
        mime_type = "image/png"

    return f"data:{mime_type};base64,{payload}"
400
+
401
+
402
@functools.lru_cache(maxsize=3)
def get_openai_client(
    client_id: Optional[str] = None,
    client_secret: Optional[str] = None,
    use_azure_cli: Optional[bool] = None,
    use_broker_login: Optional[bool] = None,
    use_managed_identity: Optional[bool] = None,
    use_device_code: Optional[bool] = None,
) -> OpenAI:
    """
    Initialize OpenAI client for CloudGPT OpenAI.

    All parameters are optional and will use the default authentication method if not specified.

    Parameters
    ----------
    client_id : Optional[str], optional
        client id for AAD app, by default None
    client_secret : Optional[str], optional
        client secret for AAD app, by default None
    use_azure_cli : Optional[bool], optional
        use Azure CLI for authentication, by default None. If AzCli has been installed and logged in,
        it will be used for authentication. This is recommended for headless environments and AzCLI takes
        care of token cache and token refresh.
    use_broker_login : Optional[bool], optional
        use broker login for authentication, by default None.
        If not specified, it will be enabled for known supported environments (e.g. Windows, macOS, WSL, VSCode),
        but sometimes it may not always could cache the token for long-term usage.
        In such cases, you can disable it by setting it to False.
    use_managed_identity : Optional[bool], optional
        use managed identity for authentication, by default None.
        If not specified, it will use user assigned managed identity if client_id is specified,
        For use system assigned managed identity, client_id could be None but need to set use_managed_identity to True.
    use_device_code : Optional[bool], optional
        use device code for authentication, by default None. If not specified, it will use interactive login on supported platform.

    Returns
    -------
    OpenAI
        OpenAI client for CloudGPT OpenAI. Check https://github.com/openai/openai-python for more details.
    """
    token_provider = get_openai_token_provider(
        client_id=client_id,
        client_secret=client_secret,
        use_azure_cli=use_azure_cli,
        use_broker_login=use_broker_login,
        use_managed_identity=use_managed_identity,
        use_device_code=use_device_code,
    )
    # Eagerly acquire one token so authentication failures surface at client
    # construction time rather than on the first request.
    # Security fix: the previous version printed the raw bearer token to
    # stdout here — never log or print access tokens.
    token_provider()
    client = openai.AzureOpenAI(
        api_version="2024-06-01",
        azure_endpoint="https://cloudgpt-openai.azure-api.net/",
        azure_ad_token_provider=token_provider,
    )
    return client
458
+
459
+
460
def get_chat_completion(
    model: Optional[cloudgpt_available_models] = None,
    client_id: Optional[str] = None,
    client_secret: Optional[str] = None,
    use_azure_cli: Optional[bool] = None,
    use_broker_login: Optional[bool] = None,
    use_managed_identity: Optional[bool] = None,
    use_device_code: Optional[bool] = None,
    **kwargs: Any,
):
    """
    Helper function to get chat completion from OpenAI API.

    Remaining ``kwargs`` (e.g. ``messages``, ``temperature``) are forwarded to
    ``client.chat.completions.create``. The legacy ``engine`` keyword is
    accepted as an alias for ``model`` for backward compatibility.

    Raises
    ------
    ValueError
        If neither ``model`` nor the legacy ``engine`` kwarg is given.
    """

    # Backward compatibility: older callers passed the deployment name via the
    # legacy "engine" keyword instead of "model". pop() both reads and removes
    # it so it is never forwarded to the API.
    engine: Optional[str] = kwargs.pop("engine", None)

    model_name: Any = model if model is not None else engine
    if model_name is None:
        raise ValueError("model name must be specified by 'model' parameter")

    client = get_openai_client(
        client_id=client_id,
        client_secret=client_secret,
        use_azure_cli=use_azure_cli,
        use_broker_login=use_broker_login,
        use_managed_identity=use_managed_identity,
        use_device_code=use_device_code,
    )

    # Bug fix: this helper serves *chat* completions (callers supply
    # "messages"), but it previously called the legacy text-completions
    # endpoint (client.completions.create), which rejects chat kwargs.
    response: Any = client.chat.completions.create(model=model_name, **kwargs)

    return response
497
+
498
+
499
+ def _check_rtclient():
500
+ try:
501
+ import rtclient # type: ignore
502
+
503
+ del rtclient
504
+ except ImportError:
505
+ raise ImportError(
506
+ f"rtclient package is required when using realtime API`. Please install it by running \n"
507
+ "pip install https://github.com/Azure-Samples/aoai-realtime-audio-sdk/releases/download/py%2Fv0.5.1/rtclient-0.5.1-py3-none-any.whl"
508
+ )
509
+ return True
510
+
511
+
512
+ if TYPE_CHECKING:
513
+ from rtclient import RTClient, RTLowLevelClient
514
+
515
+
516
async def get_realtime_low_level_client(
    model: cloudgpt_available_realtime_models = "gpt-4o-realtime-preview-20241001",
    **kwargs: Any,
) -> RTLowLevelClient:
    """
    Get realtime client with low level API for fined grained control.

    Extra ``kwargs`` are forwarded to ``async_get_openai_token_provider``
    (i.e. the same authentication options as ``get_openai_token_provider``).

    Usage:
    ```python
    async with await get_realtime_low_level_client() as client:
        # use client
        pass
    ```
    """
    # Raises ImportError with install instructions if rtclient is missing.
    assert _check_rtclient()
    from rtclient import RTLowLevelClient

    class CloudGPT_AOAI_RTLowLevelClient(RTLowLevelClient):
        # Subclass that swaps key-based auth for AAD bearer-token auth.
        def __init__(
            self,
            token_provider: AsyncTokenProvider,
            url: str = "https://cloudgpt-openai.azure-api.net/",
            azure_deployment: cloudgpt_available_realtime_models | None = None,
        ):
            self._async_token_provider = token_provider

            from azure.core.credentials import AzureKeyCredential

            # The base class requires a key credential argument; a dummy value
            # is passed because real auth headers come from the _get_auth
            # override below. NOTE(review): presumes the base class only uses
            # key_credential via _get_auth — confirm against rtclient source.
            key_credential = AzureKeyCredential("placeholder")

            super().__init__(
                url=url,
                key_credential=key_credential,
                azure_deployment=azure_deployment,
            )

        async def _get_auth(self) -> Dict[str, str]:
            # Fetch a fresh AAD token and present it as a Bearer header.
            token = await self._async_token_provider()
            return {"Authorization": f"Bearer {token}"}

    token_provider = await async_get_openai_token_provider(**kwargs)
    return CloudGPT_AOAI_RTLowLevelClient(
        token_provider=token_provider,
        azure_deployment=model,
    )
561
+
562
+
563
async def get_realtime_client(
    model: cloudgpt_available_realtime_models = "gpt-4o-realtime-preview-20241001",
    **kwargs: Any,
) -> RTClient:
    """
    Get realtime client with high level API for simplified usage.

    Extra ``kwargs`` are forwarded to ``get_realtime_low_level_client`` and
    ultimately to the token provider (authentication options).

    Usage:
    ```python
    async with await get_realtime_client() as client:
        # use client
        pass
    ```
    """
    # Raises ImportError with install instructions if rtclient is missing.
    assert _check_rtclient()
    from rtclient import RTClient, MessageQueueWithError, Session

    class CloudGPT_AOAI_RTClient(RTClient):
        # NOTE(review): this __init__ intentionally does NOT call
        # super().__init__(); it re-creates the attributes RTClient would set
        # so the pre-built CloudGPT low-level client can be injected instead.
        # Fragile against rtclient upgrades — verify the attribute names
        # (_client, _message_queue, session, _response_map) still match.
        def __init__(
            self,
            low_level_client: Optional[RTLowLevelClient] = None,
        ):
            self._client = low_level_client

            # Queue routing received messages; any message with type "error"
            # is treated as an error by the predicate.
            self._message_queue = MessageQueueWithError(
                receive_delegate=self._receive_message,
                error_predicate=lambda m: m is not None and (m.type == "error"),
            )

            self.session: Optional[Session] = None

            # Maps response ids to their content (populated elsewhere by the
            # RTClient base machinery).
            self._response_map: dict[str, str] = {}

    low_level_client = await get_realtime_low_level_client(model=model, **kwargs)
    return CloudGPT_AOAI_RTClient(low_level_client=low_level_client)
598
+
599
+
600
def _test_call(**kwargs: Any):
    """Smoke test: send one chat message through the CloudGPT gateway and print the reply."""
    prompt = "What is the content?"

    client = get_openai_client(**kwargs)

    completion = client.chat.completions.create(
        model="gpt-4o-mini-20240718",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.7,
        max_tokens=100,
        top_p=0.95,
        frequency_penalty=0,
        presence_penalty=0,
    )

    print(completion.choices[0].message)


if __name__ == "__main__":
    _test_call(use_broker_login=True)
combine.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
import json
import os

# Merge all per-shard "*_with_best_answer*" files under DATA_DIR into a single
# combined file. Input lines are concatenated verbatim (JSON-lines style).
DATA_DIR = "/home/aiscuser/fhw/data"

lines = []
for name in os.listdir(DATA_DIR):
    if "_with_best_answer" in name:
        # Read-only access suffices; the original opened with "r+" (update
        # mode) and never closed the handles.
        with open(os.path.join(DATA_DIR, name), "r") as f:
            lines.extend(f.readlines())

with open(os.path.join(DATA_DIR, "all_instruct_with_answers.json"), "w") as fw:
    fw.writelines(lines)
config_sft_fhw.yaml ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Model arguments
2
+ model_name_or_path: mistralai/Mistral-7B-v0.1
3
+ model_revision: main
4
+ torch_dtype: bfloat16
5
+ attn_implementation: flash_attention_2
6
+
7
+ # Data training arguments
8
+ chat_template: "{% if not add_generation_prompt is defined %}\n{% set add_generation_prompt = false %}\n{% endif %}\n{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set ns.found = true -%}\n {%- endif -%}\n{%- endfor -%}\n{{bos_token}}{%- if not ns.found -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\\n'}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n {%- else %}\n {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{% if add_generation_prompt %}\n{{'### Response:'}}\n{% endif %}"
9
+ dataset_mixer:
10
+ data/my: 1.0
11
+ dataset_splits:
12
+ - train
13
+ preprocessing_num_workers: 128
14
+
15
+ # SFT trainer config
16
+ bf16: true
17
+ do_eval: False
18
+ eval_strategy: epoch
19
+ gradient_accumulation_steps: 1
20
+ gradient_checkpointing: true
21
+ gradient_checkpointing_kwargs:
22
+ use_reentrant: False
23
+ learning_rate: 1.0e-05
24
+ log_level: info
25
+ logging_steps: 5
26
+ logging_strategy: steps
27
+ lr_scheduler_type: cosine
28
+ max_seq_length: 4096
29
+ num_train_epochs: 3
30
+ output_dir: trained_models/deepseekcoder
31
+ overwrite_output_dir: true
32
+ per_device_eval_batch_size: 8
33
+ per_device_train_batch_size: 8
34
+ push_to_hub: true
35
+ remove_unused_columns: true
36
+ report_to:
37
+ - tensorboard
38
+ save_strategy: "epoch"
39
+ save_total_limit: 5
40
+ seed: 42
41
+ warmup_ratio: 0.1
crux.sh ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Run CRUXEval-style generation for the warriordeep448 model.
# The six original invocations differed only in task, temperature, output tag,
# and the optional --cot flag, so they are factored into one function.

MODEL=/home/aiscuser/fhw/model_weights/warriordeep448/

# run_eval TASK TEMPERATURE TAG [extra flags...]
#   TASK        : output_prediction | input_prediction
#   TEMPERATURE : sampling temperature
#   TAG         : subdirectory name under model_generations_raw/
run_eval() {
    local task=$1 temp=$2 tag=$3
    shift 3
    python main.py \
        --model "$MODEL" \
        --trust_remote_code \
        --tasks "$task" \
        --batch_size 20 \
        --n_samples 10 \
        --max_length_generation 4096 \
        --precision fp16 \
        --temperature "$temp" \
        --save_generations \
        --save_generations_path "model_generations_raw/${tag}/shard_0.json" \
        --shuffle \
        --tensor_parallel_size 8 \
        "$@"
}

run_eval output_prediction 0.8 warriordeep448_temp0.8_output
run_eval output_prediction 0.2 warriordeep448_temp0.2_output
run_eval output_prediction 0.8 warriordeep448+cot_temp0.8_output --cot
run_eval output_prediction 0.2 warriordeep448+cot_temp0.2_output --cot
run_eval input_prediction  0.8 warriordeep448+cot_temp0.8_input  --cot
run_eval input_prediction  0.2 warriordeep448+cot_temp0.2_input  --cot
evalplus/.dockerignore ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/#use-with-ide
110
+ .pdm.toml
111
+
112
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113
+ __pypackages__/
114
+
115
+ # Celery stuff
116
+ celerybeat-schedule
117
+ celerybeat.pid
118
+
119
+ # SageMath parsed files
120
+ *.sage.py
121
+
122
+ # Environments
123
+ .env
124
+ .venv
125
+ env/
126
+ venv/
127
+ ENV/
128
+ env.bak/
129
+ venv.bak/
130
+
131
+ # Spyder project settings
132
+ .spyderproject
133
+ .spyproject
134
+
135
+ # Rope project settings
136
+ .ropeproject
137
+
138
+ # mkdocs documentation
139
+ /site
140
+
141
+ # mypy
142
+ .mypy_cache/
143
+ .dmypy.json
144
+ dmypy.json
145
+
146
+ # Pyre type checker
147
+ .pyre/
148
+
149
+ # pytype static type analyzer
150
+ .pytype/
151
+
152
+ # Cython debug symbols
153
+ cython_debug/
154
+
155
+ # PyCharm
156
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
159
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160
+ # nuclear option because steven uses PyCharm.
161
+ .idea/
162
+
163
+ # VSCode
164
+ .vscode/
165
+
166
+ # EvalPlus specific
167
+ EvalPlus/
168
+ backup/
169
+ passrate.p*
170
+ min_cov_dir/
171
+ HumanEvalPlus*.gz
172
+ MbppPlus*.gz
173
+ evalplus/_version.py
174
+ *mbpp.json
175
+ *.jsonl
176
+ *.json
177
+ *.png
178
+ *.pdf
179
+ trash-bin
180
+ .bak
181
+ evalplus.github.io
182
+ evalplus_results/
evalplus/.github/ISSUE_TEMPLATE/buggy_contract.yml ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: "🐛 Report Bad Contract"
2
+ description: Report to us that certain program contract should be repaired.
3
+ title: "🐛 [TestRemoval] - <TASK_ID> <WHY>"
4
+ labels: ["program contract"]
5
+ body:
6
+ - type: input
7
+ id: version
8
+ attributes:
9
+ label: "EvalPlus version"
10
+ description: What is the version of EvalPlus? You can find it by running `pip show evalplus`.
11
+ placeholder: For example, 0.1.0
12
+ validations:
13
+ required: true
14
+ - type: input
15
+ id: cache
16
+ attributes:
17
+ label: "Output of running `ls ~/.cache/evalplus`"
18
+ validations:
19
+ required: true
20
+ - type: input
21
+ id: task_id
22
+ attributes:
23
+ label: "Task ID of the programming task"
24
+ placeholder: HumanEval/[??]
25
+ validations:
26
+ required: true
27
+ - type: textarea
28
+ id: original
29
+ attributes:
30
+ label: "The original wrong contract"
31
+ description: You can run `python -c "from evalplus.data import get_human_eval_plus; print(get_human_eval_plus()['HumanEval/❓']['contract'])"`
32
+ render: python
33
+ validations:
34
+ required: true
35
+ - type: textarea
36
+ id: new
37
+ attributes:
38
+ label: "Your proposed new contract"
39
+ render: python
40
+ validations:
41
+ required: true
42
+ - type: textarea
43
+ id: other
44
+ attributes:
45
+ label: "Other context"
46
+ description: (Optional) Anything else the maintainer should notice?
47
+ validations:
48
+ required: false
evalplus/.github/ISSUE_TEMPLATE/buggy_test.yml ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: "🐛 Report Bad Test Inputs"
2
+ description: Report to us that certain test inputs should be removed.
3
+ title: "🐛 [TestRemoval] - <TASK_ID> <WHY>"
4
+ labels: ["bug"]
5
+ body:
6
+ - type: input
7
+ id: version
8
+ attributes:
9
+ label: "EvalPlus version"
10
+ description: What is the version of EvalPlus? You can find it by running `pip show evalplus`.
11
+ placeholder: For example, 0.1.0
12
+ validations:
13
+ required: true
14
+ - type: input
15
+ id: cache
16
+ attributes:
17
+ label: "Output of running `ls ~/.cache/evalplus`"
18
+ validations:
19
+ required: true
20
+ - type: input
21
+ id: task_id
22
+ attributes:
23
+ label: "Task ID of the programming task"
24
+ placeholder: HumanEval/[??]
25
+ validations:
26
+ required: true
27
+ - type: textarea
28
+ id: test_input
29
+ attributes:
30
+ label: "Test input"
31
+ description: The text form of the test input that you think should be removed
32
+ render: python
33
+ validations:
34
+ required: true
35
+ - type: textarea
36
+ id: description
37
+ attributes:
38
+ label: "Description"
39
+ description: An explicit description of why you think this test should be removed
40
+ placeholder: Here is a correct solution but it is incorrectly falsified by the test because ...
41
+ validations:
42
+ required: true
43
+ - type: textarea
44
+ id: other
45
+ attributes:
46
+ label: "Other context"
47
+ description: (Optional) Anything else the maintainer should notice?
48
+ validations:
49
+ required: false
evalplus/.github/ISSUE_TEMPLATE/config.yml ADDED
@@ -0,0 +1 @@
 
 
1
+ blank_issues_enabled: true
evalplus/.github/ISSUE_TEMPLATE/model_eval_request.yml ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: "🤗 Model Evaluation Request"
2
+ description: Request EvalPlus maintainers to evaluate your model independently and update it on our leaderboard.
3
+ title: "🤗 [REQUEST] - FILL_THE_MODEL_NAME_HERE"
4
+ labels: ["model eval"]
5
+ body:
6
+ - type: textarea
7
+ id: about
8
+ attributes:
9
+ label: "Model introduction"
10
+ description: Provide a brief introduction to the model.
11
+ placeholder: The models is created by ... and is used for ...
12
+ validations:
13
+ required: true
14
+ - type: input
15
+ id: url
16
+ attributes:
17
+ label: "Model URL"
18
+ description: Indicate the URL (e.g., huggingface or other release pages) of the model
19
+ placeholder: https://huggingface.co/[???]/[???]
20
+ validations:
21
+ required: true
22
+ - type: textarea
23
+ id: other
24
+ attributes:
25
+ label: "Additional information (Optional)"
26
+ description: Special steps indicating how to run the model with preferably scripts/codes.
27
+ placeholder: What data type precision should be used? What is the minimal hardware requirement? Can it be accelerated by tools such as vLLM?
28
+ validations:
29
+ required: false
30
+ - type: textarea
31
+ id: decontamination
32
+ attributes:
33
+ label: "Decontamination"
34
+ description: How does the authors avoid contamination for their training data?
35
+ placeholder: Please clarify the decontamination steps and quantify it, e.g., N-gram match of ground-truth code in the training dataset.
36
+ validations:
37
+ required: true
38
+ - type: dropdown
39
+ id: author
40
+ attributes:
41
+ label: "Author"
42
+ description: "Are you (one of) the author(s) of the model?"
43
+ multiple: false
44
+ options:
45
+ - "Yes"
46
+ - "No"
47
+ validations:
48
+ required: true
49
+ - type: dropdown
50
+ id: data
51
+ attributes:
52
+ label: "Data"
53
+ description: "Is the training/fine-tuning data available in public?"
54
+ multiple: false
55
+ options:
56
+ - "Yes (If so please specify in 'Additional information')"
57
+ - "No"
58
+ validations:
59
+ required: true
60
+ - type: checkboxes
61
+ id: security
62
+ attributes:
63
+ label: "Security"
64
+ options:
65
+ - label: "I confirm that the model is safe to run which is not designed to produce malicious code or content."
66
+ required: true
67
+ - type: checkboxes
68
+ id: integrity
69
+ attributes:
70
+ label: "Integrity"
71
+ options:
72
+ - label: "I confirm that the model comes from unique and original work and does not contain any plagiarism."
73
+ required: true
evalplus/.gitignore ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/#use-with-ide
110
+ .pdm.toml
111
+
112
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113
+ __pypackages__/
114
+
115
+ # Celery stuff
116
+ celerybeat-schedule
117
+ celerybeat.pid
118
+
119
+ # SageMath parsed files
120
+ *.sage.py
121
+
122
+ # Environments
123
+ .env
124
+ .venv
125
+ env/
126
+ venv/
127
+ ENV/
128
+ env.bak/
129
+ venv.bak/
130
+
131
+ # Spyder project settings
132
+ .spyderproject
133
+ .spyproject
134
+
135
+ # Rope project settings
136
+ .ropeproject
137
+
138
+ # mkdocs documentation
139
+ /site
140
+
141
+ # mypy
142
+ .mypy_cache/
143
+ .dmypy.json
144
+ dmypy.json
145
+
146
+ # Pyre type checker
147
+ .pyre/
148
+
149
+ # pytype static type analyzer
150
+ .pytype/
151
+
152
+ # Cython debug symbols
153
+ cython_debug/
154
+
155
+ # PyCharm
156
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
159
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160
+ # nuclear option because steven uses PyCharm.
161
+ .idea/
162
+
163
+ # VSCode
164
+ .vscode/
165
+
166
+ # EvalPlus specific
167
+ EvalPlus/
168
+ backup/
169
+ passrate.p*
170
+ min_cov_dir/
171
+ HumanEvalPlus*.gz
172
+ MbppPlus*.gz
173
+ evalplus/_version.py
174
+ *mbpp.json
175
+ *.jsonl
176
+ *.json
177
+ *.png
178
+ *.pdf
179
+ trash-bin
180
+ .bak
181
+ evalplus.github.io
182
+ evalplus_results/
evalplus/.pre-commit-config.yaml ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ repos:
2
+ - repo: https://github.com/pycqa/isort
3
+ rev: 5.12.0
4
+ hooks:
5
+ - id: isort
6
+ name: isort (python)
7
+ args: ["--profile", "black"]
8
+ - repo: https://github.com/psf/black
9
+ rev: 22.6.0
10
+ hooks:
11
+ - id: black
12
+ - repo: https://github.com/pre-commit/pre-commit-hooks
13
+ rev: v4.3.0
14
+ hooks:
15
+ - id: check-yaml
16
+ - id: end-of-file-fixer
17
+ - id: trailing-whitespace
18
+ exclude: (?x)^(
19
+ groundtruth/.*
20
+ )$
evalplus/CITATION.cff ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ cff-version: 1.2.0
2
+ message: "If you use this work and love it, consider citing it as below \U0001F917"
3
+ title: EvalPlus
4
+ authors:
5
+ - family-names: EvalPlus Team
6
+ url: https://github.com/evalplus/evalplus
7
+ doi: https://doi.org/10.48550/arXiv.2305.01210
8
+ date-released: 2023-05-01
9
+ license: Apache-2.0
10
+ preferred-citation:
11
+ type: article
12
+ title: "Is Your Code Generated by ChatGPT Really Correct? Rigorous Evaluation of Large Language Models for Code Generation"
13
+ authors:
14
+ - family-names: Liu
15
+ given-names: Jiawei
16
+ - family-names: Xia
17
+ given-names: Chunqiu Steven
18
+ - family-names: Wang
19
+ given-names: Yuyao
20
+ - family-names: Zhang
21
+ given-names: Lingming
22
+ year: 2023
23
+ journal: "arXiv preprint arXiv:2305.01210"
24
+ doi: https://doi.org/10.48550/arXiv.2305.01210
25
+ url: https://arxiv.org/abs/2305.01210
evalplus/Dockerfile ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Better use newer Python as generated code can use new features
2
+ FROM python:3.11-slim
3
+
4
+ # install git and c++ (required by cirronlib.cpp)
5
+ RUN apt-get update && apt-get install -y git g++
6
+
7
+ # upgrade to latest pip
8
+ RUN pip install --upgrade pip
9
+
10
+ COPY . /evalplus
11
+
12
+ RUN cd /evalplus && pip install ".[perf]"
13
+
14
+ # Pre-install the dataset
15
+ RUN python3 -c "from evalplus.data import *; get_human_eval_plus(); get_mbpp_plus(); get_evalperf_data()"
16
+
17
+ WORKDIR /app
18
+
19
+ CMD ["bash"]
evalplus/LICENSE ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
202
+
203
+ -------------------------------------------------------------------------------
204
+ The files under "evalplus/eval/" additionally complies with the MIT License for
205
+ being built on OpenAI's HumanEval work.
evalplus/MANIFEST.in ADDED
@@ -0,0 +1 @@
 
 
1
+ exclude evalplus/_experimental/**/*.py
evalplus/README.md ADDED
@@ -0,0 +1,325 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # `EvalPlus(📖) => 📚`
2
+
3
+ <p align="center">
4
+ <a href="https://evalplus.github.io"><img src="https://img.shields.io/badge/%F0%9F%8F%86-leaderboard-8A2BE2"></a>
5
+ <a href="https://openreview.net/forum?id=1qvx610Cu7"><img src="https://img.shields.io/badge/EvalPlus-NeurIPS'23-a55fed.svg"></a>
6
+ <a href="https://openreview.net/forum?id=IBCBMeAhmC"><img src="https://img.shields.io/badge/EvalPerf-COLM'24-a55fed.svg"></a>
7
+ <a href="https://huggingface.co/evalplus/"><img src="https://img.shields.io/badge/🤗%20Hugging%20Face-evalplus-%23ff8811.svg"></a>
8
+ <a href="https://pypi.org/project/evalplus/"><img src="https://img.shields.io/pypi/v/evalplus?color=g"></a>
9
+ <a href="https://hub.docker.com/r/ganler/evalplus" title="Docker"><img src="https://img.shields.io/docker/image-size/ganler/evalplus"></a>
10
+ </p>
11
+
12
+ <p align="center">
13
+ <a href="#-about">📙About</a> •
14
+ <a href="#-quick-start">🔥Quick Start</a> •
15
+ <a href="#-llm-backends">🚀LLM Backends</a> •
16
+ <a href="#-documents">📚Documents</a> •
17
+ <a href="#-citation">📜Citation</a> •
18
+ <a href="#-acknowledgement">🙏Acknowledgement</a>
19
+ </p>
20
+
21
+ ## 📢 News
22
+
23
+ Who's using EvalPlus datasets? EvalPlus has been used by various LLM teams, including:
24
+
25
+ * [Meta Llama 3.1 and 3.3](https://ai.meta.com/blog/meta-llama-3-1/)
26
+ * [Allen AI TÜLU 1/2/3](https://github.com/allenai/open-instruct/blob/main/docs/tulu1_tulu2.md#benchmark-based-eval)
27
+ * [Qwen2.5-Coder](https://qwenlm.github.io/blog/qwen2.5-coder-family/)
28
+ * [CodeQwen 1.5](https://qwenlm.github.io/blog/codeqwen1.5/)
29
+ * [DeepSeek-Coder V2](https://arxiv.org/pdf/2406.11931)
30
+ * [Qwen2](https://arxiv.org/pdf/2407.10671)
31
+ * [Snowflake Arctic](https://www.snowflake.com/en/data-cloud/arctic/)
32
+ * [StarCoder2](https://arxiv.org/pdf/2402.19173)
33
+ * [Magicoder](https://arxiv.org/pdf/2312.02120)
34
+ * [WizardCoder](https://arxiv.org/pdf/2306.08568)
35
+
36
+ Below tracks the notable updates of EvalPlus:
37
+
38
+ - **[2024-10-20 `v0.3.1`]**: EvalPlus `v0.3.1` is officially released! Highlights: *(i)* Code efficiency evaluation via EvalPerf, *(ii)* one command to run all: generation + post-processing + evaluation, *(iii)* support for more inference backends such as Google Gemini & Anthropic, etc.
39
+ - **[2024-06-09 pre `v0.3.0`]**: Improved ground-truth solutions for MBPP+ tasks (IDs: 459, 102, 559). Thanks to [EvalArena](https://github.com/crux-eval/eval-arena).
40
+ - **[2024-04-17 pre `v0.3.0`]**: MBPP+ is upgraded to `v0.2.0` by removing some broken tasks (399 -> 378 tasks). ~4pp pass@1 improvement could be expected.
41
+
42
+ <details><summary>Earlier news <i>:: click to expand ::</i></summary>
43
+ <div>
44
+
45
+ - ([`v0.2.1`](https://github.com/evalplus/evalplus/releases/tag/v0.2.1)) You can use EvalPlus datasets via [bigcode-evaluation-harness](https://github.com/bigcode-project/bigcode-evaluation-harness)! HumanEval+ oracle fixes (32).
46
+ - ([`v0.2.0`](https://github.com/evalplus/evalplus/releases/tag/v0.2.0)) MBPP+ is released! HumanEval contract & input fixes (0/3/9/148/114/1/2/99/28/32/35/160).
47
+ - ([`v0.1.7`](https://github.com/evalplus/evalplus/releases/tag/v0.1.7)) [Leaderboard](https://evalplus.github.io/leaderboard.html) release; HumanEval+ contract and input fixes (32/166/126/6)
48
+ - ([`v0.1.6`](https://github.com/evalplus/evalplus/releases/tag/v0.1.6)) Configurable and by-default-conservative timeout settings; HumanEval+ contract & ground-truth fixes (129/148/75/53/0/3/9/140)
49
+ - ([`v0.1.5`](https://github.com/evalplus/evalplus/releases/tag/v0.1.5)) HumanEval+ mini is released for ultra-fast evaluation when you have too many samples!
50
+ - ([`v0.1.1`](https://github.com/evalplus/evalplus/releases/tag/v0.1.1)) Optimizing user experiences: evaluation speed, PyPI package, Docker, etc.
51
+ - ([`v0.1.0`](https://github.com/evalplus/evalplus/releases/tag/v0.1.0)) HumanEval+ is released!
52
+
53
+ </div>
54
+ </details>
55
+
56
+
57
+ ## 📙 About
58
+
59
+ EvalPlus is a rigorous evaluation framework for LLM4Code, with:
60
+
61
+ - ✨ **HumanEval+**: 80x more tests than the original HumanEval!
62
+ - ✨ **MBPP+**: 35x more tests than the original MBPP!
63
+ - ✨ **EvalPerf**: evaluating the efficiency of LLM-generated code!
64
+ - ✨ **Framework**: our packages/images/tools can easily and safely evaluate LLMs on above benchmarks.
65
+
66
+ Why EvalPlus?
67
+
68
+ - ✨ **Precise evaluation**: See [our leaderboard](https://evalplus.github.io/leaderboard.html) for latest LLM rankings before & after rigorous evaluation.
69
+ - ✨ **Coding rigorousness**: Look at the score differences! esp. before & after using EvalPlus tests! Less drop means more rigorousness in code generation; while a bigger drop means the generated code tends to be fragile.
70
+ - ✨ **Code efficiency**: Beyond correctness, our EvalPerf dataset evaluates the efficiency of LLM-generated code via performance-exercising coding tasks and test inputs.
71
+
72
+ Want to know more details? Read our papers & materials!
73
+
74
+ - **EvalPlus**: [NeurIPS'23 paper](https://openreview.net/forum?id=1qvx610Cu7), [Slides](https://docs.google.com/presentation/d/1eTxzUQG9uHaU13BGhrqm4wH5NmMZiM3nI0ezKlODxKs), [Poster](https://jw-liu.xyz/assets/pdf/EvalPlus_Poster.pdf), [Leaderboard](https://evalplus.github.io/leaderboard.html)
75
+ - **EvalPerf**: [COLM'24 paper](https://openreview.net/forum?id=IBCBMeAhmC), [Poster](https://jw-liu.xyz/assets/pdf/jiawei-colm-evalperf-poster.pdf), [Documentation](./docs/evalperf.md), [Leaderboard](https://evalplus.github.io/evalperf.html)
76
+
77
+
78
+ ## 🔥 Quick Start
79
+
80
+ ### Code Correctness Evaluation: HumanEval(+) or MBPP(+)
81
+
82
+ ```bash
83
+ pip install --upgrade "evalplus[vllm] @ git+https://github.com/evalplus/evalplus"
84
+ # Or `pip install "evalplus[vllm]" --upgrade` for the latest stable release
85
+
86
+ evalplus.evaluate --model "ise-uiuc/Magicoder-S-DS-6.7B" \
87
+ --dataset [humaneval|mbpp] \
88
+ --backend vllm \
89
+ --greedy
90
+ ```
91
+
92
+ <details><summary>🛡️ Safe code execution within Docker <i>:: click to expand ::</i></summary>
93
+ <div>
94
+
95
+ ```bash
96
+ # Local generation
97
+ evalplus.codegen --model "ise-uiuc/Magicoder-S-DS-6.7B" \
98
+ --dataset humaneval \
99
+ --backend vllm \
100
+ --greedy
101
+
102
+ # Code execution within Docker
103
+ docker run --rm --pull=always -v $(pwd)/evalplus_results:/app ganler/evalplus:latest \
104
+ evalplus.evaluate --dataset humaneval \
105
+ --samples /app/humaneval/ise-uiuc--Magicoder-S-DS-6.7B_vllm_temp_0.0.jsonl
106
+ ```
107
+
108
+ </div>
109
+ </details>
110
+
111
+ ### Code Efficiency Evaluation: EvalPerf (*nix only)
112
+
113
+ ```bash
114
+ pip install --upgrade "evalplus[perf,vllm] @ git+https://github.com/evalplus/evalplus"
115
+ # Or `pip install "evalplus[perf,vllm]" --upgrade` for the latest stable release
116
+
117
+ sudo sh -c 'echo 0 > /proc/sys/kernel/perf_event_paranoid' # Enable perf
118
+ evalplus.evalperf --model "ise-uiuc/Magicoder-S-DS-6.7B" --backend vllm
119
+ ```
120
+
121
+ <details><summary>🛡️ Safe code execution within Docker <i>:: click to expand ::</i></summary>
122
+ <div>
123
+
124
+ ```bash
125
+ # Local generation
126
+ evalplus.codegen --model "ise-uiuc/Magicoder-S-DS-6.7B" \
127
+ --dataset evalperf \
128
+ --backend vllm \
129
+ --temperature 1.0 \
130
+ --n-samples 100
131
+
132
+ # Code execution within Docker
133
+ sudo sh -c 'echo 0 > /proc/sys/kernel/perf_event_paranoid' # Enable perf
134
+ docker run --cap-add PERFMON --rm --pull=always -v $(pwd)/evalplus_results:/app ganler/evalplus:latest \
135
+ evalplus.evalperf --samples /app/evalperf/ise-uiuc--Magicoder-S-DS-6.7B_vllm_temp_1.0.jsonl
136
+ ```
137
+
138
+ </div>
139
+ </details>
140
+
141
+ ## 🚀 LLM Backends
142
+
143
+ ### HuggingFace models
144
+
145
+ - `transformers` backend:
146
+
147
+ ```bash
148
+ evalplus.evaluate --model "ise-uiuc/Magicoder-S-DS-6.7B" \
149
+ --dataset [humaneval|mbpp] \
150
+ --backend hf \
151
+ --greedy
152
+ ```
153
+
154
+ > [!Note]
155
+ >
156
+ > EvalPlus uses different prompts for base and chat models.
157
+ > By default it is detected by `tokenizer.chat_template` when using `hf`/`vllm` as backend.
158
+ > For other backends, only chat mode is allowed.
159
+ >
160
+ > Therefore, if your base models come with a `tokenizer.chat_template`,
161
+ > please add `--force-base-prompt` to avoid being evaluated
162
+ > in a chat mode.
163
+
164
+ <details><summary>Enable Flash Attention 2 <i>:: click to expand ::</i></summary>
165
+ <div>
166
+
167
+ ```bash
168
+ # Install Flash Attention 2
169
+ pip install packaging ninja
170
+ pip install flash-attn --no-build-isolation
171
+ # Note: if you have installation problem, consider using pre-built
172
+ # wheels from https://github.com/Dao-AILab/flash-attention/releases
173
+
174
+ # Run evaluation with FA2
175
+ evalplus.evaluate --model "ise-uiuc/Magicoder-S-DS-6.7B" \
176
+ --dataset [humaneval|mbpp] \
177
+ --backend hf \
178
+ --attn-implementation [flash_attention_2|sdpa] \
179
+ --greedy
180
+ ```
181
+
182
+ </div>
183
+ </details>
184
+
185
+ - `vllm` backend:
186
+
187
+ ```bash
188
+ evalplus.evaluate --model "ise-uiuc/Magicoder-S-DS-6.7B" \
189
+ --dataset [humaneval|mbpp] \
190
+ --backend vllm \
191
+ --tp [TENSOR_PARALLEL_SIZE] \
192
+ --greedy
193
+ ```
194
+
195
+ - `openai` compatible servers (e.g., [vLLM](https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html)):
196
+
197
+ ```bash
198
+ # OpenAI models
199
+ export OPENAI_API_KEY="{KEY}" # https://platform.openai.com/settings/organization/api-keys
200
+ evalplus.evaluate --model "gpt-4o-2024-08-06" \
201
+ --dataset [humaneval|mbpp] \
202
+ --backend openai --greedy
203
+
204
+ # DeepSeek
205
+ export OPENAI_API_KEY="{KEY}" # https://platform.deepseek.com/api_keys
206
+ evalplus.evaluate --model "deepseek-chat" \
207
+ --dataset [humaneval|mbpp] \
208
+ --base-url https://api.deepseek.com \
209
+ --backend openai --greedy
210
+
211
+ # Grok
212
+ export OPENAI_API_KEY="{KEY}" # https://console.x.ai/
213
+ evalplus.evaluate --model "grok-beta" \
214
+ --dataset [humaneval|mbpp] \
215
+ --base-url https://api.x.ai/v1 \
216
+ --backend openai --greedy
217
+
218
+ # vLLM server
219
+ # First, launch a vLLM server: https://docs.vllm.ai/en/latest/serving/deploying_with_docker.html
220
+ evalplus.evaluate --model "ise-uiuc/Magicoder-S-DS-6.7B" \
221
+ --dataset [humaneval|mbpp] \
222
+ --base-url http://localhost:8000/v1 \
223
+ --backend openai --greedy
224
+
225
+ # GPTQModel
226
+ evalplus.evaluate --model "ModelCloud/Llama-3.2-1B-Instruct-gptqmodel-4bit-vortex-v1" \
227
+ --dataset [humaneval|mbpp] \
228
+ --backend gptqmodel --greedy
229
+ ```
230
+
231
+ ### OpenAI models
232
+
233
+ - Access OpenAI APIs from [OpenAI Console](https://platform.openai.com/)
234
+
235
+ ```bash
236
+ export OPENAI_API_KEY="[YOUR_API_KEY]"
237
+ evalplus.evaluate --model "gpt-4o" \
238
+ --dataset [humaneval|mbpp] \
239
+ --backend openai \
240
+ --greedy
241
+ ```
242
+
243
+ ### Anthropic models
244
+
245
+ - Access Anthropic APIs from [Anthropic Console](https://console.anthropic.com/)
246
+
247
+ ```bash
248
+ export ANTHROPIC_API_KEY="[YOUR_API_KEY]"
249
+ evalplus.evaluate --model "claude-3-haiku-20240307" \
250
+ --dataset [humaneval|mbpp] \
251
+ --backend anthropic \
252
+ --greedy
253
+ ```
254
+
255
+ ### Google Gemini models
256
+
257
+ - Access Gemini APIs from [Google AI Studio](https://aistudio.google.com/)
258
+
259
+ ```bash
260
+ export GOOGLE_API_KEY="[YOUR_API_KEY]"
261
+ evalplus.evaluate --model "gemini-1.5-pro" \
262
+ --dataset [humaneval|mbpp] \
263
+ --backend google \
264
+ --greedy
265
+ ```
266
+
267
+ ### Amazon Bedrock models
268
+
269
+ - [Amazon Bedrock](https://aws.amazon.com/bedrock/)
270
+
271
+ ```bash
272
+ export BEDROCK_ROLE_ARN="[BEDROCK_ROLE_ARN]"
273
+ evalplus.evaluate --model "anthropic.claude-3-5-sonnet-20241022-v2:0" \
274
+ --dataset [humaneval|mbpp] \
275
+ --backend bedrock \
276
+ --greedy
277
+ ```
278
+
279
+ You can checkout the generation and results at `evalplus_results/[humaneval|mbpp]/`
280
+
281
+ <details><summary>⏬ Using EvalPlus as a local repo? <i>:: click to expand ::</i></summary>
282
+ <div>
283
+
284
+ ```bash
285
+ git clone https://github.com/evalplus/evalplus.git
286
+ cd evalplus
287
+ export PYTHONPATH=$PYTHONPATH:$(pwd)
288
+ pip install -r requirements.txt
289
+ ```
290
+
291
+ </div>
292
+ </details>
293
+
294
+ ## 📚 Documents
295
+
296
+ To learn more about how to use EvalPlus, please refer to:
297
+
298
+ - [EvalPlus Commands](./docs/cli.md)
299
+ - [EvalPerf](./docs/evalperf.md)
300
+ - [Program Execution](./docs/execution.md)
301
+
302
+ ## 📜 Citation
303
+
304
+ ```bibtex
305
+ @inproceedings{evalplus,
306
+ title = {Is Your Code Generated by Chat{GPT} Really Correct? Rigorous Evaluation of Large Language Models for Code Generation},
307
+ author = {Liu, Jiawei and Xia, Chunqiu Steven and Wang, Yuyao and Zhang, Lingming},
308
+ booktitle = {Thirty-seventh Conference on Neural Information Processing Systems},
309
+ year = {2023},
310
+ url = {https://openreview.net/forum?id=1qvx610Cu7},
311
+ }
312
+
313
+ @inproceedings{evalperf,
314
+ title = {Evaluating Language Models for Efficient Code Generation},
315
+ author = {Liu, Jiawei and Xie, Songrun and Wang, Junhao and Wei, Yuxiang and Ding, Yifeng and Zhang, Lingming},
316
+ booktitle = {First Conference on Language Modeling},
317
+ year = {2024},
318
+ url = {https://openreview.net/forum?id=IBCBMeAhmC},
319
+ }
320
+ ```
321
+
322
+ ## 🙏 Acknowledgement
323
+
324
+ - [HumanEval](https://github.com/openai/human-eval)
325
+ - [MBPP](https://github.com/google-research/google-research/tree/master/mbpp)
evalplus/build/lib/evalplus/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ try:
2
+ from evalplus._version import __version__, __version_tuple__
3
+ except ImportError:
4
+ __version__ = "local-dev"
evalplus/build/lib/evalplus/_version.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # file generated by setuptools_scm
2
+ # don't change, don't track in version control
3
+ TYPE_CHECKING = False
4
+ if TYPE_CHECKING:
5
+ from typing import Tuple, Union
6
+ VERSION_TUPLE = Tuple[Union[int, str], ...]
7
+ else:
8
+ VERSION_TUPLE = object
9
+
10
+ version: str
11
+ __version__: str
12
+ __version_tuple__: VERSION_TUPLE
13
+ version_tuple: VERSION_TUPLE
14
+
15
+ __version__ = version = '0.4.0.dev33'
16
+ __version_tuple__ = version_tuple = (0, 4, 0, 'dev33')
evalplus/build/lib/evalplus/codegen.py ADDED
@@ -0,0 +1,272 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ from typing import Dict, List, Optional
4
+
5
+ from evalplus.data import get_evalperf_data, get_human_eval_plus, get_mbpp_plus
6
+ from evalplus.provider import DecoderBase, make_model
7
+ from evalplus.sanitize import sanitize
8
+ from evalplus.utils import progress
9
+
10
+
11
+ def codegen(
12
+ target_path: str,
13
+ model: DecoderBase,
14
+ dataset: Dict,
15
+ greedy=False,
16
+ n_samples=1,
17
+ id_range=None,
18
+ resume=True,
19
+ ):
20
+ task2nexist = {}
21
+ if resume and target_path.endswith(".jsonl") and os.path.isfile(target_path):
22
+ with open(target_path, "r") as f:
23
+ for line in f:
24
+ if not line.strip():
25
+ continue
26
+ task_id = json.loads(line)["task_id"]
27
+ task2nexist[task_id] = task2nexist.get(task_id, 0) + 1
28
+
29
+ if target_path.endswith(".jsonl"):
30
+ raw_target_path = target_path.replace(".jsonl", ".raw.jsonl")
31
+ else:
32
+ raw_target_path = target_path + ".raw"
33
+ os.makedirs(target_path, exist_ok=True)
34
+
35
+ print(f"Sanitized code outputs will be saved to {target_path}")
36
+ print(f"Raw outputs will be saved to {raw_target_path}")
37
+
38
+ backend_type: str = type(model).__name__
39
+ with progress(backend_type) as p:
40
+ for task_id, task in p.track(dataset.items()):
41
+ if id_range is not None:
42
+ id_num = int(task_id.split("/")[1])
43
+ low, high = id_range
44
+ if id_num < low or id_num >= high:
45
+ p.console.print(f"Skipping {task_id} as it is not in {id_range}")
46
+ continue
47
+
48
+ if not target_path.endswith(".jsonl"):
49
+ p_name = task_id.replace("/", "_")
50
+ os.makedirs(os.path.join(target_path, p_name), exist_ok=True)
51
+ task2nexist[task_id] = len(
52
+ [
53
+ f
54
+ for f in os.listdir(os.path.join(target_path, p_name))
55
+ if f.endswith(".py")
56
+ ]
57
+ )
58
+
59
+ n_more_samples = n_samples
60
+ log = f"Codegen: {task_id} @ {model}"
61
+ if resume and task2nexist.get(task_id, 0) > 0:
62
+ log += f" (resuming from {task2nexist[task_id]})"
63
+ n_more_samples -= task2nexist[task_id]
64
+
65
+ p.console.print(log)
66
+
67
+ sidx = n_samples - n_more_samples
68
+ while sidx < n_samples:
69
+ prompt = task["prompt"].strip() + "\n"
70
+ outputs = model.codegen(
71
+ prompt,
72
+ do_sample=not greedy,
73
+ num_samples=n_samples - sidx,
74
+ )
75
+ assert outputs, "No outputs from model!"
76
+ for impl in outputs:
77
+ solution = prompt + impl if model.is_direct_completion() else impl
78
+ sanitized_solution = sanitize(
79
+ solution, entrypoint=task["entry_point"]
80
+ )
81
+ if target_path.endswith(".jsonl"):
82
+ # Writing the sanitized version
83
+ with open(target_path, "a") as f:
84
+ f.write(
85
+ json.dumps(
86
+ {"task_id": task_id, "solution": sanitized_solution}
87
+ )
88
+ + "\n"
89
+ )
90
+
91
+ # Writing the raw version
92
+ with open(raw_target_path, "a") as f:
93
+ f.write(
94
+ json.dumps({"task_id": task_id, "solution": solution})
95
+ + "\n"
96
+ )
97
+ else:
98
+ # Writing the sanitized version
99
+ with open(
100
+ os.path.join(target_path, p_name, f"{sidx}.py"),
101
+ "w",
102
+ encoding="utf-8",
103
+ ) as f:
104
+ f.write(sanitized_solution)
105
+
106
+ # Writing the raw version
107
+ with open(
108
+ os.path.join(raw_target_path, p_name, f"{sidx}.py"),
109
+ "w",
110
+ encoding="utf-8",
111
+ ) as f:
112
+ f.write(solution)
113
+ sidx += 1
114
+
115
+
116
+ def run_codegen(
117
+ model: str,
118
+ dataset: str,
119
+ root: str = "evalplus_results",
120
+ bs: Optional[int] = None,
121
+ n_samples: int = 1,
122
+ temperature: float = 0.0,
123
+ resume: bool = True,
124
+ greedy: bool = False,
125
+ id_range: List = None,
126
+ version: str = "default",
127
+ backend: str = "vllm",
128
+ force_base_prompt: bool = False,
129
+ base_url: str = None,
130
+ tp: int = 1,
131
+ evalperf_type: str = None, # For EvalPerf
132
+ jsonl_fmt: bool = True,
133
+ attn_implementation: str = "eager",
134
+ device_map: Optional[str] = None,
135
+ trust_remote_code: bool = False,
136
+ enable_prefix_caching: bool = False,
137
+ enable_chunked_prefill: bool = False,
138
+ dtype: str = "bfloat16",
139
+ gptqmodel_backend: str = "auto", # For GPTQModel
140
+ gguf_file: Optional[str] = None
141
+ ):
142
+ assert dataset in ["humaneval", "mbpp", "evalperf"], f"Invalid dataset {dataset}"
143
+ assert evalperf_type is None or evalperf_type in [
144
+ "instruct",
145
+ "perf-instruct",
146
+ "perf-CoT",
147
+ ]
148
+
149
+ # Make dir for codes generated by each model
150
+ identifier = model.strip("./").replace("/", "--") + f"_{backend}_temp_{temperature}"
151
+ if evalperf_type:
152
+ identifier += f"-{evalperf_type}"
153
+
154
+ target_path = os.path.join(root, dataset, identifier)
155
+ if jsonl_fmt:
156
+ target_path += ".jsonl"
157
+ else:
158
+ os.makedirs(target_path, exist_ok=True)
159
+
160
+ if dataset == "humaneval":
161
+ dataset_dict = get_human_eval_plus(version=version)
162
+ elif dataset == "mbpp":
163
+ dataset_dict = get_mbpp_plus(version=version)
164
+ elif dataset == "evalperf":
165
+ original_dataset = {**get_human_eval_plus(), **get_mbpp_plus()}
166
+ dataset_dict = {k: original_dataset[k] for k in get_evalperf_data()}
167
+ assert id_range is None, "id_range not supported for evalperf"
168
+ else:
169
+ raise ValueError(f"Invalid dataset {dataset}")
170
+
171
+ all_tasks_complete = False
172
+ if jsonl_fmt and os.path.isfile(target_path):
173
+ task_counts = {}
174
+ with open(target_path, "r") as f:
175
+ for line in f:
176
+ if not line.strip():
177
+ continue
178
+ data = json.loads(line)
179
+ task_id = data["task_id"]
180
+ task_counts[task_id] = task_counts.get(task_id, 0) + 1
181
+
182
+ all_tasks_complete = all(
183
+ task_counts.get(task_id, 0) >= n_samples
184
+ for task_id in dataset_dict.keys()
185
+ )
186
+
187
+ if all_tasks_complete:
188
+ print("All samples are already cached. Skipping codegen.")
189
+ return target_path
190
+
191
+ if greedy and (temperature != 0 or bs != 1 or n_samples != 1):
192
+ temperature = 0.0
193
+ bs = 1
194
+ n_samples = 1
195
+ print("Greedy decoding ON (--greedy): setting bs=1, n_samples=1, temperature=0")
196
+
197
+ if id_range is not None:
198
+ assert len(id_range) == 2, "id_range must be a list of length 2"
199
+ assert id_range[0] < id_range[1], "id_range must be increasing"
200
+ id_range = tuple(id_range)
201
+
202
+ if bs is None:
203
+ bs = min(n_samples, 32)
204
+ print(f"Setting batch size to {bs}")
205
+
206
+ # Make project dir
207
+ os.makedirs(root, exist_ok=True)
208
+ # Make dataset dir
209
+ os.makedirs(os.path.join(root, dataset), exist_ok=True)
210
+
211
+ # Model instructions
212
+ instruction_prefix = "Please provide a self-contained Python script that solves the following problem in a markdown code block:"
213
+ response_prefix = "Below is a Python script with a self-contained function that solves the problem and passes corresponding tests:"
214
+
215
+ if evalperf_type == "perf-instruct":
216
+ instruction_prefix = "Please provide an efficient and self-contained Python script that solves the following problem in a markdown code block:"
217
+ response_prefix = "Below is a Python script with a self-contained function that efficiently solves the problem and passes corresponding tests:"
218
+ elif evalperf_type == "perf-CoT":
219
+ instruction_prefix = "Think step by step: please provide an efficient and self-contained Python script that solves the following problem in a markdown code block:"
220
+ response_prefix = "Below is a Python script with a self-contained function that efficiently solves the problem and passes corresponding tests:"
221
+ elif evalperf_type is not None and evalperf_type != "instruct":
222
+ raise ValueError(f"Invalid evalperf_type: {evalperf_type}")
223
+
224
+ # Model creation
225
+ model_runner = make_model(
226
+ model=model,
227
+ backend=backend,
228
+ batch_size=bs,
229
+ temperature=temperature,
230
+ force_base_prompt=force_base_prompt,
231
+ dataset=dataset,
232
+ base_url=base_url,
233
+ tp=tp,
234
+ instruction_prefix=instruction_prefix,
235
+ response_prefix=response_prefix,
236
+ device_map=device_map,
237
+ attn_implementation=attn_implementation,
238
+ trust_remote_code=trust_remote_code,
239
+ enable_prefix_caching=enable_prefix_caching,
240
+ enable_chunked_prefill=enable_chunked_prefill,
241
+ dtype=dtype,
242
+ gptqmodel_backend=gptqmodel_backend,
243
+ gguf_file=gguf_file,
244
+ )
245
+
246
+ codegen(
247
+ target_path=target_path,
248
+ dataset=dataset_dict,
249
+ greedy=greedy,
250
+ model=model_runner,
251
+ n_samples=n_samples,
252
+ resume=resume,
253
+ id_range=id_range,
254
+ )
255
+
256
+ # force shutdown the model runner
257
+ del model_runner
258
+ import gc
259
+
260
+ gc.collect()
261
+
262
+ return target_path
263
+
264
+
265
+ def main():
266
+ from fire import Fire
267
+
268
+ Fire(run_codegen)
269
+
270
+
271
+ if __name__ == "__main__":
272
+ main()
evalplus/build/lib/evalplus/config.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## EvalPlus
2
+ DEFAULT_GT_TIME_LIMIT_FACTOR = 4.0
3
+ DEFAULT_MIN_TIME_LIMIT = 1.0
4
+
5
+ ## EvalPerf
6
+
7
+ ### General
8
+ PERF_PROFILE_ROUNDS = 1
9
+ PERF_RAM_GB_PER_PROC = 12
10
+
11
+ ### Evaluation Phase
12
+ PERF_EVAL_TIMEOUT_SECOND = 45
13
+
14
+ ### Curation Phase
15
+ PERF_CURATE_TIMEOUT_SECOND = 20
16
+ PREF_CURATE_MIN_INSTRUCTION = 10000
evalplus/build/lib/evalplus/data/__init__.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+
3
+ from datasets import load_dataset
4
+
5
+ from evalplus.data.humaneval import get_human_eval_plus, get_human_eval_plus_hash
6
+ from evalplus.data.mbpp import get_mbpp_plus, get_mbpp_plus_hash
7
+ from evalplus.data.utils import load_solutions, write_directory, write_jsonl
8
+
9
+
10
+ def get_evalperf_data():
11
+ dataset = load_dataset("evalplus/evalperf", split="test").to_list()
12
+ for d in dataset:
13
+ d["pe_input"] = json.loads(d["pe_input"])
14
+ return {task["task_id"]: task for task in dataset}
evalplus/build/lib/evalplus/data/humaneval.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import hashlib
2
+ import json
3
+ import os
4
+ from typing import Dict
5
+
6
+ from evalplus.data.utils import (
7
+ CACHE_DIR,
8
+ completeness_check,
9
+ get_dataset_metadata,
10
+ make_cache,
11
+ stream_jsonl,
12
+ )
13
+
14
+ HUMANEVAL_PLUS_VERSION = "v0.1.10"
15
+ HUMANEVAL_OVERRIDE_PATH = os.environ.get("HUMANEVAL_OVERRIDE_PATH", None)
16
+
17
+
18
+ def _ready_human_eval_plus_path(mini=False, noextreme=False, version="default") -> str:
19
+ if HUMANEVAL_OVERRIDE_PATH:
20
+ return HUMANEVAL_OVERRIDE_PATH
21
+
22
+ version = HUMANEVAL_PLUS_VERSION if version == "default" else version
23
+ url, plus_path = get_dataset_metadata(
24
+ "HumanEvalPlus", HUMANEVAL_PLUS_VERSION, mini, noextreme
25
+ )
26
+ make_cache(url, plus_path)
27
+
28
+ return plus_path
29
+
30
+
31
+ def get_human_eval_plus_hash(mini=False, noextreme=False, version="default") -> str:
32
+ """Get the hash of HumanEvalPlus.
33
+ Returns:
34
+ str: The hash of HumanEvalPlus
35
+ """
36
+ plus_path = _ready_human_eval_plus_path(mini, noextreme, version="default")
37
+ with open(plus_path, "rb") as f:
38
+ plus = f.read()
39
+ return hashlib.md5(plus).hexdigest()
40
+
41
+
42
+ def get_human_eval_plus(
43
+ err_incomplete=True, mini=False, noextreme=False, version="default"
44
+ ) -> Dict[str, Dict]:
45
+ """Get HumanEvalPlus locally.
46
+ Args:
47
+ err_incomplete (bool, optional): Whether to raise error if HumanEvalPlus is not complete. Defaults to True.
48
+ mini (bool, optional): Whether to use the mini version of HumanEvalPlus. Defaults to False.
49
+ Returns:
50
+ List[Dict[str, str]]: List of dicts with keys "task_id", "prompt", "contract", "canonical_solution", "base_input"
51
+ Notes:
52
+ "task_id" is the identifier string for the task
53
+ "prompt" is the function signature with docstring
54
+ "contract" is the assertions for the function's input (validity)
55
+ "canonical_solution" is the ground-truth implementation for diff-testing
56
+ "base_input" is the test inputs from original HumanEval
57
+ "plus_input" is the test inputs brought by EvalPlus
58
+ "atol" is the absolute tolerance for diff-testing
59
+ """
60
+ plus_path = _ready_human_eval_plus_path(
61
+ mini=mini, noextreme=noextreme, version=version
62
+ )
63
+ plus = {task["task_id"]: task for task in stream_jsonl(plus_path)}
64
+ if err_incomplete:
65
+ completeness_check("HumanEval+", plus)
66
+ return plus
67
+
68
+
69
+ def get_human_eval() -> Dict[str, Dict]:
70
+ """Get HumanEval from OpenAI's github repo and return as a list of parsed dicts.
71
+
72
+ Returns:
73
+ List[Dict[str, str]]: List of dicts with keys "prompt", "test", "entry_point"
74
+
75
+ Notes:
76
+ "task_id" is the identifier string for the task.
77
+ "prompt" is the prompt to be used for the task (function signature with docstrings).
78
+ "test" is test-cases wrapped in a `check` function.
79
+ "entry_point" is the name of the function.
80
+ """
81
+ # Check if human eval file exists in CACHE_DIR
82
+ human_eval_path = os.path.join(CACHE_DIR, "HumanEval.jsonl")
83
+ make_cache(
84
+ "https://github.com/openai/human-eval/raw/master/data/HumanEval.jsonl.gz",
85
+ human_eval_path,
86
+ )
87
+
88
+ human_eval = open(human_eval_path, "r").read().split("\n")
89
+ human_eval = [json.loads(line) for line in human_eval if line]
90
+
91
+ # Handle 115_max_fill.py to make its docstring well-formed
92
+ human_eval[115]["prompt"] = "import math\n" + human_eval[115]["prompt"].replace(
93
+ "import math\n", ""
94
+ )
95
+
96
+ return {task["task_id"]: task for task in human_eval}
evalplus/build/lib/evalplus/data/mbpp.py ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import hashlib
2
+ import json
3
+ import os
4
+ from typing import Dict
5
+
6
+ import wget
7
+
8
+ from evalplus.data.utils import (
9
+ CACHE_DIR,
10
+ completeness_check,
11
+ get_dataset_metadata,
12
+ make_cache,
13
+ stream_jsonl,
14
+ )
15
+
16
+ MBPP_PLUS_VERSION = "v0.2.0"
17
+ MBPP_OVERRIDE_PATH = os.environ.get("MBPP_OVERRIDE_PATH", None)
18
+
19
+
20
+ def _ready_mbpp_plus_path(mini=False, noextreme=False, version="default") -> str:
21
+ assert mini is False, "Mini version of MBPP+ is not available yet."
22
+
23
+ if MBPP_OVERRIDE_PATH:
24
+ return MBPP_OVERRIDE_PATH
25
+
26
+ version = MBPP_PLUS_VERSION if version == "default" else version
27
+
28
+ url, plus_path = get_dataset_metadata("MbppPlus", version, mini, noextreme)
29
+ make_cache(url, plus_path)
30
+
31
+ return plus_path
32
+
33
+
34
+ def mbpp_serialize_inputs(task_id: str, inputs: list) -> list:
35
+ task_id = int(task_id.split("/")[-1])
36
+
37
+ if task_id == 115:
38
+ return [[[list(item) for item in inp[0]]] for inp in inputs]
39
+ elif task_id == 124:
40
+ return [(str(inp[0]), str(inp[1])) for inp in inputs]
41
+ elif task_id == 252:
42
+ return [[str(inp[0])] for inp in inputs]
43
+
44
+ return inputs
45
+
46
+
47
+ def mbpp_deserialize_inputs(task_id: str, inputs: list) -> list:
48
+ task_id = int(task_id.split("/")[-1])
49
+ if task_id in [
50
+ 2,
51
+ 116,
52
+ 132,
53
+ 143,
54
+ 222,
55
+ 261,
56
+ 273,
57
+ 394,
58
+ 399,
59
+ 421,
60
+ 424,
61
+ 429,
62
+ 470,
63
+ 560,
64
+ 579,
65
+ 596,
66
+ 616,
67
+ 630,
68
+ 726,
69
+ 740,
70
+ 744,
71
+ 809,
72
+ ]:
73
+ modified_inputs = [[tuple(lst) for lst in inp] for inp in inputs]
74
+
75
+ elif task_id in [
76
+ 63,
77
+ 64,
78
+ 70,
79
+ 94,
80
+ 120,
81
+ 237,
82
+ 272,
83
+ 299,
84
+ 400,
85
+ 409,
86
+ 417,
87
+ 438,
88
+ 473,
89
+ 614,
90
+ 780,
91
+ ]:
92
+ modified_inputs = [
93
+ [[tuple(lst) for lst in lst_lst] for lst_lst in inp] for inp in inputs
94
+ ]
95
+
96
+ elif task_id in [75, 413, 444, 753]:
97
+ modified_inputs = [
98
+ [[tuple(lst) for lst in inp[0]]] + [inp[1]] for inp in inputs
99
+ ]
100
+
101
+ elif task_id == 106 or task_id == 750:
102
+ modified_inputs = [[inp[0]] + [tuple(inp[1])] for inp in inputs]
103
+
104
+ elif task_id == 115:
105
+ modified_inputs = [
106
+ [
107
+ [
108
+ set(item) if isinstance(item, list) and len(item) else {}
109
+ for item in inp[0]
110
+ ]
111
+ ]
112
+ for inp in inputs
113
+ ]
114
+
115
+ elif task_id == 124:
116
+ modified_inputs = [(float(inp[0]), complex(inp[1])) for inp in inputs]
117
+
118
+ elif task_id in [250, 405, 446, 617, 720, 763, 808]:
119
+ modified_inputs = [[tuple(inp[0])] + [inp[1]] for inp in inputs]
120
+
121
+ elif task_id in [259, 401, 445]:
122
+ modified_inputs = [
123
+ [[tuple(lst) for lst in lst_lst] for lst_lst in inp] for inp in inputs
124
+ ]
125
+ modified_inputs = [[tuple(lst) for lst in inp] for inp in modified_inputs]
126
+
127
+ elif task_id == 278:
128
+ modified_inputs = [
129
+ [[tuple(item) if isinstance(item, list) else item for item in inp[0]]]
130
+ for inp in inputs
131
+ ]
132
+ modified_inputs = [[tuple(lst) for lst in inp] for inp in modified_inputs]
133
+
134
+ elif task_id == 307:
135
+ modified_inputs = [[tuple(inp[0])] + [inp[1], inp[2]] for inp in inputs]
136
+
137
+ elif task_id == 722:
138
+ modified_inputs = [
139
+ [{key: tuple(value) for key, value in inp[0].items()}] + inp[1:]
140
+ for inp in inputs
141
+ ]
142
+
143
+ elif task_id == 252:
144
+ modified_inputs = [[complex(inp[0])] for inp in inputs]
145
+
146
+ elif task_id in [580, 615, 791]:
147
+
148
+ def turn_all_list_into_tuple(inp):
149
+ if isinstance(inp, list):
150
+ return tuple([turn_all_list_into_tuple(item) for item in inp])
151
+ return inp
152
+
153
+ modified_inputs = [turn_all_list_into_tuple(inp) for inp in inputs]
154
+
155
+ else:
156
+ modified_inputs = inputs
157
+
158
+ return modified_inputs
159
+
160
+
161
+ def get_mbpp() -> Dict[str, Dict]:
162
+ """Get sanitized MBPP from Google's Github repo."""
163
+ mbpp_path = os.path.join(CACHE_DIR, "sanitized-mbpp.json")
164
+
165
+ if not os.path.exists(mbpp_path):
166
+ os.makedirs(CACHE_DIR, exist_ok=True)
167
+
168
+ # Install MBPP-sanitized from scratch
169
+ print("Downloading original MBPP dataset...")
170
+ wget.download(
171
+ "https://github.com/google-research/google-research/raw/master/mbpp/sanitized-mbpp.json",
172
+ mbpp_path,
173
+ )
174
+
175
+ with open(mbpp_path, "r") as f:
176
+ mbpp = json.load(f)
177
+
178
+ return {str(task["task_id"]): task for task in mbpp}
179
+
180
+
181
+ def get_mbpp_plus(
182
+ err_incomplete=True, mini=False, noextreme=False, version="default"
183
+ ) -> Dict[str, Dict]:
184
+ plus_path = _ready_mbpp_plus_path(mini=mini, noextreme=noextreme, version=version)
185
+ plus = {task["task_id"]: task for task in stream_jsonl(plus_path)}
186
+ for task_id, task in plus.items():
187
+ task["base_input"] = mbpp_deserialize_inputs(task_id, task["base_input"])
188
+ task["plus_input"] = mbpp_deserialize_inputs(task_id, task["plus_input"])
189
+
190
+ if err_incomplete:
191
+ completeness_check("MBPP+", plus)
192
+ return plus
193
+
194
+
195
+ def get_mbpp_plus_hash(mini=False, noextreme=False, version="default") -> str:
196
+ """Get the hash of MbppPlus.
197
+ Returns:
198
+ str: The hash of MbppPlus
199
+ """
200
+ plus_path = _ready_mbpp_plus_path(mini=mini, noextreme=noextreme, version=version)
201
+ with open(plus_path, "rb") as f:
202
+ plus = f.read()
203
+ return hashlib.md5(plus).hexdigest()
evalplus/build/lib/evalplus/data/utils.py ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gzip
2
+ import json
3
+ import os
4
+ from os import PathLike
5
+ from typing import Dict, Iterable
6
+
7
+ import tempdir
8
+ import wget
9
+ from appdirs import user_cache_dir
10
+
11
+ CACHE_DIR = user_cache_dir("evalplus")
12
+
13
+
14
+ def get_dataset_metadata(name: str, version: str, mini: bool, noextreme: bool = False):
15
+ assert name in ["HumanEvalPlus", "MbppPlus"], f"Unknown/unsupported dataset: {name}"
16
+ extra = ""
17
+ assert not (mini and noextreme), "Cannot have both mini and noextreme"
18
+ if mini:
19
+ extra = "-Mini"
20
+ if noextreme:
21
+ extra = "-NoExtreme"
22
+ url = f"https://github.com/evalplus/{name.lower()}_release/releases/download/{version}/{name}{extra}.jsonl.gz"
23
+ cache_path = os.path.join(CACHE_DIR, f"{name}{extra}-{version}.jsonl")
24
+ return url, cache_path
25
+
26
+
27
+ def make_cache(gzip_url, cache_path):
28
+ # Check if human eval file exists in CACHE_DIR
29
+ if not os.path.exists(cache_path):
30
+ # Install HumanEval dataset and parse as jsonl
31
+ print(f"Downloading dataset from {gzip_url}")
32
+ with tempdir.TempDir() as tmpdir:
33
+ plus_gz_path = os.path.join(tmpdir, f"data.jsonl.gz")
34
+ wget.download(gzip_url, plus_gz_path)
35
+
36
+ with gzip.open(plus_gz_path, "rb") as f:
37
+ plus = f.read().decode("utf-8")
38
+
39
+ # create CACHE_DIR if not exists
40
+ if not os.path.exists(CACHE_DIR):
41
+ os.makedirs(CACHE_DIR)
42
+
43
+ # Write the original human eval file to CACHE_DIR
44
+ with open(cache_path, "w") as f:
45
+ f.write(plus)
46
+
47
+
48
+ def write_jsonl(
49
+ filename: str, data: Iterable[Dict], append: bool = False, drop_builtin: bool = True
50
+ ):
51
+ """
52
+ Writes an iterable of dictionaries to jsonl
53
+ """
54
+ if append:
55
+ mode = "ab"
56
+ else:
57
+ mode = "wb"
58
+ filename = os.path.expanduser(filename)
59
+ if filename.endswith(".gz"):
60
+ with open(filename, mode) as fp:
61
+ with gzip.GzipFile(fileobj=fp, mode="wb") as gzfp:
62
+ for x in data:
63
+ if drop_builtin:
64
+ x = {k: v for k, v in x.items() if not k.startswith("_")}
65
+ gzfp.write((json.dumps(x) + "\n").encode("utf-8"))
66
+ else:
67
+ with open(filename, mode) as fp:
68
+ for x in data:
69
+ if drop_builtin:
70
+ x = {k: v for k, v in x.items() if not k.startswith("_")}
71
+ fp.write((json.dumps(x) + "\n").encode("utf-8"))
72
+
73
+
74
+ def stream_jsonl(filename: str) -> Iterable[Dict]:
75
+ """
76
+ Parses each jsonl line and yields it as a dictionary
77
+ """
78
+ if filename.endswith(".gz"):
79
+ with open(filename, "rb") as gzfp:
80
+ with gzip.open(gzfp, "rt") as fp:
81
+ for line in fp:
82
+ if any(not x.isspace() for x in line):
83
+ yield json.loads(line)
84
+ else:
85
+ with open(filename, "r") as fp:
86
+ for line in fp:
87
+ if any(not x.isspace() for x in line):
88
+ yield json.loads(line)
89
+
90
+
91
+ def load_solutions(sample_path: PathLike) -> Iterable[Dict]:
92
+ """We accept two formats of inputs.
93
+ + `sample.jsonl` which is the format from HumanEval, i.e., {task_id, completion or solution}.
94
+ + A folder which contains sub-folders named after the task_id. Each sub-folder
95
+ contains samples named in `[?].py` where `?` is the solution id starting with 0.
96
+ Different from `sample.jsonl`, the solutions must be complete (with prompt prefix).
97
+ """
98
+
99
+ # if it is a file
100
+ if os.path.isfile(sample_path):
101
+ for i, sample in enumerate(stream_jsonl(sample_path)):
102
+ assert (
103
+ "completion" in sample or "solution" in sample
104
+ ), "No completion or solution found in sample!"
105
+ assert "solution" not in sample or isinstance(
106
+ sample["solution"], str
107
+ ), "Solution must be a string! If you have multiple solutions, please repeat the task_id."
108
+ assert "completion" not in sample or isinstance(
109
+ sample["completion"], str
110
+ ), "Completion must be a string! If you have multiple solutions, please repeat the task_id."
111
+
112
+ sample["_identifier"] = (
113
+ sample["task_id"] + f" (line {i+1} in {sample_path})"
114
+ )
115
+ yield sample
116
+ else:
117
+ # if it is a folder
118
+ for task_id in os.listdir(sample_path):
119
+ task_path = os.path.join(sample_path, task_id)
120
+ if not os.path.isdir(task_path):
121
+ continue
122
+
123
+ for solution_id in os.listdir(task_path):
124
+ solution_path = os.path.join(task_path, solution_id)
125
+ if os.path.isfile(solution_path) and solution_path.endswith(".py"):
126
+ with open(solution_path, "r") as f:
127
+ completion = f.read()
128
+ yield {
129
+ "_identifier": solution_path,
130
+ "_path": solution_path,
131
+ "task_id": task_id.replace("_", "/"),
132
+ "solution": completion,
133
+ }
134
+
135
+
136
+ def write_directory(directory: PathLike, data: Iterable[Dict]):
137
+ os.makedirs(directory, exist_ok=True)
138
+ counters = {}
139
+ for sample in data:
140
+ assert "solution" in sample, "Samples must come with `solution` field!"
141
+ task_id = sample["task_id"].replace("/", "_")
142
+ task_dir = os.path.join(directory, task_id)
143
+ os.makedirs(task_dir, exist_ok=True)
144
+ if task_id not in counters:
145
+ counters[task_id] = 0
146
+ sample_id = counters[task_id]
147
+ with open(os.path.join(task_dir, f"{sample_id}.py"), "w") as f:
148
+ f.write(sample["solution"])
149
+ counters[task_id] += 1
150
+
151
+
152
+ def completeness_check(name, plus):
153
+ for task_id, task in plus.items():
154
+ for key in [
155
+ "prompt",
156
+ "contract",
157
+ "canonical_solution",
158
+ "base_input",
159
+ "plus_input",
160
+ "atol",
161
+ ]:
162
+ assert key in task, f"{key} not found in {name} #{task_id}!"
163
+
164
+
165
+ def to_raw(string):
166
+ return string.encode("unicode-escape").decode().replace("\\\\", "\\")
evalplus/build/lib/evalplus/eval/__init__.py ADDED
@@ -0,0 +1,316 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # The MIT License
2
+ #
3
+ # Copyright (c) OpenAI (https://openai.com)
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in
13
+ # all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ # THE SOFTWARE.
22
+
23
+ import itertools
24
+ import multiprocessing
25
+ import os
26
+ import time
27
+ from multiprocessing import Array, Value
28
+ from typing import Any, Dict, List, Optional, Tuple, Union
29
+
30
+ import numpy as np
31
+ import psutil
32
+
33
+ from evalplus.config import *
34
+ from evalplus.eval._special_oracle import (
35
+ MBPP_OUTPUT_NOT_NONE_TASKS,
36
+ MBPP_OUTPUT_SET_EQ_TASKS,
37
+ _digit_distance_nums,
38
+ _poly,
39
+ _surface_Area,
40
+ )
41
+ from evalplus.eval.utils import (
42
+ create_tempdir,
43
+ reliability_guard,
44
+ swallow_io,
45
+ time_limit,
46
+ )
47
+
48
+
49
+ def compatible_eval_result(results: Dict) -> Dict:
50
+ # compatibility
51
+ for task_results in results["eval"].values():
52
+ # update the "files" field to "nfiles"
53
+ if "files" in task_results and "nfiles" not in task_results:
54
+ task_results["nfiles"] = len(task_results.pop("files"))
55
+ return results
56
+
57
+
58
+ # unbiased estimator from https://github.com/openai/human-eval
59
+ def estimate_pass_at_k(
60
+ num_samples: Union[int, List[int], np.ndarray],
61
+ num_correct: Union[List[int], np.ndarray],
62
+ k: int,
63
+ ) -> np.ndarray:
64
+ """
65
+ Estimates pass@k of each problem and returns them in an array.
66
+ """
67
+
68
+ def estimator(n: int, c: int, k: int) -> float:
69
+ """
70
+ Calculates 1 - comb(n - c, k) / comb(n, k).
71
+ """
72
+ if n - c < k:
73
+ return 1.0
74
+ return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))
75
+
76
+ if isinstance(num_samples, int):
77
+ num_samples_it = itertools.repeat(num_samples, len(num_correct))
78
+ else:
79
+ assert len(num_samples) == len(num_correct)
80
+ num_samples_it = iter(num_samples)
81
+
82
+ return np.array(
83
+ [estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)]
84
+ )
85
+
86
+
87
+ PASS = "pass"
88
+ FAIL = "fail"
89
+ TIMEOUT = "timeout"
90
+
91
+ _SUCCESS = 0
92
+ _FAILED = 1
93
+ _TIMEOUT = 2
94
+ _UNKNOWN = 3
95
+
96
+ _mapping = {_SUCCESS: PASS, _FAILED: FAIL, _TIMEOUT: TIMEOUT, _UNKNOWN: None}
97
+
98
+
99
+ def query_maximum_memory_bytes() -> Optional[int]:
100
+ # Disable functionalities that can make destructive changes to the test.
101
+ # allow only 4GB memory usage
102
+ maximum_memory_bytes = os.getenv(
103
+ "EVALPLUS_MAX_MEMORY_BYTES", 4 * 1024 * 1024 * 1024
104
+ )
105
+ maximum_memory_bytes = min(int(maximum_memory_bytes), psutil.virtual_memory().total)
106
+ if maximum_memory_bytes == -1:
107
+ return None
108
+ return maximum_memory_bytes
109
+
110
+
111
+ def is_floats(x) -> bool:
112
+ # check if it is float; List[float]; Tuple[float]
113
+ if isinstance(x, float):
114
+ return True
115
+ if isinstance(x, (list, tuple)) and x:
116
+ return all(isinstance(i, float) for i in x)
117
+ if isinstance(x, np.ndarray):
118
+ return x.dtype == np.float64 or x.dtype == np.float32
119
+ return False
120
+
121
+
122
def unsafe_execute(
    dataset: str,
    entry_point: str,
    code: str,
    inputs,
    expected: List,
    time_limits,
    atol,
    fast_check,
    stat,  # Value
    details,  # Array
    progress,  # Value
):
    """Execute untrusted `code` against `inputs` and compare with `expected`.

    Runs inside a child process spawned by `untrusted_check`.  Results are
    reported through shared-memory objects: `stat` (overall int status),
    `details` (per-input pass booleans), and `progress` (# inputs attempted).
    `dataset` selects special oracles ("mbpp" / "humaneval"); `atol` is the
    absolute tolerance for float comparisons; `fast_check` aborts on the
    first failing input.  NOTE(review): despite the name, the sandbox
    hardening (`reliability_guard`) is commented out below, so the executed
    code runs with full process privileges.
    """
    with create_tempdir():
        # These system calls are needed when cleaning up tempdir.
        import os
        import shutil

        # Save originals so cleanup works even if the tested code clobbers them.
        rmtree = shutil.rmtree
        rmdir = os.rmdir
        chdir = os.chdir
        #reliability_guard(maximum_memory_bytes=query_maximum_memory_bytes())
        exec_globals = {}
        try:
            with swallow_io():
                # Define the candidate solution and grab its entry function.
                exec(code, exec_globals)
                fn = exec_globals[entry_point]

            for i, inp in enumerate(inputs):
                try:
                    # Per-input wall-clock limit; stdout/stderr/stdin muted.
                    with time_limit(time_limits[i]):
                        with swallow_io():
                            out = fn(*inp)

                    exp = expected[i]
                    exact_match = out == exp

                    # ================================================ #
                    # ============== special oracles ================= #
                    if dataset == "mbpp":
                        if "are_equivalent" == entry_point:  # Mbpp/164 special oracle
                            exact_match = exact_match or True
                        elif "sum_div" == entry_point:  # Mbpp/295 special oracle
                            exact_match = exact_match or out == 0
                        elif "surface_Area" == entry_point:  # Mbpp/581 special oracle
                            exact_match = (
                                exact_match or abs(out - _surface_Area(*inp)) <= atol
                            )
                        elif (
                            "digit_distance_nums" == entry_point
                        ):  # Mbpp/558 special oracle
                            exact_match = exact_match or out == _digit_distance_nums(
                                *inp
                            )
                        elif entry_point in MBPP_OUTPUT_SET_EQ_TASKS:
                            # Order-insensitive comparison for these tasks.
                            exact_match = set(out) == set(exp)
                        elif entry_point in MBPP_OUTPUT_NOT_NONE_TASKS:
                            # exp is True if not None
                            # False if None
                            if isinstance(out, bool):
                                exact_match = out == exp
                            else:
                                exact_match = exp == (out is not None)

                    if dataset == "humaneval":
                        if "find_zero" == entry_point:
                            # HumanEval/032: accept any root of the polynomial.
                            assert abs(_poly(*inp, out)) <= atol
                            details[i] = True
                            progress.value += 1
                            continue
                    # ============== special oracles ================= #
                    # ================================================ #

                    if atol == 0 and is_floats(exp):
                        atol = 1e-6  # enforce atol for float comparison
                    if not exact_match and atol != 0:
                        # explicitly set rtol=1e-07
                        # to match `np.testing.assert_allclose`'s default values
                        assert type(out) == type(exp)
                        if isinstance(exp, (list, tuple)):
                            assert len(out) == len(exp)
                        assert np.allclose(out, exp, rtol=1e-07, atol=atol)
                    else:
                        assert exact_match
                except BaseException:
                    # Any exception (including TimeoutException) marks this
                    # input as failed; fast_check re-raises to abort early.
                    details[i] = False
                    progress.value += 1
                    if fast_check:
                        raise
                    continue

                details[i] = True
                progress.value += 1

            stat.value = _SUCCESS
        except BaseException:
            stat.value = _FAILED
        # Needed for cleaning up.
        shutil.rmtree = rmtree
        os.rmdir = rmdir
        os.chdir = chdir
223
+
224
+
225
def untrusted_check(
    dataset: str,
    code: str,
    inputs: List[Any],
    entry_point: str,
    expected,
    atol,
    ref_time: List[float],
    fast_check: bool = False,
    min_time_limit: float = DEFAULT_MIN_TIME_LIMIT,
    gt_time_limit_factor: float = DEFAULT_GT_TIME_LIMIT_FACTOR,
) -> Tuple[str, np.ndarray]:
    """Run untrusted `code` in a killable child process and grade it.

    Per-input time limits are derived from the ground-truth timings in
    `ref_time` (scaled by `gt_time_limit_factor`, floored at
    `min_time_limit`).  Returns a `(status, details)` tuple where `status`
    is PASS/FAIL/TIMEOUT and `details` holds per-input pass booleans for
    the inputs that were attempted.
    """
    time_limits = [max(min_time_limit, gt_time_limit_factor * t) for t in ref_time]
    # BUGFIX: os.getenv returns a *string* when EVALPLUS_TIMEOUT_PER_TASK is
    # set, and min(str, float) raises TypeError -- cast before comparing.
    timeout = min(
        float(os.getenv("EVALPLUS_TIMEOUT_PER_TASK", 60)), sum(time_limits)
    ) + 1
    if not fast_check:
        timeout += 1  # extra time for data collection

    # Shared memory lets the child report results even if we must kill it.
    progress = Value("i", 0)
    stat = Value("i", _UNKNOWN)
    details = Array("b", [False for _ in range(len(inputs))])

    p = multiprocessing.Process(
        target=unsafe_execute,
        args=(
            dataset,
            entry_point,
            code,
            inputs,
            expected,
            time_limits,
            atol,
            fast_check,
            # return values
            stat,
            details,
            progress,
        ),
    )
    p.start()
    p.join(timeout=timeout + 1)
    if p.is_alive():
        # Escalate: polite terminate first, then kill if it ignores us.
        p.terminate()
        time.sleep(0.1)
    if p.is_alive():
        p.kill()
        time.sleep(0.1)

    stat = _mapping[stat.value]
    # Only the inputs the child actually attempted are meaningful.
    details = details[: progress.value]

    if not stat:
        # _UNKNOWN (child never reported) is treated as a timeout.
        stat = TIMEOUT

    if stat == PASS:
        # Defensive: PASS requires every input attempted and passing.
        if len(details) != len(inputs) or not all(details):
            stat = FAIL

    return stat, details
284
+
285
+
286
def evaluate_files(
    dataset: str,
    files: List[str],
    inputs: List,
    expected: List,
    entry_point: str,
    atol: float,
    ref_time: List[float],
    fast_check: bool = False,
    min_time_limit: float = DEFAULT_MIN_TIME_LIMIT,
    gt_time_limit_factor: float = DEFAULT_GT_TIME_LIMIT_FACTOR,
) -> List[Tuple[str, List[bool]]]:
    """Grade each solution file with `untrusted_check`.

    Files are processed in ascending numeric order of their basename
    ("<dir>/<n>.py").  Returns one `(status, per_input_pass_list)` tuple
    per file, in that order.
    """
    ret = []
    # sort files by the id in name (i.e., "../n.py")
    files = sorted(files, key=lambda x: int(x.split("/")[-1].split(".")[0]))
    for file in files:
        # BUGFIX: close the file handle (the original leaked it via
        # open(file).read()).
        with open(file, "r") as f:
            code = f.read()
        stat, det = untrusted_check(
            dataset,
            code,
            inputs,
            entry_point,
            expected=expected,
            atol=atol,
            ref_time=ref_time,
            fast_check=fast_check,
            min_time_limit=min_time_limit,
            gt_time_limit_factor=gt_time_limit_factor,
        )
        # BUGFIX: `det` is a plain Python list (a slice of the shared ctypes
        # Array), which has no .tolist(); list() works for list and ndarray.
        ret.append((stat, list(det)))
    return ret
evalplus/build/lib/evalplus/eval/_special_oracle.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Special oracle handlings for problems where direct differential testing is not applicable."""

import math

# For tasks whose output are not serializable, we only check the output is not None, which
# is also consistent with the original dataset.
MBPP_OUTPUT_NOT_NONE_TASKS = ["check_str", "text_match_three", "text_starta_endb"]

# Tasks that needs to perform set comparison over two lists
# (the reference order of elements is not part of the contract).
MBPP_OUTPUT_SET_EQ_TASKS = [
    "similar_elements",  # Mbpp/2
    "find_char_long",  # Mbpp/7
    "common_in_nested_lists",  # Mbpp/111
    "extract_singly",  # Mbpp/140
    "larg_nnum",  # Mbpp/232
    "intersection_array",  # Mbpp/249
    "find_dissimilar",  # Mbpp/579
    "Diff",  # Mbpp/769
]
20
+
21
+
22
+ # oracle for Mbpp/581
23
+ def _surface_Area(base_edge, height):
24
+ """
25
+ Recognizes the "height" as the perpendicular distance from the base to the apex of the pyramid
26
+ """
27
+ slant_height = math.sqrt((base_edge / 2) ** 2 + height**2)
28
+ base_area = base_edge**2
29
+ lateral_area = 4 * (base_edge * slant_height) / 2
30
+ total_surface_area = base_area + lateral_area
31
+ return round(total_surface_area)
32
+
33
+
34
+ # oracle for Mbpp/558
35
+ def _digit_distance_nums(num1, num2):
36
+ """
37
+ Preprocesses the two numbers to have the same length by padding with zeros
38
+ """
39
+ str_num1, str_num2 = str(num1), str(num2)
40
+ max_length = max(len(str_num1), len(str_num2))
41
+ str_num1, str_num2 = str_num1.zfill(max_length), str_num2.zfill(max_length)
42
+ total_difference = 0
43
+ for digit1, digit2 in zip(str_num1, str_num2):
44
+ difference = abs(int(digit1) - int(digit2))
45
+ total_difference += difference
46
+ return total_difference
47
+
48
+
49
+ # oracle for HumaneEval/032
50
+ def _poly(xs: list, x: float):
51
+ """
52
+ Evaluates polynomial with coefficients xs at point x.
53
+ return xs[0] + xs[1] * x + xs[1] * x^2 + .... xs[n] * x^n
54
+ """
55
+ return sum([coeff * math.pow(x, i) for i, coeff in enumerate(xs)])
evalplus/build/lib/evalplus/eval/utils.py ADDED
@@ -0,0 +1,187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # The MIT License
2
+ #
3
+ # Copyright (c) OpenAI (https://openai.com)
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in
13
+ # all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ # THE SOFTWARE.
22
+
23
+ import contextlib
24
+ import faulthandler
25
+ import io
26
+ import os
27
+ import platform
28
+ import signal
29
+ import tempfile
30
+ from typing import Optional
31
+
32
+
33
@contextlib.contextmanager
def swallow_io():
    """Silence stdout/stderr and make stdin unreadable inside the context.

    All three streams are pointed at a single WriteOnlyStringIO sink, so
    prints are discarded and any attempt to read input raises IOError.
    """
    sink = WriteOnlyStringIO()
    with contextlib.redirect_stdout(sink), contextlib.redirect_stderr(
        sink
    ), redirect_stdin(sink):
        yield
40
+
41
+
42
@contextlib.contextmanager
def time_limit(seconds: float):
    """Raise TimeoutException if the body runs for more than *seconds*.

    Implemented with SIGALRM/ITIMER_REAL, so it only works in the main
    thread of a POSIX process.  The timer is always disarmed on exit.
    """

    def signal_handler(signum, frame):
        raise TimeoutException("Timed out!")

    # BUGFIX: install the handler *before* arming the timer.  The original
    # order left a window in which the alarm could fire with the default
    # SIGALRM disposition, terminating the whole process.
    signal.signal(signal.SIGALRM, signal_handler)
    signal.setitimer(signal.ITIMER_REAL, seconds)
    try:
        yield
    finally:
        signal.setitimer(signal.ITIMER_REAL, 0)
53
+
54
+
55
@contextlib.contextmanager
def create_tempdir():
    """Run the body inside a fresh temporary directory (cwd moved there).

    Yields the directory path; directory and cwd are restored/removed on exit.
    """
    with tempfile.TemporaryDirectory() as tmp, chdir(tmp):
        yield tmp
60
+
61
+
62
@contextlib.contextmanager
def chdir(root):
    """Temporarily change the working directory to *root*.

    Passing "." is a no-op.  The previous cwd is always restored, whether
    the body returns normally or raises.
    """
    if root == ".":
        yield
        return
    cwd = os.getcwd()
    os.chdir(root)
    try:
        yield
    finally:
        # Fix: the original had a redundant `except BaseException: raise exc`
        # clause; `finally` alone restores the cwd and propagates exceptions.
        os.chdir(cwd)
75
+
76
+
77
class TimeoutException(Exception):
    """Raised by `time_limit` when the wrapped code exceeds its deadline."""

    pass
79
+
80
+
81
class WriteOnlyStringIO(io.StringIO):
    """A write-only StringIO sink: every read operation raises IOError."""

    def readable(self, *args, **kwargs):
        """This stream never supports reading."""
        return False

    def read(self, *args, **kwargs):
        raise IOError

    def readline(self, *args, **kwargs):
        raise IOError

    def readlines(self, *args, **kwargs):
        raise IOError
96
+
97
+
98
class redirect_stdin(contextlib._RedirectStream):  # type: ignore
    # Mirror of contextlib.redirect_stdout/redirect_stderr for sys.stdin.
    # NOTE(review): reuses the *private* contextlib._RedirectStream helper,
    # so verify it still exists when upgrading the Python version.
    _stream = "stdin"
100
+
101
+
102
def reliability_guard(maximum_memory_bytes: Optional[int] = None):
    """
    This disables various destructive functions and prevents the generated code
    from interfering with the test (e.g. fork bomb, killing other processes,
    removing filesystem files, etc.)

    WARNING
    This function is NOT a security sandbox. Untrusted code, including, model-
    generated code, should not be blindly executed outside of one. See the
    Codex paper for more information about OpenAI's code sandbox, and proceed
    with caution.

    NOTE: this is destructive and irreversible for the calling process
    (builtins.open, os.chdir, etc. are set to None) -- only call it inside
    a throwaway child process.
    """

    # Cap the address space / data segment so runaway allocations get killed.
    if maximum_memory_bytes is not None:
        import resource

        resource.setrlimit(
            resource.RLIMIT_AS, (maximum_memory_bytes, maximum_memory_bytes)
        )
        resource.setrlimit(
            resource.RLIMIT_DATA, (maximum_memory_bytes, maximum_memory_bytes)
        )
        # macOS does not allow changing RLIMIT_STACK this way.
        if not platform.uname().system == "Darwin":
            resource.setrlimit(
                resource.RLIMIT_STACK, (maximum_memory_bytes, maximum_memory_bytes)
            )

    faulthandler.disable()

    import builtins

    # Prevent the tested code from exiting the harness process.
    builtins.exit = None
    builtins.quit = None

    import os

    os.environ["OMP_NUM_THREADS"] = "1"

    # Null out process-control and filesystem-mutating entry points.
    os.kill = None
    os.system = None
    os.putenv = None
    os.remove = None
    os.removedirs = None
    os.rmdir = None
    os.fchdir = None
    os.setuid = None
    os.fork = None
    os.forkpty = None
    os.killpg = None
    os.rename = None
    os.renames = None
    os.truncate = None
    os.replace = None
    os.unlink = None
    os.fchmod = None
    os.fchown = None
    os.chmod = None
    os.chown = None
    os.chroot = None
    os.fchdir = None
    os.lchflags = None
    os.lchmod = None
    os.lchown = None
    os.getcwd = None
    os.chdir = None
    builtins.open = None

    import shutil

    shutil.rmtree = None
    shutil.move = None
    shutil.chown = None

    import subprocess

    subprocess.Popen = None  # type: ignore

    __builtins__["help"] = None

    import sys

    # Block modules that could re-enable debugging, parallelism, or rlimits.
    sys.modules["ipdb"] = None
    sys.modules["joblib"] = None
    sys.modules["resource"] = None
    sys.modules["psutil"] = None
    sys.modules["tkinter"] = None
evalplus/build/lib/evalplus/evalperf.py ADDED
@@ -0,0 +1,558 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Compute the Differential Performance Scores (DPS) and DPS_{norm} of given samples from a model.
2
+
3
+ Check our COLM paper for more details: https://www.arxiv.org/abs/2408.06450
4
+
5
+ ^Updates from the COLM paper:
6
+ * Condition to activate efficiency evaluation for a task:
7
+ * Paper: as long as you have at least one correct solution, and we select up to 10 correct solutions for efficiency sampling
8
+ * Here: you need to have at least `min_correct` correct solutions, and we evaluate the efficiency of all correct solutions
9
+ * Updating rationale: to make the evaluation more statistically robust
10
+
11
+ @inproceedings{liu2024evaluating,
12
+ title = {Evaluating Language Models for Efficient Code Generation},
13
+ author = {Liu, Jiawei and Xie, Songrun and Wang, Junhao and Wei, Yuxiang and Ding, Yifeng and Zhang, Lingming},
14
+ booktitle = {First Conference on Language Modeling},
15
+ year = {2024},
16
+ url = {https://openreview.net/forum?id=IBCBMeAhmC},
17
+ }
18
+ """
19
+
20
+ import json
21
+ import multiprocessing
22
+ import os
23
+ import socket
24
+ import time
25
+ from collections import defaultdict
26
+ from concurrent.futures import ThreadPoolExecutor, as_completed
27
+ from contextlib import closing
28
+ from datetime import datetime
29
+ from statistics import mean
30
+ from typing import Dict, List, Optional, Tuple
31
+
32
+ import rich
33
+ from rich.rule import Rule
34
+ from rich.syntax import Syntax
35
+ from rich.table import Table
36
+
37
+ from evalplus.codegen import run_codegen
38
+ from evalplus.config import *
39
+ from evalplus.config import PERF_EVAL_TIMEOUT_SECOND
40
+ from evalplus.data import (
41
+ get_evalperf_data,
42
+ get_human_eval_plus,
43
+ get_human_eval_plus_hash,
44
+ get_mbpp_plus,
45
+ get_mbpp_plus_hash,
46
+ )
47
+ from evalplus.data.mbpp import mbpp_deserialize_inputs
48
+ from evalplus.data.utils import stream_jsonl
49
+ from evalplus.eval import PASS, untrusted_check
50
+ from evalplus.eval._special_oracle import MBPP_OUTPUT_NOT_NONE_TASKS
51
+ from evalplus.evaluate import get_groundtruth
52
+ from evalplus.perf.profile import (
53
+ are_profiles_broken,
54
+ default_parallelism,
55
+ profile,
56
+ simple_test_profiler,
57
+ )
58
+ from evalplus.utils import progress
59
+
60
+
61
def rule(msg: str):
    """Print a horizontal rich Rule titled with *msg*."""
    rich.print(Rule(msg))
63
+
64
+
65
def not_none(l: list) -> list:
    """Return a new list with every None entry of *l* removed."""
    return list(filter(lambda item: item is not None, l))
67
+
68
+
69
def get_free_port():
    """Ask the OS for a currently free TCP port number and return it.

    NOTE: the socket is closed before returning, so another process could
    claim the port in the meantime (inherent TOCTOU race of this technique).
    """
    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
        # BUGFIX: SO_REUSEADDR must be set *before* bind() to have any effect;
        # the original set it after binding.
        s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        s.bind(("", 0))
        return s.getsockname()[1]
74
+
75
+
76
def correctness_check(
    solution: str, dataset: str, task: Dict, expected_output: List
) -> Tuple:
    """Grade one candidate `solution` on a task's base + plus inputs.

    `dataset` is "humaneval" or "mbpp"; `task` supplies the inputs, entry
    point, and tolerance; `expected_output` supplies ground-truth outputs
    and reference timings.  Returns `(untrusted_check_result, solution)` so
    callers can match results back to the solution text.
    """
    assert isinstance(solution, str)
    result = untrusted_check(
        dataset,
        solution,
        task["base_input"] + list(task["plus_input"]),
        task["entry_point"],
        expected_output["base"] + expected_output["plus"],
        task["atol"],
        expected_output["base_time"] + expected_output["plus_time"],
        # stop at the first failing input -- pass/fail is all we need here
        fast_check=True,
        min_time_limit=DEFAULT_MIN_TIME_LIMIT,
        gt_time_limit_factor=DEFAULT_GT_TIME_LIMIT_FACTOR,
    )
    return result, solution
93
+
94
+
95
def get_evalplus_data():
    """Load HumanEval+ and MBPP+ problems with their ground-truth outputs.

    Both datasets are loaded with `noextreme=True` (extreme inputs removed).
    Returns `(problems, expected_output)`, each a dict keyed by task id and
    merging the two datasets.
    """
    problems_he = get_human_eval_plus(noextreme=True)
    dataset_hash = get_human_eval_plus_hash(noextreme=True)
    expected_output_human = get_groundtruth(problems_he, dataset_hash, [])
    problems_mbpp = get_mbpp_plus(noextreme=True)
    dataset_hash = get_mbpp_plus_hash(noextreme=True)
    # MBPP needs the "output is not None" special-oracle task list.
    expected_output_mbpp = get_groundtruth(
        problems_mbpp,
        dataset_hash,
        MBPP_OUTPUT_NOT_NONE_TASKS,
    )
    problems = {**problems_he, **problems_mbpp}
    expected_output = {**expected_output_human, **expected_output_mbpp}
    return problems, expected_output
109
+
110
+
111
def table_print(table_name: str, kv: Dict):
    """Render *kv* as a single-row rich table titled *table_name*.

    Keys become column headers; values are stringified into one row.
    """
    tbl = Table(title=table_name, show_header=True, header_style="bold")
    for column in kv:
        tbl.add_column(column)
    tbl.add_row(*(str(value) for value in kv.values()))
    rich.print(tbl)
122
+
123
+
124
def correctness_worker(task_id: str, samples: list, ctask: Dict, expected_output: Dict):
    """Run the correctness check for every sample of one task.

    Returns `(task_id, results)` where each result dict carries the solution
    text, its pass/fail verdict, and placeholder profiling fields
    (`profiled`, `matching_cluster_idx`, `dps`, `dps_norm`) to be filled in
    later by `perf_worker`.
    """
    assert isinstance(
        samples, list
    ), f"{task_id}: samples is not a list but {type(samples)}"

    results = []

    for solution in samples:
        # Dataset name is the task-id prefix, e.g. "Mbpp/2" -> "mbpp".
        result, solution = correctness_check(
            solution, task_id.split("/")[0].lower(), ctask, expected_output
        )
        results.append(
            {
                "solution": solution,
                "pass": result[0] == PASS,
                "profiled": False,
                "matching_cluster_idx": None,
                "dps": None,
                "dps_norm": None,
            }
        )

    return task_id, results
147
+
148
+
149
def perf_worker(
    task_id: str,
    ptask: Dict,  # EvalPerf data
    ret_dict: Dict,
    lazy_evaluation: bool,
    max_profile: int,
):
    """Profile the passing solutions of one task and score them.

    Each passing solution (up to `max_profile`) is profiled on the
    performance-exercising input and matched against the reference-solution
    clusters (fastest first) to assign `dps` / `dps_norm`.  Reference
    profiles are computed lazily and cached; with `lazy_evaluation=False`
    they are all computed (and order-checked) up front.  Mutates and
    returns `ret_dict`.
    """
    rich.print(f"{task_id}: Started")
    start_time = time.time()

    ######################### Profiling Setup #########################
    n_reference = len(ptask["reference"])
    entry_point = ptask["entry_point"]
    # MBPP inputs are stored serialized; HumanEval inputs are used as-is.
    pe_input = (
        mbpp_deserialize_inputs(task_id, ptask["pe_input"])[0]
        if task_id.startswith("Mbpp/")
        else ptask["pe_input"][0]
    )
    ####################################################################

    ####################################################################
    ############### Lazily profile reference solutions #################
    ####################################################################
    # cache_ref_num_inst[i] caches the mean #CPU-instructions of reference i.
    cache_ref_num_inst = [None] * n_reference

    def get_avg_ref_profile(idx, check_order=True) -> Optional[Tuple]:
        """Return (avg #instructions, score) for reference `idx`, profiling
        on demand.  Must be called back-to-front (idx+1 before idx)."""
        nonlocal cache_ref_num_inst

        assert (
            idx < n_reference - 1
            and cache_ref_num_inst[idx + 1] is not None
            or idx == n_reference - 1
        ), f"Calling get_avg_ref_profile({idx}) before get_avg_ref_profile({idx+1}) is called, is not allowed! {n_reference = }"

        if cache_ref_num_inst[idx] is not None:
            return cache_ref_num_inst[idx], ptask["scores"][idx]

        evaluation_time = PERF_EVAL_TIMEOUT_SECOND
        ref_solution = ptask["reference"][idx]
        for _ in range(2):  # at most retry twice
            profiles = profile(
                ref_solution,
                entry_point,
                [pe_input],
                timeout_second_per_test=evaluation_time,
            )

            # Bad thing#1: timeout / failure happens
            if are_profiles_broken(profiles):
                print(f"{task_id}: [WARNING] Error in ref: {profiles}")
                rich.print(Syntax(ref_solution, "python"))
                print(f"{task_id}: Retrying w/ +10s timeout...")
                evaluation_time += 10
            else:
                break

        avg_profile = mean(profiles)
        # Bad thing#2: if the current #instruction is faster than that of i+1
        if idx < n_reference - 1 and avg_profile < cache_ref_num_inst[idx + 1]:
            print(f"{task_id}: [WARNING] #{idx} ref faster than #{idx + 1}")
            print(f"ref {idx}: #inst {avg_profile}\tscore {ptask['scores'][idx]:.1f}")
            print(
                f"ref {idx+1}: #inst {cache_ref_num_inst[idx+1]}\tscore {ptask['scores'][idx+1]:.1f}"
            )
            rich.print(Syntax(ref_solution, "python"))
            if check_order:
                return None

        cache_ref_num_inst[idx] = avg_profile
        ret_dict["ref"][idx]["_num_cpu_instructions"] = avg_profile
        return cache_ref_num_inst[idx], ptask["scores"][idx]

    ####################################################################
    ############################## END #################################
    ####################################################################

    if not lazy_evaluation:  # compute everything ahead of time
        for i in range(n_reference - 1, -1, -1):
            if get_avg_ref_profile(i) is None:
                break

        assert (
            None not in cache_ref_num_inst
        ), f"{task_id}: Failed to profile certain reference: {cache_ref_num_inst = }"

    # Identical solution texts share one profiling run.
    profile_cache = {}

    cur_profiled = 0
    for result in ret_dict["results"]:
        if cur_profiled >= max_profile:
            rich.print(f"{task_id}: Reached max_profile limit {max_profile}, stopped")
            break
        if not result["pass"]:
            continue

        solution = result["solution"]

        if solution in profile_cache:  # reuse cache
            sample_profiles = profile_cache[solution]
        else:
            sample_profiles = profile(
                solution,
                entry_point,
                [pe_input],
                timeout_second_per_test=PERF_EVAL_TIMEOUT_SECOND,
            )
            profile_cache[solution] = sample_profiles  # store cache

        score = 0
        norm_score = 0
        result["matching_cluster_idx"] = -1  # -1 means even slower than the slowest ref
        # if the solution results in a timeout, score is 0
        if are_profiles_broken(sample_profiles):
            print(
                f"{task_id}: Tested solution error'ed out: {sample_profiles} ... regarded as 0 score"
            )
            rich.print(Syntax(solution, "python"))
        else:
            avg_sample_profile = result["_num_cpu_instructions"] = mean(sample_profiles)
            # Get profiles from fast to slow (back to front):
            for j in range(n_reference - 1, -1, -1):
                avg_ref_profile, ref_score = get_avg_ref_profile(j, check_order=False)
                if avg_sample_profile <= avg_ref_profile:
                    result["matching_cluster_idx"] = j
                    score = ref_score
                    norm_score = 100 * (j + 1) / n_reference
                    break

        result["dps"] = score
        result["dps_norm"] = norm_score
        result["profiled"] = True
        cur_profiled += 1

    # Aggregate over the profiled solutions (unprofiled ones stay None).
    ret_dict["dps"] = mean(not_none([r["dps"] for r in ret_dict["results"]]))
    ret_dict["dps_norm"] = mean(not_none([r["dps_norm"] for r in ret_dict["results"]]))
    ret_dict["n_profiled"] = cur_profiled

    table_print(
        f"[bold green]{task_id} Completed[/]",
        {
            "Duration": f"{time.time() - start_time:.1f}s",
            "DPS": f"[green]{ret_dict['dps']:.1f}[/]",
            "DPS_norm": f"[green]{ret_dict['dps_norm']:.1f}[/]",
            "# Profiled": f"{cur_profiled} / {len(ret_dict['results'])}",
            "Pass@1": f"{ret_dict['pass@1']:.1f}%",
        },
    )

    return ret_dict
298
+
299
+
300
# TODO(@ganler): OPTIMIZATION: reuse the samples from the generations of other datasets
def script(
    samples: Optional[str] = None,
    min_correct: int = 10,
    max_profile: Optional[int] = None,
    n_samples: int = 100,
    temperature: float = 1.0,
    parallel: Optional[int] = None,
    lazy_evaluation: bool = True,
    i_just_wanna_run: bool = False,
    **model_kwargs,
):
    """EvalPerf entry point: correctness-check then profile model samples.

    `samples` is a sample directory or .jsonl path (or generated on the fly
    via `run_codegen` when `model_kwargs` are given).  Tasks with at least
    `min_correct` passing solutions are profiled (up to `max_profile` each,
    default min(2*min_correct, n_samples)).  Results are resumable and
    written as full + brief JSON next to the samples.
    `i_just_wanna_run=True` ignores any resumable result file.
    """
    max_profile = max_profile or min(min_correct * 2, n_samples)
    assert min_correct <= max_profile <= n_samples
    simple_test_profiler()  # test linux perf setup

    if model_kwargs:
        # To suppress the warning of tokenizers
        os.environ["TOKENIZERS_PARALLELISM"] = os.environ.get(
            "TOKENIZERS_PARALLELISM", "false"
        )
        # overwrite parameters
        samples = run_codegen(
            dataset="evalperf",
            n_samples=n_samples,
            temperature=temperature,
            **model_kwargs,
        )

    assert samples is not None, "Please provide the path to the samples"

    # Data loading
    problems, expected_output = get_evalplus_data()
    ptasks = get_evalperf_data()

    # Parallelism
    max_workers = parallel or max(1, default_parallelism(divisor=4))
    assert 0 < max_workers < multiprocessing.cpu_count(), "Invalid max CPU workers"

    if os.path.isdir(samples):
        result_path = os.path.join(samples, "evalperf_results.json")
    else:
        assert samples.endswith(".jsonl")
        result_path = samples.replace(".jsonl", "_evalperf_results.json")
    brief_result_path = result_path.replace(
        "evalperf_results.json", "evalperf_results.brief.json"
    )

    # resume results -- only when the previous run used identical settings
    eval_results = {}
    if not i_just_wanna_run and os.path.exists(result_path):
        resumed_result = json.load(open(result_path, "r"))
        if (
            resumed_result["n_samples"] == n_samples
            and resumed_result["temperature"] == temperature
            and resumed_result["min_correct"] == min_correct
            and resumed_result["max_profile"] == max_profile
        ):
            eval_results = resumed_result["eval"]
            # Already-evaluated tasks are removed from the work list.
            for etask in eval_results:
                ptasks.pop(etask, None)

        rich.print(f"Resumed {len(eval_results)} results from {result_path}")

    # Load model's samples: task_id -> a list of samples
    sample_iter = stream_jsonl(samples)
    samples = defaultdict(list)
    for task in sample_iter:
        samples[task["task_id"].replace("_", "/")].append(task["solution"])
    samples = {k: v[:n_samples] for k, v in samples.items()}

    # assert each task has n_samples
    for task_id, s in samples.items():
        assert len(s) == n_samples, f"{task_id} has {len(s)} samples != {n_samples}"

    # Initialize eval_results
    for task_id, ptask in ptasks.items():
        eval_results[task_id] = {
            "task_id": task_id,
            "results": [],
            "ref": [
                {"solution": s, "score": r, "_num_cpu_instructions": None}
                for s, r in zip(ptask["reference"], ptask["scores"])
            ],
            "dps": None,
            "dps_norm": None,
            "pass@1": None,
            "n_profiled": None,
        }

    # Phase 1: functional-correctness check across a thread pool.
    rule("Correctness Checking...")
    with progress("Correctness") as p:
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = [
                executor.submit(
                    correctness_worker,
                    task_id,
                    samples[task_id],
                    problems[task_id],
                    expected_output[task_id],
                )
                for task_id in ptasks
            ]

            for future in p.track(as_completed(futures), total=len(futures)):
                task_id, results = future.result()
                eval_results[task_id]["results"] = results
                eval_results[task_id]["pass@1"] = (
                    100 * len([r for r in results if r["pass"]]) / n_samples
                )

    rule("EvalPerf Configurations")
    if lazy_evaluation:
        rich.print(
            "[bold yellow]Lazy evaluation is enabled[/]: "
            "Fast evaluation without enumeratively checking reference order consistency."
        )

    table_print(
        "Configurations",
        {
            "Max CPU": max_workers,
            "#Tasks": len(ptasks),
            "#Samples per task": n_samples,
            "Min correct": min_correct,
            "Max profile": max_profile,
            "Result path": result_path,
        },
    )

    rich.print(f"IDs of tasks to evaluate: {list(ptasks.keys())}")
    rule("Evaluation Start")
    # Phase 2: performance profiling for tasks with enough correct solutions.
    undone = []
    with progress("Profiling") as p:
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = []
            for task_id, ptask in ptasks.items():
                n_pass = len([r for r in eval_results[task_id]["results"] if r["pass"]])
                if n_pass < min_correct:
                    rich.print(
                        f"{task_id}: [bold yellow]{n_pass} < {min_correct} correct solutions, skipped[/]"
                    )
                    continue
                futures.append(
                    executor.submit(
                        perf_worker,
                        task_id,
                        ptask,
                        eval_results[task_id],
                        lazy_evaluation,
                        max_profile,
                    )
                )
                undone.append(task_id)
                rich.print(f"{task_id}: Queued")

            for future in p.track(as_completed(futures), total=len(futures)):
                result = future.result()
                eval_results[result["task_id"]] = result
                undone.remove(result["task_id"])
                if undone and len(undone) < max_workers:
                    print(f"Still running: {undone}")

    rule("Evaluation Summary")
    dps = mean(not_none([res["dps"] for res in eval_results.values()]))
    dps_norm = mean(not_none([res["dps_norm"] for res in eval_results.values()]))
    pass_1 = mean(not_none([res["pass@1"] for res in eval_results.values()]))
    n_evalperfed = len(not_none([res["dps"] for res in eval_results.values()]))

    table_print(
        "EvalPerf Summary",
        {
            "DPS": f"{dps:.1f}",
            "DPS_norm": f"{dps_norm:.1f}",
            "Pass@1": f"{pass_1:.1f}%",
            "#EvalPerf-ed tasks": f"{n_evalperfed} / {len(eval_results)}",
            "min_correct": min_correct,
            "n_samples": n_samples,
            "temperature": temperature,
        },
    )

    # Save full results
    with open(result_path, "w") as f:
        f.write(
            json.dumps(
                {
                    "date": datetime.now().strftime("%Y-%m-%d %H:%M"),
                    "n_samples": n_samples,
                    "temperature": temperature,
                    "min_correct": min_correct,
                    "max_profile": max_profile,
                    "eval": eval_results,
                }
            )
        )
    rich.print(f"Full results have been saved to {result_path}")

    # Save brief results
    with open(brief_result_path, "w") as f:
        f.write(
            json.dumps(
                {
                    "date": datetime.now().strftime("%Y-%m-%d %H:%M"),
                    "config": {
                        "n_samples": n_samples,
                        "temperature": temperature,
                        "min_correct": min_correct,
                        "max_profile": max_profile,
                    },
                    "summary": {
                        "dps": dps,
                        "dps_norm": dps_norm,
                        "pass@1": pass_1,
                    },
                    "eval": {
                        task_id: {
                            "dps": res["dps"],
                            "dps_norm": res["dps_norm"],
                            "pass@1": res["pass@1"],
                            "profiled": [
                                {
                                    "solution": r["solution"],
                                    "matching_cluster_idx": r["matching_cluster_idx"],
                                }
                                for r in res["results"]
                                if r["profiled"]
                            ],
                        }
                        for task_id, res in eval_results.items()
                    },
                }
            )
        )

    rich.print(f"Brief results have been saved to {brief_result_path}")

    rule("To visualize win-rates and pair-wise DPS, run:")
    rich.print(
        Syntax(
            f"""\
git clone git@github.com:evalplus/evalplus.github.io.git
git --git-dir=evalplus.github.io/.git pull
cp {brief_result_path} evalplus.github.io/results/evalperf
python evalplus.github.io/results/evalperf/stats.py
python -m http.server -d evalplus.github.io {get_free_port()}""",
            "bash",
        )
    )
550
+
551
def main():
    """CLI entry point: expose `script`'s keyword arguments via python-fire."""
    from fire import Fire

    Fire(script)


if __name__ == "__main__":
    main()
evalplus/build/lib/evalplus/evaluate.py ADDED
@@ -0,0 +1,375 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import multiprocessing
3
+ import os
4
+ import pickle
5
+ import threading
6
+ import time
7
+ from collections import Counter, defaultdict
8
+ from concurrent.futures import ProcessPoolExecutor, as_completed
9
+ from datetime import datetime
10
+ from typing import Any, Dict, List, Optional, Tuple
11
+ from warnings import warn
12
+
13
+ import numpy as np
14
+ from termcolor import cprint
15
+ from tqdm import tqdm
16
+
17
+ from evalplus.codegen import run_codegen
18
+ from evalplus.config import *
19
+ from evalplus.data import (
20
+ get_human_eval_plus,
21
+ get_human_eval_plus_hash,
22
+ get_mbpp_plus,
23
+ get_mbpp_plus_hash,
24
+ load_solutions,
25
+ )
26
+ from evalplus.data.mbpp import mbpp_serialize_inputs
27
+ from evalplus.data.utils import CACHE_DIR
28
+ from evalplus.eval import (
29
+ PASS,
30
+ compatible_eval_result,
31
+ estimate_pass_at_k,
32
+ untrusted_check,
33
+ )
34
+ from evalplus.eval._special_oracle import MBPP_OUTPUT_NOT_NONE_TASKS
35
+ from evalplus.gen.util import trusted_exec
36
+
37
+ # 1st item: the status
38
+ # 2nd item (optional): the detailed pass/fail boolean for each input
39
+ Result = Tuple[str, List[bool]]
40
+
41
+
42
def get_groundtruth(problems, hashcode, tasks_only_output_not_none):
    """Compute (or load from cache) the expected outputs for every task.

    For each problem the canonical solution is executed on both the "base"
    and "plus" input suites; results and per-input timings are cached on disk
    keyed by the dataset hash so repeated runs are instant.
    """
    cache_file = os.path.join(CACHE_DIR, f"{hashcode}.pkl")
    if os.path.exists(cache_file):
        print(f"Load from ground-truth from {cache_file}")
        with open(cache_file, "rb") as f:
            return pickle.load(f)

    os.makedirs(CACHE_DIR, exist_ok=True)
    print("Computing expected output...")
    tbegin = time.time()
    expected_output = {}
    for task_id, problem in problems.items():
        ref_code = problem["prompt"] + problem["canonical_solution"]
        # Some MBPP tasks are only checked for "returned something non-None".
        only_not_none = problem["entry_point"] in tasks_only_output_not_none
        oracle = {}
        for suite in ("base", "plus"):
            oracle[suite], oracle[f"{suite}_time"] = trusted_exec(
                ref_code,
                problem[f"{suite}_input"],
                problem["entry_point"],
                record_time=True,
                output_not_none=only_not_none,
            )
        expected_output[task_id] = oracle
    print(f"Expected outputs computed in {time.time() - tbegin:.2f}s")

    with open(cache_file, "wb") as f:
        pickle.dump(expected_output, f)

    return expected_output
77
+
78
+
79
def check_correctness(
    dataset: str,
    completion_id: int,
    problem: Dict[str, Any],
    solution: str,
    expected_output: Dict[str, List],
    base_only=False,
    fast_check=False,
    identifier=None,
    min_time_limit: float = DEFAULT_MIN_TIME_LIMIT,
    gt_time_limit_factor: float = DEFAULT_GT_TIME_LIMIT_FACTOR,
) -> Dict[str, Result]:  # {...}, "base" | "plus" -> (status, details)
    """Run one untrusted solution against the base (and optionally plus) suite."""
    result = {
        "completion_id": completion_id,
        "task_id": problem["task_id"],
        "_identifier": identifier,
        "solution": solution,
    }

    def _run_suite(suite: str) -> Result:
        # Identical invocation for "base" and "plus"; only the input set and
        # the reference timings differ.
        return untrusted_check(
            dataset,
            solution,
            problem[f"{suite}_input"],
            problem["entry_point"],
            expected=expected_output[suite],
            atol=problem["atol"],
            ref_time=expected_output[f"{suite}_time"],
            fast_check=fast_check,
            min_time_limit=min_time_limit,
            gt_time_limit_factor=gt_time_limit_factor,
        )

    result["base"] = _run_suite("base")
    if not base_only:
        result["plus"] = _run_suite("plus")

    return result
125
+
126
+
127
def evaluate(
    dataset: str,
    samples: Optional[str] = None,
    base_only: bool = False,
    parallel: Optional[int] = None,
    i_just_wanna_run: bool = False,
    test_details: bool = False,
    min_time_limit: float = DEFAULT_MIN_TIME_LIMIT,
    gt_time_limit_factor: float = DEFAULT_GT_TIME_LIMIT_FACTOR,
    mini: bool = False,
    noextreme: bool = False,
    version: str = "default",
    output_file: Optional[str] = None,
    gguf_file: Optional[str] = None,
    **model_kwargs,
):
    """End-to-end evaluation of generated samples on HumanEval+/MBPP+.

    If ``model_kwargs`` are given, code generation is run first and its output
    directory is evaluated. Each sample is checked against the base and plus
    test inputs in a process pool, pass@{1,10,100} is printed, and the full
    per-sample results are written next to the samples (or to ``output_file``).
    """
    if model_kwargs:
        # To suppress the warning of tokenizers
        os.environ["TOKENIZERS_PARALLELISM"] = os.environ.get(
            "TOKENIZERS_PARALLELISM", "false"
        )
        # Generate samples first; run_codegen returns the samples path.
        samples = run_codegen(
            dataset=dataset,
            gguf_file=gguf_file,
            **model_kwargs,
        )
    assert samples is not None, "No samples provided"

    # Default worker count: half the CPUs, at least one.
    n_workers = parallel or max(1, multiprocessing.cpu_count() // 2)

    # Derive the result path from the samples location.
    if os.path.isdir(samples):
        result_path = os.path.join(samples, "eval_results.json")
    else:
        assert samples.endswith(".jsonl")
        # legacy compatibility
        if os.path.exists(samples.replace(".jsonl", "_eval_results.json")):
            result_path = samples.replace(".jsonl", "_eval_results.json")
        else:
            result_path = samples.replace(".jsonl", ".eval_results.json")

    if output_file is not None:
        result_path = output_file

    # Reuse previous results unless the user forces a re-run.
    if os.path.isfile(result_path) and not i_just_wanna_run:
        print(f"Load from previous results from {result_path}")
        with open(result_path, "r") as f:
            results = json.load(f)

        results = compatible_eval_result(results)
    else:
        # NOTE(review): an unrecognized `dataset` value falls through both
        # branches below and leaves `problems` undefined (NameError later).
        if dataset == "humaneval":
            problems = get_human_eval_plus(
                mini=mini, noextreme=noextreme, version=version
            )
            dataset_hash = get_human_eval_plus_hash(
                mini=mini, noextreme=noextreme, version=version
            )
            expected_output = get_groundtruth(problems, dataset_hash, [])
        elif dataset == "mbpp":
            problems = get_mbpp_plus(mini=mini, noextreme=noextreme, version=version)
            dataset_hash = get_mbpp_plus_hash(
                mini=mini, noextreme=noextreme, version=version
            )
            expected_output = get_groundtruth(
                problems,
                dataset_hash,
                MBPP_OUTPUT_NOT_NONE_TASKS,
            )

        results = {
            "date": datetime.now().strftime("%Y-%m-%d %H:%M"),
            "hash": dataset_hash,
            "eval": {},
        }

        # Fan the per-sample checks out to a process pool; untrusted code must
        # not run in this process.
        with ProcessPoolExecutor(max_workers=n_workers) as executor:
            futures = []
            completion_id = Counter()
            n_samples = 0
            eval_results = defaultdict(list)  # task_id ->
            remainings = set()

            print("Reading samples...")
            for sample in tqdm(load_solutions(samples)):
                task_id = sample["task_id"]
                if task_id not in problems:
                    warn(
                        f"Task {task_id} is found in the samples but not found in the dataset"
                    )
                    continue
                # Samples carry either a full "solution" or a prompt-relative
                # "completion".
                solution = (
                    sample["solution"]
                    if "solution" in sample
                    else problems[task_id]["prompt"] + sample["completion"]
                )
                remainings.add(sample["_identifier"])
                args = (
                    dataset,
                    completion_id[task_id],
                    problems[task_id],
                    solution,
                    expected_output[task_id],
                    base_only,
                    not test_details,  # fast_check
                    sample["_identifier"],
                    min_time_limit,
                    gt_time_limit_factor,
                )
                futures.append(executor.submit(check_correctness, *args))
                completion_id[task_id] += 1
                n_samples += 1

            assert n_samples == len(remainings), "Missing problems in unfinished"
            assert len(completion_id) == len(problems), "Missing problems in samples"

            def stucking_checker():
                # Watchdog thread: warn when no sample finished in 20s.
                while remainings:
                    last_size = len(remainings)
                    time.sleep(20)
                    if last_size != len(remainings) or len(remainings) == 0:
                        continue
                    # Potential stucking
                    warn("No samples had finished testing in the last 20s")
                    warn(f"{len(remainings)} samples to be tested: {remainings}")

            threading.Thread(target=stucking_checker).start()

            for future in tqdm(as_completed(futures), total=n_samples):
                result = future.result()
                remainings.remove(result["_identifier"])
                eval_results[result["task_id"]].append(result)

        # sort the results for each problem by completion_id
        for task_id, task_results in eval_results.items():
            task_results.sort(key=lambda x: x["completion_id"])
            results["eval"][task_id] = []
            for res in task_results:

                def get_failed_tests(stat, details, inputs) -> List[Any]:
                    # Map the boolean per-input details back to failing inputs.
                    if stat == PASS or not details:
                        return []

                    if test_details:
                        return [
                            inputs[i] for i in range(len(details)) if not details[i]
                        ]

                    # else => simply return the only and the last fail test
                    return [inputs[len(details) - 1]]

                base_stat, base_details = res["base"]
                base_fail_tests = get_failed_tests(
                    base_stat, base_details, problems[task_id]["base_input"]
                )

                # initialize plus tests
                plus_stat = None
                plus_fail_tests = []

                # with plus tests
                if not base_only:
                    plus_stat, plus_details = res["plus"]
                    plus_fail_tests = get_failed_tests(
                        plus_stat, plus_details, problems[task_id]["plus_input"]
                    )

                # MBPP inputs may contain non-JSON types; serialize them.
                if dataset == "mbpp":
                    base_fail_tests = mbpp_serialize_inputs(task_id, base_fail_tests)
                    plus_fail_tests = mbpp_serialize_inputs(task_id, plus_fail_tests)

                results["eval"][task_id].append(
                    {
                        "task_id": task_id,
                        "solution": res["solution"],
                        "base_status": base_stat,
                        "plus_status": plus_stat,
                        "base_fail_tests": base_fail_tests,
                        "plus_fail_tests": plus_fail_tests,
                    }
                )

    # Calculate pass@k.
    total = np.array([len(r) for r in results["eval"].values()])
    base_correct = []
    new_correct = []

    for res in results["eval"].values():
        bc = sum([r["base_status"] == PASS for r in res])
        base_correct.append(bc)
        if not base_only:
            # "plus-correct" requires passing BOTH base and plus suites.
            new_correct.append(
                sum(
                    [
                        res[i]["base_status"] == res[i]["plus_status"] == PASS
                        for i in range(len(res))
                    ]
                )
            )
    base_correct = np.array(base_correct)

    pass_at_k = {
        f"pass@{k}": estimate_pass_at_k(total, base_correct, k).mean()
        for k in [1, 10, 100]
        if total.min() >= k
    }
    cprint(f"{dataset} (base tests)", "red")
    for k, v in pass_at_k.items():
        cprint(f"{k}:\t{v:.3f}", "red")
    results["pass_at_k"] = {"base": pass_at_k}

    if new_correct:
        cprint(f"{dataset}+ (base + extra tests)", "green")
        pass_at_k = {
            f"pass@{k}": estimate_pass_at_k(total, np.array(new_correct), k).mean()
            for k in [1, 10, 100]
            if (total >= k).all()
        }
        for k, v in pass_at_k.items():
            cprint(f"{k}:\t{v:.3f}", "green")
        results["pass_at_k"]["plus"] = pass_at_k

    # save results
    if os.path.isfile(result_path) and i_just_wanna_run:
        decision = ""
        while decision.lower() not in ["y", "n"]:
            print(f"{result_path} already exists. Press [Y/N] to overwrite or exit...")
            decision = input()

        if decision.lower() == "y":
            # mv the file to a backup
            new_path = result_path + ".bak"
            while os.path.isfile(new_path):
                new_path += ".bak"
            os.rename(result_path, new_path)
            print(f"Backup {result_path} to {new_path}")

    if not os.path.isfile(result_path):
        with open(result_path, "w") as f:
            json.dump(results, f)
367
+
368
def main():
    """CLI entry point: expose `evaluate` through python-fire."""
    from fire import Fire

    Fire(evaluate)


if __name__ == "__main__":
    main()
evalplus/build/lib/evalplus/gen/__init__.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ from typing import Any, List
3
+
4
+
5
class BaseGen(object):
    """Common state for input generators/mutators."""

    def __init__(self, inputs: List[Any], entry_point: str, contract: str):
        """Initializing a input mutator.

        Args:
            inputs (List[Any]): The set of initial inputs (i.e., seeds)
            entry_point (str): The function name to invoke with the input
            contract (str): The contract to verify input validity
        """
        self.contract = contract
        self.entry_point = entry_point
        # Deep-copy so later in-place mutations never corrupt the caller's seeds.
        self.seed_pool: List[Any] = copy.deepcopy(inputs)
        self.new_inputs = []
        # Hashes of stringified seeds for O(1) duplicate detection.
        self.seed_hash = {hash(str(seed)) for seed in self.seed_pool}

    def generate(self, num: int) -> List[Any]:
        raise NotImplementedError
evalplus/build/lib/evalplus/gen/chatgpt_gen.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ast
2
+ import random
3
+ from typing import List
4
+
5
+ import openai
6
+ from openai.types.chat import ChatCompletion
7
+
8
+ from evalplus.data.utils import to_raw
9
+ from evalplus.gen import BaseGen
10
+ from evalplus.gen.util import trusted_check_exec
11
+ from evalplus.gen.util.openai_request import make_auto_request
12
+
13
+
14
+ class ChatGPTGen(BaseGen):
15
+ def __init__(self, inputs: List, signature: str, contract_code: str, gd_code: str):
16
+ super().__init__(inputs, signature, contract_code)
17
+ self.gd_code = gd_code
18
+ self.prompt_messages = [
19
+ "Please generate complex inputs to test the function.",
20
+ "Please generate corner case inputs to test the function.",
21
+ "Please generate difficult inputs to test the function.",
22
+ ]
23
+ self.iteration = 20
24
+ self.client = openai.Client()
25
+
26
+ def seed_selection(self) -> List:
27
+ # get 5 for now.
28
+ return random.sample(self.seed_pool, k=min(len(self.seed_pool), 5))
29
+
30
+ @staticmethod
31
+ def _parse_ret(ret: ChatCompletion) -> List:
32
+ rets = []
33
+ output = ret.choices[0].message.content
34
+ if "```" in output:
35
+ for x in output.split("```")[1].splitlines():
36
+ if x.strip() == "":
37
+ continue
38
+ try:
39
+ # remove comments
40
+ input = ast.literal_eval(f"[{x.split('#')[0].strip()}]")
41
+ except: # something wrong.
42
+ continue
43
+ rets.append(input)
44
+ return rets
45
+
46
+ def chatgpt_generate(self, selected_inputs: List) -> List:
47
+ # append the groundtruth function
48
+ # actually it can be any function (maybe we can generate inputs for each llm generated code individually)
49
+ message = f"Here is a function that we want to test:\n```\n{self.gd_code}\n```"
50
+ str_inputs = "\n".join(
51
+ [
52
+ ", ".join([f"'{to_raw(i)}'" if type(i) == str else str(i) for i in x])
53
+ for x in selected_inputs
54
+ ]
55
+ )
56
+ message += f"\nThese are some example inputs used to test the function:\n```\n{str_inputs}\n```"
57
+ message += f"\n{random.choice(self.prompt_messages)}"
58
+ ret = make_auto_request(
59
+ self.client,
60
+ message=message,
61
+ model="gpt-3.5-turbo",
62
+ max_tokens=256,
63
+ response_format={"type": "text"},
64
+ )
65
+ return self._parse_ret(ret)
66
+
67
+ def generate(self, num: int):
68
+ while len(self.new_inputs) < num and self.iteration >= 0:
69
+ seeds = self.seed_selection()
70
+ new_inputs = self.chatgpt_generate(seeds)
71
+ for new_input in new_inputs:
72
+ if hash(str(new_input)) not in self.seed_hash:
73
+ if trusted_check_exec(self.contract, [new_input], self.entry_point):
74
+ self.seed_pool.append(new_input)
75
+ self.seed_hash.add(hash(str(new_input)))
76
+ self.new_inputs.append(new_input)
77
+ self.iteration -= 1
78
+ return self.new_inputs[:num]
evalplus/build/lib/evalplus/gen/mut_gen.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random
2
+ from abc import abstractmethod
3
+ from typing import Any, List
4
+
5
+ from evalplus.gen import BaseGen
6
+ from evalplus.gen.util import trusted_check_exec
7
+
8
+
9
class MutateGen(BaseGen):
    """Abstract mutation-based input generator; subclasses supply `mutate`."""

    def __init__(self, inputs: List, signature: str, contract_code: str):
        super().__init__(inputs, signature, contract_code)

    def seed_selection(self):
        # random for now.
        return random.choice(self.seed_pool)

    @abstractmethod
    def mutate(self, seed_input: Any) -> Any:
        pass

    def generate(self, num: int) -> List[Any]:
        """Mutate seeds until `num` fresh, contract-valid inputs are collected."""
        while len(self.new_inputs) < num:
            candidate = self.mutate(self.seed_selection())
            key = hash(str(candidate))
            if key in self.seed_hash:
                continue
            # Keep only inputs that satisfy the task's input contract.
            if trusted_check_exec(self.contract, [candidate], self.entry_point):
                self.seed_pool.append(candidate)
                self.seed_hash.add(key)
                self.new_inputs.append(candidate)
        return self.new_inputs[:num]
evalplus/build/lib/evalplus/gen/type_mut.py ADDED
@@ -0,0 +1,340 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import random
3
+ import string
4
+ import time
5
+ from typing import Any, Dict, List, Set, Tuple
6
+
7
+ from multipledispatch import dispatch
8
+
9
+ from evalplus.gen.mut_gen import MutateGen
10
+ from evalplus.gen.util import trusted_check_exec
11
+
12
+ MAX_MULTI_STEP_SIZE = 5
13
+ MUTATE_BOUND_SIZE = 8
14
+
15
+ NoneType = type(None)
16
+
17
+
18
class use_ingredient:
    """Decorator: with probability `prob`, short-circuit the wrapped mutator
    and return a cached "ingredient" of the seed's type instead."""

    def __init__(self, prob: float):
        assert 0 <= prob <= 0.95
        self.prob = prob

    def __call__(obj, func):
        def wrapper(self, seed_input):
            # Short-circuit keeps the dict lookup from running when the
            # random draw already failed.
            if random.random() < obj.prob and self.ingredients[type(seed_input)]:
                return random.choice(list(self.ingredients[type(seed_input)]))
            return func(self, seed_input)

        return wrapper
32
+
33
+
34
class TypedMutGen(MutateGen):
    """Type-aware input mutator: generates/mutates values while preserving
    their Python types, reusing constants harvested from the seed inputs."""

    def __init__(self, inputs: List, signature: str, contract_code: str):
        super().__init__(inputs, signature, contract_code)
        self.timeout = 60 * 60  # 1 hour
        # Interesting constants harvested from seeds, reused during mutation.
        self.ingredients = {
            int: set(),
            float: set(),
            str: set(),
            complex: set(),
        }
        for x in inputs:
            self.fetch_ingredient(x)

    def seed_selection(self):
        # random for now.
        return random.choice(self.seed_pool)

    def mutate(self, seed_input: Any) -> List:
        """Return a mutated deep copy of `seed_input`, trying at most
        MUTATE_BOUND_SIZE times to produce a value different from the seed."""
        new_input = copy.deepcopy(seed_input)

        patience = MUTATE_BOUND_SIZE
        # BUG FIX: this loop previously read `while new_input == seed_input or
        # patience == 0:` -- `patience` never stopped the loop (it passes 0 and
        # goes negative), so MUTATE_BOUND_SIZE had no effect and a seed whose
        # mutations keep colliding could spin indefinitely. Retry only while
        # the result is unchanged AND patience remains.
        while new_input == seed_input and patience > 0:
            new_input = self.typed_mutate(new_input)
            patience -= 1

        return new_input

    #########################
    # Type-aware generation #
    #########################
    @dispatch(NoneType)
    def typed_gen(self, _):
        return None

    @dispatch(int)
    def typed_gen(self, _):
        @use_ingredient(0.5)
        def _impl(*_):
            return random.randint(-100, 100)

        return _impl(self, _)

    @dispatch(float)
    def typed_gen(self, _):
        @use_ingredient(0.5)
        def _impl(*_):
            return random.uniform(-100, 100)

        return _impl(self, _)

    @dispatch(bool)
    def typed_gen(self, _):
        return random.choice([True, False])

    @dispatch(str)
    def typed_gen(self, _):
        @use_ingredient(0.5)
        def _impl(*_):
            return "".join(
                random.choice(string.ascii_letters)
                for _ in range(random.randint(0, 10))
            )

        return _impl(self, _)

    def any_gen(self):
        # weighted choose
        choice = random.choices(
            [
                True,
                1,
                1.1,
                "str",
                [],  # list
                tuple(),  # tuple
                dict(),  # dict
                None,  # None
            ],
            [0.2, 0.2, 0.2, 0.2, 0.05, 0.05, 0.05, 0.05],
        )[0]
        return self.typed_gen(choice)

    @dispatch(list)
    def typed_gen(self, _):
        ret = []
        size = random.randint(0, 10)
        if random.randint(0, 4) == 0:  # heterogeneous
            for _ in range(size):
                ret.append(self.any_gen())
        else:  # homogeneous
            t = random.choice([bool(), int(), float(), str()])
            for _ in range(size):
                ret.append(self.typed_gen(t))
        return ret

    @dispatch(tuple)
    def typed_gen(self, _):
        return tuple(self.typed_gen([]))

    # NOTE: disable set for now as Steven is too weak in Python (/s)
    # @dispatch(set)
    # def typed_gen(self, _):
    #     return set(self.typed_gen([]))

    @dispatch(dict)
    def typed_gen(self, _):
        ret = dict()
        values = self.typed_gen([])
        # NOTE: Assumption: nobody uses dict with heterogeneous keys
        # NOTE: Assumption: nobody uses dict with boolean keys
        key_type = random.choice([int(), float(), str()])
        for v in values:
            ret[self.typed_gen(key_type)] = self.typed_gen(v)
        return ret

    ########################
    # Type-aware mutation  #
    ########################
    # Simple primitives
    @dispatch(int)
    def typed_mutate(self, seed_input: int):
        @use_ingredient(0.5)
        def _impl(_, seed_input: int):
            return seed_input + random.randint(-1, 1)

        return _impl(self, seed_input)

    @dispatch(float)
    def typed_mutate(self, seed_input: float):
        @use_ingredient(0.5)
        def _impl(_, seed_input: float):
            if random.randint(0, 1):
                return seed_input + random.uniform(-1, 1)
            return seed_input * (1 + random.uniform(-0.5, 0.5))

        return _impl(self, seed_input)

    @dispatch(complex)
    def typed_mutate(self, seed_input: complex):
        @use_ingredient(0.5)
        def _impl(_, seed_input: complex):
            imag = seed_input.imag + random.uniform(-1, 1)
            return complex(0, imag)

        return _impl(self, seed_input)

    @dispatch(bool)
    def typed_mutate(self, seed_input: bool):
        return random.choice([True, False])

    @dispatch(NoneType)
    def typed_mutate(self, seed_input: NoneType):
        return None

    # List-like
    @dispatch(list)
    def typed_mutate(self, seed_input: List):
        if len(seed_input) == 0:
            return self.typed_gen([])

        choice = random.randint(0, 3)
        idx = random.randint(0, len(seed_input) - 1)
        if choice == 0:  # remove one element
            seed_input.pop(random.randint(0, len(seed_input) - 1))
        elif choice == 1 and len(seed_input) > 0:  # add one mutated element
            seed_input.insert(
                random.randint(0, len(seed_input) - 1),
                self.typed_mutate(seed_input[idx]),
            )
        elif choice == 2 and len(seed_input) > 0:  # repeat one element
            seed_input.append(seed_input[idx])
        else:  # inplace element change
            seed_input[idx] = self.typed_mutate(seed_input[idx])
        return seed_input

    @dispatch(tuple)
    def typed_mutate(self, seed_input: Tuple):
        return tuple(self.typed_mutate(list(seed_input)))

    # String
    @dispatch(str)
    def typed_mutate(self, seed_input: str):
        @use_ingredient(0.4)
        def _impl(_, seed_input: str):
            choice = random.randint(0, 2) if seed_input else 0
            if choice == 0 and self.ingredients[str]:  # insert an ingredient
                idx = random.randint(0, len(seed_input))
                return (
                    seed_input[:idx]
                    + random.choice(list(self.ingredients[str]))
                    + seed_input[idx:]
                )
            # other choices assume len(seed_input) > 0
            elif choice == 1:  # replace a substring with empty or mutated string
                start = random.randint(0, len(seed_input) - 1)
                end = random.randint(start + 1, len(seed_input))
                mid = (
                    ""
                    if random.randint(0, 1)
                    else self.typed_mutate(seed_input[start:end])
                )
                return seed_input[:start] + mid + seed_input[end:]
            elif choice == 2:  # repeat one element
                idx = random.randint(0, len(seed_input) - 1)
                return (
                    seed_input[:idx]
                    + seed_input[random.randint(0, len(seed_input) - 1)]
                    + seed_input[idx:]
                )

            # random char
            return self.typed_gen(str())

        return _impl(self, seed_input)

    # Set
    @dispatch(set)
    def typed_mutate(self, seed_input: Set):
        return set(self.typed_mutate(list(seed_input)))

    # Dict
    @dispatch(dict)
    def typed_mutate(self, seed_input: Dict):
        if len(seed_input) == 0:
            return self.typed_gen(dict())

        choice = random.randint(0, 2)
        if choice == 0:  # remove a kv
            del seed_input[random.choice(list(seed_input.keys()))]
        elif choice == 1:  # add a kv
            k = self.typed_mutate(random.choice(list(seed_input.keys())))
            v = self.typed_mutate(random.choice(list(seed_input.values())))
            seed_input[k] = v
        elif choice == 2:  # inplace value change
            k0, v0 = random.choice(list(seed_input.items()))
            seed_input[k0] = self.typed_mutate(v0)
        return seed_input

    ############################################
    # Fetching ingredients to self.ingredients #
    ############################################
    def fetch_ingredient(self, seed_input):
        self.typed_fetch(seed_input)

    @dispatch(int)
    def typed_fetch(self, seed_input: int):
        self.ingredients[int].add(seed_input)

    @dispatch(float)
    def typed_fetch(self, seed_input: float):
        self.ingredients[float].add(seed_input)

    @dispatch(complex)
    def typed_fetch(self, seed_input: complex):
        self.ingredients[complex].add(seed_input)

    @dispatch(str)
    def typed_fetch(self, seed_input: str):
        self.ingredients[str].add(seed_input)
        for token in seed_input.strip().split():
            self.ingredients[str].add(token)

    # List-like
    def _fetch_list_like(self, seed_input):
        for x in seed_input:
            # Only recurse into types that have a registered fetcher.
            if self.typed_fetch.dispatch(type(x)):
                self.fetch_ingredient(x)

    @dispatch(list)
    def typed_fetch(self, seed_input: List):
        self._fetch_list_like(seed_input)

    @dispatch(tuple)
    def typed_fetch(self, seed_input: Tuple):
        self._fetch_list_like(seed_input)

    # NOTE: disable set for now as Steven is too weak in Python (/s)
    # @dispatch(set)
    # def typed_fetch(self, seed_input: Set):
    #     self._fetch_list_like(seed_input)

    # Dict
    @dispatch(dict)
    def typed_fetch(self, seed_input: Dict):
        self._fetch_list_like(seed_input.keys())
        self._fetch_list_like(seed_input.values())

    def generate(self, num: int):
        """Produce up to `num` new contract-valid inputs within the timeout."""
        start = time.time()
        num_generated = 1
        while len(self.new_inputs) < num and time.time() - start < self.timeout:
            if num_generated % 1000 == 0:
                print(
                    f"generated {num_generated} already with {len(self.new_inputs)} new inputs ... "
                )
            new_input = self.seed_selection()
            # Multi-step instead of single-step
            for _ in range(random.randint(1, MAX_MULTI_STEP_SIZE)):
                new_input = self.mutate(new_input)
            num_generated += 1
            if hash(str(new_input)) not in self.seed_hash:
                if trusted_check_exec(self.contract, [new_input], self.entry_point):
                    self.typed_fetch(new_input)
                    self.seed_pool.append(new_input)
                    self.new_inputs.append(new_input)
                    self.seed_hash.add(hash(str(new_input)))
        return self.new_inputs[:num]
evalplus/build/lib/evalplus/gen/util/__init__.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ from copy import deepcopy
3
+
4
+ from evalplus.eval.utils import time_limit
5
+
6
+
7
def trusted_exec(code, inputs, entry_point, record_time=False, output_not_none=False):
    """Execute trusted code in place.

    Runs `entry_point` from `code` on every argument list in `inputs`.
    Returns the outputs (or, with `output_not_none`, booleans marking
    non-None outputs); with `record_time`, also returns per-call timings.
    """
    namespace = {}
    exec(code, namespace)
    fn = namespace[entry_point]

    timings = []
    outputs = []
    for args in inputs:
        args = deepcopy(args)  # protect shared seed inputs from in-place mutation
        if record_time:
            tick = time.time()
            outputs.append(fn(*args))
            timings.append(time.time() - tick)
        else:
            outputs.append(fn(*args))

    if output_not_none:
        outputs = [value is not None for value in outputs]

    return (outputs, timings) if record_time else outputs
31
+
32
+
33
def trusted_check_exec(code, inputs, entry_point):
    """Return True iff `trusted_exec` finishes within 1s without raising."""
    try:
        with time_limit(seconds=1.0):
            trusted_exec(code, inputs, entry_point)
    except Exception:  # timeout or any runtime failure => invalid input
        return False
    else:
        return True
evalplus/build/lib/evalplus/gen/util/anthropic_request.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import signal
2
+ import time
3
+
4
+ import anthropic
5
+ from anthropic.types import Message
6
+
7
+
8
def handler(signum, frame):
    """SIGALRM handler: abort a stuck API call by raising."""
    # swallow signum and frame
    del signum, frame
    raise Exception("end of time")
11
+
12
+
13
def make_auto_request(client: anthropic.Client, *args, **kwargs) -> Message:
    """Call the Anthropic messages API, retrying until a response arrives.

    A 100s SIGALRM watchdog (see `handler`) aborts hung calls; rate-limit,
    connection, and server errors are retried after a short sleep. A
    content-filter block is fatal and re-raised.
    """
    ret = None
    while ret is None:
        try:
            signal.signal(signal.SIGALRM, handler)
            signal.alarm(100)  # watchdog: abort calls stuck for > 100s
            ret = client.messages.create(*args, top_p=0.95, **kwargs)
            signal.alarm(0)
        except anthropic.RateLimitError:
            print("Rate limit exceeded. Waiting...")
            signal.alarm(0)
            time.sleep(5)
        except anthropic.APIConnectionError:
            print("API connection error. Waiting...")
            signal.alarm(0)
            time.sleep(5)
        except anthropic.InternalServerError:
            print("Internal server error. Waiting...")
            signal.alarm(0)
            time.sleep(5)
        except anthropic.APIError as e:
            print("Unknown API error")
            print(e)
            # Robustness fix: `e.body` may be None or not a dict, and the old
            # direct `e.body["error"]["message"]` indexing could raise and
            # kill the retry loop. Inspect it defensively instead.
            message = ""
            body = getattr(e, "body", None)
            if isinstance(body, dict):
                error = body.get("error")
                if isinstance(error, dict):
                    message = error.get("message", "")
            if message == "Output blocked by content filtering policy":
                raise Exception("Content filtering policy blocked output")
            signal.alarm(0)
            time.sleep(1)  # avoid a hot retry loop on persistent API errors
        except Exception as e:
            print("Unknown error. Waiting...")
            print(e)
            signal.alarm(0)
            time.sleep(1)
    return ret
evalplus/build/lib/evalplus/gen/util/openai_request.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+
3
+ import openai
4
+ from openai.types.chat import ChatCompletion
5
+
6
+
7
def make_request(
    client: openai.Client,
    message: str,
    model: str,
    max_tokens: int = 512,
    temperature: float = 1,
    n: int = 1,
    **kwargs
) -> ChatCompletion:
    """Issue one chat-completion request for a single user message."""
    kwargs["top_p"] = 0.95
    kwargs["max_completion_tokens"] = max_tokens
    if model.startswith("o1-"):  # pop top-p and max_completion_tokens
        del kwargs["top_p"]
        del kwargs["max_completion_tokens"]
        temperature = 1.0  # o1 models do not support temperature

    return client.chat.completions.create(
        model=model,
        messages=[
            {"role": "user", "content": message},
        ],
        temperature=temperature,
        n=n,
        **kwargs
    )
32
+
33
+
34
def make_auto_request(*args, **kwargs) -> ChatCompletion:
    """Keep calling `make_request` until it succeeds, backing off on errors.

    Rate-limit and connection errors wait 5s; other errors wait briefly so a
    persistent failure does not spin the CPU.
    """
    ret = None
    while ret is None:
        try:
            ret = make_request(*args, **kwargs)
        except openai.RateLimitError:
            print("Rate limit exceeded. Waiting...")
            time.sleep(5)
        except openai.APIConnectionError:
            print("API connection error. Waiting...")
            time.sleep(5)
        except openai.APIError as e:
            print(e)
            # Fix: previously this branch retried immediately, hot-looping on
            # persistent API errors (cf. the anthropic helper which sleeps).
            time.sleep(1)
        except Exception as e:
            print("Unknown error. Waiting...")
            print(e)
            time.sleep(1)
    return ret
evalplus/build/lib/evalplus/inputgen.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Generate a .jsonl file where each line is a json object
2
+ representing a programming problem with a task ID ("task_id")
3
+ and a list of enhanced inputs ("inputs") for that task.
4
+ """
5
+
6
+ import argparse
7
+ import json
8
+ import os
9
+
10
+ from evalplus.data.mbpp import mbpp_serialize_inputs
11
+ from evalplus.gen.chatgpt_gen import ChatGPTGen
12
+ from evalplus.gen.type_mut import TypedMutGen
13
+
14
+
15
class SetEncoder(json.JSONEncoder):
    """JSON encoder that serializes Python sets as JSON lists."""

    def default(self, obj):
        # Sets are not natively JSON-serializable; emit them as lists.
        # Everything else falls through to the base class (which raises
        # TypeError for unsupported types).
        if isinstance(obj, set):
            return list(obj)
        return super().default(obj)
20
+
21
+
22
+ # Used for MBPP as MBPP's prompt is not a formal function signature
23
def insert_contract_into_code(entry_point, code, contract):
    """Insert `contract` right after the `def <entry_point>` line of `code`.

    Used for MBPP, whose prompt is not a formal function signature.
    Raises StopIteration if no line starts with the entry point's def.
    """
    lines = code.split("\n")
    def_idx = next(
        i for i, line in enumerate(lines) if line.startswith(f"def {entry_point}")
    )
    lines.insert(def_idx + 1, contract)
    return "\n".join(lines)
30
+
31
+
32
def input_generation(args, problems):
    """Generate enhanced inputs for every problem and write them to a .jsonl file.

    Each output line is a JSON object with "task_id" and "inputs". Inputs are
    produced by ChatGPT-based generation first, then extended with type-aware
    mutation; MBPP inputs are serialized via `mbpp_serialize_inputs`.
    """
    with open(args.output, "w") as file:
        for problem in problems.values():
            new_input = {}
            task_id = problem["task_id"]
            print(f"generating inputs for {task_id} ...")
            # by default we do not include constraints in the prompt (code)
            code = problem["prompt"] + problem["canonical_solution"]
            # but we use c_code to include contract which checks input validity at execution time
            if args.dataset == "humaneval":
                c_code = (
                    problem["prompt"]
                    + problem["contract"]
                    + problem["canonical_solution"]
                )
            elif args.dataset == "mbpp":
                c_code = problem["prompt"] + insert_contract_into_code(
                    entry_point=problem["entry_point"],
                    code=problem["canonical_solution"],
                    contract=problem["contract"],
                )

            # first generate chatgpt
            input_gen = ChatGPTGen(
                problem["base_input"], problem["entry_point"], c_code, code
            ).generate(args.chatgpt_len)

            # No seeds -> emit an empty record and move on.
            if input_gen is None or len(input_gen) == 0:
                new_input["task_id"] = task_id
                new_input["inputs"] = {}
                file.write(json.dumps(new_input, cls=SetEncoder) + "\n")
                continue

            # generate mutation next, seeded with the ChatGPT inputs
            input_gen.extend(
                TypedMutGen(input_gen, problem["entry_point"], c_code).generate(
                    args.mut_len
                )
            )
            print(f"generated {len(input_gen)} inputs")
            new_input["task_id"] = task_id
            # BUG FIX: the serialized MBPP inputs were previously overwritten
            # by an unconditional `new_input["inputs"] = input_gen` right
            # after this branch, making the serialization dead code.
            if args.dataset == "mbpp":
                new_input["inputs"] = mbpp_serialize_inputs(task_id, input_gen)
            else:
                new_input["inputs"] = input_gen
            file.write(json.dumps(new_input, cls=SetEncoder) + "\n")
77
+
78
+
79
def main():
    """Parse CLI arguments, load the requested dataset, and generate inputs."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--dataset", required=True, type=str, choices=["humaneval", "mbpp"]
    )
    parser.add_argument("--chatgpt_len", required=True, type=int)
    parser.add_argument("--mut_len", required=True, type=int)
    parser.add_argument("--output", type=str, help="Output .jsonl path")
    args = parser.parse_args()

    if args.dataset == "humaneval":
        from evalplus.data import get_human_eval_plus

        # Allow it to be incomplete
        problems = get_human_eval_plus(err_incomplete=False)
        default_output = "HumanEvalPlusInputs.jsonl"
    else:  # "mbpp" -- argparse restricts choices to these two
        from evalplus.data import get_mbpp_plus

        problems = get_mbpp_plus(err_incomplete=False)
        default_output = "MbppPlusInput.jsonl"
    args.output = args.output or default_output

    assert not os.path.isfile(args.output), f"{args.output} already exists!"
    input_generation(args, problems)


if __name__ == "__main__":
    main()
evalplus/build/lib/evalplus/lecacy_sanitize.py ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Legacy version of post-processing LLM-generated Python code.
2
+ This sanitizer is implemented using regex and string manipulation.
3
+ You might want to use the latest tree-sitter-based sanitizer (evalplus.sanitize) instead.
4
+ """
5
+
6
+ import os
7
+ import pathlib
8
+ import re
9
+ from typing import List, Optional
10
+
11
+ from tqdm import tqdm
12
+
13
+ from evalplus.data import (
14
+ get_human_eval_plus,
15
+ get_mbpp_plus,
16
+ load_solutions,
17
+ write_directory,
18
+ write_jsonl,
19
+ )
20
+ from evalplus.syncheck import syntax_check
21
+
22
+
23
def remove_unindented_lines(
    code: str, protect_before: str, execeptions: List[str], trim_tails: List[str]
) -> str:
    """Remove top-level (column-0) lines that appear after `protect_before`.

    Lines before the first line starting with `protect_before` are kept, as
    are blank lines and lines starting with any prefix in `execeptions`
    (sic -- name kept for caller compatibility). The first line whose
    rstripped text starts with a `trim_tails` prefix cuts off everything
    from that line to the end.
    """
    lines = code.splitlines()
    # FIX: was a list with `i not in cut_idx` in the final filter, i.e.
    # accidental O(n^2); a set gives the same result with O(1) membership.
    cut_idx = set()
    cut_enabled = False
    for i, line in enumerate(lines):
        if not cut_enabled and line.startswith(protect_before):
            cut_enabled = True
            continue
        if line.strip() == "":
            continue
        if any(line.startswith(e) for e in execeptions):
            continue

        lspace = len(line) - len(line.lstrip())
        if lspace == 0:
            cut_idx.add(i)

        if any(line.rstrip().startswith(t) for t in trim_tails):
            # cut off everything behind
            cut_idx.update(range(i, len(lines)))
            break

    return "\n".join([line for i, line in enumerate(lines) if i not in cut_idx])
48
+
49
+
50
def to_four_space_indents(old_code):
    """Pad 3-space-indented lines to 4 spaces; always append a trailing newline
    per line (so non-empty input gains a final newline)."""
    out = []
    for line in old_code.splitlines():
        indent = len(line) - len(line.lstrip())
        # Only the exact 3-space indent is normalized; other widths pass through.
        prefix = " " if indent == 3 else ""
        out.append(prefix + line + "\n")
    return "".join(out)
58
+
59
+
60
def sanitize(
    old_code: str,
    entry_point: str,
    rm_prefix_lines: Optional[str] = None,
    eofs: List = None,
):
    """Heuristically extract a clean definition of `entry_point` from raw model output.

    Pipeline (order matters): optionally drop lines starting with
    `rm_prefix_lines`; pick the chat code-fence chunk that mentions the entry
    point; keep only function bodies containing " return "; normalize 3-space
    indents; cut at the first occurrence of any `eofs` marker; strip trailing
    unindented lines; finally drop sibling functions that are neither the
    entry point nor syntactically valid.
    """
    new_code = old_code
    if rm_prefix_lines is not None:
        # Drop every line carrying the given prefix (e.g. an echoed prompt).
        new_code = "\n".join(
            [
                line
                for line in old_code.splitlines()
                if not line.startswith(rm_prefix_lines)
            ]
        )

    # Leading newline so fence/def markers match even at the very start.
    new_code = "\n" + new_code
    def_left = "def " + entry_point

    # basic handling of chat output
    new_code = new_code.replace("\n```python\n", "\n```\n")
    for chunk in new_code.split("\n```\n"):
        if def_left in chunk:
            new_code = chunk
            break

    # Split at each "def <entry_point>(" header; chunks[0] is the preamble.
    chunks = [chunk for chunk in re.split(f"{def_left}\\s*\\(", new_code)]
    # TODO: having return does not mean this is complete
    bodies = [chunk for chunk in chunks[1:] if " return " in chunk.split("\ndef")[0]]
    def_left = def_left + "("
    new_code = def_left + def_left.join(bodies) if len(bodies) > 0 else ""  # fn + impl
    new_code = to_four_space_indents(new_code)

    # Truncate at any caller-supplied end-of-function marker.
    for eof in eofs or []:
        new_code = new_code.split(eof)[0]

    # remove lines starting from the first unindented line after def_left
    new_code = remove_unindented_lines(
        new_code,
        protect_before=def_left,
        execeptions=["def ", "import ", "from "],
        trim_tails=['"""', "if", "print"],
    )
    # Re-attach the preamble (imports/helpers before the first def).
    new_code = chunks[0] + new_code

    # cut all functions that are not syntactically correct && not the entry point
    parts = new_code.split("\ndef ")
    includes = [parts[0]]
    for fn in new_code.split("\ndef ")[1:]:
        if (
            fn.strip().startswith(entry_point + " ")
            or fn.strip().startswith(entry_point + "(")
            or syntax_check("\ndef " + fn)
        ):
            includes.append(fn)
    new_code = "\ndef ".join(includes)
    return new_code.strip()
117
+
118
+
119
def script(
    samples: str,
    eofs: List[str] = None,
    inplace: bool = False,
    rm_prefix_lines: str = None,
    debug_task: str = None,
    mbpp_version: str = "default",
):
    """Sanitize every solution under `samples` (a directory or a .jsonl file).

    Writes results next to the input with a "-sanitized" suffix unless
    `inplace` is set. `debug_task` restricts processing to one task_id.
    FIX: `eofs` previously used a mutable default ([]); None is equivalent
    because `sanitize` treats it as `eofs or []`.
    """
    # task_id -> entry_point
    entry_point = {}
    dataset = {**get_human_eval_plus(), **get_mbpp_plus(version=mbpp_version)}

    for task_id, problem in dataset.items():
        entry_point[task_id] = problem["entry_point"]

    # make a new folder with "-sanitized" suffix
    is_folder = os.path.isdir(samples)
    target_path = pathlib.Path(samples)
    if not inplace:
        if is_folder:
            new_name = target_path.name + "-sanitized"
        else:
            new_name = target_path.name.replace(".jsonl", "-sanitized.jsonl")
        target_path = target_path.parent / new_name
    target_path = str(target_path)

    nsan = 0
    ntotal = 0

    new_solutions = []

    for solution in tqdm(load_solutions(samples)):
        task_id = solution["task_id"]
        dbg_identifier = solution["_identifier"]
        if debug_task is not None and task_id != debug_task:
            continue

        ntotal += 1
        # Solutions carry either a full "solution" or a "completion" to be
        # appended to the dataset prompt.
        if "solution" in solution:
            old_code = solution["solution"]
        else:
            assert "completion" in solution
            old_code = dataset[task_id]["prompt"] + "\n" + solution["completion"]

        old_code = old_code.strip()

        new_code = sanitize(
            old_code=old_code,
            entry_point=entry_point[task_id],
            rm_prefix_lines=rm_prefix_lines,
            eofs=eofs,
        ).strip()

        # if changed, print the message
        if new_code != old_code:
            msg = "Sanitized: " + dbg_identifier
            if is_folder:
                msg += " -> " + dbg_identifier.replace(samples, target_path)
            print(msg)
            nsan += 1

        new_solutions.append({"task_id": task_id, "solution": new_code})

    if is_folder:
        write_directory(target_path, new_solutions)
    else:
        write_jsonl(target_path, new_solutions)

    if nsan > 0:
        print(f"Sanitized {nsan} out of {ntotal} files.")
    else:
        print("All files seems valid -- no files are sanitized.")
    print(f"Check the sanitized files at {target_path}")
+
194
def main():
    """Expose `script` as a command-line interface via Google Fire."""
    import fire

    fire.Fire(script)


if __name__ == "__main__":
    main()
evalplus/build/lib/evalplus/perf/__init__.py ADDED
File without changes