# CDKA / eval_humaneval.py
# rainstonee's picture
# Upload 33 files
# c5d3e8d verified
# Raw
# History Blame Contribute Delete
# 3.13 kB
from human_eval.data import write_jsonl, read_problems
from fire import Fire
from tqdm import trange, tqdm
from utils import initialize_text_to_text_model, model_inference
import re
import os
from human_eval.evaluation import evaluate_functional_correctness
from peft import PeftModel
# Prompt template with a single ``{PROMPT}`` placeholder; it is filled via
# ``str.format(PROMPT=...)`` in generate_one_completion.
# The ``\n`` on the first line is an escape inside a normal (non-raw) string,
# so the rendered prompt has a line break followed by one space there.
# The template ends with an opened ```python fence, presumably so that the
# model's continuation is the code itself — confirm against model_inference.
# NOTE(review): the indentation hint shows a single space between the backticks
# while the text says "4 spaces" — confirm the literal against the original file.
ALPACA_PREFIX_TEMPLATE_MD = """Below is an instruction that describes a task.\n Write a response that appropriately completes the request.
### Instruction:
Complete the following Python code:
Notes: respond with the entire complete function definition
do not add any comments, be as concise in your code as possible
use only built-in libraries, assume no additional imports other than those provided (if any)
use ` ` (4 spaces) for each level of indentation
code:
```python
{PROMPT}
```
### Response:
```python
"""
def post_process(text, indent_unit=" "):
    """Clean up generated code and normalise its indentation.

    Steps, in order:
      1. Remove Markdown code fences (```), replace tabs with a single space.
      2. Remove every triple-quoted string (docstrings), across lines.
      3. Drop blank lines and trailing whitespace.
      4. Re-indent: every distinct leading-space width that is larger than the
         width of the first ``def`` line is ranked (smallest first) and the
         rank becomes the nesting level; widths less than or equal to the
         ``def`` line's width become level 0.

    Args:
        text: Raw model output.
        indent_unit: String emitted once per nesting level. Defaults to a
            single space, which is what this function has always produced.
            NOTE(review): the prompt template in this file asks the model for
            4-space indentation — confirm whether a wider unit is intended.

    Returns:
        The cleaned source as one newline-joined string ("" for empty input).
    """
    text = text.replace("```", "")
    text = text.replace("\t", " ")
    text = re.sub(r'(""".*?"""|\'\'\'.*?\'\'\')', '', text, flags=re.DOTALL)

    lines = [raw.rstrip() for raw in text.splitlines() if raw.strip()]
    text = "\n".join(lines)
    widths = [len(line) - len(line.lstrip(" ")) for line in lines]

    # Locate the first real function header. A plain substring test for "def"
    # also fires on words such as "defaultdict", "default" or "undefined",
    # which would pick the wrong reference indentation and flatten the body.
    def_line_space = None
    for line, width in zip(lines, widths):
        if re.match(r"\s*(?:async\s+)?def\s", line):
            def_line_space = width
            break
    if def_line_space is None:
        print("No def line found")
        print(text)
        def_line_space = 0

    # Rank the distinct widths: anything at or left of the def line is level 0,
    # each wider width gets the next level.
    indentation_level = {}
    depth = 0
    for width in sorted(set(widths)):
        if width > def_line_space:
            depth += 1
        indentation_level[width] = depth

    return "\n".join(
        indent_unit * indentation_level[width] + line.lstrip()
        for line, width in zip(lines, widths)
    )
def generate_one_completion(model, tokenizer, model_type, prompt, template=True):
    """Generate one completion for ``prompt`` and clean it with post_process.

    Args:
        model, tokenizer, model_type: Passed through unchanged to
            ``model_inference``.
        prompt: The code prompt to complete.
        template: When truthy, ``prompt`` is inserted into
            ``ALPACA_PREFIX_TEMPLATE_MD``; otherwise the prompt is sent as-is.

    Returns:
        The post-processed prediction text.
    """
    # ``prompt_in`` used to be bound only when ``template`` was truthy, so
    # ``template=False`` raised UnboundLocalError; fall back to the raw prompt.
    if template:
        prompt_in = ALPACA_PREFIX_TEMPLATE_MD.format(PROMPT=prompt)
    else:
        prompt_in = prompt
    pred_text = model_inference(model, tokenizer, prompt_in, model_type, max_target_length=512)
    return post_process(pred_text)
def humaneval(model, tokenizer, save_dir, model_type = "CausalLM", model_name="llama/llama-2-7b-hf"):
    """Generate one completion per problem, save them, and score the file.

    Args:
        model, tokenizer, model_type: Forwarded to generate_one_completion.
        save_dir: Directory for the samples file; created if it is missing.
        model_name: Used only to build the output file name ("/" becomes "_").

    Returns:
        Whatever ``evaluate_functional_correctness`` returns for the samples
        file (it is also printed). Previously nothing was returned.
    """
    import os
    os.environ["TOKENIZERS_PARALLELISM"] = "false"

    # Create the output directory before the (slow) generation loop so that a
    # missing directory cannot discard all completions at write time.
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    target_name = os.path.join(save_dir, f"{model_name.replace('/', '_')}_humaneval_samples.jsonl")

    problems = read_problems()
    num_samples_per_task = 1
    samples = [
        dict(task_id=task_id, completion=generate_one_completion(model, tokenizer, model_type, problems[task_id]["prompt"]))
        for task_id in tqdm(problems, desc="Tasks")
        for _ in range(num_samples_per_task)
    ]
    write_jsonl(target_name, samples)

    # The second argument is presumably the list of k values for pass@k — confirm.
    results = evaluate_functional_correctness(target_name, [1])
    print(results)
    return results