rainstonee
/

CDKA

Model card Files Files and versions

CDKA / eval_humaneval.py

rainstonee's picture

Upload 33 files

c5d3e8d verified about 14 hours ago

History Blame Contribute Delete

3.13 kB

	from human_eval.data import write_jsonl, read_problems
	from fire import Fire
	from tqdm import trange, tqdm
	from utils import initialize_text_to_text_model, model_inference
	import re
	import os
	from human_eval.evaluation import evaluate_functional_correctness
	from peft import PeftModel

	# Alpaca-style instruction prompt used for HumanEval code completion.
	# The ``{PROMPT}`` placeholder is filled in via ``str.format`` by
	# ``generate_one_completion``. The literal is not a raw string, so the
	# ``\n`` on its first line is a real newline character.
	# The template ends right after an opening ```python fence; presumably this
	# nudges the model to answer with code only (any fence the model emits is
	# removed afterwards by ``post_process``).
	ALPACA_PREFIX_TEMPLATE_MD = """Below is an instruction that describes a task.\n Write a response that appropriately completes the request.

	### Instruction:
	Complete the following Python code:
	Notes: respond with the entire complete function definition
	do not add any comments, be as concise in your code as possible
	use only built-in libraries, assume no additional imports other than those provided (if any)
	use ` ` (4 spaces) for each level of indentation

	code:
	```python
	{PROMPT}
	```

	### Response:
	```python
	"""

	def post_process(text, indent_unit="    "):
	    """Normalize raw model output into consistently indented Python source.

	    Steps, in order: remove Markdown code fences, turn tabs into spaces,
	    strip triple-quoted strings (docstrings), drop blank lines and trailing
	    whitespace, then re-indent every line by the rank of its original
	    leading-space width relative to the first ``def`` line.

	    Args:
	        text: Raw text generated by the model.
	        indent_unit: String emitted once per indentation level. Defaults to
	            four spaces, the unit the prompt template asks the model to use.

	    Returns:
	        The cleaned source. Lines indented no deeper than the first ``def``
	        line end up at column 0; each deeper distinct width gets one more
	        ``indent_unit``. An input with no usable lines yields ``""``.
	    """
	    text = text.replace("```", "").replace("\t", " ")
	    # Non-greedy + DOTALL so multi-line docstrings are removed one at a time
	    # without swallowing the code between two separate docstrings.
	    text = re.sub(r'(""".*?"""|\'\'\'.*?\'\'\')', '', text, flags=re.DOTALL)
	    lines = [ln.rstrip() for ln in text.splitlines() if ln.strip()]
	    widths = [len(ln) - len(ln.lstrip(" ")) for ln in lines]

	    # Word-boundary match so identifiers such as "default" or "undefined"
	    # are not mistaken for a function definition.
	    def_index = next(
	        (i for i, ln in enumerate(lines) if re.search(r"\bdef\b", ln)),
	        None,
	    )
	    if def_index is None:
	        # Best effort: keep going and treat column 0 as the reference level.
	        print("No def line found")
	        print("\n".join(lines))
	        def_width = 0
	    else:
	        def_width = widths[def_index]

	    # Map each distinct leading width to a level: everything at or left of
	    # the def line is level 0, every deeper distinct width is one level more.
	    level_of = {}
	    depth = 0
	    for width in sorted(set(widths)):
	        if width > def_width:
	            depth += 1
	        level_of[width] = depth

	    return "\n".join(
	        indent_unit * level_of[width] + ln.lstrip()
	        for ln, width in zip(lines, widths)
	    )

	def generate_one_completion(model, tokenizer, model_type, prompt, template=True):
	    """Generate and post-process one completion for a single prompt.

	    Args:
	        model: Model object forwarded unchanged to ``model_inference``.
	        tokenizer: Tokenizer forwarded unchanged to ``model_inference``.
	        model_type: Model-type value forwarded to ``model_inference``.
	        prompt: The code prompt to complete.
	        template: When true, wrap ``prompt`` in ``ALPACA_PREFIX_TEMPLATE_MD``;
	            when false, send ``prompt`` to the model as-is.

	    Returns:
	        The model output after ``post_process`` has cleaned and re-indented it.
	    """
	    # Previously ``prompt_in`` was only bound when ``template`` was true, so
	    # ``template=False`` raised UnboundLocalError; fall back to the raw prompt.
	    prompt_in = ALPACA_PREFIX_TEMPLATE_MD.format(PROMPT=prompt) if template else prompt
	    pred_text = model_inference(model, tokenizer, prompt_in, model_type, max_target_length=512)
	    return post_process(pred_text)




	def humaneval(model, tokenizer, save_dir, model_type="CausalLM", model_name="llama/llama-2-7b-hf"):
	    """Run the HumanEval benchmark for a model and report the scores.

	    One completion is generated per problem, the samples are written to
	    ``<save_dir>/<model_name with '/' replaced by '_'>_humaneval_samples.jsonl``
	    and then scored with ``evaluate_functional_correctness`` using ``k=[1]``.

	    Args:
	        model: Model object forwarded to ``generate_one_completion``.
	        tokenizer: Tokenizer forwarded to ``generate_one_completion``.
	        save_dir: Directory for the samples file; created if it is missing.
	        model_type: Model-type value forwarded to ``generate_one_completion``.
	        model_name: Used only to build the samples file name.

	    Returns:
	        The value returned by ``evaluate_functional_correctness`` (also
	        printed).
	    """
	    os.environ["TOKENIZERS_PARALLELISM"] = "false"
	    # Create the output directory up front so a missing path fails (or is
	    # fixed) before the expensive generation loop, not after it.
	    os.makedirs(save_dir, exist_ok=True)

	    problems = read_problems()
	    num_samples_per_task = 1
	    samples = [
	        dict(
	            task_id=task_id,
	            completion=generate_one_completion(
	                model, tokenizer, model_type, problems[task_id]["prompt"]
	            ),
	        )
	        for task_id in tqdm(problems, desc="Tasks")
	        for _ in range(num_samples_per_task)
	    ]

	    target_name = os.path.join(save_dir, f"{model_name.replace('/', '_')}_humaneval_samples.jsonl")
	    write_jsonl(target_name, samples)
	    results = evaluate_functional_correctness(target_name, [1])
	    print(results)
	    return results