DebateCoder / src /datasets /convert-apps-xcode.py

Upload 105 files

01f199c verified about 2 months ago

3.91 kB

	import pandas as pd
	import json


	def read_jsonl(filename):
	    """Read a JSON Lines file and return its records as a list.

	    Every non-blank line is parsed with ``json.loads``. Blank or
	    whitespace-only lines (for example an empty line at the end of the
	    file) are skipped instead of raising ``json.JSONDecodeError``.

	    Args:
	        filename: Path of the UTF-8 encoded ``.jsonl`` file to read.

	    Returns:
	        A list holding one parsed object per non-blank line, in file order.

	    Raises:
	        OSError: If the file cannot be opened.
	        json.JSONDecodeError: If a non-blank line is not valid JSON.
	    """
	    with open(filename, "r", encoding="utf-8") as file:
	        # ``line.strip()`` is empty for blank lines, which json.loads rejects.
	        return [json.loads(line) for line in file if line.strip()]

	# Write a python list of dictionaries into a jsonl file


	def write_jsonl(filename, lines):
	    """Dump an iterable of JSON-serializable objects to a JSON Lines file.

	    The target file is opened for writing as UTF-8 (replacing any existing
	    content) and receives one ``json.dumps`` rendering per item, each
	    followed by a newline.

	    Args:
	        filename: Path of the file to create or overwrite.
	        lines: Iterable of objects accepted by ``json.dumps``.
	    """
	    with open(filename, "w", encoding="utf-8") as out:
	        out.writelines(json.dumps(record) + "\n" for record in lines)


	# Load both APPS splits; each split is read from its own JSONL file.
	# Reading train.jsonl for both would put every training row into the pool
	# twice, letting the random sampling below pick the same problem twice.
	train_set = read_jsonl("./data/APPS/train.jsonl")
	test_set = read_jsonl("./data/APPS/test.jsonl")

	# Pool the two splits into a single list of records.
	dataset = train_set + test_set

	print(len(dataset))

	# Switch to a DataFrame so rows can be selected with boolean masks.
	dataset = pd.DataFrame(dataset)

	# Show which difficulty labels are present in the pooled data.
	print(dataset['difficulty'].unique())


	# Select Codeforces problems (recognised by their URL) that carry test data
	# with more than 5 inputs. The checks short-circuit left to right, so the
	# JSON is only parsed when 'input_output' is non-empty.
	filter_indices = [
	    bool(
	        "codeforces" in row['url']
	        and row['input_output']
	        and len(json.loads(row['input_output'])["inputs"]) > 5
	    )
	    for _, row in dataset.iterrows()
	]

	codeforces_dataset = dataset[filter_indices]

	print(len(codeforces_dataset))

	# Draw at most 50 of them without replacement; the fixed seed keeps the
	# draw reproducible.
	sample_size = min(50, len(codeforces_dataset))
	codeforces_dataset_50 = codeforces_dataset.sample(
	    n=sample_size, random_state=1, replace=False)
	print(len(codeforces_dataset_50))

	# Renumber the sampled rows from 0, discarding the old index.
	codeforces_dataset_50 = codeforces_dataset_50.reset_index(drop=True)

	# Select "interview" problems whose raw 'input_output' string is non-empty,
	# shorter than 2000 characters, and decodes to more than 5 inputs. The
	# checks short-circuit left to right, so the JSON is parsed last.
	filter_indices = [
	    bool(
	        "interview" == row['difficulty']
	        and row['input_output']
	        and len(row['input_output']) < 2000
	        and len(json.loads(row['input_output'])["inputs"]) > 5
	    )
	    for _, row in dataset.iterrows()
	]

	interview_dataset = dataset[filter_indices]

	print(len(interview_dataset))

	# Draw at most 50 of them without replacement; the fixed seed keeps the
	# draw reproducible.
	sample_size = min(50, len(interview_dataset))
	interview_dataset_50 = interview_dataset.sample(
	    n=sample_size, random_state=1, replace=False)
	print(len(interview_dataset_50))

	# Renumber the sampled rows from 0, discarding the old index.
	interview_dataset_50 = interview_dataset_50.reset_index(drop=True)


	# Select "introductory" problems whose raw 'input_output' string is
	# non-empty, shorter than 2000 characters, and decodes to more than 5 inputs.
	# The truthiness check on 'input_output' mirrors the interview filter: it
	# skips rows with an empty/None value, which would otherwise fail in len()
	# or json.loads().
	filter_indices = [
	    bool(
	        "introductory" == row['difficulty']
	        and row['input_output']
	        and len(row['input_output']) < 2000
	        and len(json.loads(row['input_output'])["inputs"]) > 5
	    )
	    for _, row in dataset.iterrows()
	]

	introductory_dataset = dataset[filter_indices]

	print(len(introductory_dataset))

	# Draw at most 50 of them without replacement; the fixed seed keeps the
	# draw reproducible.
	introductory_dataset_50 = introductory_dataset.sample(
	    n=min(50, len(introductory_dataset)), random_state=1, replace=False)
	print(len(introductory_dataset_50))

	# Renumber the sampled rows from 0, discarding the old index.
	introductory_dataset_50.reset_index(drop=True, inplace=True)

	# Stack the three samples (introductory, then interview, then codeforces)
	# into one frame with a fresh 0..n-1 index.
	sampled_frames = [
	    introductory_dataset_50,
	    interview_dataset_50,
	    codeforces_dataset_50,
	]
	selected_df = pd.concat(sampled_frames, ignore_index=True)


	def get_test_cases(input, output):
	    """Build one test-case record from a raw input/output pair.

	    Args:
	        input: Raw test input. A list (including list subclasses) is
	            collapsed into a single string by joining ``str(item)`` with
	            newlines; any other value is passed through unchanged.
	        output: Expected output. A list (including list subclasses) is kept
	            as is; any other value is wrapped in a one-element list.

	    Returns:
	        A dict with the keys ``"input"`` and ``"output"``.
	    """
	    # The parameter names shadow the ``input`` builtin; they are kept so
	    # that callers passing them by keyword keep working.
	    # isinstance() is used rather than an exact type comparison so list
	    # subclasses are treated like plain lists.
	    if isinstance(input, list):
	        joined_input = "\n".join(str(item) for item in input)
	    else:
	        joined_input = input
	    return {
	        "input": joined_input,
	        "output": output if isinstance(output, list) else [output],
	    }


	# Convert every sampled problem into an output record and write them all
	# to a single JSONL file.
	selected_datasets = []

	for _, row in selected_df.iterrows():
	    # 'input_output' is a JSON string with parallel "inputs"/"outputs" lists.
	    io_spec = json.loads(row['input_output'])
	    raw_inputs = io_spec['inputs']
	    raw_outputs = io_spec['outputs']

	    # The first two input/output pairs are exposed as sample I/O.
	    public_test_cases = [
	        get_test_cases(raw_in, raw_out)
	        for raw_in, raw_out in zip(raw_inputs[0:2], raw_outputs[0:2])
	    ]
	    # The full list (which includes those first two pairs) is the test list.
	    test_cases = [
	        get_test_cases(raw_in, raw_out)
	        for raw_in, raw_out in zip(raw_inputs, raw_outputs)
	    ]

	    selected_datasets.append({
	        "name": str(row['id']),
	        "description": str(row['question']),
	        "difficulty": str(row['difficulty']),
	        "id": int(row['id']),
	        "sample_io": public_test_cases,
	        "test_list": test_cases,
	        "starter_code": str(row['starter_code']),
	    })


	write_jsonl("./data/APPS/selected150.jsonl", selected_datasets)