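# DebateCoder / src / datasets / convert-apps-xcode.py
# Web file-viewer header captured together with the source (not part of the
# program): uploader "smallan13", commit "Upload 105 files", revision 01f199c
# (verified), file size 3.91 kB.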
import pandas as pd
import json
def read_jsonl(filename):
    """Read a JSON Lines file and return its records as a list.

    Args:
        filename: Path of the ``.jsonl`` file to read (opened as UTF-8).

    Returns:
        A list holding the decoded JSON value of every non-blank line,
        in file order. An empty file yields an empty list.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(filename, "r", encoding="utf-8") as file:
        # Blank lines (for example a stray empty line at the end of the file)
        # carry no record and would make json.loads raise, so skip them.
        return [json.loads(line) for line in file if line.strip()]
# Write a python list of dictionaries into a jsonl file
def write_jsonl(filename, lines):
    """Dump every item of ``lines`` as one JSON document per line of ``filename``.

    The file is opened for writing as UTF-8, replacing any previous content,
    and each serialized record is terminated by a newline.
    """
    with open(filename, "w", encoding="utf-8") as out:
        out.writelines(json.dumps(record) + "\n" for record in lines)
# Load the APPS splits and pool them into a single table.
train_set = read_jsonl("./data/APPS/train.jsonl")
# Read the test split from its own file: loading train.jsonl a second time
# only duplicated every training problem in the pool.
# NOTE(review): assumes the test split is stored as test.jsonl next to
# train.jsonl -- confirm the file name in the data directory.
test_set = read_jsonl("./data/APPS/test.jsonl")
dataset = train_set + test_set
print(len(dataset))
dataset = pd.DataFrame(dataset)
# Show which difficulty labels occur in the pooled data.
print(dataset['difficulty'].unique())
# Keep the problems whose URL mentions codeforces and whose non-empty
# input_output JSON lists more than 5 inputs.
filter_indices = [
    bool(
        "codeforces" in url
        and io_json
        and len(json.loads(io_json)["inputs"]) > 5
    )
    for url, io_json in zip(dataset['url'], dataset['input_output'])
]
codeforces_dataset = dataset[filter_indices]
print(len(codeforces_dataset))

# Draw at most 50 of them without replacement, using a fixed seed so the
# selection is reproducible.
codeforces_dataset_50 = codeforces_dataset.sample(
    n=min(50, len(codeforces_dataset)),
    random_state=1,
    replace=False,
)
print(len(codeforces_dataset_50))
codeforces_dataset_50 = codeforces_dataset_50.reset_index(drop=True)
# Keep the "interview" problems whose non-empty input_output string is
# shorter than 2000 characters and lists more than 5 inputs.
filter_indices = []
for difficulty, io_json in zip(dataset['difficulty'], dataset['input_output']):
    keep = (
        "interview" == difficulty
        and io_json
        and len(io_json) < 2000
        and len(json.loads(io_json)["inputs"]) > 5
    )
    filter_indices.append(bool(keep))
interview_dataset = dataset[filter_indices]
print(len(interview_dataset))

# Draw at most 50 of them without replacement, using a fixed seed so the
# selection is reproducible.
interview_dataset_50 = interview_dataset.sample(
    n=min(50, len(interview_dataset)),
    random_state=1,
    replace=False,
)
print(len(interview_dataset_50))
interview_dataset_50 = interview_dataset_50.reset_index(drop=True)
# Keep the "introductory" problems whose non-empty input_output string is
# shorter than 2000 characters and lists more than 5 inputs.
filter_indices = [
    bool(
        "introductory" == difficulty
        # Same emptiness guard as the codeforces and interview filters: an
        # empty or missing input_output would otherwise crash len() or
        # json.loads() instead of simply excluding the problem.
        and io_json
        and len(io_json) < 2000
        and len(json.loads(io_json)["inputs"]) > 5
    )
    for difficulty, io_json in zip(dataset['difficulty'], dataset['input_output'])
]
introductory_dataset = dataset[filter_indices]
print(len(introductory_dataset))

# Draw at most 50 of them without replacement, using a fixed seed so the
# selection is reproducible.
introductory_dataset_50 = introductory_dataset.sample(
    n=min(50, len(introductory_dataset)), random_state=1, replace=False)
print(len(introductory_dataset_50))
introductory_dataset_50.reset_index(drop=True, inplace=True)
# Stack the three samples (introductory, then interview, then codeforces)
# into one frame whose index is renumbered from 0.
selected_df = pd.concat(
    [
        introductory_dataset_50,
        interview_dataset_50,
        codeforces_dataset_50,
    ],
    ignore_index=True,
)
def get_test_cases(input, output):
    """Build one test-case record from a raw input/output pair.

    Args:
        input: Raw input. A list is flattened into a single string with one
            ``str()``-converted element per line; any other value is passed
            through unchanged. (The name shadows the builtin but is kept so
            existing callers are unaffected.)
        output: Raw expected output. A list is kept as is; any other value
            is wrapped in a one-element list.

    Returns:
        A dict with the keys ``"input"`` and ``"output"``.
    """
    # isinstance, unlike an exact type comparison, also recognises list
    # subclasses as lists.
    return {
        "input": "\n".join(str(x) for x in input) if isinstance(input, list) else input,
        "output": output if isinstance(output, list) else [output],
    }
# Convert every selected problem into the output record layout and write
# the records out as JSON Lines.
selected_datasets = []
for _, problem in selected_df.iterrows():
    io_spec = json.loads(problem['input_output'])
    inputs, outputs = io_spec['inputs'], io_spec['outputs']
    # The first two input/output pairs are exposed as the sample cases; the
    # full list (including those two) is stored as the test list.
    sample_cases = [
        get_test_cases(raw_in, raw_out)
        for raw_in, raw_out in zip(inputs[0:2], outputs[0:2])
    ]
    all_cases = [
        get_test_cases(raw_in, raw_out)
        for raw_in, raw_out in zip(inputs, outputs)
    ]
    selected_datasets.append({
        "name": str(problem['id']),
        "description": str(problem['question']),
        "difficulty": str(problem['difficulty']),
        "id": int(problem['id']),
        "sample_io": sample_cases,
        "test_list": all_cases,
        "starter_code": str(problem['starter_code']),
    })
write_jsonl("./data/APPS/selected150.jsonl", selected_datasets)