File size: 1,919 Bytes
# This script converts the CodeContest dataset to the xCodeEval dataset format.
import pandas as pd
import json
def read_jsonl(filename):
    """Read a JSON Lines file and return its records as a list.

    Args:
        filename: Path of the file to read; it is opened as UTF-8 text.

    Returns:
        A list with the decoded JSON value of every non-blank line, in
        file order. An empty file yields an empty list.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(filename, "r", encoding="utf-8") as file:
        # Blank or whitespace-only lines carry no record; skip them rather
        # than letting json.loads fail on an empty string.
        return [json.loads(line) for line in file if line.strip()]
# Write a python list of dictionaries into a jsonl file
def write_jsonl(filename, lines):
    """Dump every item of ``lines`` as one JSON document per line.

    The file at ``filename`` is opened for writing as UTF-8 text, so any
    previous content is replaced.
    """
    with open(filename, "w", encoding="utf-8") as out:
        out.writelines(f"{json.dumps(record)}\n" for record in lines)
# Load only the columns the conversion uses. Passing ``columns`` makes the
# reader fetch just these fields instead of loading the whole file and
# discarding the rest afterwards.
df = pd.read_parquet(
    "./data/CodeContest/validation.parquet",
    engine='pyarrow',
    columns=[
        'name', 'cf_contest_id', 'cf_tags', 'difficulty',
        'description', 'public_tests', 'private_tests', 'generated_tests',
    ],
)
def get_test_cases(input, output):
    """Build one test-case record from a raw input/output pair.

    Both values are converted with ``str``; the converted output is wrapped
    in a single-element list under the ``"output"`` key.
    """
    case = {}
    case["input"] = str(input)
    case["output"] = [str(output)]
    return case
# Turn every row of ``df`` into one record and write them all out as JSONL.
test_datasets = []
for _, problem in df.iterrows():
    public = problem['public_tests']
    private = problem['private_tests']
    generated = problem['generated_tests']

    # Public tests are stored separately as the sample I/O.
    sample_io = [
        get_test_cases(case_in, case_out)
        for case_in, case_out in zip(public['input'], public['output'])
    ]
    # Private tests come first, followed by the generated ones.
    hidden_tests = [
        get_test_cases(case_in, case_out)
        for source in (private, generated)
        for case_in, case_out in zip(source['input'], source['output'])
    ]

    test_datasets.append({
        "name": str(problem['name']),
        "description": str(problem['description']),
        "tags": list(problem['cf_tags']),
        "difficulty": int(problem['difficulty']),
        "id": int(problem['cf_contest_id']),
        "sample_io": sample_io,
        "test_list": hidden_tests,
    })

write_jsonl("./data/CodeContest/Val.jsonl", test_datasets)
|