File size: 1,919 Bytes

01f199c

# Using this python file we have converted the code contest dataset to the format of the xCodeEval dataset.

import pandas as pd
import json


def read_jsonl(filename):
    """Reads a jsonl file and yields each line as a dictionary"""
    lines = []
    # i = 0
    with open(filename, "r", encoding="utf-8") as file:
        for line in file:
            lines.append(json.loads(line))
            # i += 1
            # print(i)
    return lines

# Write a python list of dictionaries into a jsonl file


def write_jsonl(filename, lines):
    """Writes a python list of dictionaries into a jsonl file"""
    with open(filename, "w", encoding="utf-8") as file:
        for line in lines:
            file.write(json.dumps(line) + "\n")


df = pd.read_parquet("./data/CodeContest/validation.parquet", engine='pyarrow')
df = df[['name', 'cf_contest_id', 'cf_tags', 'difficulty',
         'description', 'public_tests', 'private_tests', 'generated_tests']]


def get_test_cases(input, output):
    return {
        "input": str(input),
        "output": [str(output)]
    }


test_datasets = []

for i in range(len(df)):
    row = df.iloc[i]

    public_test_cases = list(
        map(get_test_cases, row['public_tests']['input'], row['public_tests']['output']))
    test_cases = []
    test_cases.extend(list(map(
        get_test_cases, row['private_tests']['input'], row['private_tests']['output'])))
    test_cases.extend(list(map(
        get_test_cases, row['generated_tests']['input'], row['generated_tests']['output'])))

    test = {
        "name": str(row['name']),
        "description": str(row['description']),
        "tags": list(row['cf_tags']),
        "difficulty": int(row['difficulty']),
        "id": int(row['cf_contest_id']),
        "sample_io": public_test_cases,
        "test_list": test_cases
    }

    test_datasets.append(test)


write_jsonl("./data/CodeContest/Val.jsonl", test_datasets)