# File size: 3,911 Bytes

# 01f199c

import pandas as pd
import json


def read_jsonl(filename):
    """Read a JSON Lines file and return its records as a list.

    Every non-blank line of the file is parsed with ``json.loads``. Blank or
    whitespace-only lines (for example a trailing empty line) are skipped
    rather than being handed to the parser, where they would raise.

    Args:
        filename: Path of the UTF-8 encoded ``.jsonl`` file to read.

    Returns:
        A list holding one parsed JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(filename, "r", encoding="utf-8") as file:
        return [json.loads(line) for line in file if line.strip()]

# Write a python list of dictionaries into a jsonl file


def write_jsonl(filename, lines):
    """Dump every item of ``lines`` as one JSON document per line.

    The target file is opened in write mode with UTF-8 encoding, so any
    previous content is replaced. Each record is serialized with
    ``json.dumps`` and terminated by a newline.
    """
    with open(filename, "w", encoding="utf-8") as out:
        out.writelines(json.dumps(record) + "\n" for record in lines)


# Load both APPS splits and pool them into a single candidate set.
train_set = read_jsonl("./data/APPS/train.jsonl")
# The test split must come from its own file: reading train.jsonl a second
# time only duplicates every training problem in the pooled data.
test_set = read_jsonl("./data/APPS/test.jsonl")

dataset = train_set + test_set

# Total number of problems across both splits.
print(len(dataset))

dataset = pd.DataFrame(dataset)

# Show which difficulty labels occur in the pooled data.
print(dataset['difficulty'].unique())


# Keep problems whose URL mentions codeforces and whose non-empty
# `input_output` JSON lists more than 5 inputs.
filter_indices = [
    "codeforces" in url
    and bool(io_json)
    and len(json.loads(io_json)["inputs"]) > 5
    for url, io_json in zip(dataset['url'], dataset['input_output'])
]

codeforces_dataset = dataset[filter_indices]

print(len(codeforces_dataset))

# Draw up to 50 of them without replacement, using a fixed seed.
codeforces_sample_size = min(50, len(codeforces_dataset))
codeforces_dataset_50 = codeforces_dataset.sample(
    n=codeforces_sample_size,
    random_state=1,
    replace=False,
)
print(len(codeforces_dataset_50))

# Renumber the sampled rows 0..n-1, discarding the old index.
codeforces_dataset_50.reset_index(drop=True, inplace=True)

# Keep "interview" problems whose non-empty `input_output` string is shorter
# than 2000 characters and whose parsed JSON lists more than 5 inputs.
filter_indices = [
    "interview" == difficulty
    and bool(io_json)
    and len(io_json) < 2000
    and len(json.loads(io_json)["inputs"]) > 5
    for difficulty, io_json in zip(dataset['difficulty'], dataset['input_output'])
]

interview_dataset = dataset[filter_indices]

print(len(interview_dataset))

# Draw up to 50 of them without replacement, using a fixed seed.
interview_sample_size = min(50, len(interview_dataset))
interview_dataset_50 = interview_dataset.sample(
    n=interview_sample_size,
    random_state=1,
    replace=False,
)
print(len(interview_dataset_50))

# Renumber the sampled rows 0..n-1, discarding the old index.
interview_dataset_50.reset_index(drop=True, inplace=True)


# Keep "introductory" problems whose `input_output` string is non-empty,
# shorter than 2000 characters, and whose parsed JSON lists more than 5 inputs.
# The emptiness check matters: an empty string passes the length test and
# would otherwise be handed to json.loads, which raises on empty input.
filter_indices = [
    "introductory" == difficulty
    and bool(io_json)
    and len(io_json) < 2000
    and len(json.loads(io_json)["inputs"]) > 5
    for difficulty, io_json in zip(dataset['difficulty'], dataset['input_output'])
]

introductory_dataset = dataset[filter_indices]

print(len(introductory_dataset))

# Draw up to 50 of them without replacement, using a fixed seed.
introductory_dataset_50 = introductory_dataset.sample(
    n=min(50, len(introductory_dataset)), random_state=1, replace=False)
print(len(introductory_dataset_50))

# Renumber the sampled rows 0..n-1, discarding the old index.
introductory_dataset_50.reset_index(drop=True, inplace=True)

# Stack the three samples (introductory, interview, codeforces) into one frame
# with a fresh 0..n-1 index.
selected_df = pd.concat([introductory_dataset_50, interview_dataset_50, codeforces_dataset_50], ignore_index=True)


def get_test_cases(input, output):
    """Normalize one input/output pair into a test-case dictionary.

    Args:
        input: The test input. A list is flattened into a single string by
            converting each element with ``str`` and joining with newlines;
            any other value is passed through unchanged.
        output: The expected output. A list is kept as is; any other value
            is wrapped in a one-element list.

    Returns:
        A dict with the keys ``"input"`` and ``"output"``.
    """
    # isinstance (rather than an exact type comparison) also accepts list
    # subclasses, which should be treated the same way as plain lists.
    if isinstance(input, list):
        joined_input = "\n".join(str(part) for part in input)
    else:
        joined_input = input

    expected = output if isinstance(output, list) else [output]

    return {
        "input": joined_input,
        "output": expected,
    }


# Convert every selected problem into the flat record layout written to disk.
selected_datasets = []

for _, problem in selected_df.iterrows():
    io_spec = json.loads(problem['input_output'])
    all_inputs = io_spec['inputs']
    all_outputs = io_spec['outputs']

    # The first two input/output pairs are exposed as sample I/O.
    sample_cases = [
        get_test_cases(case_in, case_out)
        for case_in, case_out in zip(all_inputs[0:2], all_outputs[0:2])
    ]
    # Every pair (including the first two) goes into the full test list.
    full_cases = [
        get_test_cases(case_in, case_out)
        for case_in, case_out in zip(all_inputs, all_outputs)
    ]

    selected_datasets.append({
        "name": str(problem['id']),
        "description": str(problem['question']),
        "difficulty": str(problem['difficulty']),
        "id": int(problem['id']),
        "sample_io": sample_cases,
        "test_list": full_cases,
        "starter_code": str(problem['starter_code']),
    })


write_jsonl("./data/APPS/selected150.jsonl", selected_datasets)