| | import pandas as pd |
| | import json |
| |
|
| |
|
| | def read_jsonl(filename): |
| | """Reads a jsonl file and yields each line as a dictionary""" |
| | lines = [] |
| | |
| | with open(filename, "r", encoding="utf-8") as file: |
| | for line in file: |
| | lines.append(json.loads(line)) |
| | |
| | |
| | return lines |
| |
|
| | |
| |
|
| |
|
| | def write_jsonl(filename, lines): |
| | """Writes a python list of dictionaries into a jsonl file""" |
| | with open(filename, "w", encoding="utf-8") as file: |
| | for line in lines: |
| | file.write(json.dumps(line) + "\n") |
| |
|
| |
|
| | train_set = read_jsonl("./data/APPS/train.jsonl") |
| | test_set = read_jsonl("./data/APPS/train.jsonl") |
| |
|
| | dataset = train_set + test_set |
| |
|
| | print(len(dataset)) |
| |
|
| | dataset = pd.DataFrame(dataset) |
| | |
| |
|
| | print(dataset['difficulty'].unique()) |
| |
|
| |
|
| | |
| | filter_indices = [False] * len(dataset) |
| | for i in range(len(dataset)): |
| | row = dataset.iloc[i] |
| | if "codeforces" in row['url'] and row['input_output'] and len(json.loads(row['input_output'])["inputs"]) > 5: |
| | filter_indices[i] = True |
| |
|
| | codeforces_dataset = dataset[filter_indices] |
| |
|
| | print(len(codeforces_dataset)) |
| |
|
| | |
| | codeforces_dataset_50 = codeforces_dataset.sample(n=min(50, len(codeforces_dataset)), random_state=1, replace=False) |
| | print(len(codeforces_dataset_50)) |
| |
|
| | codeforces_dataset_50.reset_index(drop=True, inplace=True) |
| |
|
| | |
| | filter_indices = [False] * len(dataset) |
| | for i in range(len(dataset)): |
| | row = dataset.iloc[i] |
| | if "interview" == row['difficulty'] and row['input_output'] and len(row['input_output']) < 2000 and len(json.loads(row['input_output'])["inputs"]) > 5: |
| | filter_indices[i] = True |
| |
|
| | interview_dataset = dataset[filter_indices] |
| |
|
| | print(len(interview_dataset)) |
| |
|
| | |
| | interview_dataset_50 = interview_dataset.sample( |
| | n=min(50, len(interview_dataset)), random_state=1, replace=False) |
| | print(len(interview_dataset_50)) |
| |
|
| | interview_dataset_50.reset_index(drop=True, inplace=True) |
| |
|
| |
|
| | |
| | filter_indices = [False] * len(dataset) |
| | for i in range(len(dataset)): |
| | row = dataset.iloc[i] |
| | if "introductory" == row['difficulty'] and len(row['input_output']) < 2000 and len(json.loads(row['input_output'])["inputs"]) > 5: |
| | filter_indices[i] = True |
| |
|
| | introductory_dataset = dataset[filter_indices] |
| |
|
| | print(len(introductory_dataset)) |
| |
|
| | |
| | introductory_dataset_50 = introductory_dataset.sample( |
| | n=min(50, len(introductory_dataset)), random_state=1, replace=False) |
| | print(len(introductory_dataset_50)) |
| |
|
| | introductory_dataset_50.reset_index(drop=True, inplace=True) |
| |
|
| | selected_df = pd.concat([introductory_dataset_50, interview_dataset_50, codeforces_dataset_50], ignore_index=True) |
| |
|
| |
|
| | def get_test_cases(input, output): |
| | return { |
| | "input": "\n".join([str(x) for x in input]) if type(input) == list else input, |
| | "output": output if type(output) == list else [output] |
| | } |
| |
|
| |
|
| | selected_datasets = [] |
| |
|
| | for i in range(len(selected_df)): |
| | row = selected_df.iloc[i] |
| | test_cases = json.loads(row['input_output']) |
| |
|
| | public_test_cases = list( |
| | map(get_test_cases, test_cases['inputs'][0:2], test_cases['outputs'][0:2])) |
| | test_cases = list( |
| | map(get_test_cases, test_cases['inputs'], test_cases['outputs'])) |
| |
|
| | test = { |
| | "name": str(row['id']), |
| | "description": str(row['question']), |
| | "difficulty": str(row['difficulty']), |
| | "id": int(row['id']), |
| | "sample_io": public_test_cases, |
| | "test_list": test_cases, |
| | "starter_code": str(row['starter_code']), |
| | } |
| |
|
| | selected_datasets.append(test) |
| |
|
| |
|
| | write_jsonl("./data/APPS/selected150.jsonl", selected_datasets) |
| |
|
| |
|
| |
|