| | |
| |
|
| | import pandas as pd |
| | import json |
| |
|
| |
|
def read_jsonl(filename):
    """Read a JSON Lines file and return its records as a list.

    Args:
        filename: Path of the JSONL file to read; it is opened as UTF-8 text.

    Returns:
        A list holding one decoded JSON value per non-blank line, in file
        order. An empty file yields an empty list.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(filename, "r", encoding="utf-8") as file:
        # Blank or whitespace-only lines (e.g. a trailing empty line) carry no
        # record and would make json.loads raise, so they are skipped.
        return [json.loads(line) for line in file if line.strip()]
| |
|
| | |
| |
|
| |
|
def write_jsonl(filename, lines):
    """Serialize each item of ``lines`` as one JSON document per line.

    Args:
        filename: Destination path; the file is created or truncated and
            written as UTF-8 text.
        lines: Iterable of JSON-serializable objects, one per output line.
    """
    with open(filename, "w", encoding="utf-8") as out:
        # Each record is followed by a newline, including the last one.
        out.writelines(json.dumps(record) + "\n" for record in lines)
| |
|
| |
|
# Load the validation split, then narrow it to the columns used by the
# conversion loop further down in this script.
df = pd.read_parquet(
    "./data/CodeContest/validation.parquet",
    engine='pyarrow',
)
df = df[[
    'name',
    'cf_contest_id',
    'cf_tags',
    'difficulty',
    'description',
    'public_tests',
    'private_tests',
    'generated_tests',
]]
| |
|
| |
|
def get_test_cases(input, output):
    """Bundle one input/output pair into a test-case dictionary.

    Args:
        input: The test input; it is converted with ``str``.
        output: The expected output; it is converted with ``str`` and
            wrapped in a single-element list.

    Returns:
        A dict with the keys ``"input"`` (a string) and ``"output"``
        (a one-item list of strings).
    """
    case = {}
    case["input"] = str(input)
    case["output"] = [str(output)]
    return case
| |
|
| |
|
# Convert every dataframe row into one record and write them all out as JSONL.
test_datasets = []

for _, row in df.iterrows():
    # Public tests are kept apart as the sample I/O shown with the problem.
    public = row['public_tests']
    sample_io = [
        get_test_cases(case_in, case_out)
        for case_in, case_out in zip(public['input'], public['output'])
    ]

    # Private tests come first, then generated tests, in one combined list.
    test_list = []
    for column in ('private_tests', 'generated_tests'):
        group = row[column]
        test_list += [
            get_test_cases(case_in, case_out)
            for case_in, case_out in zip(group['input'], group['output'])
        ]

    test_datasets.append({
        "name": str(row['name']),
        "description": str(row['description']),
        "tags": list(row['cf_tags']),
        "difficulty": int(row['difficulty']),
        "id": int(row['cf_contest_id']),
        "sample_io": sample_io,
        "test_list": test_list,
    })

write_jsonl("./data/CodeContest/Val.jsonl", test_datasets)
| |
|