| import json
|
|
|
| from datasets import concatenate_datasets, load_dataset
|
|
|
| from rllm.data.dataset import DatasetRegistry
|
| from rllm.data.utils import fetch_live_code_bench_system_prompt
|
|
|
|
|
| def prepare_deepcoder_data(train_size: int = None, test_size: int = None):
|
| train_dataset = concatenate_datasets([load_dataset("agentica-org/DeepCoder-Preview-Dataset", name="primeintellect", split="train"), load_dataset("agentica-org/DeepCoder-Preview-Dataset", name="taco", split="train"), load_dataset("agentica-org/DeepCoder-Preview-Dataset", name="lcbv5", split="train")])
|
| test_dataset = concatenate_datasets([load_dataset("agentica-org/DeepCoder-Preview-Dataset", name="codeforces", split="test"), load_dataset("agentica-org/DeepCoder-Preview-Dataset", name="lcbv5", split="test")])
|
|
|
| def preprocess_fn(example, idx):
|
| starter_code = example.get("starter_code", "")
|
| question = fetch_live_code_bench_system_prompt(example["problem"], starter_code if starter_code else None)
|
|
|
| tests_raw = example["tests"]
|
|
|
| if isinstance(tests_raw, str):
|
| tests = json.loads(tests_raw)
|
| else:
|
| tests = tests_raw
|
| metadata = example.get("metadata", {})
|
|
|
|
|
| if isinstance(tests, dict) and "inputs" in tests and "outputs" in tests:
|
| normalized_tests = []
|
| for input_val, output_val in zip(tests["inputs"], tests["outputs"], strict=False):
|
| normalized_tests.append({"input": input_val, "output": output_val, "testtype": "stdin_stdout"})
|
| tests = normalized_tests
|
|
|
|
|
| if not isinstance(tests, list):
|
| tests = [tests] if tests else []
|
|
|
| for test in tests:
|
| if test.get("testtype") == "functional" and metadata.get("func_name") is not None:
|
| test["metadata"] = {"func_name": str(metadata["func_name"])}
|
| else:
|
| test["metadata"] = {"func_name": None}
|
|
|
| return {"question": question, "ground_truth": json.dumps(tests), "data_source": "livecodebench", "uid": f"deepcoder_{idx}", "index": idx, "starter_code": starter_code, "metadata": json.dumps(metadata)}
|
|
|
| if train_size:
|
| train_dataset = train_dataset.select(range(min(train_size, len(train_dataset))))
|
| if test_size:
|
| test_dataset = test_dataset.select(range(min(test_size, len(test_dataset))))
|
|
|
| train_dataset = train_dataset.map(preprocess_fn, with_indices=True, writer_batch_size=10, num_proc=16)
|
| test_dataset = test_dataset.map(preprocess_fn, with_indices=True, writer_batch_size=10, num_proc=16)
|
| train_dataset = DatasetRegistry.register_dataset("deepcoder", train_dataset, "train")
|
| test_dataset = DatasetRegistry.register_dataset("deepcoder", test_dataset, "test")
|
|
|
| return train_dataset, test_dataset
|
|
|
|
|
| if __name__ == "__main__":
|
| train_dataset, test_dataset = prepare_deepcoder_data()
|
| print(f" - Train dataset: {len(train_dataset.get_data())} examples")
|
| print(f" - Test dataset: {len(test_dataset.get_data())} examples")
|
| print(train_dataset.get_data()[0])
|
|
|