Tsukihjy/testcase / methods /utils /dataset_all.py
Tsukihjy's picture
download
raw
4.26 kB
from datasets import load_dataset, load_from_disk
def get_codecontest():
    """Load the ``deepmind/code_contests`` dataset via Hugging Face ``datasets``.

    Returns:
        The object returned by ``load_dataset("deepmind/code_contests")``,
        passed through unchanged (no per-problem reshaping is done here).
    """
    return load_dataset("deepmind/code_contests")
def get_ucaso(dataset_path="/home/jyhuang/datasets/data_copy/datasets/usaco_v3"):
    """Load the on-disk USACO dataset and project each row to a compact dict.

    Columns available on each stored row, as noted by the original author:
    'name', 'problem_link', 'test_data_link', 'solution_link', 'contest_link',
    'inner_contest_link', 'problem_level', 'cp_id', 'problem_id',
    'description', 'num_tests', 'solution', 'runtime_limit_sentences',
    'memory_limit_sentences', 'runtime_limit', 'memory_limit', 'samples',
    'description_no_samples', 'num_samples', 'description_raw',
    'input_format', 'output_format'.

    Args:
        dataset_path: Location handed to ``load_from_disk``. Defaults to the
            path that used to be hard-coded in this function.

    Returns:
        A list with one dict per row, holding the keys ``problem_id``,
        ``query`` (taken from the ``description`` column), ``solution``,
        ``runtime_limit`` and ``memory_limit``.
    """
    ds = load_from_disk(dataset_path)
    # Iterate the rows directly instead of indexing with range(len(ds)).
    return [
        {
            "problem_id": item["problem_id"],
            "query": item["description"],
            "solution": item["solution"],
            "runtime_limit": item["runtime_limit"],
            "memory_limit": item["memory_limit"],
        }
        for item in ds
    ]
import json
def read_json(file_path):
    """Parse the UTF-8 encoded JSON file at *file_path*.

    Returns:
        The Python object decoded from the whole file.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        # Decode the entire document in one go.
        parsed = json.load(handle)
    return parsed
def resolve_ours(
    question_file="/home/i-luoxianzhen/data/TestCase-Gen/data/Ours/contents_with_sample.json",
    ans_file="./data/Ours/cor_ans.json",
):
    """Build the "ours" problem list from the raw question and answer files.

    The question file is a JSON object keyed by problem title. Each value is
    read for:
      * ``content`` - a dict whose optional ``contentSections`` list holds
        dicts with ``sectionTitle`` and ``text``;
      * ``sample`` - a list of dicts with ``inputData`` / ``outputData``
        (only consulted when a section titled "样例" is present);
      * ``timeLimit`` and ``memoryLimit``.
    The answer file is a JSON object keyed by the same titles, each value a
    list of dicts with ``lang`` and ``code``.

    A problem is skipped (with a message on stdout) when it has a "样例"
    section but no sample, or when fewer than three ``cpp`` solutions are
    available. A problem missing from the answer file counts as having zero
    solutions instead of raising ``KeyError``.

    Args:
        question_file: Path of the question JSON file. Defaults to the
            location that used to be hard-coded here.
        ans_file: Path of the answer JSON file. Defaults to the location
            that used to be hard-coded here.

    Returns:
        A list of dicts with keys ``problem_id``, ``query``, ``solutions``
        (the first three C++ codes), ``runtime_limit`` and ``memory_limit``.
    """
    with open(question_file, "r", encoding="utf-8") as f:
        data = json.load(f)
    with open(ans_file, "r", encoding="utf-8") as f:
        ans_data = json.load(f)

    questions_list = []
    for key, item in data.items():
        # The query text opens with the problem title (the JSON key).
        parts = [key]
        has_sample = True

        # Append every section as "<title>\n<text>"; the sample section also
        # gets the first sample's input/output placed between title and text.
        for section in item["content"].get("contentSections", []):
            section_title = section.get("sectionTitle", "").strip()
            text = section.get("text", "").strip()
            parts.append(f"\n{section_title}")
            if section_title == "样例":
                samples = item["sample"]
                if not samples:
                    print(f"{key} no test sample")
                    has_sample = False
                    break
                first = samples[0]
                parts.append(
                    f"\n输入:\n{first['inputData']}\n输出:\n{first['outputData']}\n"
                )
            parts.append(f"\n{text}")
        if not has_sample:
            continue

        # Keep only C++ solutions; a problem absent from the answer file is
        # handled like one with no solutions (warned about and skipped).
        cpp_solutions = [
            sol["code"] for sol in ans_data.get(key, []) if sol["lang"] == "cpp"
        ]
        if len(cpp_solutions) < 3:
            print(f"Warning: Less than 3 C++ solutions found for problem {key}. Found {len(cpp_solutions)} solutions.")
            continue

        questions_list.append({
            "problem_id": key,
            "query": "".join(parts),
            "solutions": cpp_solutions[:3],
            # NOTE(review): an older TODO here said the time/memory limits
            # were missing; they are now copied from the question entry.
            # Units are not visible from this file - confirm upstream.
            "runtime_limit": item["timeLimit"],
            "memory_limit": item["memoryLimit"],
        })
    return questions_list
def get_ours(dataset_file="/home/luoxianzhen/yang/data/Ours/TestcaseBench-v28.json"):
    """Load the prepared "ours" benchmark from a JSON file.

    Args:
        dataset_file: Path of the UTF-8 JSON file to read. Defaults to the
            location that used to be hard-coded in this function.

    Returns:
        The decoded JSON content, unmodified.
    """
    with open(dataset_file, "r", encoding="utf-8") as f:
        return json.load(f)
def get_datasets_by_name(dataset_name):
    """Return the dataset selected by *dataset_name*.

    Recognised names are ``"ucaso"`` (served by ``get_ucaso``) and ``"ours"``
    (served by ``get_ours``). Any other value yields ``None``.
    """
    if dataset_name == "ucaso":
        return get_ucaso()
    if dataset_name == "ours":
        return get_ours()
    # Unrecognised name: nothing to load.
    return None
def write_json_to_file(data, filepath):
    """Write *data* to *filepath* as UTF-8 JSON.

    Non-ASCII characters are written as-is (not escaped) and the output is
    pretty-printed with a 4-space indent. An existing file is overwritten.
    """
    dump_options = {"ensure_ascii": False, "indent": 4}
    with open(filepath, mode='w', encoding='utf-8') as sink:
        json.dump(data, sink, **dump_options)
if __name__ == "__main__":
    # Disabled: rebuild the dataset from the raw files and save it.
    # records = resolve_ours()
    # write_json_to_file(records, "./data/Ours/datasets_v1.json")

    # Smoke check: load the prepared benchmark and show the first query text.
    records = get_ours()
    first_record = records[0]
    print(first_record['query'])

    # Disabled: inspect the query length of one specific problem.
    # for record in records:
    #     if record["problem_id"] == "#6183. 看无可看":
    #         print(len(record["query"]))

    # Disabled: count the entries of a saved test file.
    # print(len(read_json("/home/i-luoxianzhen/data/TestCase-Gen/saved_tests/lcb/lcbtests-ours-3final.json")))

Xet Storage Details

Size:
4.26 kB
·
Xet hash:
81919d531d74de8b18a795766c18e47467a666ff27f84ae62a51f86638c5349a

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.