Buckets:

Tsukihjy
/

testcase

Files

xet

Tsukihjy/testcase / methods /utils /dataset_all.py

Tsukihjy

about 1 month ago

download

raw

4.26 kB

	from datasets import load_dataset, load_from_disk

	def get_codecontest():
	    """Load the ``deepmind/code_contests`` dataset and return it unchanged.

	    Returns:
	        Whatever ``load_dataset("deepmind/code_contests")`` returns; no
	        filtering or reshaping is applied here.
	    """
	    return load_dataset("deepmind/code_contests")


	def get_ucaso(dataset_path="/home/jyhuang/datasets/data_copy/datasets/usaco_v3"):
	    """Load the on-disk USACO dataset and keep only the fields used downstream.

	    Args:
	        dataset_path: Directory handed to ``load_from_disk``. Defaults to the
	            original hard-coded location, so zero-argument callers keep working.

	    Returns:
	        A list with one dict per dataset row, holding ``problem_id``,
	        ``query`` (the row's ``description``), ``solution``,
	        ``runtime_limit`` and ``memory_limit``.
	    """
	    # Columns the original author listed as available on each row:
	    # 'name', 'problem_link', 'test_data_link', 'solution_link', 'contest_link', 'inner_contest_link',
	    # 'problem_level', 'cp_id', 'problem_id', 'description', 'num_tests', 'solution', 'runtime_limit_sentences',
	    # 'memory_limit_sentences', 'runtime_limit', 'memory_limit', 'samples', 'description_no_samples', 'num_samples',
	    # 'description_raw', 'input_format', 'output_format'
	    ds = load_from_disk(dataset_path)
	    # Iterate rows directly instead of indexing through range(len(ds)).
	    return [
	        {
	            "problem_id": item["problem_id"],
	            "query": item["description"],
	            "solution": item["solution"],
	            "runtime_limit": item["runtime_limit"],
	            "memory_limit": item["memory_limit"],
	        }
	        for item in ds
	    ]

	import json

	def read_json(file_path):
	    """Read the UTF-8 JSON file at ``file_path`` and return the parsed Python data."""
	    with open(file_path, 'r', encoding='utf-8') as handle:
	        # Parse the whole document in one go.
	        return json.load(handle)

	def _build_query(key, item):
	    """Assemble the statement text for one problem; return None when its sample is missing."""
	    parts = [key]  # the title goes first
	    # Append every section's title and text in order.
	    for section in item['content'].get("contentSections", []):
	        section_title = section.get("sectionTitle", "").strip()
	        text = section.get("text", "").strip()
	        parts.append(f"\n{section_title}")
	        if section_title == "样例":
	            samples = item['sample']
	            if not samples:
	                print(f"{key} no test sample")
	                return None
	            first = samples[0]
	            # Only the first sample is embedded in the statement.
	            parts.append(f"\n输入:\n{first['inputData']}\n输出:\n{first['outputData']}\n")
	        parts.append(f"\n{text}")
	    return "".join(parts)


	def resolve_ours(question_file="/home/i-luoxianzhen/data/TestCase-Gen/data/Ours/contents_with_sample.json",
	                 ans_file="./data/Ours/cor_ans.json"):
	    """Build the problem list from the raw question file and the reference-answer file.

	    Args:
	        question_file: JSON file mapping problem title to its raw record
	            (``content``, ``sample``, ``timeLimit``, ``memoryLimit``).
	            Defaults to the original hard-coded path.
	        ans_file: JSON file mapping problem title to a list of solutions,
	            each with ``lang`` and ``code``. Defaults to the original path.

	    Returns:
	        A list of dicts with ``problem_id``, ``query``, ``solutions`` (the
	        first three C++ solutions), ``runtime_limit`` and ``memory_limit``.
	        Problems are skipped, with a printed message, when their sample
	        section has no sample or when fewer than three C++ solutions exist.
	        A problem absent from ``ans_file`` counts as having zero solutions.
	    """
	    # Read both JSON files.
	    with open(question_file, "r", encoding="utf-8") as f:
	        data = json.load(f)
	    with open(ans_file, "r", encoding="utf-8") as f:
	        ans_data = json.load(f)

	    questions_list = []
	    for key, item in data.items():
	        content = _build_query(key, item)
	        if content is None:
	            continue

	        # Keep only C++ solutions; a problem missing from the answers file is
	        # treated as having none rather than raising KeyError.
	        cpp_solutions = [sol["code"] for sol in ans_data.get(key, []) if sol['lang'] == 'cpp']
	        if len(cpp_solutions) < 3:
	            print(f"Warning: Less than 3 C++ solutions found for problem {key}. Found {len(cpp_solutions)} solutions.")
	            continue

	        questions_list.append({
	            "problem_id": key,
	            "query": content,
	            "solutions": cpp_solutions[:3],  # first three C++ solutions
	            # NOTE(review): the original TODO said the time/memory limits were
	            # missing, yet they are read from the record here - confirm and drop the TODO.
	            "runtime_limit": item["timeLimit"],
	            "memory_limit": item["memoryLimit"],
	        })

	    return questions_list


	def get_ours(dataset_file="/home/luoxianzhen/yang/data/Ours/TestcaseBench-v28.json"):
	    """Load the dataset JSON file and return its parsed content.

	    Args:
	        dataset_file: Path of the UTF-8 JSON file to read. Defaults to the
	            original hard-coded location, so zero-argument callers keep working.

	    Returns:
	        The parsed JSON data, exactly as ``json.load`` produces it.
	    """
	    with open(dataset_file, "r", encoding="utf-8") as f:
	        return json.load(f)


	def get_datasets_by_name(dataset_name):
	    """Return the dataset registered under ``dataset_name``.

	    ``"ucaso"`` is served by ``get_ucaso()`` and ``"ours"`` by ``get_ours()``;
	    any other name yields ``None``.
	    """
	    if dataset_name == "ours":
	        return get_ours()
	    if dataset_name == "ucaso":
	        return get_ucaso()
	    return None


	def write_json_to_file(data, filepath):
	    """Serialize ``data`` as 4-space-indented JSON into ``filepath`` (UTF-8, non-ASCII kept as-is)."""
	    with open(filepath, 'w', encoding='utf-8') as out:
	        json.dump(data, out, indent=4, ensure_ascii=False)

	if __name__ == "__main__":
	    # Quick manual check: load the dataset through get_ours() and print the
	    # statement of its first problem.
	    # (To rebuild the dataset file instead, call resolve_ours() and pass the
	    # result to write_json_to_file(res, "./data/Ours/datasets_v1.json").)
	    dataset = get_ours()
	    print(dataset[0]['query'])

Xet Storage Details

Size: 4.26 kB
Xet hash: 81919d531d74de8b18a795766c18e47467a666ff27f84ae62a51f86638c5349a

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.