from datasets import load_dataset, load_from_disk
import json


def get_codecontest():
    # Returns the raw DatasetDict; see the sketch below for mapping it onto
    # the common record schema used by the other loaders.
    ds = load_dataset("deepmind/code_contests")
    return ds
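
# A minimal sketch (not part of the original pipeline) of how the raw
# code_contests rows could be mapped onto the record schema the other loaders
# return. Field names ("name", "description", "solutions", "time_limit",
# "memory_limit_bytes") are assumed from the dataset card; verify before use.
def get_codecontest_records(split="train"):
    ds = load_dataset("deepmind/code_contests", split=split)
    records = []
    for item in ds:
        records.append({
            "problem_id": item["name"],
            "query": item["description"],
            # "solutions" is assumed to be parallel lists: {"language": [...], "solution": [...]}
            "solutions": item["solutions"]["solution"],
            "runtime_limit": item["time_limit"],  # duration struct; may need conversion to seconds
            "memory_limit": item["memory_limit_bytes"],
        })
    return records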

def get_ucaso():
    # Available fields in the USACO dump:
    # 'name', 'problem_link', 'test_data_link', 'solution_link', 'contest_link',
    # 'inner_contest_link', 'problem_level', 'cp_id', 'problem_id', 'description',
    # 'num_tests', 'solution', 'runtime_limit_sentences', 'memory_limit_sentences',
    # 'runtime_limit', 'memory_limit', 'samples', 'description_no_samples',
    # 'num_samples', 'description_raw', 'input_format', 'output_format'
    dataset_path = "/home/jyhuang/datasets/data_copy/datasets/usaco_v3"
    ds = load_from_disk(dataset_path)
    dataset_list = []
    for item in ds:
        dataset_list.append({
            "problem_id": item["problem_id"],
            "query": item["description"],
            "solution": item["solution"],
            "runtime_limit": item["runtime_limit"],
            "memory_limit": item["memory_limit"],
        })
    return dataset_list
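
# Usage sketch (assumes the USACO dump exists at dataset_path above):
#   records = get_ucaso()
#   print(len(records), records[0]["problem_id"])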

def read_json(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)  # parse the whole JSON file into Python data
    return data

def resolve_ours():
    question_file = "/home/i-luoxianzhen/data/TestCase-Gen/data/Ours/contents_with_sample.json"
    ans_file = "./data/Ours/cor_ans.json"
    # Load the question and answer JSON files
    with open(question_file, "r", encoding="utf-8") as f:
        data = json.load(f)
    with open(ans_file, "r", encoding="utf-8") as f:
        ans_data = json.load(f)
    questions_list = []
    for key, item in data.items():
        content = key  # start with the problem title
        ques = item['content']
        # Concatenate every sectionTitle and its text
        no_sample = False
        for section in ques.get("contentSections", []):
            section_title = section.get("sectionTitle", "").strip()
            text = section.get("text", "").strip()
            content += f"\n{section_title}"
            if section_title == "样例":  # the "Samples" section
                if len(item['sample']) > 0:
                    content += f"\n输入:\n{item['sample'][0]['inputData']}\n输出:\n{item['sample'][0]['outputData']}\n"
                else:
                    print(f"{key} has no test sample")
                    no_sample = True
                    break
            content += f"\n{text}"
        if no_sample:
            continue
        # Keep only solutions where lang is 'cpp' and select the first 3
        solutions = ans_data[key]
        cpp_solutions = [sol["code"] for sol in solutions if sol['lang'] == 'cpp']
        if len(cpp_solutions) < 3:
            print(f"Warning: fewer than 3 C++ solutions found for problem {key} (found {len(cpp_solutions)}).")
            continue
        selected_codes = cpp_solutions[:3]  # select top 3 C++ solutions
        questions_list.append({
            "problem_id": key,
            "query": content,
            "solutions": selected_codes,
            # TODO: missing time/memory limits
            "runtime_limit": item["timeLimit"],
            "memory_limit": item["memoryLimit"],
        })
    return questions_list
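
# Illustrative shape of each record returned by resolve_ours():
#   {"problem_id": "<title key>",
#    "query": "<title + section texts + first sample I/O>",
#    "solutions": [cpp_code_1, cpp_code_2, cpp_code_3],
#    "runtime_limit": <timeLimit>, "memory_limit": <memoryLimit>}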

def get_ours():
    dataset_file = "/home/luoxianzhen/yang/data/Ours/TestcaseBench-v28.json"
    with open(dataset_file, "r", encoding="utf-8") as f:
        data = json.load(f)
    return data

def get_datasets_by_name(dataset_name):
    if dataset_name == "ucaso":
        return get_ucaso()
    elif dataset_name == "ours":
        return get_ours()
    else:
        raise ValueError(f"Unknown dataset name: {dataset_name}")

def write_json_to_file(data, filepath):
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)  # ensure_ascii=False keeps Chinese text readable

if __name__ == "__main__":
    # res = resolve_ours()
    # write_json_to_file(res, "./data/Ours/datasets_v1.json")
    res = get_ours()
    print(res[0]['query'])
    # for item in res:
    #     if item["problem_id"] == "#6183. 看无可看":
    #         print(len(item["query"]))
    # print(len(read_json("/home/i-luoxianzhen/data/TestCase-Gen/saved_tests/lcb/lcbtests-ours-3final.json")))