| import json | |
def remove_text_after_phrase(text, phrase):
    """Return the part of *text* that precedes the first occurrence of *phrase*.

    The phrase itself and everything after it are dropped. When *phrase*
    does not occur in *text*, *text* is returned unchanged.
    """
    # str.find yields -1 when the phrase is absent.
    cut = text.find(phrase)
    return text if cut == -1 else text[:cut]
def remove_freopen_lines(code):
    """Return *code* with every line that contains ``freopen`` removed.

    The text is split with ``str.splitlines`` and the surviving lines are
    re-joined with ``"\\n"``, so the result has no trailing newline.
    """
    kept = []
    for row in code.splitlines():
        if 'freopen' in row:
            continue
        kept.append(row)
    return '\n'.join(kept)
def remove_fread_lines(code):
    """Return *code* with every line that contains ``fread`` removed.

    The text is split with ``str.splitlines`` and the surviving lines are
    re-joined with ``"\\n"``, so the result has no trailing newline.
    """
    return '\n'.join(
        filter(lambda row: 'fread' not in row, code.splitlines())
    )
| import re | |
def get_transformed_id(problem_id):
    """Derive the short id used as ``tcb_id`` from a raw problem id.

    Steps, in order:

    1. If the id contains "地底蔷薇", that title is returned directly.
    2. Every "/" is replaced by a space.
    3. Everything up to and including the first "." is dropped (if any).
    4. Everything up to and including the first "」" is dropped (if any).
    5. Surrounding whitespace is stripped.

    If nothing is left after these steps, the original id is printed and
    returned unchanged.
    """
    special_title = "地底蔷薇"
    if special_title in problem_id:
        return special_title

    remainder = problem_id.replace('/', ' ')

    # Drop the leading "<prefix>." part when a dot is present.
    _, dot, after_dot = remainder.partition('.')
    if dot:
        remainder = after_dot

    # Drop the leading "「...」" tag when a closing bracket is present.
    _, bracket, after_bracket = remainder.partition('」')
    if bracket:
        remainder = after_bracket

    remainder = remainder.strip()
    if remainder:
        return remainder

    # Nothing usable survived the clean-up: report it and fall back to the raw id.
    print(problem_id)
    return problem_id
# Absolute paths of the JSON inputs read below.
final_curr = "/home/i-luoxianzhen/data/TestCase-Gen/data/Ours/TestcaseBench-v6.json"
file1 = "/home/i-luoxianzhen/data/TestCase-Gen/data/Ours/contents_with_sample.json"
file2 = "/home/i-luoxianzhen/data/TestCase-Gen/data/Ours/balance_v3.0.dev_with_code.json"

# Load every input inside a `with` block so the file handles are closed
# deterministically instead of being left to the garbage collector.
with open(file1, "r", encoding="utf-8") as fh:
    contents_with_sample = json.load(fh)
with open(file2, "r", encoding="utf-8") as fh:
    balance_v3 = json.load(fh)

# Add to balance_v3 the entries that exist only in contents_with_sample;
# keys already present in balance_v3 keep their value.
for k, v in contents_with_sample.items():
    balance_v3.setdefault(k, v)

with open(final_curr, "r", encoding="utf-8") as fh:
    tcb_v6 = json.load(fh)

# Index the v6 items by their 'tcb_id' (a later duplicate overwrites an earlier one).
tcb_v6_dict = {item['tcb_id']: item for item in tcb_v6}

# Transformed ids of balance_v3 problems that do not appear in v6.
# NOTE(review): the name suggests these were removed on purpose — confirm.
delete_data = [
    tcb_id
    for tcb_id in map(get_transformed_id, balance_v3)
    if tcb_id not in tcb_v6_dict
]
# Set copy for O(1) membership tests in the loop below; `delete_data` itself
# stays a list in case other code relies on its order or duplicates.
deleted_ids = set(delete_data)

tcb_v7_new_data = {}
with open("/home/i-luoxianzhen/data/TestCase-Gen/data/Ours/balance_v1.0.dev_with_code.json", "r", encoding="utf-8") as fh:
    base_data = json.load(fh)

# Keep only the base problems that are neither in `delete_data` nor already in v6.
for k, v in base_data.items():
    tcb_id = get_transformed_id(k)
    if tcb_id in deleted_ids or tcb_id in tcb_v6_dict:
        continue
    tcb_v7_new_data[tcb_id] = v
print(f"add {len(tcb_v7_new_data)} items")
import re

# Heading of a numbered sample section, e.g. "样例 1" or "样例 12".
# The capture group holds the 1-based sample number.
pattern = r"^样例 (\d+)$"
sample_title_re = re.compile(pattern)

# Every distinct section title seen while building the queries.
section_title_set = set()


def _format_sample(sample):
    """Render one sample record as the input/output snippet added to the query."""
    return f"\n输入:\n{sample['inputData']}\n输出:\n{sample['outputData']}\n"


# NOTE(review): the indentation of this script was lost in the copy under
# review; the nesting below is a reconstruction — confirm against the original.

# The query is the full problem statement (title followed by every section).
ds_query = []
for key, value in tcb_v7_new_data.items():
    content = key  # the title goes first
    ques = value['content']
    # Concatenate every sectionTitle and its text.
    # NOTE(review): `no_sample` is set when a "样例" section has no sample data,
    # but nothing reads it afterwards, so such problems are still emitted —
    # confirm whether they were meant to be skipped.
    no_sample = False
    for section in ques.get("contentSections", []):
        section_title = section.get("sectionTitle", "").strip()
        section_title_set.add(section_title)
        text = section.get("text", "").strip()
        section_content = f"\n{section_title}"

        if section_title == "样例":
            samples = value['sample']
            if len(samples) > 0:
                # An un-numbered sample section lists every sample.
                section_content += "".join(_format_sample(s) for s in samples)
            else:
                print(f"{key} no test sample")
                no_sample = True
                break

        numbered = sample_title_re.match(section_title)
        if numbered:
            # Parse the whole number: taking only the last character turned
            # "样例 10" into index -1, which silently selected the last sample.
            i = int(numbered.group(1)) - 1
            if 0 <= i < len(value['sample']):
                section_content += _format_sample(value['sample'][i])
            o_text = text
            text = remove_text_after_phrase(text, "见附加文件")
            if text != o_text:
                # The section refers to attached files: leave it out entirely.
                print(f"{key} 修改 删除---见附加文件 \n{o_text}")
                continue

        if section_title == "数据范围与提示":
            o_text = text
            # Trimming of per-test-point descriptions is currently disabled:
            # text = remove_text_after_phrase(text, "各测试点具体限制如下")
            # text = remove_text_after_phrase(text, "每个测试点的具体限制见下表")
            # text = remove_text_after_phrase(text, "测试点编号")
            # text = remove_text_after_phrase(text, "| 测试点 |")
            if text != o_text:
                print(f"{key} 修改 数据范围与提示")

        section_content += f"\n{text}"
        content += section_content

    solutions = value['correct_codes']
    cpp_solutions = [sol["code"] for sol in solutions if sol['lang'] == 'cpp']
    if len(cpp_solutions) < 3:
        print(f"Warning: Less than 3 C++ solutions found for problem {key}. Found {len(cpp_solutions)} solutions.")
        continue
    selected_codes = cpp_solutions[:3]  # the first three C++ solutions

    # Strip freopen lines from the reference solutions and the wrong codes.
    # Only remove_freopen_lines is applied here; fread lines are left in place.
    right_codes = [remove_freopen_lines(code) for code in selected_codes]
    for item in value['cowrong_codes']:
        item['code'] = remove_freopen_lines(item['code'])

    content = content.replace("这是一道模板题", "")
    ds_query.append({
        "tcb_id": key,
        'query': content,
        'solutions': right_codes,
        'runtime_limit': value['timeLimit'],
        'memory_limit': value['memoryLimit'],
        "wrong_code": value['cowrong_codes']
    })
print(f"finall add {len(ds_query)}")

# NOTE: despite the ".jsonl" extension, the file holds one indented JSON array.
# The `with` block guarantees the output is flushed and closed.
with open("/home/i-luoxianzhen/data/TestCase-Gen/data/Ours/tcb_v7_add_data_no_en.jsonl", "w", encoding="utf-8") as out_file:
    json.dump(ds_query, out_file, indent=4, ensure_ascii=False)
Xet Storage Details
- Size:
- 5.38 kB
- Xet hash:
- 6f4ecbed2b8ef8fc085b4e47cf0cb8cd45b87822daa024896430c9ed4ffe8e68
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.