# Tsukihjy/testcase / testcase-data/Ours/get_delete_sample.py
# (web file-viewer residue: "Tsukihjy's picture", "download", "raw"; file size 5.38 kB)
import json
def remove_text_after_phrase(text, phrase):
    """Return the part of ``text`` that precedes the first ``phrase``.

    Args:
        text: The string to truncate.
        phrase: The marker to look for; the marker itself is dropped too.

    Returns:
        ``text`` cut right before the first occurrence of ``phrase``, or
        ``text`` unchanged when ``phrase`` does not occur (or is empty).
    """
    # An empty marker "matches" at index 0 and would erase the whole text;
    # treat it as "nothing to cut".
    if not phrase:
        return text
    index = text.find(phrase)
    if index == -1:
        # Marker not found: return the original text.
        return text
    # Marker found: keep only what comes before it.
    return text[:index]
def _drop_lines_containing(code, needle):
    """Return ``code`` without any line that contains ``needle``.

    The text is split with ``str.splitlines`` and re-joined with ``'\\n'``,
    so every line terminator is normalised to ``'\\n'`` and a trailing
    newline is not kept.
    """
    return '\n'.join(line for line in code.splitlines() if needle not in line)


def remove_freopen_lines(code):
    """Return ``code`` with every line mentioning ``freopen`` removed.

    The match is a plain substring test on each line, so the whole line is
    dropped even if it contains other statements.
    """
    return _drop_lines_containing(code, 'freopen')


def remove_fread_lines(code):
    """Return ``code`` with every line mentioning ``fread`` removed.

    The match is a plain substring test on each line, so the whole line is
    dropped even if it contains other statements.
    """
    return _drop_lines_containing(code, 'fread')
import re
def get_transformed_id(problem_id):
    """Derive the ``tcb_id`` string from a raw problem key.

    Steps, in order:
      1. Any key containing "地底蔷薇" maps to exactly "地底蔷薇".
      2. Every '/' is replaced by a space.
      3. Everything up to and including the first '.' is dropped
         (no-op when the key has no '.').
      4. Everything up to and including the first '」' is dropped
         (no-op when there is no '」').
      5. Surrounding whitespace is stripped.

    Args:
        problem_id: The raw key string.

    Returns:
        The transformed id. If the transformation leaves nothing, the raw
        ``problem_id`` is printed and returned unchanged instead.
    """
    if "地底蔷薇" in problem_id:
        return "地底蔷薇"

    tcb_id = problem_id.replace('/', ' ')

    # Drop the prefix up to the first '.'; partition leaves the string
    # untouched when no '.' is present.
    _, dot, after_dot = tcb_id.partition('.')
    if dot:
        tcb_id = after_dot

    # Drop the leading part that ends with the first closing bracket '」'.
    _, bracket, after_bracket = tcb_id.partition('」')
    if bracket:
        tcb_id = after_bracket

    tcb_id = tcb_id.strip()
    if tcb_id == "":
        # Nothing left after the transformation: report the key and fall
        # back to it as-is.
        print(problem_id)
        return problem_id
    return tcb_id
def _load_json(path):
    """Parse the UTF-8 JSON file at ``path``, closing the file before returning."""
    with open(path, "r", encoding="utf-8") as handle:
        return json.load(handle)


final_curr = "/home/i-luoxianzhen/data/TestCase-Gen/data/Ours/TestcaseBench-v6.json"
file1 = "/home/i-luoxianzhen/data/TestCase-Gen/data/Ours/contents_with_sample.json"
file2 = "/home/i-luoxianzhen/data/TestCase-Gen/data/Ours/balance_v3.0.dev_with_code.json"
base_file = "/home/i-luoxianzhen/data/TestCase-Gen/data/Ours/balance_v1.0.dev_with_code.json"

contents_with_sample = _load_json(file1)
balance_v3 = _load_json(file2)
# Merge in the keys that exist only in contents_with_sample; entries already
# present in balance_v3 are left untouched.
for k, v in contents_with_sample.items():
    balance_v3.setdefault(k, v)

# Index the v6 items by their 'tcb_id' field.
tcb_v6 = _load_json(final_curr)
tcb_v6_dict = {item['tcb_id']: item for item in tcb_v6}

# Transformed ids of the merged balance_v3 keys that are absent from v6.
delete_data = []
for k in balance_v3:
    tcb_id = get_transformed_id(k)
    if tcb_id not in tcb_v6_dict:
        delete_data.append(tcb_id)
# Set copy for O(1) membership tests in the loop below; delete_data itself
# stays a list.
delete_ids = set(delete_data)

# Keep the base entries whose transformed id is neither in delete_data nor
# already in v6. A later key with the same transformed id overwrites an
# earlier one.
tcb_v7_new_data = {}
base_data = _load_json(base_file)
for k, v in base_data.items():
    tcb_id = get_transformed_id(k)
    if tcb_id in delete_ids or tcb_id in tcb_v6_dict:
        continue
    tcb_v7_new_data[tcb_id] = v
print(f"add {len(tcb_v7_new_data)} items")
import re

# Section titles of the form "样例 <number>"; the captured group is the
# 1-based sample number (may have more than one digit).
pattern = r"^样例 (\d+)$"
section_title_set = set()


def _format_sample(sample):
    """Render one sample dict as the input/output snippet embedded in the query."""
    return f"\n输入:\n{sample['inputData']}\n输出:\n{sample['outputData']}\n"


# Build one query per problem from the full statement text: the title first,
# then every content section (title + text), with sample data spliced into
# the sample sections.
# NOTE(review): SOURCE's indentation was lost; the nesting below (in
# particular the `continue` under the "见附加文件" check) is a reconstruction
# and should be confirmed against the original file.
ds_query = []
for key, value in tcb_v7_new_data.items():
    content = key  # the title goes first
    ques = value['content']

    for section in ques.get("contentSections", []):
        section_title = section.get("sectionTitle", "").strip()
        section_title_set.add(section_title)
        text = section.get("text", "").strip()
        section_content = f"\n{section_title}"

        if section_title == "样例":
            samples = value['sample']
            if len(samples) > 0:
                section_content += "".join(_format_sample(s) for s in samples)
            else:
                # Stop reading further sections of this problem. The problem
                # is still exported below with the sections gathered so far.
                print(f"{key} no test sample")
                break

        numbered = re.match(pattern, section_title)
        if numbered:
            # Use the full captured number, not just the last character of
            # the title, and reject out-of-range numbers (including 0, which
            # would otherwise index from the end of the list).
            i = int(numbered.group(1)) - 1
            samples = value['sample']
            if 0 <= i < len(samples):
                section_content += _format_sample(samples[i])
            if "见附加文件" in text:
                # The section text contains "见附加文件"; the whole section
                # is left out of the query.
                print(f"{key} 修改 删除---见附加文件 \n{text}")
                continue

        # The "数据范围与提示" section is currently kept verbatim; truncating
        # it at the per-test-point tables is disabled.
        section_content += f"\n{text}"
        content += section_content

    cpp_solutions = [sol["code"] for sol in value['correct_codes'] if sol['lang'] == 'cpp']
    if len(cpp_solutions) < 3:
        print(f"Warning: Less than 3 C++ solutions found for problem {key}. Found {len(cpp_solutions)} solutions.")
        continue

    # Take the first three C++ solutions and strip their freopen lines; the
    # wrong codes get the same treatment, in place.
    right_codes = [remove_freopen_lines(code) for code in cpp_solutions[:3]]
    for item in value['cowrong_codes']:
        item['code'] = remove_freopen_lines(item['code'])

    content = content.replace("这是一道模板题", "")
    ds_query.append({
        "tcb_id": key,
        'query': content,
        'solutions': right_codes,
        'runtime_limit': value['timeLimit'],
        'memory_limit': value['memoryLimit'],
        "wrong_code": value['cowrong_codes'],
    })

print(f"finall add {len(ds_query)}")
# NOTE: despite the .jsonl suffix, the output is a single indented JSON array.
with open("/home/i-luoxianzhen/data/TestCase-Gen/data/Ours/tcb_v7_add_data_no_en.jsonl", "w", encoding="utf-8") as out_file:
    json.dump(ds_query, out_file, indent=4, ensure_ascii=False)

# Xet Storage Details
#
# Size: 5.38 kB
# Xet hash: 6f4ecbed2b8ef8fc085b4e47cf0cb8cd45b87822daa024896430c9ed4ffe8e68
#
# Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.