Buckets:

Tsukihjy
/

testcase

Files

xet

Tsukihjy/testcase / testcase-data /Ours /get_delete_sample.py

Tsukihjy

about 1 month ago

download

raw

5.38 kB

	import json

	def remove_text_after_phrase(text: str, phrase: str) -> str:
	    """Return *text* cut off just before the first occurrence of *phrase*.

	    The phrase itself and everything following it are dropped. When
	    *phrase* does not occur in *text*, the text is returned unchanged.
	    """
	    index = text.find(phrase)
	    if index == -1:
	        # Phrase absent: nothing to trim.
	        return text
	    return text[:index]

	def remove_freopen_lines(code: str) -> str:
	    """Return *code* without any line containing the substring ``freopen``.

	    The input is split with ``str.splitlines`` and the surviving lines are
	    joined with a single newline character, so original line endings are
	    normalised and a trailing newline is not preserved.
	    """
	    kept = (line for line in code.splitlines() if 'freopen' not in line)
	    return '\n'.join(kept)

	def remove_fread_lines(code: str) -> str:
	    """Return *code* without any line containing the substring ``fread``.

	    The input is split with ``str.splitlines`` and the surviving lines are
	    joined with a single newline character, so original line endings are
	    normalised and a trailing newline is not preserved.
	    """
	    kept = (line for line in code.splitlines() if 'fread' not in line)
	    return '\n'.join(kept)

	import re
	def get_transformed_id(problem_id: str) -> str:
	    """Normalise a raw problem key into the id used as ``tcb_id``.

	    Steps, applied in order:

	    1. An id containing "地底蔷薇" maps to exactly that string.
	    2. Every "/" is replaced by a space.
	    3. Everything up to and including the first "." is dropped.
	    4. Everything up to and including the first "」" is dropped.
	    5. Surrounding whitespace is stripped.

	    If nothing is left after these steps, the original *problem_id* is
	    printed and returned unchanged.
	    """
	    special = "地底蔷薇"
	    if special in problem_id:
	        return special

	    tcb_id = problem_id.replace('/', ' ')

	    # Same effect as the start-anchored regex r'^[^.]*\.': strip the prefix
	    # through the first dot, but only when a dot is present.
	    _, dot, after_dot = tcb_id.partition('.')
	    if dot:
	        tcb_id = after_dot

	    # Strip a leading bracketed tag by cutting after the first closing bracket.
	    _, bracket, after_bracket = tcb_id.partition('」')
	    if bracket:
	        tcb_id = after_bracket

	    tcb_id = tcb_id.strip()
	    if tcb_id == "":
	        # Nothing usable survived; report the key and fall back to it.
	        print(problem_id)
	        return problem_id
	    return tcb_id

	final_curr = "/home/i-luoxianzhen/data/TestCase-Gen/data/Ours/TestcaseBench-v6.json"

	file1 = "/home/i-luoxianzhen/data/TestCase-Gen/data/Ours/contents_with_sample.json"

	file2 = "/home/i-luoxianzhen/data/TestCase-Gen/data/Ours/balance_v3.0.dev_with_code.json"


	def _load_json(path):
	    """Parse the UTF-8 JSON file at *path*, closing the handle afterwards."""
	    with open(path, "r", encoding="utf-8") as handle:
	        return json.load(handle)


	contents_with_sample = _load_json(file1)
	balance_v3 = _load_json(file2)

	# Add entries that exist only in contents_with_sample; keys already present
	# in balance_v3 keep their balance_v3 value.
	for k, v in contents_with_sample.items():
	    balance_v3.setdefault(k, v)

	# Index the current v6 items by their tcb_id.
	tcb_v6 = _load_json(final_curr)
	tcb_v6_dict = {item['tcb_id']: item for item in tcb_v6}

	# Ids present in the merged balance_v3 data but absent from v6.
	delete_data = []
	for k in balance_v3:
	    tcb_id = get_transformed_id(k)
	    if tcb_id not in tcb_v6_dict:
	        delete_data.append(tcb_id)

	# Set copy of delete_data so the membership test below is O(1) per lookup.
	_deleted_ids = set(delete_data)

	# Keep base-data problems that are neither in delete_data nor already in v6.
	tcb_v7_new_data = {}
	base_data = _load_json("/home/i-luoxianzhen/data/TestCase-Gen/data/Ours/balance_v1.0.dev_with_code.json")
	for k, v in base_data.items():
	    tcb_id = get_transformed_id(k)
	    if tcb_id in _deleted_ids or tcb_id in tcb_v6_dict:
	        continue
	    tcb_v7_new_data[tcb_id] = v

	print(f"add {len(tcb_v7_new_data)} items")

	# Build query records for the problems in tcb_v7_new_data and dump them to disk.
	# NOTE(review): leading indentation is missing from every line in this copy of
	# the script, so the nesting of the statements below (notably the `break` and
	# both `continue` statements) cannot be read off the text. Confirm against the
	# original file before relying on any control-flow reading.

	# Matches section titles consisting of "样例 " followed by digits only.
	pattern = r"^样例 \d+$"

	# `re` is used below for re.match.
	import re
	# Collects every section title encountered; not read again in the visible code.
	section_title_set = set()

	## Switch to using the full problem statement as the query text; the
	## per-test-point descriptions need to be removed.
	ds_query = []
	for key, value in tcb_v7_new_data.items():
	content = key # start with the title
	ques = value['content']
	# Concatenate every sectionTitle and text
	# no_sample is set to True when a "样例" section has no samples; it is never
	# read afterwards in the visible code.
	no_sample = False

	for section in ques.get("contentSections", []):
	section_title = section.get("sectionTitle", "").strip()
	section_title_set.add(section_title)
	text = section.get("text", "").strip()
	section_content = ""
	section_content += f"\n{section_title}"
	# A section titled exactly "样例": append every entry of value['sample'] as an
	# input/output pair, or report the problem when there are no samples.
	if section_title == "样例":
	if len(value['sample']) > 0:
	for i in range(len(value['sample'])):
	section_content += f"\n输入:\n{value['sample'][i]['inputData']}\n输出:\n{value['sample'][i]['outputData']}\n"
	else:
	print(f"{key} no test sample")
	no_sample = True
	break
	# A numbered sample section: append the sample selected by the title.
	# NOTE(review): only the LAST character of the title is converted, so a title
	# such as "样例 10" yields index -1 (and "样例 12" yields index 1); a negative
	# index also passes the `i < len(...)` check. Confirm whether multi-digit
	# sample numbers can occur.
	if re.match(pattern, section_title):
	i = int(section_title[-1]) - 1
	if i < len(value['sample']):
	section_content += f"\n输入:\n{value['sample'][i]['inputData']}\n输出:\n{value['sample'][i]['outputData']}\n"
	# Cut the section text at "见附加文件" and log when something was removed.
	o_text = text
	text = remove_text_after_phrase(text, "见附加文件")
	if text != o_text:
	print(f"{key} 修改删除---见附加文件 \n{o_text}")
	continue
	# With the truncation calls below commented out, text always equals o_text
	# here, so the message in this branch is never printed.
	if section_title == "数据范围与提示":
	o_text = text
	# (disabled) phrases at which the data-range section could be truncated:
	# text = remove_text_after_phrase(text, "各测试点具体限制如下")
	# text = remove_text_after_phrase(text, "每个测试点的具体限制见下表")
	# text = remove_text_after_phrase(text, "测试点编号")
	# text = remove_text_after_phrase(text, "\| 测试点 \|")
	if text != o_text:
	print(f"{key} 修改数据范围与提示")

	section_content += f"\n{text}"

	content += section_content

	# Placeholder; reassigned below once the C++ solutions are known.
	selected_codes = []
	solutions = value['correct_codes']
	# Keep only the code of solutions whose 'lang' is 'cpp'.
	cpp_solutions = [sol["code"] for sol in solutions if sol['lang'] == 'cpp']
	if len(cpp_solutions) < 3:
	print(f"Warning: Less than 3 C++ solutions found for problem {key}. Found {len(cpp_solutions)} solutions.")
	continue
	selected_codes = cpp_solutions[:3] # Select top 3 cpp solutions

	## Strip freopen / fread style lines from the correct solutions
	# NOTE(review): only remove_freopen_lines is applied here; remove_fread_lines
	# is not called anywhere in the visible code.
	right_codes = []
	for code in selected_codes:
	right_codes.append(remove_freopen_lines(code))

	# Rewrites the 'code' field of each value['cowrong_codes'] entry in place.
	for item in value['cowrong_codes']:
	item['code'] = remove_freopen_lines(item['code'])

	# Remove every occurrence of the phrase "这是一道模板题" from the query text.
	content = content.replace("这是一道模板题", "")

	ds_query.append({
	"tcb_id": key,
	'query': content,
	'solutions': right_codes,
	'runtime_limit': value['timeLimit'],
	'memory_limit': value['memoryLimit'],
	"wrong_code": value['cowrong_codes']
	})

	print(f"finall add {len(ds_query)}")


	# NOTE(review): despite the .jsonl file name, json.dump writes ds_query as one
	# indented JSON array rather than one object per line, and the handle returned
	# by open() is never explicitly closed.
	json.dump(ds_query, open(f"/home/i-luoxianzhen/data/TestCase-Gen/data/Ours/tcb_v7_add_data_no_en.jsonl", "w", encoding="utf-8"), indent=4, ensure_ascii=False)

Xet Storage Details

Size: 5.38 kB
Xet hash: 6f4ecbed2b8ef8fc085b4e47cf0cb8cd45b87822daa024896430c9ed4ffe8e68

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.