rm_code / clear.py

Upload folder using huggingface_hub

d8a76be verified 7 months ago

4.32 kB

	# import re
	# import pandas as pd
	# from datasets import load_dataset
	# import re
	# input_path = "/home/data/formatted_test1.parquet"
	# output_path = "/home/data/formatted_test1.1.parquet"

	# def normalize_last_assistant_tag(text: str) -> str:
	# """
	# 保留最后一次 <\|im_start\|>assistant 及其后的正常内容（可为空），
	# 删除这一轮内的 <\|im_end\|> 以及之后的所有内容，
	# 统一成：
	# ...<\|im_start\|>assistant\n<保留内容（可为空）>
	# """
	# if not isinstance(text, str):
	# return text

	# # 找到最后一次 <\|im_start\|>assistant
	# idx = text.rfind("<\|im_start\|>assistant")
	# if idx == -1:
	# return text

	# before = text[:idx] # assistant 之前的所有内容
	# after = text[idx:] # 包含 <\|im_start\|>assistant 的部分

	# # 切掉这一轮内的 <\|im_end\|> 以及它之后的所有内容
	# after = after.split("<\|im_end\|>", 1)[0]

	# # 去掉多余空行
	# after = re.sub(r"\n{3,}", "\n\n", after).rstrip()

	# # 确保格式统一为 ...<\|im_start\|>assistant\n<保留内容（可为空）>
	# if not after.endswith("\n"):
	# after += "\n"

	# return before.rstrip() + "\n" + after


	# def fix_spacing(text: str) -> str:
	# if not isinstance(text, str):
	# return text
	# return re.sub(r'(\w)\s+:', r'\1:', text)

	# def unify_system_block(text: str) -> str:
	# if not isinstance(text, str):
	# return text
	# if text.startswith("<\|im_start\|>system"):
	# match = re.search(r"<\\|im_start\\|>system\s(.?)<\\|im_end\\|>", text, re.S)
	# if match:
	# system_block = match.group(1)
	# system_block = re.sub(r"\s\n\s", " ", system_block)
	# system_block = re.sub(r"\s{2,}", " ", system_block).strip()
	# text = text.replace(match.group(1), system_block)
	# return text

	# def close_action_asterisks(text: str) -> str:
	# if not isinstance(text, str):
	# return text
	# def replacer(m):
	# seg = m.group(0)
	# return seg + "" if seg.count("") % 2 != 0 else seg
	# return re.sub(r"\[^\]{0,200}", replacer, text)

	# def remove_square_brackets_meta(text: str) -> str:
	# if not isinstance(text, str):
	# return text
	# return re.sub(r"\[[^\]]+\]", "", text)

	# def fix_chatml_pairs(text: str) -> str:
	# if not isinstance(text, str):
	# return text
	# text = re.sub(r"\n{3,}", "\n\n", text)
	# starts = len(re.findall(r"<\\|im_start\\|>", text))
	# ends = len(re.findall(r"<\\|im_end\\|>", text))
	# if starts > ends:
	# text += "<\|im_end\|>"
	# text = re.sub(
	# r"(<\\|im_start\\|>assistant)(\s*<\\|im_start\\|>assistant)",
	# r"\1\n<\|im_end\|>\n<\|im_start\|>assistant",
	# text,
	# )
	# text = re.sub(
	# r"(<\\|im_start\\|>user)(\s*<\\|im_start\\|>user)",
	# r"\1\n<\|im_end\|>\n<\|im_start\|>user",
	# text,
	# )
	# return text

	# def clean_sample(text: str) -> str:
	# if not isinstance(text, str):
	# return text
	# text = fix_spacing(text)
	# text = unify_system_block(text)
	# text = close_action_asterisks(text)
	# text = remove_square_brackets_meta(text)
	# text = fix_chatml_pairs(text)
	# return text.strip()

	# df = pd.read_parquet(input_path) # 需要安装 pyarrow 或 fastparquet

	# if "chosen_prompt" in df.columns:
	# # 先做整体清洗，再把最后一轮统一为“开放式 assistant 标签”
	# df["chosen_prompt"] = df["chosen_prompt"].apply(clean_sample)
	# df["chosen_prompt"] = df["chosen_prompt"].apply(normalize_last_assistant_tag)

	# df.to_parquet(output_path, index=False)
	# print(f"处理+清洗完成！最后一轮 assistant 标签已统一为开放式标签，结果已保存到 {output_path}")
	import random
	from datasets import load_dataset
	# 1. 加载处理好的 parquet 文件
	dataset1 = load_dataset("parquet", data_files="/home/data/formatted_test1.parquet", split="train")

	indices = random.sample(range(len(dataset1)), 5)
	samples = dataset1.select(indices)

	# 完整打印
	for idx, item in zip(indices, samples):
	print(f"\n=== Sample index {idx} ===")
	for key, value in item.items():
	print(f"[{key}]")
	print(value) # 直接原样输出
	print("-" * 60)