"""Randomly sample and pretty-print a few rows from a processed parquet dataset.

A previous version of this file also contained the ChatML cleaning /
normalization pipeline (commented out); it has been removed as dead code —
recover it from version control if needed.
"""

import random

# Path of the cleaned parquet file produced by the earlier pipeline step.
DATA_PATH = "/home/data/formatted_test1.parquet"
# How many random rows to inspect.
NUM_SAMPLES = 5


def pick_sample_indices(total, k, rng=None):
    """Return up to *k* distinct random indices in ``range(total)``.

    Args:
        total: Number of rows available (size of the dataset).
        k: Desired number of samples; clamped to *total* so small
           datasets no longer raise ``ValueError``.
        rng: Optional ``random.Random`` instance for reproducibility;
             defaults to the module-level ``random`` generator.

    Returns:
        A list of distinct ints, each in ``[0, total)``, of length
        ``min(k, total)``.
    """
    chooser = rng if rng is not None else random
    return chooser.sample(range(total), min(k, total))


def print_samples(data_path=DATA_PATH, num_samples=NUM_SAMPLES):
    """Load the parquet dataset, pick random rows, and print them verbatim.

    Args:
        data_path: Parquet file to load (single-split "train").
        num_samples: Number of random rows to print.
    """
    # Local import: heavy third-party dependency, only needed when the
    # script actually runs (keeps the module importable without `datasets`).
    from datasets import load_dataset

    # 1. Load the processed parquet file as a single "train" split.
    dataset = load_dataset("parquet", data_files=data_path, split="train")

    indices = pick_sample_indices(len(dataset), num_samples)
    samples = dataset.select(indices)

    # Print every column of every sampled row in full, untruncated.
    for idx, item in zip(indices, samples):
        print(f"\n=== Sample index {idx} ===")
        for key, value in item.items():
            print(f"[{key}]")
            print(value)  # emit the raw value as-is
            print("-" * 60)


if __name__ == "__main__":
    print_samples()