| # import re | |
| # import pandas as pd | |
| # from datasets import load_dataset | |
| # import re | |
| # input_path = "/home/data/formatted_test1.parquet" | |
| # output_path = "/home/data/formatted_test1.1.parquet" | |
| # def normalize_last_assistant_tag(text: str) -> str: | |
| # """ | |
| # 保留最后一次 <|im_start|>assistant 及其后的正常内容(可为空), | |
| # 删除这一轮内的 <|im_end|> 以及之后的所有内容, | |
| # 统一成: | |
| # ...<|im_start|>assistant\n<保留内容(可为空)> | |
| # """ | |
| # if not isinstance(text, str): | |
| # return text | |
| # # 找到最后一次 <|im_start|>assistant | |
| # idx = text.rfind("<|im_start|>assistant") | |
| # if idx == -1: | |
| # return text | |
| # before = text[:idx] # assistant 之前的所有内容 | |
| # after = text[idx:] # 包含 <|im_start|>assistant 的部分 | |
| # # 切掉这一轮内的 <|im_end|> 以及它之后的所有内容 | |
| # after = after.split("<|im_end|>", 1)[0] | |
| # # 去掉多余空行 | |
| # after = re.sub(r"\n{3,}", "\n\n", after).rstrip() | |
| # # 确保格式统一为 ...<|im_start|>assistant\n<保留内容(可为空)> | |
| # if not after.endswith("\n"): | |
| # after += "\n" | |
| # return before.rstrip() + "\n" + after | |
| # def fix_spacing(text: str) -> str: | |
| # if not isinstance(text, str): | |
| # return text | |
| # return re.sub(r'(\w)\s+:', r'\1:', text) | |
| # def unify_system_block(text: str) -> str: | |
| # if not isinstance(text, str): | |
| # return text | |
| # if text.startswith("<|im_start|>system"): | |
| # match = re.search(r"<\|im_start\|>system\s*(.*?)<\|im_end\|>", text, re.S) | |
| # if match: | |
| # system_block = match.group(1) | |
| # system_block = re.sub(r"\s*\n\s*", " ", system_block) | |
| # system_block = re.sub(r"\s{2,}", " ", system_block).strip() | |
| # text = text.replace(match.group(1), system_block) | |
| # return text | |
| # def close_action_asterisks(text: str) -> str: | |
| # if not isinstance(text, str): | |
| # return text | |
| # def replacer(m): | |
| # seg = m.group(0) | |
| # return seg + "*" if seg.count("*") % 2 != 0 else seg | |
| # return re.sub(r"\*[^\*]{0,200}", replacer, text) | |
| # def remove_square_brackets_meta(text: str) -> str: | |
| # if not isinstance(text, str): | |
| # return text | |
| # return re.sub(r"\[[^\]]+\]", "", text) | |
| # def fix_chatml_pairs(text: str) -> str: | |
| # if not isinstance(text, str): | |
| # return text | |
| # text = re.sub(r"\n{3,}", "\n\n", text) | |
| # starts = len(re.findall(r"<\|im_start\|>", text)) | |
| # ends = len(re.findall(r"<\|im_end\|>", text)) | |
| # if starts > ends: | |
| # text += "<|im_end|>" | |
| # text = re.sub( | |
| # r"(<\|im_start\|>assistant)(\s*<\|im_start\|>assistant)", | |
| # r"\1\n<|im_end|>\n<|im_start|>assistant", | |
| # text, | |
| # ) | |
| # text = re.sub( | |
| # r"(<\|im_start\|>user)(\s*<\|im_start\|>user)", | |
| # r"\1\n<|im_end|>\n<|im_start|>user", | |
| # text, | |
| # ) | |
| # return text | |
| # def clean_sample(text: str) -> str: | |
| # if not isinstance(text, str): | |
| # return text | |
| # text = fix_spacing(text) | |
| # text = unify_system_block(text) | |
| # text = close_action_asterisks(text) | |
| # text = remove_square_brackets_meta(text) | |
| # text = fix_chatml_pairs(text) | |
| # return text.strip() | |
| # df = pd.read_parquet(input_path) # 需要安装 pyarrow 或 fastparquet | |
| # if "chosen_prompt" in df.columns: | |
| # # 先做整体清洗,再把最后一轮统一为“开放式 assistant 标签” | |
| # df["chosen_prompt"] = df["chosen_prompt"].apply(clean_sample) | |
| # df["chosen_prompt"] = df["chosen_prompt"].apply(normalize_last_assistant_tag) | |
| # df.to_parquet(output_path, index=False) | |
| # print(f"处理+清洗完成!最后一轮 assistant 标签已统一为开放式标签,结果已保存到 {output_path}") | |
| import random | |
| from datasets import load_dataset | |
# 1. Load the already-processed parquet file as a single "train" split.
dataset1 = load_dataset("parquet", data_files="/home/data/formatted_test1.parquet", split="train")

# Draw up to 5 distinct row indices. random.sample raises ValueError when asked
# for more items than the population holds, so cap the count at the dataset
# size; a dataset with fewer than 5 rows is then printed in full.
sample_count = min(5, len(dataset1))
indices = random.sample(range(len(dataset1)), sample_count)
samples = dataset1.select(indices)

# Print every field of each sampled row in full.
for idx, item in zip(indices, samples):
    print(f"\n=== Sample index {idx} ===")
    for key, value in item.items():
        print(f"[{key}]")
        print(value)  # emit the raw value unchanged
        # NOTE(review): the original indentation was lost; the separator is
        # assumed to follow each field rather than each sample — confirm.
        print("-" * 60)