# import re
# import pandas as pd
# from datasets import load_dataset
# import re
# input_path = "/home/data/formatted_test1.parquet"
# output_path = "/home/data/formatted_test1.1.parquet"
# def normalize_last_assistant_tag(text: str) -> str:
# """
# 保留最后一次 <|im_start|>assistant 及其后的正常内容(可为空),
# 删除这一轮内的 <|im_end|> 以及之后的所有内容,
# 统一成:
# ...<|im_start|>assistant\n<保留内容(可为空)>
# """
# if not isinstance(text, str):
# return text
# # 找到最后一次 <|im_start|>assistant
# idx = text.rfind("<|im_start|>assistant")
# if idx == -1:
# return text
# before = text[:idx] # assistant 之前的所有内容
# after = text[idx:] # 包含 <|im_start|>assistant 的部分
# # 切掉这一轮内的 <|im_end|> 以及它之后的所有内容
# after = after.split("<|im_end|>", 1)[0]
# # 去掉多余空行
# after = re.sub(r"\n{3,}", "\n\n", after).rstrip()
# # 确保格式统一为 ...<|im_start|>assistant\n<保留内容(可为空)>
# if not after.endswith("\n"):
# after += "\n"
# return before.rstrip() + "\n" + after
# def fix_spacing(text: str) -> str:
# if not isinstance(text, str):
# return text
# return re.sub(r'(\w)\s+:', r'\1:', text)
# def unify_system_block(text: str) -> str:
# if not isinstance(text, str):
# return text
# if text.startswith("<|im_start|>system"):
# match = re.search(r"<\|im_start\|>system\s*(.*?)<\|im_end\|>", text, re.S)
# if match:
# system_block = match.group(1)
# system_block = re.sub(r"\s*\n\s*", " ", system_block)
# system_block = re.sub(r"\s{2,}", " ", system_block).strip()
# text = text.replace(match.group(1), system_block)
# return text
# def close_action_asterisks(text: str) -> str:
# if not isinstance(text, str):
# return text
# def replacer(m):
# seg = m.group(0)
# return seg + "*" if seg.count("*") % 2 != 0 else seg
# return re.sub(r"\*[^\*]{0,200}", replacer, text)
# def remove_square_brackets_meta(text: str) -> str:
# if not isinstance(text, str):
# return text
# return re.sub(r"\[[^\]]+\]", "", text)
# def fix_chatml_pairs(text: str) -> str:
# if not isinstance(text, str):
# return text
# text = re.sub(r"\n{3,}", "\n\n", text)
# starts = len(re.findall(r"<\|im_start\|>", text))
# ends = len(re.findall(r"<\|im_end\|>", text))
# if starts > ends:
# text += "<|im_end|>"
# text = re.sub(
# r"(<\|im_start\|>assistant)(\s*<\|im_start\|>assistant)",
# r"\1\n<|im_end|>\n<|im_start|>assistant",
# text,
# )
# text = re.sub(
# r"(<\|im_start\|>user)(\s*<\|im_start\|>user)",
# r"\1\n<|im_end|>\n<|im_start|>user",
# text,
# )
# return text
# def clean_sample(text: str) -> str:
# if not isinstance(text, str):
# return text
# text = fix_spacing(text)
# text = unify_system_block(text)
# text = close_action_asterisks(text)
# text = remove_square_brackets_meta(text)
# text = fix_chatml_pairs(text)
# return text.strip()
# df = pd.read_parquet(input_path) # 需要安装 pyarrow 或 fastparquet
# if "chosen_prompt" in df.columns:
# # 先做整体清洗,再把最后一轮统一为“开放式 assistant 标签”
# df["chosen_prompt"] = df["chosen_prompt"].apply(clean_sample)
# df["chosen_prompt"] = df["chosen_prompt"].apply(normalize_last_assistant_tag)
# df.to_parquet(output_path, index=False)
# print(f"处理+清洗完成!最后一轮 assistant 标签已统一为开放式标签,结果已保存到 {output_path}")
import random
from datasets import load_dataset
# 1. Load the preprocessed parquet file.
dataset1 = load_dataset("parquet", data_files="/home/data/formatted_test1.parquet", split="train")

# Draw up to 5 distinct random row indices. random.sample raises ValueError
# when asked for more items than the population holds, so the count is capped
# at the dataset size to keep the script usable on files with fewer than 5 rows.
sample_count = min(5, len(dataset1))
indices = random.sample(range(len(dataset1)), sample_count)
samples = dataset1.select(indices)

# Print every field of each sampled row in full.
# NOTE(review): the original indentation was lost in extraction; the separator
# line is assumed to follow each field (inner loop) - confirm against the
# original file.
for idx, item in zip(indices, samples):
    print(f"\n=== Sample index {idx} ===")
    for key, value in item.items():
        print(f"[{key}]")
        print(value)  # printed as-is
        print("-" * 60)