File size: 4,321 Bytes
d8a76be
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
# import re
# import pandas as pd
# from datasets import load_dataset
# import re
# input_path = "/home/data/formatted_test1.parquet"
# output_path = "/home/data/formatted_test1.1.parquet"

# def normalize_last_assistant_tag(text: str) -> str:
#     """
#     保留最后一次 <|im_start|>assistant 及其后的正常内容(可为空),
#     删除这一轮内的 <|im_end|> 以及之后的所有内容,
#     统一成:
#         ...<|im_start|>assistant\n<保留内容(可为空)>
#     """
#     if not isinstance(text, str):
#         return text

#     # 找到最后一次 <|im_start|>assistant
#     idx = text.rfind("<|im_start|>assistant")
#     if idx == -1:
#         return text

#     before = text[:idx]  # assistant 之前的所有内容
#     after = text[idx:]   # 包含 <|im_start|>assistant 的部分

#     # 切掉这一轮内的 <|im_end|> 以及它之后的所有内容
#     after = after.split("<|im_end|>", 1)[0]

#     # 去掉多余空行
#     after = re.sub(r"\n{3,}", "\n\n", after).rstrip()

#     # 确保格式统一为 ...<|im_start|>assistant\n<保留内容(可为空)>
#     if not after.endswith("\n"):
#         after += "\n"

#     return before.rstrip() + "\n" + after


# def fix_spacing(text: str) -> str:
#     if not isinstance(text, str):
#         return text
#     return re.sub(r'(\w)\s+:', r'\1:', text)

# def unify_system_block(text: str) -> str:
#     if not isinstance(text, str):
#         return text
#     if text.startswith("<|im_start|>system"):
#         match = re.search(r"<\|im_start\|>system\s*(.*?)<\|im_end\|>", text, re.S)
#         if match:
#             system_block = match.group(1)
#             system_block = re.sub(r"\s*\n\s*", " ", system_block)
#             system_block = re.sub(r"\s{2,}", " ", system_block).strip()
#             text = text.replace(match.group(1), system_block)
#     return text

# def close_action_asterisks(text: str) -> str:
#     if not isinstance(text, str):
#         return text
#     def replacer(m):
#         seg = m.group(0)
#         return seg + "*" if seg.count("*") % 2 != 0 else seg
#     return re.sub(r"\*[^\*]{0,200}", replacer, text)

# def remove_square_brackets_meta(text: str) -> str:
#     if not isinstance(text, str):
#         return text
#     return re.sub(r"\[[^\]]+\]", "", text)

# def fix_chatml_pairs(text: str) -> str:
#     if not isinstance(text, str):
#         return text
#     text = re.sub(r"\n{3,}", "\n\n", text)
#     starts = len(re.findall(r"<\|im_start\|>", text))
#     ends = len(re.findall(r"<\|im_end\|>", text))
#     if starts > ends:
#         text += "<|im_end|>"
#     text = re.sub(
#         r"(<\|im_start\|>assistant)(\s*<\|im_start\|>assistant)",
#         r"\1\n<|im_end|>\n<|im_start|>assistant",
#         text,
#     )
#     text = re.sub(
#         r"(<\|im_start\|>user)(\s*<\|im_start\|>user)",
#         r"\1\n<|im_end|>\n<|im_start|>user",
#         text,
#     )
#     return text

# def clean_sample(text: str) -> str:
#     if not isinstance(text, str):
#         return text
#     text = fix_spacing(text)
#     text = unify_system_block(text)
#     text = close_action_asterisks(text)
#     text = remove_square_brackets_meta(text)
#     text = fix_chatml_pairs(text)
#     return text.strip()

# df = pd.read_parquet(input_path)  # 需要安装 pyarrow 或 fastparquet

# if "chosen_prompt" in df.columns:
#     # 先做整体清洗,再把最后一轮统一为“开放式 assistant 标签”
#     df["chosen_prompt"] = df["chosen_prompt"].apply(clean_sample)
#     df["chosen_prompt"] = df["chosen_prompt"].apply(normalize_last_assistant_tag)

# df.to_parquet(output_path, index=False)
# print(f"处理+清洗完成!最后一轮 assistant 标签已统一为开放式标签,结果已保存到 {output_path}")
import random
from datasets import load_dataset
# 1. Load the processed parquet file, requesting its "train" split.
dataset1 = load_dataset("parquet", data_files="/home/data/formatted_test1.parquet", split="train")

# 2. Draw up to 5 distinct random row indices.
#    random.sample raises ValueError when asked for more items than the
#    population holds, so the sample size is clamped to the dataset length;
#    datasets with at least 5 rows behave exactly as before.
sample_count = min(5, len(dataset1))
indices = random.sample(range(len(dataset1)), sample_count)
samples = dataset1.select(indices)

# 3. Print every field of each sampled row in full.
#    `samples` is iterated in the same order as `indices`, so zipping them
#    pairs each printed row with its original index in `dataset1`.
for idx, item in zip(indices, samples):
    print(f"\n=== Sample index {idx} ===")
    for key, value in item.items():
        print(f"[{key}]")
        print(value)  # emit the raw value as-is, without any formatting
        print("-" * 60)