# File size: 4,257 Bytes

# d8a76be

# from datasets import load_dataset, concatenate_datasets
# ds1 = load_dataset("parquet", data_files="/home/data/4201-L0_full_label.parquet", split="train")
# ds2 = load_dataset("parquet", data_files="/home/data/V5.0label_1000-8192/pk-2355-L6_full_label.parquet", split="train")
# keep_cols = ["chosen_prompt", "chosen", "reject", "chosen_label", "reject_label",
#              "chosen_violations", "reject_violations", "chosen_model", "reject_model"]
# drop_cols1 = [c for c in ds1.column_names if c not in keep_cols]
# drop_cols2 = [c for c in ds2.column_names if c not in keep_cols]
# ds1 = ds1.remove_columns(drop_cols1)
# ds2 = ds2.remove_columns(drop_cols2)
# merged = concatenate_datasets([ds1, ds2])
# print(f"合并后样本数: {len(merged)}")
# output_path = "/home/data/raw/test/4201_2355_full_label.parquet"
# merged.to_parquet(output_path)
# print(f"已保存到 {output_path}")

import pandas as pd
import re
import matplotlib.pyplot as plt
# Load the result table to inspect; the commented-out analysis that follows
# reads columns such as chosen_score, reject_score, chosen_label and
# reject_label from it.
df = pd.read_parquet(path="/home/data/result/2_rm3.4.1_9e-6.parquet")
# df=df[df["right"]==0]
# df["score_diff"] = df["chosen_score"] - df["reject_score"]
# df=df[df["score_diff"]==0]
# print((df["score_diff"]==0).sum())
# len1=len(df)
# df1=df[(df["chosen_label"]=="safe")&(df["reject_label"]=="safe")]
# ds2=df[(df["chosen_label"]=="safe")&(df["reject_label"]=="unsafe")]
# df3=df[(df["chosen_label"]=="unsafe")&(df["reject_label"]=="safe")]
# df4=df[(df["chosen_label"]=="unsafe")&(df["reject_label"]=="unsafe")]
# df4['score_diff'] = df4['chosen_score'] - df4['reject_score']
# print(df4['score_diff'].describe())
# plt.figure(figsize=(10, 6))
# plt.hist(df4['score_diff'], bins=200, alpha=0.7, color='blue', edgecolor='black')
# plt.title("Distribution of [chosen_score] - [reject_score]")
# plt.xlabel("[chosen_score] - [reject_score]")
# plt.ylabel("Frequency")
# plt.savefig("score_diff_histogram6.png")
# print(f"总错误数: {len1}, safe-safe: {len(df1)}, safe-unsafe: {len(ds2)}, unsafe-safe: {len(df3)}, unsafe-unsafe: {len(df4)}")
# print(df.columns)
# Number of random rows to dump for manual inspection.
N_SAMPLES = 1
# Clamp to the frame length: DataFrame.sample raises ValueError when asked for
# more rows than exist (e.g. an empty result file), so an empty frame now
# simply prints nothing instead of crashing.
sampled_rows = df.sample(n=min(N_SAMPLES, len(df))).to_dict(orient="records")
# Optional: restrict the dump to the most relevant columns.
# columns_to_keep=[ "chosen",'chosen_label', 'chosen_violations','chosen_model','reject','reject_label',
#        'reject_violations','reject_model']
# sampled_rows = [{k: v for k, v in row.items() if k in columns_to_keep} for row in sampled_rows]
# Print every sampled record in full: one "column:\nvalue" section per field,
# separated by dashed rules, with a long rule closing each record.
for i, row in enumerate(sampled_rows, start=1):
    print(f"Sample {i}:\n")
    for k, v in row.items():
        print(f"{k}:\n{v}\n{'-'*40}")
    print("="*80)

# sample_review_S1_S12.py
# import pandas as pd

# PATH = "/home/data/train_10k_label.parquet"
# N_PER_LABEL = 20
# TARGET_LABELS = [f"S{i}" for i in range(1, 13)]
# RANDOM_STATE = 20240819  # 复现实验用

# # 读数据
# df = pd.read_parquet(PATH)

# # 规范化 violations，并把多标签拆成哑变量列
# violations_norm = (
#     df.get("violations", pd.Series([""] * len(df)))
#       .astype(str).str.upper().str.replace(" ", "", regex=False)
# )
# dummies = violations_norm.str.get_dummies(",")

# # 打印时尽量展示关键文本列（存在才显示）
# cols_to_show = ["violations"]
# for c in [ "chosen"]:
#     if c in df.columns:
#         cols_to_show.append(c)

# # 让 pandas 打印完整文本（不截断）
# pd.set_option("display.max_colwidth", None)

# all_samples = []  # 如需导出，可把每类样本收集到这里

# for tag in TARGET_LABELS:
#     if tag not in dummies.columns:
#         print(f"\n=== {tag}: 0 条 ===")
#         continue

#     subset = df[dummies[tag] == 1]
#     k = min(N_PER_LABEL, len(subset))
#     print(f"\n=== {tag}: 抽取 {k}/{len(subset)} 条 ===")

#     if k > 0:
#         sample = subset.sample(n=k, random_state=RANDOM_STATE)
#         # 打印
#         print(sample[cols_to_show].reset_index(drop=True).to_string(index=True))
#         # 如需导出，收集起来
#         tmp = sample.copy()
#         tmp["__S_tag__"] = tag
#         all_samples.append(tmp)

# # 可选：把所有抽样保存成一个文件便于人工复核
# # if all_samples:
# #     out = pd.concat(all_samples, ignore_index=True)
# #     out.to_csv("/home/data/review_samples_S1_S12.csv", index=False, encoding="utf-8")
# #     print("\n已保存到 /home/data/review_samples_S1_S12.csv")