# rm_code/diff.py
# Author: hahayang012
# Upload folder using huggingface_hub
# Commit d8a76be (verified)
# from datasets import load_dataset, concatenate_datasets
# ds1 = load_dataset("parquet", data_files="/home/data/4201-L0_full_label.parquet", split="train")
# ds2 = load_dataset("parquet", data_files="/home/data/V5.0label_1000-8192/pk-2355-L6_full_label.parquet", split="train")
# keep_cols = ["chosen_prompt", "chosen", "reject", "chosen_label", "reject_label",
# "chosen_violations", "reject_violations", "chosen_model", "reject_model"]
# drop_cols1 = [c for c in ds1.column_names if c not in keep_cols]
# drop_cols2 = [c for c in ds2.column_names if c not in keep_cols]
# ds1 = ds1.remove_columns(drop_cols1)
# ds2 = ds2.remove_columns(drop_cols2)
# merged = concatenate_datasets([ds1, ds2])
# print(f"合并后样本数: {len(merged)}")
# output_path = "/home/data/raw/test/4201_2355_full_label.parquet"
# merged.to_parquet(output_path)
# print(f"已保存到 {output_path}")
import pandas as pd
import re
import matplotlib.pyplot as plt
# Load the parquet file at the hard-coded path into `df`; the sampling code
# further down reads this module-level DataFrame.
# NOTE(review): the commented-out analysis below references columns such as
# "chosen_score", "reject_score" and "right", so this is presumably a
# reward-model evaluation result -- confirm against the file's schema.
df = pd.read_parquet("/home/data/result/2_rm3.4.1_9e-6.parquet")
# df=df[df["right"]==0]
# df["score_diff"] = df["chosen_score"] - df["reject_score"]
# df=df[df["score_diff"]==0]
# print((df["score_diff"]==0).sum())
# len1=len(df)
# df1=df[(df["chosen_label"]=="safe")&(df["reject_label"]=="safe")]
# ds2=df[(df["chosen_label"]=="safe")&(df["reject_label"]=="unsafe")]
# df3=df[(df["chosen_label"]=="unsafe")&(df["reject_label"]=="safe")]
# df4=df[(df["chosen_label"]=="unsafe")&(df["reject_label"]=="unsafe")]
# df4['score_diff'] = df4['chosen_score'] - df4['reject_score']
# print(df4['score_diff'].describe())
# plt.figure(figsize=(10, 6))
# plt.hist(df4['score_diff'], bins=200, alpha=0.7, color='blue', edgecolor='black')
# plt.title("Distribution of [chosen_score] - [reject_score]")
# plt.xlabel("[chosen_score] - [reject_score]")
# plt.ylabel("Frequency")
# plt.savefig("score_diff_histogram6.png")
# print(f"总错误数: {len1}, safe-safe: {len(df1)}, safe-unsafe: {len(ds2)}, unsafe-safe: {len(df3)}, unsafe-unsafe: {len(df4)}")
# print(df.columns)
def sample_records(frame, n=1):
    """Return up to *n* randomly sampled rows of *frame* as a list of dicts.

    Args:
        frame: A pandas DataFrame to sample from.
        n: Number of rows requested. Clamped to the number of rows available,
            because ``DataFrame.sample`` raises ``ValueError`` when asked for
            more rows than exist (including any ``n >= 1`` on an empty frame).

    Returns:
        A list with one ``{column: value}`` dict per sampled row; an empty
        list when *frame* has no rows.
    """
    available = len(frame)
    if available == 0:
        return []
    return frame.sample(n=min(n, available)).to_dict(orient="records")


def format_sample(index, row):
    """Render one sampled record as the text block that gets printed.

    Args:
        index: 1-based position of the sample, shown in the header line.
        row: Mapping of column name to value for the record.

    Returns:
        A string made of the ``Sample N:`` header followed by a blank line,
        one ``key:\\nvalue`` section per field (each closed by a 40-dash
        rule), and a final 80-character ``=`` rule. There is no trailing
        newline; ``print`` supplies it.
    """
    # The header keeps its own "\n" so a blank line follows it once the parts
    # are joined, matching the original print(f"Sample {i+1}:\n").
    parts = [f"Sample {index}:\n"]
    parts.extend(f"{k}:\n{v}\n{'-'*40}" for k, v in row.items())
    # NOTE(review): the original indentation was lost, so emitting this
    # separator once per sample (rather than once after all samples) is an
    # assumption; with n=1 both readings print the same text.
    parts.append("=" * 80)
    return "\n".join(parts)


def print_samples(rows):
    """Print every record in *rows* in full, numbered from 1."""
    for i, row in enumerate(rows, start=1):
        print(format_sample(i, row))


# Only sample and print when executed as a script, so the helpers above can be
# imported without producing output.
if __name__ == "__main__":
    sampled_rows = sample_records(df, n=1)
    # Print every sampled record in full.
    # columns_to_keep=[ "chosen",'chosen_label', 'chosen_violations','chosen_model','reject','reject_label',
    # 'reject_violations','reject_model']
    # sampled_rows = [{k: v for k, v in row.items() if k in columns_to_keep} for row in sampled_rows]
    print_samples(sampled_rows)
# sample_review_S1_S12.py
# import pandas as pd
# PATH = "/home/data/train_10k_label.parquet"
# N_PER_LABEL = 20
# TARGET_LABELS = [f"S{i}" for i in range(1, 13)]
# RANDOM_STATE = 20240819 # fixed seed so the sampling is reproducible
# # Load the data
# df = pd.read_parquet(PATH)
# # Normalize `violations` and split the multi-label values into dummy (indicator) columns
# violations_norm = (
# df.get("violations", pd.Series([""] * len(df)))
# .astype(str).str.upper().str.replace(" ", "", regex=False)
# )
# dummies = violations_norm.str.get_dummies(",")
# # When printing, show the key text columns (only those that actually exist)
# cols_to_show = ["violations"]
# for c in [ "chosen"]:
# if c in df.columns:
# cols_to_show.append(c)
# # Make pandas print the full text (no truncation)
# pd.set_option("display.max_colwidth", None)
# all_samples = [] # collect each label's samples here if they should be exported
# for tag in TARGET_LABELS:
# if tag not in dummies.columns:
# print(f"\n=== {tag}: 0 条 ===")
# continue
# subset = df[dummies[tag] == 1]
# k = min(N_PER_LABEL, len(subset))
# print(f"\n=== {tag}: 抽取 {k}/{len(subset)} 条 ===")
# if k > 0:
# sample = subset.sample(n=k, random_state=RANDOM_STATE)
# # Print
# print(sample[cols_to_show].reset_index(drop=True).to_string(index=True))
# # Collect the samples in case they need to be exported
# tmp = sample.copy()
# tmp["__S_tag__"] = tag
# all_samples.append(tmp)
# # Optional: save all sampled rows into one file for manual review
# # if all_samples:
# # out = pd.concat(all_samples, ignore_index=True)
# # out.to_csv("/home/data/review_samples_S1_S12.csv", index=False, encoding="utf-8")
# # print("\n已保存到 /home/data/review_samples_S1_S12.csv")