rm_code / diff.py

Upload folder using huggingface_hub

d8a76be verified 7 months ago

4.26 kB

	# from datasets import load_dataset, concatenate_datasets
	# ds1 = load_dataset("parquet", data_files="/home/data/4201-L0_full_label.parquet", split="train")
	# ds2 = load_dataset("parquet", data_files="/home/data/V5.0label_1000-8192/pk-2355-L6_full_label.parquet", split="train")
	# keep_cols = ["chosen_prompt", "chosen", "reject", "chosen_label", "reject_label",
	# "chosen_violations", "reject_violations", "chosen_model", "reject_model"]
	# drop_cols1 = [c for c in ds1.column_names if c not in keep_cols]
	# drop_cols2 = [c for c in ds2.column_names if c not in keep_cols]
	# ds1 = ds1.remove_columns(drop_cols1)
	# ds2 = ds2.remove_columns(drop_cols2)
	# merged = concatenate_datasets([ds1, ds2])
	# print(f"合并后样本数: {len(merged)}")
	# output_path = "/home/data/raw/test/4201_2355_full_label.parquet"
	# merged.to_parquet(output_path)
	# print(f"已保存到 {output_path}")

	import pandas as pd
	import re
	import matplotlib.pyplot as plt
	# Load the result table to inspect.
	# NOTE(review): presumably one row per chosen/reject pair with scores and labels
	# (inferred from the disabled analysis below) — confirm against the parquet schema.
	df = pd.read_parquet("/home/data/result/2_rm3.4.1_9e-6.parquet")
	# --- Disabled analysis, kept for reference: filter rows, split them by
	# --- (chosen_label, reject_label) and plot the chosen-minus-reject score gap.
	# df=df[df["right"]==0]
	# df["score_diff"] = df["chosen_score"] - df["reject_score"]
	# df=df[df["score_diff"]==0]
	# print((df["score_diff"]==0).sum())
	# len1=len(df)
	# df1=df[(df["chosen_label"]=="safe")&(df["reject_label"]=="safe")]
	# ds2=df[(df["chosen_label"]=="safe")&(df["reject_label"]=="unsafe")]
	# df3=df[(df["chosen_label"]=="unsafe")&(df["reject_label"]=="safe")]
	# df4=df[(df["chosen_label"]=="unsafe")&(df["reject_label"]=="unsafe")]
	# df4['score_diff'] = df4['chosen_score'] - df4['reject_score']
	# print(df4['score_diff'].describe())
	# plt.figure(figsize=(10, 6))
	# plt.hist(df4['score_diff'], bins=200, alpha=0.7, color='blue', edgecolor='black')
	# plt.title("Distribution of [chosen_score] - [reject_score]")
	# plt.xlabel("[chosen_score] - [reject_score]")
	# plt.ylabel("Frequency")
	# plt.savefig("score_diff_histogram6.png")
	# print(f"总错误数: {len1}, safe-safe: {len(df1)}, safe-unsafe: {len(ds2)}, unsafe-safe: {len(df3)}, unsafe-unsafe: {len(df4)}")
	# print(df.columns)
	# Draw one random row and turn it into a list of {column: value} dicts.
	sampled_rows = df.sample(n=1).to_dict(orient="records")
	# Print every sampled record in full.
	# Optional (disabled): restrict the printed record to a subset of columns.
	# columns_to_keep=[ "chosen",'chosen_label', 'chosen_violations','chosen_model','reject','reject_label',
	# 'reject_violations','reject_model']
	# sampled_rows = [{k: v for k, v in row.items() if k in columns_to_keep} for row in sampled_rows]
	for i, row in enumerate(sampled_rows):
	    print(f"Sample {i+1}:\n")
	    # One field per stanza: column name, its value, then a dashed rule.
	    for k, v in row.items():
	        print(f"{k}:\n{v}\n{'-'*40}")
	    # Heavy rule marks the end of this sample.
	    print("="*80)

	# sample_review_S1_S12.py
	# import pandas as pd

	# PATH = "/home/data/train_10k_label.parquet"
	# N_PER_LABEL = 20
	# TARGET_LABELS = [f"S{i}" for i in range(1, 13)]
	# RANDOM_STATE = 20240819 # 复现实验用

	# # 读数据
	# df = pd.read_parquet(PATH)

	# # 规范化 violations，并把多标签拆成哑变量列
	# violations_norm = (
	# df.get("violations", pd.Series([""] * len(df)))
	# .astype(str).str.upper().str.replace(" ", "", regex=False)
	# )
	# dummies = violations_norm.str.get_dummies(",")

	# # 打印时尽量展示关键文本列（存在才显示）
	# cols_to_show = ["violations"]
	# for c in [ "chosen"]:
	# if c in df.columns:
	# cols_to_show.append(c)

	# # 让 pandas 打印完整文本（不截断）
	# pd.set_option("display.max_colwidth", None)

	# all_samples = [] # 如需导出，可把每类样本收集到这里

	# for tag in TARGET_LABELS:
	# if tag not in dummies.columns:
	# print(f"\n=== {tag}: 0 条 ===")
	# continue

	# subset = df[dummies[tag] == 1]
	# k = min(N_PER_LABEL, len(subset))
	# print(f"\n=== {tag}: 抽取 {k}/{len(subset)} 条 ===")

	# if k > 0:
	# sample = subset.sample(n=k, random_state=RANDOM_STATE)
	# # 打印
	# print(sample[cols_to_show].reset_index(drop=True).to_string(index=True))
	# # 如需导出，收集起来
	# tmp = sample.copy()
	# tmp["__S_tag__"] = tag
	# all_samples.append(tmp)

	# # 可选：把所有抽样保存成一个文件便于人工复核
	# # if all_samples:
	# # out = pd.concat(all_samples, ignore_index=True)
	# # out.to_csv("/home/data/review_samples_S1_S12.csv", index=False, encoding="utf-8")
	# # print("\n已保存到 /home/data/review_samples_S1_S12.csv")