| # from datasets import load_dataset, concatenate_datasets | |
| # ds1 = load_dataset("parquet", data_files="/home/data/4201-L0_full_label.parquet", split="train") | |
| # ds2 = load_dataset("parquet", data_files="/home/data/V5.0label_1000-8192/pk-2355-L6_full_label.parquet", split="train") | |
| # keep_cols = ["chosen_prompt", "chosen", "reject", "chosen_label", "reject_label", | |
| # "chosen_violations", "reject_violations", "chosen_model", "reject_model"] | |
| # drop_cols1 = [c for c in ds1.column_names if c not in keep_cols] | |
| # drop_cols2 = [c for c in ds2.column_names if c not in keep_cols] | |
| # ds1 = ds1.remove_columns(drop_cols1) | |
| # ds2 = ds2.remove_columns(drop_cols2) | |
| # merged = concatenate_datasets([ds1, ds2]) | |
| # print(f"合并后样本数: {len(merged)}") | |
| # output_path = "/home/data/raw/test/4201_2355_full_label.parquet" | |
| # merged.to_parquet(output_path) | |
| # print(f"已保存到 {output_path}") | |
| import pandas as pd | |
| import re | |
| import matplotlib.pyplot as plt | |
# Load the parquet file to inspect into a DataFrame (path is machine-specific).
RESULT_PATH = "/home/data/result/2_rm3.4.1_9e-6.parquet"
df = pd.read_parquet(RESULT_PATH)
| # df=df[df["right"]==0] | |
| # df["score_diff"] = df["chosen_score"] - df["reject_score"] | |
| # df=df[df["score_diff"]==0] | |
| # print((df["score_diff"]==0).sum()) | |
| # len1=len(df) | |
| # df1=df[(df["chosen_label"]=="safe")&(df["reject_label"]=="safe")] | |
| # ds2=df[(df["chosen_label"]=="safe")&(df["reject_label"]=="unsafe")] | |
| # df3=df[(df["chosen_label"]=="unsafe")&(df["reject_label"]=="safe")] | |
| # df4=df[(df["chosen_label"]=="unsafe")&(df["reject_label"]=="unsafe")] | |
| # df4['score_diff'] = df4['chosen_score'] - df4['reject_score'] | |
| # print(df4['score_diff'].describe()) | |
| # plt.figure(figsize=(10, 6)) | |
| # plt.hist(df4['score_diff'], bins=200, alpha=0.7, color='blue', edgecolor='black') | |
| # plt.title("Distribution of [chosen_score] - [reject_score]") | |
| # plt.xlabel("[chosen_score] - [reject_score]") | |
| # plt.ylabel("Frequency") | |
| # plt.savefig("score_diff_histogram6.png") | |
| # print(f"总错误数: {len1}, safe-safe: {len(df1)}, safe-unsafe: {len(ds2)}, unsafe-safe: {len(df3)}, unsafe-unsafe: {len(df4)}") | |
| # print(df.columns) | |
# Draw one random row from `df` and turn it into a list of
# {column_name: value} dicts (a single-element list here).
sampled_rows = (
    df.sample(n=1)
    .to_dict(orient="records")
)

# Optional column filter, currently disabled:
# columns_to_keep=[ "chosen",'chosen_label', 'chosen_violations','chosen_model','reject','reject_label',
# 'reject_violations','reject_model']
# sampled_rows = [{k: v for k, v in row.items() if k in columns_to_keep} for row in sampled_rows]

# Print every sampled record in full: one section per field, separated by a
# dashed rule, with a heavier rule closing each record.
for sample_no, record in enumerate(sampled_rows, start=1):
    print(f"Sample {sample_no}:\n")
    for column, value in record.items():
        print(f"{column}:\n{value}\n{'-'*40}")
    print("="*80)
| # sample_review_S1_S12.py | |
| # import pandas as pd | |
| # PATH = "/home/data/train_10k_label.parquet" | |
| # N_PER_LABEL = 20 | |
| # TARGET_LABELS = [f"S{i}" for i in range(1, 13)] | |
| # RANDOM_STATE = 20240819 # 复现实验用 | |
| # # 读数据 | |
| # df = pd.read_parquet(PATH) | |
| # # 规范化 violations,并把多标签拆成哑变量列 | |
| # violations_norm = ( | |
| # df.get("violations", pd.Series([""] * len(df))) | |
| # .astype(str).str.upper().str.replace(" ", "", regex=False) | |
| # ) | |
| # dummies = violations_norm.str.get_dummies(",") | |
| # # 打印时尽量展示关键文本列(存在才显示) | |
| # cols_to_show = ["violations"] | |
| # for c in [ "chosen"]: | |
| # if c in df.columns: | |
| # cols_to_show.append(c) | |
| # # 让 pandas 打印完整文本(不截断) | |
| # pd.set_option("display.max_colwidth", None) | |
| # all_samples = [] # 如需导出,可把每类样本收集到这里 | |
| # for tag in TARGET_LABELS: | |
| # if tag not in dummies.columns: | |
| # print(f"\n=== {tag}: 0 条 ===") | |
| # continue | |
| # subset = df[dummies[tag] == 1] | |
| # k = min(N_PER_LABEL, len(subset)) | |
| # print(f"\n=== {tag}: 抽取 {k}/{len(subset)} 条 ===") | |
| # if k > 0: | |
| # sample = subset.sample(n=k, random_state=RANDOM_STATE) | |
| # # 打印 | |
| # print(sample[cols_to_show].reset_index(drop=True).to_string(index=True)) | |
| # # 如需导出,收集起来 | |
| # tmp = sample.copy() | |
| # tmp["__S_tag__"] = tag | |
| # all_samples.append(tmp) | |
| # # 可选:把所有抽样保存成一个文件便于人工复核 | |
| # # if all_samples: | |
| # # out = pd.concat(all_samples, ignore_index=True) | |
| # # out.to_csv("/home/data/review_samples_S1_S12.csv", index=False, encoding="utf-8") | |
| # # print("\n已保存到 /home/data/review_samples_S1_S12.csv") | |