"""Manually inspect a random sample from a reward-model result parquet.

Reads the result parquet, draws one random row, and pretty-prints every
column of that row for human review.

NOTE(review): large blocks of commented-out scratch code (dataset merging,
score-diff histograms, S1-S12 label sampling) were removed — recover them
from version control if needed.
"""

import pandas as pd

# Path of the reward-model evaluation results to inspect.
RESULT_PATH = "/home/data/result/2_rm3.4.1_9e-6.parquet"


def format_sample(index: int, row: dict) -> str:
    """Return a human-readable dump of one sample.

    index: zero-based sample number (rendered 1-based for display).
    row:   column-name -> value mapping for a single DataFrame row.
    """
    separator = "-" * 40
    parts = [f"Sample {index + 1}:\n"]
    parts.extend(f"{key}:\n{value}\n{separator}" for key, value in row.items())
    parts.append("=" * 80)
    return "\n".join(parts)


def main() -> None:
    """Load the parquet, sample one row, and print each complete sample."""
    df = pd.read_parquet(RESULT_PATH)
    if df.empty:
        # sample(n=1) would raise ValueError on an empty frame.
        print(f"No rows in {RESULT_PATH}")
        return
    # orient="records" yields one {column: value} dict per sampled row.
    sampled_rows = df.sample(n=1).to_dict(orient="records")
    for i, row in enumerate(sampled_rows):
        print(format_sample(i, row))


if __name__ == "__main__":
    main()