"""Overlay token-count density histograms for several preference datasets.

For each parquet file the "chosen" and "reject" columns are tokenized, the
per-example token counts are collected, and the total/chosen/reject densities
of every dataset are drawn on one figure with shared bins so the curves are
directly comparable.
"""

from transformers import AutoTokenizer
from datasets import load_dataset
import numpy as np
import matplotlib.pyplot as plt
import os

# ========= Configuration =========
tokenizer_path = "/home/rm"
parquet_paths = [
    "/home/data/pk-2089-L6.parquet",
    "/home/data/pk-1820-L6.parquet",
    "/home/data/pk-2355-L6.parquet",
    "/home/data/pk-4088-L6.parquet",
    "/home/data/pk-3876-L6.parquet",
]

# Loaded at module level so the datasets num_proc workers (fork-based on
# Linux) inherit the tokenizer instead of re-loading it per process.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

# Output directory
save_dir = "./token_density_plots"
os.makedirs(save_dir, exist_ok=True)

# Plot parameters
BINS = 120        # number of histogram buckets (shared bins make curves comparable)
CLIP_PCT = 99.5   # clip the x-axis at this percentile of all total tokens; None = no clipping
USE_LOGX = False  # log-scale x-axis (recommended when counts span orders of magnitude)


def count_total_tokens(ex):
    """Attach token-count columns to one example.

    Assumes ``ex`` carries ``"chosen"`` and ``"reject"`` string fields —
    confirm against the parquet schema. Adds ``total_tokens``,
    ``chosen_tokens`` and ``rejected_tokens`` (special tokens excluded).
    """
    chosen_ids = tokenizer(ex["chosen"], add_special_tokens=False)["input_ids"]
    rejected_ids = tokenizer(ex["reject"], add_special_tokens=False)["input_ids"]
    ex["total_tokens"] = len(chosen_ids) + len(rejected_ids)
    ex["chosen_tokens"] = len(chosen_ids)
    ex["rejected_tokens"] = len(rejected_ids)
    return ex


def _collect_stats():
    """Tokenize every configured dataset and print per-dataset summaries.

    Returns a list of ``(name, totals, chosens, rejects)`` tuples, one per
    parquet file, where the arrays hold per-example token counts.
    """
    all_sets = []
    for path in parquet_paths:
        name = os.path.basename(path)
        print(f"\n▶ 处理 {name}")
        ds = load_dataset("parquet", data_files=path, split="train")
        ds = ds.map(count_total_tokens, desc=f"[{name}] 计算 token", num_proc=4)
        totals = np.asarray(ds["total_tokens"], dtype=np.int64)
        chosens = np.asarray(ds["chosen_tokens"], dtype=np.int64)
        rejects = np.asarray(ds["rejected_tokens"], dtype=np.int64)
        print(f"[{name}] 样本数: {len(ds)}")
        print(f" total_tokens : max={totals.max()} | min={totals.min()} | mean={totals.mean():.1f}")
        print(f" chosen_tokens: max={chosens.max()} | min={chosens.min()} | mean={chosens.mean():.1f}")
        print(f" reject_tokens: max={rejects.max()} | min={rejects.min()} | mean={rejects.mean():.1f}")
        all_sets.append((name, totals, chosens, rejects))
    return all_sets


def _shared_bin_edges(all_sets):
    """Compute one set of bin edges shared by every curve.

    The upper limit is the ``CLIP_PCT`` percentile of all *total* token
    counts (or the global max when clipping is disabled), floored at 1.0 to
    guard against empty/degenerate data.
    """
    totals_list = [totals for _, totals, _, _ in all_sets]
    all_totals_concat = np.concatenate(totals_list) if totals_list else np.array([1])
    if CLIP_PCT is not None:
        xmax = float(np.percentile(all_totals_concat, CLIP_PCT))
    else:
        xmax = float(all_totals_concat.max())
    xmax = max(1.0, xmax)
    return np.linspace(0, xmax, BINS + 1)


def _plot_overlay(all_sets, bin_edges):
    """Draw every dataset's total/chosen/reject densities on one figure.

    Line style encodes the count type; the default matplotlib color cycle
    distinguishes the datasets. The figure is saved under ``save_dir``.
    """
    linestyles = {
        "total": "-",
        "chosen": "--",
        "reject": "-.",
    }
    fig, ax = plt.subplots(figsize=(11, 6))
    for name, totals, chosens, rejects in all_sets:
        # Step-type density histograms on the shared bins (line-shaped, no fill).
        for kind, data in (("total", totals), ("chosen", chosens), ("reject", rejects)):
            ax.hist(data, bins=bin_edges, density=True, histtype='step',
                    linewidth=1.6, label=f"{name} • {kind}",
                    linestyle=linestyles[kind])
    if USE_LOGX:
        ax.set_xscale('log')
    ax.set_title("Token Density Overlay — All Datasets")
    ax.set_xlabel("Token Count" + (" (log)" if USE_LOGX else ""))
    ax.set_ylabel("Density")
    # Compact multi-column legend so the many entries do not hide the curves.
    ax.legend(ncol=3, fontsize=8, loc="upper right", frameon=True)
    fig.tight_layout()
    out_png = os.path.join(save_dir, "ALL_datasets_density_overlay.png")
    # Save/close through the figure handle (not pyplot global state) so an
    # unrelated open figure cannot be written or closed by mistake.
    fig.savefig(out_png, dpi=300)
    plt.close(fig)
    print(f"\n✅ 已保存全量合并对比图: {out_png}")


def main():
    """Entry point: tokenize every dataset, then render the overlay figure."""
    all_sets = _collect_stats()
    bin_edges = _shared_bin_edges(all_sets)
    _plot_overlay(all_sets, bin_edges)


if __name__ == "__main__":
    main()