from transformers import AutoTokenizer
from datasets import load_dataset
import numpy as np
import matplotlib.pyplot as plt
import os
# ========= Configuration =========
# Local path of the tokenizer to load (HF AutoTokenizer directory).
tokenizer_path = "/home/rm"
# Preference-pair parquet files to analyze; each is plotted as its own curve.
parquet_paths = [
    "/home/data/pk-2089-L6.parquet",
    "/home/data/pk-1820-L6.parquet",
    "/home/data/pk-2355-L6.parquet",
    "/home/data/pk-4088-L6.parquet",
    "/home/data/pk-3876-L6.parquet",
]
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
# Output directory for the rendered figure(s)
save_dir = "./token_density_plots"
os.makedirs(save_dir, exist_ok=True)
# Plotting parameters
BINS = 120  # number of histogram bins (shared across datasets for comparability)
CLIP_PCT = 99.5  # clip the displayed x-range at this percentile of total tokens; None = no clipping
USE_LOGX = False  # log-scale the x-axis (recommended True when the range spans orders of magnitude)
def count_total_tokens(ex):
    """Attach token-count fields to one preference-pair example.

    Tokenizes ``ex["chosen"]`` and ``ex["reject"]`` without special tokens
    (uses the module-level ``tokenizer``) and writes the per-side lengths
    plus their sum back onto the example dict.

    Returns the mutated example (the shape ``datasets.Dataset.map`` expects).
    """
    n_chosen = len(tokenizer(ex["chosen"], add_special_tokens=False)["input_ids"])
    n_reject = len(tokenizer(ex["reject"], add_special_tokens=False)["input_ids"])
    ex["chosen_tokens"] = n_chosen
    ex["rejected_tokens"] = n_reject
    ex["total_tokens"] = n_chosen + n_reject
    return ex
# ========== Load every dataset and collect per-example token counts ==========
all_sets = []  # entries: (name, totals, chosens, rejects)
all_totals_for_range = []
for path in parquet_paths:
    name = os.path.basename(path)
    print(f"\n▶ 处理 {name}")
    ds = load_dataset("parquet", data_files=path, split="train")
    ds = ds.map(count_total_tokens, desc=f"[{name}] 计算 token", num_proc=4)
    # Pull the three count columns into int64 arrays in one pass.
    counts = {
        col: np.asarray(ds[col], dtype=np.int64)
        for col in ("total_tokens", "chosen_tokens", "rejected_tokens")
    }
    totals = counts["total_tokens"]
    chosens = counts["chosen_tokens"]
    rejects = counts["rejected_tokens"]
    # Quick per-dataset summary on stdout.
    print(f"[{name}] 样本数: {len(ds)}")
    print(f" total_tokens : max={totals.max()} | min={totals.min()} | mean={totals.mean():.1f}")
    print(f" chosen_tokens: max={chosens.max()} | min={chosens.min()} | mean={chosens.mean():.1f}")
    print(f" reject_tokens: max={rejects.max()} | min={rejects.min()} | mean={rejects.mean():.1f}")
    all_sets.append((name, totals, chosens, rejects))
    all_totals_for_range.append(totals)
# Shared x-axis upper bound, taken over the totals of *all* datasets so the
# overlaid curves use one comparable range.
if all_totals_for_range:
    all_totals_concat = np.concatenate(all_totals_for_range)
else:
    all_totals_concat = np.array([1])  # degenerate fallback when nothing was loaded
if CLIP_PCT is None:
    xmax = float(all_totals_concat.max())
else:
    # Clip the displayed range at the configured percentile of total tokens.
    xmax = float(np.percentile(all_totals_concat, CLIP_PCT))
if xmax < 1.0:
    xmax = 1.0  # guard against an empty/zero range
# One shared set of bin edges for every histogram.
bin_edges = np.linspace(0, xmax, BINS + 1)
# ========== Draw one combined overlay figure covering all datasets ==========
fig, ax = plt.subplots(figsize=(11, 6))
# Line style distinguishes the count type; matplotlib's default color cycle
# distinguishes datasets.
linestyles = {
    "total": "-",
    "chosen": "--",
    "reject": "-.",
}
for name, totals, chosens, rejects in all_sets:
    # Step-type density histograms on the shared bin edges so the curves
    # overlay cleanly without filled bars occluding each other.
    for kind, values in (("total", totals), ("chosen", chosens), ("reject", rejects)):
        ax.hist(values, bins=bin_edges, density=True, histtype='step', linewidth=1.6,
                label=f"{name} • {kind}", linestyle=linestyles[kind])
if USE_LOGX:
    ax.set_xscale('log')
ax.set_title("Token Density Overlay — All Datasets")
ax.set_xlabel("Token Count" + (" (log)" if USE_LOGX else ""))
ax.set_ylabel("Density")
# Compact multi-column legend in the upper-right corner.
# (Previous comment claimed the legend sat below the axes; the code has
# always placed it upper-right — comment corrected, unused `leg` dropped.)
ax.legend(ncol=3, fontsize=8, loc="upper right", frameon=True)
plt.tight_layout()
out_png = os.path.join(save_dir, "ALL_datasets_density_overlay.png")
plt.savefig(out_png, dpi=300)
plt.close()
print(f"\n✅ 已保存全量合并对比图: {out_png}")