| | from transformers import AutoTokenizer |
| | from datasets import load_dataset |
| | import numpy as np |
| | import matplotlib.pyplot as plt |
| | import os |
| |
|
| | |
# --- Configuration -------------------------------------------------------
# Pretrained tokenizer location and the preference-pair parquet shards
# whose token-length distributions we want to compare.
tokenizer_path = "/home/rm"
parquet_paths = [
    f"/home/data/pk-{stem}-L6.parquet"
    for stem in ("2089", "1820", "2355", "4088", "3876")
]
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

# Where the rendered density plot ends up.
save_dir = "./token_density_plots"
os.makedirs(save_dir, exist_ok=True)

# Histogram settings: number of bins, upper-percentile clip for the shared
# x-range (None = use the raw maximum), and optional log-scaled x axis.
BINS = 120
CLIP_PCT = 99.5
USE_LOGX = False
| |
|
def count_total_tokens(ex):
    """Attach token-count columns to one dataset example.

    Tokenizes the example's "chosen" and "reject" text fields (without
    special tokens, via the module-level ``tokenizer``) and stores the
    individual and combined lengths on the example. Returns the mutated
    example, as required by ``datasets.Dataset.map``.
    """
    n_chosen = len(tokenizer(ex["chosen"], add_special_tokens=False)["input_ids"])
    n_reject = len(tokenizer(ex["reject"], add_special_tokens=False)["input_ids"])
    ex["chosen_tokens"] = n_chosen
    ex["rejected_tokens"] = n_reject
    ex["total_tokens"] = n_chosen + n_reject
    return ex
| |
|
| | |
# --- Per-dataset token statistics ---------------------------------------
all_sets = []               # (name, totals, chosens, rejects) per dataset
all_totals_for_range = []   # total-token arrays, pooled later for the x-range

for parquet_file in parquet_paths:
    base = os.path.basename(parquet_file)
    print(f"\n▶ 处理 {base}")
    dataset = load_dataset("parquet", data_files=parquet_file, split="train")
    dataset = dataset.map(count_total_tokens, desc=f"[{base}] 计算 token", num_proc=4)

    totals = np.asarray(dataset["total_tokens"], dtype=np.int64)
    chosens = np.asarray(dataset["chosen_tokens"], dtype=np.int64)
    rejects = np.asarray(dataset["rejected_tokens"], dtype=np.int64)

    # Print summary stats for each series; the trailing space in the first
    # label keeps the original column alignment in the output.
    print(f"[{base}] 样本数: {len(dataset)}")
    for label, arr in (
        ("total_tokens ", totals),
        ("chosen_tokens", chosens),
        ("reject_tokens", rejects),
    ):
        print(f"  {label}: max={arr.max()} | min={arr.min()} | mean={arr.mean():.1f}")

    all_sets.append((base, totals, chosens, rejects))
    all_totals_for_range.append(totals)
| |
|
| | |
# --- Shared x-axis range -------------------------------------------------
# Pool the total-token arrays from every dataset; fall back to [1] so the
# percentile/max calls below are safe even if nothing was loaded.
if all_totals_for_range:
    all_totals_concat = np.concatenate(all_totals_for_range)
else:
    all_totals_concat = np.array([1])

# Clip the upper end at CLIP_PCT so extreme outliers don't stretch the
# axis; CLIP_PCT=None means use the raw maximum.
xmax = float(
    all_totals_concat.max()
    if CLIP_PCT is None
    else np.percentile(all_totals_concat, CLIP_PCT)
)

# Guard against degenerate data (e.g. all zeros).
xmax = max(1.0, xmax)

# Common bin edges so every histogram is directly comparable.
bin_edges = np.linspace(0, xmax, BINS + 1)
| |
|
| | |
# --- Overlay density plot ------------------------------------------------
# One figure with step-histograms for every dataset: the line style encodes
# the series (total / chosen / reject) while matplotlib's color cycle
# distinguishes the individual hist calls.
fig, ax = plt.subplots(figsize=(11, 6))

linestyles = {
    "total": "-",
    "chosen": "--",
    "reject": "-.",
}

# Data-driven loop replaces three copy-pasted ax.hist calls; labels and
# styles are identical to the original per-series calls.
for name, totals, chosens, rejects in all_sets:
    for series, values in (("total", totals), ("chosen", chosens), ("reject", rejects)):
        ax.hist(values, bins=bin_edges, density=True, histtype='step',
                linewidth=1.6, label=f"{name} • {series}",
                linestyle=linestyles[series])

if USE_LOGX:
    ax.set_xscale('log')

ax.set_title("Token Density Overlay — All Datasets")
ax.set_xlabel("Token Count" + (" (log)" if USE_LOGX else ""))
ax.set_ylabel("Density")

# The legend handle was previously bound to an unused local (`leg`);
# we only need the side effect of creating the legend.
ax.legend(ncol=3, fontsize=8, loc="upper right", frameon=True)
plt.tight_layout()

out_png = os.path.join(save_dir, "ALL_datasets_density_overlay.png")
plt.savefig(out_png, dpi=300)
# Close this specific figure (not just the implicit "current" one) so the
# script frees its memory even if other figures are ever open.
plt.close(fig)
print(f"\n✅ 已保存全量合并对比图: {out_png}")
| |
|