File size: 1,970 Bytes
cb2428f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import json
import re
from transformers import AutoTokenizer

# --- Configuration ---
dataset_path = "all_dataset_train.jsonl"           # Path to the JSONL dataset to validate
model_path = "/root/autodl-tmp/output_7B_FULL_cotSFT/v8-20250720-210226/checkpoint-58"  # Checkpoint used only to load the tokenizer
required_fields = ["input", "output"]              # Fields every record must contain
max_token_length = 8192                            # Maximum allowed token count (adjust to your model's context size)

# Load the tokenizer from the checkpoint above (reads local files; prints first so
# a slow load is visible to the user).
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Detector for ASCII control characters embedded in field text.
def has_control_chars(text):
    """Return True if *text* contains any ASCII control character.

    Flags code points U+0000-U+001F (this includes tab and newline)
    and U+007F (DEL) — the same set as the regex [\\x00-\\x1F\\x7F].
    """
    return any(ord(ch) < 0x20 or ord(ch) == 0x7F for ch in text)

# --- Line-by-line dataset validation ---
# For each JSONL record: verify it parses, is an object, has non-empty string
# fields, contains no control characters, and fits the token budget.
print("Checking dataset...\n")
with open(dataset_path, "r", encoding="utf-8") as f:
    for idx, line in enumerate(f, 1):
        try:
            data = json.loads(line)
        except json.JSONDecodeError as e:
            print(f"[Line {idx}] ❌ JSON decode error: {e}")
            continue

        # json.loads may yield a list/number/string; `field in data` would then
        # raise TypeError, so reject non-objects explicitly.
        if not isinstance(data, dict):
            print(f"[Line {idx}] ❌ Record is not a JSON object: {type(data).__name__}")
            continue

        # Field presence / type / non-emptiness checks.
        for field in required_fields:
            if field not in data:
                print(f"[Line {idx}] ❌ Missing required field: '{field}'")
            elif not isinstance(data[field], str):
                # Guard before .strip(): a null or numeric value here used to
                # crash the whole run with AttributeError.
                print(f"[Line {idx}] ❌ Field '{field}' is not a string")
            elif not data[field].strip():
                print(f"[Line {idx}] ❌ Field '{field}' is empty")

        # Control-character check — only meaningful for string values; non-string
        # values were already reported above, so treat them as empty here.
        input_text = data.get("input", "")
        output_text = data.get("output", "")
        if not isinstance(input_text, str):
            input_text = ""
        if not isinstance(output_text, str):
            output_text = ""
        if has_control_chars(input_text + output_text):
            print(f"[Line {idx}] ⚠️ Contains control characters")

        # Token-length check. A plain tokenizer call returns Python lists, so
        # len() gives the count without requiring torch (the original used
        # return_tensors="pt" and inspected the tensor shape for the same number).
        try:
            token_len = len(tokenizer(input_text + output_text)["input_ids"])
            if token_len > max_token_length:
                print(f"[Line {idx}] ⚠️ Too many tokens: {token_len} > {max_token_length}")
        except Exception as e:
            # Tokenizers can raise on pathological input; report and keep scanning.
            print(f"[Line {idx}] ❌ Tokenization error: {e}")

print("\n✅ Dataset check complete.")