# t_n / novel_tokenizer.py
# (origin: uploaded by woywan — "Upload 12 files", commit a072099 verified)
import os
import torch
import numpy as np
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers, processors
def train_tokenizer(text_path, vocab_size=8000, save_path="d:/图像/novel_tokenizer.json"):
    """Train a byte-level BPE tokenizer on a novel text file.

    Args:
        text_path: path to the UTF-8 text file used as training corpus.
        vocab_size: target vocabulary size for the BPE merges.
        save_path: destination file for the serialized tokenizer JSON.

    Returns:
        The trained ``tokenizers.Tokenizer`` instance.
    """
    # Byte-level BPE: every byte is representable, so there are no OOV
    # tokens — a safe choice for mixed Chinese/Latin text.
    bpe = Tokenizer(models.BPE())
    bpe.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
    bpe.decoder = decoders.ByteLevel()

    bpe_trainer = trainers.BpeTrainer(
        vocab_size=vocab_size,
        special_tokens=["<s>", "</s>", "<pad>", "<unk>", "<mask>"],
    )
    bpe.train([text_path], bpe_trainer)

    # Keep raw byte offsets intact so decoding round-trips losslessly.
    bpe.post_processor = processors.ByteLevel(trim_offsets=False)

    bpe.save(save_path)
    print(f"分词器已保存到 {save_path}")
    return bpe
def prepare_dataset(text_path, tokenizer, context_length=4096, stride=2048, save_dir="d:/图像/novel_data"):
    """Encode a text file and slice the token stream into training windows.

    Args:
        text_path: path to the UTF-8 text file.
        tokenizer: object whose ``encode(text)`` returns an encoding with an
            ``.ids`` token-id list (e.g. a ``tokenizers.Tokenizer``).
        context_length: number of tokens per training sample.
        stride: step between window starts; ``stride < context_length``
            produces overlapping samples.
        save_dir: directory where ``novel_dataset.pt`` is written
            (created if missing).

    Returns:
        A ``torch.LongTensor`` of shape ``(num_samples, context_length)``.

    Raises:
        ValueError: if the encoded text has fewer than ``context_length``
            tokens — previously this silently saved an empty, wrongly-shaped
            tensor.
    """
    os.makedirs(save_dir, exist_ok=True)

    with open(text_path, 'r', encoding='utf-8') as f:
        text = f.read()

    encoded = tokenizer.encode(text).ids
    if len(encoded) < context_length:
        raise ValueError(
            f"Text yields only {len(encoded)} tokens, fewer than "
            f"context_length={context_length}; no samples can be created."
        )

    # Off-by-one fix: the bound is `len(encoded) - context_length + 1` so the
    # last full window is kept — the original `len(encoded) - context_length`
    # always dropped it (a text of exactly context_length tokens gave zero
    # samples).
    samples = [
        encoded[start:start + context_length]
        for start in range(0, len(encoded) - context_length + 1, stride)
    ]

    samples_tensor = torch.tensor(samples, dtype=torch.long)
    torch.save(samples_tensor, os.path.join(save_dir, "novel_dataset.pt"))
    print(f"已创建 {len(samples)} 个训练样本,保存到 {save_dir}")
    return samples_tensor
if __name__ == "__main__":
novel_path = "d:/图像/0168319.txt" # 您的小说文件路径
# 训练分词器
tokenizer = train_tokenizer(novel_path)
# 准备数据集
prepare_dataset(novel_path, tokenizer)