import os

import torch
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers, processors


def train_tokenizer(text_path, vocab_size=8000, save_path="d:/图像/novel_tokenizer.json"):
    """Train a BPE tokenizer from the novel text."""
    tokenizer = Tokenizer(models.BPE())
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
    tokenizer.decoder = decoders.ByteLevel()
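
    # Special tokens are registered with the trainer so they are never split
    # during merging and receive fixed ids at the start of the vocabulary.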
    trainer = trainers.BpeTrainer(
        vocab_size=vocab_size,
        special_tokens=["<s>", "</s>", "<pad>", "<unk>", "<mask>"],
    )

    tokenizer.train([text_path], trainer)
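
    # trim_offsets=False keeps the byte-level offsets as-is (including the
    # leading space byte) instead of trimming them to visible characters.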
    tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)

    tokenizer.save(save_path)
    print(f"Tokenizer saved to {save_path}")

    return tokenizer


def prepare_dataset(text_path, tokenizer, context_length=4096, stride=2048, save_dir="d:/图像/novel_data"):
    """Convert the text into a training dataset."""
    os.makedirs(save_dir, exist_ok=True)

    with open(text_path, 'r', encoding='utf-8') as f:
        text = f.read()

    encoded = tokenizer.encode(text).ids
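
    # Slide a fixed-length window over the token stream; with stride set to
    # half of context_length, consecutive samples overlap by 50%, and a final
    # tail shorter than context_length is dropped.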
    samples = []
    for i in range(0, len(encoded) - context_length, stride):
        samples.append(encoded[i:i + context_length])

    samples_tensor = torch.tensor(samples, dtype=torch.long)
    torch.save(samples_tensor, os.path.join(save_dir, "novel_dataset.pt"))

    print(f"Created {len(samples)} training samples, saved to {save_dir}")
    return samples_tensor


if __name__ == "__main__":
    novel_path = "d:/图像/0168319.txt"

    tokenizer = train_tokenizer(novel_path)

    prepare_dataset(novel_path, tokenizer)
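
# A minimal sketch of reloading the saved artifacts later, e.g. in a training
# script (paths assumed to match the defaults above):
#
#   from tokenizers import Tokenizer
#   tokenizer = Tokenizer.from_file("d:/图像/novel_tokenizer.json")
#   dataset = torch.load("d:/图像/novel_data/novel_dataset.pt")
#   print(dataset.shape)  # (num_samples, context_length)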