yetrun's picture
ver1: 实现深度学习训练框架,支持 Wiki GPT 与诗歌生成双任务
a5fd608
"""诗歌数据集分词器模块
提供诗歌数据集专用的分词器实现。
"""
import pathlib
from keras import layers
def load_vocabulary(vocab_path: pathlib.Path) -> list:
    """Load a vocabulary from a text file that stores one token per line.

    The newline character cannot occupy a line of its own, so the file
    represents it with a two-character escape (a backslash followed by
    the letter n); that entry is converted back to a real newline here.

    Args:
        vocab_path: Path of the vocabulary file, read as UTF-8.

    Returns:
        The tokens in file order. A blank line yields an empty-string
        token, so positions in the list match line numbers in the file.
    """

    def extract_word(line: str) -> str:
        # Strip only a genuine line terminator. The last line of a file
        # may have none, and an unconditional slice would then cut off the
        # final character of that token.
        word = line[:-1] if line.endswith("\n") else line
        # Undo the escape used to store the newline token on one line.
        return "\n" if word == r"\n" else word

    with open(vocab_path, "r", encoding="utf-8") as f:
        return [extract_word(line) for line in f]
def load_vectorizer(
    vocab_path: pathlib.Path, sequence_length: int = 101
) -> layers.TextVectorization:
    """Build a character-level tokenizer from a saved vocabulary file.

    Args:
        vocab_path: Path of the vocabulary file, passed to
            ``load_vocabulary``.
        sequence_length: Value used for the layer's
            ``output_sequence_length``; defaults to 101 (one extra
            position so that training can build input and target
            sequences shifted by one).

    Returns:
        A ``TextVectorization`` layer with the loaded vocabulary set.
    """
    # Text is split per character and left unstandardized before being
    # mapped to integer ids.
    layer = layers.TextVectorization(
        standardize=None,
        split="character",
        output_mode="int",
        output_sequence_length=sequence_length,
    )
    layer.set_vocabulary(load_vocabulary(vocab_path))
    return layer
def create_vectorizer(sequence_length: int = 101) -> layers.TextVectorization:
    """Create a fresh character-level tokenizer with no vocabulary set.

    Intended for building a vocabulary from raw text. The layer is
    configured with the same options that ``load_vectorizer`` uses.

    Args:
        sequence_length: Value used for the layer's
            ``output_sequence_length``; defaults to 101.

    Returns:
        A ``TextVectorization`` layer without a vocabulary.
    """
    return layers.TextVectorization(
        output_mode="int",
        split="character",
        # Previously this argument was accepted but never forwarded, so
        # the documented parameter had no effect on the returned layer.
        output_sequence_length=sequence_length,
        standardize=None,
    )