File size: 1,659 Bytes
a5fd608
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
"""诗歌数据集分词器模块

提供诗歌数据集专用的分词器实现。
"""

import pathlib

from keras import layers


def load_vocabulary(vocab_path: pathlib.Path):
    """Load a vocabulary from a UTF-8 text file holding one token per line.

    A line consisting of a backslash followed by the letter ``n`` is the
    on-disk escape for the newline token and is decoded to a real newline
    character. Blank lines are kept as empty-string tokens.

    Args:
        vocab_path: Path to the vocabulary file.

    Returns:
        The list of tokens, in file order.
    """

    def extract_word(line: str) -> str:
        # Strip only the line terminator. The last line of a file may not end
        # with one, and slicing off the final character unconditionally would
        # truncate (or, for a single character, erase) that token.
        word = line[:-1] if line.endswith("\n") else line
        # Decode the escaped two-character form back into a newline token.
        return "\n" if word == r"\n" else word

    with open(vocab_path, "r", encoding="utf-8") as f:
        return [extract_word(line) for line in f]


def load_vectorizer(
    vocab_path: pathlib.Path, sequence_length: int = 101
) -> layers.TextVectorization:
    """Build a character-level TextVectorization layer from a vocabulary file.

    Args:
        vocab_path: Path to the vocabulary file, read via ``load_vocabulary``.
        sequence_length: Value passed as ``output_sequence_length``.
            Defaults to 101; per the original author's note, the extra
            position lets training build input and target sequences that
            are offset by one.

    Returns:
        A TextVectorization layer whose vocabulary has been set from the file.
    """
    layer_config = {
        "output_mode": "int",
        "split": "character",
        "output_sequence_length": sequence_length,
        # Standardization is disabled explicitly for this layer.
        "standardize": None,
    }
    text_layer = layers.TextVectorization(**layer_config)
    text_layer.set_vocabulary(load_vocabulary(vocab_path))
    return text_layer


def create_vectorizer(sequence_length: int = 101) -> layers.TextVectorization:
    """Create a new character-level TextVectorization layer with no vocabulary set.

    Per the original author's note, this layer is intended for learning a
    vocabulary from data.

    Args:
        sequence_length: Value passed as ``output_sequence_length``.
            Defaults to 101.

    Returns:
        A TextVectorization layer configured for integer output, character
        splitting and no standardization.
    """
    return layers.TextVectorization(
        output_mode="int",
        split="character",
        # Forward the documented parameter; it was previously accepted but
        # never passed to the layer.
        output_sequence_length=sequence_length,
        standardize=None,
    )