File size: 3,977 Bytes
52da7b7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import re
from collections import Counter

from .linalg import Matrix, np, zeros

# Word tokens are runs of ASCII letters, digits, and apostrophes.
TOKEN_PATTERN = re.compile(r"[A-Za-z0-9']+")
# SentencePiece-style marker prefixing word-initial subword tokens.
FRAMETOKEN_WORD_PREFIX = "▁"


def tokenize(text: str) -> list[str]:
    """Lower-case *text* and return all TOKEN_PATTERN matches in order."""
    lowered = text.lower()
    return TOKEN_PATTERN.findall(lowered)


def build_vocabulary(
    tokens: list[str],
    min_frequency: int = 1,
    max_vocab: int | None = None,
) -> tuple[dict[str, int], list[str]]:
    """Count *tokens* and build a vocabulary from their frequencies.

    Thin wrapper over ``build_vocabulary_from_counts``; see that function
    for ordering, filtering, and capping behavior.
    """
    frequency = Counter(tokens)
    return build_vocabulary_from_counts(
        frequency, min_frequency=min_frequency, max_vocab=max_vocab
    )


def build_vocabulary_from_counts(
    counts: dict[str, float],
    min_frequency: int = 1,
    max_vocab: int | None = None,
) -> tuple[dict[str, int], list[str]]:
    items = [
        (token, count)
        for token, count in sorted(counts.items(), key=lambda pair: (-pair[1], pair[0]))
        if count >= min_frequency
    ]
    if max_vocab is not None:
        if any(_looks_like_frametoken(token) for token, _ in items):
            items = _prioritize_frametoken_output_items(items)[:max_vocab]
        else:
            items = items[:max_vocab]

    id_to_token = [token for token, _ in items]
    token_to_id = {token: index for index, token in enumerate(id_to_token)}
    return token_to_id, id_to_token


def _looks_like_frametoken(token: str) -> bool:
    """Heuristic: a ▁-prefixed subword or a ``<...>`` special token."""
    if token.startswith(FRAMETOKEN_WORD_PREFIX):
        return True
    return token.startswith("<") and token.endswith(">")


def _is_special_token(token: str) -> bool:
    return token.startswith("<") and token.endswith(">")


def _is_word_start_token(token: str) -> bool:
    """True when *token* carries the ▁ word-start marker."""
    prefix = FRAMETOKEN_WORD_PREFIX
    return token[: len(prefix)] == prefix


def _is_single_letter_word_start(token: str) -> bool:
    """True for ▁-prefixed tokens whose body is one letter other than "A" or "I"."""
    prefix = FRAMETOKEN_WORD_PREFIX
    if not token.startswith(prefix):
        return False
    body = token[len(prefix):]
    if len(body) != 1 or not body.isalpha():
        return False
    # "A" and "I" are real one-letter English words; everything else is noise.
    return body not in ("A", "I")


def _is_bare_fallback_token(token: str) -> bool:
    """True for single-character tokens lacking the ▁ word marker."""
    if len(token) != 1:
        return False
    return not token.startswith(FRAMETOKEN_WORD_PREFIX)


def _prioritize_frametoken_output_items(items: list[tuple[str, float]]) -> list[tuple[str, float]]:
    """Reorder vocabulary items so answerable FrameToken entries come first.

    FrameToken keeps fallback characters for encoding coverage, but the
    model's output/readout vocabulary should spend its capped slots on
    answerable tokens. Group order: specials (0), word-start subwords (1),
    continuations (2), single-letter word starts (3), bare fallback
    characters (4); within a group, higher count first, then alphabetical.
    """
    def _group(token: str) -> int:
        # Check order matters: single-letter word starts must be classified
        # before the generic word-start test catches them.
        if _is_special_token(token):
            return 0
        if _is_single_letter_word_start(token):
            return 3
        if _is_word_start_token(token):
            return 1
        if _is_bare_fallback_token(token):
            return 4
        return 2

    return sorted(items, key=lambda item: (_group(item[0]), -item[1], item[0]))


def build_cooccurrence_matrix(
    tokens: list[str],
    token_to_id: dict[str, int],
    window_size: int,
) -> Matrix:
    """Return a symmetric co-occurrence matrix over the vocabulary.

    Every ordered pair of in-vocabulary tokens at most *window_size*
    positions apart contributes ``1 / offset`` to both (i, j) and (j, i).
    Out-of-vocabulary tokens are skipped (and do not occupy window slots).
    Takes a vectorized NumPy path when available; otherwise falls back to
    a pure-Python double loop. Both paths return a list-of-lists matrix.
    """
    vocab_size = len(token_to_id)
    ids = [token_to_id[token] for token in tokens if token in token_to_id]

    if np is not None and vocab_size > 0 and ids:
        counts = np.zeros((vocab_size, vocab_size), dtype=np.float64)
        id_array = np.asarray(ids, dtype=np.int64)
        for offset in range(1, window_size + 1):
            if offset >= len(id_array):
                break
            earlier = id_array[:-offset]
            later = id_array[offset:]
            # np.add.at performs an unbuffered add, so repeated index
            # pairs within one offset all accumulate correctly.
            np.add.at(counts, (earlier, later), 1.0 / offset)
            np.add.at(counts, (later, earlier), 1.0 / offset)
        return counts.tolist()

    matrix = zeros(vocab_size, vocab_size)
    total = len(ids)
    for position, center_id in enumerate(ids):
        # Only look rightward; symmetry is written explicitly below.
        last = min(position + window_size, total - 1)
        for neighbor in range(position + 1, last + 1):
            neighbor_id = ids[neighbor]
            weight = 1.0 / (neighbor - position)
            matrix[center_id][neighbor_id] += weight
            matrix[neighbor_id][center_id] += weight
    return matrix