smartcore-v1 / code /referans_kod /tokenizer.py
kdirgul's picture
referans_kod Colab için
ad9632f verified
Raw
History Blame Contribute Delete
1.44 kB
"""GPT-2 tokenizer wrapper.
We use the HuggingFace fast tokenizer because its byte-level BPE is fully
deterministic given a fixed `transformers` version and produces the same
output across CPU/GPU and Python releases.
"""
from __future__ import annotations
from typing import Iterable, List
from transformers import AutoTokenizer, PreTrainedTokenizerBase
def load_tokenizer(name: str = "gpt2") -> PreTrainedTokenizerBase:
    """Return the fast tokenizer for ``name`` with a pad token guaranteed set.

    GPT-2 ships without a pad token, so when none is defined the EOS token
    is reused as padding. This mirrors the usual HuggingFace causal-LM
    idiom; it is harmless as long as padded positions are excluded through
    the attention mask.

    Args:
        name: Model identifier handed to ``AutoTokenizer.from_pretrained``.

    Returns:
        The loaded tokenizer; its ``pad_token`` is set to ``eos_token`` if
        it was ``None`` after loading, otherwise it is left untouched.
    """
    tokenizer = AutoTokenizer.from_pretrained(name, use_fast=True)
    # A tokenizer that already defines padding keeps its own pad token.
    if tokenizer.pad_token is not None:
        return tokenizer
    tokenizer.pad_token = tokenizer.eos_token
    return tokenizer
def encode(
    tokenizer: PreTrainedTokenizerBase,
    text: str,
    add_eos: bool = True,
) -> List[int]:
    """Encode a single document into a list of token IDs.

    Args:
        tokenizer: Tokenizer exposing ``encode(text, add_special_tokens=...)``
            and an ``eos_token_id`` attribute.
        text: The document to tokenize.
        add_eos: When true, append the tokenizer's EOS token ID after the
            document's own tokens.

    Returns:
        A new list of token IDs. The tokenizer is called with
        ``add_special_tokens=False``, so the only special token present is
        the optional trailing EOS added here.

    Raises:
        ValueError: If ``add_eos`` is true but the tokenizer defines no EOS
            token (``eos_token_id`` is ``None``).
    """
    if add_eos and tokenizer.eos_token_id is None:
        # Fail at the source instead of silently placing ``None`` in a list
        # of ints, which would only blow up later and far from the cause.
        raise ValueError(
            "add_eos=True requires a tokenizer with an EOS token, "
            "but tokenizer.eos_token_id is None"
        )
    # Copy so we always return (and append to) a fresh list of our own.
    ids = list(tokenizer.encode(text, add_special_tokens=False))
    if add_eos:
        ids.append(tokenizer.eos_token_id)
    return ids
def encode_stream(
    tokenizer: PreTrainedTokenizerBase,
    docs: Iterable[str],
    add_eos: bool = True,
) -> Iterable[List[int]]:
    """Lazily tokenize every document in ``docs``, in input order.

    This is a generator: nothing is read from ``docs`` until the result is
    iterated, and documents are encoded one at a time as they are requested.

    Args:
        tokenizer: Tokenizer forwarded unchanged to ``encode``.
        docs: Source of document strings.
        add_eos: Forwarded to ``encode`` for each document.

    Yields:
        The token-ID list produced by ``encode`` for each document.
    """
    yield from (encode(tokenizer, text, add_eos=add_eos) for text in docs)