Reddit / models /reddit_tokenizer.py
cyrilfrl's picture
Upload 14 files
690dc6b verified
# standard-library imports
from functools import lru_cache

# third-party imports
from transformers import AutoTokenizer
# constants import
# tokenizer definition
@lru_cache(maxsize=8)
def _load_tokenizer(model_name: str):
    """Load the pretrained tokenizer for *model_name*, once per process."""
    # from_pretrained reads (and possibly downloads) the tokenizer files, so
    # the result is memoised instead of being rebuilt for every text.
    return AutoTokenizer.from_pretrained(model_name)


def text_tokenizer(
    text: str,
    max_length: int = 512,
    model_name: str = "bert-base-uncased",
):
    """Tokenize *text* with a pretrained tokenizer.

    The output is padded to exactly ``max_length`` tokens, truncated when
    the input is longer, and returned as PyTorch tensors.

    Args:
        text: The text to tokenize.
        max_length: Length every encoded sequence is padded/truncated to.
        model_name: Name or path of the pretrained tokenizer to use.
            Defaults to ``"bert-base-uncased"``.

    Returns:
        Whatever the tokenizer call produces for ``return_tensors="pt"``.
    """
    tokenizer = _load_tokenizer(model_name)
    return tokenizer(
        text,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )