"""Contains tokenizers like GloveTokenizers and BERT Tokenizer."""
import torch
# from torchtext.vocab import GloVe
# from torchtext.data import Field, TabularDataset
from src.utils.mapper import configmapper
from transformers import AutoTokenizer


class Tokenizer:
    """Abstract class for tokenizers."""

    def tokenize(self):
        """Abstract method for tokenization."""
        raise NotImplementedError
@configmapper.map("tokenizers", "glove")
class GloveTokenizer(Tokenizer):
"""Implement GloveTokenizer for tokenizing text for Glove Embeddings.
Attributes:
embeddings (torchtext.vocab.Vectors): Loaded pre-trained embeddings.
text_field (torchtext.data.Field): Text_field for vector creation.
Methods:
__init__(self, name='840B', dim='300', cache='../embeddings/') : Constructor method
initialize_vectors(fix_length=4, tokenize='spacy', file_path="../data/imperceptibility
/Concreteness Ratings/train/forty.csv",
file_format='tsv', fields=None): Initialize vocab vectors based on data.
tokenize(x_input, **initializer_params): Tokenize given input and return the output.
"""
def __init__(self, name="840B", dim="300", cache="../embeddings/"):
"""Construct GloveTokenizer.
Args:
name (str): Name of the GloVe embedding file
dim (str): Dimensions of the Glove embedding file
cache (str): Path to the embeddings directory
"""
super(GloveTokenizer, self).__init__()
self.embeddings = GloVe(name=name, dim=dim, cache=cache)
self.text_field = None

    def initialize_vectors(
        self,
        fix_length=4,
        tokenize="spacy",
        tokenizer_file_paths=None,
        file_format="tsv",
        fields=None,
    ):
        """Initialize the vocabulary vectors from data, based on the GloVe embeddings.

        Args:
            fix_length (int): Length of the tokenized text; padding or cropping
                is applied accordingly.
            tokenize (function or str): Method used to tokenize the data.
                If 'spacy', the spaCy tokenizer is used; otherwise the given callable.
            tokenizer_file_paths (list of str): Paths of the files containing the data.
            file_format (str): Format of the files: 'csv', 'tsv' or 'json'.
            fields (list): Fields to be read and processed from the files
                (see the documentation for torchtext.data.TabularDataset).
        """
        text_field = Field(batch_first=True, fix_length=fix_length, tokenize=tokenize)
        # Build one TabularDataset per data file, applying the text field to every requested column.
        tab_dats = [
            TabularDataset(
                path, format=file_format, fields={k: (k, text_field) for k in fields}
            )
            for path in tokenizer_file_paths
        ]
        text_field.build_vocab(*tab_dats)
        text_field.vocab.load_vectors(self.embeddings)
        self.text_field = text_field

    def tokenize(self, x_input, **initializer_params):
        """Tokenize the given input based on the initialized vectors.

        Initialize the vectors with the given parameters if they are not already initialized.

        Args:
            x_input (str): Unprocessed input text to be tokenized.
            **initializer_params: Keyword arguments passed to ``initialize_vectors``.

        Returns:
            x_output (torch.Tensor): Processed and tokenized text as a tensor of token indices.
        """
        if self.text_field is None:
            self.initialize_vectors(**initializer_params)
        try:
            x_output = torch.squeeze(
                self.text_field.process([self.text_field.preprocess(x_input)])
            )
        except Exception as e:
            # Log the offending input before re-raising, so failures are easy to trace.
            print(x_input)
            print(self.text_field.preprocess(x_input))
            print(e)
            raise
        return x_output
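

# Usage sketch for GloveTokenizer (illustrative only: the file path, field name,
# and embedding settings below are assumptions, and the legacy torchtext
# Field/TabularDataset API is assumed to be available):
#
#     glove_tokenizer = GloveTokenizer(name="840B", dim="300", cache="../embeddings/")
#     glove_tokenizer.initialize_vectors(
#         fix_length=4,
#         tokenize="spacy",
#         tokenizer_file_paths=["../data/train.tsv"],
#         file_format="tsv",
#         fields=["text"],
#     )
#     token_ids = glove_tokenizer.tokenize("a sample sentence")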
@configmapper.map("tokenizers", "AutoTokenizer")
class AutoTokenizer(AutoTokenizer):
def __init__(self, *args):
super(AutoTokenizer, self).__init__()
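
# Usage sketch for the AutoTokenizer wrapper (illustrative; the checkpoint name is
# an assumption). transformers' AutoTokenizer is designed to be instantiated via
# ``from_pretrained`` rather than its constructor, and the wrapper inherits that
# classmethod unchanged:
#
#     tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
#     encoded = tokenizer("a sample sentence", return_tensors="pt")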