"""Contains tokenizers like GloveTokenizers and BERT Tokenizer."""
import torch
# from torchtext.vocab import GloVe
# from torchtext.data import Field, TabularDataset
from src.utils.mapper import configmapper
from transformers import AutoTokenizer


class Tokenizer:
    """Abstract class for tokenizers."""

    def tokenize(self):
        """Abstract method for tokenization."""
        raise NotImplementedError
@configmapper.map("tokenizers", "glove")
class GloveTokenizer(Tokenizer):
"""Implement GloveTokenizer for tokenizing text for Glove Embeddings.
Attributes:
embeddings (torchtext.vocab.Vectors): Loaded pre-trained embeddings.
text_field (torchtext.data.Field): Text_field for vector creation.
Methods:
__init__(self, name='840B', dim='300', cache='../embeddings/') : Constructor method
initialize_vectors(fix_length=4, tokenize='spacy', file_path="../data/imperceptibility
/Concreteness Ratings/train/forty.csv",
file_format='tsv', fields=None): Initialize vocab vectors based on data.
tokenize(x_input, **initializer_params): Tokenize given input and return the output.
"""
def __init__(self, name="840B", dim="300", cache="../embeddings/"):
"""Construct GloveTokenizer.
Args:
name (str): Name of the GloVe embedding file
dim (str): Dimensions of the Glove embedding file
cache (str): Path to the embeddings directory
"""
super(GloveTokenizer, self).__init__()
self.embeddings = GloVe(name=name, dim=dim, cache=cache)
self.text_field = None

    def initialize_vectors(
        self,
        fix_length=4,
        tokenize="spacy",
        tokenizer_file_paths=None,
        file_format="tsv",
        fields=None,
    ):
        """Initialize the vocabulary vectors from data, based on the GloVe embeddings.

        Args:
            fix_length (int): Length of the tokenized text; padding or cropping
                is applied accordingly.
            tokenize (function or str): Method used to tokenize the data.
                If 'spacy', the spaCy tokenizer is used; otherwise the given callable.
            tokenizer_file_paths (list of str): Paths of the files containing the data.
            file_format (str): Format of the files: 'csv', 'tsv' or 'json'.
            fields (list): Fields to be read and processed from the files
                (see the documentation for torchtext.data.TabularDataset).
        """
        text_field = Field(batch_first=True, fix_length=fix_length, tokenize=tokenize)
        # Build one TabularDataset per data file, applying the text field to every requested column.
        tab_dats = [
            TabularDataset(
                path, format=file_format, fields={k: (k, text_field) for k in fields}
            )
            for path in tokenizer_file_paths
        ]
        text_field.build_vocab(*tab_dats)
        text_field.vocab.load_vectors(self.embeddings)
        self.text_field = text_field

    def tokenize(self, x_input, **initializer_params):
        """Tokenize the given input based on the initialized vectors.

        Initialize the vectors with the given parameters if they are not already initialized.

        Args:
            x_input (str): Unprocessed input text to be tokenized.
            **initializer_params: Keyword arguments passed to ``initialize_vectors``.

        Returns:
            x_output (torch.Tensor): Processed and tokenized text as a tensor of token indices.
        """
        if self.text_field is None:
            self.initialize_vectors(**initializer_params)
        try:
            x_output = torch.squeeze(
                self.text_field.process([self.text_field.preprocess(x_input)])
            )
        except Exception as e:
            # Log the offending input before re-raising, so failures are easy to trace.
            print(x_input)
            print(self.text_field.preprocess(x_input))
            print(e)
            raise
        return x_output
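

# Usage sketch for GloveTokenizer (illustrative only: the file path, field name,
# and embedding settings below are assumptions, and the legacy torchtext
# Field/TabularDataset API is assumed to be available):
#
#     glove_tokenizer = GloveTokenizer(name="840B", dim="300", cache="../embeddings/")
#     glove_tokenizer.initialize_vectors(
#         fix_length=4,
#         tokenize="spacy",
#         tokenizer_file_paths=["../data/train.tsv"],
#         file_format="tsv",
#         fields=["text"],
#     )
#     token_ids = glove_tokenizer.tokenize("a sample sentence")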
@configmapper.map("tokenizers", "AutoTokenizer")
class AutoTokenizer(AutoTokenizer):
def __init__(self, *args):
super(AutoTokenizer, self).__init__()
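
# Usage sketch for the AutoTokenizer wrapper (illustrative; the checkpoint name is
# an assumption). transformers' AutoTokenizer is designed to be instantiated via
# ``from_pretrained`` rather than its constructor, and the wrapper inherits that
# classmethod unchanged:
#
#     tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
#     encoded = tokenizer("a sample sentence", return_tensors="pt")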