File size: 3,979 Bytes
cc9c7ee
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
from src.modules.tokenizers import *
from src.modules.embeddings import *
from src.utils.mapper import configmapper


class Preprocessor:
    """Base class for preprocessors.

    Subclasses override :meth:`preprocess` to build a model together with
    its training and validation datasets.
    """

    def preprocess(self, model_config=None, data_config=None):
        """Do nothing; placeholder meant to be overridden by subclasses.

        The parameters mirror the ``(model_config, data_config)`` signature
        used by the overrides in this module, so code written against the
        base interface can pass the same arguments. Both are optional so
        existing zero-argument calls keep working.

        Args:
            model_config: model configuration; ignored by the base class.
            data_config: data configuration; ignored by the base class.

        Returns:
            None.
        """
        return None


@configmapper.map("preprocessors", "glove")
class GlovePreprocessor(Preprocessor):
    """Preprocessor registered as ``"glove"``.

    Owns a tokenizer whose vectors are initialised from the configuration
    and an embeddings object built from that tokenizer's vocabulary.
    """

    def __init__(self, config):
        """Create the tokenizer, initialise its vectors and build embeddings.

        Args:
            config (src.utils.module.Config): configuration for preprocessor
        """
        super(GlovePreprocessor, self).__init__()
        self.config = config

        # Resolve the tokenizer class from the registry, then instantiate it
        # with the keyword arguments stored under ``init_params``.
        tokenizer_cls = configmapper.get_object(
            "tokenizers", self.config.main.preprocessor.tokenizer.name
        )
        tokenizer_kwargs = (
            self.config.main.preprocessor.tokenizer.init_params.as_dict()
        )
        self.tokenizer = tokenizer_cls(**tokenizer_kwargs)

        # Kept on the instance, exactly as the vectors were initialised.
        self.tokenizer_params = (
            self.config.main.preprocessor.tokenizer.init_vector_params.as_dict()
        )
        self.tokenizer.initialize_vectors(**self.tokenizer_params)

        # The embeddings object receives the vocabulary vectors and the
        # index of the padding token in that vocabulary.
        embedding_cls = configmapper.get_object(
            "embeddings", self.config.main.preprocessor.embedding.name
        )
        vectors = self.tokenizer.text_field.vocab.vectors
        pad_index = self.tokenizer.text_field.vocab.stoi[
            self.tokenizer.text_field.pad_token
        ]
        self.embeddings = embedding_cls(vectors, pad_index)

    def preprocess(self, model_config, data_config):
        """Build the model and the train/validation datasets.

        Args:
            model_config: configuration exposing ``name`` (registry key under
                ``"models"``) and ``params`` (keyword arguments for the model).
            data_config: configuration exposing ``main.name`` (registry key
                under ``"datasets"``) plus ``train`` and ``val`` sections.

        Returns:
            tuple: ``(model, train_dataset, val_dataset)``.
        """
        train_cls = configmapper.get_object("datasets", data_config.main.name)
        train_dataset = train_cls(data_config.train, self.tokenizer)

        val_cls = configmapper.get_object("datasets", data_config.main.name)
        val_dataset = val_cls(data_config.val, self.tokenizer)

        # The model takes the embeddings as its first positional argument.
        model_cls = configmapper.get_object("models", model_config.name)
        model = model_cls(self.embeddings, **model_config.params.as_dict())

        return model, train_dataset, val_dataset


@configmapper.map("preprocessors", "clozePreprocessor")
class ClozePreprocessor(Preprocessor):
    """Preprocessor registered as ``"clozePreprocessor"``.

    Both the tokenizer and the model are obtained through the
    ``from_pretrained`` constructor of their registered classes.
    """

    def __init__(self, config):
        """Load the tokenizer described by the configuration.

        Args:
            config (src.utils.module.Config): configuration for preprocessor;
                ``config.main.preprocessor.tokenizer`` supplies the registry
                ``name`` and the ``init_params`` passed to ``from_pretrained``.
        """
        super().__init__()
        self.config = config

        tokenizer_config = self.config.main.preprocessor.tokenizer
        tokenizer_cls = configmapper.get_object("tokenizers", tokenizer_config.name)
        self.tokenizer = tokenizer_cls.from_pretrained(
            **tokenizer_config.init_params.as_dict()
        )

    def preprocess(self, model_config, data_config):
        """Build the model and the train/validation datasets.

        Args:
            model_config: configuration exposing ``name`` (registry key under
                ``"models"``) and ``params`` (keyword arguments forwarded to
                the model class's ``from_pretrained``).
            data_config: configuration exposing ``main.name`` (registry key
                under ``"datasets"``) plus ``train`` and ``val`` sections.

        Returns:
            tuple: ``(model, train_dataset, val_dataset)``.
        """
        # Both splits use the same registered dataset class; look it up once.
        dataset_cls = configmapper.get_object("datasets", data_config.main.name)
        train_dataset = dataset_cls(data_config.train, self.tokenizer)
        val_dataset = dataset_cls(data_config.val, self.tokenizer)

        model_cls = configmapper.get_object("models", model_config.name)
        model = model_cls.from_pretrained(**model_config.params.as_dict())

        return model, train_dataset, val_dataset


@configmapper.map("preprocessors", "transformersConcretenessPreprocessor")
class TransformersConcretenessPreprocessor(Preprocessor):
    """Preprocessor registered as ``"transformersConcretenessPreprocessor"``.

    The tokenizer is obtained through ``from_pretrained``; the model is
    instantiated directly from its registered class.
    """

    def __init__(self, config):
        """Load the tokenizer described by the configuration.

        Args:
            config (src.utils.module.Config): configuration for preprocessor;
                ``config.main.preprocessor.tokenizer`` supplies the registry
                ``name`` and the ``init_params`` passed to ``from_pretrained``.
        """
        super().__init__()
        self.config = config

        tokenizer_config = self.config.main.preprocessor.tokenizer
        tokenizer_cls = configmapper.get_object("tokenizers", tokenizer_config.name)
        self.tokenizer = tokenizer_cls.from_pretrained(
            **tokenizer_config.init_params.as_dict()
        )

    def preprocess(self, model_config, data_config):
        """Build the model and the train/validation datasets.

        Args:
            model_config: configuration exposing ``name`` (registry key under
                ``"models"``) and ``params`` (keyword arguments passed to the
                model class's constructor).
            data_config: configuration exposing ``main.name`` (registry key
                under ``"datasets"``) plus ``train`` and ``val`` sections.

        Returns:
            tuple: ``(model, train_dataset, val_dataset)``.
        """
        # Both splits use the same registered dataset class; look it up once.
        dataset_cls = configmapper.get_object("datasets", data_config.main.name)
        train_dataset = dataset_cls(data_config.train, self.tokenizer)
        val_dataset = dataset_cls(data_config.val, self.tokenizer)

        # Unlike the tokenizer, the model is built with a plain constructor
        # call rather than ``from_pretrained``.
        model_cls = configmapper.get_object("models", model_config.name)
        model = model_cls(**model_config.params.as_dict())

        return model, train_dataset, val_dataset