File size: 3,979 Bytes
cc9c7ee |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 |
from src.modules.tokenizers import *
from src.modules.embeddings import *
from src.utils.mapper import configmapper
class Preprocessor:
    """Abstract base class for preprocessors.

    Subclasses build a tokenizer (and optionally embeddings) from a config
    object and implement ``preprocess`` to return a model together with
    train/validation datasets.
    """

    def preprocess(self, *args, **kwargs):
        """Prepare a model and datasets; overridden by subclasses.

        Accepts arbitrary arguments so the base signature stays compatible
        with subclass overrides such as
        ``preprocess(model_config, data_config)``. The base implementation
        is a no-op and returns ``None``.
        """
        pass
@configmapper.map("preprocessors", "glove")
class GlovePreprocessor(Preprocessor):
    """GlovePreprocessor."""

    def __init__(self, config):
        """
        Args:
            config (src.utils.module.Config): configuration for preprocessor
        """
        super(GlovePreprocessor, self).__init__()
        self.config = config

        # Look up the tokenizer class in the registry, construct it, and
        # initialise its vocabulary vectors from the configured parameters.
        tokenizer_cfg = self.config.main.preprocessor.tokenizer
        tokenizer_cls = configmapper.get_object("tokenizers", tokenizer_cfg.name)
        self.tokenizer = tokenizer_cls(**tokenizer_cfg.init_params.as_dict())
        self.tokenizer_params = tokenizer_cfg.init_vector_params.as_dict()
        self.tokenizer.initialize_vectors(**self.tokenizer_params)

        # Embeddings are built from the tokenizer's vocabulary vectors; the
        # pad token's vocabulary index is passed as the second argument.
        vocab = self.tokenizer.text_field.vocab
        embedding_cls = configmapper.get_object(
            "embeddings", self.config.main.preprocessor.embedding.name
        )
        self.embeddings = embedding_cls(
            vocab.vectors,
            vocab.stoi[self.tokenizer.text_field.pad_token],
        )

    def preprocess(self, model_config, data_config):
        """Build train/val datasets and instantiate the model from config."""
        dataset_cls = configmapper.get_object("datasets", data_config.main.name)
        train_dataset = dataset_cls(data_config.train, self.tokenizer)
        val_dataset = dataset_cls(data_config.val, self.tokenizer)

        model_cls = configmapper.get_object("models", model_config.name)
        model = model_cls(self.embeddings, **model_config.params.as_dict())
        return model, train_dataset, val_dataset
@configmapper.map("preprocessors", "clozePreprocessor")
class ClozePreprocessor(Preprocessor):
    """ClozePreprocessor.

    Loads a pretrained tokenizer and model via ``from_pretrained`` for
    cloze-style tasks.
    """

    def __init__(self, config):
        """
        Args:
            config (src.utils.module.Config): configuration for preprocessor
        """
        super(ClozePreprocessor, self).__init__()
        self.config = config
        # Unlike GlovePreprocessor, the tokenizer class is not constructed
        # directly; it is loaded from pretrained weights/files via its
        # ``from_pretrained`` classmethod.
        self.tokenizer = configmapper.get_object(
            "tokenizers", self.config.main.preprocessor.tokenizer.name
        ).from_pretrained(
            **self.config.main.preprocessor.tokenizer.init_params.as_dict()
        )

    def preprocess(self, model_config, data_config):
        """Build train/val datasets and load the pretrained model.

        Args:
            model_config: config exposing ``name`` and ``params`` for the model.
            data_config: config exposing ``main.name``, ``train``, and ``val``.

        Returns:
            tuple: (model, train_dataset, val_dataset)
        """
        train_dataset = configmapper.get_object("datasets", data_config.main.name)(
            data_config.train, self.tokenizer
        )
        val_dataset = configmapper.get_object("datasets", data_config.main.name)(
            data_config.val, self.tokenizer
        )
        # The model, like the tokenizer, is loaded from pretrained weights.
        model = configmapper.get_object("models", model_config.name).from_pretrained(
            **model_config.params.as_dict()
        )
        return model, train_dataset, val_dataset
@configmapper.map("preprocessors", "transformersConcretenessPreprocessor")
class TransformersConcretenessPreprocessor(Preprocessor):
    """TransformersConcretenessPreprocessor.

    Loads a pretrained tokenizer via ``from_pretrained`` and constructs the
    model directly from config params (unlike ClozePreprocessor, whose model
    is also loaded from pretrained weights).
    """

    def __init__(self, config):
        """
        Args:
            config (src.utils.module.Config): configuration for preprocessor
        """
        super(TransformersConcretenessPreprocessor, self).__init__()
        self.config = config
        # Tokenizer is loaded from pretrained weights/files via its
        # ``from_pretrained`` classmethod.
        self.tokenizer = configmapper.get_object(
            "tokenizers", self.config.main.preprocessor.tokenizer.name
        ).from_pretrained(
            **self.config.main.preprocessor.tokenizer.init_params.as_dict()
        )

    def preprocess(self, model_config, data_config):
        """Build train/val datasets and instantiate the model.

        Args:
            model_config: config exposing ``name`` and ``params`` for the model.
            data_config: config exposing ``main.name``, ``train``, and ``val``.

        Returns:
            tuple: (model, train_dataset, val_dataset)
        """
        train_dataset = configmapper.get_object("datasets", data_config.main.name)(
            data_config.train, self.tokenizer
        )
        val_dataset = configmapper.get_object("datasets", data_config.main.name)(
            data_config.val, self.tokenizer
        )
        # Model is constructed (not from_pretrained) with configured params.
        model = configmapper.get_object("models", model_config.name)(
            **model_config.params.as_dict()
        )
        return model, train_dataset, val_dataset
|