| """Classes for storing hyperparameters, data locations, etc."""
|
|
|
| from __future__ import absolute_import
|
| from __future__ import division
|
| from __future__ import print_function
|
|
|
| import json
|
| from os.path import join
|
| import tensorflow as tf
|
|
|
|
|
| class Config(object):
|
| """Stores everything needed to train a model."""
|
|
|
| def __init__(self, **kwargs):
    # general
    self.data_dir = './data'  # top-level directory for data and models
    self.model_name = 'default_model'  # name identifying the current model

    # mode
    self.mode = 'train'  # whether to "train" or "eval" the model
    self.task_names = ['chunk']  # tasks the model will learn; more than one
                                 # trains a multi-task model
    self.is_semisup = True  # whether to also train on unlabeled data
    self.for_preprocessing = False  # whether this config is used for
                                    # preprocessing the data

    # embeddings
    self.pretrained_embeddings = 'glove.6B.300d.txt'  # file of pretrained
                                                      # embeddings to load
    self.word_embedding_size = 300  # size of each word embedding

    # encoder
    self.use_chars = True  # whether to include a character-level CNN
    self.char_embedding_size = 50  # size of each character embedding
    self.char_cnn_filter_widths = [2, 3, 4]  # filter widths for the char CNN
    self.char_cnn_n_filters = 100  # number of filters per filter width
    self.unidirectional_sizes = [1024]  # sizes of unidirectional LSTM layers
    self.bidirectional_sizes = [512]  # sizes of bidirectional LSTM layers
    self.projection_size = 512  # size of projection layers

    # dependency parsing
    self.depparse_projection_size = 128  # size of the representations used
                                         # for dependency parsing

    # tagging
    self.label_encoding = 'BIOES'  # label encoding scheme for tagging tasks
    self.label_smoothing = 0.1  # label smoothing rate for tagging tasks

    # optimization
    self.lr = 0.5  # base learning rate
    self.momentum = 0.9  # momentum for the optimizer
    self.grad_clip = 1.0  # maximum gradient norm during training
    self.warm_up_steps = 5000.0  # linearly ramp up the lr for this many steps
    self.lr_decay = 0.005  # factor for gradually decaying the lr

    # EMA (exponential moving average of the model weights)
    self.ema_decay = 0.998  # EMA coefficient for averaged model weights
    self.ema_test = True  # whether to use EMA weights at test time
    self.ema_teacher = False  # whether to use EMA weights for the teacher

    # regularization
    self.labeled_keep_prob = 0.5  # 1 - dropout rate on labeled examples
    self.unlabeled_keep_prob = 0.8  # 1 - dropout rate on unlabeled examples

    # sizing
    self.max_sentence_length = 100  # maximum sentence length (in words)
    self.max_word_length = 20  # maximum word length (in characters)
    self.train_batch_size = 64  # batch size for training
    self.test_batch_size = 64  # batch size for evaluation
    self.buckets = [(0, 15), (15, 40), (40, 1000)]  # sentence-length buckets
                                                    # used for batching

    # training
    self.print_every = 25  # how often (in steps) to print training progress
    self.eval_dev_every = 500  # how often to evaluate on the dev set
    self.eval_train_every = 2000  # how often to evaluate on the train set
    self.save_model_every = 1000  # how often to checkpoint the model

    # data set
    self.train_set_percent = 100  # percent of the training set to use

    # override the defaults with any passed-in keyword arguments
    for k, v in kwargs.items():
      if k not in self.__dict__:
        raise ValueError('Unknown argument: ' + k)
      self.__dict__[k] = v

    self.dev_set = self.mode == 'train'  # evaluate on the dev set when
                                         # training, the test set otherwise

    # locations of the raw (input) data
    self.raw_data_topdir = join(self.data_dir, 'raw_data')
    self.unsupervised_data = join(
        self.raw_data_topdir,
        'unlabeled_data',
        '1-billion-word-language-modeling-benchmark-r13output',
        'training-monolingual.tokenized.shuffled')
    self.pretrained_embeddings_file = join(
        self.raw_data_topdir, 'pretrained_embeddings',
        self.pretrained_embeddings)

    # locations of preprocessed data
    self.preprocessed_data_topdir = join(self.data_dir, 'preprocessed_data')
    self.embeddings_dir = join(self.preprocessed_data_topdir,
                               self.pretrained_embeddings.rsplit('.', 1)[0])
    self.word_vocabulary = join(self.embeddings_dir, 'word_vocabulary.pkl')
    self.word_embeddings = join(self.embeddings_dir, 'word_embeddings.pkl')

    # locations of model outputs (checkpoints, summaries, training history)
    self.model_dir = join(self.data_dir, 'models', self.model_name)
    self.checkpoints_dir = join(self.model_dir, 'checkpoints')
    self.checkpoint = join(self.checkpoints_dir, 'checkpoint.ckpt')
    self.best_model_checkpoints_dir = join(
        self.model_dir, 'best_model_checkpoints')
    self.best_model_checkpoint = join(
        self.best_model_checkpoints_dir, 'checkpoint.ckpt')
    self.progress = join(self.checkpoints_dir, 'progress.pkl')
    self.summaries_dir = join(self.model_dir, 'summaries')
    self.history_file = join(self.model_dir, 'history.pkl')
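
    # Resulting layout under data_dir (a summary inferred from the join()
    # calls above, assuming the default './data'):
    #   ./data/raw_data/{unlabeled_data, pretrained_embeddings}/...
    #   ./data/preprocessed_data/<embeddings name>/*.pkl
    #   ./data/models/<model_name>/{checkpoints, best_model_checkpoints,
    #                               summaries}/...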

  def write(self):
    """Writes the config as JSON to <model_dir>/config.json."""
    tf.gfile.MakeDirs(self.model_dir)
    with open(join(self.model_dir, 'config.json'), 'w') as f:
      f.write(json.dumps(self.__dict__, sort_keys=True, indent=4,
                         separators=(',', ': ')))
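
# A minimal usage sketch, not part of the original module: it constructs a
# Config with a couple of overridden defaults and writes it to disk. Assumes
# a TensorFlow 1.x installation (for tf.gfile); the overridden names below
# are real attributes defined in Config.__init__, but the values are
# illustrative only.
if __name__ == '__main__':
  config = Config(model_name='chunking_model', lr=0.3)
  print('model directory:', config.model_dir)
  config.write()  # writes config.json under ./data/models/chunking_model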