{
    "dataset_reader": {
        "type": "multitask",
        "readers": {
            "ud": {
                "type": "universal_dependencies",
                "token_indexers": {
                    "transformer": {
                        "type": "pretrained_transformer_mismatched",
                        "max_length": 512,
                        "model_name": "MLRS/BERTu"
                    }
                }
            }
        }
    },
    "model": {
        "type": "multitask",
        "arg_name_mapping": {
            "backbone": {
                "tokens": "text",
                "words": "text"
            }
        },
        "backbone": {
            "type": "embedder_and_mask",
            "text_field_embedder": {
                "token_embedders": {
                    "transformer": {
                        "type": "pretrained_transformer_mismatched_with_dropout",
                        "last_layer_only": false,
                        "layer_dropout": 0.1,
                        "max_length": 512,
                        "model_name": "MLRS/BERTu",
                        "tokenizer_kwargs": {},
                        "train_parameters": true
                    }
                }
            }
        },
        "heads": {
            "ud": {
                "type": "biaffine_parser",
                "arc_representation_dim": 100,
                "dropout": 0.3,
                "encoder": {
                    "type": "pass_through",
                    "input_dim": 768
                },
                "initializer": {
                    "regexes": [
                        [
                            ".*projection.*weight",
                            {
                                "type": "xavier_uniform"
                            }
                        ],
                        [
                            ".*projection.*bias",
                            {
                                "type": "zero"
                            }
                        ],
                        [
                            ".*tag_bilinear.*weight",
                            {
                                "type": "xavier_uniform"
                            }
                        ],
                        [
                            ".*tag_bilinear.*bias",
                            {
                                "type": "zero"
                            }
                        ],
                        [
                            ".*weight_ih.*",
                            {
                                "type": "xavier_uniform"
                            }
                        ],
                        [
                            ".*weight_hh.*",
                            {
                                "type": "orthogonal"
                            }
                        ],
                        [
                            ".*bias_ih.*",
                            {
                                "type": "zero"
                            }
                        ],
                        [
                            ".*bias_hh.*",
                            {
                                "type": "lstm_hidden_bias"
                            }
                        ]
                    ]
                },
                "input_dropout": 0.3,
                "tag_representation_dim": 100,
                "use_mst_decoding_for_validation": true
            }
        }
    },
    "train_data_path": {
        "ud": "ud-treebanks-v2.8/UD_Maltese-MUDT/mt_mudt-ud-train.conllu"
    },
    "validation_data_path": {
        "ud": "ud-treebanks-v2.8/UD_Maltese-MUDT/mt_mudt-ud-dev.conllu"
    },
    "trainer": {
        "callbacks": [
            {
                "tensorboard_writer": {
                    "should_log_learning_rate": true,
                    "should_log_parameter_statistics": true
                },
                "type": "tensorboard"
            }
        ],
        "cuda_device": 0,
        "grad_norm": 5,
        "learning_rate_scheduler": {
            "type": "ulmfit_sqrt",
            "affected_group_count": 2,
            "decay_factor": 0.05,
            "discriminative_fine_tuning": true,
            "factor": 5,
            "gradual_unfreezing": true,
            "model_size": 1,
            "start_step": 9,
            "warmup_steps": 9
        },
        "num_epochs": 200,
        "optimizer": {
            "type": "huggingface_adamw",
            "betas": [
                0.9,
                0.999
            ],
            "correct_bias": false,
            "lr": 0.0005,
            "parameter_groups": [
                [
                    [
                        "text_field_embedder.*transformer_model.embeddings.*_embeddings.*",
                        "text_field_embedder.*transformer_model.encoder.*.(key|query|value|dense).weight"
                    ],
                    {}
                ],
                [
                    [
                        "text_field_embedder.*transformer_model.embeddings.LayerNorm.*",
                        "text_field_embedder.*transformer_model.encoder.*.output.LayerNorm.*",
                        "text_field_embedder.*transformer_model.encoder.*.(key|query|value|dense).bias",
                        "text_field_embedder.*transformer_model.pooler.dense.bias"
                    ],
                    {
                        "weight_decay": 0
                    }
                ],
                [
                    [
                        "text_field_embedder.*._scalar_mix.*",
                        "text_field_embedder.*transformer_model.pooler.dense.weight",
                        "_head_sentinel",
                        "head_arc_feedforward._linear_layers.*.weight",
                        "child_arc_feedforward._linear_layers.*.weight",
                        "head_tag_feedforward._linear_layers.*.weight",
                        "child_tag_feedforward._linear_layers.*.weight",
                        "arc_attention._weight_matrix",
                        "tag_bilinear.weight",
                        "tag_projection_layer._module.weight",
                        "crf",
                        "linear.weight",
                        "tagger_linear.weight"
                    ],
                    {}
                ],
                [
                    [
                        "head_arc_feedforward._linear_layers.*.bias",
                        "child_arc_feedforward._linear_layers.*.bias",
                        "head_tag_feedforward._linear_layers.*.bias",
                        "child_tag_feedforward._linear_layers.*.bias",
                        "arc_attention._bias",
                        "tag_bilinear.bias",
                        "tag_projection_layer._module.bias",
                        "linear.bias",
                        "tagger_linear.bias"
                    ],
                    {
                        "weight_decay": 0
                    }
                ]
            ],
            "weight_decay": 0.01
        },
        "patience": 20,
        "validation_metric": [
            "+ud_LAS"
        ]
    },
    "data_loader": {
        "type": "multitask",
        "scheduler": {
            "type": "unbalanced_homogeneous_roundrobin",
            "batch_size": 128,
            "dataset_sizes": {
                "ud": 1123
            }
        },
        "shuffle": true
    },
    "numpy_seed": 2460,
    "pytorch_seed": 246,
    "random_seed": 24601,
    "validation_data_loader": {
        "type": "multitask",
        "scheduler": {
            "type": "homogeneous_roundrobin",
            "batch_size": 128
        },
        "shuffle": true
    }
}