yiningmao committed on
Commit c5db72e · 1 Parent(s): 9369635

Upload 34 files

4_20230227-0026/config.json ADDED
@@ -0,0 +1,26 @@
+ {
+   "_name_or_path": "roberta-base",
+   "architectures": [
+     "RobertaForMaskedLM"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "bos_token_id": 0,
+   "eos_token_id": 2,
+   "gradient_checkpointing": false,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-05,
+   "max_position_embeddings": 514,
+   "model_type": "roberta",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 1,
+   "position_embedding_type": "absolute",
+   "transformers_version": "4.2.2",
+   "type_vocab_size": 4,
+   "use_cache": true,
+   "vocab_size": 50265
+ }
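The `type_vocab_size` of 4 is the one field that departs from stock `roberta-base` (which ships with 1): `load_pretrained_model()` in `main.py`/`app.py` replaces the encoder's token-type embedding table with a 4-row one before training. A minimal sketch (not part of the commit; the relative path is an assumption) for checking this on the saved config:

```python
# Minimal sketch: read the saved config and confirm the resized
# token-type vocabulary. The relative path is an assumption.
import json

with open("4_20230227-0026/config.json") as f:
    cfg = json.load(f)

# MelBERT swaps in a 4-row token_type_embeddings table before training
# (see load_pretrained_model() in main.py), hence 4 rather than 1 here.
print(cfg["model_type"], cfg["type_vocab_size"])  # roberta 4
```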
4_20230227-0026/experiments.log ADDED
@@ -0,0 +1,83 @@
+ [56]2023-02-27 00:26:50,668: device: cuda n_gpu: 1
+ [56]2023-02-27 00:29:19,004: ***** Running training *****
+ [56]2023-02-27 00:29:19,004: Batch size = 16
+ [56]2023-02-27 00:29:19,004: Num steps = 30030
+ [56]2023-02-27 01:18:38,101: [epoch 1] ,lr: 1.5e-05 ,tr_loss: 4808.392509443685
+ [56]2023-02-27 01:19:00,946: ***** Running evaluation *****
+ [56]2023-02-27 01:21:16,307: acc = 0.8590286538114976
+ [56]2023-02-27 01:21:16,308: f1 = 0.4659498207885305
+ [56]2023-02-27 01:21:16,308: precision = 0.727224294086308
+ [56]2023-02-27 01:21:16,308: recall = 0.3427925665494726
+ [56]2023-02-27 02:08:39,574: [epoch 2] ,lr: 3e-05 ,tr_loss: 2660.9157069001812
+ [56]2023-02-27 02:09:03,298: ***** Running evaluation *****
+ [56]2023-02-27 02:11:18,653: acc = 0.8952513966480447
+ [56]2023-02-27 02:11:18,653: f1 = 0.6620148277365896
+ [56]2023-02-27 02:11:18,653: precision = 0.7859855022437003
+ [56]2023-02-27 02:11:18,653: recall = 0.5718232044198895
+ [56]2023-02-27 02:59:03,528: [epoch 3] ,lr: 0.0 ,tr_loss: 1665.7832884637173
+ [56]2023-02-27 02:59:27,660: ***** Running evaluation *****
+ [56]2023-02-27 03:01:41,367: acc = 0.9039466570553253
+ [56]2023-02-27 03:01:41,367: f1 = 0.7135178715399086
+ [56]2023-02-27 03:01:41,367: precision = 0.7673410404624278
+ [56]2023-02-27 03:01:41,367: recall = 0.666750376695128
+ [56]2023-02-27 03:01:43,253: -----Best Result-----
+ [56]2023-02-27 03:01:43,253: acc = 0.9039466570553253
+ [56]2023-02-27 03:01:43,253: f1 = 0.7135178715399086
+ [56]2023-02-27 03:01:43,253: precision = 0.7673410404624278
+ [56]2023-02-27 03:01:43,253: recall = 0.666750376695128
+ [56]2023-02-27 03:02:04,028: ***** Running evaluation *****
+ [56]2023-02-27 03:04:17,297: acc = 0.9039466570553253
+ [56]2023-02-27 03:04:17,297: f1 = 0.7135178715399086
+ [56]2023-02-27 03:04:17,297: precision = 0.7673410404624278
+ [56]2023-02-27 03:04:17,297: recall = 0.666750376695128
+ [56]2023-02-27 03:04:17,298: Saved to saves/roberta-base/4_20230227-0026
+ [56]2023-02-28 02:41:26,304: device: cpu n_gpu: 0
+ [56]2023-02-28 02:44:12,493: device: cpu n_gpu: 0
+ [56]2023-02-28 02:50:35,429: device: cpu n_gpu: 0
+ [56]2023-02-28 02:50:56,790: ***** Running evaluation *****
+ [56]2023-02-28 02:53:34,368: device: cpu n_gpu: 0
+ [56]2023-02-28 03:00:12,499: ***** Running evaluation *****
+ [56]2023-02-28 03:22:26,949: device: cpu n_gpu: 0
+ [56]2023-02-28 03:47:13,134: device: cpu n_gpu: 0
+ [56]2023-02-28 03:48:38,484: device: cpu n_gpu: 0
+ [56]2023-02-28 03:49:14,589: device: cpu n_gpu: 0
+ [56]2023-02-28 04:02:49,943: device: cpu n_gpu: 0
+ [56]2023-02-28 04:32:46,799: device: cpu n_gpu: 0
+ [56]2023-02-28 04:34:48,647: device: cpu n_gpu: 0
+ [56]2023-02-28 04:37:23,090: device: cpu n_gpu: 0
+ [56]2023-02-28 04:40:06,170: device: cpu n_gpu: 0
+ [56]2023-02-28 04:43:40,692: device: cpu n_gpu: 0
+ [56]2023-02-28 04:57:12,843: device: cpu n_gpu: 0
+ [56]2023-02-28 05:03:30,653: device: cpu n_gpu: 0
+ [56]2023-02-28 05:14:12,276: device: cpu n_gpu: 0
+ [56]2023-02-28 05:15:25,604: device: cpu n_gpu: 0
+ [56]2023-02-28 05:21:26,346: device: cpu n_gpu: 0
+ [56]2023-02-28 05:29:09,123: device: cpu n_gpu: 0
+ [56]2023-02-28 05:30:59,283: device: cpu n_gpu: 0
+ [56]2023-02-28 05:34:33,463: device: cpu n_gpu: 0
+ [56]2023-02-28 05:37:25,436: device: cpu n_gpu: 0
+ [56]2023-02-28 05:40:54,252: device: cpu n_gpu: 0
+ [56]2023-02-28 05:51:44,583: device: cpu n_gpu: 0
+ [56]2023-02-28 05:54:21,953: device: cpu n_gpu: 0
+ [56]2023-02-28 06:04:58,550: device: cpu n_gpu: 0
+ [56]2023-02-28 06:12:55,019: device: cpu n_gpu: 0
+ [56]2023-02-28 06:17:31,790: device: cpu n_gpu: 0
+ [56]2023-02-28 06:21:49,848: device: cpu n_gpu: 0
+ [56]2023-02-28 06:23:45,894: device: cpu n_gpu: 0
+ [56]2023-02-28 06:30:27,960: device: cpu n_gpu: 0
+ [56]2023-02-28 06:34:11,145: device: cpu n_gpu: 0
+ [56]2023-02-28 06:36:56,962: device: cpu n_gpu: 0
+ [56]2023-02-28 06:38:45,488: device: cpu n_gpu: 0
+ [56]2023-02-28 06:39:18,822: device: cpu n_gpu: 0
+ [56]2023-02-28 06:39:44,789: device: cpu n_gpu: 0
+ [56]2023-02-28 06:44:02,812: device: cpu n_gpu: 0
+ [56]2023-02-28 06:45:15,008: device: cpu n_gpu: 0
+ [56]2023-02-28 06:48:26,234: device: cpu n_gpu: 0
+ [56]2023-02-28 06:54:51,113: device: cpu n_gpu: 0
+ [56]2023-02-28 07:08:07,149: device: cpu n_gpu: 0
+ [56]2023-02-28 07:10:04,991: device: cpu n_gpu: 0
+ [56]2023-02-28 07:11:36,409: device: cpu n_gpu: 0
+ [56]2023-02-28 07:12:03,892: device: cpu n_gpu: 0
+ [56]2023-02-28 07:13:15,960: device: cpu n_gpu: 0
+ [56]2023-02-28 07:21:36,476: device: cpu n_gpu: 0
+ [56]2023-02-28 07:23:15,164: device: cpu n_gpu: 0
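Each evaluation block in the log carries the same four metrics in a fixed order, which makes it easy to scrape. A small sketch, assuming the exact format above (with `recall` always closing a block) and an assumed path:

```python
# Sketch: collect the acc/f1/precision/recall blocks from experiments.log.
import re

pattern = re.compile(r": (acc|f1|precision|recall) = ([0-9.]+)\s*$")
blocks, current = [], {}
with open("4_20230227-0026/experiments.log") as f:
    for line in f:
        m = pattern.search(line)
        if m:
            current[m.group(1)] = float(m.group(2))
            if m.group(1) == "recall":  # recall closes each metrics block
                blocks.append(current)
                current = {}

print(blocks[2])  # epoch-3 scores, e.g. f1 = 0.7135...
```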
4_20230227-0026/main_config.cfg ADDED
@@ -0,0 +1,27 @@
+ [args]
+ bert_model=roberta-base
+ data_dir=data/VUA20
+ task_name=vua
+ model_type=MELBERT
+ classifier_hidden=768
+ lr_schedule=warmup_linear
+ warmup_epoch=2
+ drop_ratio=0.2
+ kfold=10
+ num_bagging=0
+ bagging_index=0
+ use_pos=True
+ use_local_context=True
+ max_seq_length=150
+ do_train=True
+ do_test=True
+ do_eval=True
+ do_lower_case=False
+ class_weight=3
+ train_batch_size=16
+ eval_batch_size=8
+ learning_rate=3e-05
+ num_train_epoch=3
+ no_cuda=False
+ seed=42
+
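This saved run config is a plain INI file with a single `[args]` section, so the standard library can read it; the repo's `utils.Config` class (not included in this commit) presumably does something similar plus type coercion. A minimal sketch:

```python
# Sketch: read the saved run config with configparser. Every value comes
# back as a string; utils.Config presumably handles the casting.
import configparser

parser = configparser.ConfigParser()
parser.read("4_20230227-0026/main_config.cfg")  # path is an assumption

args = dict(parser["args"])
print(args["model_type"])            # MELBERT
print(float(args["learning_rate"]))  # 3e-05
```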
4_20230227-0026/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
4_20230227-0026/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3027110c50c22427bd23fad97158fbf5c366c8f3dabb92865abcb66df4334adf
+ size 508135877
4_20230227-0026/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:044cdea230767bb31596587242f594f00c14006b56bf8277ef59e3086cb00d4a
+ size 1339
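Both `.bin` files are stored as Git LFS pointers: three lines giving the spec version, the sha256 of the real blob, and its byte size. Once the actual object has been downloaded, it can be checked against the pointer; a sketch, with illustrative file paths:

```python
# Sketch: verify a downloaded LFS object against its pointer's
# sha256 oid and size. Both paths are assumptions for illustration.
import hashlib

def read_pointer(path):
    # Each pointer line is "key value"; parse into a dict.
    fields = dict(line.split(" ", 1) for line in open(path).read().splitlines())
    return fields["oid"].removeprefix("sha256:"), int(fields["size"])

oid, size = read_pointer("training_args.bin.pointer")
blob = open("training_args.bin", "rb").read()
assert len(blob) == size
assert hashlib.sha256(blob).hexdigest() == oid
```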
4_20230227-0026/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
app.py ADDED
@@ -0,0 +1,236 @@
+ import os
+ import sys
+ import pickle
+ import random
+ import copy
+ import numpy as np
+ import gradio as gr
+ import re
+ import string
+
+ import torch
+ import torch.nn as nn
+
+ from tqdm import tqdm, trange
+ from collections import OrderedDict
+ from transformers import AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup
+
+ from utils import Config, Logger, make_log_dir
+ from modeling import (
+     AutoModelForSequenceClassification,
+     AutoModelForTokenClassification,
+     AutoModelForSequenceClassification_SPV,
+     AutoModelForSequenceClassification_MIP,
+     AutoModelForSequenceClassification_SPV_MIP,
+ )
+ from run_classifier_dataset_utils import processors, output_modes, compute_metrics
+ from data_loader import load_train_data, load_train_data_kf, load_test_data, load_sentence_data
+
+ from frame_semantic_transformer import FrameSemanticTransformer
+
+ frame_transformer = FrameSemanticTransformer()
+ frame_transformer.setup()
+
+ CONFIG_NAME = "config.json"
+ WEIGHTS_NAME = "pytorch_model.bin"
+ ARGS_NAME = "training_args.bin"
+
+
+ def main():
+     # read configs
+     config = Config(main_conf_path="./")
+
+     # apply system arguments if they exist
+     argv = sys.argv[1:]
+     if len(argv) > 0:
+         cmd_arg = OrderedDict()
+         argvs = " ".join(sys.argv[1:]).split(" ")
+         for i in range(0, len(argvs), 2):
+             arg_name, arg_value = argvs[i], argvs[i + 1]
+             arg_name = arg_name.strip("-")
+             cmd_arg[arg_name] = arg_value
+         config.update_params(cmd_arg)
+
+     args = config
+     print(args.__dict__)
+
+     # logger
+     if "saves" in args.bert_model:
+         log_dir = args.bert_model
+         logger = Logger(log_dir)
+         config = Config(main_conf_path=log_dir)
+         old_args = copy.deepcopy(args)
+         args.__dict__.update(config.__dict__)
+
+         args.bert_model = old_args.bert_model
+         args.do_train = old_args.do_train
+         args.data_dir = old_args.data_dir
+         args.task_name = old_args.task_name
+
+         # apply system arguments again so they override the saved config
+         argv = sys.argv[1:]
+         if len(argv) > 0:
+             cmd_arg = OrderedDict()
+             argvs = " ".join(sys.argv[1:]).split(" ")
+             for i in range(0, len(argvs), 2):
+                 arg_name, arg_value = argvs[i], argvs[i + 1]
+                 arg_name = arg_name.strip("-")
+                 cmd_arg[arg_name] = arg_value
+             config.update_params(cmd_arg)
+     else:
+         if not os.path.exists("saves"):
+             os.mkdir("saves")
+         log_dir = make_log_dir(os.path.join("saves", args.bert_model))
+         logger = Logger(log_dir)
+         config.save(log_dir)
+     args.log_dir = log_dir
+
+     # set CUDA devices
+     device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
+     args.n_gpu = torch.cuda.device_count()
+     args.device = device
+
+     logger.info("device: {} n_gpu: {}".format(device, args.n_gpu))
+
+     # set seed
+     random.seed(args.seed)
+     np.random.seed(args.seed)
+     torch.manual_seed(args.seed)
+     if args.n_gpu > 0:
+         torch.cuda.manual_seed_all(args.seed)
+
+     # get dataset and processor
+     args.num_labels = 2
+
+     # build tokenizer and model
+     tokenizer = AutoTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
+     model = load_pretrained_model(args)
+
+     # Load trained model
+     if "saves" in args.bert_model:
+         model = load_trained_model(args, model, tokenizer)
+
+     def run_one_sentence(sentence):
+         print('sentence:', sentence)
+         # pad punctuation with spaces so every token is whitespace-separated
+         sentence = re.sub(r'([.,!?()-]+)', r' \1 ', sentence)
+         sentence = ' '.join(sentence.split())
+         print('sentence:', sentence)
+
+         result = frame_transformer.detect_frames(sentence)
+         print(result)
+
+         model.eval()
+         s_batch = load_sentence_data(args, sentence, ['0', '1'], tokenizer, 'classification')
+
+         if args.model_type in ["MELBERT_MIP", "MELBERT"]:
+             input_ids, input_mask, segment_ids, label_ids, idx, input_ids_2, input_mask_2, segment_ids_2 = s_batch
+         else:
+             input_ids, input_mask, segment_ids, label_ids, idx = s_batch
+
+         with torch.no_grad():
+             # compute logits
+             if args.model_type in ["BERT_BASE", "BERT_SEQ", "MELBERT_SPV"]:
+                 logits = model(
+                     input_ids,
+                     target_mask=(segment_ids == 1),
+                     token_type_ids=segment_ids,
+                     attention_mask=input_mask,
+                 )
+             elif args.model_type in ["MELBERT_MIP", "MELBERT"]:
+                 logits = model(
+                     input_ids,
+                     input_ids_2,
+                     target_mask=(segment_ids == 1),
+                     target_mask_2=segment_ids_2,
+                     attention_mask_2=input_mask_2,
+                     token_type_ids=segment_ids,
+                     attention_mask=input_mask,
+                 )
+
+         # map per-word predictions back onto the whitespace-split sentence
+         pred = logits.detach().cpu().numpy()
+         pred = np.argmax(pred, axis=1)
+         pred_list = [None for _ in range(len(sentence.split()))]
+         for i, n in enumerate(idx):
+             pred_list[n] = 'M' if pred[i] == 1 else None
+         print(len(pred_list), pred_list)
+         label_list = [(w, p) for w, p in zip(sentence.split(), pred_list)]
+         print(label_list)
+         return label_list, result
+
+     demo = gr.Interface(
+         run_one_sentence,
+         gr.Textbox(placeholder="Enter sentence here..."),
+         ['highlight', 'json'],
+         examples=[
+             ['while new departments are born and others extended .'],
+             ['The sounds are the same as those of daylight , yet somehow the night magnifies and sharpens the creak of a yielding block , the sigh of air over a shroud , the stretching of a sail , the hiss of water sliding sleek against the hull , the curl of a quarter-wave falling away , and the thump as a wave strikes the cutwater to be sheared into two bright slices of whiteness .'],
+             ['and finally, the debate has sharpened.']
+         ]
+     )
+     demo.launch(debug=True)
+
+
+ def load_pretrained_model(args):
+     # Pretrained Model
+     bert = AutoModel.from_pretrained(args.bert_model)
+     config = bert.config
+     config.type_vocab_size = 4
+     if "albert" in args.bert_model:
+         bert.embeddings.token_type_embeddings = nn.Embedding(
+             config.type_vocab_size, config.embedding_size
+         )
+     else:
+         bert.embeddings.token_type_embeddings = nn.Embedding(
+             config.type_vocab_size, config.hidden_size
+         )
+     bert._init_weights(bert.embeddings.token_type_embeddings)
+
+     # Additional Layers
+     if args.model_type in ["BERT_BASE"]:
+         model = AutoModelForSequenceClassification(
+             args=args, Model=bert, config=config, num_labels=args.num_labels
+         )
+     if args.model_type == "BERT_SEQ":
+         model = AutoModelForTokenClassification(
+             args=args, Model=bert, config=config, num_labels=args.num_labels
+         )
+     if args.model_type == "MELBERT_SPV":
+         model = AutoModelForSequenceClassification_SPV(
+             args=args, Model=bert, config=config, num_labels=args.num_labels
+         )
+     if args.model_type == "MELBERT_MIP":
+         model = AutoModelForSequenceClassification_MIP(
+             args=args, Model=bert, config=config, num_labels=args.num_labels
+         )
+     if args.model_type == "MELBERT":
+         model = AutoModelForSequenceClassification_SPV_MIP(
+             args=args, Model=bert, config=config, num_labels=args.num_labels
+         )
+
+     model.to(args.device)
+     if args.n_gpu > 1 and not args.no_cuda:
+         model = torch.nn.DataParallel(model)
+     return model
+
+
+ def load_trained_model(args, model, tokenizer):
+     # If we saved using the predefined names, we can load using `from_pretrained`
+     output_model_file = os.path.join(args.log_dir, WEIGHTS_NAME)
+
+     if hasattr(model, "module"):
+         model.module.load_state_dict(torch.load(output_model_file, map_location=args.device))
+     else:
+         model.load_state_dict(torch.load(output_model_file, map_location=args.device))
+
+     return model
+
+
+ if __name__ == "__main__":
+     main()
data_loader.py ADDED
@@ -0,0 +1,233 @@
+ import numpy as np
+
+ import torch
+ import torch.nn as nn
+ import string
+ from sklearn.model_selection import StratifiedKFold
+ from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
+ from run_classifier_dataset_utils import (
+     convert_examples_to_two_features,
+     convert_examples_to_features,
+     convert_two_examples_to_features,
+     InputExample,
+ )
+
+
+ def load_train_data(args, logger, processor, task_name, label_list, tokenizer, output_mode, k=None):
+     # Prepare data loader
+     if task_name == "vua":
+         train_examples = processor.get_train_examples(args.data_dir)
+     elif task_name == "trofi":
+         train_examples = processor.get_train_examples(args.data_dir, k)
+     else:
+         raise ValueError("task_name should be 'vua' or 'trofi'!")
+
+     # make features file
+     if args.model_type == "BERT_BASE":
+         train_features = convert_two_examples_to_features(
+             train_examples, label_list, args.max_seq_length, tokenizer, output_mode
+         )
+     if args.model_type in ["BERT_SEQ", "MELBERT_SPV"]:
+         train_features = convert_examples_to_features(
+             train_examples, label_list, args.max_seq_length, tokenizer, output_mode, args
+         )
+     if args.model_type in ["MELBERT_MIP", "MELBERT"]:
+         train_features = convert_examples_to_two_features(
+             train_examples, label_list, args.max_seq_length, tokenizer, output_mode, args
+         )
+
+     # make features into tensors
+     all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
+     all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
+     all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
+     all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
+
+     # add additional features for MELBERT_MIP and MELBERT
+     if args.model_type in ["MELBERT_MIP", "MELBERT"]:
+         all_input_ids_2 = torch.tensor([f.input_ids_2 for f in train_features], dtype=torch.long)
+         all_input_mask_2 = torch.tensor([f.input_mask_2 for f in train_features], dtype=torch.long)
+         all_segment_ids_2 = torch.tensor(
+             [f.segment_ids_2 for f in train_features], dtype=torch.long
+         )
+         train_data = TensorDataset(
+             all_input_ids,
+             all_input_mask,
+             all_segment_ids,
+             all_label_ids,
+             all_input_ids_2,
+             all_input_mask_2,
+             all_segment_ids_2,
+         )
+     else:
+         train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
+     train_sampler = RandomSampler(train_data)
+     train_dataloader = DataLoader(
+         train_data, sampler=train_sampler, batch_size=args.train_batch_size
+     )
+
+     return train_dataloader
+
+
+ def load_train_data_kf(
+     args, logger, processor, task_name, label_list, tokenizer, output_mode, k=None
+ ):
+     # Prepare data loader
+     if task_name == "vua":
+         train_examples = processor.get_train_examples(args.data_dir)
+     elif task_name == "trofi":
+         train_examples = processor.get_train_examples(args.data_dir, k)
+     else:
+         raise ValueError("task_name should be 'vua' or 'trofi'!")
+
+     # make features file
+     if args.model_type == "BERT_BASE":
+         train_features = convert_two_examples_to_features(
+             train_examples, label_list, args.max_seq_length, tokenizer, output_mode
+         )
+     if args.model_type in ["BERT_SEQ", "MELBERT_SPV"]:
+         train_features = convert_examples_to_features(
+             train_examples, label_list, args.max_seq_length, tokenizer, output_mode, args
+         )
+     if args.model_type in ["MELBERT_MIP", "MELBERT"]:
+         train_features = convert_examples_to_two_features(
+             train_examples, label_list, args.max_seq_length, tokenizer, output_mode, args
+         )
+
+     # make features into tensors
+     all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
+     all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
+     all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
+     all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
+
+     # add additional features for MELBERT_MIP and MELBERT
+     if args.model_type in ["MELBERT_MIP", "MELBERT"]:
+         all_input_ids_2 = torch.tensor([f.input_ids_2 for f in train_features], dtype=torch.long)
+         all_input_mask_2 = torch.tensor([f.input_mask_2 for f in train_features], dtype=torch.long)
+         all_segment_ids_2 = torch.tensor(
+             [f.segment_ids_2 for f in train_features], dtype=torch.long
+         )
+         train_data = TensorDataset(
+             all_input_ids,
+             all_input_mask,
+             all_segment_ids,
+             all_label_ids,
+             all_input_ids_2,
+             all_input_mask_2,
+             all_segment_ids_2,
+         )
+     else:
+         train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
+     gkf = StratifiedKFold(n_splits=args.num_bagging).split(X=all_input_ids, y=all_label_ids.numpy())
+     return train_data, gkf
+
+
+ def load_test_data(args, logger, processor, task_name, label_list, tokenizer, output_mode, k=None):
+     if task_name == "vua":
+         eval_examples = processor.get_test_examples(args.data_dir)
+     elif task_name == "trofi":
+         eval_examples = processor.get_test_examples(args.data_dir, k)
+     else:
+         raise ValueError("task_name should be 'vua' or 'trofi'!")
+     # eval_examples = eval_examples[14185:14216]  # debug-only slice; keep disabled to evaluate the full test set
+
+     if args.model_type == "BERT_BASE":
+         eval_features = convert_two_examples_to_features(
+             eval_examples, label_list, args.max_seq_length, tokenizer, output_mode
+         )
+     if args.model_type in ["BERT_SEQ", "MELBERT_SPV"]:
+         eval_features = convert_examples_to_features(
+             eval_examples, label_list, args.max_seq_length, tokenizer, output_mode, args
+         )
+     if args.model_type in ["MELBERT_MIP", "MELBERT"]:
+         eval_features = convert_examples_to_two_features(
+             eval_examples, label_list, args.max_seq_length, tokenizer, output_mode, args
+         )
+
+     logger.info("***** Running evaluation *****")
+     if args.model_type in ["MELBERT_MIP", "MELBERT"]:
+         all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
+         all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
+         all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
+         all_guids = [f.guid for f in eval_features]
+         all_idx = torch.tensor([i for i in range(len(eval_features))], dtype=torch.long)
+         all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
+         all_input_ids_2 = torch.tensor([f.input_ids_2 for f in eval_features], dtype=torch.long)
+         all_input_mask_2 = torch.tensor([f.input_mask_2 for f in eval_features], dtype=torch.long)
+         all_segment_ids_2 = torch.tensor([f.segment_ids_2 for f in eval_features], dtype=torch.long)
+         eval_data = TensorDataset(
+             all_input_ids,
+             all_input_mask,
+             all_segment_ids,
+             all_label_ids,
+             all_idx,
+             all_input_ids_2,
+             all_input_mask_2,
+             all_segment_ids_2,
+         )
+     else:
+         all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
+         all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
+         all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
+         all_guids = [f.guid for f in eval_features]
+         all_idx = torch.tensor([i for i in range(len(eval_features))], dtype=torch.long)
+         all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
+         eval_data = TensorDataset(
+             all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_idx
+         )
+
+     # Run prediction for the full data
+     eval_sampler = SequentialSampler(eval_data)
+     eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
+
+     return all_guids, eval_dataloader
+
+
+ def load_sentence_data(args, sentence, label_list, tokenizer, output_mode):
+     # build one InputExample per non-punctuation word, with the word index as text_b
+     examples = []
+     example_idxs = []
+     for index, token in enumerate(sentence.split()):
+         if token not in string.punctuation:
+             examples.append(
+                 InputExample(
+                     guid='', text_a=sentence, text_b=str(index), label='0', POS='', FGPOS=''
+                 )
+             )
+             print('[', index, token, ']', end=', ')
+             example_idxs.append(index)
+     eval_features = convert_examples_to_two_features(
+         examples, label_list, args.max_seq_length, tokenizer, output_mode, args
+     )
+
+     if args.model_type in ["MELBERT_MIP", "MELBERT"]:
+         all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
+         all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
+         all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
+         all_guids = [f.guid for f in eval_features]
+         all_idx = torch.tensor(example_idxs, dtype=torch.long)
+         all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
+         all_input_ids_2 = torch.tensor([f.input_ids_2 for f in eval_features], dtype=torch.long)
+         all_input_mask_2 = torch.tensor([f.input_mask_2 for f in eval_features], dtype=torch.long)
+         all_segment_ids_2 = torch.tensor([f.segment_ids_2 for f in eval_features], dtype=torch.long)
+         eval_data = (
+             all_input_ids,
+             all_input_mask,
+             all_segment_ids,
+             all_label_ids,
+             all_idx,
+             all_input_ids_2,
+             all_input_mask_2,
+             all_segment_ids_2,
+         )
+     else:
+         all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
+         all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
+         all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
+         all_guids = [f.guid for f in eval_features]
+         all_idx = torch.tensor(example_idxs, dtype=torch.long)
+         all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
+         eval_data = (
+             all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_idx
+         )
+     return eval_data
flagged/log.csv ADDED
@@ -0,0 +1,2 @@
+ sentence,output,flag,username,timestamp
+ "The sounds are the same as those of daylight , yet somehow the night magnifies and sharpens the creak of a yielding block , the sigh of air over a shroud , the stretching of a sail , the hiss of water sliding sleek against the hull , the curl of a quarter-wave falling away , and the thump as a wave strikes the cutwater to be sheared into two bright slices of whiteness .",/Users/yiningmao/Desktop/CS224N/MelBERT-main/flagged/output/tmpginvysx3.json,,,2023-02-28 07:12:44.582261
flagged/output/tmpginvysx3.json ADDED
@@ -0,0 +1 @@
+ [["The", null], ["sounds", null], ["are", null], ["the", null], ["same", null], ["as", null], ["those", "M"], ["of", null], ["daylight", null], [",", null], ["yet", null], ["somehow", null], ["the", null], ["night", null], ["magnifies", "M"], ["and", null], ["sharpens", "M"], ["the", null], ["creak", null], ["of", null], ["a", null], ["yielding", null], ["block", null], [",", null], ["the", null], ["sigh", "M"], ["of", null], ["air", null], ["over", null], ["a", null], ["shroud", null], [",", null], ["the", null], ["stretching", null], ["of", null], ["a", null], ["sail", null], [",", null], ["the", null], ["hiss", null], ["of", null], ["water", null], ["sliding", "M"], ["sleek", "M"], ["against", null], ["the", null], ["hull", null], [",", null], ["the", null], ["curl", "M"], ["of", null], ["a", null], ["quarter", null], ["-", null], ["wave", null], ["falling", null], ["away", null], [",", null], ["and", null], ["the", null], ["thump", null], ["as", null], ["a", null], ["wave", null], ["strikes", "M"], ["the", null], ["cutwater", null], ["to", null], ["be", null], ["sheared", "M"], ["into", "M"], ["two", null], ["bright", "M"], ["slices", "M"], ["of", null], ["whiteness", null], [".", null]]
main.py ADDED
@@ -0,0 +1,545 @@
+ import os
+ import sys
+ import pickle
+ import random
+ import copy
+ import numpy as np
+
+ import torch
+ import torch.nn as nn
+
+ from tqdm import tqdm, trange
+ from collections import OrderedDict
+ from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
+ from transformers import AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup
+
+ from utils import Config, Logger, make_log_dir
+ from modeling import (
+     AutoModelForSequenceClassification,
+     AutoModelForTokenClassification,
+     AutoModelForSequenceClassification_SPV,
+     AutoModelForSequenceClassification_MIP,
+     AutoModelForSequenceClassification_SPV_MIP,
+ )
+ from run_classifier_dataset_utils import processors, output_modes, compute_metrics
+ from data_loader import load_train_data, load_train_data_kf, load_test_data
+
+ CONFIG_NAME = "config.json"
+ WEIGHTS_NAME = "pytorch_model.bin"
+ ARGS_NAME = "training_args.bin"
+
+
+ def main():
+     # read configs
+     config = Config(main_conf_path="./")
+
+     # apply system arguments if they exist
+     argv = sys.argv[1:]
+     if len(argv) > 0:
+         cmd_arg = OrderedDict()
+         argvs = " ".join(sys.argv[1:]).split(" ")
+         for i in range(0, len(argvs), 2):
+             arg_name, arg_value = argvs[i], argvs[i + 1]
+             arg_name = arg_name.strip("-")
+             cmd_arg[arg_name] = arg_value
+         config.update_params(cmd_arg)
+
+     args = config
+     print(args.__dict__)
+
+     # logger
+     if "saves" in args.bert_model:
+         log_dir = args.bert_model
+         logger = Logger(log_dir)
+         config = Config(main_conf_path=log_dir)
+         old_args = copy.deepcopy(args)
+         args.__dict__.update(config.__dict__)
+
+         args.bert_model = old_args.bert_model
+         args.do_train = old_args.do_train
+         args.data_dir = old_args.data_dir
+         args.task_name = old_args.task_name
+
+         # apply system arguments again so they override the saved config
+         argv = sys.argv[1:]
+         if len(argv) > 0:
+             cmd_arg = OrderedDict()
+             argvs = " ".join(sys.argv[1:]).split(" ")
+             for i in range(0, len(argvs), 2):
+                 arg_name, arg_value = argvs[i], argvs[i + 1]
+                 arg_name = arg_name.strip("-")
+                 cmd_arg[arg_name] = arg_value
+             config.update_params(cmd_arg)
+     else:
+         if not os.path.exists("saves"):
+             os.mkdir("saves")
+         log_dir = make_log_dir(os.path.join("saves", args.bert_model))
+         logger = Logger(log_dir)
+         config.save(log_dir)
+     args.log_dir = log_dir
+
+     # set CUDA devices
+     device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
+     args.n_gpu = torch.cuda.device_count()
+     args.device = device
+
+     logger.info("device: {} n_gpu: {}".format(device, args.n_gpu))
+
+     # set seed
+     random.seed(args.seed)
+     np.random.seed(args.seed)
+     torch.manual_seed(args.seed)
+     if args.n_gpu > 0:
+         torch.cuda.manual_seed_all(args.seed)
+
+     # get dataset and processor
+     task_name = args.task_name.lower()
+     processor = processors[task_name]()
+     output_mode = output_modes[task_name]
+     label_list = processor.get_labels()
+     args.num_labels = len(label_list)
+
+     # build tokenizer and model
+     tokenizer = AutoTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
+     model = load_pretrained_model(args)
+
+     ########### Training ###########
+
+     # VUA18 / VUA20 for bagging
+     if args.do_train and args.task_name == "vua" and args.num_bagging:
+         train_data, gkf = load_train_data_kf(
+             args, logger, processor, task_name, label_list, tokenizer, output_mode
+         )
+
+         for fold, (train_idx, valid_idx) in enumerate(tqdm(gkf, desc="bagging...")):
+             if fold != args.bagging_index:
+                 continue
+
+             print(f"bagging_index = {args.bagging_index}")
+
+             # Load data
+             temp_train_data = TensorDataset(*train_data[train_idx])
+             train_sampler = RandomSampler(temp_train_data)
+             train_dataloader = DataLoader(
+                 temp_train_data, sampler=train_sampler, batch_size=args.train_batch_size
+             )
+
+             # Reset model
+             model = load_pretrained_model(args)
+             model, best_result = run_train(
+                 args, logger, model, train_dataloader, processor, task_name, label_list, tokenizer, output_mode
+             )
+
+             # Test
+             all_guids, eval_dataloader = load_test_data(
+                 args, logger, processor, task_name, label_list, tokenizer, output_mode
+             )
+             preds = run_eval(args, logger, model, eval_dataloader, all_guids, task_name, return_preds=True)
+             with open(os.path.join(args.data_dir, f"seed{args.seed}_preds_{fold}.p"), "wb") as f:
+                 pickle.dump(preds, f)
+
+             # If the train data is VUA20, the model needs to be tested on VUAverb as well.
+             # Just adjust the data_dir names in the conditions below for your own data directories.
+             if "VUA20" in args.data_dir:
+                 # Verb
+                 args.data_dir = "data/VUAverb"
+                 all_guids, eval_dataloader = load_test_data(
+                     args, logger, processor, task_name, label_list, tokenizer, output_mode
+                 )
+                 preds = run_eval(args, logger, model, eval_dataloader, all_guids, task_name, return_preds=True)
+                 with open(os.path.join(args.data_dir, f"seed{args.seed}_preds_{fold}.p"), "wb") as f:
+                     pickle.dump(preds, f)
+
+         logger.info(f"Saved to {logger.log_dir}")
+         return
+
+     # VUA18 / VUA20
+     if args.do_train and args.task_name == "vua":
+         train_dataloader = load_train_data(
+             args, logger, processor, task_name, label_list, tokenizer, output_mode
+         )
+         model, best_result = run_train(
+             args,
+             logger,
+             model,
+             train_dataloader,
+             processor,
+             task_name,
+             label_list,
+             tokenizer,
+             output_mode,
+         )
+     # TroFi / MOH-X (K-fold)
+     elif args.do_train and args.task_name == "trofi":
+         k_result = []
+         for k in tqdm(range(args.kfold), desc="K-fold"):
+             model = load_pretrained_model(args)
+             train_dataloader = load_train_data(
+                 args, logger, processor, task_name, label_list, tokenizer, output_mode, k
+             )
+             model, best_result = run_train(
+                 args,
+                 logger,
+                 model,
+                 train_dataloader,
+                 processor,
+                 task_name,
+                 label_list,
+                 tokenizer,
+                 output_mode,
+                 k,
+             )
+             k_result.append(best_result)
+
+         # Calculate the average result
+         avg_result = copy.deepcopy(k_result[0])
+         for result in k_result[1:]:
+             for k, v in result.items():
+                 avg_result[k] += v
+         for k, v in avg_result.items():
+             avg_result[k] /= len(k_result)
+
+         logger.info("-----Average Result-----")
+         for key in sorted(avg_result.keys()):
+             logger.info(f"  {key} = {str(avg_result[key])}")
+
+     # Load trained model
+     if "saves" in args.bert_model:
+         model = load_trained_model(args, model, tokenizer)
+
+     ########### Inference ###########
+     # VUA18 / VUA20
+     if (args.do_eval or args.do_test) and task_name == "vua":
+         # if the test data is genre or POS tag data
+         if ("genre" in args.data_dir) or ("pos" in args.data_dir):
+             if "genre" in args.data_dir:
+                 targets = ["acad", "conv", "fict", "news"]
+             elif "pos" in args.data_dir:
+                 targets = ["adj", "adv", "noun", "verb"]
+             orig_data_dir = args.data_dir
+             for idx, target in tqdm(enumerate(targets)):
+                 logger.info(f"====================== Evaluating {target} =====================")
+                 args.data_dir = os.path.join(orig_data_dir, target)
+                 all_guids, eval_dataloader = load_test_data(
+                     args, logger, processor, task_name, label_list, tokenizer, output_mode
+                 )
+                 run_eval(args, logger, model, eval_dataloader, all_guids, task_name)
+         else:
+             all_guids, eval_dataloader = load_test_data(
+                 args, logger, processor, task_name, label_list, tokenizer, output_mode
+             )
+             run_eval(args, logger, model, eval_dataloader, all_guids, task_name)
+
+     # TroFi / MOH-X (K-fold)
+     elif (args.do_eval or args.do_test) and args.task_name == "trofi":
+         logger.info(f"***** Evaluating with {args.data_dir}")
+         k_result = []
+         for k in tqdm(range(10), desc="K-fold"):
+             all_guids, eval_dataloader = load_test_data(
+                 args, logger, processor, task_name, label_list, tokenizer, output_mode, k
+             )
+             result = run_eval(args, logger, model, eval_dataloader, all_guids, task_name)
+             k_result.append(result)
+
+         # Calculate the average result
+         avg_result = copy.deepcopy(k_result[0])
+         for result in k_result[1:]:
+             for k, v in result.items():
+                 avg_result[k] += v
+         for k, v in avg_result.items():
+             avg_result[k] /= len(k_result)
+
+         logger.info("-----Average Result-----")
+         for key in sorted(avg_result.keys()):
+             logger.info(f"  {key} = {str(avg_result[key])}")
+     logger.info(f"Saved to {logger.log_dir}")
+
+
+ def run_train(
+     args,
+     logger,
+     model,
+     train_dataloader,
+     processor,
+     task_name,
+     label_list,
+     tokenizer,
+     output_mode,
+     k=None,
+ ):
+     tr_loss = 0
+     num_train_optimization_steps = len(train_dataloader) * args.num_train_epoch
+
+     # Prepare optimizer, scheduler
+     param_optimizer = list(model.named_parameters())
+     no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
+     optimizer_grouped_parameters = [
+         {
+             "params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
+             "weight_decay": 0.01,
+         },
+         {
+             "params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
+             "weight_decay": 0.0,
+         },
+     ]
+     optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
+     if args.lr_schedule and args.lr_schedule.lower() != "none":
+         scheduler = get_linear_schedule_with_warmup(
+             optimizer,
+             num_warmup_steps=int(args.warmup_epoch * len(train_dataloader)),
+             num_training_steps=num_train_optimization_steps,
+         )
+
+     logger.info("***** Running training *****")
+     logger.info(f"  Batch size = {args.train_batch_size}")
+     logger.info(f"  Num steps = {num_train_optimization_steps}")
+
+     # Run training
+     model.train()
+     max_val_f1 = -1
+     max_result = {}
+     for epoch in trange(int(args.num_train_epoch), desc="Epoch"):
+         tr_loss = 0
+         for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
+             # move batch data to gpu
+             batch = tuple(t.to(args.device) for t in batch)
+
+             if args.model_type in ["MELBERT_MIP", "MELBERT"]:
+                 (
+                     input_ids,
+                     input_mask,
+                     segment_ids,
+                     label_ids,
+                     input_ids_2,
+                     input_mask_2,
+                     segment_ids_2,
+                 ) = batch
+             else:
+                 input_ids, input_mask, segment_ids, label_ids = batch
+
+             # compute loss values
+             if args.model_type in ["BERT_SEQ", "BERT_BASE", "MELBERT_SPV"]:
+                 logits = model(
+                     input_ids,
+                     target_mask=(segment_ids == 1),
+                     token_type_ids=segment_ids,
+                     attention_mask=input_mask,
+                 )
+                 loss_fct = nn.NLLLoss(weight=torch.Tensor([1, args.class_weight]).to(args.device))
+                 loss = loss_fct(logits.view(-1, args.num_labels), label_ids.view(-1))
+             elif args.model_type in ["MELBERT_MIP", "MELBERT"]:
+                 logits = model(
+                     input_ids,
+                     input_ids_2,
+                     target_mask=(segment_ids == 1),
+                     target_mask_2=segment_ids_2,
+                     attention_mask_2=input_mask_2,
+                     token_type_ids=segment_ids,
+                     attention_mask=input_mask,
+                 )
+                 loss_fct = nn.NLLLoss(weight=torch.Tensor([1, args.class_weight]).to(args.device))
+                 loss = loss_fct(logits.view(-1, args.num_labels), label_ids.view(-1))
+
+             # average loss if on multi-gpu
+             if args.n_gpu > 1:
+                 loss = loss.mean()
+
+             loss.backward()
+             torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
+             optimizer.step()
+
+             if args.lr_schedule and args.lr_schedule.lower() != "none":
+                 scheduler.step()
+
+             optimizer.zero_grad()
+
+             tr_loss += loss.item()
+
+         cur_lr = optimizer.param_groups[0]["lr"]
+         logger.info(f"[epoch {epoch + 1}] ,lr: {cur_lr} ,tr_loss: {tr_loss}")
+
+         # evaluate
+         if args.do_eval:
+             all_guids, eval_dataloader = load_test_data(
+                 args, logger, processor, task_name, label_list, tokenizer, output_mode, k
+             )
+             result = run_eval(args, logger, model, eval_dataloader, all_guids, task_name)
+
+             # update the best result
+             if result["f1"] > max_val_f1:
+                 max_val_f1 = result["f1"]
+                 max_result = result
+                 if args.task_name == "trofi":
+                     save_model(args, model, tokenizer)
+                 if args.task_name == "vua":
+                     save_model(args, model, tokenizer)
+
+     logger.info("-----Best Result-----")
+     for key in sorted(max_result.keys()):
+         logger.info(f"  {key} = {str(max_result[key])}")
+
+     return model, max_result
+
+
+ def run_eval(args, logger, model, eval_dataloader, all_guids, task_name, return_preds=False):
+     model.eval()
+
+     eval_loss = 0
+     nb_eval_steps = 0
+     preds = []
+     pred_guids = []
+     out_label_ids = None
+
+     for eval_batch in tqdm(eval_dataloader, desc="Evaluating"):
+         eval_batch = tuple(t.to(args.device) for t in eval_batch)
+
+         if args.model_type in ["MELBERT_MIP", "MELBERT"]:
+             (
+                 input_ids,
+                 input_mask,
+                 segment_ids,
+                 label_ids,
+                 idx,
+                 input_ids_2,
+                 input_mask_2,
+                 segment_ids_2,
+             ) = eval_batch
+         else:
+             input_ids, input_mask, segment_ids, label_ids, idx = eval_batch
+
+         with torch.no_grad():
+             # compute loss values
+             if args.model_type in ["BERT_BASE", "BERT_SEQ", "MELBERT_SPV"]:
+                 logits = model(
+                     input_ids,
+                     target_mask=(segment_ids == 1),
+                     token_type_ids=segment_ids,
+                     attention_mask=input_mask,
+                 )
+                 loss_fct = nn.NLLLoss()
+                 tmp_eval_loss = loss_fct(logits.view(-1, args.num_labels), label_ids.view(-1))
+                 eval_loss += tmp_eval_loss.mean().item()
+                 nb_eval_steps += 1
+
+                 if len(preds) == 0:
+                     preds.append(logits.detach().cpu().numpy())
+                     pred_guids.append([all_guids[i] for i in idx])
+                     out_label_ids = label_ids.detach().cpu().numpy()
+                 else:
+                     preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0)
+                     pred_guids[0].extend([all_guids[i] for i in idx])
+                     out_label_ids = np.append(
+                         out_label_ids, label_ids.detach().cpu().numpy(), axis=0
+                     )
+             elif args.model_type in ["MELBERT_MIP", "MELBERT"]:
+                 logits = model(
+                     input_ids,
+                     input_ids_2,
+                     target_mask=(segment_ids == 1),
+                     target_mask_2=segment_ids_2,
+                     attention_mask_2=input_mask_2,
+                     token_type_ids=segment_ids,
+                     attention_mask=input_mask,
+                 )
+                 loss_fct = nn.NLLLoss()
+                 tmp_eval_loss = loss_fct(logits.view(-1, args.num_labels), label_ids.view(-1))
+                 eval_loss += tmp_eval_loss.mean().item()
+                 nb_eval_steps += 1
+
+                 if len(preds) == 0:
+                     preds.append(logits.detach().cpu().numpy())
+                     pred_guids.append([all_guids[i] for i in idx])
+                     out_label_ids = label_ids.detach().cpu().numpy()
+                 else:
+                     preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0)
+                     pred_guids[0].extend([all_guids[i] for i in idx])
+                     out_label_ids = np.append(
+                         out_label_ids, label_ids.detach().cpu().numpy(), axis=0
+                     )
+
+     eval_loss = eval_loss / nb_eval_steps
+     preds = preds[0]
+     preds = np.argmax(preds, axis=1)
+
+     # compute metrics
+     result = compute_metrics(preds, out_label_ids)
+
+     for key in sorted(result.keys()):
+         logger.info(f"  {key} = {str(result[key])}")
+
+     if return_preds:
+         return preds
+     return result
+
+
+ def load_pretrained_model(args):
+     # Pretrained Model
+     bert = AutoModel.from_pretrained(args.bert_model)
+     for name, param in bert.named_parameters():
+         print(name, param.requires_grad)
+
+     config = bert.config
+     config.type_vocab_size = 4
+     if "albert" in args.bert_model:
+         bert.embeddings.token_type_embeddings = nn.Embedding(
+             config.type_vocab_size, config.embedding_size
+         )
+     else:
+         bert.embeddings.token_type_embeddings = nn.Embedding(
+             config.type_vocab_size, config.hidden_size
+         )
+     bert._init_weights(bert.embeddings.token_type_embeddings)
+
+     # Additional Layers
+     if args.model_type in ["BERT_BASE"]:
+         model = AutoModelForSequenceClassification(
+             args=args, Model=bert, config=config, num_labels=args.num_labels
+         )
+     if args.model_type == "BERT_SEQ":
+         model = AutoModelForTokenClassification(
+             args=args, Model=bert, config=config, num_labels=args.num_labels
+         )
+     if args.model_type == "MELBERT_SPV":
+         model = AutoModelForSequenceClassification_SPV(
+             args=args, Model=bert, config=config, num_labels=args.num_labels
+         )
+     if args.model_type == "MELBERT_MIP":
+         model = AutoModelForSequenceClassification_MIP(
+             args=args, Model=bert, config=config, num_labels=args.num_labels
+         )
+     if args.model_type == "MELBERT":
+         model = AutoModelForSequenceClassification_SPV_MIP(
+             args=args, Model=bert, config=config, num_labels=args.num_labels
+         )
+
+     model.to(args.device)
+     if args.n_gpu > 1 and not args.no_cuda:
+         model = torch.nn.DataParallel(model)
+     return model
+
+
+ def save_model(args, model, tokenizer):
+     model_to_save = (
+         model.module if hasattr(model, "module") else model
+     )  # Only save the model itself
+
+     # If we save using the predefined names, we can load using `from_pretrained`
+     output_model_file = os.path.join(args.log_dir, WEIGHTS_NAME)
+     output_config_file = os.path.join(args.log_dir, CONFIG_NAME)
+
+     torch.save(model_to_save.state_dict(), output_model_file)
+     model_to_save.config.to_json_file(output_config_file)
+     tokenizer.save_vocabulary(args.log_dir)
+
+     # Good practice: save your training arguments together with the trained model
+     output_args_file = os.path.join(args.log_dir, ARGS_NAME)
+     torch.save(args, output_args_file)
+
+
+ def load_trained_model(args, model, tokenizer):
+     # If we saved using the predefined names, we can load using `from_pretrained`
+     output_model_file = os.path.join(args.log_dir, WEIGHTS_NAME)
+
+     if hasattr(model, "module"):
+         model.module.load_state_dict(torch.load(output_model_file, map_location=args.device))
+     else:
+         model.load_state_dict(torch.load(output_model_file, map_location=args.device))
+
+     return model
+
+
+ if __name__ == "__main__":
+     main()
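Note the override convention both `main.py` and `app.py` implement: command-line arguments are consumed as flat `--name value` pairs with the dashes stripped, not via argparse, and every value arrives as a string (the repo's `Config.update_params`, not shown in this commit, presumably handles any casting). A sketch of the same parsing, with illustrative values:

```python
# Sketch of the override parsing in main(): argv is consumed as
# ("--name", "value") pairs and leading dashes are stripped.
from collections import OrderedDict

def parse_overrides(argv):
    cmd_arg = OrderedDict()
    for i in range(0, len(argv), 2):
        cmd_arg[argv[i].strip("-")] = argv[i + 1]
    return cmd_arg

# e.g. python main.py --bert_model saves/roberta-base/4_20230227-0026 --do_train False
print(parse_overrides(["--bert_model", "saves/roberta-base/4_20230227-0026",
                       "--do_train", "False"]))
```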
main_config.cfg ADDED
@@ -0,0 +1,58 @@
+ [args]
+ # BERT pre-trained model selected from the list [bert-base-cased, roberta-base, albert-base-v1 / albert-large-v1] (default = roberta-base)
+ bert_model = roberta-base
+
+ # The input data dir. Should contain the .tsv files (VUA18 / VUAverb / MOH-X/CLS / TroFi/CLS / VUA20)
+ data_dir = data/VUA20
+ # The name of the task to train (vua (1-fold) / trofi (10-fold))
+ task_name = vua
+ # The model type (default = MELBERT) (BERT_BASE / BERT_SEQ / MELBERT_SPV / MELBERT_MIP / MELBERT)
+ model_type = MELBERT
+ # The hidden dimension of the classifier (default = 768)
+ classifier_hidden = 768
+ # Learning rate scheduler (default = warmup_linear) (none / warmup_linear)
+ lr_schedule = warmup_linear
+ # Number of training epochs over which to perform linear learning-rate warmup (default = 2)
+ warmup_epoch = 2
+ # Dropout ratio (default = 0.2)
+ drop_ratio = 0.2
+ # K-fold (default = 10)
+ kfold = 10
+ # Number of bagging rounds (default = 0; 0 disables the bagging technique)
+ num_bagging = 0
+ # The bagging index, used only when bagging is enabled (default = 0)
+ bagging_index = 0
+
+ # Use additional linguistic features
+ # POS tag (default = True)
+ use_pos = True
+ # Local context (default = True)
+ use_local_context = True
+
+ # The maximum total input sequence length after WordPiece tokenization (default = 200)
+ max_seq_length = 150
+ # Whether to run training (default = False)
+ do_train = False
+ # Whether to run eval on the test set (default = False)
+ do_test = True
+ # Whether to run eval on the dev set (default = False)
+ do_eval = True
+ # Set this flag if you are using an uncased model (default = False)
+ do_lower_case = False
+ # Class weight of the metaphor label (default = 3.0)
+ class_weight = 3
+ # Total batch size for training (default = 32)
+ train_batch_size = 16
+ # Total batch size for eval (default = 8)
+ eval_batch_size = 8
+ # The initial learning rate for Adam (default = 3e-5)
+ learning_rate = 3e-5
+ # Total number of training epochs to perform (default = 3.0)
+ num_train_epoch = 3
+
+ # Whether not to use CUDA even when available (default = False)
+ no_cuda = False
+ # Random seed for initialization (default = 42)
+ seed = 42
+
+ max_data_num = None
modeling.py ADDED
@@ -0,0 +1,403 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import torch
3
+ import torch.nn as nn
4
+
5
+ from utils import Config
6
+ from transformers import AutoTokenizer, AutoModel
7
+
8
+
9
+ class AutoModelForSequenceClassification(nn.Module):
10
+ """Base model for sequence classification"""
11
+
12
+ def __init__(self, args, Model, config, num_labels=2):
13
+ """Initialize the model"""
14
+ super(AutoModelForSequenceClassification, self).__init__()
15
+ self.num_labels = num_labels
16
+ self.encoder = Model
17
+ self.config = config
18
+ self.dropout = nn.Dropout(args.drop_ratio)
19
+ self.classifier = nn.Linear(config.hidden_size, num_labels)
20
+ self.logsoftmax = nn.LogSoftmax(dim=1)
21
+
22
+ self._init_weights(self.classifier)
23
+
24
+ def _init_weights(self, module):
25
+ """Initialize the weights"""
26
+ if isinstance(module, (nn.Linear, nn.Embedding)):
27
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
28
+ elif isinstance(module, nn.LayerNorm):
29
+ module.bias.data.zero_()
30
+ module.weight.data.fill_(1.0)
31
+ if isinstance(module, nn.Linear) and module.bias is not None:
32
+ module.bias.data.zero_()
33
+
34
+ def forward(
35
+ self,
36
+ input_ids,
37
+ target_mask=None,
38
+ token_type_ids=None,
39
+ attention_mask=None,
40
+ labels=None,
41
+ head_mask=None,
42
+ ):
43
+ """
44
+ Inputs:
45
+ `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] with the word token indices in the vocabulary
46
+ `target_mask`: a torch.LongTensor of shape [batch_size, sequence_length] with the mask for target wor. 1 for target word and 0 otherwise.
47
+ `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token types indices
48
+ selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to a `sentence B` token (see BERT paper for more details).
49
+ `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices selected in [0, 1].
50
+ It's a mask to be used if the input sequence length is smaller than the max input sequence length in the current batch.
51
+ It's the mask that we typically use for attention when a batch has varying length sentences.
52
+ `labels`: optional labels for the classification output: torch.LongTensor of shape [batch_size, sequence_length]
53
+ with indices selected in [0, ..., num_labels].
54
+ `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
55
+ It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
56
+ """
57
+ outputs = self.encoder(
58
+ input_ids,
59
+ token_type_ids=token_type_ids,
60
+ attention_mask=attention_mask,
61
+ head_mask=head_mask,
62
+ )
63
+ pooled_output = outputs[1]
64
+ pooled_output = self.dropout(pooled_output)
65
+ logits = self.classifier(pooled_output)
66
+ logits = self.logsoftmax(logits)
67
+
68
+ if labels is not None:
69
+ loss_fct = nn.NLLLoss()
70
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
71
+ return loss
72
+ return logits
73
+
74
+
75
+ class AutoModelForTokenClassification(nn.Module):
76
+ """Base model for token classification"""
77
+
78
+ def __init__(self, args, Model, config, num_labels=2):
79
+ """Initialize the model"""
80
+ super(AutoModelForTokenClassification, self).__init__()
81
+ self.num_labels = num_labels
82
+ self.bert = Model
83
+ self.config = config
84
+ self.dropout = nn.Dropout(args.drop_ratio)
85
+ self.classifier = nn.Linear(config.hidden_size, num_labels)
86
+ self.logsoftmax = nn.LogSoftmax(dim=1)
87
+
88
+ self._init_weights(self.classifier)
89
+
90
+ def _init_weights(self, module):
91
+ """Initialize the weights"""
92
+ if isinstance(module, (nn.Linear, nn.Embedding)):
93
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
94
+ elif isinstance(module, nn.LayerNorm):
95
+ module.bias.data.zero_()
96
+ module.weight.data.fill_(1.0)
97
+ if isinstance(module, nn.Linear) and module.bias is not None:
98
+ module.bias.data.zero_()
99
+
100
+ def forward(
101
+ self,
102
+ input_ids,
103
+ target_mask,
104
+ token_type_ids=None,
105
+ attention_mask=None,
106
+ labels=None,
107
+ head_mask=None,
108
+ ):
109
+ """
110
+ Inputs:
111
+ `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] with the word token indices in the vocabulary
112
+ `target_mask`: a torch.LongTensor of shape [batch_size, sequence_length] with the mask for target wor. 1 for target word and 0 otherwise.
113
+ `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token types indices
114
+ selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to a `sentence B` token (see BERT paper for more details).
115
+ `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices selected in [0, 1].
116
+ It's a mask to be used if the input sequence length is smaller than the max input sequence length in the current batch.
117
+ It's the mask that we typically use for attention when a batch has varying length sentences.
118
+ `labels`: optional labels for the classification output: torch.LongTensor of shape [batch_size]
119
+ with indices selected in [0, ..., num_labels - 1].
120
+ `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
121
+ It's a mask to be used to nullify some heads of the transformer. 1.0 => head is not masked, 0.0 => head is masked.
122
+ """
123
+ outputs = self.bert(
124
+ input_ids,
125
+ token_type_ids=token_type_ids,
126
+ attention_mask=attention_mask,
127
+ head_mask=head_mask,
128
+ )
129
+ sequence_output = outputs[0] # [batch, max_len, hidden]
130
+ target_output = sequence_output * target_mask.unsqueeze(2)
131
+ target_output = self.dropout(target_output)
132
+ target_output = target_output.sum(1) / target_mask.sum()  # [batch, hidden]; note: target_mask.sum() is the mask total over the whole batch, not per example
133
+
134
+ logits = self.classifier(target_output)
135
+ logits = self.logsoftmax(logits)
136
+
137
+ if labels is not None:
138
+ loss_fct = nn.NLLLoss()
139
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
140
+ return loss
141
+ return logits
142
+
143
+
144
+ class AutoModelForSequenceClassification_SPV(nn.Module):
145
+ """MelBERT with only SPV"""
146
+
147
+ def __init__(self, args, Model, config, num_labels=2):
148
+ """Initialize the model"""
149
+ super(AutoModelForSequenceClassification_SPV, self).__init__()
150
+ self.num_labels = num_labels
151
+ self.encoder = Model
152
+ self.config = config
153
+ self.dropout = nn.Dropout(args.drop_ratio)
154
+ self.classifier = nn.Linear(config.hidden_size * 2, num_labels)
155
+ self.logsoftmax = nn.LogSoftmax(dim=1)
156
+
157
+ self._init_weights(self.classifier)
158
+
159
+ def _init_weights(self, module):
160
+ """Initialize the weights"""
161
+ if isinstance(module, (nn.Linear, nn.Embedding)):
162
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
163
+ elif isinstance(module, nn.LayerNorm):
164
+ module.bias.data.zero_()
165
+ module.weight.data.fill_(1.0)
166
+ if isinstance(module, nn.Linear) and module.bias is not None:
167
+ module.bias.data.zero_()
168
+
169
+ def forward(
170
+ self,
171
+ input_ids,
172
+ target_mask,
173
+ token_type_ids=None,
174
+ attention_mask=None,
175
+ labels=None,
176
+ head_mask=None,
177
+ ):
178
+ """
179
+ Inputs:
180
+ `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] with the word token indices in the vocabulary
181
+ `target_mask`: a torch.LongTensor of shape [batch_size, sequence_length] with the mask for the target word: 1 for the target word and 0 otherwise.
182
+ `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token type indices
183
+ selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to a `sentence B` token (see BERT paper for more details).
184
+ `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices selected in [0, 1].
185
+ `labels`: optional labels for the classification output: torch.LongTensor of shape [batch_size]
186
+ with indices selected in [0, ..., num_labels - 1].
187
+ `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
188
+ It's a mask to be used to nullify some heads of the transformer. 1.0 => head is not masked, 0.0 => head is masked.
189
+ """
190
+ outputs = self.encoder(
191
+ input_ids,
192
+ token_type_ids=token_type_ids,
193
+ attention_mask=attention_mask,
194
+ head_mask=head_mask,
195
+ )
196
+ sequence_output = outputs[0] # [batch, max_len, hidden]
197
+ pooled_output = outputs[1] # [batch, hidden]
198
+
199
+ # Get target output with target mask
200
+ target_output = sequence_output * target_mask.unsqueeze(2)  # [batch, max_len, hidden]
201
+
202
+ # dropout
203
+ target_output = self.dropout(target_output)
204
+ pooled_output = self.dropout(pooled_output)
205
+
206
+ # Get mean value of target output if the target output consists of more than one token
207
+ target_output = target_output.mean(1)
208
+
209
+ logits = self.classifier(torch.cat([target_output, pooled_output], dim=1))
210
+ logits = self.logsoftmax(logits)
211
+
212
+ if labels is not None:
213
+ loss_fct = nn.NLLLoss()
214
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
215
+ return loss
216
+ return logits
217
+
218
+
219
+ class AutoModelForSequenceClassification_MIP(nn.Module):
220
+ """MelBERT with only MIP"""
221
+
222
+ def __init__(self, args, Model, config, num_labels=2):
223
+ """Initialize the model"""
224
+ super(AutoModelForSequenceClassification_MIP, self).__init__()
225
+ self.num_labels = num_labels
226
+ self.encoder = Model
227
+ self.config = config
228
+ self.dropout = nn.Dropout(args.drop_ratio)
229
+ self.args = args
230
+ self.classifier = nn.Linear(config.hidden_size * 2, num_labels)
231
+ self.logsoftmax = nn.LogSoftmax(dim=1)
232
+
233
+ self._init_weights(self.classifier)
234
+
235
+ def _init_weights(self, module):
236
+ """Initialize the weights"""
237
+ if isinstance(module, (nn.Linear, nn.Embedding)):
238
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
239
+ elif isinstance(module, nn.LayerNorm):
240
+ module.bias.data.zero_()
241
+ module.weight.data.fill_(1.0)
242
+ if isinstance(module, nn.Linear) and module.bias is not None:
243
+ module.bias.data.zero_()
244
+
245
+ def forward(
246
+ self,
247
+ input_ids,
248
+ input_ids_2,
249
+ target_mask,
250
+ target_mask_2,
251
+ attention_mask_2,
252
+ token_type_ids=None,
253
+ attention_mask=None,
254
+ labels=None,
255
+ head_mask=None,
256
+ ):
257
+ """
258
+ Inputs:
259
+ `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] with the first input token indices in the vocabulary
260
+ `input_ids_2`: a torch.LongTensor of shape [batch_size, sequence_length] with the second input token indices
261
+ `target_mask`: a torch.LongTensor of shape [batch_size, sequence_length] with the mask for target word in the first input. 1 for target word and 0 otherwise.
262
+ `target_mask_2`: a torch.LongTensor of shape [batch_size, sequence_length] with the mask for target word in the second input. 1 for target word and 0 otherwise.
263
+ `attention_mask_2`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices selected in [0, 1] for the second input.
264
+ `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token type indices
265
+ selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to a `sentence B` token (see BERT paper for more details).
266
+ `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices selected in [0, 1] for the first input.
267
+ `labels`: optional labels for the classification output: torch.LongTensor of shape [batch_size]
268
+ with indices selected in [0, ..., num_labels - 1].
269
+ `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
270
+ It's a mask to be used to nullify some heads of the transformer. 1.0 => head is not masked, 0.0 => head is masked.
271
+ """
272
+ # First encoder for full sentence
273
+ outputs = self.encoder(
274
+ input_ids,
275
+ token_type_ids=token_type_ids,
276
+ attention_mask=attention_mask,
277
+ head_mask=head_mask,
278
+ )
279
+ sequence_output = outputs[0] # [batch, max_len, hidden]
280
+
281
+ # Get target output with target mask
282
+ target_output = sequence_output * target_mask.unsqueeze(2)
283
+ target_output = self.dropout(target_output)
284
+ target_output = target_output.sum(1) / target_mask.sum()  # [batch, hidden]; note: target_mask.sum() is the mask total over the whole batch, not per example
285
+
286
+ # Second encoder for only the target word
287
+ outputs_2 = self.encoder(input_ids_2, attention_mask=attention_mask_2, head_mask=head_mask)
288
+ sequence_output_2 = outputs_2[0] # [batch, max_len, hidden]
289
+
290
+ # Get target output with target mask
291
+ target_output_2 = sequence_output_2 * target_mask_2.unsqueeze(2)
292
+ target_output_2 = self.dropout(target_output_2)
293
+ target_output_2 = target_output_2.sum(1) / target_mask_2.sum()
294
+
295
+ logits = self.classifier(torch.cat([target_output_2, target_output], dim=1))
296
+ logits = self.logsoftmax(logits)
297
+
298
+ if labels is not None:
299
+ loss_fct = nn.NLLLoss()
300
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
301
+ return loss
302
+ return logits
303
+
304
+
305
+ class AutoModelForSequenceClassification_SPV_MIP(nn.Module):
306
+ """MelBERT"""
307
+
308
+ def __init__(self, args, Model, config, num_labels=2):
309
+ """Initialize the model"""
310
+ super(AutoModelForSequenceClassification_SPV_MIP, self).__init__()
311
+ self.num_labels = num_labels
312
+ self.encoder = Model
313
+ self.config = config
314
+ self.dropout = nn.Dropout(args.drop_ratio)
315
+ self.args = args
316
+
317
+ self.SPV_linear = nn.Linear(config.hidden_size * 2, args.classifier_hidden)
318
+ self.MIP_linear = nn.Linear(config.hidden_size * 2, args.classifier_hidden)
319
+ self.classifier = nn.Linear(args.classifier_hidden * 2, num_labels)
320
+ self._init_weights(self.SPV_linear)
321
+ self._init_weights(self.MIP_linear)
322
+
323
+ self.logsoftmax = nn.LogSoftmax(dim=1)
324
+ self._init_weights(self.classifier)
325
+
326
+ def _init_weights(self, module):
327
+ """Initialize the weights"""
328
+ if isinstance(module, (nn.Linear, nn.Embedding)):
329
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
330
+ elif isinstance(module, nn.LayerNorm):
331
+ module.bias.data.zero_()
332
+ module.weight.data.fill_(1.0)
333
+ if isinstance(module, nn.Linear) and module.bias is not None:
334
+ module.bias.data.zero_()
335
+
336
+ def forward(
337
+ self,
338
+ input_ids,
339
+ input_ids_2,
340
+ target_mask,
341
+ target_mask_2,
342
+ attention_mask_2,
343
+ token_type_ids=None,
344
+ attention_mask=None,
345
+ labels=None,
346
+ head_mask=None,
347
+ ):
348
+ """
349
+ Inputs:
350
+ `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] with the first input token indices in the vocabulary
351
+ `input_ids_2`: a torch.LongTensor of shape [batch_size, sequence_length] with the second input token indices
352
+ `target_mask`: a torch.LongTensor of shape [batch_size, sequence_length] with the mask for target word in the first input. 1 for target word and 0 otherwise.
353
+ `target_mask_2`: a torch.LongTensor of shape [batch_size, sequence_length] with the mask for target word in the second input. 1 for target word and 0 otherwise.
354
+ `attention_mask_2`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices selected in [0, 1] for the second input.
355
+ `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token type indices
356
+ selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to a `sentence B` token (see BERT paper for more details).
357
+ `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices selected in [0, 1] for the first input.
358
+ `labels`: optional labels for the classification output: torch.LongTensor of shape [batch_size]
359
+ with indices selected in [0, ..., num_labels - 1].
360
+ `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
361
+ It's a mask to be used to nullify some heads of the transformer. 1.0 => head is not masked, 0.0 => head is masked.
362
+ """
363
+
364
+ # First encoder for full sentence
365
+ outputs = self.encoder(
366
+ input_ids,
367
+ token_type_ids=token_type_ids,
368
+ attention_mask=attention_mask,
369
+ head_mask=head_mask,
370
+ )
371
+ sequence_output = outputs[0] # [batch, max_len, hidden]
372
+ pooled_output = outputs[1] # [batch, hidden]
373
+
374
+ # Get target output with target mask
375
+ target_output = sequence_output * target_mask.unsqueeze(2)
376
+
377
+ # dropout
378
+ target_output = self.dropout(target_output)
379
+ pooled_output = self.dropout(pooled_output)
380
+
381
+ target_output = target_output.mean(1) # [batch, hidden]
382
+
383
+ # Second encoder for only the target word
384
+ outputs_2 = self.encoder(input_ids_2, attention_mask=attention_mask_2, head_mask=head_mask)
385
+ sequence_output_2 = outputs_2[0] # [batch, max_len, hidden]
386
+
387
+ # Get target output with target mask
388
+ target_output_2 = sequence_output_2 * target_mask_2.unsqueeze(2)
389
+ target_output_2 = self.dropout(target_output_2)
390
+ target_output_2 = target_output_2.mean(1)
391
+
392
+ # Get hidden vectors each from SPV and MIP linear layers
393
+ SPV_hidden = self.SPV_linear(torch.cat([pooled_output, target_output], dim=1))
394
+ MIP_hidden = self.MIP_linear(torch.cat([target_output_2, target_output], dim=1))
395
+
396
+ logits = self.classifier(self.dropout(torch.cat([SPV_hidden, MIP_hidden], dim=1)))
397
+ logits = self.logsoftmax(logits)
398
+
399
+ if labels is not None:
400
+ loss_fct = nn.NLLLoss()
401
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
402
+ return loss
403
+ return logits
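
A minimal usage sketch for the MelBERT (SPV+MIP) model above — illustrative only, not part of the commit. The `args` fields and the roberta-base encoder choice are assumptions inferred from the constructor signatures and the pinned transformers version:

import torch
from types import SimpleNamespace
from transformers import AutoConfig, AutoModel

# Hypothetical hyperparameters; __init__ reads drop_ratio and classifier_hidden from `args`
args = SimpleNamespace(drop_ratio=0.2, classifier_hidden=768)
config = AutoConfig.from_pretrained("roberta-base")
encoder = AutoModel.from_pretrained("roberta-base")
model = AutoModelForSequenceClassification_SPV_MIP(args, encoder, config, num_labels=2)
model.eval()  # disable dropout for a deterministic sanity check

batch_size, seq_len = 2, 16
input_ids = torch.randint(0, config.vocab_size, (batch_size, seq_len))
target_mask = torch.zeros(batch_size, seq_len)
target_mask[:, 3] = 1.0  # mark one target token per example
attention_mask = torch.ones(batch_size, seq_len)

# Without labels, forward() returns log-probabilities of shape [batch_size, num_labels];
# with labels, it returns the NLL loss instead.
log_probs = model(input_ids, input_ids, target_mask, target_mask, attention_mask,
                  attention_mask=attention_mask)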
requirements.txt ADDED
@@ -0,0 +1,2 @@
+ boto3==1.16.63
+ nltk==3.5
+ numpy==1.20.0
+ requests==2.25.1
+ scikit-learn==0.24.1
+ scipy==1.6.0
+ torch==1.6.0
+ torchvision==0.7.0
+ tqdm==4.56.0
+ transformers==4.2.2
run_classifier_dataset_utils.py ADDED
@@ -0,0 +1,669 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """ BERT classification fine-tuning: utilities to work with GLUE tasks """
17
+
18
+ from __future__ import absolute_import, division, print_function
19
+
20
+ import csv
21
+ import logging
22
+ import os
23
+ import sys
24
+ import torch
25
+ from tqdm import tqdm
26
+
27
+ from scipy.stats import pearsonr, spearmanr, truncnorm
28
+ from sklearn.metrics import (
29
+ matthews_corrcoef,
30
+ f1_score,
31
+ precision_score,
32
+ recall_score,
33
+ mean_squared_error,
34
+ )
35
+ import random
36
+ import nltk
37
+ from nltk.corpus import wordnet
38
+
39
+ logger = logging.getLogger(__name__)
40
+
41
+
42
+ class InputExample(object):
43
+ """A single training/test example for simple sequence classification."""
44
+
45
+ def __init__(
46
+ self,
47
+ guid,
48
+ text_a,
49
+ text_b=None,
50
+ label=None,
51
+ POS=None,
52
+ FGPOS=None,
53
+ text_a_2=None,
54
+ text_b_2=None,
55
+ ):
56
+ """Constructs a InputExample.
57
+
58
+ Args:
59
+ guid: Unique id for the example.
60
+ text_a: string. The untokenized text of the first sequence. For single
61
+ sequence tasks, only this sequence must be specified.
62
+ text_b: (Optional) string. The untokenized text of the second sequence.
63
+ Must only be specified for sequence pair tasks.
64
+ label: (Optional) string. The label of the example. This should be
65
+ specified for train and dev examples, but not for test examples.
66
+ """
67
+ self.guid = guid
68
+ self.text_a = text_a
69
+ self.text_b = text_b
70
+ self.label = label
71
+ self.POS = POS
72
+ self.FGPOS = FGPOS
73
+ self.text_a_2 = text_a_2
74
+ self.text_b_2 = text_b_2
75
+
76
+
77
+ class InputFeatures(object):
78
+ """A single set of features of data."""
79
+
80
+ def __init__(
81
+ self,
82
+ input_ids,
83
+ input_mask,
84
+ segment_ids,
85
+ label_id,
86
+ guid=None,
87
+ input_ids_2=None,
88
+ input_mask_2=None,
89
+ segment_ids_2=None,
90
+ ):
91
+ self.input_ids = input_ids
92
+ self.input_mask = input_mask
93
+ self.segment_ids = segment_ids
94
+ self.label_id = label_id
95
+ self.guid = guid
96
+ self.input_ids_2 = input_ids_2
97
+ self.input_mask_2 = input_mask_2
98
+ self.segment_ids_2 = segment_ids_2
99
+
100
+
101
+ class DataProcessor(object):
102
+ """Base class for data converters for sequence classification data sets."""
103
+
104
+ def get_train_examples(self, data_dir):
105
+ """Gets a collection of `InputExample`s for the train set."""
106
+ raise NotImplementedError()
107
+
108
+ def get_dev_examples(self, data_dir):
109
+ """Gets a collection of `InputExample`s for the dev set."""
110
+ raise NotImplementedError()
111
+
112
+ def get_labels(self):
113
+ """Gets the list of labels for this data set."""
114
+ raise NotImplementedError()
115
+
116
+ @classmethod
117
+ def _read_tsv(cls, input_file, quotechar=None):
118
+ """Reads a tab separated value file."""
119
+ with open(input_file, "r", encoding="utf-8") as f:
120
+ reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
121
+ lines = []
122
+ for line in reader:
123
+ if sys.version_info[0] == 2:
124
+ line = list(unicode(cell, "utf-8") for cell in line)
125
+ lines.append(line)
126
+ return lines
127
+
128
+
129
+ class TrofiProcessor(DataProcessor):
130
+ """Processor for the TroFi and MOH-X data set."""
131
+
132
+ def get_train_examples(self, data_dir, k=None):
133
+ """See base class."""
134
+ if k is not None:
135
+ return self._create_examples(
136
+ self._read_tsv(os.path.join(data_dir, "train" + str(k) + ".tsv")), "train"
137
+ )
138
+ else:
139
+ return self._create_examples(
140
+ self._read_tsv(os.path.join(data_dir, "train.tsv")), "train"
141
+ )
142
+
143
+ def get_test_examples(self, data_dir, k=None):
144
+ """See base class."""
145
+ if k is not None:
146
+ return self._create_examples(
147
+ self._read_tsv(os.path.join(data_dir, "test" + str(k) + ".tsv")), "test"
148
+ )
149
+ else:
150
+ return self._create_examples(self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
151
+
152
+ def get_dev_examples(self, data_dir, k=None):
153
+ """See base class."""
154
+ if k is not None:
155
+ return self._create_examples(
156
+ self._read_tsv(os.path.join(data_dir, "dev" + str(k) + ".tsv")), "dev"
157
+ )
158
+ else:
159
+ return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
160
+
161
+ def get_labels(self):
162
+ """See base class."""
163
+ return ["0", "1"]
164
+
165
+ def _create_examples(self, lines, set_type):
166
+ """Creates examples for the training and dev sets."""
167
+ examples = []
168
+ for (i, line) in enumerate(lines):
169
+ if i == 0:
170
+ continue
171
+ guid = "%s-%s" % (set_type, line[0])
172
+ text_a = line[2]
173
+ label = line[1]
174
+ POS = line[3]
175
+ FGPOS = line[4]
176
+ index = line[-1]
177
+ examples.append(
178
+ InputExample(
179
+ guid=guid, text_a=text_a, text_b=index, label=label, POS=POS, FGPOS=FGPOS
180
+ )
181
+ )
182
+ return examples
183
+
184
+
185
+ class VUAProcessor(DataProcessor):
186
+ """Processor for the VUA data set."""
187
+
188
+ def get_train_examples(self, data_dir):
189
+ """See base class."""
190
+ return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
191
+
192
+ def get_test_examples(self, data_dir):
193
+ """See base class."""
194
+ return self._create_examples(self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
195
+
196
+ def get_dev_examples(self, data_dir):
197
+ """See base class."""
198
+ return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
199
+
200
+ def get_labels(self):
201
+ """See base class."""
202
+ return ["0", "1"]
203
+
204
+ def _create_examples(self, lines, set_type):
205
+ """Creates examples for the training and dev sets."""
206
+ examples = []
207
+ for (i, line) in enumerate(lines):
208
+ if i == 0:
209
+ continue
210
+ guid = "%s-%s" % (set_type, line[0])
211
+ text_a = line[2]
212
+ label = line[1]
213
+ POS = line[3]
214
+ FGPOS = line[4]
215
+ if len(line) == 8:
216
+ index = line[5]
217
+ text_a_2 = line[6]
218
+ index_2 = line[7]
219
+ examples.append(
220
+ InputExample(
221
+ guid=guid,
222
+ text_a=text_a,
223
+ text_b=index,
224
+ label=label,
225
+ POS=POS,
226
+ FGPOS=FGPOS,
227
+ text_a_2=text_a_2,
228
+ text_b_2=index_2,
229
+ )
230
+ )
231
+ else:
232
+ index = line[-1]
233
+ examples.append(
234
+ InputExample(
235
+ guid=guid, text_a=text_a, text_b=index, label=label, POS=POS, FGPOS=FGPOS
236
+ )
237
+ )
238
+ return examples
239
+
240
+
241
+ def convert_examples_to_features(
242
+ examples, label_list, max_seq_length, tokenizer, output_mode, args
243
+ ):
244
+ """Loads a data file into a list of `InputBatch`s."""
245
+ label_map = {label: i for i, label in enumerate(label_list)}
246
+
247
+ features = []
248
+ for (ex_index, example) in tqdm(enumerate(examples)):
249
+ if ex_index % 10000 == 0:
250
+ logger.info("Writing example %d of %d" % (ex_index, len(examples)))
251
+
252
+ tokens_a = tokenizer.tokenize(example.text_a) # tokenize the sentence
253
+ tokens_b = None
254
+
255
+ try:
256
+ text_b = int(example.text_b) # index of target word
257
+ tokens_b = text_b
258
+
259
+ # truncate the sentence to max_seq_len
260
+ if len(tokens_a) > max_seq_length - 2:
261
+ tokens_a = tokens_a[: (max_seq_length - 2)]
262
+
263
+ # Find the target word index
264
+ for i, w in enumerate(example.text_a.split()):
265
+ # If w is a target word, tokenize the word and save to text_b
266
+ if i == text_b:
267
+ # consider the index due to models that use a byte-level BPE as a tokenizer (e.g., GPT2, RoBERTa)
268
+ text_b = tokenizer.tokenize(w) if i == 0 else tokenizer.tokenize(" " + w)
269
+ break
270
+ w_tok = tokenizer.tokenize(w) if i == 0 else tokenizer.tokenize(" " + w)
271
+
272
+ # Count number of tokens before the target word to get the target word index
273
+ if w_tok:
274
+ tokens_b += len(w_tok) - 1
275
+
276
+ except TypeError:
277
+ if example.text_b:
278
+ tokens_b = tokenizer.tokenize(example.text_b)
279
+ # Modifies `tokens_a` and `tokens_b` in place so that the total
280
+ # length is less than the specified length.
281
+ # Account for [CLS], [SEP], [SEP] with "- 3"
282
+ _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
283
+ else:
284
+ # Account for [CLS] and [SEP] with "- 2"
285
+ if len(tokens_a) > max_seq_length - 2:
286
+ tokens_a = tokens_a[: (max_seq_length - 2)]
287
+
288
+ tokens = [tokenizer.cls_token] + tokens_a + [tokenizer.sep_token]
289
+ segment_ids = [0] * len(tokens)
290
+ input_ids = tokenizer.convert_tokens_to_ids(tokens)
291
+
292
+ # set the target word as 1 in segment ids
293
+ try:
294
+ tokens_b += 1 # add 1 to the target word index considering [CLS]
295
+ for i in range(len(text_b)):
296
+ segment_ids[tokens_b + i] = 1
297
+ except TypeError:
298
+ pass
299
+
300
+ # The mask has 1 for real tokens and 0 for padding tokens. Only real
301
+ # tokens are attended to.
302
+ input_mask = [1] * len(input_ids)
303
+
304
+ # Zero-pad up to the sequence length.
305
+ padding = [tokenizer.convert_tokens_to_ids(tokenizer.pad_token)] * (
306
+ max_seq_length - len(input_ids)
307
+ )
308
+ input_ids += padding
309
+ input_mask += [0] * len(padding)
310
+ segment_ids += [0] * len(padding)
311
+
312
+ assert len(input_ids) == max_seq_length
313
+ assert len(input_mask) == max_seq_length
314
+ assert len(segment_ids) == max_seq_length
315
+
316
+ if output_mode == "classification":
317
+ label_id = label_map[example.label]
318
+ else:
319
+ raise KeyError(output_mode)
320
+
321
+ if ex_index < 5:
322
+ logger.info("*** Example ***")
323
+ logger.info("guid: %s" % (example.guid))
324
+ logger.info("tokens: %s" % " ".join([str(x) for x in tokens]))
325
+ logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
326
+ logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
327
+ logger.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
328
+ logger.info("label: %s (id = %s)" % (example.label, str(label_id)))
329
+
330
+ features.append(
331
+ InputFeatures(
332
+ input_ids=input_ids,
333
+ input_mask=input_mask,
334
+ segment_ids=segment_ids,
335
+ label_id=label_id,
336
+ guid=example.guid + " " + str(example.text_b),
337
+ )
338
+ )
339
+ return features
340
+
341
+
342
+ def convert_two_examples_to_features(
343
+ examples, label_list, max_seq_length, tokenizer, output_mode, win_size=-1
344
+ ):
345
+ """Loads a data file into a list of `InputBatch`s."""
346
+ label_map = {label: i for i, label in enumerate(label_list)}
347
+
348
+ features = []
349
+ for (ex_index, example) in enumerate(examples):
350
+ if ex_index % 10000 == 0:
351
+ logger.info("Writing example %d of %d" % (ex_index, len(examples)))
352
+
353
+ tokens_a = tokenizer.tokenize(example.text_a) # tokenize the sentence
354
+ tokens_b = None
355
+ text_b = None
356
+
357
+ try:
358
+ text_b = int(example.text_b) # index of target word
359
+ tokens_b = text_b
360
+
361
+ # truncate the sentence to max_seq_len
362
+ if len(tokens_a) > max_seq_length - 2:
363
+ tokens_a = tokens_a[: (max_seq_length - 2)]
364
+
365
+ # Find the target word index
366
+ for i, w in enumerate(example.text_a.split()):
367
+ # If w is a target word, tokenize the word and save to text_b
368
+ if i == text_b:
369
+ # consider the index due to models that use a byte-level BPE as a tokenizer (e.g., GPT2, RoBERTa)
370
+ text_b = tokenizer.tokenize(w) if i == 0 else tokenizer.tokenize(" " + w)
371
+ break
372
+ w_tok = tokenizer.tokenize(w) if i == 0 else tokenizer.tokenize(" " + w)
373
+
374
+ # Count number of tokens before the target word to get the target word index
375
+ if w_tok:
376
+ tokens_b += len(w_tok) - 1
377
+
378
+ except TypeError:
379
+ if example.text_b:
380
+ tokens_b = tokenizer.tokenize(example.text_b)
381
+
382
+ # Modifies `tokens_a` and `tokens_b` in place so that the total
383
+ # length is less than the specified length.
384
+ # Account for [CLS], [SEP], [SEP] with "- 3"
385
+ _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
386
+ else:
387
+ # Account for [CLS] and [SEP] with "- 2"
388
+ if len(tokens_a) > max_seq_length - 2:
389
+ tokens_a = tokens_a[: (max_seq_length - 2)]
390
+
391
+ tokens = [tokenizer.cls_token] + tokens_a + [tokenizer.sep_token]
392
+ segment_ids = [0] * len(tokens)
393
+ #import pdb; pdb.set_trace()
394
+ # set the target word as 1 in segment ids
395
+ try:
396
+ tokens_b += 1 # add 1 to the target word index considering [CLS]
397
+ for i in range(len(text_b)):
398
+ segment_ids[tokens_b + i] = 1
399
+
400
+ # concatentate the second sentence ( ["[CLS]"] + tokens_a + ["[SEP]"] -> ["[CLS]"] + tokens_a + ["[SEP]"] + text_b + ["[SEP]"])
401
+ tokens = tokens + text_b + [tokenizer.sep_token]
402
+ segment_ids = segment_ids + [0] * len(text_b)
403
+ except TypeError:
404
+ pass
405
+
406
+ # The mask has 1 for real tokens and 0 for padding tokens. Only real
407
+ # tokens are attended to.
408
+ input_ids = tokenizer.convert_tokens_to_ids(tokens)
409
+ input_mask = [1] * len(input_ids)
410
+
411
+ # Zero-pad up to the sequence length.
412
+ padding = [tokenizer.convert_tokens_to_ids(tokenizer.pad_token)] * (
413
+ max_seq_length - len(input_ids)
414
+ )
415
+ input_ids += padding
416
+ input_mask += [0] * len(padding)
417
+ segment_ids += [0] * len(padding)
418
+
419
+ assert len(input_ids) == max_seq_length
420
+ assert len(input_mask) == max_seq_length
421
+ assert len(segment_ids) == max_seq_length
422
+
423
+ if output_mode == "classification":
424
+ label_id = label_map[example.label]
425
+ else:
426
+ raise KeyError(output_mode)
427
+
428
+ if ex_index < 5:
429
+ logger.info("*** Example ***")
430
+ logger.info("guid: %s" % (example.guid))
431
+ logger.info("tokens: %s" % " ".join([str(x) for x in tokens]))
432
+ logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
433
+ logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
434
+ logger.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
435
+ logger.info("label: %s (id = %s)" % (example.label, str(label_id)))
436
+
437
+ features.append(
438
+ InputFeatures(
439
+ input_ids=input_ids,
440
+ input_mask=input_mask,
441
+ segment_ids=segment_ids,
442
+ label_id=label_id,
443
+ guid=example.guid + " " + example.text_b,
444
+ )
445
+ )
446
+ return features
447
+
448
+
449
+ def convert_examples_to_two_features(
450
+ examples, label_list, max_seq_length, tokenizer, output_mode, args
451
+ ):
452
+ """Loads a data file into a list of `InputBatch`s."""
453
+ label_map = {label: i for i, label in enumerate(label_list)}
454
+ #import pdb; pdb.set_trace()
455
+ # examples = examples[:args.max_data_num] if args.max_data_num is not None else examples
456
+
457
+ features = []
458
+ for (ex_index, example) in tqdm(enumerate(examples)):
459
+ if ex_index % 10000 == 0:
460
+ logger.info("Writing example %d of %d" % (ex_index, len(examples)))
461
+
462
+ tokens_a = tokenizer.tokenize(example.text_a) # tokenize the sentence
463
+ tokens_b = None
464
+ text_b = None
465
+ #import pdb; pdb.set_trace()
466
+ try:
467
+ #import pdb; pdb.set_trace()
468
+ text_b = int(example.text_b) # index of target word
469
+ tokens_b = text_b
470
+
471
+ # truncate the sentence to max_seq_len
472
+ if len(tokens_a) > max_seq_length - 6:
473
+ tokens_a = tokens_a[: (max_seq_length - 6)]
474
+
475
+ # Find the target word index
476
+ for i, w in enumerate(example.text_a.split()):
477
+ # If w is a target word, tokenize the word and save to text_b
478
+ if i == text_b:
479
+ # consider the index due to models that use a byte-level BPE as a tokenizer (e.g., GPT2, RoBERTa)
480
+ text_b = tokenizer.tokenize(w) if i == 0 else tokenizer.tokenize(" " + w)
481
+ break
482
+
483
+ w_tok = tokenizer.tokenize(w) if i == 0 else tokenizer.tokenize(" " + w)
484
+
485
+ # Count number of tokens before the target word to get the target word index
486
+ if w_tok:
487
+ tokens_b += len(w_tok) - 1
488
+
489
+ if tokens_b + len(text_b) > max_seq_length - 6:
490
+ continue
491
+
492
+ except TypeError:
493
+ #import pdb; pdb.set_trace()
494
+ # print('Y|', example.text_b, tokens_b)  # leftover debug print, disabled
495
+ if example.text_b:
496
+ tokens_b = tokenizer.tokenize(example.text_b)
497
+ # Account for [CLS], [SEP], [SEP] with "- 3"
498
+ _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
499
+ else:
500
+ # Account for [CLS] and [SEP] with "- 2"
501
+ if len(tokens_a) > max_seq_length - 2:
502
+ tokens_a = tokens_a[: (max_seq_length - 2)]
503
+
504
+ tokens = [tokenizer.cls_token] + tokens_a + [tokenizer.sep_token]
505
+ # print('after|', text_b, tokens_b, tokens)  # leftover debug print (fired for every example), disabled
506
+ #print('N|', tokens_b)
507
+ # POS tag tokens
508
+ if args.use_pos:
509
+ POS_token = tokenizer.tokenize(example.POS)
510
+ tokens += POS_token + [tokenizer.sep_token]
511
+
512
+ # Local context
513
+ if args.use_local_context:
514
+ local_start = 1
515
+ local_end = local_start + len(tokens_a)
516
+ comma1 = tokenizer.tokenize(",")[0]
517
+ comma2 = tokenizer.tokenize(" ,")[0]
518
+ for i, w in enumerate(tokens):
519
+ if i < tokens_b + 1 and (w in [comma1, comma2]):
520
+ local_start = i
521
+ if i > tokens_b + 1 and (w in [comma1, comma2]):
522
+ local_end = i
523
+ break
524
+ segment_ids = [
525
+ 2 if i >= local_start and i <= local_end else 0 for i in range(len(tokens))
526
+ ]
527
+ else:
528
+ segment_ids = [0] * len(tokens)
529
+
530
+ # POS tag encoding
531
+ after_token_a = False
532
+ for i, t in enumerate(tokens):
533
+ if t == tokenizer.sep_token:
534
+ after_token_a = True
535
+ if after_token_a and t != tokenizer.sep_token:
536
+ segment_ids[i] = 3
537
+
538
+ input_ids = tokenizer.convert_tokens_to_ids(tokens)
539
+
540
+ try:
541
+ tokens_b += 1 # add 1 to the target word index considering [CLS]
542
+ for i in range(len(text_b)):
543
+ segment_ids[tokens_b + i] = 1
544
+ except TypeError:
545
+ pass
546
+
547
+ input_mask = [1] * len(input_ids)
548
+ padding = [tokenizer.convert_tokens_to_ids(tokenizer.pad_token)] * (
549
+ max_seq_length - len(input_ids)
550
+ )
551
+ input_ids += padding
552
+ input_mask += [0] * len(padding)
553
+ segment_ids += [0] * len(padding)
554
+
555
+ assert len(input_ids) == max_seq_length
556
+ assert len(input_mask) == max_seq_length
557
+ assert len(segment_ids) == max_seq_length
558
+
559
+ if output_mode == "classification":
560
+ label_id = label_map[example.label]
561
+ else:
562
+ raise KeyError(output_mode)
563
+
564
+ # Second features (Target word)
565
+ tokens = [tokenizer.cls_token] + text_b + [tokenizer.sep_token]
566
+ segment_ids_2 = [0] * len(tokens)
567
+ try:
568
+ tokens_b = 1 # add 1 to the target word index considering [CLS]
569
+ for i in range(len(text_b)):
570
+ segment_ids_2[tokens_b + i] = 1
571
+ except TypeError:
572
+ pass
573
+
574
+ # The mask has 1 for real tokens and 0 for padding tokens. Only real tokens are attended to.
575
+ input_ids_2 = tokenizer.convert_tokens_to_ids(tokens)
576
+ input_mask_2 = [1] * len(input_ids_2)
577
+
578
+ padding = [tokenizer.convert_tokens_to_ids(tokenizer.pad_token)] * (
579
+ max_seq_length - len(input_ids_2)
580
+ )
581
+ input_ids_2 += padding
582
+ input_mask_2 += [0] * len(padding)
583
+ segment_ids_2 += [0] * len(padding)
584
+
585
+ assert len(input_ids_2) == max_seq_length
586
+ assert len(input_mask_2) == max_seq_length
587
+ assert len(segment_ids_2) == max_seq_length
588
+
589
+ features.append(
590
+ InputFeatures(
591
+ input_ids=input_ids,
592
+ input_mask=input_mask,
593
+ segment_ids=segment_ids,
594
+ label_id=label_id,
595
+ guid=example.guid + " " + str(example.text_b),
596
+ input_ids_2=input_ids_2,
597
+ input_mask_2=input_mask_2,
598
+ segment_ids_2=segment_ids_2,
599
+ )
600
+ )
601
+
602
+ return features
603
+
604
+
605
+ def _truncate_seq_pair(tokens_a, tokens_b, max_length):
606
+ """Truncates a sequence pair in place to the maximum length."""
607
+
608
+ # This is a simple heuristic which will always truncate the longer sequence
609
+ # one token at a time. This makes more sense than truncating an equal percent
610
+ # of tokens from each, since if one sequence is very short then each token
611
+ # that's truncated likely contains more information than a longer sequence.
612
+ while True:
613
+ total_length = len(tokens_a) + len(tokens_b)
614
+ if total_length <= max_length:
615
+ break
616
+ if len(tokens_a) > len(tokens_b):
617
+ tokens_a.pop()
618
+ else:
619
+ tokens_b.pop()
620
+
621
+
622
+ def simple_accuracy(preds, labels):
623
+ return (preds == labels).mean()
624
+
625
+
626
+ def seq_accuracy(preds, labels):
627
+ acc = []
628
+ for idx, pred in enumerate(preds):
629
+ acc.append((pred == labels[idx]).mean())
630
+ return sum(acc) / len(acc)  # acc is a plain Python list, so list.mean() would raise AttributeError
631
+
632
+
633
+ def acc_and_f1(preds, labels):
634
+ acc = simple_accuracy(preds, labels)
635
+ f1 = f1_score(y_true=labels, y_pred=preds)
636
+ return {
637
+ "acc": acc,
638
+ "f1": f1,
639
+ "acc_and_f1": (acc + f1) / 2,
640
+ }
641
+
642
+
643
+ def all_metrics(preds, labels):
644
+ acc = simple_accuracy(preds, labels)
645
+ f1 = f1_score(y_true=labels, y_pred=preds)
646
+ pre = precision_score(y_true=labels, y_pred=preds)
647
+ rec = recall_score(y_true=labels, y_pred=preds)
648
+ return {
649
+ "acc": acc,
650
+ "precision": pre,
651
+ "recall": rec,
652
+ "f1": f1,
653
+ }
654
+
655
+
656
+ def compute_metrics(preds, labels):
657
+ assert len(preds) == len(labels)
658
+ return all_metrics(preds, labels)
659
+
660
+
661
+ processors = {
662
+ "vua": VUAProcessor,
663
+ "trofi": TrofiProcessor,
664
+ }
665
+
666
+ output_modes = {
667
+ "vua": "classification",
668
+ "trofi": "classification",
669
+ }
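
A short usage sketch of the data pipeline above — illustrative only. The data directory mirrors scripts/run.sh; max_seq_length and the `args` flags are assumptions (`use_pos` and `use_local_context` are the only fields `convert_examples_to_two_features` reads):

from types import SimpleNamespace
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("roberta-base")
processor = processors["vua"]()                        # VUAProcessor
label_list = processor.get_labels()                    # ["0", "1"]
examples = processor.get_train_examples("data/VUA20")  # expects train.tsv inside
args = SimpleNamespace(use_pos=False, use_local_context=False)
features = convert_examples_to_two_features(
    examples, label_list, 128, tokenizer, "classification", args
)
# Each resulting InputFeatures carries two padded views: the full sentence
# (input_ids) and the isolated target word (input_ids_2), as consumed by the MIP models.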
scripts/run.sh ADDED
@@ -0,0 +1,2 @@
1
+ #!/bin/bash
2
+ python main.py --data_dir data/VUA20 --task_name vua --model_type MELBERT --train_batch_size 32 --learning_rate 3e-5 --warmup_epoch 2
scripts/run_bagging.sh ADDED
@@ -0,0 +1,8 @@
1
+ #!/bin/bash
2
+
3
+ INDEXES=$(seq 0 9)
4
+ for i in $INDEXES
5
+ do
6
+ echo "Running bagging for index $i"
7
+ python main.py --data_dir data/VUA20 --task_name vua --model_type MELBERT --train_batch_size 32 --learning_rate 3e-5 --warmup_epoch 2 --num_bagging 10 --bagging_index $i
8
+ done
utils/Config.py ADDED
@@ -0,0 +1,128 @@
1
+ import os
2
+ from collections import OrderedDict
3
+ from configparser import ConfigParser
4
+
5
+ class Config:
6
+ def __init__(self, main_conf_path):
7
+ self.main_conf_path = main_conf_path
8
+ self.main_config = self.read_config(os.path.join(main_conf_path, 'main_config.cfg'))
9
+
10
+ def read_config(self, conf_path):
11
+ conf_dict = OrderedDict()
12
+
13
+ config = ConfigParser()
14
+ config.read(conf_path)
15
+ for section in config.sections():
16
+ section_config = OrderedDict(config[section].items())
17
+ conf_dict[section] = self.type_ensurance(section_config)
18
+ self.__dict__.update((k, v) for k, v in conf_dict[section].items())
19
+
20
+ return conf_dict
21
+
22
+
23
+ def ensure_value_type(self, v):
24
+ BOOLEAN = {'false': False, 'False': False,
25
+ 'true': True, 'True': True}
26
+ if isinstance(v, str):
27
+ try:
28
+ value = eval(v)
29
+ if not isinstance(value, (str, int, float, list, tuple)):
30
+ value = v
31
+ except:
32
+ if v in BOOLEAN:
33
+ v = BOOLEAN[v]
34
+ value = v
35
+ else:
36
+ value = v
37
+ return value
38
+
39
+ def type_ensurance(self, config):
40
+ BOOLEAN = {'false': False, 'False': False,
41
+ 'true': True, 'True': True}
42
+
43
+ for k, v in config.items():
44
+ try:
45
+ value = eval(v)
46
+ if not isinstance(value, (str, int, float, list, tuple)):
47
+ value = v
48
+ except:
49
+ if v in BOOLEAN:
50
+ v = BOOLEAN[v]
51
+ value = v
52
+ config[k] = value
53
+ return config
54
+
55
+ def get_param(self, section, param):
56
+ if section in self.main_config:
57
+ section = self.main_config[section]
58
+ else:
59
+ raise NameError("There are not the parameter named '%s'" % section)
60
+
61
+ if param in section:
62
+ value = section[param]
63
+ else:
64
+ raise NameError("There are not the parameter named '%s'" % param)
65
+
66
+ return value
67
+
68
+ def update_params(self, params):
69
+ # for now, assume 'params' is dictionary
70
+
71
+ for k, v in params.items():
72
+ updated=False
73
+ for section in self.main_config:
74
+ if k in self.main_config[section]:
75
+ self.main_config[section][k] = self.ensure_value_type(v)
76
+ self.__dict__[k] = self.main_config[section][k]
77
+ updated = True
78
+
79
+ break
80
+
81
+ if not updated:
82
+ # raise ValueError
83
+ print("Parameter not updated. '%s' does not exist." % k)
84
+
85
+
86
+ def save(self, base_dir):
87
+ def helper(section_k, section_v):
88
+ sec_str = '[%s]\n' % section_k
89
+ for k, v in section_v.items():
90
+ sec_str += '%s=%s\n' % (str(k), str(v))
91
+ sec_str += '\n'
92
+ return sec_str
93
+
94
+ # save main config
95
+ main_conf_str =''
96
+ for section in self.main_config:
97
+ main_conf_str += helper(section, self.main_config[section])
98
+ with open(os.path.join(base_dir, 'main_config.cfg'), 'wt') as f:
99
+ f.write(main_conf_str)
100
+
101
+
102
+ print('main config saved in %s' % base_dir)
103
+
104
+ def __getitem__(self, item):
105
+ if not isinstance(item, str):
106
+ raise TypeError("index must be a str")
107
+
108
+ if item in self.main_config:
109
+ section = self.main_config[item]
110
+ else:
111
+ raise NameError("There are not the parameter named '%s'" % item)
112
+ return section
113
+
114
+ def __str__(self):
115
+ config_str = '\n'
116
+
117
+ config_str += '>>>>> Main Config\n'
118
+ for section in self.main_config:
119
+ config_str += '[%s]\n' % section
120
+ config_str += '\n'.join(['{}: {}'.format(k, self.main_config[section][k]) for k in self.main_config[section]])
121
+ config_str += '\n\n'
122
+
123
+ return config_str
124
+
125
+ if __name__ == '__main__':
126
+ param = Config('..')  # Config expects the directory containing main_config.cfg, not the file path itself
127
+
128
+ print(param)
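
For reference, a sketch of the main_config.cfg shape this class parses — the section and key names here are made up for illustration. Values pass through `eval`, so strings need quotes while numbers and booleans can be bare:

[Dataset]
data_dir='data/VUA20'

[Training]
learning_rate=3e-5
use_pos=False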
utils/Logger.py ADDED
@@ -0,0 +1,84 @@
1
+ import os
2
+ from time import strftime
3
+ import logging
4
+
5
+
6
+ def make_log_dir(log_dir):
7
+ """
8
+ Generate a new indexed run directory under log_dir.
9
+
10
+ :param log_dir: base directory in which to create the run directory
11
+
12
+ :return: full path of the newly created run directory
13
+ """
14
+ if not os.path.exists(log_dir):
15
+ os.mkdir(log_dir)
16
+
17
+ log_dirs = os.listdir(log_dir)
18
+ if len(log_dirs) == 0:
19
+ idx = 0
20
+ else:
21
+ idx_list = sorted([int(d.split("_")[0]) for d in log_dirs])
22
+ idx = idx_list[-1] + 1
23
+
24
+ cur_log_dir = "%d_%s" % (idx, strftime("%Y%m%d-%H%M"))
25
+ full_log_dir = os.path.join(log_dir, cur_log_dir)
26
+ if not os.path.exists(full_log_dir):
27
+ os.mkdir(full_log_dir)
28
+
29
+ return full_log_dir
30
+
31
+
32
+ class Logger:
33
+ def __init__(self, log_dir):
34
+ log_file_format = "[%(lineno)d]%(asctime)s: %(message)s"
35
+ log_console_format = "%(message)s"
36
+
37
+ # Main logger
38
+ self.log_dir = log_dir
39
+
40
+ self.logger = logging.getLogger(log_dir)
41
+ self.logger.setLevel(logging.INFO)
42
+ self.logger.propagate = False
43
+
44
+ console_handler = logging.StreamHandler()
45
+ console_handler.setLevel(logging.INFO)
46
+ console_handler.setFormatter(logging.Formatter(log_console_format))
47
+
48
+ file_handler = logging.FileHandler(os.path.join(log_dir, "experiments.log"))
49
+ file_handler.setLevel(logging.DEBUG)
50
+ file_handler.setFormatter(logging.Formatter(log_file_format))
51
+
52
+ self.logger.addHandler(console_handler)
53
+ self.logger.addHandler(file_handler)
54
+
55
+ def info(self, msg):
56
+ self.logger.info(msg)
57
+
58
+ def close(self):
59
+ for handle in self.logger.handlers[:]:
60
+ self.logger.removeHandler(handle)
61
+ logging.shutdown()
62
+
63
+
64
+ def setup_logger(log_dir):
65
+ log_file_format = "[%(lineno)d]%(asctime)s: %(message)s"
66
+ log_console_format = "%(message)s"
67
+
68
+ # Main logger
69
+ logger = logging.getLogger()
70
+ logger.setLevel(logging.INFO)
71
+ logger.propagate = False
72
+
73
+ console_handler = logging.StreamHandler()
74
+ console_handler.setLevel(logging.INFO)
75
+ console_handler.setFormatter(logging.Formatter(log_console_format))
76
+
77
+ file_handler = logging.FileHandler(os.path.join(log_dir, "experiments.log"))
78
+ file_handler.setLevel(logging.DEBUG)
79
+ file_handler.setFormatter(logging.Formatter(log_file_format))
80
+
81
+ logger.addHandler(console_handler)
82
+ logger.addHandler(file_handler)
83
+
84
+ return logger
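
A quick usage sketch — illustrative, with an assumed base directory name. `make_log_dir` creates an auto-numbered run folder named `<idx>_<YYYYmmdd-HHMM>`, and `Logger` tees messages to the console and to `experiments.log` inside it:

from utils import Logger, make_log_dir

log_dir = make_log_dir("saves")  # e.g. saves/0_20240101-1200
logger = Logger(log_dir)
logger.info("***** Running training *****")
logger.close()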
utils/ResultTable.py ADDED
@@ -0,0 +1,160 @@
1
+ import numpy as np
2
+ from collections import OrderedDict
3
+
4
+ class ResultTable:
5
+ """
6
+
7
+ Class to save and show results neatly.
8
+ The first column is always the 'NAME' column.
9
+
10
+ """
11
+ def __init__(self, table_name='table', header=None, splitter='||', int_formatter='%3d', float_formatter='%.4f'):
12
+ """
13
+ Initialize table setting.
14
+
15
+ :param list header: list of string, table headers.
16
+ :param str splitter:
17
+ :param str int_formatter:
18
+ :param str float_formatter:
19
+ """
20
+ self.table_name = table_name
21
+ self.header = header
22
+ if self.header is not None:
23
+ self.set_headers(self.header)
24
+ self.num_rows = 0
25
+ self.splitter = splitter
26
+ self.int_formatter = int_formatter
27
+ self.float_formatter = float_formatter
28
+
29
+ def set_headers(self, header):
30
+ """
31
+ Set table headers as given and clear all data.
32
+
33
+ :param list header: list of header strings
34
+ :return: None
35
+ """
36
+ self.header = header
37
+ if 'NAME' not in header:
38
+ self.header = ['NAME'] + self.header
39
+ self.data = OrderedDict([(h, []) for h in self.header])
40
+ self.max_len = OrderedDict([(h, len(h)) for h in self.header])
41
+ # {h: len(h) for h in self.header}
42
+
43
+ def add_row(self, row_name, row_dict):
44
+ """
45
+ Add new row into the table.
46
+
47
+ :param str row_name: name of the row, which will be the first column
48
+ :param dict row_dict: dictionary containing column name as a key and column value as value.
49
+ :return: None
50
+ """
51
+
52
+ # If header is not defined, fetch from input dict
53
+ if self.header is None:
54
+ self.set_headers(list(row_dict.keys()))
55
+
56
+ # If input dict has new column, make one
57
+ for key in row_dict:
58
+ if key not in self.data:
59
+ self.set_headers(self.header + [key])
60
+
61
+ for h in self.header:
62
+ if h == 'NAME':
63
+ self.data['NAME'].append(row_name)
64
+ self.max_len[h] = max(self.max_len['NAME'], len(row_name))
65
+ else:
66
+ # If input dict doesn't have values for table header, make empty value.
67
+ if h not in row_dict:
68
+ row_dict[h] = '-'
69
+
70
+ # convert input dict to string
71
+ d = row_dict[h]
72
+
73
+ if isinstance(d, (int, np.integer)):
74
+ d_str = self.int_formatter % d
75
+ elif isinstance(d, (float, np.floating)):  # np.floating also catches np.float32; np.float is just a deprecated alias of float
76
+ d_str = self.float_formatter % d
77
+ elif isinstance(d, str):
78
+ d_str = d
79
+ elif isinstance(d, list):
80
+ d_str = str(d)
81
+ else:
82
+ raise NotImplementedError('data type currently not supported. %s' % str(type(d)))
83
+
84
+ self.data[h].append(d_str)
85
+ self.max_len[h] = max(self.max_len[h], len(d_str))
86
+ self.num_rows += 1
87
+
88
+ def row_to_line(self, row_values):
89
+ """
90
+ Convert a row into string form
91
+
92
+ :param list row_values: list of row values as string
93
+ :return: string form of a row
94
+ """
95
+ value_str = []
96
+ for i, header in enumerate(self.header):
97
+ max_length = self.max_len[header]
98
+ length = len(row_values[i])
99
+ diff = max_length - length
100
+
101
+ # Center align
102
+ # left_space = diff // 2
103
+ # right_space = diff - left_space
104
+ # s = ' ' * left_space + row_values[i] + ' ' * right_space
105
+
106
+ # Left align
107
+ s = row_values[i] + ' ' * diff
108
+ value_str.append(s)
109
+
110
+ # for i, max_length in enumerate(self.max_len.values()):
111
+ # length = len(row_values[i])
112
+ # diff = max_length - length
113
+ #
114
+ # # Center align
115
+ # # left_space = diff // 2
116
+ # # right_space = diff - left_space
117
+ # # s = ' ' * left_space + row_values[i] + ' ' * right_space
118
+ #
119
+ # # Left align
120
+ # s = row_values[i] + ' ' * diff
121
+ # value_str.append(s)
122
+
123
+ return self.splitter + ' ' + (' %s ' % self.splitter).join(value_str) + ' ' + self.splitter
124
+
125
+ def to_string(self):
126
+ """
127
+ Convert a table into string form
128
+
129
+ :return: string form of the table
130
+ """
131
+ size_per_col = {h: self.max_len[h] + 2 + len(self.splitter) for h in self.header}
132
+ line_len = sum([size_per_col[c] for c in size_per_col]) + len(self.splitter)
133
+ table_str = '\n'
134
+
135
+ # TABLE NAME
136
+ table_str += self.table_name + '\n'
137
+
138
+ # HEADER
139
+ line = self.row_to_line(self.header)
140
+ table_str += '=' * line_len + '\n'
141
+ table_str += line + '\n'
142
+ table_str += self.splitter + '-' * (line_len - len(self.splitter) * 2) + self.splitter + '\n'
143
+
144
+ # DATA
145
+ for row_values in zip(*self.data.values()):
146
+ line = self.row_to_line(row_values)
147
+ table_str += line + '\n'
148
+ table_str += '=' * line_len + '\n'
149
+ return table_str
150
+
151
+ def show(self):
152
+ print(self.to_string())
153
+
154
+ @property
155
+ def shape(self):
156
+ return (self.num_rows, self.num_cols)
157
+
158
+ @property
159
+ def num_cols(self):
160
+ return len(self.header)
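
A usage sketch for the table above, with illustrative values:

from utils import ResultTable

table = ResultTable(table_name='[Eval]', header=['acc', 'f1'])
table.add_row('epoch 1', {'acc': 0.90, 'f1': 0.71})
table.add_row('epoch 2', {'acc': 0.91, 'f1': 0.72})
table.show()  # prints an aligned, '||'-delimited table with a leading NAME column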
utils/Statistics.py ADDED
@@ -0,0 +1,29 @@
1
+ import numpy as np
2
+
3
+ class Statistics:
4
+ def __init__(self, name='AVG'):
5
+ self.name = name
6
+ self.history = []
7
+ self.sum = 0
8
+ self.cnt = 0
9
+
10
+ def update(self, val):
11
+ self.history.append(val)
12
+ self.sum += val
13
+ self.cnt += 1
14
+
15
+ @property
16
+ def mean_std(self):
17
+ # mean = self.sum / self.cnt
18
+ mean = np.mean(self.history)
19
+ std = np.std(self.history)
20
+ return mean, std
21
+
22
+ @property
23
+ def mean(self):
24
+ # return self.sum / self.cnt
25
+ return np.mean(self.history)
26
+
27
+ @property
28
+ def std(self):
29
+ return np.std(self.history)
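
A usage sketch — illustrative. Note that Statistics is not re-exported from utils/__init__.py, so it is imported from the module directly:

from utils.Statistics import Statistics

f1 = Statistics('f1')
for val in (0.71, 0.72, 0.73):
    f1.update(val)
mean, std = f1.mean_std  # properties backed by np.mean / np.std over the history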
utils/Tool.py ADDED
@@ -0,0 +1,17 @@
1
+ import math
2
+ import time
3
+
4
+ import numpy as np
5
+ import torch
6
+
7
+ def set_random_seed(seed):
8
+ np.random.seed(seed)
9
+ torch.random.manual_seed(seed)
10
+ if torch.cuda.is_available():
11
+ torch.cuda.manual_seed_all(seed)
12
+ torch.backends.cudnn.deterministic = True
13
+ torch.backends.cudnn.benchmark = False
14
+
15
+ def getlocaltime():
16
+ date = time.strftime('%y-%m-%d', time.localtime())
17
+ current_time = time.strftime('%H:%M:%S', time.localtime())
+ return date, current_time  # originally missing: without it the function computed both strings and discarded them
utils/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ from .Config import Config
2
+ from .ResultTable import ResultTable
3
+ from .Logger import Logger, make_log_dir
4
+ from .Tool import set_random_seed
utils/__pycache__/Config.cpython-36.pyc ADDED
Binary file (3.8 kB)
utils/__pycache__/Config.cpython-38.pyc ADDED
Binary file (3.77 kB)
utils/__pycache__/Logger.cpython-36.pyc ADDED
Binary file (2.35 kB)
utils/__pycache__/Logger.cpython-38.pyc ADDED
Binary file (2.32 kB)
utils/__pycache__/ResultTable.cpython-36.pyc ADDED
Binary file (4.42 kB)
utils/__pycache__/ResultTable.cpython-38.pyc ADDED
Binary file (4.46 kB)
utils/__pycache__/Tool.cpython-36.pyc ADDED
Binary file (708 Bytes)
utils/__pycache__/Tool.cpython-38.pyc ADDED
Binary file (724 Bytes)
utils/__pycache__/__init__.cpython-36.pyc ADDED
Binary file (304 Bytes)
utils/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (312 Bytes)