diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..61c3f168be1b5a7ad45e2abe56f997d33dee65bc 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -33,3 +33,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+assets/control.png filter=lfs diff=lfs merge=lfs -text
+assets/subject.png filter=lfs diff=lfs merge=lfs -text
+generation/control/ControlNet/font/DejaVuSans.ttf filter=lfs diff=lfs merge=lfs -text
+generation/control/ControlNet/ldm/modules/image_degradation/utils/test.png filter=lfs diff=lfs merge=lfs -text
+llama/data/MetaMathQA-40K.json filter=lfs diff=lfs merge=lfs -text
+llama/data/MetaMathQA.json filter=lfs diff=lfs merge=lfs -text
diff --git a/assets/control.png b/assets/control.png
new file mode 100644
index 0000000000000000000000000000000000000000..1afa338bd3be23650e50d92e1f691e6a86c5dd1f
--- /dev/null
+++ b/assets/control.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b1943c7d2d2042fd1f5455f7c85509c7fc2299221d3118caf8369807b99ff451
+size 1046367
diff --git a/assets/subject.png b/assets/subject.png
new file mode 100644
index 0000000000000000000000000000000000000000..a26f201248d2be6b1a12608ca5f1b8c6a48eb3f2
--- /dev/null
+++ b/assets/subject.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d115037067258634d251581e308b6509fd9b8190b6084d00a211b6886dd379c7
+size 966400
diff --git a/generation/control/ControlNet/font/DejaVuSans.ttf b/generation/control/ControlNet/font/DejaVuSans.ttf
new file mode 100644
index 0000000000000000000000000000000000000000..356575d14731ad077bde1fb0aac44f88bb51f5c4
--- /dev/null
+++ b/generation/control/ControlNet/font/DejaVuSans.ttf
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7da195a74c55bef988d0d48f9508bd5d849425c1770dba5d7bfc6ce9ed848954
+size 757076
diff --git a/generation/control/ControlNet/ldm/modules/image_degradation/utils/test.png b/generation/control/ControlNet/ldm/modules/image_degradation/utils/test.png
new file mode 100644
index 0000000000000000000000000000000000000000..e720ed04ac7e1e7938d367e692fb6a742c54a24c
--- /dev/null
+++ b/generation/control/ControlNet/ldm/modules/image_degradation/utils/test.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:92e516278f0d3e85e84cfb55b43338e12d5896a0ee3833aafdf378025457d753
+size 441072
diff --git a/llama/data/MetaMathQA-40K.json b/llama/data/MetaMathQA-40K.json
new file mode 100644
index 0000000000000000000000000000000000000000..4f19d6fafbb73a7ea9d677ed38be2be0c58c3d0b
--- /dev/null
+++ b/llama/data/MetaMathQA-40K.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c884f10e8aa1229a6e73a6bba2c9134ee0c7b7de92a02a7b8c9459085a59e117
+size 31076207
diff --git a/llama/data/MetaMathQA.json b/llama/data/MetaMathQA.json
new file mode 100644
index 0000000000000000000000000000000000000000..b7419df6d428ff02158986b096f444c37fdd4eab
--- /dev/null
+++ b/llama/data/MetaMathQA.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fb39a5d8c05c042ece92eae37dfd5ea414a5979df2bf3ad3b86411bef8205725
+size 395626321
diff --git a/llama/output/cp1e4/ft/adapter_model.safetensors b/llama/output/cp1e4/ft/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..a09665ee5196fce110ad253798bb2f72e2c9ed8f
--- /dev/null
+++ b/llama/output/cp1e4/ft/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7e1c2fceb4f91331d69364aa56d01dd2103d4e59066f1519f1242a62ecca387a
+size 1082171824
diff --git a/llama/output/cp1e4/ft/tokenizer.model b/llama/output/cp1e4/ft/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899
--- /dev/null
+++ b/llama/output/cp1e4/ft/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723
diff --git a/llama/output/cp1e5/ft/adapter_model.safetensors b/llama/output/cp1e5/ft/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..35e96c42e0e006be658eb2dda85727d7159f42d9
--- /dev/null
+++ b/llama/output/cp1e5/ft/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f6121d3f7682fd21f70fc78ab9097b22ede67191507c54d44a9bd9c30adf44de
+size 592928
diff --git a/llama/output/cp1e5N/ft/adapter_model.safetensors b/llama/output/cp1e5N/ft/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..01948e41303160d582479263b0a8ae80571ce40c
--- /dev/null
+++ b/llama/output/cp1e5N/ft/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d85146aea100acda2fd5bb5a011f8d1e14983756bb0c102bf85efe04ac176479
+size 1082171824
diff --git a/llama/output/cp1e5N/ft/tokenizer.model b/llama/output/cp1e5N/ft/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899
--- /dev/null
+++ b/llama/output/cp1e5N/ft/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723
diff --git a/llama/output/cp3e5/ft/adapter_model.safetensors b/llama/output/cp3e5/ft/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..25c7e034ff8d777506f369efbce304800e56b3ce
--- /dev/null
+++ b/llama/output/cp3e5/ft/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1945e74d818ded53f08bc892bb458dd0e6addcd548b2f864dbd16a476a8954ef
+size 1082171824
diff --git a/llama/output/cp3e5N/ft/adapter_model.safetensors b/llama/output/cp3e5N/ft/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..18d2c70bada6242bbef820c9c93a5355721da5b2
--- /dev/null
+++ b/llama/output/cp3e5N/ft/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a2396d96c0a301cceddf424fbdf7c7f3518311f90140fa9aad9053706288e9fc
+size 1082171824
diff --git a/llama/output/cp3e5N/ft/tokenizer.model b/llama/output/cp3e5N/ft/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899
--- /dev/null
+++ b/llama/output/cp3e5N/ft/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723
diff --git a/llama/output/cpr1/ft/adapter_model.safetensors b/llama/output/cpr1/ft/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..a59518f2e704d2a09d1a26811354d7f1511d0419
--- /dev/null
+++ b/llama/output/cpr1/ft/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:617c715b246fae47190ca1f8e304e9dbdadf6ac70bbfdd0f3bc3c4b1cd783c0d
+size 1049665904
diff --git a/llama/output/cpr1/ft/tokenizer.model b/llama/output/cpr1/ft/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899
--- /dev/null
+++ b/llama/output/cpr1/ft/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723
diff --git a/llama/output/cpr2/ft/adapter_model.safetensors b/llama/output/cpr2/ft/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..07a241d0c9940c7d4288d073c846602a7d25d681
--- /dev/null
+++ b/llama/output/cpr2/ft/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:daede58d9fd4806298d90f9af12ba478c119afab844244f355f35ab3829eb029
+size 1049665904
diff --git a/llama/output/cpr2/ft/tokenizer.model b/llama/output/cpr2/ft/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899
--- /dev/null
+++ b/llama/output/cpr2/ft/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723
diff --git a/nlu/DeBERTa.egg-info/PKG-INFO b/nlu/DeBERTa.egg-info/PKG-INFO
new file mode 100644
index 0000000000000000000000000000000000000000..ae8f91cda55784e426fad70a946ec26bb1cf6bd9
--- /dev/null
+++ b/nlu/DeBERTa.egg-info/PKG-INFO
@@ -0,0 +1,39 @@
+Metadata-Version: 2.1
+Name: DeBERTa
+Version: 0.1.13
+Summary: Decoding enhanced BERT with Disentangled Attention
+Home-page: https://github.com/microsoft/DeBERTa
+Author: penhe
+Author-email: penhe@microsoft.com
+License: MIT
+Keywords: NLP deep learning transformer pytorch Attention BERT RoBERTa DeBERTa
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.6
+Classifier: Programming Language :: Python :: 3.7
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Requires-Python: >=3.6
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: nltk
+Requires-Dist: spacy
+Requires-Dist: numpy
+Requires-Dist: pytest
+Requires-Dist: regex
+Requires-Dist: scipy
+Requires-Dist: scikit-learn
+Requires-Dist: tqdm
+Requires-Dist: ujson
+Requires-Dist: seqeval
+Requires-Dist: psutil
+Requires-Dist: sentencepiece
+Requires-Dist: torch
+Provides-Extra: docs
+Requires-Dist: recommonmark; extra == "docs"
+Requires-Dist: sphinx; extra == "docs"
+Requires-Dist: sphinx-markdown-tables; extra == "docs"
+Requires-Dist: sphinx-rtd-theme; extra == "docs"
+
+deberta long des
diff --git a/nlu/DeBERTa.egg-info/SOURCES.txt b/nlu/DeBERTa.egg-info/SOURCES.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d3dd8df8925fc2b9148b942ad6bea302696b0af3
--- /dev/null
+++ b/nlu/DeBERTa.egg-info/SOURCES.txt
@@ -0,0 +1,73 @@
+LICENSE
+setup.cfg
+setup.py
+DeBERTa/__init__.py
+DeBERTa.egg-info/PKG-INFO
+DeBERTa.egg-info/SOURCES.txt
+DeBERTa.egg-info/dependency_links.txt
+DeBERTa.egg-info/requires.txt
+DeBERTa.egg-info/top_level.txt
+DeBERTa/apps/__init__.py
+DeBERTa/apps/_utils.py
+DeBERTa/apps/run.py
+DeBERTa/apps/models/__init__.py
+DeBERTa/apps/models/masked_language_model.py
+DeBERTa/apps/models/multi_choice.py
+DeBERTa/apps/models/ner.py
+DeBERTa/apps/models/record_qa.py
+DeBERTa/apps/models/replaced_token_detection_model.py
+DeBERTa/apps/models/sequence_classification.py
+DeBERTa/apps/tasks/__init__.py
+DeBERTa/apps/tasks/glue_tasks.py
+DeBERTa/apps/tasks/metrics.py
+DeBERTa/apps/tasks/mlm_task.py
+DeBERTa/apps/tasks/ner_task.py
+DeBERTa/apps/tasks/race_task.py
+DeBERTa/apps/tasks/record_eval.py
+DeBERTa/apps/tasks/rtd_task.py
+DeBERTa/apps/tasks/superglue_tasks.py
+DeBERTa/apps/tasks/task.py
+DeBERTa/apps/tasks/task_registry.py
+DeBERTa/data/__init__.py
+DeBERTa/data/async_data.py
+DeBERTa/data/data_sampler.py
+DeBERTa/data/dataloader.py
+DeBERTa/data/dynamic_dataset.py
+DeBERTa/data/example.py
+DeBERTa/deberta/__init__.py
+DeBERTa/deberta/bert.py
+DeBERTa/deberta/cache_utils.py
+DeBERTa/deberta/config.py
+DeBERTa/deberta/da_utils.py
+DeBERTa/deberta/deberta.py
+DeBERTa/deberta/disentangled_attention.py
+DeBERTa/deberta/gpt2_bpe_utils.py
+DeBERTa/deberta/gpt2_tokenizer.py
+DeBERTa/deberta/mlm.py
+DeBERTa/deberta/nnmodule.py
+DeBERTa/deberta/ops.py
+DeBERTa/deberta/pooling.py
+DeBERTa/deberta/pretrained_models.py
+DeBERTa/deberta/spm_tokenizer.py
+DeBERTa/deberta/tokenizers.py
+DeBERTa/optims/__init__.py
+DeBERTa/optims/args.py
+DeBERTa/optims/fp16_optimizer.py
+DeBERTa/optims/lr_schedulers.py
+DeBERTa/optims/xadam.py
+DeBERTa/sift/__init__.py
+DeBERTa/sift/sift.py
+DeBERTa/training/__init__.py
+DeBERTa/training/_utils.py
+DeBERTa/training/args.py
+DeBERTa/training/dist_launcher.py
+DeBERTa/training/optimizer_utils.py
+DeBERTa/training/trainer.py
+DeBERTa/utils/__init__.py
+DeBERTa/utils/argument_types.py
+DeBERTa/utils/jit_tracing.py
+DeBERTa/utils/logger_util.py
+DeBERTa/utils/xtqdm.py
+adapterlib/__init__.py
+adapterlib/layers.py
+adapterlib/utils.py
\ No newline at end of file
diff --git a/nlu/DeBERTa.egg-info/dependency_links.txt b/nlu/DeBERTa.egg-info/dependency_links.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc
--- /dev/null
+++ b/nlu/DeBERTa.egg-info/dependency_links.txt
@@ -0,0 +1 @@
+
diff --git a/nlu/DeBERTa.egg-info/requires.txt b/nlu/DeBERTa.egg-info/requires.txt
new file mode 100644
index 0000000000000000000000000000000000000000..cf4bbbbe4b81ce8b58b5398037643cbbd437e923
--- /dev/null
+++ b/nlu/DeBERTa.egg-info/requires.txt
@@ -0,0 +1,19 @@
+nltk
+spacy
+numpy
+pytest
+regex
+scipy
+scikit-learn
+tqdm
+ujson
+seqeval
+psutil
+sentencepiece
+torch
+
+[docs]
+recommonmark
+sphinx
+sphinx-markdown-tables
+sphinx-rtd-theme
diff --git a/nlu/DeBERTa.egg-info/top_level.txt b/nlu/DeBERTa.egg-info/top_level.txt
new file mode 100644
index 0000000000000000000000000000000000000000..ae1fd650b677ed356341dc53b488086cec2128b2
--- /dev/null
+++ b/nlu/DeBERTa.egg-info/top_level.txt
@@ -0,0 +1,2 @@
+DeBERTa
+adapterlib
diff --git a/nlu/DeBERTa/apps/tasks/task_registry.py b/nlu/DeBERTa/apps/tasks/task_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..6b251bd4481ff9900902641ddb5975730acfd8d8
--- /dev/null
+++ b/nlu/DeBERTa/apps/tasks/task_registry.py
@@ -0,0 +1,70 @@
+# Copyright (c) Microsoft, Inc. 2020
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+#
+# Author: penhe@microsoft.com
+# Date: 01/25/2019
+#
+
+from glob import glob
+import os
+import importlib
+import pdb
+import sys
+from ...utils import get_logger
+from .task import Task
+
+__all__ = ['load_tasks', 'register_task', 'get_task']
+tasks={}
+
+logger=get_logger()
+
+def register_task(name=None, desc=None):
+  def register_task_x(cls):
+    _name = name
+    if _name is None:
+      _name = cls.__name__
+
+    _desc = desc
+    if _desc is None:
+      _desc = _name
+
+    _name = _name.lower()
+    if _name in tasks:
+      logger.warning(f'{_name} already registered in the registry: {tasks[_name]}.')
+    assert issubclass(cls, Task), 'Registered class must be a subclass of Task.'
+    tasks[_name] = cls
+    cls._meta = {
+        'name': _name,
+        'desc': _desc}
+    return cls
+
+  if type(name)==type:
+    cls = name
+    name = None
+    return register_task_x(cls)
+  return register_task_x
+
+def load_tasks(task_dir = None):
+  script_dir = os.path.dirname(os.path.abspath(__file__))
+  sys_tasks = glob(os.path.join(script_dir, "*.py"))
+  for t in sys_tasks:
+    m = os.path.splitext(os.path.basename(t))[0]
+    if not m.startswith('_'):
+      importlib.import_module(f'DeBERTa.apps.tasks.{m}')
+
+  if task_dir:
+    assert os.path.exists(task_dir), f"{task_dir} must be a valid directory."
+    custom_tasks = glob(os.path.join(task_dir, "*.py"))
+    sys.path.append(task_dir)
+    for t in custom_tasks:
+      m = os.path.splitext(os.path.basename(t))[0]
+      if not m.startswith('_'):
+        importlib.import_module(f'{m}')
+
+def get_task(name=None):
+  if name is None:
+    return tasks
+
+  return tasks[name.lower()]
diff --git a/nlu/DeBERTa/data/__init__.py b/nlu/DeBERTa/data/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1783518c5fe2f7f2c8376d6cce41b5a6a44e47fa
--- /dev/null
+++ b/nlu/DeBERTa/data/__init__.py
@@ -0,0 +1,5 @@
+from .example import ExampleInstance,ExampleSet,example_to_feature
+from .dataloader import SequentialDataLoader
+from .dynamic_dataset import *
+from .data_sampler import *
+from .async_data import *
diff --git a/nlu/DeBERTa/data/async_data.py b/nlu/DeBERTa/data/async_data.py
new file mode 100644
index 0000000000000000000000000000000000000000..9cbf986a0a4bee4d45209b73a00055e94d6b2f06
--- /dev/null
+++ b/nlu/DeBERTa/data/async_data.py
@@ -0,0 +1,38 @@
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+#
+# Author: Pengcheng He (penhe@microsoft.com)
+# Date: 05/15/2019
+#
+
+from queue import Queue,Empty
+from threading import Thread
+class AsyncDataLoader(object):
+  def __init__(self, dataloader, buffer_size=100):
+    self.buffer_size = buffer_size
+    self.dataloader = dataloader
+
+  def __iter__(self):
+    queue = Queue(self.buffer_size)
+    dl=iter(self.dataloader)
+    def _worker():
+      while True:
+        try:
+          queue.put(next(dl))
+        except StopIteration:
+          break
+      queue.put(None)
+    t=Thread(target=_worker)
+    t.start()
+    while True:
+      d = queue.get()
+      if d is None:
+        break
+      yield d
+    del t
+    del queue
+
+  def __len__(self):
+    return len(self.dataloader)
+
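`AsyncDataLoader` is a thin prefetcher: a background thread drains the wrapped loader into a bounded queue, and iteration stops once the `None` sentinel is dequeued. A minimal usage sketch, assuming `train_loader` is any iterable loader and `train_step` is a hypothetical stand-in for the training step:

    from DeBERTa.data import AsyncDataLoader
    loader = AsyncDataLoader(train_loader, buffer_size=100)  # prefetch up to 100 batches ahead
    for batch in loader:
      train_step(batch)  # batches arrive in their original order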
diff --git a/nlu/DeBERTa/data/data_sampler.py b/nlu/DeBERTa/data/data_sampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..1aec3c2b4298503556aef1b8d4f0b2abb934f5fa
--- /dev/null
+++ b/nlu/DeBERTa/data/data_sampler.py
@@ -0,0 +1,76 @@
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+#
+# Author: Pengcheng He (penhe@microsoft.com)
+# Date: 05/15/2019
+#
+
+import os
+import numpy as np
+import math
+import sys
+from torch.utils.data import Sampler
+
+__all__=['BatchSampler', 'DistributedBatchSampler', 'RandomSampler', 'SequentialSampler']
+class BatchSampler(Sampler):
+  def __init__(self, sampler, batch_size):
+    self.sampler = sampler
+    self.batch_size = batch_size
+
+  def __iter__(self):
+    batch = []
+    for idx in self.sampler:
+      batch.append(idx)
+      if len(batch)==self.batch_size:
+        yield batch
+        batch = []
+    if len(batch)>0:
+      yield batch
+
+  def __len__(self):
+    return (len(self.sampler) + self.batch_size - 1)//self.batch_size
+
+class DistributedBatchSampler(Sampler):
+  def __init__(self, sampler, rank=0, world_size = 1, drop_last = False):
+    self.sampler = sampler
+    self.rank = rank
+    self.world_size = world_size
+    self.drop_last = drop_last
+
+  def __iter__(self):
+    for b in self.sampler:
+      if len(b)%self.world_size != 0:
+        if self.drop_last:
+          break
+        else:
+          b.extend([b[0] for _ in range(self.world_size-len(b)%self.world_size)])
+      chunk_size = len(b)//self.world_size
+      yield b[self.rank*chunk_size:(self.rank+1)*chunk_size]
+
+  def __len__(self):
+    return len(self.sampler)
+
+class RandomSampler(Sampler):
+  def __init__(self, total_samples:int, data_seed:int = 0):
+    self.indices = np.array(np.arange(total_samples))
+    self.rng = np.random.RandomState(data_seed)
+
+  def __iter__(self):
+    self.rng.shuffle(self.indices)
+    for i in self.indices:
+      yield i
+
+  def __len__(self):
+    return len(self.indices)
+
+class SequentialSampler(Sampler):
+  def __init__(self, total_samples:int):
+    self.indices = np.array(np.arange(total_samples))
+
+  def __iter__(self):
+    for i in self.indices:
+      yield i
+
+  def __len__(self):
+    return len(self.indices)
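These samplers are designed to compose: `RandomSampler`/`SequentialSampler` yield example indices, `BatchSampler` groups them into lists, and `DistributedBatchSampler` slices each batch across ranks, padding with the batch's first index when the batch does not divide evenly. A sketch of the intended wiring, where `n_samples`, `rank`, and `world_size` are assumed to come from the training setup:

    from DeBERTa.data import RandomSampler, BatchSampler, DistributedBatchSampler
    batches = BatchSampler(RandomSampler(n_samples, data_seed=0), batch_size=32)
    sharded = DistributedBatchSampler(batches, rank=rank, world_size=world_size)
    for batch_indices in sharded:  # each rank receives 32 // world_size indices per step
      ...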
diff --git a/nlu/DeBERTa/data/dataloader.py b/nlu/DeBERTa/data/dataloader.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd4e63cb134c70937a9dee131aebc07496da5f0b
--- /dev/null
+++ b/nlu/DeBERTa/data/dataloader.py
@@ -0,0 +1,511 @@
+import random
+import torch
+import torch.multiprocessing as multiprocessing
+from torch._C import _set_worker_signal_handlers, \
+  _remove_worker_pids, _error_if_any_worker_fails
+
+from packaging import version
+
+if version.Version(torch.__version__) >= version.Version('1.0.0'):
+  from torch._C import _set_worker_pids
+else:
+  from torch._C import _update_worker_pids as _set_worker_pids
+
+from torch.utils.data import SequentialSampler, RandomSampler, BatchSampler, Sampler
+import signal
+import functools
+import collections.abc
+import re
+import sys
+import threading
+import traceback
+import os
+import time
+# from torch._six import string_classes
+string_classes = str
+
+IS_WINDOWS = sys.platform == "win32"
+if IS_WINDOWS:
+  import ctypes
+  from ctypes.wintypes import DWORD, BOOL, HANDLE
+
+if sys.version_info[0] == 2:
+  import Queue as queue
+else:
+  import queue
+
+__all__ = ['SequentialDataLoader']
+
+class ExceptionWrapper(object):
+  r"""Wraps an exception plus traceback to communicate across threads"""
+
+  def __init__(self, exc_info):
+    self.exc_type = exc_info[0]
+    self.exc_msg = "".join(traceback.format_exception(*exc_info))
+
+
+_use_shared_memory = False
+r"""Whether to use shared memory in default_collate"""
+
+MANAGER_STATUS_CHECK_INTERVAL = 5.0
+
+if IS_WINDOWS:
+  # On Windows, the parent ID of the worker process remains unchanged when the manager process
+  # is gone, and the only way to check it through the OS is to let the worker have a process
+  # handle of the manager and ask if the process status has changed.
+  class ManagerWatchdog(object):
+    def __init__(self):
+      self.manager_pid = os.getppid()
+
+      self.kernel32 = ctypes.WinDLL('kernel32', use_last_error=True)
+      self.kernel32.OpenProcess.argtypes = (DWORD, BOOL, DWORD)
+      self.kernel32.OpenProcess.restype = HANDLE
+      self.kernel32.WaitForSingleObject.argtypes = (HANDLE, DWORD)
+      self.kernel32.WaitForSingleObject.restype = DWORD
+
+      # Value obtained from https://msdn.microsoft.com/en-us/library/ms684880.aspx
+      SYNCHRONIZE = 0x00100000
+      self.manager_handle = self.kernel32.OpenProcess(SYNCHRONIZE, 0, self.manager_pid)
+
+      if not self.manager_handle:
+        raise ctypes.WinError(ctypes.get_last_error())
+
+    def is_alive(self):
+      # Value obtained from https://msdn.microsoft.com/en-us/library/windows/desktop/ms687032.aspx
+      return self.kernel32.WaitForSingleObject(self.manager_handle, 0) != 0
+else:
+  class ManagerWatchdog(object):
+    def __init__(self):
+      self.manager_pid = os.getppid()
+
+    def is_alive(self):
+      return os.getppid() == self.manager_pid
+
+
+def _worker_loop(dataset, index_queue, data_queue, collate_fn, init_fn, worker_id):
+  global _use_shared_memory
+  _use_shared_memory = True
+
+  # Initialize C side signal handlers for SIGBUS and SIGSEGV. Python signal
+  # module's handlers are executed after Python returns from C low-level
+  # handlers, likely when the same fatal signal happened again already.
+  # https://docs.python.org/3/library/signal.html Sec. 18.8.1.1
+  _set_worker_signal_handlers()
+
+  torch.set_num_threads(1)
+
+  if init_fn is not None:
+    init_fn(worker_id)
+
+  watchdog = ManagerWatchdog()
+
+  while True:
+    try:
+      r = index_queue.get(timeout=MANAGER_STATUS_CHECK_INTERVAL)
+    except queue.Empty:
+      if watchdog.is_alive():
+        continue
+      else:
+        break
+    if r is None:
+      break
+    idx, batch_indices = r
+    try:
+      samples = collate_fn([dataset[i] for i in batch_indices])
+    except Exception:
+      data_queue.put((idx, ExceptionWrapper(sys.exc_info())))
+    else:
+      data_queue.put((idx, samples))
+      del samples
+
+
+def _worker_manager_loop(in_queue, out_queue, done_event, pin_memory, device_id):
+  if pin_memory:
+    torch.cuda.set_device(device_id)
+
+  while True:
+    try:
+      r = in_queue.get()
+    except Exception:
+      if done_event.is_set():
+        return
+      raise
+    if r is None:
+      break
+    if isinstance(r[1], ExceptionWrapper):
+      out_queue.put(r)
+      continue
+    idx, batch = r
+    try:
+      if pin_memory:
+        batch = pin_memory_batch(batch)
+    except Exception:
+      out_queue.put((idx, ExceptionWrapper(sys.exc_info())))
+    else:
+      out_queue.put((idx, batch))
+
+numpy_type_map = {
+  'float64': torch.DoubleTensor,
+  'float32': torch.FloatTensor,
+  'float16': torch.HalfTensor,
+  'int64': torch.LongTensor,
+  'int32': torch.IntTensor,
+  'int16': torch.ShortTensor,
+  'int8': torch.CharTensor,
+  'uint8': torch.ByteTensor,
+}
+
+
+def default_collate(batch):
+  r"""Puts each data field into a tensor with outer dimension batch size"""
+
+  error_msg = "batch must contain tensors, numbers, dicts or lists; found {}"
+  elem_type = type(batch[0])
+  if isinstance(batch[0], torch.Tensor):
+    out = None
+    if _use_shared_memory:
+      # If we're in a background process, concatenate directly into a
+      # shared memory tensor to avoid an extra copy
+      numel = sum([x.numel() for x in batch])
+      storage = batch[0].storage()._new_shared(numel)
+      out = batch[0].new(storage)
+    return torch.stack(batch, 0, out=out)
+  elif elem_type.__module__ == 'numpy' and elem_type.__name__ != 'str_' \
+      and elem_type.__name__ != 'string_':
+    elem = batch[0]
+    if elem_type.__name__ == 'ndarray':
+      # array of string classes and object
+      if re.search('[SaUO]', elem.dtype.str) is not None:
+        raise TypeError(error_msg.format(elem.dtype))
+
+      return torch.stack([torch.from_numpy(b) for b in batch], 0)
+    if elem.shape == ():  # scalars
+      py_type = float if elem.dtype.name.startswith('float') else int
+      return numpy_type_map[elem.dtype.name](list(map(py_type, batch)))
+  elif isinstance(batch[0], int):
+    return torch.LongTensor(batch)
+  elif isinstance(batch[0], float):
+    return torch.DoubleTensor(batch)
+  elif isinstance(batch[0], string_classes):
+    return batch
+  elif isinstance(batch[0], collections.abc.Mapping):
+    return {key: default_collate([d[key] for d in batch]) for key in batch[0]}
+  elif isinstance(batch[0], collections.abc.Sequence):
+    transposed = zip(*batch)
+    return [default_collate(samples) for samples in transposed]
+
+  raise TypeError((error_msg.format(type(batch[0]))))
+
+
+def pin_memory_batch(batch):
+  if isinstance(batch, torch.Tensor):
+    return batch.pin_memory()
+  elif isinstance(batch, string_classes):
+    return batch
+  elif isinstance(batch, collections.abc.Mapping):
+    return {k: pin_memory_batch(sample) for k, sample in batch.items()}
+  elif isinstance(batch, collections.abc.Sequence):
+    return [pin_memory_batch(sample) for sample in batch]
+  else:
+    return batch
+
+
+_SIGCHLD_handler_set = False
+r"""Whether SIGCHLD handler is set for DataLoader worker failures. Only one
+handler needs to be set for all DataLoaders in a process."""
+
+
+def _set_SIGCHLD_handler():
+  # Windows doesn't support SIGCHLD handler
+  if sys.platform == 'win32':
+    return
+  # can't set signal in child threads
+  if not isinstance(threading.current_thread(), threading._MainThread):
+    return
+  global _SIGCHLD_handler_set
+  if _SIGCHLD_handler_set:
+    return
+  previous_handler = signal.getsignal(signal.SIGCHLD)
+  if not callable(previous_handler):
+    previous_handler = None
+
+  def handler(signum, frame):
+    # The following call uses `waitid` with WNOHANG from the C side. Therefore,
+    # Python can still get and update the process status successfully.
+    _error_if_any_worker_fails()
+    if previous_handler is not None:
+      previous_handler(signum, frame)
+
+  signal.signal(signal.SIGCHLD, handler)
+  _SIGCHLD_handler_set = True
+
+
+class _SequentialDataLoaderIter(object):
+  r"""Iterates once over the DataLoader's dataset, as specified by the sampler"""
+
+  def __init__(self, loader):
+    self.dataset = loader.dataset
+    self.collate_fn = loader.collate_fn
+    self.batch_sampler = loader.batch_sampler
+    self.num_workers = loader.num_workers
+    self.pin_memory = loader.pin_memory and torch.cuda.is_available()
+    self.timeout = loader.timeout
+    self.done_event = threading.Event()
+
+    self.sample_iter = iter(self.batch_sampler)
+
+    if self.num_workers > 0:
+      self.worker_init_fn = loader.worker_init_fn
+      self.index_queues = [multiprocessing.Queue() for _ in range(self.num_workers)]
+      self.worker_queue_idx = 0
+      self.worker_result_queue = multiprocessing.SimpleQueue()
+      self.batches_outstanding = 0
+      self.worker_pids_set = False
+      self.shutdown = False
+      self.send_idx = 0
+      self.rcvd_idx = 0
+      self.reorder_dict = {}
+
+      self.workers = [
+        multiprocessing.Process(
+          target=_worker_loop,
+          args=(self.dataset, self.index_queues[i],
+              self.worker_result_queue, self.collate_fn, self.worker_init_fn, i))
+        for i in range(self.num_workers)]
+
+      if self.pin_memory or self.timeout > 0:
+        self.data_queue = queue.Queue()
+        if self.pin_memory:
+          maybe_device_id = torch.cuda.current_device()
+        else:
+          # do not initialize cuda context if not necessary
+          maybe_device_id = None
+        self.worker_manager_thread = threading.Thread(
+          target=_worker_manager_loop,
+          args=(self.worker_result_queue, self.data_queue, self.done_event, self.pin_memory,
+              maybe_device_id))
+        self.worker_manager_thread.daemon = True
+        self.worker_manager_thread.start()
+      else:
+        self.data_queue = self.worker_result_queue
+
+      for w in self.workers:
+        w.daemon = True  # ensure that the worker exits on process exit
+        w.start()
+
+      _set_worker_pids(id(self), tuple(w.pid for w in self.workers))
+      _set_SIGCHLD_handler()
+      self.worker_pids_set = True
+
+      # prime the prefetch loop
+      for _ in range(2 * self.num_workers):
+        self._put_indices()
+
+  def __len__(self):
+    return len(self.batch_sampler)
+
+  def _get_batch(self):
+    if self.timeout > 0:
+      try:
+        return self.data_queue.get(timeout=self.timeout)
+      except queue.Empty:
+        raise RuntimeError('DataLoader timed out after {} seconds'.format(self.timeout))
+    else:
+      return self.data_queue.get()
+
+  def __next__(self):
+    if self.num_workers == 0:  # same-process loading
+      indices = next(self.sample_iter)  # may raise StopIteration
+      batch = self.collate_fn([self.dataset[i] for i in indices])
+      if self.pin_memory:
+        batch = pin_memory_batch(batch)
+      return batch
+
+    # check if the next sample has already been generated
+    if self.rcvd_idx in self.reorder_dict:
+      batch = self.reorder_dict.pop(self.rcvd_idx)
+      return self._process_next_batch(batch)
+
+    if self.batches_outstanding == 0:
+      self._shutdown_workers()
+      raise StopIteration
+
+    while True:
+      assert (not self.shutdown and self.batches_outstanding > 0)
+      idx, batch = self._get_batch()
+      self.batches_outstanding -= 1
+      if idx != self.rcvd_idx:
+        # store out-of-order samples
+        self.reorder_dict[idx] = batch
+        continue
+      return self._process_next_batch(batch)
+
+  next = __next__  # Python 2 compatibility
+
+  def __iter__(self):
+    return self
+
+  def _put_indices(self):
+    assert self.batches_outstanding < 2 * self.num_workers
+    indices = next(self.sample_iter, None)
+    if indices is None:
+      return
+    self.index_queues[self.worker_queue_idx].put((self.send_idx, indices))
+    self.worker_queue_idx = (self.worker_queue_idx + 1) % self.num_workers
+    self.batches_outstanding += 1
+    self.send_idx += 1
+
+  def _process_next_batch(self, batch):
+    self.rcvd_idx += 1
+    self._put_indices()
+    if isinstance(batch, ExceptionWrapper):
+      raise batch.exc_type(batch.exc_msg)
+    return batch
+
+  def __getstate__(self):
+    # TODO: add limited pickling support for sharing an iterator
+    # across multiple threads for HOGWILD.
+    # Probably the best way to do this is by moving the sample pushing
+    # to a separate thread and then just sharing the data queue
+    # but signalling the end is tricky without a non-blocking API
+    raise NotImplementedError("_SequentialDataLoaderIter cannot be pickled")
+
+  def _shutdown_workers(self):
+    try:
+      if not self.shutdown:
+        self.shutdown = True
+        self.done_event.set()
+        for q in self.index_queues:
+          q.put(None)
+        # if some workers are waiting to put, make room for them
+        try:
+          while not self.worker_result_queue.empty():
+            self.worker_result_queue.get()
+        except (FileNotFoundError, ImportError):
+          # Many weird errors can happen here due to Python
+          # shutting down. These are more like obscure Python bugs.
+          # FileNotFoundError can happen when we rebuild the fd
+          # fetched from the queue but the socket is already closed
+          # from the worker side.
+          # ImportError can happen when the unpickler loads the
+          # resource from `get`.
+          pass
+        # done_event should be sufficient to exit worker_manager_thread,
+        # but be safe here and put another None
+        self.worker_result_queue.put(None)
+    finally:
+      # removes pids no matter what
+      if self.worker_pids_set:
+        _remove_worker_pids(id(self))
+        self.worker_pids_set = False
+
+  def __del__(self):
+    if self.num_workers > 0:
+      self._shutdown_workers()
+
+
+class SequentialDataLoader(object):
+  r"""
+  Sequential data loader. Combines a dataset and a sampler, and provides
+  single- or multi-process iterators over the dataset.
+  This is modified from PyTorch's DataLoader: sequential data loading should not
+  perturb any random state, so this loader never touches the global RNGs.
+  Arguments:
+    dataset (Dataset): dataset from which to load the data.
+    batch_size (int, optional): how many samples per batch to load
+      (default: 1).
+    shuffle (bool, optional): set to ``True`` to have the data reshuffled
+      at every epoch (default: False).
+    sampler (Sampler, optional): defines the strategy to draw samples from
+      the dataset. If specified, ``shuffle`` must be False.
+    batch_sampler (Sampler, optional): like sampler, but returns a batch of
+      indices at a time. Mutually exclusive with batch_size, shuffle,
+      sampler, and drop_last.
+    num_workers (int, optional): how many subprocesses to use for data
+      loading. 0 means that the data will be loaded in the main process.
+      (default: 0)
+    collate_fn (callable, optional): merges a list of samples to form a mini-batch.
+    pin_memory (bool, optional): If ``True``, the data loader will copy tensors
+      into CUDA pinned memory before returning them.
+    drop_last (bool, optional): set to ``True`` to drop the last incomplete batch,
+      if the dataset size is not divisible by the batch size. If ``False`` and
+      the size of dataset is not divisible by the batch size, then the last batch
+      will be smaller. (default: False)
+    timeout (numeric, optional): if positive, the timeout value for collecting a batch
+      from workers. Should always be non-negative. (default: 0)
+    worker_init_fn (callable, optional): If not None, this will be called on each
+      worker subprocess with the worker id (an int in ``[0, num_workers - 1]``) as
+      input, after seeding and before data loading. (default: None)
+
+  .. note:: By default, each worker will have its PyTorch seed set to
+        ``base_seed + worker_id``, where ``base_seed`` is a long generated
+        by the main process using its RNG. However, seeds for other libraries
+        may be duplicated upon initializing workers (e.g., NumPy), causing
+        each worker to return identical random numbers. (See
+        :ref:`dataloader-workers-random-seed` section in FAQ.) You may
+        use ``torch.initial_seed()`` to access the PyTorch seed for each
+        worker in :attr:`worker_init_fn`, and use it to set other seeds
+        before data loading.
+
+  .. warning:: If the ``spawn`` start method is used, :attr:`worker_init_fn` cannot be an
+        unpicklable object, e.g., a lambda function.
+  """
+
+  __initialized = False
+
+  def __init__(self, dataset, batch_size=1, shuffle=False, sampler=None, batch_sampler=None,
+         num_workers=0, collate_fn=default_collate, pin_memory=False, drop_last=False,
+         timeout=0, worker_init_fn=None):
+    self.dataset = dataset
+    self.batch_size = batch_size
+    self.num_workers = num_workers
+    self.collate_fn = collate_fn
+    self.pin_memory = pin_memory
+    self.drop_last = drop_last
+    self.timeout = timeout
+    self.worker_init_fn = worker_init_fn
+
+    if timeout < 0:
+      raise ValueError('timeout option should be non-negative')
+
+    if batch_sampler is not None:
+      if batch_size > 1 or shuffle or sampler is not None or drop_last:
+        raise ValueError('batch_sampler option is mutually exclusive '
+                 'with batch_size, shuffle, sampler, and '
+                 'drop_last')
+      self.batch_size = None
+      self.drop_last = None
+
+    if sampler is not None and shuffle:
+      raise ValueError('sampler option is mutually exclusive with '
+               'shuffle')
+
+    if self.num_workers < 0:
+      raise ValueError('num_workers option cannot be negative; '
+               'use num_workers=0 to disable multiprocessing.')
+
+    if batch_sampler is None:
+      if sampler is None:
+        if shuffle:
+          sampler = RandomSampler(dataset)
+        else:
+          sampler = SequentialSampler(dataset)
+      batch_sampler = BatchSampler(sampler, batch_size, drop_last)
+
+    self.sampler = sampler
+    self.batch_sampler = batch_sampler
+    self.__initialized = True
+
+  def __setattr__(self, attr, val):
+    if self.__initialized and attr in ('batch_size', 'sampler', 'drop_last'):
+      raise ValueError('{} attribute should not be set after {} is '
+               'initialized'.format(attr, self.__class__.__name__))
+
+    super(SequentialDataLoader, self).__setattr__(attr, val)
+
+  def __iter__(self):
+    return _SequentialDataLoaderIter(self)
+
+  def __len__(self):
+    return len(self.batch_sampler)
+
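`SequentialDataLoader` keeps the pre-1.0 PyTorch worker protocol (per-worker index queues, an optional pin-memory thread, ordered reassembly through `reorder_dict`) while, per its docstring, never touching global random state, so evaluation passes cannot perturb training randomness. A minimal usage sketch, assuming `ds` is a map-style dataset whose items `default_collate` understands:

    from DeBERTa.data import SequentialDataLoader, SequentialSampler, BatchSampler
    batch_sampler = BatchSampler(SequentialSampler(len(ds)), batch_size=16)
    loader = SequentialDataLoader(ds, batch_sampler=batch_sampler, num_workers=2, pin_memory=True)
    for batch in loader:
      ...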
diff --git a/nlu/DeBERTa/data/dynamic_dataset.py b/nlu/DeBERTa/data/dynamic_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..4d269a37b54dc905ce047fa5bdc74ed835217434
--- /dev/null
+++ b/nlu/DeBERTa/data/dynamic_dataset.py
@@ -0,0 +1,60 @@
+# Copyright (c) Microsoft, Inc. 2020
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+#
+# Author: penhe@microsoft.com
+# Date: 05/15/2019
+#
+
+import pdb
+from torch.utils.data import Dataset
+import random
+import mmap
+import numpy as np
+from bisect import bisect
+from ..utils import get_logger
+logger=get_logger()
+
+__all__ = ['DynamicDataset']
+
+class DynamicDataset(Dataset):
+  def __init__(self, corpus, feature_fn, dataset_size=None, shuffle=False, **kwargs):
+    self.corpus = corpus
+    self.ds_len = len(self.corpus)
+    logger.info(f'Total corpus examples: {self.ds_len}')
+    self.feature_fn = feature_fn
+
+    if not dataset_size:
+      self.dataset_size = self.ds_len
+    else:
+      self.dataset_size = int(dataset_size)
+
+    self.shuffle = shuffle
+    index_buf = mmap.mmap(-1, self.dataset_size*8)
+    shuffle_idx = np.ndarray(shape=(self.dataset_size,), buffer=index_buf, dtype=int)
+    shuffle_idx[:] = np.arange(self.dataset_size)[:]
+    if self.shuffle:
+      #rng = np.random.RandomState(0)
+      rng = random.Random(0)
+      rng.shuffle(shuffle_idx)
+    self.shuffle_idx = shuffle_idx
+    self.index_offset = 0
+    if 'index_offset' in kwargs:
+      self.index_offset = kwargs['index_offset']
+
+  def __len__(self):
+    return self.dataset_size
+
+  def __getitem__(self, idx):
+    if isinstance(idx, tuple) or isinstance(idx, list):
+      idx, ext_params = idx
+    else:
+      ext_params = None
+    idx += self.index_offset
+    seed = idx
+    rng = random.Random(seed)
+    # get seq length
+    example_idx = self.shuffle_idx[idx%self.dataset_size]%self.ds_len
+    example = self.corpus[example_idx, rng, ext_params]
+    return self.feature_fn(example, rng, ext_params = ext_params)
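`DynamicDataset` defers featurization to access time: `__getitem__` seeds a `random.Random` with the (offset) index, maps it through the shared-memory `shuffle_idx`, and hands the example to `feature_fn`, so epochs are reproducible without materializing features up front. A sketch of the contract: the corpus must support `corpus[idx, rng, ext_params]` tuple indexing, as `ExampleSet` below does, and `to_features` here is a hypothetical feature function:

    from DeBERTa.data import DynamicDataset
    ds = DynamicDataset(corpus, feature_fn=to_features, dataset_size=100000, shuffle=True)
    features = ds[0]  # == to_features(example, random.Random(0), ext_params=None)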
diff --git a/nlu/DeBERTa/data/example.py b/nlu/DeBERTa/data/example.py
new file mode 100644
index 0000000000000000000000000000000000000000..1da9d19b1dd1d4c64f2ab28e8a690ac4d6ca4780
--- /dev/null
+++ b/nlu/DeBERTa/data/example.py
@@ -0,0 +1,105 @@
+import random
+import torch
+import os
+from collections import OrderedDict
+import numpy as np
+import tempfile
+import mmap
+import pickle
+import signal
+import sys
+import pdb
+
+from ..utils import xtqdm as tqdm
+
+__all__=['ExampleInstance', 'example_to_feature', 'ExampleSet']
+
+class ExampleInstance:
+  def __init__(self, segments, label=None, **kwv):
+    self.segments = segments
+    self.label = label
+    self.__dict__.update(kwv)
+
+  def __repr__(self):
+    return f'segments: {self.segments}\nlabel: {self.label}'
+
+  def __getitem__(self, i):
+    return self.segments[i]
+
+  def __len__(self):
+    return len(self.segments)
+
+class ExampleSet:
+  def __init__(self, pairs):
+    self._data = np.array([pickle.dumps(p) for p in pairs])
+    self.total = len(self._data)
+
+  def __getitem__(self, idx):
+    """
+    return pair
+    """
+    if isinstance(idx, tuple):
+      idx, rng, ext_params = idx
+    else:
+      rng, ext_params = None, None
+    content = self._data[idx]
+    example = pickle.loads(content)
+    return example
+
+  def __len__(self):
+    return self.total
+
+  def __iter__(self):
+    for i in range(self.total):
+      yield self[i]
+
+def _truncate_segments(segments, max_num_tokens, rng):
+  """
+  Truncate sequence pair according to original BERT implementation:
+  https://github.com/google-research/bert/blob/master/create_pretraining_data.py#L391
+  """
+  while True:
+    if sum(len(s) for s in segments)<=max_num_tokens:
+      break
+
+    segments = sorted(segments, key=lambda s:len(s), reverse=True)
+    trunc_tokens = segments[0]
+
+    assert len(trunc_tokens) >= 1
+
+    if rng.random() < 0.5:
+      trunc_tokens.pop(0)
+    else:
+      trunc_tokens.pop()
+  return segments
+
+def example_to_feature(tokenizer, example, max_seq_len=512, rng=None, mask_generator=None, ext_params=None, label_type='int', **kwargs):
+  if not rng:
+    rng = random
+  max_num_tokens = max_seq_len - len(example.segments) - 1
+  segments = _truncate_segments([tokenizer.tokenize(s) for s in example.segments], max_num_tokens, rng)
+  tokens = ['[CLS]']
+  type_ids = [0]
+  for i,s in enumerate(segments):
+    tokens.extend(s)
+    tokens.append('[SEP]')
+    type_ids.extend([i]*(len(s)+1))
+  if mask_generator:
+    tokens, lm_labels = mask_generator.mask_tokens(tokens, rng)
+  token_ids = tokenizer.convert_tokens_to_ids(tokens)
+  pos_ids = list(range(len(token_ids)))
+  input_mask = [1]*len(token_ids)
+  features = OrderedDict(input_ids = token_ids,
+      type_ids = type_ids,
+      position_ids = pos_ids,
+      input_mask = input_mask)
+  if mask_generator:
+    features['lm_labels'] = lm_labels
+  padding_size = max(0, max_seq_len - len(token_ids))
+  for f in features:
+    features[f].extend([0]*padding_size)
+    features[f] = torch.tensor(features[f], dtype=torch.int)
+  label_type = torch.int if label_type=='int' else torch.float
+  if example.label is not None:
+    features['labels'] = torch.tensor(example.label, dtype=label_type)
+  return features
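`example_to_feature` builds the standard `[CLS] seg0 [SEP] seg1 [SEP] ...` packing, truncating the longest segment first, then zero-pads every field to `max_seq_len` and returns integer tensors (`input_ids`, `type_ids`, `position_ids`, `input_mask`, plus `labels`/`lm_labels` when applicable). A usage sketch, assuming a loaded `tokenizer` exposing `tokenize`/`convert_tokens_to_ids`:

    import random
    example = ExampleInstance(segments=['first sentence', 'second sentence'], label=1)
    features = example_to_feature(tokenizer, example, max_seq_len=128, rng=random.Random(0))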
diff --git a/nlu/DeBERTa/deberta/__init__.py b/nlu/DeBERTa/deberta/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5b968269bc3a6c05c57c9a6ab09c70918ba496f9
--- /dev/null
+++ b/nlu/DeBERTa/deberta/__init__.py
@@ -0,0 +1,22 @@
+#
+# Author: penhe@microsoft.com
+# Date: 04/25/2019
+#
+
+""" Components for NN
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from .tokenizers import *
+from .pooling import *
+from .mlm import MLMPredictionHead
+from .nnmodule import NNModule
+from .deberta import *
+from .disentangled_attention import *
+from .ops import *
+from .bert import *
+from .config import *
+from .cache_utils import *
diff --git a/nlu/DeBERTa/deberta/bert.py b/nlu/DeBERTa/deberta/bert.py
new file mode 100644
index 0000000000000000000000000000000000000000..249fb0141c36b7c7d269c2f74f2ab2d68c7f4e2c
--- /dev/null
+++ b/nlu/DeBERTa/deberta/bert.py
@@ -0,0 +1,308 @@
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) Microsoft, Inc. 2020
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+# This piece of code is modified based on https://github.com/huggingface/transformers
+
+import copy
+import torch
+from torch import nn
+from collections.abc import Sequence
+from packaging import version
+import numpy as np
+import math
+import os
+import pdb
+
+import json
+from .ops import *
+from .disentangled_attention import *
+from .da_utils import *
+
+from adapterlib import adapter_dict
+
+__all__ = ['BertEncoder', 'BertEmbeddings', 'ACT2FN', 'LayerNorm', 'BertLMPredictionHead']
+
+class BertSelfOutput(nn.Module):
+  def __init__(self, config):
+    super().__init__()
+    # self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+    if config.inject_adapter != 'linear':
+      self.dense = adapter_dict[config.inject_adapter](config.hidden_size, config.hidden_size, config=config)
+    else:
+      self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+
+    self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps)
+    self.dropout = StableDropout(config.hidden_dropout_prob)
+    self.config = config
+
+  def forward(self, hidden_states, input_states, mask=None):
+    hidden_states = self.dense(hidden_states)
+    hidden_states = self.dropout(hidden_states)
+    hidden_states += input_states
+    hidden_states = MaskedLayerNorm(self.LayerNorm, hidden_states)
+    return hidden_states
+
+class BertAttention(nn.Module):
+  def __init__(self, config):
+    super().__init__()
+    self.self = DisentangledSelfAttention(config)
+    self.output = BertSelfOutput(config)
+    self.config = config
+
+  def forward(self, hidden_states, attention_mask, return_att=False, query_states=None, relative_pos=None, rel_embeddings=None):
+    output = self.self(hidden_states, attention_mask, return_att, query_states=query_states, relative_pos=relative_pos, rel_embeddings=rel_embeddings)
+    self_output, att_matrix, att_logits_ = output['hidden_states'], output['attention_probs'], output['attention_logits']
+    if query_states is None:
+      query_states = hidden_states
+    attention_output = self.output(self_output, query_states, attention_mask)
+
+    if return_att:
+      return (attention_output, att_matrix)
+    else:
+      return attention_output
+
+class BertIntermediate(nn.Module):
+  def __init__(self, config):
+    super().__init__()
+    # self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
+    if config.inject_adapter != 'linear':
+      self.dense = adapter_dict[config.inject_adapter](config.hidden_size, config.intermediate_size, config=config)
+    else:
+      self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
+
+    self.intermediate_act_fn = ACT2FN[config.hidden_act] \
+      if isinstance(config.hidden_act, str) else config.hidden_act
+
+  def forward(self, hidden_states):
+    hidden_states = self.dense(hidden_states)
+    hidden_states = self.intermediate_act_fn(hidden_states)
+    return hidden_states
+
+class BertOutput(nn.Module):
+  def __init__(self, config):
+    super(BertOutput, self).__init__()
+    # self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
+    if config.inject_adapter != 'linear':
+      self.dense = adapter_dict[config.inject_adapter](config.intermediate_size, config.hidden_size, config=config)
+    else:
+      self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
+
+    self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps)
+    self.dropout = StableDropout(config.hidden_dropout_prob)
+    self.config = config
+
+  def forward(self, hidden_states, input_states, mask=None):
+    hidden_states = self.dense(hidden_states)
+    hidden_states = self.dropout(hidden_states)
+    hidden_states += input_states
+    hidden_states = MaskedLayerNorm(self.LayerNorm, hidden_states)
+    return hidden_states
+
+class BertLayer(nn.Module):
+  def __init__(self, config):
+    super(BertLayer, self).__init__()
+    self.attention = BertAttention(config)
+    self.intermediate = BertIntermediate(config)
+    self.output = BertOutput(config)
+
+  def forward(self, hidden_states, attention_mask, return_att=False, query_states=None, relative_pos=None, rel_embeddings=None):
+    attention_output = self.attention(hidden_states, attention_mask, return_att=return_att, \
+      query_states=query_states, relative_pos=relative_pos, rel_embeddings=rel_embeddings)
+    if return_att:
+      attention_output, att_matrix = attention_output
+    intermediate_output = self.intermediate(attention_output)
+    layer_output = self.output(intermediate_output, attention_output, attention_mask)
+    if return_att:
+      return (layer_output, att_matrix)
+    else:
+      return layer_output
+
+class ConvLayer(nn.Module):
+  def __init__(self, config):
+    super().__init__()
+    kernel_size = getattr(config, 'conv_kernel_size', 3)
+    groups = getattr(config, 'conv_groups', 1)
+    self.conv_act = getattr(config, 'conv_act', 'tanh')
+    self.conv = torch.nn.Conv1d(config.hidden_size, config.hidden_size, kernel_size, padding = (kernel_size-1)//2, groups = groups)
+    self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps)
+    self.dropout = StableDropout(config.hidden_dropout_prob)
+    self.config = config
+
+  def forward(self, hidden_states, residual_states, input_mask):
+    out = self.conv(hidden_states.permute(0,2,1).contiguous()).permute(0,2,1).contiguous()
+    if version.Version(torch.__version__) >= version.Version('1.2.0a'):
+      rmask = (1-input_mask).bool()
+    else:
+      rmask = (1-input_mask).byte()
+    out.masked_fill_(rmask.unsqueeze(-1).expand(out.size()), 0)
+    out = ACT2FN[self.conv_act](self.dropout(out))
+    output_states = MaskedLayerNorm(self.LayerNorm, residual_states + out, input_mask)
+
+    return output_states
+
+class BertEncoder(nn.Module):
+  """ Modified BertEncoder with relative position bias support
+  """
+  def __init__(self, config):
+    super().__init__()
+    #layer = BertLayer(config)
+    self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)])
+    self.relative_attention = getattr(config, 'relative_attention', False)
+    if self.relative_attention:
+      self.max_relative_positions = getattr(config, 'max_relative_positions', -1)
+      if self.max_relative_positions <1:
+        self.max_relative_positions = config.max_position_embeddings
+      self.position_buckets = getattr(config, 'position_buckets', -1)
+      pos_ebd_size = self.max_relative_positions*2
+      if self.position_buckets>0:
+        pos_ebd_size = self.position_buckets*2
+      self.rel_embeddings = nn.Embedding(pos_ebd_size, config.hidden_size)
+
+    self.norm_rel_ebd = [x.strip() for x in getattr(config, 'norm_rel_ebd', 'none').lower().split('|')]
+    if 'layer_norm' in self.norm_rel_ebd:
+      self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps, elementwise_affine = True)
+    kernel_size = getattr(config, 'conv_kernel_size', 0)
+    self.with_conv = False
+    if kernel_size > 0:
+      self.with_conv = True
+      self.conv = ConvLayer(config)
+
+  def get_rel_embedding(self):
+    rel_embeddings = self.rel_embeddings.weight if self.relative_attention else None
+    if rel_embeddings is not None and ('layer_norm' in self.norm_rel_ebd):
+      rel_embeddings = self.LayerNorm(rel_embeddings)
+    return rel_embeddings
+
+  def get_attention_mask(self, attention_mask):
+    if attention_mask.dim()<=2:
+      extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
+      attention_mask = extended_attention_mask*extended_attention_mask.squeeze(-2).unsqueeze(-1)
+      attention_mask = attention_mask.byte()
+    elif attention_mask.dim()==3:
+      attention_mask = attention_mask.unsqueeze(1)
+
+    return attention_mask
+
+  def get_rel_pos(self, hidden_states, query_states=None, relative_pos=None):
+    if self.relative_attention and relative_pos is None:
+      q = query_states.size(-2) if query_states is not None else hidden_states.size(-2)
+      relative_pos = build_relative_position(q, hidden_states.size(-2), bucket_size = self.position_buckets, \
+        max_position=self.max_relative_positions, device = hidden_states.device)
+    return relative_pos
+
+  def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True, return_att=False, query_states = None, relative_pos=None):
+    if attention_mask.dim()<=2:
+      input_mask = attention_mask
+    else:
+      input_mask = (attention_mask.sum(-2)>0).byte()
+    attention_mask = self.get_attention_mask(attention_mask)
+    relative_pos = self.get_rel_pos(hidden_states, query_states, relative_pos)
+
+    all_encoder_layers = []
+    att_matrices = []
+    if isinstance(hidden_states, Sequence):
+      next_kv = hidden_states[0]
+    else:
+      next_kv = hidden_states
+    rel_embeddings = self.get_rel_embedding()
+    for i, layer_module in enumerate(self.layer):
+      output_states = layer_module(next_kv, attention_mask, return_att, query_states = query_states, relative_pos=relative_pos, rel_embeddings=rel_embeddings)
+      if return_att:
+        output_states, att_m = output_states
+
+      if i == 0 and self.with_conv:
+        prenorm = output_states #output['prenorm_states']
+        output_states = self.conv(hidden_states, prenorm, input_mask)
+
+      if query_states is not None:
+        query_states = output_states
+        if isinstance(hidden_states, Sequence):
+          next_kv = hidden_states[i+1] if i+1 < len(self.layer) else None
+      else:
+        next_kv = output_states
+
+      if output_all_encoded_layers:
+        all_encoder_layers.append(output_states)
+        if return_att:
+          att_matrices.append(att_m)
+    if not output_all_encoded_layers:
+      all_encoder_layers.append(output_states)
+      if return_att:
+        att_matrices.append(att_m)
+    return {
+      'hidden_states': all_encoder_layers,
+      'attention_matrices': att_matrices
+    }
+
+class BertEmbeddings(nn.Module):
+  """Construct the embeddings from word, position and token_type embeddings.
+  """
+  def __init__(self, config):
+    super(BertEmbeddings, self).__init__()
+    padding_idx = getattr(config, 'padding_idx', 0)
+    self.embedding_size = getattr(config, 'embedding_size', config.hidden_size)
+    self.word_embeddings = nn.Embedding(config.vocab_size, self.embedding_size, padding_idx = padding_idx)
+    self.position_biased_input = getattr(config, 'position_biased_input', True)
+    self.position_embeddings = nn.Embedding(config.max_position_embeddings, self.embedding_size)
+
+    if config.type_vocab_size>0:
+      self.token_type_embeddings = nn.Embedding(config.type_vocab_size, self.embedding_size)
+
+    if self.embedding_size != config.hidden_size:
+      self.embed_proj = nn.Linear(self.embedding_size, config.hidden_size, bias=False)
+    self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps)
+    self.dropout = StableDropout(config.hidden_dropout_prob)
+    self.output_to_half = False
+    self.config = config
+
+  def forward(self, input_ids, token_type_ids=None, position_ids=None, mask = None):
+    seq_length = input_ids.size(1)
+    if position_ids is None:
+      position_ids = torch.arange(0, seq_length, dtype=torch.long, device=input_ids.device)
+      position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
+    if token_type_ids is None:
+      token_type_ids = torch.zeros_like(input_ids)
+
+    words_embeddings = self.word_embeddings(input_ids)
+    position_embeddings = self.position_embeddings(position_ids.long())
+
+    embeddings = words_embeddings
+    if self.config.type_vocab_size>0:
+      token_type_embeddings = self.token_type_embeddings(token_type_ids)
+      embeddings += token_type_embeddings
+
+    if self.position_biased_input:
+      embeddings += position_embeddings
+
+    if self.embedding_size != self.config.hidden_size:
+      embeddings = self.embed_proj(embeddings)
+    embeddings = MaskedLayerNorm(self.LayerNorm, embeddings, mask)
+    embeddings = self.dropout(embeddings)
+    return {
+      'embeddings': embeddings,
+      'position_embeddings': position_embeddings}
+
+class BertLMPredictionHead(nn.Module):
+  def __init__(self, config, vocab_size):
+    super().__init__()
+    self.embedding_size = getattr(config, 'embedding_size', config.hidden_size)
+    self.dense = nn.Linear(config.hidden_size, self.embedding_size)
+    self.transform_act_fn = ACT2FN[config.hidden_act] \
+      if isinstance(config.hidden_act, str) else config.hidden_act
+
+    self.LayerNorm = LayerNorm(self.embedding_size, config.layer_norm_eps, elementwise_affine=True)
+
+    self.bias = nn.Parameter(torch.zeros(vocab_size))
+
+  def forward(self, hidden_states, embedding_weight):
+    hidden_states = self.dense(hidden_states)
+    hidden_states = self.transform_act_fn(hidden_states)
+    # b x s x d
+    hidden_states = MaskedLayerNorm(self.LayerNorm, hidden_states)
+
+    # b x s x v
+    logits = torch.matmul(hidden_states, embedding_weight.t().to(hidden_states)) + self.bias
+    return logits
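The only change relative to upstream DeBERTa in `bert.py` is the adapter hook: the projection in each of `BertSelfOutput`, `BertIntermediate`, and `BertOutput` is replaced by `adapterlib.adapter_dict[config.inject_adapter]` unless `inject_adapter` is `'linear'`. From the call sites, a registered adapter must be constructible as `adapter_dict[name](in_features, out_features, config=config)` and act as a drop-in for `nn.Linear`; the concrete entries live in `adapterlib/layers.py`, which is not shown in this diff. A hypothetical sketch of that contract:

    import torch.nn as nn
    class MyAdapter(nn.Module):  # hypothetical entry for adapterlib.adapter_dict
      def __init__(self, in_features, out_features, config=None):
        super().__init__()
        self.proj = nn.Linear(in_features, out_features)  # plus whatever adapter path is added
      def forward(self, x):
        return self.proj(x)  # must behave as a drop-in replacement for nn.Linear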
diff --git a/nlu/DeBERTa/deberta/cache_utils.py b/nlu/DeBERTa/deberta/cache_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..143a7e693561158e99bc0a86f2814f99ce383240
--- /dev/null
+++ b/nlu/DeBERTa/deberta/cache_utils.py
@@ -0,0 +1,135 @@
+# Copyright (c) Microsoft, Inc. 2020
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+#
+# Author: penhe@microsoft.com
+# Date: 05/15/2020
+#
+
+import pdb
+import torch
+import os
+import requests
+from .config import ModelConfig
+import pathlib
+from ..utils import xtqdm as tqdm
+from zipfile import ZipFile
+from ..utils import get_logger
+logger = get_logger()
+
+__all__ = ['pretrained_models', 'load_model_state', 'load_vocab']
+
+class PretrainedModel:
+  def __init__(self, name, vocab, vocab_type, model='pytorch_model.bin', config='config.json', **kwargs):
+    self.__dict__.update(kwargs)
+    host = f'https://huggingface.co/microsoft/{name}/resolve/main/'
+    self.name = name
+    self.model_url = host + model
+    self.config_url = host + config
+    self.vocab_url = host + vocab
+    self.vocab_type = vocab_type
+
+pretrained_models= {
+  'base': PretrainedModel('deberta-base', 'bpe_encoder.bin', 'gpt2'),
+  'large': PretrainedModel('deberta-large', 'bpe_encoder.bin', 'gpt2'),
+  'xlarge': PretrainedModel('deberta-xlarge', 'bpe_encoder.bin', 'gpt2'),
+  'base-mnli': PretrainedModel('deberta-base-mnli', 'bpe_encoder.bin', 'gpt2'),
+  'large-mnli': PretrainedModel('deberta-large-mnli', 'bpe_encoder.bin', 'gpt2'),
+  'xlarge-mnli': PretrainedModel('deberta-xlarge-mnli', 'bpe_encoder.bin', 'gpt2'),
+  'xlarge-v2': PretrainedModel('deberta-v2-xlarge', 'spm.model', 'spm'),
+  'xxlarge-v2': PretrainedModel('deberta-v2-xxlarge', 'spm.model', 'spm'),
+  'xlarge-v2-mnli': PretrainedModel('deberta-v2-xlarge-mnli', 'spm.model', 'spm'),
+  'xxlarge-v2-mnli': PretrainedModel('deberta-v2-xxlarge-mnli', 'spm.model', 'spm'),
+  'deberta-v3-small': PretrainedModel('deberta-v3-small', 'spm.model', 'spm'),
+  'deberta-v3-base': PretrainedModel('deberta-v3-base', 'spm.model', 'spm'),
+  'deberta-v3-large': PretrainedModel('deberta-v3-large', 'spm.model', 'spm'),
+  'mdeberta-v3-base': PretrainedModel('mdeberta-v3-base', 'spm.model', 'spm'),
+  'deberta-v3-xsmall': PretrainedModel('deberta-v3-xsmall', 'spm.model', 'spm'),
+  }
+
+def download_asset(url, name, tag=None, no_cache=False, cache_dir=None):
+  _tag = tag
+  if _tag is None:
+    _tag = 'latest'
+  if not cache_dir:
+    cache_dir = os.path.join(pathlib.Path.home(), f'.~DeBERTa/assets/{_tag}/')
+  os.makedirs(cache_dir, exist_ok=True)
+  output=os.path.join(cache_dir, name)
+  if os.path.exists(output) and (not no_cache):
+    return output
+
+  #repo=f'https://huggingface.co/microsoft/deberta-{name}/blob/main/bpe_encoder.bin'
+  headers = {}
+  headers['Accept'] = 'application/octet-stream'
+  resp = requests.get(url, stream=True, headers=headers)
+  if resp.status_code != 200:
+    raise Exception(f'Request for {url} returned {resp.status_code}, {resp.text}')
+
+  try:
+    with open(output, 'wb') as fs:
+      progress = tqdm(total=int(resp.headers['Content-Length']) if 'Content-Length' in resp.headers else -1, ncols=80, desc=f'Downloading {name}')
+      for c in resp.iter_content(chunk_size=1024*1024):
+        fs.write(c)
+        progress.update(len(c))
+      progress.close()
+  except:
+    os.remove(output)
+    raise
+
+  return output
+
+def load_model_state(path_or_pretrained_id, tag=None, no_cache=False, cache_dir=None):
+  model_path = path_or_pretrained_id
+  if model_path and (not os.path.exists(model_path)) and (path_or_pretrained_id.lower() in pretrained_models):
+    _tag = tag
+    if 'deberta-v3-base' in path_or_pretrained_id:
+      pretrained = pretrained_models['deberta-v3-base']
+    else:
+      pretrained = pretrained_models[path_or_pretrained_id.lower()]
+    if _tag is None:
+      _tag = 'latest'
+    if not cache_dir:
+      cache_dir = os.path.join(pathlib.Path.home(), f'.~DeBERTa/assets/{_tag}/{pretrained.name}')
+    os.makedirs(cache_dir, exist_ok=True)
+    model_path = os.path.join(cache_dir, 'pytorch_model.bin')
+    if (not os.path.exists(model_path)) or no_cache:
+      asset = download_asset(pretrained.model_url, 'pytorch_model.bin', tag=tag, no_cache=no_cache, cache_dir=cache_dir)
+      asset = download_asset(pretrained.config_url, 'model_config.json', tag=tag, no_cache=no_cache, cache_dir=cache_dir)
+  elif not model_path:
+    return None,None
+
+  model_path = os.path.join(model_path, 'pytorch_model.bin')
+  config_path = os.path.join(os.path.dirname(model_path), 'model_config.json')
+  model_state = torch.load(model_path, map_location='cpu')
+  logger.info("Loaded pretrained model file {}".format(model_path))
+  if 'config' in model_state:
+    model_config = ModelConfig.from_dict(model_state['config'])
+  elif os.path.exists(config_path):
+    model_config = ModelConfig.from_json_file(config_path)
+  else:
+    model_config = None
+  return model_state, model_config
+
+def load_vocab(vocab_path=None, vocab_type=None, pretrained_id=None, tag=None, no_cache=False, cache_dir=None):
+  if pretrained_id and (pretrained_id.lower() in pretrained_models):
+    _tag = tag
+    if _tag is None:
+      _tag = 'latest'
+
+    pretrained = pretrained_models[pretrained_id.lower()]
+    if not cache_dir:
+      cache_dir = os.path.join(pathlib.Path.home(), f'.~DeBERTa/assets/{_tag}/{pretrained.name}')
+    os.makedirs(cache_dir, exist_ok=True)
+    vocab_type = pretrained.vocab_type
+    url = pretrained.vocab_url
+    outname = os.path.basename(url)
+    vocab_path = os.path.join(cache_dir, outname)
+    if (not os.path.exists(vocab_path)) or no_cache:
+      asset = download_asset(url, outname, tag=tag, no_cache=no_cache, cache_dir=cache_dir)
+  if vocab_type is None:
+    vocab_type = 'spm'
+  return vocab_path, vocab_type
+
+def test_download():
+  vocab = load_vocab()
+ num_attention_heads (int): Number of attention heads for each attention layer in + the Transformer encoder, default: `12`. + intermediate_size (int): The size of the "intermediate" (i.e., feed-forward) + layer in the Transformer encoder, default: `3072`. + hidden_act (str): The non-linear activation function (function or string) in the + encoder and pooler. If string, "gelu", "relu" and "swish" are supported, default: `gelu`. + hidden_dropout_prob (float): The dropout probability for all fully connected + layers in the embeddings, encoder, and pooler, default: `0.1`. + attention_probs_dropout_prob (float): The dropout ratio for the attention + probabilities, default: `0.1`. + max_position_embeddings (int): The maximum sequence length that this model might + ever be used with. Typically set this to something large just in case + (e.g., 512 or 1024 or 2048), default: `512`. + type_vocab_size (int): The vocabulary size of the `token_type_ids` passed into + the `DeBERTa` model, default: `-1`. + initializer_range (float): The stddev of the _normal_initializer for + initializing all weight matrices, default: `0.02`. + relative_attention (:obj:`bool`): Whether to use relative position encoding, default: `False`. + max_relative_positions (int): The range of relative positions [`-max_position_embeddings`, `max_position_embeddings`], default: `-1`, which means using the same value as `max_position_embeddings`. + padding_idx (int): The value used to pad input_ids, default: `0`. + position_biased_input (:obj:`bool`): Whether to add absolute position embedding to content embedding, default: `True`. + pos_att_type (:obj:`str`): The type of relative position attention, it can be a combination of [`p2c`, `c2p`, `p2p`], e.g. "p2c", "p2c|c2p", "p2c|c2p|p2p", default: `None`. + + + """ + def __init__(self): + """Constructs ModelConfig.
+ + """ + + self.hidden_size = 768 + self.num_hidden_layers = 12 + self.num_attention_heads = 12 + self.hidden_act = "gelu" + self.intermediate_size = 3072 + self.hidden_dropout_prob = 0.1 + self.attention_probs_dropout_prob = 0.1 + self.max_position_embeddings = 512 + self.type_vocab_size = 0 + self.initializer_range = 0.02 + self.layer_norm_eps = 1e-7 + self.padding_idx = 0 + self.vocab_size = -1 diff --git a/nlu/DeBERTa/deberta/da_utils.py b/nlu/DeBERTa/deberta/da_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..81bc977fa7c1cef0a67ed9ad56ec619743c6b533 --- /dev/null +++ b/nlu/DeBERTa/deberta/da_utils.py @@ -0,0 +1,68 @@ +import torch +import pdb +from functools import lru_cache +import numpy as np +import math + +__all__=['build_relative_position', 'make_log_bucket_position'] + +@lru_cache(maxsize=128) +def make_log_bucket_dict(bucket_size, max_position, device=None): + relative_pos = torch.arange(-max_position, max_position, device=device) + sign = torch.sign(relative_pos) + mid = bucket_size//2 + abs_pos = torch.where((relative_pos -mid), torch.tensor(mid-1).to(relative_pos), torch.abs(relative_pos)) + log_pos = torch.ceil(torch.log(abs_pos/mid)/math.log((max_position-1)/mid) * (mid-1)) + mid + bucket_pos = torch.where(abs_pos<=mid, relative_pos, (log_pos*sign).to(relative_pos)).to(torch.long) + return bucket_pos + +# Faster version +def make_log_bucket_position(relative_pos, bucket_size, max_position): + relative_pos = torch.clamp(relative_pos,-max_position+1, max_position-1) + max_position + bucket_dict = make_log_bucket_dict(bucket_size, max_position, relative_pos.device) + for d in range(relative_pos.dim()-1): + bucket_dict = bucket_dict.unsqueeze(0) + bucket_pos = torch.gather(bucket_dict.expand(list(relative_pos.size())[:-1] + [bucket_dict.size(-1)]), index=relative_pos.long(), dim=-1) + return bucket_pos + +@lru_cache(maxsize=128) +def build_relative_position(query_size, key_size, bucket_size=-1, max_position=-1, device=None): + q_ids = torch.arange(0, query_size) + k_ids = torch.arange(0, key_size) + if device is not None: + q_ids = q_ids.to(device) + k_ids = k_ids.to(device) + rel_pos_ids = q_ids.view(-1,1) - k_ids.view(1,-1) + #q_ids[:, None] - np.tile(k_ids, (q_ids.shape[0],1)) + if bucket_size>0 and max_position > 0: + rel_pos_ids = make_log_bucket_position(rel_pos_ids, bucket_size, max_position) + #rel_pos_ids = torch.tensor(rel_pos_ids, dtype=torch.long) + rel_pos_ids = rel_pos_ids[:query_size, :] + rel_pos_ids = rel_pos_ids.unsqueeze(0) + return rel_pos_ids + +def build_relative_position_from_abs(query_pos, key_pos, bucket_size=-1, max_position=-1, device=None): + if isinstance(query_pos, tuple): + q_ids = torch.tensor(query_pos) + else: + q_ids = query_pos + if isinstance(key_pos, tuple): + k_ids = torch.tensor(key_pos) + else: + k_ids = key_pos + + if device is not None: + q_ids = q_ids.to(device) + k_ids = k_ids.to(device) + rel_pos_ids = q_ids.unsqueeze(-1) - k_ids.unsqueeze(-2) + #q_ids[:, None] - np.tile(k_ids, (q_ids.shape[0],1)) + if bucket_size>0 and max_position > 0: + rel_pos_ids = make_log_bucket_position(rel_pos_ids, bucket_size, max_position) + #rel_pos_ids = torch.tensor(rel_pos_ids, dtype=torch.long) + return rel_pos_ids + +def test_log_bucket(): + x=np.arange(-511,511) + y=make_log_bucket_position(x, 128, 512) + pdb.set_trace() + diff --git a/nlu/DeBERTa/deberta/deberta.py b/nlu/DeBERTa/deberta/deberta.py new file mode 100644 index 0000000000000000000000000000000000000000..b3f84eca9c50696595814e6e846d291e87ea0832 --- 
/dev/null +++ b/nlu/DeBERTa/deberta/deberta.py @@ -0,0 +1,145 @@ +# Copyright (c) Microsoft, Inc. 2020 +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +# +# Author: penhe@microsoft.com +# Date: 01/15/2020 +# + +import copy +import torch +import os + +import json +from .ops import * +from .bert import * +from .config import ModelConfig +from .cache_utils import load_model_state +import pdb + +__all__ = ['DeBERTa'] + +class DeBERTa(torch.nn.Module): + """ DeBERTa encoder + This module is composed of the input embedding layer with stacked transformer layers with disentangled attention. + + Parameters: + config: + A model config class instance with the configuration to build a new model. The schema is similar to `BertConfig`, \ + for more details, please refer :class:`~DeBERTa.deberta.ModelConfig` + + pre_trained: + The pre-trained DeBERTa model, it can be a physical path of a pre-trained DeBERTa model or a released configurations, \ + i.e. [**base, large, base_mnli, large_mnli**] + + """ + + def __init__(self, config=None, pre_trained=None): + super().__init__() + state = None + if pre_trained is not None: + state, model_config = load_model_state(pre_trained) + if config is not None and model_config is not None: + for k in config.__dict__: + if k not in ['hidden_size', + 'intermediate_size', + 'num_attention_heads', + 'num_hidden_layers', + 'vocab_size', + 'max_position_embeddings']: + model_config.__dict__[k] = config.__dict__[k] + config = copy.copy(model_config) + self.embeddings = BertEmbeddings(config) + self.encoder = BertEncoder(config) + self.config = config + self.pre_trained = pre_trained + self.apply_state(state) + + def forward(self, input_ids, attention_mask=None, token_type_ids=None, output_all_encoded_layers=True, position_ids = None, return_att = False): + """ + Args: + input_ids: + a torch.LongTensor of shape [batch_size, sequence_length] \ + with the word token indices in the vocabulary + + attention_mask: + an optional parameter for input mask or attention mask. + + - If it's an input mask, then it will be torch.LongTensor of shape [batch_size, sequence_length] with indices \ + selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max \ + input sequence length in the current batch. It's the mask that we typically use for attention when \ + a batch has varying length sentences. + + - If it's an attention mask then it will be torch.LongTensor of shape [batch_size, sequence_length, sequence_length]. \ + In this case, it's a mask indicate which tokens in the sequence should be attended by other tokens in the sequence. + + token_type_ids: + an optional torch.LongTensor of shape [batch_size, sequence_length] with the token \ + types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to \ + a `sentence B` token (see BERT paper for more details). + + output_all_encoded_layers: + whether to output results of all encoder layers, default, True + + Returns: + + - The output of the stacked transformer layers if `output_all_encoded_layers=True`, else \ + the last layer of stacked transformer layers + + - Attention matrix of self-attention layers if `return_att=True` + + + Example:: + + # Batch of wordPiece token ids. 
+ # Each sample was padded with zero to the maxium length of the batch + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + # Mask of valid input ids + attention_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) + + # DeBERTa model initialized with pretrained base model + bert = DeBERTa(pre_trained='base') + + encoder_layers = bert(input_ids, attention_mask=attention_mask) + + """ + + if attention_mask is None: + attention_mask = torch.ones_like(input_ids) + if token_type_ids is None: + token_type_ids = torch.zeros_like(input_ids) + + ebd_output = self.embeddings(input_ids.to(torch.long), token_type_ids.to(torch.long), position_ids, attention_mask) + embedding_output = ebd_output['embeddings'] + encoder_output = self.encoder(embedding_output, + attention_mask, + output_all_encoded_layers=output_all_encoded_layers, return_att = return_att) + encoder_output.update(ebd_output) + return encoder_output + + def apply_state(self, state = None): + """ Load state from previous loaded model state dictionary. + + Args: + state (:obj:`dict`, optional): State dictionary as the state returned by torch.module.state_dict(), default: `None`. \ + If it's `None`, then will use the pre-trained state loaded via the constructor to re-initialize \ + the `DeBERTa` model + """ + if self.pre_trained is None and state is None: + return + if state is None: + state, config = load_model_state(self.pre_trained) + self.config = config + + prefix = '' + for k in state: + if 'embeddings.' in k: + if not k.startswith('embeddings.'): + prefix = k[:k.index('embeddings.')] + break + + missing_keys = [] + unexpected_keys = [] + error_msgs = [] + self._load_from_state_dict(state, prefix = prefix, local_metadata=None, strict=True, missing_keys=missing_keys, unexpected_keys=unexpected_keys, error_msgs=error_msgs) diff --git a/nlu/DeBERTa/deberta/disentangled_attention.py b/nlu/DeBERTa/deberta/disentangled_attention.py new file mode 100644 index 0000000000000000000000000000000000000000..42215c787c901eb8f73d25d88e3375a635126078 --- /dev/null +++ b/nlu/DeBERTa/deberta/disentangled_attention.py @@ -0,0 +1,221 @@ +# Copyright (c) Microsoft, Inc. 2020 +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
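# A small sketch of wiring the ModelConfig from config.py into the DeBERTa encoder above
# without loading a pre-trained checkpoint. The numeric values are illustrative, and
# 'inject_adapter': 'linear' is an assumption that selects the plain nn.Linear projections
# used by this repo's DisentangledSelfAttention (see disentangled_attention.py below);
# any remaining fields are assumed to fall back to the ModelConfig defaults.
import torch
from DeBERTa.deberta.config import ModelConfig
from DeBERTa.deberta.deberta import DeBERTa

config = ModelConfig.from_dict({
    'vocab_size': 128100, 'hidden_size': 768, 'num_hidden_layers': 12,
    'num_attention_heads': 12, 'intermediate_size': 3072,
    'max_position_embeddings': 512, 'relative_attention': True,
    'pos_att_type': 'p2c|c2p', 'layer_norm_eps': 1e-7,
    'inject_adapter': 'linear'})

model = DeBERTa(config=config)                      # randomly initialized encoder
input_ids = torch.LongTensor([[1, 31, 51, 99, 2]])
attention_mask = torch.ones_like(input_ids)
outputs = model(input_ids, attention_mask=attention_mask,
                output_all_encoded_layers=False)
# `outputs` is a dict of encoder outputs merged with the 'embeddings' and
# 'position_embeddings' entries returned by BertEmbeddings.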
+# +# Author: penhe@microsoft.com +# Date: 01/15/2020 +# + +""" + Disentangled SelfAttention module +""" + +import numpy as np +import math +import torch +from torch import nn +import functools +import pdb + +from .ops import * +from .da_utils import build_relative_position + +from ..utils import get_logger +logger=get_logger() + +from adapterlib import adapter_dict + +__all__=['DisentangledSelfAttention'] +class DisentangledSelfAttention(nn.Module): + def __init__(self, config): + super().__init__() + self.num_attention_heads = config.num_attention_heads + _attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.attention_head_size = getattr(config, 'attention_head_size', _attention_head_size) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + # ----------------------------------------------------------------------------------------------------------------------- + if config.inject_adapter != 'linear': + self.query_proj = adapter_dict[config.inject_adapter](config.hidden_size, self.all_head_size, config=config) + else: + self.query_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=True) + + # self.key_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=True) + if config.inject_adapter != 'linear': + self.key_proj = adapter_dict[config.inject_adapter](config.hidden_size, self.all_head_size, config=config) + else: + self.key_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=True) + + if config.inject_adapter != 'linear': + self.value_proj = adapter_dict[config.inject_adapter](config.hidden_size, self.all_head_size, config=config) + else: + self.value_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=True) + + # ----------------------------------------------------------------------------------------------------------------------- + + # self.query_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=True) + # self.key_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=True) + # self.value_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=True) + + self.share_att_key = getattr(config, 'share_att_key', False) + self.pos_att_type = [x.strip() for x in getattr(config, 'pos_att_type', 'c2p').lower().split('|')] # c2p|p2c + self.relative_attention = getattr(config, 'relative_attention', False) + + if self.relative_attention: + self.position_buckets = getattr(config, 'position_buckets', -1) + self.max_relative_positions = getattr(config, 'max_relative_positions', -1) + if self.max_relative_positions <1: + self.max_relative_positions = config.max_position_embeddings + self.pos_ebd_size = self.max_relative_positions + if self.position_buckets>0: + self.pos_ebd_size = self.position_buckets + # For backward compitable + + self.pos_dropout = StableDropout(config.hidden_dropout_prob) + + if (not self.share_att_key): + if 'c2p' in self.pos_att_type or 'p2p' in self.pos_att_type: + self.pos_key_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=True) + if 'p2c' in self.pos_att_type or 'p2p' in self.pos_att_type: + self.pos_query_proj = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = StableDropout(config.attention_probs_dropout_prob) + self._register_load_state_dict_pre_hook(self._pre_load_hook) + + def transpose_for_scores(self, x, attention_heads): + new_x_shape = x.size()[:-1] + (attention_heads, -1) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3).contiguous().view(-1, x.size(1), x.size(-1)) + + def forward(self, hidden_states, 
attention_mask, return_att=False, query_states=None, relative_pos=None, rel_embeddings=None): + if query_states is None: + query_states = hidden_states + query_layer = self.transpose_for_scores(self.query_proj(query_states), self.num_attention_heads).float() + key_layer = self.transpose_for_scores(self.key_proj(hidden_states), self.num_attention_heads).float() + value_layer = self.transpose_for_scores(self.value_proj(hidden_states), self.num_attention_heads) + + rel_att = None + # Take the dot product between "query" and "key" to get the raw attention scores. + scale_factor = 1 + if 'c2p' in self.pos_att_type: + scale_factor += 1 + if 'p2c' in self.pos_att_type: + scale_factor += 1 + if 'p2p' in self.pos_att_type: + scale_factor += 1 + scale = 1/math.sqrt(query_layer.size(-1)*scale_factor) + attention_scores = torch.bmm(query_layer, key_layer.transpose(-1, -2)*scale) + if self.relative_attention: + rel_embeddings = self.pos_dropout(rel_embeddings) + rel_att = self.disentangled_attention_bias(query_layer, key_layer, relative_pos, rel_embeddings, scale_factor) + + if rel_att is not None: + attention_scores = (attention_scores + rel_att) + attention_scores = (attention_scores - attention_scores.max(dim=-1, keepdim=True).values.detach()).to(hidden_states) + attention_scores = attention_scores.view(-1, self.num_attention_heads, attention_scores.size(-2), attention_scores.size(-1)) + + # bxhxlxd + _attention_probs = XSoftmax.apply(attention_scores, attention_mask, -1) + attention_probs = self.dropout(_attention_probs) + context_layer = torch.bmm(attention_probs.view(-1, attention_probs.size(-2), attention_probs.size(-1)), value_layer) + context_layer = context_layer.view(-1, self.num_attention_heads, context_layer.size(-2), context_layer.size(-1)).permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (-1,) + context_layer = context_layer.view(*new_context_layer_shape) + + return { + 'hidden_states': context_layer, + 'attention_probs': _attention_probs, + 'attention_logits': attention_scores + } + + def disentangled_attention_bias(self, query_layer, key_layer, relative_pos, rel_embeddings, scale_factor): + if relative_pos is None: + q = query_layer.size(-2) + relative_pos = build_relative_position(q, key_layer.size(-2), bucket_size = self.position_buckets, \ + max_position = self.max_relative_positions, device=query_layer.device) + if relative_pos.dim()==2: + relative_pos = relative_pos.unsqueeze(0).unsqueeze(0) + elif relative_pos.dim()==3: + relative_pos = relative_pos.unsqueeze(1) + # bxhxqxk + elif relative_pos.dim()!=4: + raise ValueError(f'Relative postion ids must be of dim 2 or 3 or 4. 
{relative_pos.dim()}') + + att_span = self.pos_ebd_size + relative_pos = relative_pos.long().to(query_layer.device) + + rel_embeddings = rel_embeddings[self.pos_ebd_size - att_span:self.pos_ebd_size + att_span, :].unsqueeze(0) #.repeat(query_layer.size(0)//self.num_attention_heads, 1, 1) + if self.share_att_key: + pos_query_layer = self.transpose_for_scores(self.query_proj(rel_embeddings), self.num_attention_heads)\ + .repeat(query_layer.size(0)//self.num_attention_heads, 1, 1) #.split(self.all_head_size, dim=-1) + pos_key_layer = self.transpose_for_scores(self.key_proj(rel_embeddings), self.num_attention_heads)\ + .repeat(query_layer.size(0)//self.num_attention_heads, 1, 1) #.split(self.all_head_size, dim=-1) + else: + if 'c2p' in self.pos_att_type or 'p2p' in self.pos_att_type: + pos_key_layer = self.transpose_for_scores(self.pos_key_proj(rel_embeddings), self.num_attention_heads)\ + .repeat(query_layer.size(0)//self.num_attention_heads, 1, 1) #.split(self.all_head_size, dim=-1) + if 'p2c' in self.pos_att_type or 'p2p' in self.pos_att_type: + pos_query_layer = self.transpose_for_scores(self.pos_query_proj(rel_embeddings), self.num_attention_heads)\ + .repeat(query_layer.size(0)//self.num_attention_heads, 1, 1) #.split(self.all_head_size, dim=-1) + + score = 0 + # content->position + if 'c2p' in self.pos_att_type: + scale = 1/math.sqrt(pos_key_layer.size(-1)*scale_factor) + c2p_att = torch.bmm(query_layer, pos_key_layer.transpose(-1, -2).to(query_layer)*scale) + c2p_pos = torch.clamp(relative_pos + att_span, 0, att_span*2-1).squeeze(0).expand([query_layer.size(0), query_layer.size(1), relative_pos.size(-1)]) + c2p_att = torch.gather(c2p_att, dim=-1, index=c2p_pos) + score += c2p_att + + # position->content + if 'p2c' in self.pos_att_type or 'p2p' in self.pos_att_type: + scale = 1/math.sqrt(pos_query_layer.size(-1)*scale_factor) + + if 'p2c' in self.pos_att_type: + p2c_att = torch.bmm(pos_query_layer.to(key_layer)*scale, key_layer.transpose(-1, -2)) + p2c_att = torch.gather(p2c_att, dim=-2, index=c2p_pos) + score += p2c_att + + # position->position + if 'p2p' in self.pos_att_type: + pos_query = pos_query_layer[:,:,att_span:,:] + p2p_att = torch.matmul(pos_query, pos_key_layer.transpose(-1, -2)) + p2p_att = p2p_att.expand(query_layer.size()[:2] + p2p_att.size()[2:]) + if query_layer.size(-2) != key_layer.size(-2): + p2p_att = torch.gather(p2p_att, dim=-2, index=pos_index.expand(query_layer.size()[:2] + (pos_index.size(-2), p2p_att.size(-1)))) + p2p_att = torch.gather(p2p_att, dim=-1, index=c2p_pos.expand([query_layer.size(0), query_layer.size(1), query_layer.size(2), relative_pos.size(-1)])) + score += p2p_att + + return score + + def _pre_load_hook(self, state_dict, prefix, local_metadata, strict, + missing_keys, unexpected_keys, error_msgs): + self_state = self.state_dict() + if ((prefix + 'query_proj.weight') not in state_dict) and ((prefix + 'in_proj.weight') in state_dict): + v1_proj = state_dict[prefix+'in_proj.weight'] + v1_proj = v1_proj.unsqueeze(0).reshape(self.num_attention_heads, -1, v1_proj.size(-1)) + q,k,v=v1_proj.chunk(3, dim=1) + state_dict[prefix + 'query_proj.weight'] = q.reshape(-1, v1_proj.size(-1)) + state_dict[prefix + 'key_proj.weight'] = k.reshape(-1, v1_proj.size(-1)) + state_dict[prefix + 'key_proj.bias'] = self_state['key_proj.bias'] + state_dict[prefix + 'value_proj.weight'] = v.reshape(-1, v1_proj.size(-1)) + v1_query_bias = state_dict[prefix + 'q_bias'] + state_dict[prefix + 'query_proj.bias'] = v1_query_bias + v1_value_bias = state_dict[prefix +'v_bias'] + 
state_dict[prefix + 'value_proj.bias'] = v1_value_bias + + v1_pos_key_proj = state_dict[prefix + 'pos_proj.weight'] + state_dict[prefix + 'pos_key_proj.weight'] = v1_pos_key_proj + v1_pos_query_proj = state_dict[prefix + 'pos_q_proj.weight'] + state_dict[prefix + 'pos_query_proj.weight'] = v1_pos_query_proj + v1_pos_query_proj_bias = state_dict[prefix + 'pos_q_proj.bias'] + state_dict[prefix + 'pos_query_proj.bias'] = v1_pos_query_proj_bias + state_dict[prefix + 'pos_key_proj.bias'] = self_state['pos_key_proj.bias'] + + del state_dict[prefix + 'in_proj.weight'] + del state_dict[prefix + 'q_bias'] + del state_dict[prefix + 'v_bias'] + del state_dict[prefix + 'pos_proj.weight'] + del state_dict[prefix + 'pos_q_proj.weight'] + del state_dict[prefix + 'pos_q_proj.bias'] diff --git a/nlu/DeBERTa/deberta/gpt2_bpe_utils.py b/nlu/DeBERTa/deberta/gpt2_bpe_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..b089f6a4d7a9e7d275a2ddf4f11268d97a6ba890 --- /dev/null +++ b/nlu/DeBERTa/deberta/gpt2_bpe_utils.py @@ -0,0 +1,163 @@ +""" +Byte pair encoding utilities from GPT-2. + +Original source: https://github.com/openai/gpt-2/blob/master/src/encoder.py +Original license: MIT +""" + +from functools import lru_cache +import json +import random +import unicodedata + +try: + import regex as re +except ImportError: + raise ImportError('Please install regex with: pip install regex') + +@lru_cache() +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a signficant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. + """ + bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8+n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + +def get_pairs(word): + """Return set of symbol pairs in a word. + Word is represented as tuple of symbols (symbols being variable-length strings). 
+ """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + +class Encoder: + + def __init__(self, encoder, bpe_merges, errors='replace'): + self.encoder = encoder + self.decoder = {v:k for k,v in self.encoder.items()} + self.errors = errors # how to handle errors in decoding + self.byte_encoder = bytes_to_unicode() + self.byte_decoder = {v:k for k, v in self.byte_encoder.items()} + self.bpe_ranks = dict(zip([tuple(k) for k in bpe_merges], range(len(bpe_merges)))) + self.cache = {} + self.random = random.Random(0) + + # Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions + self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") + + def bpe(self, token): + if token in self.cache: + return self.cache[token] + word = tuple(token) + pairs = get_pairs(word) + + if not pairs: + return token + + while True: + bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf'))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + new_word.extend(word[i:j]) + i = j + except: + new_word.extend(word[i:]) + break + + if word[i] == first and i < len(word)-1 and word[i+1] == second: + new_word.append(first+second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = ' '.join(word) + self.cache[token] = word + return word + + def split_to_words(self, text): + return list(re.findall(self.pat, text)) + + def encode(self, text): + bpe_tokens = [] + for token in self.split_to_words(text): + token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) + bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' ')) + return bpe_tokens + + def decode(self, tokens): + text = ''.join([self.decoder[token] for token in tokens]) + text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors) + return text + +def get_encoder(encoder, vocab): + return Encoder( + encoder=encoder, + bpe_merges=vocab, + ) + +def _is_whitespace(char): + """Checks whether `chars` is a whitespace character.""" + # \t, \n, and \r are technically contorl characters but we treat them + # as whitespace since they are generally considered as such. + if char == " " or char == "\t" or char == "\n" or char == "\r": + return True + cat = unicodedata.category(char) + if cat == "Zs": + return True + return False + +def _is_control(char): + """Checks whether `chars` is a control character.""" + # These are technically control characters but we count them as whitespace + # characters. + if char == "\t" or char == "\n" or char == "\r": + return False + cat = unicodedata.category(char) + if cat.startswith("C"): + return True + return False + +def _is_punctuation(char): + """Checks whether `chars` is a punctuation character.""" + cp = ord(char) + # We treat all non-letter/number ASCII as punctuation. + # Characters such as "^", "$", and "`" are not in the Unicode + # Punctuation class but we treat them as punctuation anyways, for + # consistency. 
+ if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or + (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): + return True + cat = unicodedata.category(char) + if cat.startswith("P"): + return True + return False diff --git a/nlu/DeBERTa/deberta/gpt2_tokenizer.py b/nlu/DeBERTa/deberta/gpt2_tokenizer.py new file mode 100644 index 0000000000000000000000000000000000000000..8d7072082891310346f9f51389a950352f078c14 --- /dev/null +++ b/nlu/DeBERTa/deberta/gpt2_tokenizer.py @@ -0,0 +1,216 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# Copyright (c) Microsoft, Inc. 2020 +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +# +# Author: penhe@microsoft.com +# Date: 01/15/2020 +# + +# This piece of code is derived from https://github.com/pytorch/fairseq/blob/master/fairseq/data/encoders/gpt2_bpe.py + +import torch +import unicodedata +import os +from .gpt2_bpe_utils import get_encoder,_is_control,_is_whitespace,_is_punctuation +from .cache_utils import load_vocab + +__all__ = ['GPT2Tokenizer'] + +class GPT2Tokenizer(object): + """ A wrapper of GPT2 tokenizer with similar interface as BERT tokenizer + + Args: + + vocab_file (:obj:`str`, optional): + The local path of vocabulary package or the release name of vocabulary in `DeBERTa GitHub releases `_, \ + e.g. "bpe_encoder", default: `None`. + + If it's `None`, then it will download the vocabulary in the latest release from GitHub. The vocabulary file is a \ + state dictionary with three items, "dict_map", "vocab", "encoder" which correspond to three files used in `RoBERTa`, i.e. `dict.txt`, `vocab.txt` and `encoder.json`. \ + + The difference between our wrapped GPT2 tokenizer and RoBERTa wrapped tokenizer are, + + - Special tokens, unlike `RoBERTa` which use ``, `` as the `start` token and `end` token of a sentence. We use `[CLS]` and `[SEP]` as the `start` and `end`\ + token of input sentence which is the same as `BERT`. + + - We remapped the token ids in our dictionary with regarding to the new special tokens, `[PAD]` => 0, `[CLS]` => 1, `[SEP]` => 2, `[UNK]` => 3, `[MASK]` => 50264 + + do_lower_case (:obj:`bool`, optional): + Whether to convert inputs to lower case. **Not used in GPT2 tokenizer**. + + special_tokens (:obj:`list`, optional): + List of special tokens to be added to the end of the vocabulary. + + + """ + def __init__(self, vocab_file=None, do_lower_case=True, special_tokens=None): + self.pad_token='[PAD]' + self.sep_token='[SEP]' + self.unk_token='[UNK]' + self.cls_token='[CLS]' + + self.symbols = [] + self.count = [] + self.indices = {} + self.pad_token_id = self.add_symbol(self.pad_token) + self.cls_token_id = self.add_symbol(self.cls_token) + self.sep_token_id = self.add_symbol(self.sep_token) + self.unk_token_id = self.add_symbol(self.unk_token) + + self.gpt2_encoder = torch.load(vocab_file) + self.bpe = get_encoder(self.gpt2_encoder['encoder'], self.gpt2_encoder['vocab']) + for w,n in self.gpt2_encoder['dict_map']: + self.add_symbol(w, n) + + self.mask_token='[MASK]' + self.mask_id = self.add_symbol(self.mask_token) + self.special_tokens = ['[MASK]', '[SEP]', '[PAD]', '[UNK]', '[CLS]'] + if special_tokens is not None: + for t in special_tokens: + self.add_special_token(t) + + self.vocab = self.indices + self.ids_to_tokens = self.symbols + + def tokenize(self, text): + """ Convert an input text to tokens. + + Args: + + text (:obj:`str`): input text to be tokenized. 
+ + Returns: + A list of byte tokens where each token represent the byte id in GPT2 byte dictionary + + Example:: + + >>> tokenizer = GPT2Tokenizer() + >>> text = "Hello world!" + >>> tokens = tokenizer.tokenize(text) + >>> print(tokens) + ['15496', '995', '0'] + + """ + bpe = self._encode(text) + + return [t for t in bpe.split(' ') if t] + + def convert_tokens_to_ids(self, tokens): + """ Convert list of tokens to ids. + + Args: + + tokens (:obj:`list`): list of tokens + + Returns: + + List of ids + """ + + return [self.vocab[t] for t in tokens] + + def convert_ids_to_tokens(self, ids): + """ Convert list of ids to tokens. + + Args: + + ids (:obj:`list`): list of ids + + Returns: + + List of tokens + """ + + tokens = [] + for i in ids: + tokens.append(self.ids_to_tokens[i]) + return tokens + + def split_to_words(self, text): + return self.bpe.split_to_words(text) + + def decode(self, tokens): + """ Decode list of tokens to text strings. + + Args: + + tokens (:obj:`list`): list of tokens. + + Returns: + + Text string corresponds to the input tokens. + + Example:: + + >>> tokenizer = GPT2Tokenizer() + >>> text = "Hello world!" + >>> tokens = tokenizer.tokenize(text) + >>> print(tokens) + ['15496', '995', '0'] + + >>> tokenizer.decode(tokens) + 'Hello world!' + + """ + return self.bpe.decode([int(t) for t in tokens if t not in self.special_tokens]) + + def add_special_token(self, token): + """Adds a special token to the dictionary. + + Args: + token (:obj:`str`): Tthe new token/word to be added to the vocabulary. + + Returns: + The id of new token in the vocabulary. + + """ + self.special_tokens.append(token) + return self.add_symbol(token) + + def part_of_whole_word(self, token, is_bos=False): + if is_bos: + return True + s = self._decode(token) + if (len(s)==1 and (_is_whitespace(list(s)[0]) or _is_control(list(s)[0]) or _is_punctuation(list(s)[0]))): + return False + + return not s.startswith(' ') + + def sym(self, id): + return self.ids_to_tokens[id] + + def id(self, sym): + return self.vocab[sym] + + def _encode(self, x: str) -> str: + return ' '.join(map(str, self.bpe.encode(x))) + + def _decode(self, x: str) -> str: + return self.bpe.decode(map(int, x.split())) + + def add_symbol(self, word, n=1): + """Adds a word to the dictionary. + + Args: + word (:obj:`str`): Tthe new token/word to be added to the vocabulary. + n (int, optional): The frequency of the word. + + Returns: + The id of the new word. + + """ + if word in self.indices: + idx = self.indices[word] + self.count[idx] = self.count[idx] + n + return idx + else: + idx = len(self.symbols) + self.indices[word] = idx + self.symbols.append(word) + self.count.append(n) + return idx + + def save_pretrained(self, path: str): + torch.save(self.gpt2_encoder, path) diff --git a/nlu/DeBERTa/deberta/mlm.py b/nlu/DeBERTa/deberta/mlm.py new file mode 100644 index 0000000000000000000000000000000000000000..be00b2d7e9c8d2165ade05c050661baf93ba39d7 --- /dev/null +++ b/nlu/DeBERTa/deberta/mlm.py @@ -0,0 +1,38 @@ +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) Microsoft, Inc. 2020 +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
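# A short end-to-end sketch of the GPT2Tokenizer defined above, using load_vocab() from
# cache_utils.py to fetch the bpe_encoder.bin package. The pretrained id 'base', the import
# paths, and the sample sentence are assumptions for illustration.
from DeBERTa.deberta.cache_utils import load_vocab
from DeBERTa.deberta.gpt2_tokenizer import GPT2Tokenizer

vocab_path, vocab_type = load_vocab(pretrained_id='base')   # vocab_type == 'gpt2'
tokenizer = GPT2Tokenizer(vocab_path)

tokens = tokenizer.tokenize("Hello world!")        # byte-level BPE ids kept as strings
ids = tokenizer.convert_tokens_to_ids(tokens)      # remapped ids: [PAD]=0, [CLS]=1, [SEP]=2, [UNK]=3
text = tokenizer.decode(tokens)                    # -> "Hello world!"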
+ +# This piece of code is modified based on https://github.com/huggingface/transformers + +import torch +from torch import nn +import pdb + +from .bert import LayerNorm,ACT2FN + +__all__ = ['MLMPredictionHead'] + +class MLMPredictionHead(nn.Module): + def __init__(self, config, vocab_size): + super().__init__() + self.embedding_size = getattr(config, 'embedding_size', config.hidden_size) + self.dense = nn.Linear(config.hidden_size, self.embedding_size) + self.transform_act_fn = ACT2FN[config.hidden_act] \ + if isinstance(config.hidden_act, str) else config.hidden_act + + self.LayerNorm = LayerNorm(self.embedding_size, config.layer_norm_eps) + self.bias = nn.Parameter(torch.zeros(vocab_size)) + self.pre_norm = PreLayerNorm(config) + + def forward(self, hidden_states, embeding_weight): + hidden_states = self.pre_norm(hidden_states) + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + # b x s x d + hidden_states = MaskedLayerNorm(self.LayerNorm, hidden_states) + + # b x s x v + logits = torch.matmul(hidden_states, embeding_weight.t().to(hidden_states)) + self.bias + return logits diff --git a/nlu/DeBERTa/deberta/nnmodule.py b/nlu/DeBERTa/deberta/nnmodule.py new file mode 100644 index 0000000000000000000000000000000000000000..4497c75a98d720fe866944465e4b1686b1cdfdc4 --- /dev/null +++ b/nlu/DeBERTa/deberta/nnmodule.py @@ -0,0 +1,137 @@ +import pdb +import os +import torch +import copy +from torch import nn +from .config import ModelConfig +from ..utils import xtqdm as tqdm +from .cache_utils import load_model_state + +from ..utils import get_logger +logger = get_logger() + +__all__ = ['NNModule'] + +class NNModule(nn.Module): + """ An abstract class to handle weights initialization and \ + a simple interface for dowloading and loading pretrained models. + + Args: + + config (:obj:`~DeBERTa.deberta.ModelConfig`): The model config to the module + + """ + + def __init__(self, config, *inputs, **kwargs): + super().__init__() + self.config = config + + def init_weights(self, module): + """ Apply Gaussian(mean=0, std=`config.initializer_range`) initialization to the module. + + Args: + + module (:obj:`torch.nn.Module`): The module to apply the initialization. + + Example:: + + class MyModule(NNModule): + def __init__(self, config): + # Add construction instructions + self.bert = DeBERTa(config) + + # Add other modules + ... + + # Apply initialization + self.apply(self.init_weights) + + """ + if isinstance(module, (nn.Linear, nn.Embedding)): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + def export_onnx(self, onnx_path, input): + raise NotImplementedError + + @classmethod + def load_model(cls, model_path, model_config=None, tag=None, no_cache=False, cache_dir=None , *inputs, **kwargs): + """ Instantiate a sub-class of NNModule from a pre-trained model file. + + Args: + + model_path (:obj:`str`): Path or name of the pre-trained model which can be either, + + - The path of pre-trained model + + - The pre-trained DeBERTa model name in `DeBERTa GitHub releases `_, i.e. [**base, base_mnli, large, large_mnli**]. + + If `model_path` is `None` or `-`, then the method will create a new sub-class without initialing from pre-trained models. + + model_config (:obj:`str`): The path of model config file. If it's `None`, then the method will try to find the the config in order: + + 1. ['config'] in the model state dictionary. + + 2. 
`model_config.json` aside the `model_path`. + + If it failed to find a config the method will fail. + + tag (:obj:`str`, optional): The release tag of DeBERTa, default: `None`. + + no_cache (:obj:`bool`, optional): Disable local cache of downloaded models, default: `False`. + + cache_dir (:obj:`str`, optional): The cache directory used to save the downloaded models, default: `None`. If it's `None`, then the models will be saved at `$HOME/.~DeBERTa` + + Return: + + :obj:`NNModule` : The sub-class object. + + """ + # Load config + if model_config: + config = ModelConfig.from_json_file(model_config) + else: + config = None + model_config = None + model_state = None + if (model_path is not None) and (model_path.strip() == '-' or model_path.strip()==''): + model_path = None + try: + model_state, model_config = load_model_state(model_path, tag=tag, no_cache=no_cache, cache_dir=cache_dir) + except Exception as exp: + raise Exception(f'Failed to get model {model_path}. Exception: {exp}') + + if config is not None and model_config is not None: + for k in config.__dict__: + if k not in ['hidden_size', + 'intermediate_size', + 'num_attention_heads', + 'num_hidden_layers', + 'vocab_size', + 'max_position_embeddings'] or (k not in model_config.__dict__) or (model_config.__dict__[k] < 0): + model_config.__dict__[k] = config.__dict__[k] + if model_config is not None: + config = copy.copy(model_config) + vocab_size = config.vocab_size + # Instantiate model. + model = cls(config, *inputs, **kwargs) + if not model_state: + return model + # copy state_dict so _load_from_state_dict can modify it + state_dict = model_state.copy() + + missing_keys = [] + unexpected_keys = [] + error_msgs = [] + metadata = getattr(state_dict, '_metadata', None) + def load(module, prefix=''): + local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {}) + module._load_from_state_dict( + state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs) + for name, child in module._modules.items(): + if child is not None: + load(child, prefix + name + '.') + load(model) + logger.warning(f'Missing keys: {missing_keys}, unexpected_keys: {unexpected_keys}, error_msgs: {error_msgs}') + return model diff --git a/nlu/DeBERTa/deberta/ops.py b/nlu/DeBERTa/deberta/ops.py new file mode 100644 index 0000000000000000000000000000000000000000..be10d3c9afd4b13c67ad767ff32163f1f30a4e55 --- /dev/null +++ b/nlu/DeBERTa/deberta/ops.py @@ -0,0 +1,228 @@ +# Copyright (c) Microsoft, Inc. 2020 +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +# +# Author: penhe@microsoft.com +# Date: 01/15/2020 +# + +import pdb +import math +from packaging import version +import torch +from torch.nn import LayerNorm +from ..utils.jit_tracing import traceable + +if version.Version(torch.__version__) >= version.Version('1.0.0'): + from torch import _softmax_backward_data as _softmax_backward_data +else: + from torch import softmax_backward_data as _softmax_backward_data + +__all__ = ['StableDropout', 'MaskedLayerNorm', 'XSoftmax', 'ACT2FN', 'LayerNorm'] + +@traceable +class XSoftmax(torch.autograd.Function): + """ Masked Softmax which is optimized for saving memory + + Args: + + input (:obj:`torch.tensor`): The input tensor that will apply softmax. + mask (:obj:`torch.IntTensor`): The mask matrix where 0 indicate that element will be ignored in the softmax caculation. + dim (int): The dimenssion that will apply softmax. 
+ + Example:: + + import torch + from DeBERTa.deberta import XSoftmax + # Make a tensor + x = torch.randn([4,20,100]) + # Create a mask + mask = (x>0).int() + y = XSoftmax.apply(x, mask, dim=-1) + + """ + + @staticmethod + def forward(self, input, mask, dim): + """ + """ + + self.dim = dim + if version.Version(torch.__version__) >= version.Version('1.2.0a'): + rmask = ~(mask.bool()) + else: + rmask = (1-mask).byte() # This line is not supported by Onnx tracing. + + output = input.masked_fill(rmask, float('-inf')) + output = torch.softmax(output, self.dim) + output.masked_fill_(rmask, 0) + self.save_for_backward(output) + return output + + @staticmethod + def backward(self, grad_output): + """ + """ + + output, = self.saved_tensors + if version.Version(torch.__version__) >= version.Version('1.11.0a'): + inputGrad = _softmax_backward_data(grad_output, output, self.dim, output.dtype) + else: + inputGrad = _softmax_backward_data(grad_output, output, self.dim, output) + return inputGrad, None, None + + @staticmethod + def symbolic(g, self, mask, dim): + import torch.onnx.symbolic_helper as sym_help + from torch.onnx.symbolic_opset9 import masked_fill, softmax + + mask_cast_value = g.op("Cast", mask, to_i=sym_help.cast_pytorch_to_onnx['Long']) + r_mask = g.op("Cast", g.op("Sub", g.op("Constant", value_t=torch.tensor(1, dtype=torch.int64)), mask_cast_value), to_i=sym_help.cast_pytorch_to_onnx['Byte']) + output = masked_fill(g, self, r_mask, g.op("Constant", value_t=torch.tensor(float('-inf')))) + output = softmax(g, output, dim) + return masked_fill(g, output, r_mask, g.op("Constant", value_t=torch.tensor(0, dtype=torch.uint8))) + +class DropoutContext(object): + def __init__(self): + self.dropout = 0 + self.mask = None + self.scale = 1 + self.reuse_mask = True + +def get_mask(input, local_context): + if not isinstance(local_context, DropoutContext): + dropout = local_context + mask = None + else: + dropout = local_context.dropout + dropout *= local_context.scale + mask = local_context.mask if local_context.reuse_mask else None + + if dropout>0 and mask is None: + if version.Version(torch.__version__) >= version.Version('1.2.0a'): + mask=(1-torch.empty_like(input).bernoulli_(1-dropout)).bool() + else: + mask=(1-torch.empty_like(input).bernoulli_(1-dropout)).byte() + + if isinstance(local_context, DropoutContext): + if local_context.mask is None: + local_context.mask = mask + + return mask, dropout + +@traceable +class XDropout(torch.autograd.Function): + @staticmethod + def forward(ctx, input, local_ctx): + mask, dropout = get_mask(input, local_ctx) + ctx.scale=1.0/(1-dropout) + if dropout>0: + ctx.save_for_backward(mask) + return input.masked_fill(mask, 0)*ctx.scale + else: + return input + + @staticmethod + def backward(ctx, grad_output): + if ctx.scale > 1: + mask, = ctx.saved_tensors + return grad_output.masked_fill(mask, 0)*ctx.scale, None + else: + return grad_output, None + +class StableDropout(torch.nn.Module): + """ Optimized dropout module for stabilizing the training + + Args: + + drop_prob (float): the dropout probabilities + + """ + + def __init__(self, drop_prob): + super().__init__() + self.drop_prob = drop_prob + self.count = 0 + self.context_stack = None + + def forward(self, x): + """ Call the module + + Args: + + x (:obj:`torch.tensor`): The input tensor to apply dropout + + + """ + if self.training and self.drop_prob>0: + return XDropout.apply(x, self.get_context()) + return x + + def clear_context(self): + self.count = 0 + self.context_stack = None + + def init_context(self, 
reuse_mask=True, scale = 1): + if self.context_stack is None: + self.context_stack = [] + self.count = 0 + for c in self.context_stack: + c.reuse_mask = reuse_mask + c.scale = scale + + def get_context(self): + if self.context_stack is not None: + if self.count >= len(self.context_stack): + self.context_stack.append(DropoutContext()) + ctx = self.context_stack[self.count] + ctx.dropout = self.drop_prob + self.count += 1 + return ctx + else: + return self.drop_prob + +def MaskedLayerNorm(layerNorm, input, mask = None): + """ Masked LayerNorm which will apply mask over the output of LayerNorm to avoid inaccurate updatings to the LayerNorm module. + + Args: + layernorm (:obj:`~DeBERTa.deberta.LayerNorm`): LayerNorm module or function + input (:obj:`torch.tensor`): The input tensor + mask (:obj:`torch.IntTensor`): The mask to applied on the output of LayerNorm where `0` indicate the output of that element will be ignored, i.e. set to `0` + + Example:: + + # Create a tensor b x n x d + x = torch.randn([1,10,100]) + m = torch.tensor([[1,1,1,0,0,0,0,0,0,0]], dtype=torch.int) + LayerNorm = DeBERTa.deberta.LayerNorm(100) + y = MaskedLayerNorm(LayerNorm, x, m) + + """ + output = layerNorm(input).to(input) + if mask is None: + return output + if mask.dim()!=input.dim(): + if mask.dim()==4: + mask=mask.squeeze(1).squeeze(1) + mask = mask.unsqueeze(2) + mask = mask.to(output.dtype) + return output*mask + +def gelu(x): + """Implementation of the gelu activation function. + For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): + 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) + """ + return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) + + +def swish(x): + return x * torch.sigmoid(x) + +def linear_act(x): + return x + +ACT2FN = {"gelu": torch.nn.functional.gelu, "relu": torch.nn.functional.relu, "swish": swish, "tanh": torch.tanh, "linear": linear_act, 'sigmoid': torch.sigmoid} + + diff --git a/nlu/DeBERTa/deberta/pooling.py b/nlu/DeBERTa/deberta/pooling.py new file mode 100644 index 0000000000000000000000000000000000000000..c08ed79d92efecd50d97adbabd8cae7776ba421d --- /dev/null +++ b/nlu/DeBERTa/deberta/pooling.py @@ -0,0 +1,88 @@ +# +# Author: penhe@microsoft.com +# Date: 01/25/2019 +# +""" +Pooling functions +""" + +from torch import nn +import copy +import json +import pdb +from .bert import ACT2FN +from .ops import StableDropout +from .config import AbsModelConfig + +__all__ = ['PoolConfig', 'ContextPooler'] + +class PoolConfig(AbsModelConfig): + """Configuration class to store the configuration of `pool layer`. + + Parameters: + + config (:class:`~DeBERTa.deberta.ModelConfig`): The model config. The field of pool config will be initalized with the `pooling` field in model config. + + Attributes: + + hidden_size (int): Size of the encoder layers and the pooler layer, default: `768`. + + dropout (float): The dropout rate applied on the output of `[CLS]` token, + + hidden_act (:obj:`str`): The activation function of the projection layer, it can be one of ['gelu', 'tanh']. + + Example:: + + # Here is the content of an exmple model config file in json format + + { + "hidden_size": 768, + "num_hidden_layers" 12, + "num_attention_heads": 12, + "intermediate_size": 3072, + ... + "pooling": { + "hidden_size": 768, + "hidden_act": "gelu", + "dropout": 0.1 + } + } + + """ + def __init__(self, config=None): + """Constructs PoolConfig. + + Args: + `config`: the config of the model. 
The field of pool config will be initalized with the 'pooling' field in model config. + """ + + self.hidden_size = 768 + self.dropout = 0 + self.hidden_act = 'gelu' + if config: + pool_config = getattr(config, 'pooling', config) + if isinstance(pool_config, dict): + pool_config = AbsModelConfig.from_dict(pool_config) + self.hidden_size = getattr(pool_config, 'hidden_size', config.hidden_size) + self.dropout = getattr(pool_config, 'dropout', 0) + self.hidden_act = getattr(pool_config, 'hidden_act', 'gelu') + +class ContextPooler(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.dropout = StableDropout(config.dropout) + self.config = config + + def forward(self, hidden_states, mask = None): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + + context_token = hidden_states[:, 0] + context_token = self.dropout(context_token) + pooled_output = self.dense(context_token) + pooled_output = ACT2FN[self.config.hidden_act](pooled_output) + return pooled_output + + def output_dim(self): + return self.config.hidden_size diff --git a/nlu/DeBERTa/deberta/pretrained_models.py b/nlu/DeBERTa/deberta/pretrained_models.py new file mode 100644 index 0000000000000000000000000000000000000000..139597f9cb07c5d48bed18984ec4747f4b4f3438 --- /dev/null +++ b/nlu/DeBERTa/deberta/pretrained_models.py @@ -0,0 +1,2 @@ + + diff --git a/nlu/DeBERTa/deberta/spm_tokenizer.py b/nlu/DeBERTa/deberta/spm_tokenizer.py new file mode 100644 index 0000000000000000000000000000000000000000..db8fbdabe135e170b5f7ade94130881cacc64e13 --- /dev/null +++ b/nlu/DeBERTa/deberta/spm_tokenizer.py @@ -0,0 +1,322 @@ +# Copyright (c) Microsoft, Inc. 2020 +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
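# A minimal sketch of the ContextPooler defined above: it projects the hidden state of the
# first ([CLS]) token through a dense layer and the configured activation. The PoolConfig
# values here are illustrative assumptions.
import torch
from DeBERTa.deberta.config import AbsModelConfig
from DeBERTa.deberta.pooling import PoolConfig, ContextPooler

model_config = AbsModelConfig.from_dict({
    'hidden_size': 768,
    'pooling': {'hidden_size': 768, 'dropout': 0.1, 'hidden_act': 'gelu'}})
pooler = ContextPooler(PoolConfig(model_config))

hidden_states = torch.randn(2, 16, 768)      # [batch, seq_len, hidden]
pooled = pooler(hidden_states)               # pooled from position 0
print(pooled.shape)                          # torch.Size([2, 768])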
+# +# Author: penhe@microsoft.com +# Date: 11/15/2020 +# + + +import sentencepiece as sp +import six +import unicodedata +import os +import regex as re +from .cache_utils import load_vocab +from ..utils import get_logger +logger=get_logger() + + +import pdb + +__all__ = ['SPMTokenizer'] + +class SPMTokenizer: + def __init__(self, vocab_file, do_lower_case=False, special_tokens=None, bpe_dropout=0, split_by_punct=False): + self.split_by_punct = split_by_punct + spm = sp.SentencePieceProcessor() + assert os.path.exists(vocab_file) + spm.load(vocab_file) + bpe_vocab_size = spm.GetPieceSize() + # Token map + # 0+1 + # 1+1 + # 2+1 + self.vocab = {spm.IdToPiece(i):i for i in range(bpe_vocab_size)} + self.id_to_tokens = [spm.IdToPiece(i) for i in range(bpe_vocab_size)] + #self.vocab['[PAD]'] = 0 + #self.vocab['[CLS]'] = 1 + #self.vocab['[SEP]'] = 2 + #self.vocab['[UNK]'] = 3 + + _special_tokens = ['[MASK]', '[SEP]', '[PAD]', '[UNK]', '[CLS]'] + self.special_tokens = [] + if special_tokens is not None: + _special_tokens.extend(special_tokens) + for t in _special_tokens: + self.add_special_token(t) + + self.spm = spm + self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") + + def tokenize(self, text): + pieces = self._encode_as_pieces(text) + def _norm(x): + if x not in self.vocab or x=='': + return '[UNK]' + else: + return x + pieces = [_norm(p) for p in pieces] + return pieces + + def convert_tokens_to_ids(self, tokens): + return [self.vocab[t] if t in self.vocab else 1 for t in tokens] + + def convert_ids_to_tokens(self, ids): + tokens = [] + for i in ids: + tokens.append(self.ids_to_tokens[i]) + return tokens + + def decode(self, tokens, start=-1, end=-1, raw_text=None): + if raw_text is None: + return self.spm.decode_pieces([t for t in tokens if t not in self.special_tokens]) + else: + words = self.split_to_words(raw_text) + word_tokens = [self.tokenize(w) for w in words] + wt = [w for t in word_tokens for w in t] + #assert tokens == wt, f'{tokens} || {wt}' + if wt!=tokens: + for a,b in zip(wt, tokens): + if a!=b: + pdb.set_trace() + token2words = [0]*len(tokens) + tid = 0 + for i,w in enumerate(word_tokens): + for k,t in enumerate(w): + token2words[tid] = i + tid += 1 + word_start = token2words[start] + word_end = token2words[end] if end prev_end: + words.append(text[prev_end:offset]) + prev_end = offset + w = p.replace(word_start, '') + else: + w = p + try: + s = text.index(w, offset) + pn = "" + k = i+1 + while k < len(pieces): + pn = pieces[k].replace(word_start, '') + if len(pn)>0: + break + k += 1 + + if len(pn)>0 and pn in text[offset:s]: + offset = offset + 1 + else: + offset = s + len(w) + except: + offset = offset + 1 + + if prev_end< offset: + words.append(text[prev_end:offset]) + + return words + + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize("NFD", text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == "Mn": + continue + output.append(char) + return "".join(output) + + def _run_split_on_punc(self, text): + """Splits punctuation on a piece of text.""" + #words = list(re.findall(self.pat, text)) + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return ["".join(x) for x in output] + + def 
_tokenize_chinese_chars(self, text): + """Adds whitespace around any CJK character.""" + output = [] + for char in text: + cp = ord(char) + if self._is_chinese_char(cp): + output.append(" ") + output.append(char) + output.append(" ") + else: + output.append(char) + return "".join(output) + + def _is_chinese_char(self, cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. + if ((cp >= 0x4E00 and cp <= 0x9FFF) or # + (cp >= 0x3400 and cp <= 0x4DBF) or # + (cp >= 0x20000 and cp <= 0x2A6DF) or # + (cp >= 0x2A700 and cp <= 0x2B73F) or # + (cp >= 0x2B740 and cp <= 0x2B81F) or # + (cp >= 0x2B820 and cp <= 0x2CEAF) or + (cp >= 0xF900 and cp <= 0xFAFF) or # + (cp >= 0x2F800 and cp <= 0x2FA1F)): # + return True + + return False + + def _clean_text(self, text): + """Performs invalid character removal and whitespace cleanup on text.""" + output = [] + for char in text: + cp = ord(char) + if cp == 0 or cp == 0xfffd or _is_control(char): + continue + if _is_whitespace(char): + output.append(" ") + else: + output.append(char) + return "".join(output) + + +def _is_whitespace(char): + """Checks whether `chars` is a whitespace character.""" + # \t, \n, and \r are technically contorl characters but we treat them + # as whitespace since they are generally considered as such. + if char == " " or char == "\t" or char == "\n" or char == "\r": + return True + cat = unicodedata.category(char) + if cat == "Zs": + return True + return False + +def _is_control(char): + """Checks whether `chars` is a control character.""" + # These are technically control characters but we count them as whitespace + # characters. + if char == "\t" or char == "\n" or char == "\r": + return False + cat = unicodedata.category(char) + if cat.startswith("C"): + return True + return False + +def _is_punctuation(char): + """Checks whether `chars` is a punctuation character.""" + cp = ord(char) + # We treat all non-letter/number ASCII as punctuation. + # Characters such as "^", "$", and "`" are not in the Unicode + # Punctuation class but we treat them as punctuation anyways, for + # consistency. 
+ if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or + (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): + return True + cat = unicodedata.category(char) + if cat.startswith("P"): + return True + return False + +def whitespace_tokenize(text): + """Runs basic whitespace cleaning and splitting on a peice of text.""" + text = text.strip() + if not text: + return [] + tokens = text.split() + return tokens + +def convert_to_unicode(text): + """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" + if six.PY3: + if isinstance(text, str): + return text + elif isinstance(text, bytes): + return text.decode("utf-8", "ignore") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + elif six.PY2: + if isinstance(text, str): + return text.decode("utf-8", "ignore") + elif isinstance(text, unicode): + return text + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + else: + raise ValueError("Not running on Python2 or Python 3?") + diff --git a/nlu/DeBERTa/deberta/tokenizers.py b/nlu/DeBERTa/deberta/tokenizers.py new file mode 100644 index 0000000000000000000000000000000000000000..fca7a8a71d50c6dafa93101e73db263e2cc0ffa3 --- /dev/null +++ b/nlu/DeBERTa/deberta/tokenizers.py @@ -0,0 +1,16 @@ +# +# Author: penhe@microsoft.com +# Date: 04/25/2019 +# + +""" tokenizers +""" + +from .spm_tokenizer import * +from .gpt2_tokenizer import GPT2Tokenizer + +__all__ = ['tokenizers'] +tokenizers={ + 'gpt2': GPT2Tokenizer, + 'spm': SPMTokenizer + } diff --git a/nlu/DeBERTa/optims/__init__.py b/nlu/DeBERTa/optims/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e63497baeab8603a50e6d98f028c81a3a392e6f2 --- /dev/null +++ b/nlu/DeBERTa/optims/__init__.py @@ -0,0 +1,16 @@ +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +# +# Author: Pengcheng He (penhe@microsoft.com) +# Date: 05/15/2019 +# + +""" optimizers +""" + +from .xadam import XAdam +from .fp16_optimizer import * +from .lr_schedulers import SCHEDULES +from .args import get_args + diff --git a/nlu/DeBERTa/optims/args.py b/nlu/DeBERTa/optims/args.py new file mode 100644 index 0000000000000000000000000000000000000000..b89c5c35f16d0b5a3b3ef59244ef391fa9ada04a --- /dev/null +++ b/nlu/DeBERTa/optims/args.py @@ -0,0 +1,100 @@ +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+# +# Author: Pengcheng He (penhe@microsoft.com) +# Date: 05/15/2019 +# + +""" Arguments for optimizer +""" +import argparse +from ..utils import boolean_string + +__all__ = ['get_args'] +def get_args(): + parser=argparse.ArgumentParser(add_help=False, formatter_class=argparse.ArgumentDefaultsHelpFormatter) + group = parser.add_argument_group(title='Optimizer', description='Parameters for the distributed optimizer') + group.add_argument('--fp16', + default=False, + type=boolean_string, + help="Whether to use 16-bit float precision instead of 32-bit") + + group.add_argument('--loss_scale', + type=float, default=16384, + help='Loss scaling, positive power of 2 values can improve fp16 convergence.') + + group.add_argument('--scale_steps', + type=int, default=250, + help='The steps to wait to increase the loss scale.') + + group.add_argument('--lookahead_k', + default=-1, + type=int, + help="lookahead k parameter") + + group.add_argument('--lookahead_alpha', + default=0.5, + type=float, + help="lookahead alpha parameter") + + group.add_argument('--with_radam', + default=False, + type=boolean_string, + help="whether to use RAdam") + + group.add_argument('--opt_type', + type=str.lower, + default='adam', + choices=['adam', 'admax'], + help="The optimizer to be used.") + + group.add_argument("--warmup_proportion", + default=0.1, + type=float, + help="Proportion of training to perform linear learning rate warmup for. " + "E.g., 0.1 = 10%% of training.") + + group.add_argument("--lr_schedule_ends", + default=0, + type=float, + help="The ended learning rate scale for learning rate scheduling") + + group.add_argument("--lr_schedule", + default='warmup_linear', + type=str, + help="The learning rate scheduler used for traning. " + + "E.g. warmup_linear, warmup_linear_shift, warmup_cosine, warmup_constant. Default, warmup_linear") + + group.add_argument("--max_grad_norm", + default=1, + type=float, + help="The clip threshold of global gradient norm") + + group.add_argument("--learning_rate", + default=5e-5, + type=float, + help="The initial learning rate for Adam.") + + group.add_argument("--epsilon", + default=1e-6, + type=float, + help="epsilon setting for Adam.") + + group.add_argument("--adam_beta1", + default=0.9, + type=float, + help="The beta1 parameter for Adam.") + + group.add_argument("--adam_beta2", + default=0.999, + type=float, + help="The beta2 parameter for Adam.") + + group.add_argument('--weight_decay', + type=float, + default=0.01, + help="The weight decay rate") + + return parser + diff --git a/nlu/DeBERTa/optims/fp16_optimizer.py b/nlu/DeBERTa/optims/fp16_optimizer.py new file mode 100644 index 0000000000000000000000000000000000000000..044a8de0311847684dbc8313d12497def5155fa6 --- /dev/null +++ b/nlu/DeBERTa/optims/fp16_optimizer.py @@ -0,0 +1,301 @@ +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+# +# Author: Pengcheng He (penhe@microsoft.com) +# Date: 05/15/2019 +# + +""" FP16 optimizer wrapper +""" + +from collections import defaultdict +import numpy as np +import math +import torch +import pdb +import torch.distributed as dist +from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors +import ctypes + +from ..utils import get_logger,boolean_string +logger=get_logger() + +__all__ = ['Fp16Optimizer', 'ExpLossScaler', 'get_world_size'] + +def get_world_size(): + try: + wd = dist.get_world_size() + return wd + except: + return 1 + +def fused_norm(input): + return torch.norm(input, p=2, dtype=torch.float32) + +class OptParameter(torch.Tensor): + def __new__(cls, data, out_data=None, grad=None, name=None): + param = torch.Tensor._make_subclass(cls, data) + param._xgrad = grad + param.out_data = out_data + param._name = name + return param + + @property + def name(self): + return self._name + + @property + def grad(self): + return self._xgrad + + @grad.setter + def grad(self, grad): + self._xgrad = grad + +class Fp16Optimizer(object): + def __init__(self, param_groups, optimizer_fn, loss_scaler=None, grad_clip_norm = 1.0, lookahead_k = -1, lookahead_alpha = 0.5, rank=-1, distributed=False): + # all parameters should on the same device + groups = [] + original_groups = [] + self.rank = rank + self.distributed = distributed + if self.rank<0: + self.distributed = False + for group in param_groups: + if 'offset' not in group: + group['offset'] = None + if ('rank' not in group) or (not self.distributed): + group['rank'] = -1 + assert group['offset'] is None, f"{group['names']}: {group['offset']}" + group_rank = group['rank'] + params = group['params'] # parameter + if len(params) > 1: + flattened_params = _flatten_dense_tensors([p.data for p in params]) + unflattend_params = _unflatten_dense_tensors(flattened_params, [p.data for p in params]) + for uf,p in zip(unflattend_params, params): + p.data = uf + else: + flattened_params = params[0].data.view(-1) + if group['offset'] is not None: + start, length = group['offset'] + flattened_params = flattened_params.narrow(0, start, length) + + if params[0].dtype==torch.half: + if self.rank == group_rank or (not self.distributed): + master_params = flattened_params.clone().to(torch.float).detach_().to(flattened_params.device) + else: + master_params = flattened_params.clone().to(torch.float).detach_().cpu() + group['params'] = [OptParameter(master_params, flattened_params, name='master')] + else: + group['params'] = [OptParameter(flattened_params, None, name='master')] + + o_group = defaultdict(list) + o_group['names'] = group['names'] + o_group['params'] = params + o_group['rank'] = group_rank + o_group['offset'] = group['offset'] + + group['names'] = ['master'] + + original_groups.append(o_group) + groups.append(group) + self.param_groups = groups + self.loss_scaler = loss_scaler + self.optimizer = optimizer_fn(self.param_groups) + self.original_param_groups = original_groups + self.max_grad_norm = grad_clip_norm + self.lookahead_k = lookahead_k + self.lookahead_alpha = lookahead_alpha + + def backward(self, loss): + if self.loss_scaler: + loss_scale, loss, step_loss = self.loss_scaler.scale(loss) + else: + loss_scale = 1 + step_loss = loss.item() + + loss.backward() + return loss_scale, step_loss + + def step(self, lr_scale, loss_scale = 1): + grad_scale = self._grad_scale(loss_scale) + if grad_scale is None or math.isinf(grad_scale): + self.loss_scaler.update(False) + return False + + if self.lookahead_k > 0: + for p in 
self.param_groups: + if 'la_count' not in p: + # init + #make old copy + p['la_count'] = 0 + p['slow_params'] = [x.data.detach().clone().requires_grad_(False) for x in p['params']] + self.optimizer.step(grad_scale, lr_scale) + + # for group in self.param_groups: + # for p in group['params']: + # # p.data : master fp32 + # # p.out_data : fp16 tensor backing model nn.Parameters + # if hasattr(p, 'out_data') and p.out_data is not None: + # p.out_data.copy_(p.data, non_blocking=True) + + if self.lookahead_k > 0: + for p in self.param_groups: + p['la_count'] += 1 + if p['la_count'] == self.lookahead_k: + p['la_count'] = 0 + for s,f in zip(p['slow_params'], p['params']): + s.mul_(1-self.lookahead_alpha) + s.add_(f.data.detach()*self.lookahead_alpha) + f.data.copy_(s, non_blocking=True) + if hasattr(f, 'out_data') and f.out_data is not None: + f.out_data.copy_(f.data, non_blocking=True) + + if self.loss_scaler: + self.loss_scaler.update(True) + return True + + def zero_grad(self): + for group, o_group in zip(self.param_groups, self.original_param_groups): + for p in group['params']: + p.grad = None + for p in o_group['params']: + p.grad = None + + def _grad_scale(self, loss_scale = 1): + named_params = {} + named_grads = {} + for g in self.original_param_groups: + for n,p in zip(g['names'], g['params']): + named_params[n] = p + named_grads[n] = p.grad if p.grad is not None else torch.zeros_like(p.data) + + wd = get_world_size() + def _reduce(group): + grads = [named_grads[n] for n in group] + if len(grads)>1: + flattened_grads = _flatten_dense_tensors(grads) + else: + flattened_grads = grads[0].view(-1) + + if wd > 1: + flattened_grads /= wd + handle = dist.all_reduce(flattened_grads, async_op=True) + else: + handle = None + return flattened_grads, handle + + def _process_grad(group, flattened_grads, max_grad, norm): + grads = [named_grads[n] for n in group] + norm = norm.to(flattened_grads.device) + norm = norm + fused_norm(flattened_grads)**2 + + if len(grads) > 1: + unflattend_grads = _unflatten_dense_tensors(flattened_grads, grads) + else: + unflattend_grads = [flattened_grads] + + for n,ug in zip(group, unflattend_grads): + named_grads[n] = ug #.to(named_params[n].data) + + return max_grad, norm + + group_size = 0 + group = [] + max_size = 32*1024*1024 + norm = torch.zeros(1, dtype=torch.float) + max_grad = 0 + + all_grads = [] + for name in sorted(named_params.keys(), key=lambda x:x.replace('deberta.', 'bert.')): + group.append(name) + group_size += named_params[name].data.numel() + if group_size>=max_size: + flatten, handle = _reduce(group) + all_grads.append([handle, flatten, group]) + group = [] + group_size = 0 + if group_size>0: + flatten, handle = _reduce(group) + all_grads.append([handle, flatten, group]) + group = [] + group_size = 0 + for h,fg,group in all_grads: + if h is not None: + h.wait() + max_grad, norm = _process_grad(group, fg, max_grad, norm) + + norm = norm**0.5 + if torch.isnan(norm) or torch.isinf(norm): #in ['-inf', 'inf', 'nan']: + return None + + scaled_norm = norm.detach().item()/loss_scale + grad_scale = loss_scale + + if self.max_grad_norm>0: + scale = norm/(loss_scale*self.max_grad_norm) + if scale>1: + grad_scale *= scale + + for group, o_g in zip(self.param_groups, self.original_param_groups): + grads = [named_grads[n] for n in o_g['names']] + + if len(grads) > 1: + flattened_grads = _flatten_dense_tensors(grads) + else: + flattened_grads = grads[0].view(-1) + if group['offset'] is not None: + start, length = group['offset'] + flattened_grads = 
flattened_grads.narrow(0, start, length) + if group['rank'] == self.rank or (not self.distributed): + group['params'][0].grad = flattened_grads + + return grad_scale + +class ExpLossScaler: + def __init__(self, init_scale=2**16, scale_interval=1000): + self.cur_scale = init_scale + self.scale_interval = scale_interval + self.invalid_cnt = 0 + self.last_scale = 0 + self.steps = 0 + self.down_scale_smooth = 0 + + def scale(self, loss): + assert self.cur_scale > 0, self.init_scale + step_loss = loss.float().detach().item() + if step_loss != 0 and math.isfinite(step_loss): + loss_scale = self.cur_scale + else: + loss_scale = 1 + loss = loss.float()*loss_scale + return (loss_scale, loss, step_loss) + + def update(self, is_valid = True): + if not is_valid: + self.invalid_cnt += 1 + if self.invalid_cnt>self.down_scale_smooth: + self.cur_scale /= 2 + self.cur_scale = max(self.cur_scale, 1) + self.last_scale = self.steps + else: + self.invalid_cnt = 0 + if self.steps - self.last_scale>self.scale_interval: + self.cur_scale *= 2 + self.last_scale = self.steps + self.steps += 1 + + def state_dict(self): + state = defaultdict(float) + state['steps'] = self.steps + state['invalid_cnt'] = self.invalid_cnt + state['cur_scale'] = self.cur_scale + state['last_scale'] = self.last_scale + return state + + def load_state_dict(self, state): + self.steps = state['steps'] + self.invalid_cnt = state['invalid_cnt'] + self.cur_scale = state['cur_scale'] + self.last_scale = state['last_scale'] diff --git a/nlu/DeBERTa/optims/lr_schedulers.py b/nlu/DeBERTa/optims/lr_schedulers.py new file mode 100644 index 0000000000000000000000000000000000000000..51be4c02f1d4cd8ff59c7f092efa8ef6a0011126 --- /dev/null +++ b/nlu/DeBERTa/optims/lr_schedulers.py @@ -0,0 +1,63 @@ +""" Learning rate schedulers +""" + +import math +import torch +from torch.optim import Optimizer +from torch.nn.utils import clip_grad_norm_ + +def warmup_cosine(step, total, warmup=0.002, ends = 0): + x = step/total + x = x-int(x) + if x < warmup: + return x/warmup + return 0.5 * (1.0 + math.cos(math.pi * x)) + +def warmup_constant(step, total, warmup=0.002, ends = 0): + x = step/total + x = x-int(x) + if x < warmup: + return x/warmup + return 1.0 + +def warmup_linear(step, total, warmup=0.002, ends = 0): + x = step/total + x = x-int(x) + if x < warmup: + return x/warmup + return (1-ends)*(1.0 - x) + ends + +def warmup_linear_cosine(step, total, warmup=0.002, ends = 0): + x = step/total + x = x-int(x) + if x < warmup: + return x/warmup + return (1-ends)*max(0.5*(1+math.cos(math.pi*(x-warmup)/(1-warmup))), 0) + ends + +def warmup_cyclic_linear_cosine(step, total, warmup=0.002, ends = 0): + x = step/total + if x < warmup: + return x/warmup + total = total - int(total*warmup) + step = step - int(total*warmup) + n_epoch = 4 + period = total//n_epoch + k = step//period + s = 1-k/n_epoch + 1/(2*n_epoch)*(math.pow(-1, k)*math.cos(math.pi*step/period)-1) + return (1-ends)*max(s, 0) + ends + +def warmup_linear_shift(step, total, warmup=0.002, ends = 0): + x = step/total + x = x-int(x) + if x < warmup: + return x/warmup + return (1-ends)*(1.0 - (x-warmup)/(1-warmup)) + ends + +SCHEDULES = { + 'warmup_cosine':warmup_cosine, + 'warmup_constant':warmup_constant, + 'warmup_linear':warmup_linear, + 'warmup_linear_cosine':warmup_linear_cosine, + 'warmup_cyclic_linear_cosine':warmup_cyclic_linear_cosine, + 'warmup_linear_shift':warmup_linear_shift, +} diff --git a/nlu/DeBERTa/optims/xadam.py b/nlu/DeBERTa/optims/xadam.py new file mode 100644 index 
0000000000000000000000000000000000000000..373075c306c13e6d137e939d3db4eaa0099d2a6f --- /dev/null +++ b/nlu/DeBERTa/optims/xadam.py @@ -0,0 +1,214 @@ +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +# +# Author: Pengcheng He (penhe@microsoft.com) +# Date: 05/15/2019 +# + +""" Optimizer +""" + +import math +import torch +from torch.optim import Optimizer +from torch.nn.utils import clip_grad_norm_ +from torch import distributed as dist +import pdb +from .lr_schedulers import SCHEDULES +from ..utils import get_logger + +def adamw(data, + out_data, + next_m, + next_v, + grad, + lr, + beta1, + beta2, + eps, + grad_scale, #combined_scale, g = g/scale + step, + eps_mode = 1, #self.eps_mode, esp inside sqrt:0, outside: 1, only update with momentum: 2 + bias_correction = 0, + weight_decay = 0): + if bias_correction > 0: + lr *= bias_correction + beta1_ = 1 - beta1 + beta2_ = 1 - beta2 + grad = grad.float() + if grad_scale != 1: + grad *= 1/grad_scale + next_m.mul_(beta1).add_(beta1_, grad) + # admax + admax = eps_mode>>4 + eps_mode = eps_mode&0xF + if admax > 0: + torch.max(next_v.mul_(beta2), grad.abs().to(next_v), out=next_v) + update = next_m/(next_v+eps) + else: + next_v.mul_(beta2).addcmul_(beta2_, grad, grad) + if eps_mode == 0: + update = (next_m)*(next_v+eps).rsqrt() + elif eps_mode == 1: + update = (next_m)/(next_v.sqrt()+eps) + else: #=2 + update = next_m.clone() + if weight_decay>0: + update.add_(weight_decay, data) + + data.add_(-lr, update) + if (out_data is not None) and len(out_data)>0: + out_data.copy_(data) + +class XAdam(Optimizer): + """Implements optimized version of Adam algorithm with weight decay fix. + Params: + lr: learning rate + warmup: portion of t_total for the warmup, -1 means no warmup. Default: -1 + t_total: total number of training steps for the learning + rate schedule, -1 means constant learning rate. Default: -1 + schedule: schedule to use for the warmup (see above). Default: 'warmup_linear' + b1: Adams b1. Default: 0.9 + b2: Adams b2. Default: 0.999 + e: Adams epsilon. Default: 1e-6 + weight_decay_rate: Weight decay. Default: 0.01 + max_grad_norm: Maximum norm for the gradients (-1 means no clipping). Default: 1.0 + with_radam: Whether to enable radam. Default: False + radam_th: RAdam threshold for tractable variance. 
Default: 4 + opt_type: The type of optimizer, [adam, admax], default: adam + """ + def __init__(self, params, lr, warmup=-1, t_total=-1, schedule='warmup_linear', + b1=0.9, b2=0.999, e=1e-8, weight_decay_rate=0.01, + lr_ends = 0, + max_grad_norm = 1.0, + with_radam = False, + radam_th = 4, + opt_type=None, + rank = -1): + if not lr >= 0.0: + raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr)) + if schedule not in SCHEDULES: + raise ValueError("Invalid schedule parameter: {}".format(schedule)) + if not 0.0 <= warmup < 1.0 and not warmup == -1: + raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup)) + if not 0.0 <= b1 < 1.0: + raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1)) + if not 0.0 <= b2 < 1.0: + raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2)) + if not e >= 0.0: + raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e)) + self.defaults = dict(lr=lr, schedule=schedule, warmup=warmup, t_total=t_total, + b1=b1, b2=b2, e=e, weight_decay_rate=weight_decay_rate, + lr_ends = lr_ends, + max_grad_norm=max_grad_norm, + with_radam = with_radam, radam_th = radam_th) + self.opt_type = opt_type.lower() if opt_type is not None else "" + self.rank = rank + super().__init__(params, self.defaults) + + def step(self, grad_scale = 1, lr_scale = 1): + """Performs a single optimization step. + + Arguments: + grad_scale: divid grad by grad_scale + lr_scale: scale learning rate by bs_scale + """ + if 'global_step' not in self.state: + self.state['global_step'] = 0 + for group in self.param_groups: + lr_sch = self.get_group_lr_sch(group, self.state['global_step']) + if group['rank'] == self.rank or group['rank']<0 or self.rank<0: + for param in group['params']: + self.update_param(group, param, grad_scale, lr_scale) + + self.state['global_step'] += 1 + self.last_grad_scale = grad_scale + handels = [] + for group in self.param_groups: + if group['rank']>=0 and self.rank>=0: + # sync + for param in group['params']: + out_p = param.out_data if hasattr(param, 'out_data') and (param.out_data is not None) else None + if out_p is not None: + h = torch.distributed.broadcast(out_p, group['rank'], async_op=True) + else: + h = torch.distributed.broadcast(param.data, group['rank'], async_op=True) + handels.append(h) + + for h in handels: + if h is not None: + h.wait() + + return lr_sch + + def get_group_lr_sch(self, group, steps): + if group['t_total'] > 0: + schedule_fct = SCHEDULES[group['schedule']] + lr_scheduled = schedule_fct(steps, group['t_total'], group['warmup'], group['lr_ends']) + else: + lr_scheduled = 1 + return lr_scheduled + + def update_param(self, group, param, grad_scale, lr_scale): + grad = param.grad + if grad.is_sparse: + raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') + state = self.get_state(param) + lr_sch = self.get_group_lr_sch(group, state['step']) + lr = group['lr'] * lr_scale *lr_sch + next_m, next_v = state['next_m'], state['next_v'] + beta1, beta2 = group['b1'], group['b2'] + state['step'] += 1 + + # Support for RAdam + t = (state['step']-1) + 1 + eps_mode = 1 + if group['with_radam']: + rou_ = 2/(1-beta2) - 1 + rou_t = rou_ - 2*t/(beta2**-t - 1) + bias_c = 1/(1-beta1**t) + if rou_t > group['radam_th']: + bias_c *= math.sqrt(1 - beta2**t) + bias_c *= math.sqrt(((rou_t - 4)*(rou_t - 2)*rou_)/((rou_ - 4)*(rou_ - 2)*rou_t)) + else: + eps_mode = 2 + bias_c = 0 + lr *= bias_c + + if self.opt_type == 'admax': 
+ eps_mode |= 0x10 + + with torch.cuda.device(param.device.index): + out_p = param.out_data if hasattr(param, 'out_data') and (param.out_data is not None) else None + if out_p is None or out_p.dtype != grad.dtype: + out_p = torch.tensor([], dtype=torch.float).to(param.data) + + weight_decay = group['weight_decay_rate'] + adamw(param.data, + out_p, + next_m, + next_v, + grad, + lr, + beta1, + beta2, + group['e'], + grad_scale, #combined_scale, g = g/scale + state['step'], + eps_mode, #self.eps_mode, esp inside sqrt:0, outside: 1, only update with momentum: 2 + 0, #bias_correction, + weight_decay) + + out_p = param.out_data if hasattr(param, 'out_data') and (param.out_data is not None) else None + if out_p is not None and out_p.dtype != grad.dtype: + out_p.copy_(param.data) + + def get_state(self, param): + state = self.state[param] + # State initialization + if len(state) == 0: + state['step'] = 0 + state['next_m'] = torch.zeros_like(param.data) + state['next_v'] = torch.zeros_like(param.data) + return state diff --git a/nlu/DeBERTa/sift/README.md b/nlu/DeBERTa/sift/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e82b8083db75c7b77d1673b467f117e44e0b87a8 --- /dev/null +++ b/nlu/DeBERTa/sift/README.md @@ -0,0 +1,53 @@ +# SiFT (Scale Invariant Fine-Tuning) + +## Usage + +For example to try SiFT in DeBERTa, please check `experiments/glue/mnli.sh base-sift` or `experiments/glue/mnli.sh xxlarge-v2-sift` + + +Here is an example to consume SiFT in your existing code, + + ```python + # Create DeBERTa model + adv_modules = hook_sift_layer(model, hidden_size=768) + adv = AdversarialLearner(model, adv_modules) + def logits_fn(model, *wargs, **kwargs): + logits,_ = model(*wargs, **kwargs) + return logits + logits,loss = model(**data) + + loss = loss + adv.loss(logits, logits_fn, **data) + # Other steps is the same as general training. 
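+  # Illustrative continuation only (assumes you already have an `optimizer`
+  # for the model; it is not part of the SiFT API):
+  # loss.backward()
+  # optimizer.step()
+  # optimizer.zero_grad()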
+ + ``` + +## Ablation study results + + +| Model | MNLI-m/mm | SST-2 | QNLI | CoLA | RTE | MRPC | QQP |STS-B | +|---------------------------|-------------|-------|------|------|--------|-------|-------|------| +| | Acc | Acc | Acc | MCC | Acc |Acc/F1 |Acc/F1 |P/S | +|**[DeBERTa-V2-XXLarge](https://huggingface.co/microsoft/deberta-v2-xxlarge)1,2**|91.7/91.9|97.2|96.0|72.0| 93.5| **93.1/94.9**|92.7/90.3 |93.2/93.1 | +|**[DeBERTa-V2-XXLarge+SiFT](https://huggingface.co/microsoft/deberta-v2-xxlarge)1,2**|**92.0/92.1**|97.5|**96.5**|**73.5**| **96.5**| - |**93.0/90.7** | - | + +# Citation +``` +@misc{he2020deberta, + title={DeBERTa: Decoding-enhanced BERT with Disentangled Attention}, + author={Pengcheng He and Xiaodong Liu and Jianfeng Gao and Weizhu Chen}, + year={2020}, + eprint={2006.03654}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} + +@article{Jiang_2020, + title={SMART: Robust and Efficient Fine-Tuning for Pre-trained Natural Language Models through Principled Regularized Optimization}, + url={http://dx.doi.org/10.18653/v1/2020.acl-main.197}, + DOI={10.18653/v1/2020.acl-main.197}, + journal={Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics}, + publisher={Association for Computational Linguistics}, + author={Jiang, Haoming and He, Pengcheng and Chen, Weizhu and Liu, Xiaodong and Gao, Jianfeng and Zhao, Tuo}, + year={2020} +} +``` diff --git a/nlu/DeBERTa/sift/__init__.py b/nlu/DeBERTa/sift/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..53c8e522655a235f120c2659e2f2959ff7709640 --- /dev/null +++ b/nlu/DeBERTa/sift/__init__.py @@ -0,0 +1 @@ +from .sift import * diff --git a/nlu/DeBERTa/sift/sift.py b/nlu/DeBERTa/sift/sift.py new file mode 100644 index 0000000000000000000000000000000000000000..e7b1de8e6c3159922b1df4698aecf43e78d9ddb6 --- /dev/null +++ b/nlu/DeBERTa/sift/sift.py @@ -0,0 +1,210 @@ +# Copyright (c) Microsoft, Inc. 2020 +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+# +# Author: penhe@microsoft.com +# Date: 01/25/2021 +# + +import torch +import torch.nn.functional as F + +__all__ = ['PerturbationLayer', 'AdversarialLearner', 'hook_sift_layer'] + +class PerturbationLayer(torch.nn.Module): + def __init__(self, hidden_size, learning_rate=1e-4, init_perturbation=1e-2): + super().__init__() + self.learning_rate = learning_rate + self.init_perturbation = init_perturbation + self.delta = None + self.LayerNorm = torch.nn.LayerNorm(hidden_size, 1e-7, elementwise_affine=False) + self.adversarial_mode = False + + def adversarial_(self, adversarial = True): + self.adversarial_mode = adversarial + if not adversarial: + self.delta = None + + def forward(self, input): + if not self.adversarial_mode: + self.input = self.LayerNorm(input) + return self.input + else: + if self.delta is None: + self.update_delta(requires_grad=True) + return self.perturbated_input + + def update_delta(self, requires_grad = False): + if not self.adversarial_mode: + return True + if self.delta is None: + delta = torch.clamp(self.input.new(self.input.size()).normal_(0, self.init_perturbation).float(), -2*self.init_perturbation, 2*self.init_perturbation) + else: + grad = self.delta.grad + self.delta.grad = None + delta = self.delta + norm = grad.norm() + if torch.isnan(norm) or torch.isinf(norm): + return False + eps = self.learning_rate + with torch.no_grad(): + delta = delta + eps*grad/(1e-6 + grad.abs().max(-1, keepdim=True)[0]) + self.delta = delta.float().detach().requires_grad_(requires_grad) + self.perturbated_input = (self.input.to(delta).detach() + self.delta).to(self.input) + return True + +def hook_sift_layer(model, hidden_size, learning_rate=1e-4, init_perturbation=1e-2, target_module = 'embeddings.LayerNorm'): + """ + Hook the sift perturbation layer to and existing model. With this method, you can apply adversarial training + without changing the existing model implementation. + + Params: + `model`: The model instance to apply adversarial training + `hidden_size`: The dimmension size of the perturbated embedding + `learning_rate`: The learning rate to update the perturbation + `init_perturbation`: The initial range of perturbation + `target_module`: The module to apply perturbation. It can be the name of the sub-module of the model or the sub-module instance. + The perturbation layer will be inserted before the sub-module. + + Outputs: + The perturbation layers. + + """ + + if isinstance(target_module, str): + _modules = [k for n,k in model.named_modules() if target_module in n] + else: + assert isinstance(target_module, torch.nn.Module), f'{type(target_module)} is not an instance of torch.nn.Module' + _modules = [target_module] + adv_modules = [] + for m in _modules: + adv = PerturbationLayer(hidden_size, learning_rate, init_perturbation) + def adv_hook(module, inputs): + return adv(inputs[0]) + for h in list(m._forward_pre_hooks.keys()): + if m._forward_pre_hooks[h].__name__ == 'adv_hook': + del m._forward_pre_hooks[h] + m.register_forward_pre_hook(adv_hook) + adv_modules.append(adv) + return adv_modules + +class AdversarialLearner: + """ Adversarial Learner + This class is the helper class for adversarial training. + + Params: + `model`: The model instance to apply adversarial training + `perturbation_modules`: The sub modules in the model that will generate perturbations. If it's `None`, + the constructor will detect sub-modules of type `PerturbationLayer` in the model. 
+ + Example usage: + ```python + # Create DeBERTa model + adv_modules = hook_sift_layer(model, hidden_size=768) + adv = AdversarialLearner(model, adv_modules) + def logits_fn(model, *wargs, **kwargs): + logits,_ = model(*wargs, **kwargs) + return logits + logits,loss = model(**data) + + loss = loss + adv.loss(logits, logits_fn, **data) + # Other steps is the same as general training. + + ``` + + """ + def __init__(self, model, adv_modules=None): + if adv_modules is None: + self.adv_modules = [m for m in model.modules() if isinstance(m, PerturbationLayer)] + else: + self.adv_modules = adv_modules + self.parameters = [p for p in model.parameters()] + self.model = model + + def loss(self, target, logits_fn, loss_fn = 'symmetric-kl', *wargs, **kwargs): + """ + Calculate the adversarial loss based on the given logits fucntion and loss function. + Inputs: + `target`: the logits from original inputs. + `logits_fn`: the function that produces logits based on perturbated inputs. E.g., + ```python + def logits_fn(model, *wargs, **kwargs): + logits = model(*wargs, **kwargs) + return logits + ``` + `loss_fn`: the function that caclulate the loss from perturbated logits and target logits. + - If it's a string, it can be pre-built loss functions, i.e. kl, symmetric_kl, mse. + - If it's a function, it will be called to calculate the loss, the signature of the function will be, + ```python + def loss_fn(source_logits, target_logits): + # Calculate the loss + return loss + ``` + `*wargs`: the positional arguments that will be passed to the model + `**kwargs`: the key-word arguments that will be passed to the model + Outputs: + The loss based on pertubated inputs. + """ + self.prepare() + if isinstance(loss_fn, str): + loss_fn = perturbation_loss_fns[loss_fn] + pert_logits = logits_fn(self.model, *wargs, **kwargs) + pert_loss = loss_fn(pert_logits, target.detach()).sum() + pert_loss.backward() + for m in self.adv_modules: + ok = m.update_delta(True) + + for r,p in zip(self.prev, self.parameters): + p.requires_grad_(r) + pert_logits = logits_fn(self.model, *wargs, **kwargs) + pert_loss = symmetric_kl(pert_logits, target) + + self.cleanup() + return pert_loss.mean() + + def prepare(self): + self.prev = [p.requires_grad for p in self.parameters] + for p in self.parameters: + p.requires_grad_(False) + for m in self.adv_modules: + m.adversarial_(True) + + def cleanup(self): + for r,p in zip(self.prev, self.parameters): + p.requires_grad_(r) + + for m in self.adv_modules: + m.adversarial_(False) + +def symmetric_kl(logits, target): + logit_stu = logits.view(-1, logits.size(-1)).float() + logit_tea = target.view(-1, target.size(-1)).float() + logprob_stu = F.log_softmax(logit_stu, -1) + logprob_tea = F.log_softmax(logit_tea, -1) + prob_tea = logprob_tea.exp().detach() + prob_stu = logprob_stu.exp().detach() + floss = ((prob_tea*(-logprob_stu)).sum(-1)) # Cross Entropy + bloss = ((prob_stu*(-logprob_tea)).sum(-1)) # Cross Entropy + loss = floss + bloss + return loss + +def kl(logits, target): + logit_stu = logits.view(-1, logits.size(-1)).float() + logit_tea = target.view(-1, target.size(-1)).float() + logprob_stu = F.log_softmax(logit_stu, -1) + logprob_tea = F.log_softmax(logit_tea.detach(), -1) + prob_tea = logprob_tea.exp() + loss = ((prob_tea*(-logprob_stu)).sum(-1)) # Cross Entropy + return loss + +def mse(logits, target): + logit_stu = logits.view(-1, logits.size(-1)).float() + logit_tea = target.view(-1, target.size(-1)).float() + return F.mse_loss(logit_stu.view(-1),logit_tea.view(-1)) + 
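+# Added note: a minimal sketch of a user-defined perturbation loss. Any callable
+# with the signature loss_fn(perturbed_logits, target_logits) can be passed to
+# AdversarialLearner.loss(...) instead of the names registered below; `smoothed_l1`
+# is a hypothetical example, not part of the original code.
+#
+# def smoothed_l1(logits, target):
+#   logit_stu = logits.view(-1, logits.size(-1)).float()
+#   logit_tea = target.view(-1, target.size(-1)).float().detach()
+#   return F.smooth_l1_loss(logit_stu, logit_tea, reduction='none').sum(-1)
+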
+perturbation_loss_fns = { + 'symmetric-kl': symmetric_kl, + 'kl': kl, + 'mse': mse + } diff --git a/nlu/DeBERTa/training/__init__.py b/nlu/DeBERTa/training/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..457c4574297d2c1927c524e2b8230d57a6585307 --- /dev/null +++ b/nlu/DeBERTa/training/__init__.py @@ -0,0 +1,4 @@ +from .trainer import DistributedTrainer, set_random_seed +from .args import get_args +from .dist_launcher import initialize_distributed,kill_children +from ._utils import batch_to,batch_apply diff --git a/nlu/DeBERTa/training/_utils.py b/nlu/DeBERTa/training/_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..5a54e53c72225b81f5495e631903b124929009bf --- /dev/null +++ b/nlu/DeBERTa/training/_utils.py @@ -0,0 +1,16 @@ +import torch +from collections.abc import Sequence, Mapping + +def batch_apply(batch, fn): + if isinstance(batch, torch.Tensor): + return fn(batch) + elif isinstance(batch, Sequence): + return [batch_apply(x, fn) for x in batch] + elif isinstance(batch, Mapping): + return {x:batch_apply(batch[x], fn) for x in batch} + else: + raise NotImplementedError(f'Type of {type(batch)} are not supported in batch_apply') + +def batch_to(batch, device): + return batch_apply(batch, lambda x: x.to(device)) + diff --git a/nlu/DeBERTa/training/args.py b/nlu/DeBERTa/training/args.py new file mode 100644 index 0000000000000000000000000000000000000000..771ee12f585dfb7123d7e5961641684db55230ce --- /dev/null +++ b/nlu/DeBERTa/training/args.py @@ -0,0 +1,72 @@ +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +# +# Author: Pengcheng He (penhe@microsoft.com) +# Date: 05/15/2019 +# + +import argparse +from ..utils import boolean_string + +__all__ = ['get_args'] + +def get_args(): + parser=argparse.ArgumentParser(add_help=False, formatter_class=argparse.ArgumentDefaultsHelpFormatter) + group = parser.add_argument_group(title='Trainer', description='Parameters for the distributed trainer') + group.add_argument('--accumulative_update', + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.") + + group.add_argument("--dump_interval", + default=1000, + type=int, + help="Interval steps for generating checkpoint.") + + group.add_argument("--local_rank", + type=int, + default=-1, + help="local_rank for distributed training on gpus") + + group.add_argument('--workers', + type=int, + default=2, + help="The workers to load data.") + + group.add_argument("--num_train_epochs", + default=3.0, + type=float, + help="Total number of training epochs to perform.") + + group.add_argument('--seed', + type=int, + default=1234, + help="random seed for initialization") + + group.add_argument("--train_batch_size", + default=64, + type=int, + help="Total batch size for training.") + + group.add_argument("--world_size", + type=int, + default=-1, + help="[Internal] The world size of distributed training. Internal usage only!! To the world size of the program, you need to use environment. 'WORLD_SIZE'") + + group.add_argument("--rank", + type=int, + default=-1, + help="[Internal] The rank id of current process. Internal usage only!! To the rank of the program, you need to use environment. 'RANK'") + + group.add_argument("--master_ip", + type=str, + default=None, + help="[Internal] The ip address of master node. Internal usage only!! To the master IP of the program, you need to use environment. 
'MASTER_ADDR'") + + group.add_argument("--master_port", + type=str, + default=None, + help="[Internal] The port of master node. Internal usage only!! To the master IP of the program, you need to use environment. 'MASTER_PORT'") + + return parser diff --git a/nlu/DeBERTa/training/dist_launcher.py b/nlu/DeBERTa/training/dist_launcher.py new file mode 100644 index 0000000000000000000000000000000000000000..bdeba141349ee311dd362453cddbe035af2b0125 --- /dev/null +++ b/nlu/DeBERTa/training/dist_launcher.py @@ -0,0 +1,163 @@ +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +# +# Author: Pengcheng He (penhe@microsoft.com) +# Date: 05/15/2019 +# + +import os +import time +import pdb +import signal +import torch +from multiprocessing import Process,Pool +from collections import defaultdict +import sys +import psutil +from ..utils import set_logger, get_logger +logger = get_logger() + +def kill_children(proc=None, recursive = True): + if proc is None: + proc = psutil.Process() + _children = proc.children(recursive=False) + for c in _children: + try: + if recursive: + kill_children(c, recursive=recursive) + os.kill(c.pid, signal.SIGKILL) + except: + pass + + for c in _children: + try: + c.wait(1) + except: + pass + +def gc(i): + return torch.cuda.device_count() + +def get_ngpu(): + with Pool(1) as p: + return p.map(gc, range(1))[0] + +def _setup_distributed_group(args): + """Initialize torch.distributed.""" + + torch.backends.cudnn.enabled = False + if args.world_size == 1: + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + else: + set_logger(args.task_name, os.path.join(args.output_dir, f'training_{args.task_name}_{args.rank}.log'), rank=args.rank, verbose=1 if args.local_rank==0 else 0) + device_id = args.rank % args.n_gpu + if args.local_rank >= 0: + device_id = args.local_rank + device = torch.device("cuda", device_id) + init_method = 'tcp://' + init_method += args.master_ip + ':' + args.master_port + distributed_backend = getattr(args, 'distributed_backend', 'nccl') + torch.distributed.init_process_group( + backend=distributed_backend, + world_size=args.world_size, rank=args.rank, + init_method=init_method) + torch.cuda.set_device(device) + n_gpu = torch.cuda.device_count() + logger.info("device=%s, n_gpu=%d, distributed training=%r, world_size=%d", device, n_gpu, bool(args.world_size != 1), args.world_size) + return device + +def _get_world_size(args): + world_size = int(os.getenv("WORLD_SIZE", '1')) + if not hasattr(args, 'n_gpu') or args.n_gpu is None: + n_gpu = get_ngpu() + return n_gpu * world_size + +def initialize_distributed(args, join=True): + args.world_size = int(os.getenv("WORLD_SIZE", '1')) + args.rank = int(os.getenv('RANK', '0')) + args.master_ip = os.getenv('MASTER_ADDR', 'localhost') + args.master_port = os.getenv('MASTER_PORT', '17006') + + if args.world_size == 1: + args.rank = 0 + args.master_ip = 'localhost' + + if not hasattr(args, 'n_gpu') or args.n_gpu is None: + args.n_gpu = get_ngpu() + + args.node_rank = args.rank + args.world_size = args.n_gpu * args.world_size + seed = args.seed + is_child = False + if args.world_size>1: + children = [] + for r in range(args.n_gpu): + args.rank = r + args.n_gpu*args.node_rank + args.local_rank = r + args.seed = seed + args.rank + child = os.fork() + if child>0: + children.append(child) + else: + signal.signal(signal.SIGINT, signal.SIG_IGN) + is_child = True + break + else: + is_child = True + + if is_child: + return 
_setup_distributed_group(args) + else: + if join: + try: + for c in children: + cid, ccode = os.waitpid(0,0) + logger.debug(f'Worker {c} done with code {ccode}') + if ccode != 0: + logger.error(f'Worker {c} : {cid} failed with code {ccode}') + kill_children() + raise ValueError(f'Job failed. {cid}:{ccode}') + except (KeyboardInterrupt, SystemExit): + logger.warning('Keybord interrupt by user. Terminate all processes') + kill_children(None) + return children + +def test_dist_launch(): + def test_functions(args): + global logger + set_logger(args.task_name, os.path.join(args.output_dir, f'training_{args.task_name}_{args.node_rank}.log'), rank=args.rank) + logger.info(args) + + class Args: + def __init__(self): + pass + def __repr__(self): + return str(self.__dict__) + + args = Args() + args.task_name = 'test' + args.seed = 0 + args.n_gpu = None + args.no_cuda=False + args.output_dir = '/tmp' + distributed_launch(args, test_functions, (args,)) + +def test_init_dist(): + class Args: + def __init__(self): + pass + def __repr__(self): + return str(self.__dict__) + + args = Args() + args.task_name = 'test' + args.seed = 0 + args.n_gpu = None + args.no_cuda=False + args.output_dir = '/tmp' + device = initialize_distributed(args) + if isinstance(device, torch.device): + return 0 + else: + return 1 diff --git a/nlu/DeBERTa/training/optimizer_utils.py b/nlu/DeBERTa/training/optimizer_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..5a7279a4259e6b131f6bd9022c975215105219ee --- /dev/null +++ b/nlu/DeBERTa/training/optimizer_utils.py @@ -0,0 +1,181 @@ +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +# +# Author: Pengcheng He (penhe@microsoft.com) +# Date: 05/15/2019 +# + +from collections import defaultdict +import numpy as np +import pdb +from functools import cmp_to_key +import torch +import re +from ..optims import Fp16Optimizer,XAdam,ExpLossScaler,get_world_size +from ..utils import get_logger +logger=get_logger() + + + +def xadam_factory(args, training_steps=None): + def optimizer_fn(param_groups, max_grad_norm=None): + with_radam = getattr(args, 'with_radam', False) + opt_type = getattr(args, 'opt_type', None) + optimizer = XAdam(param_groups, + lr=args.learning_rate, + b1=args.adam_beta1, + b2=args.adam_beta2, + lr_ends=args.lr_schedule_ends, + e=args.epsilon, + warmup=args.warmup_proportion if args.warmup_proportion<1 else args.warmup_proportion/training_steps, + t_total=training_steps, + schedule=args.lr_schedule, + max_grad_norm = args.max_grad_norm if max_grad_norm is None else max_grad_norm, + weight_decay_rate = args.weight_decay, + with_radam = with_radam, + opt_type = opt_type, + rank = args.rank) + return optimizer + + return optimizer_fn + +def create_xoptimizer(model, args, num_train_steps=None, no_decay=['bias', 'LayerNorm.weight']): + if args.fp16: + loss_scaler = ExpLossScaler(scale_interval = args.scale_steps, init_scale=args.loss_scale) + else: + loss_scaler = None + + distributed_optimizer = getattr(args, 'distributed_optimizer', True) + max_distributed_groups = getattr(args, 'max_distributed_groups', 1000000) + world_size = get_world_size() + if world_size<=1: + distributed_optimizer = False + + _no_decay = [x.strip() for x in getattr(args, 'no_decay', '').split('|') if len(x.strip())>0] + if len(_no_decay)>0: + no_decay = _no_decay + + opt_fn = xadam_factory(args, num_train_steps) + + named_params = [(n,p) for n,p in model.named_parameters() if p.requires_grad] + 
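+  # Added note: the code below appears to pack parameters into roughly equal-sized,
+  # flattened groups; when distributed_optimizer is enabled each group is assigned an
+  # owner 'rank', so only that rank keeps the fp32 master copy and optimizer state and
+  # broadcasts the updated weights back to the other ranks (see Fp16Optimizer and
+  # XAdam.step).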
param_size = [p.numel() for n,p in named_params] + type_groups = defaultdict(list) + if distributed_optimizer: + num_groups = min(world_size, max_distributed_groups) + max_group_size = (sum(param_size)+num_groups-1)//num_groups + #max_group_size = max(64*1024*1024, max_group_size) + #max_group_size = max_group_size//2 + max_group_size = (max_group_size//32)*32 + group_sizes = [0 for _ in range(num_groups)] + group_ranks = [g*(world_size//num_groups) for g in range(num_groups)] + else: + # TODO: Fix inconsistent results with different group size + max_group_size = max(64*1024*1024, max(param_size)) + num_groups = (sum(param_size)+max_group_size-1)//max_group_size + group_sizes = [0 for _ in range(num_groups)] + + def get_smallest_group(group_sizes): + return np.argmin([g+i/10000 for i,g in enumerate(group_sizes)]) + + def chunk_into_pieces(param, max_size): + num_chunks = param.numel()//max_size + if num_chunks<2: + return [param], [None] + + flat = param.view(-1) + chunks=[] + offsets = [] + for i in range(num_chunks-1): + chunks.append(flat.narrow(0, i*max_size, max_size)) + offsets.append([i*max_size, max_size]) + i += 1 + chunks.append(flat.narrow(0, i*max_size, flat.size(0)-i*max_size)) + offsets.append([i*max_size, flat.size(0)-i*max_size]) + assert sum([c.numel() for c in chunks])==param.numel(), f'{param.numel()}: {offsets}' + return chunks, offsets + + def param_cmp(x,y): + n1,p1 = x + n2,p2 = y + if p1.numel() == p2.numel(): + if n1n2: + return 1 + else: + return 0 + else: + return p1.numel() - p2.numel() + + def add_group(param_groups, group, group_id): + if distributed_optimizer: + group['rank'] = group_ranks[group_id] + param_groups.append(group.copy()) + group['params'] = [] + group['names'] = [] + group['offset'] = None + return get_smallest_group(group_sizes),group + + hard_reset = getattr(args, 'hard_reset', False) + group_id = 0 + for n,p in named_params: + key = '' + if any(re.search(nd,n) for nd in no_decay): + key += f'{str(p.dtype)}-nd' + else: + key += f'{str(p.dtype)}-d' + type_groups[key].append((n,p)) + param_groups = [] + for key, params in type_groups.items(): + wd_theta = 0 + weight_decay = args.weight_decay + _hard_reset = False + if key.endswith('-nd'): + weight_decay = 0 + else: + _hard_reset = hard_reset + + group = dict(params=[], + weight_decay_rate=weight_decay, + wd_theta = wd_theta, + hard_reset = hard_reset, + names=[], + offset=None) + params = sorted(params, key=cmp_to_key(param_cmp)) + for (n,p) in params: + if p.numel() >= max_group_size: + if len(group['params'])>0: + group_id,group = add_group(param_groups, group, group_id) + chunks, offsets = chunk_into_pieces(p, max_group_size) + for chk, off in zip(chunks, offsets): + group['params'].append(p) + group['names'].append(n) + group['offset'] = off + group_sizes[group_id] += chk.numel() + group_id,group = add_group(param_groups, group, group_id) + else: + group['params'].append(p) + group['names'].append(n) + group['offset'] = None + group_sizes[group_id] += p.numel() + if group_sizes[group_id]>=max_group_size: + group_id,group = add_group(param_groups, group, group_id) + if len(group['params'])>0: + group_id,group = add_group(param_groups, group, group_id) + + lookahead_k = getattr(args, 'lookahead_k', -1) + lookahead_alpha = getattr(args, 'lookahead_alpha', 0.5) + optimizer = Fp16Optimizer(param_groups, opt_fn, loss_scaler, args.max_grad_norm, lookahead_k = lookahead_k,\ + lookahead_alpha = lookahead_alpha, rank=args.rank, distributed=distributed_optimizer) + + # if args.fp16: + # # FP16 + # 
optimizer = Fp16Optimizer(param_groups, opt_fn, loss_scaler, args.max_grad_norm, lookahead_k = lookahead_k,\ # lookahead_alpha = lookahead_alpha, rank=args.rank, distributed=distributed_optimizer) + # else: + # # FP32: use the Optimizer (XAdam) directly + # logger.info("FP32 Detected: Bypassing Fp16Optimizer wrapper and using XAdam directly.") + # optimizer = opt_fn(param_groups) + + return optimizer diff --git a/nlu/DeBERTa/training/trainer.py b/nlu/DeBERTa/training/trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..3b6616f32c1726dd9565cbf778c3a86cb3961015 --- /dev/null +++ b/nlu/DeBERTa/training/trainer.py @@ -0,0 +1,302 @@ +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +# +# Author: Pengcheng He (penhe@microsoft.com) +# Date: 05/15/2019 +# + +import os +import torch +import random +import time +import numpy as np +import pdb +from collections import defaultdict, OrderedDict +from collections.abc import Mapping, Sequence +from torch.utils.data import DataLoader +from ..data import BatchSampler, DistributedBatchSampler,RandomSampler,SequentialSampler, AsyncDataLoader +from ..utils import get_logger +logger = get_logger() + +from .dist_launcher import get_ngpu +from .optimizer_utils import create_xoptimizer +from ._utils import batch_to + +__all__ = ['DistributedTrainer', 'set_random_seed'] + +def set_random_seed(seed, cpu_only=False): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + n_gpu = get_ngpu() + if n_gpu > 0 and not cpu_only: + torch.cuda.manual_seed_all(seed) + +class TrainerState: + def __init__(self, training_steps, name=None): + self.__dict__ = defaultdict(float) + self.loss = 0.0 + self.examples = 0 + self.steps = 0 + self._last_report_step = 0 + self.epochs = 0 + self.next_batch = 0 + self.num_training_steps = training_steps + self._last_report_time = time.time() + self.best_steps = 0 + self.best_metric = -1e9 + self.best_steps_2 = 0 + self.best_metric_2 = -1e9 + self.name = name + self.run_id = None + + def update_step(self, loss, examples, loss_scale): + self.examples += examples + self.loss += loss + self.steps += 1 + self.next_batch += 1 + self.loss_scale = loss_scale + + def report_state(self): + if self.steps <= self._last_report_step: + return + + end = time.time() + start = self._last_report_time + if self.name is not None: + tag = f'[{self.name}]' + else: + tag = '' + logger.info('{}[{:0.1f}%][{:0.2f}h] Steps={}, loss={}, examples={}, loss_scale={:0.1f}, {:0.1f}s'.format(tag, 100*self.steps/self.num_training_steps, \ + (self.num_training_steps - self.steps)*(end-start)/((self.steps-self._last_report_step)*3600), self.steps, self.loss/self.steps, self.examples, self.loss_scale, end-start)) + self._last_report_time = end + self._last_report_step = self.steps + +class DistributedTrainer: + def __init__(self, args, output_dir, model, device, data_fn, loss_fn=None, optimizer_fn=None, eval_fn=None, init_fn=None, update_fn=None, dump_interval = 10000, name=None, **kwargs): + """ + data_fn returns a tuple (training_dataset, training_steps, train_sampler); training_dataset is required + loss_fn returns the loss of the current mini-batch and the size of the batch + optimizer_fn returns the created optimizer + eval_fn returns metrics for model selection + """ + self.__dict__.update(kwargs) + self.args = args + self.device = device + self.eval_fn = eval_fn + self.accumulative_update = 1 + if hasattr(args, 'accumulative_update'): + 
self.accumulative_update = args.accumulative_update + + train_data, training_steps, train_sampler = data_fn(self) + self.train_data = train_data + self.train_sampler = train_sampler if train_sampler is not None else RandomSampler(len(train_data)) + self.training_epochs = int(getattr(args, 'num_train_epochs', 1)) + + if training_steps is None: + training_steps = getattr(args, 'training_steps', (len(training_data) + self.args.train_batch_size-1)//self.args.train_batch_size*self.training_epochs) + self.training_steps = training_steps + + self.output_dir = output_dir + self.init_fn = init_fn + self.trainer_state = TrainerState(self.training_steps, name = name) + self.dump_interval = dump_interval + + self.model = self._setup_model(args, model) + self.post_loss_fn = None + + def _opt_fn(trainer, model, training_steps): + return create_xoptimizer(model, args, num_train_steps = training_steps) + optimizer_fn = optimizer_fn if optimizer_fn is not None else _opt_fn + + self.optimizer = optimizer_fn(self, model, training_steps) + + def _loss_fn(trainer, model, batch): + _,loss = model(**batch) + batch_size = batch['input_ids'].size(0) + return loss.mean(), batch_size + self.loss_fn = loss_fn if loss_fn is not None else _loss_fn + + self.initialized = False + self.update_fn = update_fn + + def initialize(self): + set_random_seed(self.args.seed) + + if self.args.world_size>1: + torch.distributed.barrier() + self.initialized = True + + def train(self): + if not self.initialized: + self.initialize() + + rank = self.args.rank + world_size = self.args.world_size + + for n_epoch in range(self.trainer_state.epochs, self.training_epochs): + batch_sampler = BatchSampler(self.train_sampler, self.args.train_batch_size) + batch_sampler = DistributedBatchSampler(batch_sampler, rank = rank, world_size = world_size) + batch_sampler.next = self.trainer_state.next_batch + num_workers = getattr(self.args, 'workers', 2) + train_dataloader = DataLoader(self.train_data, batch_sampler=batch_sampler, num_workers=num_workers, worker_init_fn=self.init_fn, + pin_memory=True,persistent_workers=(num_workers>0)) + torch.cuda.empty_cache() + for step, batch in enumerate(AsyncDataLoader(train_dataloader, 100)): + if self.trainer_state.steps >= self.training_steps: + break + bs_scale = 1 + batch = batch_to(batch, self.device) + self._train_step(batch, bs_scale) + + # Save model + self.trainer_state.epochs += 1 + self.trainer_state.next_batch = 0 + self.trainer_state.report_state() + self._eval_model() + + if n_epoch == self.training_epochs - 1: + self.dump_interval = min(1000, self.dump_interval) + # for n,v in self.model.named_parameters(): + # if n == 'deberta.encoder.layer.0.attention.self.query_proj.hra_u.0': + # print(v[:5]) + # print((v/v.norm())[:5]) + + def save_model(self, args, checkpoint_dir, chk_postfix, model, optimizer): + save_path= os.path.join(checkpoint_dir, f'pytorch.model-{chk_postfix}.bin') + if hasattr(model, 'module'): + model_state = OrderedDict([(n,p) for n,p in model.module.state_dict().items()]) + else: + model_state = OrderedDict([(n,p) for n,p in model.state_dict().items()]) + if args.rank < 1: + torch.save(model_state, save_path) + return save_path + + def _eval_model(self, with_checkpoint=True): + if with_checkpoint: + checkpoint_dir = getattr(self.args, 'checkpoint_dir', None) + checkpoint_dir = checkpoint_dir if checkpoint_dir is not None else self.output_dir + chk_postfix = f'{self.trainer_state.steps:06}' + self.save_model(self.args, checkpoint_dir, chk_postfix, self.model, self.optimizer) + + 
_metric = self.trainer_state.best_metric + _steps = self.trainer_state.best_steps + if self.args.task_name == 'MNLI': + _metric_2 = self.trainer_state.best_metric_2 + _steps_2 = self.trainer_state.best_steps_2 + if self.eval_fn is not None: + metric = self.eval_fn(self, self.model, self.device, tag=f'{self.trainer_state.steps:06}-{self.training_steps}') + if self.args.task_name == 'MNLI': + if metric[0] > _metric: + _metric = metric[0] + _steps = self.trainer_state.steps + if metric[1] > _metric_2: + _metric_2 = metric[1] + _steps_2 = self.trainer_state.steps + else: + if metric > _metric: + _metric = metric + _steps = self.trainer_state.steps + + if self.args.task_name == 'MNLI': + logger.info(f'Best matched metric: {_metric}@{_steps}') + logger.info(f'Best mismatched metric: {_metric_2}@{_steps_2}') + else: + logger.info(f'Best metric: {_metric}@{_steps}') + + self.trainer_state.best_metric, self.trainer_state.best_steps = _metric, _steps + if self.args.task_name == 'MNLI': + self.trainer_state.best_metric_2, self.trainer_state.best_steps_2 = _metric_2, _steps_2 + + def _train_step(self, data, bs_scale): + self.model.train() + go_next=False + + def split(batch, parts): + sub_batches = [{} for _ in range(parts)] + for k in batch.keys(): + b = batch[k].size(0) + s = (b + parts - 1)//parts + v = batch[k].split(s) + for i,z in enumerate(v): + sub_batches[i][k]=z + chunks = [b for b in sub_batches if len(b)>0] + return chunks + + if self.accumulative_update>1: + data_chunks = split(data, self.accumulative_update) + else: + data_chunks = [data] + + while not go_next: + step_loss = 0 + batch_size = 0 + self.optimizer.zero_grad() + forward_outputs = [] + for i, sub in enumerate(data_chunks): + output = self.loss_fn(self, self.model, sub) + if isinstance(output, dict): + loss, sub_size = output['loss'], output['batch_size'] + else: + loss, sub_size = output + forward_outputs.append(output) + loss = loss/len(data_chunks) + # ------------------------------------------------------------------------------ + # for name, param in self.model.named_parameters(): + # if 'hra_u' in name: + # device = param.device + # hra_u_norm = param / param.norm(dim=0) + # orth_loss = torch.norm(torch.eye(8, device=device) - hra_u_norm.t() @ hra_u_norm) + # loss = loss + 1e-6 * orth_loss + # ------------------------------------------------------------------------------ + if i == 0: + loss_scale, _loss = self.optimizer.backward(loss) + else: + _loss = loss.float().detach().item() + loss = loss.float() * loss_scale + loss.backward() + step_loss += _loss + batch_size += sub_size + + ### + check_param = None + for n, p in self.model.named_parameters(): + if "hra_" in n and p.requires_grad: + check_param = p + break + val_before = check_param.data.clone().cpu().float().numpy()[0,0] # Take first element + if not self.optimizer.step(bs_scale, loss_scale): + self.optimizer.zero_grad() + continue + + #Check value after update + # val_after = check_param.data.clone().cpu().float().numpy()[0,0] + + # if val_before == val_after: + # print(f"[CRITICAL WARNING] HRA Param {n} did NOT change! Optimizer is broken.") + # print(f" Before: {val_before:.6f} | After: {val_after:.6f} | Grad: {check_param.grad.norm().item()}") + # else: + # print(f"[SUCCESS] HRA Param updated. 
Delta: {val_after - val_before}") + # exit() + + go_next = True + self.trainer_state.update_step(step_loss, batch_size , loss_scale) + if self.update_fn is not None: + self.update_fn(self, self.model, loss_scale) + self.optimizer.zero_grad() + + if self.post_loss_fn is not None: + self.post_loss_fn(forward_outputs) + + if self.trainer_state.steps%100 == 0: + self.trainer_state.report_state() + if self.trainer_state.steps%self.dump_interval == 0: + self._eval_model() + + def _setup_model(self, args, model): + if args.world_size > 1: + for p in model.parameters(): + torch.distributed.broadcast(p.data, 0) + torch.cuda.synchronize() + return model diff --git a/nlu/DeBERTa/utils/__init__.py b/nlu/DeBERTa/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d8a97482bc86684b0a27fa53ed3568a125bf1638 --- /dev/null +++ b/nlu/DeBERTa/utils/__init__.py @@ -0,0 +1,8 @@ +""" +utils +@Author: penhe@microsoft.com +""" + +from .logger_util import * +from .argument_types import * +from .xtqdm import * diff --git a/nlu/DeBERTa/utils/argument_types.py b/nlu/DeBERTa/utils/argument_types.py new file mode 100644 index 0000000000000000000000000000000000000000..ce2df4e84ba4eeb5fa8e11318b7c39b72dfd9aad --- /dev/null +++ b/nlu/DeBERTa/utils/argument_types.py @@ -0,0 +1,5 @@ + +def boolean_string(s): + if s.lower() not in {'false', 'true'}: + raise ValueError('Not a valid boolean string') + return s.lower() == 'true' diff --git a/nlu/DeBERTa/utils/jit_tracing.py b/nlu/DeBERTa/utils/jit_tracing.py new file mode 100644 index 0000000000000000000000000000000000000000..9bc043a9f599a4cc4538f29e973cb03af3504def --- /dev/null +++ b/nlu/DeBERTa/utils/jit_tracing.py @@ -0,0 +1,44 @@ +""" +Logging util +@Author: penhe@microsoft.com +""" + +""" Utils for torch jit tracing customer operators/functions +""" +import os + +def traceable(cls): + """ Decorator over customer functions + There is an issue for tracing customer python torch Function, using this decorator to work around it. + e.g. 
+ @traceable + class MyOp(torch.autograd.Function): + xxx + """ + + class _Function(object): + @staticmethod + def apply(*args): + jit_trace = (os.getenv('JIT_TRACE', 'False').lower() == 'true') + if jit_trace: + return cls.forward(_Function, *args) + else: + return cls.apply(*args) + + @staticmethod + def save_for_backward(*args): + pass + + _Function.__name__ = cls.__name__ + _Function.__doc__ = cls.__doc__ + return _Function + +class TraceMode(): + """ Trace context used when tracing modules contains customer operators/Functions + """ + def __enter__(self): + os.environ['JIT_TRACE'] = 'True' + return self + + def __exit__(self, exp_value, exp_type, trace): + del os.environ['JIT_TRACE'] diff --git a/nlu/DeBERTa/utils/logger_util.py b/nlu/DeBERTa/utils/logger_util.py new file mode 100644 index 0000000000000000000000000000000000000000..83d7cc647d96269227fe37b669a5486511f01109 --- /dev/null +++ b/nlu/DeBERTa/utils/logger_util.py @@ -0,0 +1,54 @@ +""" +Logging util +@Author: penhe@microsoft.com +""" + +__all__ = ['get_logger', 'set_logger'] +import logging +import os +import pdb + +logging.basicConfig(format = '%(asctime)s|%(levelname)s|%(name)s| %(message)s', + datefmt = '%m%d%Y %H:%M:%S', + level = logging.INFO) +logger=None +def set_logger(name, file_log=None, rank=0, verbose=1): + global logger + if not logger: + logger = logging.getLogger(name) + else: + logger.name = name + + dirty_handlers = [h for h in logger.handlers] + + if rank >= 0: + formatter = logging.Formatter(f'%(asctime)s|%(levelname)s|%(name)s|{rank:02}| %(message)s', datefmt='%m/%d/%Y %H:%M:%S') + else: + formatter = logging.Formatter(f'%(asctime)s|%(levelname)s|%(name)s| %(message)s', datefmt='%m/%d/%Y %H:%M:%S') + if file_log: + fh = logging.FileHandler(file_log) + fh.setLevel(logging.DEBUG) + fh.setFormatter(formatter) + logger.addHandler(fh) + + # Stdout + # create console handler with a higher log level + ch = logging.StreamHandler() + if verbose > 0: + ch.setLevel(logging.INFO) + else: + ch.setLevel(logging.WARN) + ch.setFormatter(formatter) + logger.addHandler(ch) + + for h in dirty_handlers: + logger.removeHandler(h) + logger.propagate=False + return logger + +def get_logger(name='logging', file_log=None, rank=0, verbose=1): + global logger + if not logger: + logger = set_logger(name, file_log, rank, verbose) + return logger + diff --git a/nlu/DeBERTa/utils/xtqdm.py b/nlu/DeBERTa/utils/xtqdm.py new file mode 100644 index 0000000000000000000000000000000000000000..2908d0c1ce9683afe2ca1e1050b351fbd3a03f0a --- /dev/null +++ b/nlu/DeBERTa/utils/xtqdm.py @@ -0,0 +1,30 @@ + +from tqdm import tqdm +import os + +__all__=['xtqdm'] + +class dummy_tqdm(): + def __init__(self, iterable=None, *wargs, **kwargs): + self.iterable = iterable + + def __iter__(self): + for d in self.iterable: + yield d + + def update(self, *wargs, **kwargs): + pass + + def close(self): + pass + +def xtqdm(iterable=None, *wargs, **kwargs): + disable = False + if 'disable' in kwargs: + disable = kwargs['disable'] + if 'NO_TQDM' in os.environ: + disable = True if os.getenv('NO_TQDM', '0')!='0' else False + if disable: + return dummy_tqdm(iterable, *wargs, **kwargs) + else: + return tqdm(iterable, *wargs, **kwargs) diff --git a/nlu/adapterlib/__init__.py b/nlu/adapterlib/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7fd9ad25a3ec6359639dfe7fa786dcc2c3c3e080 --- /dev/null +++ b/nlu/adapterlib/__init__.py @@ -0,0 +1,4 @@ +name = "lora" + +from .layers import * +from .utils import * \ No newline at end of file diff --git 
a/nlu/adapterlib/layers.py b/nlu/adapterlib/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..e387e404adcba109fbe3bd17c5c197ba257507c5 --- /dev/null +++ b/nlu/adapterlib/layers.py @@ -0,0 +1,507 @@ +# ------------------------------------------------------------------------------------------ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information. +# ------------------------------------------------------------------------------------------ +import torch +import torch.nn as nn +import torch.nn.functional as F + +import math +from typing import Optional, List + +class HRALinear(nn.Linear): + def __init__( + self, + in_features: int, + out_features: int, + config: dict, + **kwargs + ): + nn.Linear.__init__(self, in_features, out_features, **kwargs) + config = config.hra + self.r = config.r + self.apply_GS = config.apply_GS + + half_u = torch.zeros(self.in_features, self.r // 2) + nn.init.kaiming_uniform_(half_u, a=math.sqrt(5)) + self.hra_u = nn.Parameter(torch.repeat_interleave(half_u, 2, dim=1), requires_grad=True) + + self.weight.requires_grad = False + + self.register_buffer( + "eye", + torch.eye(self.in_features) + ) + self.alpha = getattr(config, "alpha", 16.0) + self.scale = self.alpha / self.r + + nn.Linear.reset_parameters(self) + + def train(self, mode: bool = True): + nn.Linear.train(self, mode) + + # def forward(self, x): + # orig_weight = self.weight + # if self.apply_GS: + # weight = [(self.hra_u[:, 0] / self.hra_u[:, 0].norm()).view(-1, 1)] + # for i in range(1, self.r): + # ui = self.hra_u[:, i].view(-1, 1) + # for j in range(i): + # ui = ui - (weight[j].t() @ ui) * weight[j] + # weight.append((ui / ui.norm()).view(-1, 1)) + # weight = torch.cat(weight, dim=1) + # new_weight = torch.mm(orig_weight, torch.eye(self.in_features, device=x.device, dtype=x.dtype) - 2 * weight @ weight.t()) + + # else: + # new_weight = orig_weight + # hra_u_norm = self.hra_u / self.hra_u.norm(dim=0) + # for i in range(self.r): + # ui = hra_u_norm[:, i].view(-1, 1) + # new_weight = torch.mm(new_weight, torch.eye(self.in_features, device=x.device, dtype=x.dtype) - 2 * ui @ ui.t()) + + # out = F.linear(input=x, weight=new_weight, bias=self.bias) + # return out + + def forward(self, x): + # KHÔNG dùng .data + W = self.weight # frozen weight, requires_grad=False + + # ===== build orthogonal Q ===== + if self.apply_GS: + U = [] + for i in range(self.r): + ui = self.hra_u[:, i] + for uj in U: + ui = ui - torch.dot(uj, ui) * uj + ui = ui / (ui.norm() + 1e-6) + U.append(ui) + U = torch.stack(U, dim=1) # [in_features, r] + Q = self.eye - 2.0 * (U @ U.t()) + else: + hra_u_norm = self.hra_u / (self.hra_u.norm(dim=0, keepdim=True) + 1e-6) + Q = self.eye + for i in range(self.r): + ui = hra_u_norm[:, i:i+1] + Q = Q @ (self.eye - 2.0 * ui @ ui.t()) + + # ===== HRA residual (CRITICAL) ===== + deltaW = self.scale * (W @ (Q - self.eye)) + W_eff = W + deltaW + + return F.linear(x, W_eff, self.bias) + + +def project(R, eps): + I = torch.zeros((R.size(0), R.size(0)), dtype=R.dtype, device=R.device) + diff = R - I + norm_diff = torch.norm(diff) + if norm_diff <= eps: + return R + else: + return I + eps * (diff / norm_diff) + +def project_batch(R, eps=1e-5): + # scaling factor for each of the smaller block matrix + eps = eps * 1 / torch.sqrt(torch.tensor(R.shape[0])) + I = torch.zeros((R.size(1), R.size(1)), device=R.device, dtype=R.dtype).unsqueeze(0).expand_as(R) + diff = R - I + norm_diff = 
torch.norm(R - I, dim=(1, 2), keepdim=True) + mask = (norm_diff <= eps).bool() + out = torch.where(mask, R, I + eps * (diff / norm_diff)) + return out + +class OFTLinear(nn.Linear): + # LoRA implemented in a dense layer + def __init__( + self, + in_features: int, + out_features: int, + config: dict, + fan_in_fan_out: bool = False, # Set this to True if the layer to replace stores weight like (fan_in, fan_out) + # 不是fan_in_fan_out的问题,因为没有一个module设它为true + **kwargs + ): + nn.Linear.__init__(self, in_features, out_features, **kwargs) + config = config.oft + self.block_size = config.block_size + self.r = in_features // self.block_size + self.is_coft = config.is_coft + self.block_share = config.block_share + self.eps = config.eps + + # Actual trainable parameters + if self.block_share: + # Initialized as an identity matrix + R_shape = [self.block_size, self.block_size] + self.oft_R = nn.Parameter(self.weight.new_zeros(R_shape[0], R_shape[0])) + self.eps = self.eps * R_shape[0] * R_shape[0] + else: + R_shape = [self.r, self.block_size, self.block_size] + self.oft_R = self.weight.new_zeros(R_shape[1], R_shape[1]) + self.oft_R = torch.stack([self.oft_R] * self.r) + self.oft_R = nn.Parameter(self.oft_R) + self.eps = self.eps * R_shape[1] * R_shape[1] + + self.weight.requires_grad = False + # self.reset_parameters() + + def reset_parameters(self): + nn.Linear.reset_parameters(self) + if hasattr(self, 'R'): + nn.init.kaiming_uniform_(self.oft_R, a=math.sqrt(5)) + + def forward(self, x): + if self.block_share: + if self.is_coft: + with torch.no_grad(): + self.oft_R.copy_(project(self.oft_R, eps=self.eps)) + orth_rotate = self.cayley(self.oft_R) + else: + if self.is_coft: + with torch.no_grad(): + self.oft_R.copy_(project_batch(self.oft_R, eps=self.eps)) + orth_rotate = self.cayley_batch(self.oft_R) + + # Block-diagonal parametrization + block_diagonal_matrix = self.block_diagonal(orth_rotate) + out = F.linear(input=x, weight=self.weight @ block_diagonal_matrix.to(x.dtype).t(), bias=self.bias) + + return out + + def cayley(self, data): + r, c = list(data.shape) + # Ensure the input matrix is skew-symmetric + skew = 0.5 * (data - data.t()) + I = torch.eye(r, device=data.device) + + # Perform the Cayley parametrization + Q = torch.mm(I + skew, torch.inverse(I - skew)) + return Q + + def cayley_batch(self, data): + b, r, c = data.shape + # Ensure the input matrix is skew-symmetric + skew = 0.5 * (data - data.transpose(1, 2)) + I = torch.eye(r, device=data.device).unsqueeze(0).expand(b, r, c) + + # Perform the Cayley parametrization + Q = torch.bmm(I - skew, torch.inverse(I + skew)) + + return Q + + def block_diagonal(self, R): + if self.block_share: + # Create a list of R repeated block_count times + blocks = [R] * self.r + else: + # Create a list of R slices along the third dimension + blocks = [R[i, ...] 
for i in range(self.r)] + + # Use torch.block_diag to create the block diagonal matrix + A = torch.block_diag(*blocks) + + return A + +class LoRALayer(): + def __init__( + self, + r: int, + lora_alpha: int, + lora_dropout: float, + merge_weights: bool, + ): + self.r = r + self.lora_alpha = lora_alpha + # Optional dropout + if lora_dropout > 0.: + self.lora_dropout = nn.Dropout(p=lora_dropout) + else: + self.lora_dropout = lambda x: x + # Mark the weight as unmerged + self.merged = False + self.merge_weights = merge_weights + + +class Embedding(nn.Embedding, LoRALayer): + # LoRA implemented in a dense layer + def __init__( + self, + num_embeddings: int, + embedding_dim: int, + r: int = 0, + lora_alpha: int = 1, + merge_weights: bool = True, + **kwargs + ): + nn.Embedding.__init__(self, num_embeddings, embedding_dim, **kwargs) + LoRALayer.__init__(self, r=r, lora_alpha=lora_alpha, lora_dropout=0, + merge_weights=merge_weights) + # Actual trainable parameters + if r > 0: + self.lora_A = nn.Parameter(self.weight.new_zeros((r, num_embeddings))) + self.lora_B = nn.Parameter(self.weight.new_zeros((embedding_dim, r))) + self.scaling = self.lora_alpha / self.r + # Freezing the pre-trained weight matrix + self.weight.requires_grad = False + self.reset_parameters() + + def reset_parameters(self): + nn.Embedding.reset_parameters(self) + if hasattr(self, 'lora_A'): + # initialize A the same way as the default for nn.Linear and B to zero + nn.init.zeros_(self.lora_A) + nn.init.normal_(self.lora_B) + + def train(self, mode: bool = True): + nn.Embedding.train(self, mode) + if mode: + if self.merge_weights and self.merged: + # Make sure that the weights are not merged + if self.r > 0: + self.weight.data -= (self.lora_B @ self.lora_A).transpose(0, 1) * self.scaling + self.merged = False + else: + if self.merge_weights and not self.merged: + # Merge the weights and mark it + if self.r > 0: + self.weight.data += (self.lora_B @ self.lora_A).transpose(0, 1) * self.scaling + self.merged = True + + def forward(self, x: torch.Tensor): + if self.r > 0 and not self.merged: + result = nn.Embedding.forward(self, x) + after_A = F.embedding( + x, self.lora_A.transpose(0, 1), self.padding_idx, self.max_norm, + self.norm_type, self.scale_grad_by_freq, self.sparse + ) + result += (after_A @ self.lora_B.transpose(0, 1)) * self.scaling + return result + else: + return nn.Embedding.forward(self, x) + + +class LoRALinear(nn.Linear, LoRALayer): + # LoRA implemented in a dense layer + def __init__( + self, + in_features: int, + out_features: int, + config: dict, + fan_in_fan_out: bool = False, # Set this to True if the layer to replace stores weight like (fan_in, fan_out) + **kwargs + ): + nn.Linear.__init__(self, in_features, out_features, **kwargs) + config = config.lora + LoRALayer.__init__(self, r=config.lora_r, lora_alpha=config.lora_alpha, lora_dropout=config.lora_dropout, + merge_weights=config.merge_weights) + + self.fan_in_fan_out = fan_in_fan_out + # Actual trainable parameters + if self.r > 0: + self.lora_A = nn.Parameter(self.weight.new_zeros((self.r, in_features))) + self.lora_B = nn.Parameter(self.weight.new_zeros((out_features, self.r))) + self.scaling = self.lora_alpha / self.r + # Freezing the pre-trained weight matrix + self.weight.requires_grad = False + self.reset_parameters() + if fan_in_fan_out: + self.weight.data = self.weight.data.transpose(0, 1) + + def reset_parameters(self): + nn.Linear.reset_parameters(self) + if hasattr(self, 'lora_A'): + # initialize B the same way as the default for nn.Linear and A 
to zero + # this is different than what is described in the paper but should not affect performance + nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5)) + nn.init.zeros_(self.lora_B) + + def train(self, mode: bool = True): + def T(w): + return w.transpose(0, 1) if self.fan_in_fan_out else w + nn.Linear.train(self, mode) + if mode: + if self.merge_weights and self.merged: + # Make sure that the weights are not merged + if self.r > 0: + self.weight.data -= T(self.lora_B @ self.lora_A) * self.scaling + self.merged = False + else: + if self.merge_weights and not self.merged: + # Merge the weights and mark it + if self.r > 0: + self.weight.data += T(self.lora_B @ self.lora_A) * self.scaling + self.merged = True + + def forward(self, x: torch.Tensor): + def T(w): + return w.transpose(0, 1) if self.fan_in_fan_out else w + if self.r > 0 and not self.merged: + result = F.linear(x, T(self.weight), bias=self.bias) + result += (self.lora_dropout(x) @ self.lora_A.transpose(0, 1) @ self.lora_B.transpose(0, 1)) * self.scaling + return result + else: + return F.linear(x, T(self.weight), bias=self.bias) + + +class MergedLinear(nn.Linear, LoRALayer): + # LoRA implemented in a dense layer + def __init__( + self, + in_features: int, + out_features: int, + r: int = 0, + lora_alpha: int = 1, + lora_dropout: float = 0., + enable_lora: List[bool] = [False], + fan_in_fan_out: bool = False, + merge_weights: bool = True, + **kwargs + ): + nn.Linear.__init__(self, in_features, out_features, **kwargs) + LoRALayer.__init__(self, r=r, lora_alpha=lora_alpha, lora_dropout=lora_dropout, + merge_weights=merge_weights) + assert out_features % len(enable_lora) == 0, \ + 'The length of enable_lora must divide out_features' + self.enable_lora = enable_lora + self.fan_in_fan_out = fan_in_fan_out + # Actual trainable parameters + if r > 0 and any(enable_lora): + self.lora_A = nn.Parameter( + self.weight.new_zeros((r * sum(enable_lora), in_features))) + self.lora_B = nn.Parameter( + self.weight.new_zeros((out_features // len(enable_lora) * sum(enable_lora), r)) + ) # weights for Conv1D with groups=sum(enable_lora) + self.scaling = self.lora_alpha / self.r + # Freezing the pre-trained weight matrix + self.weight.requires_grad = False + # Compute the indices + self.lora_ind = self.weight.new_zeros( + (out_features, ), dtype=torch.bool + ).view(len(enable_lora), -1) + self.lora_ind[enable_lora, :] = True + self.lora_ind = self.lora_ind.view(-1) + self.reset_parameters() + if fan_in_fan_out: + self.weight.data = self.weight.data.transpose(0, 1) + + def reset_parameters(self): + nn.Linear.reset_parameters(self) + if hasattr(self, 'lora_A'): + # initialize A the same way as the default for nn.Linear and B to zero + nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5)) + nn.init.zeros_(self.lora_B) + + def zero_pad(self, x): + result = x.new_zeros((len(self.lora_ind), *x.shape[1:])) + result[self.lora_ind] = x + return result + + def merge_AB(self): + def T(w): + return w.transpose(0, 1) if self.fan_in_fan_out else w + delta_w = F.conv1d( + self.lora_A.unsqueeze(0), + self.lora_B.unsqueeze(-1), + groups=sum(self.enable_lora) + ).squeeze(0) + return T(self.zero_pad(delta_w)) + + def train(self, mode: bool = True): + def T(w): + return w.transpose(0, 1) if self.fan_in_fan_out else w + nn.Linear.train(self, mode) + if mode: + if self.merge_weights and self.merged: + # Make sure that the weights are not merged + if self.r > 0 and any(self.enable_lora): + self.weight.data -= self.merge_AB() * self.scaling + self.merged = False + else: + if 
self.merge_weights and not self.merged: + # Merge the weights and mark it + if self.r > 0 and any(self.enable_lora): + self.weight.data += self.merge_AB() * self.scaling + self.merged = True + + def forward(self, x: torch.Tensor): + def T(w): + return w.transpose(0, 1) if self.fan_in_fan_out else w + if self.merged: + return F.linear(x, T(self.weight), bias=self.bias) + else: + result = F.linear(x, T(self.weight), bias=self.bias) + if self.r > 0: + result += self.lora_dropout(x) @ T(self.merge_AB().T) * self.scaling + return result + +class ConvLoRA(nn.Module, LoRALayer): + def __init__(self, conv_module, in_channels, out_channels, kernel_size, r=0, lora_alpha=1, lora_dropout=0., merge_weights=True, **kwargs): + super(ConvLoRA, self).__init__() + self.conv = conv_module(in_channels, out_channels, kernel_size, **kwargs) + LoRALayer.__init__(self, r=r, lora_alpha=lora_alpha, lora_dropout=lora_dropout, merge_weights=merge_weights) + assert isinstance(kernel_size, int) + # Actual trainable parameters + if r > 0: + self.lora_A = nn.Parameter( + self.conv.weight.new_zeros((r * kernel_size, in_channels * kernel_size)) + ) + self.lora_B = nn.Parameter( + self.conv.weight.new_zeros((out_channels//self.conv.groups*kernel_size, r*kernel_size)) + ) + self.scaling = self.lora_alpha / self.r + # Freezing the pre-trained weight matrix + self.conv.weight.requires_grad = False + self.reset_parameters() + self.merged = False + + def reset_parameters(self): + self.conv.reset_parameters() + if hasattr(self, 'lora_A'): + # initialize A the same way as the default for nn.Linear and B to zero + nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5)) + nn.init.zeros_(self.lora_B) + + def train(self, mode=True): + super(ConvLoRA, self).train(mode) + if mode: + if self.merge_weights and self.merged: + if self.r > 0: + # Make sure that the weights are not merged + self.conv.weight.data -= (self.lora_B @ self.lora_A).view(self.conv.weight.shape) * self.scaling + self.merged = False + else: + if self.merge_weights and not self.merged: + if self.r > 0: + # Merge the weights and mark it + self.conv.weight.data += (self.lora_B @ self.lora_A).view(self.conv.weight.shape) * self.scaling + self.merged = True + + def forward(self, x): + if self.r > 0 and not self.merged: + return self.conv._conv_forward( + x, + self.conv.weight + (self.lora_B @ self.lora_A).view(self.conv.weight.shape) * self.scaling, + self.conv.bias + ) + return self.conv(x) + +class Conv2d(ConvLoRA): + def __init__(self, *args, **kwargs): + super(Conv2d, self).__init__(nn.Conv2d, *args, **kwargs) + +class Conv1d(ConvLoRA): + def __init__(self, *args, **kwargs): + super(Conv1d, self).__init__(nn.Conv1d, *args, **kwargs) + +# Can Extend to other ones like this + +class Conv3d(ConvLoRA): + def __init__(self, *args, **kwargs): + super(Conv3d, self).__init__(nn.Conv3d, *args, **kwargs) + + +adapter_dict = { + 'lora': LoRALinear, + 'oft': OFTLinear, + 'hra': HRALinear, +} diff --git a/nlu/adapterlib/utils.py b/nlu/adapterlib/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..0d121c9bbd6a424d8f4233fac17335aab5141d9b --- /dev/null +++ b/nlu/adapterlib/utils.py @@ -0,0 +1,49 @@ +# ------------------------------------------------------------------------------------------ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information. 
+# ------------------------------------------------------------------------------------------ +import torch +import torch.nn as nn + +from typing import Dict + +from .layers import LoRALayer + + +def mark_only_lora_as_trainable(model: nn.Module, bias: str = 'none') -> None: + for n, p in model.named_parameters(): + if 'lora_' not in n: + p.requires_grad = False + if bias == 'none': + return + elif bias == 'all': + for n, p in model.named_parameters(): + if 'bias' in n: + p.requires_grad = True + elif bias == 'lora_only': + for m in model.modules(): + if isinstance(m, LoRALayer) and \ + hasattr(m, 'bias') and \ + m.bias is not None: + m.bias.requires_grad = True + else: + raise NotImplementedError + + +def lora_state_dict(model: nn.Module, bias: str = 'none') -> Dict[str, torch.Tensor]: + my_state_dict = model.state_dict() + if bias == 'none': + return {k: my_state_dict[k] for k in my_state_dict if 'lora_' in k} + elif bias == 'all': + return {k: my_state_dict[k] for k in my_state_dict if 'lora_' in k or 'bias' in k} + elif bias == 'lora_only': + to_return = {} + for k in my_state_dict: + if 'lora_' in k: + to_return[k] = my_state_dict[k] + bias_name = k.split('lora_')[0]+'bias' + if bias_name in my_state_dict: + to_return[bias_name] = my_state_dict[bias_name] + return to_return + else: + raise NotImplementedError diff --git a/nlu/base_model/.gitattributes b/nlu/base_model/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..6d34772f5ca361021038b404fb913ec8dc0b1a5a --- /dev/null +++ b/nlu/base_model/.gitattributes @@ -0,0 +1,27 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bin.* filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zstandard filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/nlu/base_model/README.md b/nlu/base_model/README.md new file mode 100644 index 0000000000000000000000000000000000000000..340cb56cdc9bb884bb1857d8fb5013d8f17a6a82 --- /dev/null +++ b/nlu/base_model/README.md @@ -0,0 +1,96 @@ +--- +language: en +tags: + - deberta + - deberta-v3 + - fill-mask +thumbnail: https://huggingface.co/front/thumbnails/microsoft.png +license: mit +--- + +## DeBERTaV3: Improving DeBERTa using ELECTRA-Style Pre-Training with Gradient-Disentangled Embedding Sharing + +[DeBERTa](https://arxiv.org/abs/2006.03654) improves the BERT and RoBERTa models using disentangled attention and enhanced mask decoder. With those two improvements, DeBERTa out perform RoBERTa on a majority of NLU tasks with 80GB training data. 
+ +In [DeBERTa V3](https://arxiv.org/abs/2111.09543), we further improved the efficiency of DeBERTa using ELECTRA-Style pre-training with Gradient Disentangled Embedding Sharing. Compared to DeBERTa, our V3 version significantly improves the model performance on downstream tasks. You can find more technique details about the new model from our [paper](https://arxiv.org/abs/2111.09543). + +Please check the [official repository](https://github.com/microsoft/DeBERTa) for more implementation details and updates. + +The DeBERTa V3 base model comes with 12 layers and a hidden size of 768. It has only 86M backbone parameters with a vocabulary containing 128K tokens which introduces 98M parameters in the Embedding layer. This model was trained using the 160GB data as DeBERTa V2. + + +#### Fine-tuning on NLU tasks + +We present the dev results on SQuAD 2.0 and MNLI tasks. + +| Model |Vocabulary(K)|Backbone #Params(M)| SQuAD 2.0(F1/EM) | MNLI-m/mm(ACC)| +|-------------------|----------|-------------------|-----------|----------| +| RoBERTa-base |50 |86 | 83.7/80.5 | 87.6/- | +| XLNet-base |32 |92 | -/80.2 | 86.8/- | +| ELECTRA-base |30 |86 | -/80.5 | 88.8/ | +| DeBERTa-base |50 |100 | 86.2/83.1| 88.8/88.5| +| DeBERTa-v3-base |128|86 | **88.4/85.4** | **90.6/90.7**| +| DeBERTa-v3-base + SiFT |128|86 | -/- | 91.0/-| + +We present the dev results on SQuAD 1.1/2.0 and MNLI tasks. + +#### Fine-tuning with HF transformers + +```bash +#!/bin/bash + +cd transformers/examples/pytorch/text-classification/ + +pip install datasets +export TASK_NAME=mnli + +output_dir="ds_results" + +num_gpus=8 + +batch_size=8 + +python -m torch.distributed.launch --nproc_per_node=${num_gpus} \ + run_glue.py \ + --model_name_or_path microsoft/deberta-v3-base \ + --task_name $TASK_NAME \ + --do_train \ + --do_eval \ + --evaluation_strategy steps \ + --max_seq_length 256 \ + --warmup_steps 500 \ + --per_device_train_batch_size ${batch_size} \ + --learning_rate 2e-5 \ + --num_train_epochs 3 \ + --output_dir $output_dir \ + --overwrite_output_dir \ + --logging_steps 1000 \ + --logging_dir $output_dir + +``` + +### Citation + +If you find DeBERTa useful for your work, please cite the following papers: + +``` latex +@misc{he2021debertav3, + title={DeBERTaV3: Improving DeBERTa using ELECTRA-Style Pre-Training with Gradient-Disentangled Embedding Sharing}, + author={Pengcheng He and Jianfeng Gao and Weizhu Chen}, + year={2021}, + eprint={2111.09543}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +``` + +``` latex +@inproceedings{ +he2021deberta, +title={DEBERTA: DECODING-ENHANCED BERT WITH DISENTANGLED ATTENTION}, +author={Pengcheng He and Xiaodong Liu and Jianfeng Gao and Weizhu Chen}, +booktitle={International Conference on Learning Representations}, +year={2021}, +url={https://openreview.net/forum?id=XPZIaotutsD} +} +``` diff --git a/nlu/base_model/config.json b/nlu/base_model/config.json new file mode 100644 index 0000000000000000000000000000000000000000..904b7ba900060d5ff61289fb29bee393189716c7 --- /dev/null +++ b/nlu/base_model/config.json @@ -0,0 +1,22 @@ +{ + "model_type": "deberta-v2", + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "max_position_embeddings": 512, + "relative_attention": true, + "position_buckets": 256, + "norm_rel_ebd": "layer_norm", + "share_att_key": true, + "pos_att_type": "p2c|c2p", + "layer_norm_eps": 1e-7, + "max_relative_positions": -1, + "position_biased_input": false, + 
"num_attention_heads": 12, + "num_hidden_layers": 12, + "type_vocab_size": 0, + "vocab_size": 128100 +} diff --git a/nlu/base_model/pytorch_model.bin b/nlu/base_model/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..cc09775b38deaff27b9d8f8d0a4326f21688ce4d --- /dev/null +++ b/nlu/base_model/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:691d48a2800b926a19e3051def466fc2cca4f59a15e42ce4a0cf7f1b380b5e33 +size 371146213 diff --git a/nlu/base_model/rust_model.ot b/nlu/base_model/rust_model.ot new file mode 100644 index 0000000000000000000000000000000000000000..75939b30452f101e080cf56c6ae057de3c2894a4 --- /dev/null +++ b/nlu/base_model/rust_model.ot @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:276aadc323988709f076fb489790103d28e64b80c72e9a3e19043d28f4c7c31a +size 742218621 diff --git a/nlu/base_model/spm.model b/nlu/base_model/spm.model new file mode 100644 index 0000000000000000000000000000000000000000..b1b95e5b0fef33623979511f423eaeee465c46f0 --- /dev/null +++ b/nlu/base_model/spm.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c679fbf93643d19aab7ee10c0b99e460bdbc02fedf34b92b05af343b4af586fd +size 2464616 diff --git a/nlu/base_model/tf_model.h5 b/nlu/base_model/tf_model.h5 new file mode 100644 index 0000000000000000000000000000000000000000..d151654ec2e9ac84ab44fe34038dc8164ff04a2f --- /dev/null +++ b/nlu/base_model/tf_model.h5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01ad1b35cac509fb00b9873c670d824363ef884d1aa2758471c47b26cc2948f0 +size 735589384 diff --git a/nlu/base_model/tokenizer_config.json b/nlu/base_model/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..acfd94e399c5659e4bed75f91b4ee24b111fc7a6 --- /dev/null +++ b/nlu/base_model/tokenizer_config.json @@ -0,0 +1,4 @@ +{ + "do_lower_case": false, + "vocab_type": "spm" +} diff --git a/nlu/experiments/glue/README.md b/nlu/experiments/glue/README.md new file mode 100644 index 0000000000000000000000000000000000000000..506adeea2ff4d316de24b625ed951e8a2b7f9818 --- /dev/null +++ b/nlu/experiments/glue/README.md @@ -0,0 +1,30 @@ +# GLUE fine-tuning task +To run the experiment, you need to + +run `./mnli.sh` for fine-tuning mnli base model, + +run `./mnli.sh` for fine-tuning mnli large model. + +run `./cola.sh` for fine-tuning cola large model. + +run `./sst2.sh` for fine-tuning sst2 large model. + +run `./stsb.sh` for fine-tuning stsb large model. + +run `./rte.sh` for fine-tuning rte large model. + +run `./qqp.sh` for fine-tuning qqp large model. + +run `./qnli.sh` for fine-tuning qnli large model. + +run `./mrpc.sh` for fine-tuning mrpc large model. + +## Export model to ONNX format and quantization + +To export model to onnx format during evaluation, use argument `--export_ort_model True`. +To export quantized model, use `--fp16 False --export_ort_model True`. +The exported model will be under output folder, and end with +`__onnx_fp16.bin` if fp16 is True, otherwise the outputs will be `__onnx_fp32.bin` and `__onnx_qt.bin`. + + +Please check [ONNX document](https://onnxruntime.ai/docs/performance/quantization.html) for more details. 
diff --git a/nlu/experiments/glue/ax.sh b/nlu/experiments/glue/ax.sh new file mode 100644 index 0000000000000000000000000000000000000000..bdf21fd30ca2049f64b00f94ffa5036a72e9650a --- /dev/null +++ b/nlu/experiments/glue/ax.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +cache_dir=/tmp/DeBERTa/ +base_model=deberta-v3-base +task=AX + +export CUDA_MPS_PIPE_DIRECTORY="${HOME}/mps_pipe" +export CUDA_MPS_LOG_DIRECTORY="${HOME}/mps_log" + +# Optional: Print to verify +echo "MPS Pipe at: $CUDA_MPS_PIPE_DIRECTORY" + +python -m DeBERTa.apps.run --model_config config.json \ + --tag $base_model \ + --do_train \ + --do_eval \ + --do_predict \ + --max_seq_len 64 \ + --dump_interval 100 \ + --num_train_epochs 28 \ + --fp16 True \ + --warmup 100 \ + --learning_rate 8e-4 \ + --train_batch_size 32 \ + --cls_drop_out 0.1 \ + --task_name $task \ + --data_dir $cache_dir/glue_tasks/$task \ + --init_model $base_model \ + --output_dir $cache_dir/outputs/$base_model/$task \ + --eval_batch_size 256 \ + --predict_batch_size 256 \ \ No newline at end of file diff --git a/nlu/experiments/glue/cola.sh b/nlu/experiments/glue/cola.sh new file mode 100644 index 0000000000000000000000000000000000000000..429294384126e814fa7c4b5cc9bba46909a3a543 --- /dev/null +++ b/nlu/experiments/glue/cola.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +cache_dir=/tmp/DeBERTa/ +base_model=deberta-v3-base +task=CoLA + +export CUDA_MPS_PIPE_DIRECTORY="${HOME}/mps_pipe" +export CUDA_MPS_LOG_DIRECTORY="${HOME}/mps_log" + +# Optional: Print to verify +echo "MPS Pipe at: $CUDA_MPS_PIPE_DIRECTORY" + +python -m DeBERTa.apps.run --model_config config.json \ + --tag $base_model \ + --do_train \ + --do_eval \ + --do_predict \ + --max_seq_len 64 \ + --dump_interval 100 \ + --num_train_epochs 34 \ + --fp16 True \ + --warmup 100 \ + --learning_rate 9e-3 \ + --train_batch_size 32 \ + --cls_drop_out 0.1 \ + --task_name $task \ + --data_dir $cache_dir/glue_tasks/$task \ + --init_model $base_model \ + --output_dir $cache_dir/outputs/$base_model/$task \ + --eval_batch_size 256 \ + --predict_batch_size 256 \ diff --git a/nlu/experiments/glue/config.json b/nlu/experiments/glue/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ed5298c61e233b732b91527f7b8c29d78fb2a755 --- /dev/null +++ b/nlu/experiments/glue/config.json @@ -0,0 +1,27 @@ +{ + "pooling": { + "dropout": 0, + "hidden_act": "gelu" + }, + "inject_adapter": "hra", + "hra": { + "r": 8, + "apply_GS": false, + "suffix": ["hra_u"] + }, + "oft": { + "block_size": 16, + "is_coft": true, + "block_share": false, + "eps": 1e-5, + "suffix": ["oft_R"] + }, + "lora": { + "lora_r": 8, + "lora_alpha": 32, + "merge_weights": false, + "lora_dropout": 0, + "suffix": ["lora_A", "lora_B"] + }, + "vocab_size": 128100 +} diff --git a/nlu/experiments/glue/download_data.sh b/nlu/experiments/glue/download_data.sh new file mode 100644 index 0000000000000000000000000000000000000000..200a852b7664b7d1412ba9f1981dca867eaed35c --- /dev/null +++ b/nlu/experiments/glue/download_data.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +cache_dir=$1 +task=$2 +if [[ -z $cache_dir ]]; then + cache_dir=/tmp/DeBERTa/glue +fi + + +mkdir -p $cache_dir +curl -s -J -L https://raw.githubusercontent.com/nyu-mll/jiant/v1.3.2/scripts/download_glue_data.py -o $cache_dir/glue.py +patch $cache_dir/glue.py patch.diff +if [[ -z $task ]]; then + python3 $cache_dir/glue.py --data_dir $cache_dir/ +else + python3 $cache_dir/glue.py --data_dir $cache_dir/ --tasks $task +fi \ No newline at end of file diff --git a/nlu/experiments/glue/glue_submission/AX.tsv 
b/nlu/experiments/glue/glue_submission/AX.tsv new file mode 100644 index 0000000000000000000000000000000000000000..f3c5ef88bc2cd336a8cb621afc2b6b561b57d341 --- /dev/null +++ b/nlu/experiments/glue/glue_submission/AX.tsv @@ -0,0 +1,1105 @@ +index prediction +0 contradiction +1 neutral +2 neutral +3 entailment +4 contradiction +5 entailment +6 contradiction +7 contradiction +8 contradiction +9 neutral +10 neutral +11 entailment +12 contradiction +13 neutral +14 neutral +15 neutral +16 entailment +17 neutral +18 entailment +19 neutral +20 contradiction +21 contradiction +22 entailment +23 contradiction +24 neutral +25 neutral +26 entailment +27 neutral +28 entailment +29 contradiction +30 contradiction +31 neutral +32 contradiction +33 neutral +34 neutral +35 entailment +36 neutral +37 contradiction +38 contradiction +39 contradiction +40 contradiction +41 entailment +42 neutral +43 neutral +44 contradiction +45 contradiction +46 contradiction +47 entailment +48 contradiction +49 entailment +50 contradiction +51 contradiction +52 contradiction +53 neutral +54 entailment +55 contradiction +56 neutral +57 contradiction +58 contradiction +59 entailment +60 contradiction +61 entailment +62 contradiction +63 contradiction +64 contradiction +65 neutral +66 neutral +67 neutral +68 neutral +69 contradiction +70 entailment +71 neutral +72 contradiction +73 contradiction +74 entailment +75 contradiction +76 entailment +77 entailment +78 neutral +79 contradiction +80 contradiction +81 contradiction +82 entailment +83 neutral +84 contradiction +85 neutral +86 entailment +87 entailment +88 neutral +89 neutral +90 contradiction +91 contradiction +92 entailment +93 entailment +94 contradiction +95 contradiction +96 entailment +97 neutral +98 contradiction +99 contradiction +100 contradiction +101 entailment +102 neutral +103 neutral +104 entailment +105 contradiction +106 entailment +107 entailment +108 contradiction +109 entailment +110 entailment +111 entailment +112 entailment +113 neutral +114 entailment +115 entailment +116 neutral +117 entailment +118 neutral +119 contradiction +120 entailment +121 entailment +122 entailment +123 contradiction +124 entailment +125 contradiction +126 contradiction +127 entailment +128 entailment +129 neutral +130 neutral +131 contradiction +132 entailment +133 entailment +134 neutral +135 neutral +136 contradiction +137 neutral +138 entailment +139 contradiction +140 entailment +141 neutral +142 entailment +143 neutral +144 contradiction +145 contradiction +146 neutral +147 entailment +148 contradiction +149 neutral +150 neutral +151 contradiction +152 neutral +153 neutral +154 contradiction +155 entailment +156 neutral +157 neutral +158 entailment +159 neutral +160 entailment +161 entailment +162 entailment +163 entailment +164 entailment +165 neutral +166 entailment +167 entailment +168 entailment +169 neutral +170 neutral +171 contradiction +172 entailment +173 contradiction +174 contradiction +175 entailment +176 contradiction +177 neutral +178 entailment +179 neutral +180 neutral +181 entailment +182 neutral +183 contradiction +184 entailment +185 entailment +186 entailment +187 contradiction +188 contradiction +189 entailment +190 contradiction +191 contradiction +192 entailment +193 neutral +194 contradiction +195 entailment +196 neutral +197 neutral +198 contradiction +199 entailment +200 entailment +201 neutral +202 contradiction +203 neutral +204 contradiction +205 neutral +206 neutral +207 entailment +208 contradiction +209 entailment +210 entailment +211 
neutral +212 entailment +213 neutral +214 contradiction +215 neutral +216 neutral +217 neutral +218 entailment +219 entailment +220 entailment +221 neutral +222 neutral +223 contradiction +224 neutral +225 contradiction +226 entailment +227 neutral +228 neutral +229 entailment +230 neutral +231 contradiction +232 neutral +233 contradiction +234 entailment +235 contradiction +236 contradiction +237 neutral +238 contradiction +239 neutral +240 entailment +241 contradiction +242 entailment +243 entailment +244 entailment +245 entailment +246 neutral +247 entailment +248 neutral +249 entailment +250 entailment +251 entailment +252 neutral +253 neutral +254 neutral +255 contradiction +256 neutral +257 contradiction +258 contradiction +259 entailment +260 neutral +261 neutral +262 entailment +263 contradiction +264 contradiction +265 neutral +266 contradiction +267 entailment +268 neutral +269 contradiction +270 neutral +271 contradiction +272 neutral +273 entailment +274 contradiction +275 neutral +276 contradiction +277 neutral +278 neutral +279 contradiction +280 entailment +281 entailment +282 neutral +283 contradiction +284 contradiction +285 neutral +286 neutral +287 entailment +288 neutral +289 entailment +290 entailment +291 entailment +292 neutral +293 neutral +294 entailment +295 contradiction +296 contradiction +297 entailment +298 neutral +299 neutral +300 neutral +301 contradiction +302 neutral +303 neutral +304 entailment +305 neutral +306 neutral +307 entailment +308 contradiction +309 contradiction +310 entailment +311 neutral +312 neutral +313 contradiction +314 entailment +315 entailment +316 neutral +317 neutral +318 entailment +319 entailment +320 neutral +321 entailment +322 contradiction +323 contradiction +324 entailment +325 contradiction +326 contradiction +327 entailment +328 entailment +329 contradiction +330 neutral +331 neutral +332 contradiction +333 entailment +334 neutral +335 contradiction +336 contradiction +337 contradiction +338 entailment +339 entailment +340 neutral +341 entailment +342 contradiction +343 contradiction +344 neutral +345 contradiction +346 contradiction +347 contradiction +348 entailment +349 neutral +350 entailment +351 neutral +352 contradiction +353 contradiction +354 contradiction +355 contradiction +356 contradiction +357 contradiction +358 entailment +359 neutral +360 entailment +361 entailment +362 neutral +363 entailment +364 neutral +365 entailment +366 neutral +367 neutral +368 entailment +369 entailment +370 contradiction +371 neutral +372 neutral +373 neutral +374 neutral +375 entailment +376 contradiction +377 neutral +378 neutral +379 entailment +380 neutral +381 neutral +382 neutral +383 entailment +384 contradiction +385 neutral +386 contradiction +387 contradiction +388 entailment +389 contradiction +390 contradiction +391 entailment +392 entailment +393 contradiction +394 entailment +395 neutral +396 entailment +397 neutral +398 entailment +399 entailment +400 contradiction +401 contradiction +402 neutral +403 neutral +404 entailment +405 entailment +406 neutral +407 entailment +408 contradiction +409 contradiction +410 entailment +411 neutral +412 contradiction +413 contradiction +414 entailment +415 contradiction +416 contradiction +417 entailment +418 entailment +419 contradiction +420 contradiction +421 neutral +422 entailment +423 contradiction +424 entailment +425 neutral +426 neutral +427 neutral +428 entailment +429 entailment +430 contradiction +431 neutral +432 entailment +433 entailment +434 contradiction +435 
contradiction +436 neutral +437 contradiction +438 entailment +439 contradiction +440 neutral +441 contradiction +442 neutral +443 contradiction +444 neutral +445 entailment +446 entailment +447 neutral +448 contradiction +449 neutral +450 neutral +451 contradiction +452 neutral +453 entailment +454 entailment +455 contradiction +456 contradiction +457 contradiction +458 contradiction +459 contradiction +460 contradiction +461 neutral +462 neutral +463 contradiction +464 neutral +465 contradiction +466 contradiction +467 entailment +468 entailment +469 contradiction +470 neutral +471 neutral +472 entailment +473 contradiction +474 entailment +475 contradiction +476 entailment +477 neutral +478 neutral +479 entailment +480 entailment +481 contradiction +482 contradiction +483 entailment +484 contradiction +485 neutral +486 neutral +487 neutral +488 contradiction +489 entailment +490 neutral +491 entailment +492 entailment +493 neutral +494 entailment +495 contradiction +496 contradiction +497 neutral +498 entailment +499 neutral +500 contradiction +501 entailment +502 entailment +503 entailment +504 entailment +505 contradiction +506 contradiction +507 neutral +508 neutral +509 entailment +510 contradiction +511 contradiction +512 neutral +513 contradiction +514 entailment +515 entailment +516 neutral +517 entailment +518 contradiction +519 neutral +520 contradiction +521 neutral +522 neutral +523 entailment +524 neutral +525 contradiction +526 entailment +527 contradiction +528 entailment +529 contradiction +530 contradiction +531 contradiction +532 neutral +533 neutral +534 contradiction +535 contradiction +536 entailment +537 neutral +538 contradiction +539 contradiction +540 contradiction +541 neutral +542 neutral +543 neutral +544 entailment +545 contradiction +546 neutral +547 contradiction +548 neutral +549 entailment +550 neutral +551 contradiction +552 entailment +553 neutral +554 entailment +555 contradiction +556 entailment +557 neutral +558 entailment +559 entailment +560 entailment +561 entailment +562 entailment +563 entailment +564 neutral +565 entailment +566 contradiction +567 neutral +568 contradiction +569 contradiction +570 neutral +571 entailment +572 entailment +573 entailment +574 neutral +575 contradiction +576 neutral +577 contradiction +578 contradiction +579 contradiction +580 contradiction +581 contradiction +582 contradiction +583 entailment +584 contradiction +585 neutral +586 neutral +587 entailment +588 entailment +589 neutral +590 entailment +591 contradiction +592 contradiction +593 entailment +594 entailment +595 contradiction +596 contradiction +597 contradiction +598 entailment +599 contradiction +600 entailment +601 neutral +602 neutral +603 entailment +604 contradiction +605 contradiction +606 entailment +607 neutral +608 contradiction +609 contradiction +610 entailment +611 neutral +612 entailment +613 neutral +614 entailment +615 neutral +616 entailment +617 contradiction +618 contradiction +619 neutral +620 contradiction +621 entailment +622 neutral +623 neutral +624 contradiction +625 entailment +626 entailment +627 entailment +628 neutral +629 contradiction +630 contradiction +631 contradiction +632 neutral +633 contradiction +634 neutral +635 entailment +636 entailment +637 contradiction +638 contradiction +639 contradiction +640 entailment +641 entailment +642 neutral +643 entailment +644 neutral +645 neutral +646 neutral +647 neutral +648 neutral +649 contradiction +650 contradiction +651 neutral +652 contradiction +653 neutral +654 neutral 
+655 contradiction +656 contradiction +657 entailment +658 entailment +659 contradiction +660 entailment +661 contradiction +662 neutral +663 contradiction +664 entailment +665 contradiction +666 contradiction +667 entailment +668 contradiction +669 contradiction +670 neutral +671 neutral +672 contradiction +673 contradiction +674 entailment +675 neutral +676 contradiction +677 entailment +678 neutral +679 entailment +680 contradiction +681 neutral +682 entailment +683 contradiction +684 entailment +685 neutral +686 entailment +687 contradiction +688 neutral +689 contradiction +690 contradiction +691 entailment +692 contradiction +693 contradiction +694 contradiction +695 neutral +696 contradiction +697 neutral +698 contradiction +699 contradiction +700 entailment +701 neutral +702 contradiction +703 contradiction +704 entailment +705 entailment +706 contradiction +707 contradiction +708 entailment +709 neutral +710 entailment +711 entailment +712 contradiction +713 contradiction +714 entailment +715 neutral +716 neutral +717 entailment +718 neutral +719 neutral +720 neutral +721 contradiction +722 entailment +723 entailment +724 neutral +725 neutral +726 contradiction +727 entailment +728 entailment +729 contradiction +730 contradiction +731 neutral +732 contradiction +733 neutral +734 entailment +735 contradiction +736 entailment +737 contradiction +738 contradiction +739 neutral +740 neutral +741 entailment +742 entailment +743 entailment +744 contradiction +745 neutral +746 neutral +747 neutral +748 neutral +749 neutral +750 neutral +751 neutral +752 entailment +753 contradiction +754 entailment +755 neutral +756 contradiction +757 contradiction +758 entailment +759 contradiction +760 neutral +761 contradiction +762 entailment +763 neutral +764 contradiction +765 neutral +766 neutral +767 entailment +768 contradiction +769 entailment +770 neutral +771 contradiction +772 contradiction +773 entailment +774 contradiction +775 entailment +776 contradiction +777 contradiction +778 entailment +779 contradiction +780 entailment +781 neutral +782 entailment +783 entailment +784 contradiction +785 entailment +786 entailment +787 entailment +788 entailment +789 contradiction +790 contradiction +791 entailment +792 entailment +793 contradiction +794 contradiction +795 contradiction +796 neutral +797 entailment +798 contradiction +799 neutral +800 neutral +801 entailment +802 contradiction +803 contradiction +804 entailment +805 contradiction +806 neutral +807 entailment +808 neutral +809 entailment +810 contradiction +811 entailment +812 entailment +813 entailment +814 entailment +815 neutral +816 neutral +817 neutral +818 neutral +819 entailment +820 neutral +821 entailment +822 entailment +823 contradiction +824 neutral +825 contradiction +826 entailment +827 entailment +828 neutral +829 neutral +830 entailment +831 neutral +832 contradiction +833 entailment +834 entailment +835 neutral +836 entailment +837 contradiction +838 entailment +839 contradiction +840 entailment +841 contradiction +842 entailment +843 neutral +844 entailment +845 entailment +846 contradiction +847 entailment +848 entailment +849 neutral +850 contradiction +851 neutral +852 contradiction +853 contradiction +854 contradiction +855 contradiction +856 contradiction +857 entailment +858 contradiction +859 contradiction +860 contradiction +861 contradiction +862 contradiction +863 contradiction +864 entailment +865 entailment +866 neutral +867 neutral +868 entailment +869 contradiction +870 neutral +871 entailment +872 
neutral +873 contradiction +874 entailment +875 contradiction +876 contradiction +877 contradiction +878 contradiction +879 neutral +880 neutral +881 contradiction +882 contradiction +883 neutral +884 contradiction +885 entailment +886 entailment +887 contradiction +888 contradiction +889 neutral +890 neutral +891 neutral +892 entailment +893 entailment +894 entailment +895 entailment +896 entailment +897 entailment +898 contradiction +899 entailment +900 entailment +901 entailment +902 contradiction +903 entailment +904 entailment +905 entailment +906 entailment +907 contradiction +908 entailment +909 entailment +910 entailment +911 contradiction +912 entailment +913 neutral +914 entailment +915 neutral +916 entailment +917 neutral +918 contradiction +919 contradiction +920 neutral +921 neutral +922 entailment +923 neutral +924 entailment +925 contradiction +926 contradiction +927 contradiction +928 contradiction +929 entailment +930 entailment +931 entailment +932 contradiction +933 contradiction +934 contradiction +935 neutral +936 neutral +937 contradiction +938 entailment +939 neutral +940 entailment +941 neutral +942 entailment +943 contradiction +944 contradiction +945 neutral +946 neutral +947 neutral +948 contradiction +949 contradiction +950 contradiction +951 contradiction +952 entailment +953 neutral +954 entailment +955 neutral +956 neutral +957 neutral +958 neutral +959 neutral +960 neutral +961 contradiction +962 entailment +963 neutral +964 contradiction +965 neutral +966 contradiction +967 entailment +968 entailment +969 neutral +970 contradiction +971 contradiction +972 entailment +973 neutral +974 contradiction +975 contradiction +976 contradiction +977 entailment +978 entailment +979 contradiction +980 neutral +981 entailment +982 contradiction +983 contradiction +984 contradiction +985 neutral +986 neutral +987 contradiction +988 neutral +989 entailment +990 entailment +991 neutral +992 neutral +993 contradiction +994 neutral +995 contradiction +996 neutral +997 contradiction +998 neutral +999 neutral +1000 neutral +1001 neutral +1002 contradiction +1003 contradiction +1004 neutral +1005 neutral +1006 neutral +1007 entailment +1008 contradiction +1009 entailment +1010 entailment +1011 neutral +1012 neutral +1013 neutral +1014 contradiction +1015 neutral +1016 contradiction +1017 contradiction +1018 entailment +1019 entailment +1020 contradiction +1021 contradiction +1022 contradiction +1023 entailment +1024 contradiction +1025 contradiction +1026 entailment +1027 neutral +1028 contradiction +1029 contradiction +1030 neutral +1031 neutral +1032 neutral +1033 entailment +1034 neutral +1035 entailment +1036 entailment +1037 entailment +1038 contradiction +1039 contradiction +1040 entailment +1041 neutral +1042 entailment +1043 contradiction +1044 entailment +1045 entailment +1046 contradiction +1047 contradiction +1048 neutral +1049 neutral +1050 entailment +1051 contradiction +1052 neutral +1053 entailment +1054 neutral +1055 neutral +1056 neutral +1057 contradiction +1058 contradiction +1059 contradiction +1060 entailment +1061 contradiction +1062 neutral +1063 entailment +1064 neutral +1065 contradiction +1066 contradiction +1067 neutral +1068 contradiction +1069 neutral +1070 entailment +1071 contradiction +1072 neutral +1073 entailment +1074 contradiction +1075 entailment +1076 neutral +1077 neutral +1078 entailment +1079 entailment +1080 neutral +1081 entailment +1082 contradiction +1083 entailment +1084 neutral +1085 contradiction +1086 contradiction +1087 
contradiction +1088 neutral +1089 neutral +1090 neutral +1091 entailment +1092 neutral +1093 neutral +1094 entailment +1095 contradiction +1096 contradiction +1097 neutral +1098 neutral +1099 neutral +1100 contradiction +1101 contradiction +1102 contradiction +1103 neutral diff --git a/nlu/experiments/glue/glue_submission/WNLI.tsv b/nlu/experiments/glue/glue_submission/WNLI.tsv new file mode 100644 index 0000000000000000000000000000000000000000..eb860c9ad02cb109de8c6dbb12eb7e4dbae60090 --- /dev/null +++ b/nlu/experiments/glue/glue_submission/WNLI.tsv @@ -0,0 +1,147 @@ +index prediction +0 1 +1 1 +2 0 +3 0 +4 0 +5 1 +6 1 +7 1 +8 1 +9 1 +10 1 +11 1 +12 1 +13 1 +14 0 +15 0 +16 0 +17 0 +18 1 +19 1 +20 1 +21 1 +22 1 +23 0 +24 1 +25 1 +26 1 +27 0 +28 1 +29 0 +30 0 +31 1 +32 0 +33 1 +34 1 +35 0 +36 0 +37 1 +38 0 +39 1 +40 1 +41 0 +42 0 +43 1 +44 1 +45 1 +46 1 +47 0 +48 0 +49 1 +50 0 +51 1 +52 1 +53 0 +54 1 +55 0 +56 1 +57 1 +58 0 +59 1 +60 0 +61 0 +62 1 +63 0 +64 0 +65 1 +66 0 +67 0 +68 1 +69 0 +70 1 +71 0 +72 1 +73 0 +74 0 +75 0 +76 1 +77 1 +78 0 +79 1 +80 0 +81 1 +82 0 +83 1 +84 1 +85 0 +86 1 +87 0 +88 1 +89 1 +90 1 +91 1 +92 1 +93 0 +94 0 +95 1 +96 1 +97 0 +98 0 +99 1 +100 0 +101 1 +102 0 +103 0 +104 1 +105 0 +106 0 +107 0 +108 0 +109 1 +110 1 +111 1 +112 0 +113 1 +114 0 +115 1 +116 1 +117 1 +118 0 +119 1 +120 0 +121 1 +122 0 +123 1 +124 1 +125 1 +126 0 +127 0 +128 0 +129 0 +130 1 +131 1 +132 0 +133 1 +134 1 +135 1 +136 0 +137 0 +138 0 +139 1 +140 1 +141 1 +142 0 +143 1 +144 0 +145 0 diff --git a/nlu/experiments/glue/mnli.sh b/nlu/experiments/glue/mnli.sh new file mode 100644 index 0000000000000000000000000000000000000000..df08fdb62c0666003f6481c4384d58b12e591016 --- /dev/null +++ b/nlu/experiments/glue/mnli.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +cache_dir=/tmp/DeBERTa/ +base_model=deberta-v3-base +task=MNLI + +python -m DeBERTa.apps.run --model_config config.json \ + --tag $base_model \ + --do_train \ + --do_eval \ + --do_predict \ + --max_seq_len 256 \ + --dump_interval 1000 \ + --num_train_epochs 8 \ + --fp16 True \ + --warmup 1000 \ + --learning_rate 2e-3 \ + --train_batch_size 32 \ + --cls_drop_out 0.1 \ + --task_name $task \ + --data_dir $cache_dir/glue_tasks/$task \ + --init_model $base_model \ + --output_dir $cache_dir/outputs/$base_model/$task \ + --eval_batch_size 256 \ + --predict_batch_size 256 \ + diff --git a/nlu/experiments/glue/mrpc.sh b/nlu/experiments/glue/mrpc.sh new file mode 100644 index 0000000000000000000000000000000000000000..9c6ccb5315eb388a06d668b70af9a13ae6c2dee2 --- /dev/null +++ b/nlu/experiments/glue/mrpc.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +cache_dir=/tmp/DeBERTa/ +base_model=deberta-v3-base +task=MRPC + +python -m DeBERTa.apps.run --model_config config.json \ + --tag $base_model \ + --do_train \ + --do_eval \ + --do_predict \ + --max_seq_len 320 \ + --dump_interval 100 \ + --num_train_epochs 60 \ + --fp16 True \ + --warmup 50 \ + --learning_rate 6e-3 \ + --train_batch_size 32 \ + --cls_drop_out 0.1 \ + --task_name $task \ + --data_dir $cache_dir/glue_tasks/$task \ + --init_model $base_model \ + --output_dir $cache_dir/outputs/$base_model/$task \ + --eval_batch_size 256 \ + --predict_batch_size 256 \ diff --git a/nlu/experiments/glue/patch.diff b/nlu/experiments/glue/patch.diff new file mode 100644 index 0000000000000000000000000000000000000000..3dcd78717b1962d7c43634acaa8f8aa30c2c1f11 --- /dev/null +++ b/nlu/experiments/glue/patch.diff @@ -0,0 +1,32 @@ +--- download_glue_data.py 2021-02-01 18:22:04.664290174 -0500 ++++ download_glue_data_fixed.py 2021-02-01 
18:21:13.399941815 -0500 +@@ -31,18 +31,18 @@ + + TASKS = ["CoLA", "SST", "MRPC", "QQP", "STS", "MNLI", "SNLI", "QNLI", "RTE", "WNLI", "diagnostic"] + TASK2PATH = { +- "CoLA": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FCoLA.zip?alt=media&token=46d5e637-3411-4188-bc44-5809b5bfb5f4", # noqa +- "SST": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSST-2.zip?alt=media&token=aabc5f6b-e466-44a2-b9b4-cf6337f84ac8", # noqa +- "MRPC": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2Fmrpc_dev_ids.tsv?alt=media&token=ec5c0836-31d5-48f4-b431-7480817f1adc", # noqa +- "QQP": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FQQP-clean.zip?alt=media&token=11a647cb-ecd3-49c9-9d31-79f8ca8fe277", # noqa +- "STS": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSTS-B.zip?alt=media&token=bddb94a7-8706-4e0d-a694-1109e12273b5", # noqa +- "MNLI": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FMNLI.zip?alt=media&token=50329ea1-e339-40e2-809c-10c40afff3ce", # noqa +- "SNLI": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSNLI.zip?alt=media&token=4afcfbb2-ff0c-4b2d-a09a-dbf07926f4df", # noqa +- "QNLI": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FQNLIv2.zip?alt=media&token=6fdcf570-0fc5-4631-8456-9505272d1601", # noqa +- "RTE": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FRTE.zip?alt=media&token=5efa7e85-a0bb-4f19-8ea2-9e1840f077fb", # noqa +- "WNLI": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FWNLI.zip?alt=media&token=068ad0a0-ded7-4bd7-99a5-5e00222e0faf", # noqa ++ "CoLA": "https://dl.fbaipublicfiles.com/glue/data/CoLA.zip", ++ "SST": "https://dl.fbaipublicfiles.com/glue/data/SST-2.zip", ++ "MRPC": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2Fmrpc_dev_ids.tsv?alt=media&token=ec5c0836-31d5-48f4-b431-7480817f1adc", ++ "QQP": "https://dl.fbaipublicfiles.com/glue/data/QQP-clean.zip", ++ "STS": "https://dl.fbaipublicfiles.com/glue/data/STS-B.zip", ++ "MNLI": "https://dl.fbaipublicfiles.com/glue/data/MNLI.zip", ++ "SNLI": "https://dl.fbaipublicfiles.com/glue/data/SNLI.zip", ++ "QNLI": "https://dl.fbaipublicfiles.com/glue/data/QNLIv2.zip", ++ "RTE": "https://dl.fbaipublicfiles.com/glue/data/RTE.zip", ++ "WNLI": "https://dl.fbaipublicfiles.com/glue/data/WNLI.zip", + "diagnostic": [ +- "https://storage.googleapis.com/mtl-sentence-representations.appspot.com/tsvsWithoutLabels%2FAX.tsv?GoogleAccessId=firebase-adminsdk-0khhl@mtl-sentence-representations.iam.gserviceaccount.com&Expires=2498860800&Signature=DuQ2CSPt2Yfre0C%2BiISrVYrIFaZH1Lc7hBVZDD4ZyR7fZYOMNOUGpi8QxBmTNOrNPjR3z1cggo7WXFfrgECP6FBJSsURv8Ybrue8Ypt%2FTPxbuJ0Xc2FhDi%2BarnecCBFO77RSbfuz%2Bs95hRrYhTnByqu3U%2FYZPaj3tZt5QdfpH2IUROY8LiBXoXS46LE%2FgOQc%2FKN%2BA9SoscRDYsnxHfG0IjXGwHN%2Bf88q6hOmAxeNPx6moDulUF6XMUAaXCSFU%2BnRO2RDL9CapWxj%2BDl7syNyHhB7987hZ80B%2FwFkQ3MEs8auvt5XW1%2Bd4aCU7ytgM69r8JDCwibfhZxpaa4gd50QXQ%3D%3D", # noqa ++ "https://dl.fbaipublicfiles.com/glue/data/AX.tsv", + "https://www.dropbox.com/s/ju7d95ifb072q9f/diagnostic-full.tsv?dl=1", + ], + } diff --git a/nlu/experiments/glue/pseudo.ipynb b/nlu/experiments/glue/pseudo.ipynb new 
file mode 100644 index 0000000000000000000000000000000000000000..f450c32396b7fe99cd863e1aebbc927a0ab3467a --- /dev/null +++ b/nlu/experiments/glue/pseudo.ipynb @@ -0,0 +1,110 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "4dce6d03", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Generating glue_submission/AX.tsv with 1104 samples...\n", + "Generating glue_submission/WNLI.tsv with 146 samples...\n", + "Done! Dummy files correspond to GLUE submission standards.\n" + ] + } + ], + "source": [ + "import os\n", + "import csv\n", + "import random\n", + "\n", + "def generate_glue_dummy_files(output_dir=\"glue_submission\"):\n", + " \"\"\"\n", + " Generates dummy submission files for AX and WNLI tasks \n", + " to satisfy GLUE benchmark submission requirements.\n", + " \"\"\"\n", + " \n", + " # Create the directory if it doesn't exist\n", + " if not os.path.exists(output_dir):\n", + " os.makedirs(output_dir)\n", + " # Log: Directory created\n", + " print(f\"Created directory: {output_dir}\")\n", + "\n", + " # ---------------------------------------------------------\n", + " # 1. Generate AX.tsv (Diagnostic Dataset)\n", + " # Specs: 1104 samples.\n", + " # Format: index (int), prediction (string: entailment/neutral/contradiction)\n", + " # ---------------------------------------------------------\n", + " ax_filename = os.path.join(output_dir, \"AX.tsv\")\n", + " ax_count = 1104\n", + " # NLI labels for submission are typically strings\n", + " ax_labels = [\"entailment\", \"neutral\", \"contradiction\"]\n", + "\n", + " print(f\"Generating {ax_filename} with {ax_count} samples...\")\n", + "\n", + " with open(ax_filename, mode='w', newline='', encoding='utf-8') as f:\n", + " writer = csv.writer(f, delimiter='\\t')\n", + " \n", + " # Write header\n", + " writer.writerow([\"index\", \"prediction\"])\n", + " \n", + " for i in range(ax_count):\n", + " # Pick a random label since we are not actually testing this task\n", + " pred = random.choice(ax_labels)\n", + " writer.writerow([i, pred])\n", + "\n", + " # ---------------------------------------------------------\n", + " # 2. Generate WNLI.tsv (Winograd NLI)\n", + " # Specs: 146 samples.\n", + " # Format: index (int), prediction (int: 0 or 1)\n", + " # ---------------------------------------------------------\n", + " wnli_filename = os.path.join(output_dir, \"WNLI.tsv\")\n", + " wnli_count = 146\n", + " # WNLI labels are typically 0 (not entailment) or 1 (entailment)\n", + " wnli_labels = [0, 1]\n", + "\n", + " print(f\"Generating {wnli_filename} with {wnli_count} samples...\")\n", + "\n", + " with open(wnli_filename, mode='w', newline='', encoding='utf-8') as f:\n", + " writer = csv.writer(f, delimiter='\\t')\n", + " \n", + " # Write header\n", + " writer.writerow([\"index\", \"prediction\"])\n", + " \n", + " for i in range(wnli_count):\n", + " # Pick a random label\n", + " pred = random.choice(wnli_labels)\n", + " writer.writerow([i, pred])\n", + "\n", + " print(\"Done! 
Dummy files correspond to GLUE submission standards.\")\n", + "\n", + "if __name__ == \"__main__\":\n", + " generate_glue_dummy_files()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "allm", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/nlu/experiments/glue/qnli.sh b/nlu/experiments/glue/qnli.sh new file mode 100644 index 0000000000000000000000000000000000000000..8b5ef126c7d42c3730ddab1e442f2da6141190f9 --- /dev/null +++ b/nlu/experiments/glue/qnli.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +cache_dir=/tmp/DeBERTa/ +base_model=deberta-v3-base +task=QNLI + +python -m DeBERTa.apps.run --model_config config.json \ + --tag $base_model \ + --do_train \ + --do_eval \ + --do_predict \ + --max_seq_len 512 \ + --dump_interval 100 \ + --num_train_epochs 12 \ + --fp16 True \ + --warmup 500 \ + --learning_rate 1e-3 \ + --train_batch_size 32 \ + --cls_drop_out 0.1 \ + --task_name $task \ + --data_dir $cache_dir/glue_tasks/$task \ + --init_model $base_model \ + --output_dir $cache_dir/outputs/$base_model/$task \ + --eval_batch_size 256 \ + --predict_batch_size 256 \ \ No newline at end of file diff --git a/nlu/experiments/glue/qqp.sh b/nlu/experiments/glue/qqp.sh new file mode 100644 index 0000000000000000000000000000000000000000..28461ea30575ccdae9df7403edb611a24c9f105f --- /dev/null +++ b/nlu/experiments/glue/qqp.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +cache_dir=/tmp/DeBERTa/ +base_model=deberta-v3-base +task=QQP + +python -m DeBERTa.apps.run --model_config config.json \ + --tag $base_model \ + --do_train \ + --do_eval \ + --do_predict \ + --max_seq_len 320 \ + --dump_interval 500 \ + --num_train_epochs 10 \ + --fp16 True \ + --warmup 1000 \ + --learning_rate 9e-4 \ + --train_batch_size 32 \ + --cls_drop_out 0.1 \ + --task_name $task \ + --data_dir $cache_dir/glue_tasks/$task \ + --init_model $base_model \ + --output_dir $cache_dir/outputs/$base_model/$task \ + --eval_batch_size 256 \ + --predict_batch_size 256 \ + diff --git a/nlu/experiments/glue/rte.sh b/nlu/experiments/glue/rte.sh new file mode 100644 index 0000000000000000000000000000000000000000..e22697a3936935d607ae4a77678bca2934545feb --- /dev/null +++ b/nlu/experiments/glue/rte.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +cache_dir=/tmp/DeBERTa/ +base_model=deberta-v3-base +task=RTE + +python -m DeBERTa.apps.run --model_config config.json \ + --tag $base_model \ + --do_train \ + --do_eval \ + --do_predict \ + --num_train_epochs 4 \ + --dump_interval 100 \ + --fp16 False \ + --warmup 50 \ + --learning_rate 5e-3 \ + --train_batch_size 32 \ + --max_seq_len 320 \ + --cls_drop_out 0.0 \ + --task_name $task \ + --data_dir $cache_dir/glue_tasks/$task \ + --init_model $base_model \ + --output_dir $cache_dir/outputs/$base_model/$task \ + --eval_batch_size 256 \ + --predict_batch_size 256 \ diff --git a/nlu/experiments/glue/sst2.sh b/nlu/experiments/glue/sst2.sh new file mode 100644 index 0000000000000000000000000000000000000000..cb95a88ab9413179ee4ea1517292e597d1500a2c --- /dev/null +++ b/nlu/experiments/glue/sst2.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +cache_dir=/tmp/DeBERTa/ +base_model=deberta-v3-base +task=SST-2 + +python -m DeBERTa.apps.run --model_config config.json \ + --tag $base_model \ + --do_train \ + --do_eval \ + --do_predict \ + 
--max_seq_len 128 \ + --dump_interval 100 \ + --num_train_epochs 16 \ + --fp16 True \ + --warmup 500 \ + --learning_rate 1e-3 \ + --train_batch_size 32 \ + --cls_drop_out 0.1 \ + --task_name $task \ + --data_dir $cache_dir/glue_tasks/$task \ + --init_model $base_model \ + --output_dir $cache_dir/outputs/$base_model/$task \ + --eval_batch_size 256 \ + --predict_batch_size 256 \ \ No newline at end of file diff --git a/nlu/experiments/glue/stsb.sh b/nlu/experiments/glue/stsb.sh new file mode 100644 index 0000000000000000000000000000000000000000..fffe8953538071249c1b84a4bfcca961ab22999b --- /dev/null +++ b/nlu/experiments/glue/stsb.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +cache_dir=/tmp/DeBERTa/ +base_model=deberta-v3-base +task=STS-B + +python -m DeBERTa.apps.run --model_config config.json \ + --tag $base_model \ + --do_train \ + --do_eval \ + --do_predict \ + --max_seq_len 128 \ + --dump_interval 100 \ + --num_train_epochs 39 \ + --fp16 True \ + --warmup 50 \ + --learning_rate 5e-3 \ + --train_batch_size 32 \ + --cls_drop_out 0.1 \ + --task_name $task \ + --data_dir $cache_dir/glue_tasks/$task \ + --init_model $base_model \ + --output_dir $cache_dir/outputs/$base_model/$task \ + --eval_batch_size 256 \ + --predict_batch_size 256 \
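Taken together, the per-task scripts above (mnli.sh through stsb.sh) fine-tune deberta-v3-base on each GLUE task, while pseudo.ipynb fills in the AX and WNLI slots of a submission with dummy predictions. As a rough illustration of how these pieces might be tied together, the sketch below shows an assumed driver script (here called run_glue_all.sh, a name not present in this diff); the use of zip and the expectation that the GLUE site accepts a flat archive of per-task TSVs are likewise assumptions, not something the diff itself specifies.

#!/bin/bash
# run_glue_all.sh -- hypothetical driver, an illustrative sketch only (not part of this diff).
# Assumed to be launched from nlu/experiments/glue/.
set -e

# Fine-tune, evaluate, and predict for each GLUE task with the scripts added above.
for task_script in mnli.sh mrpc.sh qnli.sh qqp.sh rte.sh sst2.sh stsb.sh; do
    bash "$task_script"
done

# Per-task outputs land under /tmp/DeBERTa/outputs/deberta-v3-base/<TASK>/ (per the scripts'
# --output_dir) and would still need to be collected into index/prediction TSVs in
# glue_submission/; how DeBERTa names its prediction files is not shown in this diff.
# The dummy AX.tsv and WNLI.tsv in glue_submission/ come from pseudo.ipynb. Once
# glue_submission/ holds one TSV per task, package it for upload (the flat-zip layout is an
# assumption about the GLUE submission format).
( cd glue_submission && zip ../submission.zip *.tsv )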