nvan13 commited on
Commit
ab0f6ec
·
verified ·
1 Parent(s): f4dcc30

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +6 -0
  2. assets/control.png +3 -0
  3. assets/subject.png +3 -0
  4. generation/control/ControlNet/font/DejaVuSans.ttf +3 -0
  5. generation/control/ControlNet/ldm/modules/image_degradation/utils/test.png +3 -0
  6. llama/data/MetaMathQA-40K.json +3 -0
  7. llama/data/MetaMathQA.json +3 -0
  8. llama/output/cp1e4/ft/adapter_model.safetensors +3 -0
  9. llama/output/cp1e4/ft/tokenizer.model +3 -0
  10. llama/output/cp1e5/ft/adapter_model.safetensors +3 -0
  11. llama/output/cp1e5N/ft/adapter_model.safetensors +3 -0
  12. llama/output/cp1e5N/ft/tokenizer.model +3 -0
  13. llama/output/cp3e5/ft/adapter_model.safetensors +3 -0
  14. llama/output/cp3e5N/ft/adapter_model.safetensors +3 -0
  15. llama/output/cp3e5N/ft/tokenizer.model +3 -0
  16. llama/output/cpr1/ft/adapter_model.safetensors +3 -0
  17. llama/output/cpr1/ft/tokenizer.model +3 -0
  18. llama/output/cpr2/ft/adapter_model.safetensors +3 -0
  19. llama/output/cpr2/ft/tokenizer.model +3 -0
  20. nlu/DeBERTa.egg-info/PKG-INFO +39 -0
  21. nlu/DeBERTa.egg-info/SOURCES.txt +73 -0
  22. nlu/DeBERTa.egg-info/dependency_links.txt +1 -0
  23. nlu/DeBERTa.egg-info/requires.txt +19 -0
  24. nlu/DeBERTa.egg-info/top_level.txt +2 -0
  25. nlu/DeBERTa/apps/tasks/task_registry.py +70 -0
  26. nlu/DeBERTa/data/__init__.py +5 -0
  27. nlu/DeBERTa/data/async_data.py +38 -0
  28. nlu/DeBERTa/data/data_sampler.py +76 -0
  29. nlu/DeBERTa/data/dataloader.py +511 -0
  30. nlu/DeBERTa/data/dynamic_dataset.py +60 -0
  31. nlu/DeBERTa/data/example.py +105 -0
  32. nlu/DeBERTa/deberta/__init__.py +22 -0
  33. nlu/DeBERTa/deberta/bert.py +308 -0
  34. nlu/DeBERTa/deberta/cache_utils.py +135 -0
  35. nlu/DeBERTa/deberta/config.py +90 -0
  36. nlu/DeBERTa/deberta/da_utils.py +68 -0
  37. nlu/DeBERTa/deberta/deberta.py +145 -0
  38. nlu/DeBERTa/deberta/disentangled_attention.py +221 -0
  39. nlu/DeBERTa/deberta/gpt2_bpe_utils.py +163 -0
  40. nlu/DeBERTa/deberta/gpt2_tokenizer.py +216 -0
  41. nlu/DeBERTa/deberta/mlm.py +38 -0
  42. nlu/DeBERTa/deberta/nnmodule.py +137 -0
  43. nlu/DeBERTa/deberta/ops.py +228 -0
  44. nlu/DeBERTa/deberta/pooling.py +88 -0
  45. nlu/DeBERTa/deberta/pretrained_models.py +2 -0
  46. nlu/DeBERTa/deberta/spm_tokenizer.py +322 -0
  47. nlu/DeBERTa/deberta/tokenizers.py +16 -0
  48. nlu/DeBERTa/optims/__init__.py +16 -0
  49. nlu/DeBERTa/optims/args.py +100 -0
  50. nlu/DeBERTa/optims/fp16_optimizer.py +301 -0
.gitattributes CHANGED
@@ -33,3 +33,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ assets/control.png filter=lfs diff=lfs merge=lfs -text
37
+ assets/subject.png filter=lfs diff=lfs merge=lfs -text
38
+ generation/control/ControlNet/font/DejaVuSans.ttf filter=lfs diff=lfs merge=lfs -text
39
+ generation/control/ControlNet/ldm/modules/image_degradation/utils/test.png filter=lfs diff=lfs merge=lfs -text
40
+ llama/data/MetaMathQA-40K.json filter=lfs diff=lfs merge=lfs -text
41
+ llama/data/MetaMathQA.json filter=lfs diff=lfs merge=lfs -text
assets/control.png ADDED

Git LFS Details

  • SHA256: b1943c7d2d2042fd1f5455f7c85509c7fc2299221d3118caf8369807b99ff451
  • Pointer size: 132 Bytes
  • Size of remote file: 1.05 MB
assets/subject.png ADDED

Git LFS Details

  • SHA256: d115037067258634d251581e308b6509fd9b8190b6084d00a211b6886dd379c7
  • Pointer size: 131 Bytes
  • Size of remote file: 966 kB
generation/control/ControlNet/font/DejaVuSans.ttf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7da195a74c55bef988d0d48f9508bd5d849425c1770dba5d7bfc6ce9ed848954
3
+ size 757076
generation/control/ControlNet/ldm/modules/image_degradation/utils/test.png ADDED

Git LFS Details

  • SHA256: 92e516278f0d3e85e84cfb55b43338e12d5896a0ee3833aafdf378025457d753
  • Pointer size: 131 Bytes
  • Size of remote file: 441 kB
llama/data/MetaMathQA-40K.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c884f10e8aa1229a6e73a6bba2c9134ee0c7b7de92a02a7b8c9459085a59e117
3
+ size 31076207
llama/data/MetaMathQA.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fb39a5d8c05c042ece92eae37dfd5ea414a5979df2bf3ad3b86411bef8205725
3
+ size 395626321
llama/output/cp1e4/ft/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7e1c2fceb4f91331d69364aa56d01dd2103d4e59066f1519f1242a62ecca387a
3
+ size 1082171824
llama/output/cp1e4/ft/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
llama/output/cp1e5/ft/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f6121d3f7682fd21f70fc78ab9097b22ede67191507c54d44a9bd9c30adf44de
3
+ size 592928
llama/output/cp1e5N/ft/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d85146aea100acda2fd5bb5a011f8d1e14983756bb0c102bf85efe04ac176479
3
+ size 1082171824
llama/output/cp1e5N/ft/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
llama/output/cp3e5/ft/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1945e74d818ded53f08bc892bb458dd0e6addcd548b2f864dbd16a476a8954ef
3
+ size 1082171824
llama/output/cp3e5N/ft/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2396d96c0a301cceddf424fbdf7c7f3518311f90140fa9aad9053706288e9fc
3
+ size 1082171824
llama/output/cp3e5N/ft/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
llama/output/cpr1/ft/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:617c715b246fae47190ca1f8e304e9dbdadf6ac70bbfdd0f3bc3c4b1cd783c0d
3
+ size 1049665904
llama/output/cpr1/ft/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
llama/output/cpr2/ft/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:daede58d9fd4806298d90f9af12ba478c119afab844244f355f35ab3829eb029
3
+ size 1049665904
llama/output/cpr2/ft/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
nlu/DeBERTa.egg-info/PKG-INFO ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Metadata-Version: 2.1
2
+ Name: DeBERTa
3
+ Version: 0.1.13
4
+ Summary: Decoding enhanced BERT with Disentangled Attention
5
+ Home-page: https://github.com/microsoft/DeBERTa
6
+ Author: penhe
7
+ Author-email: penhe@microsoft.com
8
+ License: MIT
9
+ Keywords: NLP deep learning transformer pytorch Attention BERT RoBERTa DeBERTa
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.6
12
+ Classifier: Programming Language :: Python :: 3.7
13
+ Classifier: Programming Language :: Python :: 3.8
14
+ Classifier: Programming Language :: Python :: 3.9
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Operating System :: OS Independent
17
+ Requires-Python: >=3.6
18
+ Description-Content-Type: text/markdown
19
+ License-File: LICENSE
20
+ Requires-Dist: nltk
21
+ Requires-Dist: spacy
22
+ Requires-Dist: numpy
23
+ Requires-Dist: pytest
24
+ Requires-Dist: regex
25
+ Requires-Dist: scipy
26
+ Requires-Dist: scikit-learn
27
+ Requires-Dist: tqdm
28
+ Requires-Dist: ujson
29
+ Requires-Dist: seqeval
30
+ Requires-Dist: psutil
31
+ Requires-Dist: sentencepiece
32
+ Requires-Dist: torch
33
+ Provides-Extra: docs
34
+ Requires-Dist: recommonmark; extra == "docs"
35
+ Requires-Dist: sphinx; extra == "docs"
36
+ Requires-Dist: sphinx-markdown-tables; extra == "docs"
37
+ Requires-Dist: sphinx-rtd-theme; extra == "docs"
38
+
39
+ deberta long des
nlu/DeBERTa.egg-info/SOURCES.txt ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ LICENSE
2
+ setup.cfg
3
+ setup.py
4
+ DeBERTa/__init__.py
5
+ DeBERTa.egg-info/PKG-INFO
6
+ DeBERTa.egg-info/SOURCES.txt
7
+ DeBERTa.egg-info/dependency_links.txt
8
+ DeBERTa.egg-info/requires.txt
9
+ DeBERTa.egg-info/top_level.txt
10
+ DeBERTa/apps/__init__.py
11
+ DeBERTa/apps/_utils.py
12
+ DeBERTa/apps/run.py
13
+ DeBERTa/apps/models/__init__.py
14
+ DeBERTa/apps/models/masked_language_model.py
15
+ DeBERTa/apps/models/multi_choice.py
16
+ DeBERTa/apps/models/ner.py
17
+ DeBERTa/apps/models/record_qa.py
18
+ DeBERTa/apps/models/replaced_token_detection_model.py
19
+ DeBERTa/apps/models/sequence_classification.py
20
+ DeBERTa/apps/tasks/__init__.py
21
+ DeBERTa/apps/tasks/glue_tasks.py
22
+ DeBERTa/apps/tasks/metrics.py
23
+ DeBERTa/apps/tasks/mlm_task.py
24
+ DeBERTa/apps/tasks/ner_task.py
25
+ DeBERTa/apps/tasks/race_task.py
26
+ DeBERTa/apps/tasks/record_eval.py
27
+ DeBERTa/apps/tasks/rtd_task.py
28
+ DeBERTa/apps/tasks/superglue_tasks.py
29
+ DeBERTa/apps/tasks/task.py
30
+ DeBERTa/apps/tasks/task_registry.py
31
+ DeBERTa/data/__init__.py
32
+ DeBERTa/data/async_data.py
33
+ DeBERTa/data/data_sampler.py
34
+ DeBERTa/data/dataloader.py
35
+ DeBERTa/data/dynamic_dataset.py
36
+ DeBERTa/data/example.py
37
+ DeBERTa/deberta/__init__.py
38
+ DeBERTa/deberta/bert.py
39
+ DeBERTa/deberta/cache_utils.py
40
+ DeBERTa/deberta/config.py
41
+ DeBERTa/deberta/da_utils.py
42
+ DeBERTa/deberta/deberta.py
43
+ DeBERTa/deberta/disentangled_attention.py
44
+ DeBERTa/deberta/gpt2_bpe_utils.py
45
+ DeBERTa/deberta/gpt2_tokenizer.py
46
+ DeBERTa/deberta/mlm.py
47
+ DeBERTa/deberta/nnmodule.py
48
+ DeBERTa/deberta/ops.py
49
+ DeBERTa/deberta/pooling.py
50
+ DeBERTa/deberta/pretrained_models.py
51
+ DeBERTa/deberta/spm_tokenizer.py
52
+ DeBERTa/deberta/tokenizers.py
53
+ DeBERTa/optims/__init__.py
54
+ DeBERTa/optims/args.py
55
+ DeBERTa/optims/fp16_optimizer.py
56
+ DeBERTa/optims/lr_schedulers.py
57
+ DeBERTa/optims/xadam.py
58
+ DeBERTa/sift/__init__.py
59
+ DeBERTa/sift/sift.py
60
+ DeBERTa/training/__init__.py
61
+ DeBERTa/training/_utils.py
62
+ DeBERTa/training/args.py
63
+ DeBERTa/training/dist_launcher.py
64
+ DeBERTa/training/optimizer_utils.py
65
+ DeBERTa/training/trainer.py
66
+ DeBERTa/utils/__init__.py
67
+ DeBERTa/utils/argument_types.py
68
+ DeBERTa/utils/jit_tracing.py
69
+ DeBERTa/utils/logger_util.py
70
+ DeBERTa/utils/xtqdm.py
71
+ adapterlib/__init__.py
72
+ adapterlib/layers.py
73
+ adapterlib/utils.py
nlu/DeBERTa.egg-info/dependency_links.txt ADDED
@@ -0,0 +1 @@
 
 
1
+
nlu/DeBERTa.egg-info/requires.txt ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ nltk
2
+ spacy
3
+ numpy
4
+ pytest
5
+ regex
6
+ scipy
7
+ scikit-learn
8
+ tqdm
9
+ ujson
10
+ seqeval
11
+ psutil
12
+ sentencepiece
13
+ torch
14
+
15
+ [docs]
16
+ recommonmark
17
+ sphinx
18
+ sphinx-markdown-tables
19
+ sphinx-rtd-theme
nlu/DeBERTa.egg-info/top_level.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ DeBERTa
2
+ adapterlib
nlu/DeBERTa/apps/tasks/task_registry.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft, Inc. 2020
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+ #
6
+ # Author: penhe@microsoft.com
7
+ # Date: 01/25/2019
8
+ #
9
+
10
+ from glob import glob
11
+ import os
12
+ import importlib
13
+ import pdb
14
+ import sys
15
+ from ...utils import get_logger
16
+ from .task import Task
17
+
18
+ __all__ = ['load_tasks', 'register_task', 'get_task']
19
+ tasks={}
20
+
21
+ logger=get_logger()
22
+
23
def register_task(name=None, desc=None):
    """Class decorator that adds a Task subclass to the global ``tasks`` registry.

    Supports both the parameterized form ``@register_task(name=..., desc=...)``
    and the bare form ``@register_task`` applied directly to the class.

    Args:
        name: Registry key (lower-cased). Defaults to the class name. In the
            bare-decorator form this argument receives the class itself.
        desc: Human-readable description. Defaults to the resolved name.

    Returns:
        The decorated class (registration is a side effect on ``tasks``).
    """
    def register_task_x(cls):
        _name = name
        if _name is None:
            _name = cls.__name__

        _desc = desc
        if _desc is None:
            _desc = _name

        # Keys are case-insensitive; warn (don't fail) on re-registration so
        # re-importing a task module is harmless.
        _name = _name.lower()
        if _name in tasks:
            logger.warning(f'{_name} already registered in the registry: {tasks[_name]}.')
        assert issubclass(cls, Task), 'Registered class must be a subclass of Task.'
        tasks[_name] = cls
        cls._meta = {
            'name': _name,
            'desc': _desc}
        return cls

    # Bare-decorator form: `name` is actually the class being decorated.
    # isinstance(name, type) — unlike the brittle `type(name) == type` — also
    # accepts classes created with a custom metaclass.
    if isinstance(name, type):
        cls = name
        name = None
        return register_task_x(cls)
    return register_task_x
48
+
49
def load_tasks(task_dir = None):
    """Import every task module so their @register_task decorators populate `tasks`.

    Args:
        task_dir: Optional directory of user-supplied task modules to load in
            addition to the built-in ones shipped next to this file.
    """
    # Import every non-private .py module in this package directory; importing
    # runs the @register_task decorators, which fill the registry.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    sys_tasks = glob(os.path.join(script_dir, "*.py"))
    for t in sys_tasks:
        m = os.path.splitext(os.path.basename(t))[0]
        if not m.startswith('_'):
            importlib.import_module(f'DeBERTa.apps.tasks.{m}')

    # Optionally load custom task modules from an external directory. Note the
    # directory is appended to sys.path so the modules import as top-level names.
    if task_dir:
        assert os.path.exists(task_dir), f"{task_dir} must be a valid directory."
        customer_tasks = glob(os.path.join(task_dir, "*.py"))
        sys.path.append(task_dir)
        for t in customer_tasks:
            m = os.path.splitext(os.path.basename(t))[0]
            if not m.startswith('_'):
                importlib.import_module(f'{m}')
65
+
66
def get_task(name=None):
    """Look up a registered task by case-insensitive name.

    Called with no name, returns the whole registry dict. Raises KeyError for
    an unknown name.
    """
    return tasks if name is None else tasks[name.lower()]
nlu/DeBERTa/data/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ from .example import ExampleInstance,ExampleSet,example_to_feature
2
+ from .dataloader import SequentialDataLoader
3
+ from .dynamic_dataset import *
4
+ from .data_sampler import *
5
+ from .async_data import *
nlu/DeBERTa/data/async_data.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # This source code is licensed under the MIT license found in the
3
+ # LICENSE file in the root directory of this source tree.
4
+ #
5
+ # Author: Pengcheng He (penhe@microsoft.com)
6
+ # Date: 05/15/2019
7
+ #
8
+
9
+ from queue import Queue,Empty
10
+ from threading import Thread
11
class AsyncDataLoader(object):
    """Wraps a data loader and prefetches batches on a background thread.

    A producer thread pulls batches from the wrapped loader into a bounded
    queue (at most ``buffer_size`` batches buffered) while the consumer
    iterates, overlapping data loading with computation.
    """

    def __init__(self, dataloader, buffer_size=100):
        self.buffer_size = buffer_size
        self.dataloader = dataloader

    def __iter__(self):
        queue = Queue(self.buffer_size)
        dl = iter(self.dataloader)

        def _worker():
            # Drain the wrapped iterator into the queue; None marks the end.
            while True:
                try:
                    queue.put(next(dl))
                except StopIteration:
                    break
            queue.put(None)

        t = Thread(target=_worker)
        # Daemonize so an abandoned iterator cannot block interpreter exit.
        t.daemon = True
        t.start()
        while True:
            d = queue.get()
            if d is None:
                break
            yield d
        # The sentinel has been consumed, so the worker has finished its loop
        # and this join returns promptly (the old `del t` never waited at all).
        t.join()

    def __len__(self):
        return len(self.dataloader)
38
+
nlu/DeBERTa/data/data_sampler.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # This source code is licensed under the MIT license found in the
3
+ # LICENSE file in the root directory of this source tree.
4
+ #
5
+ # Author: Pengcheng He (penhe@microsoft.com)
6
+ # Date: 05/15/2019
7
+ #
8
+
9
+ import os
10
+ import numpy as np
11
+ import math
12
+ import sys
13
+ from torch.utils.data import Sampler
14
+
15
+ __all__=['BatchSampler', 'DistributedBatchSampler', 'RandomSampler', 'SequentialSampler']
16
class BatchSampler(Sampler):
    """Groups indices from a wrapped sampler into lists of ``batch_size``.

    The final batch may be shorter when the sampler length is not an exact
    multiple of the batch size.
    """

    def __init__(self, sampler, batch_size):
        self.sampler = sampler
        self.batch_size = batch_size

    def __iter__(self):
        pending = []
        for index in self.sampler:
            pending.append(index)
            if len(pending) == self.batch_size:
                yield pending
                pending = []
        # Emit the trailing partial batch, if any.
        if pending:
            yield pending

    def __len__(self):
        # Ceiling division: a partial trailing batch still counts.
        return -(-len(self.sampler) // self.batch_size)
33
+
34
class DistributedBatchSampler(Sampler):
    """Shards every batch from a wrapped batch sampler across distributed ranks.

    When a batch does not divide evenly by ``world_size`` it is either dropped
    (``drop_last=True``, which stops iteration) or padded in place with copies
    of its first element so each rank receives an equal-sized slice.
    """

    def __init__(self, sampler, rank=0, world_size = 1, drop_last = False):
        self.sampler = sampler
        self.rank = rank
        self.world_size = world_size
        self.drop_last = drop_last

    def __iter__(self):
        for batch in self.sampler:
            remainder = len(batch) % self.world_size
            if remainder != 0:
                if self.drop_last:
                    # Uneven batch (typically the last): stop here.
                    break
                # Pad with repeats of the first index so the split is even.
                batch.extend(batch[0] for _ in range(self.world_size - remainder))
            per_rank = len(batch) // self.world_size
            start = self.rank * per_rank
            yield batch[start:start + per_rank]

    def __len__(self):
        return len(self.sampler)
53
+
54
class RandomSampler(Sampler):
    """Yields every index in ``[0, total_samples)`` in a seeded random order.

    The RandomState stream persists across epochs: each new iteration
    reshuffles the stored permutation with the same generator, so successive
    epochs see different (but reproducible) orders.
    """

    def __init__(self, total_samples:int, data_seed:int = 0):
        self.indices = np.arange(total_samples)
        self.rng = np.random.RandomState(data_seed)

    def __iter__(self):
        # In-place shuffle of the persistent index array.
        self.rng.shuffle(self.indices)
        yield from self.indices

    def __len__(self):
        return len(self.indices)
66
+
67
class SequentialSampler(Sampler):
    """Yields indices ``0 .. total_samples-1`` in ascending order."""

    def __init__(self, total_samples:int):
        self.indices = np.arange(total_samples)

    def __iter__(self):
        yield from self.indices

    def __len__(self):
        return len(self.indices)
nlu/DeBERTa/data/dataloader.py ADDED
@@ -0,0 +1,511 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random
2
+ import torch
3
+ import torch.multiprocessing as multiprocessing
4
+ from torch._C import _set_worker_signal_handlers, \
5
+ _remove_worker_pids, _error_if_any_worker_fails
6
+
7
+ from packaging import version
8
+
9
+ if version.Version(torch.__version__) >= version.Version('1.0.0'):
10
+ from torch._C import _set_worker_pids
11
+ else:
12
+ from torch._C import _update_worker_pids as _set_worker_pids
13
+
14
+ from torch.utils.data import SequentialSampler, RandomSampler, BatchSampler, Sampler
15
+ import signal
16
+ import functools
17
+ import collections.abc
18
+ import re
19
+ import sys
20
+ import threading
21
+ import traceback
22
+ import os
23
+ import time
24
+ # from torch._six import string_classes
25
+ string_classes = str
26
+
27
+ IS_WINDOWS = sys.platform == "win32"
28
+ if IS_WINDOWS:
29
+ import ctypes
30
+ from ctypes.wintypes import DWORD, BOOL, HANDLE
31
+
32
+ if sys.version_info[0] == 2:
33
+ import Queue as queue
34
+ else:
35
+ import queue
36
+
37
+ __all__ = ['SequentialDataLoader']
38
+
39
class ExceptionWrapper(object):
    r"""Wraps an exception plus traceback to communicate across threads"""

    def __init__(self, exc_info):
        # exc_info is the (type, value, traceback) triple from sys.exc_info().
        # Keep the exception type plus a fully formatted message so the error
        # can be re-raised with context in the consuming thread/process.
        exc_type, _, _ = exc_info
        self.exc_type = exc_type
        self.exc_msg = "".join(traceback.format_exception(*exc_info))
45
+
46
+
47
+ _use_shared_memory = False
48
+ r"""Whether to use shared memory in default_collate"""
49
+
50
+ MANAGER_STATUS_CHECK_INTERVAL = 5.0
51
+
52
if IS_WINDOWS:
    # On Windows, the parent ID of the worker process remains unchanged when the manager process
    # is gone, and the only way to check it through OS is to let the worker have a process handle
    # of the manager and ask if the process status has changed.
    class ManagerWatchdog(object):
        """Lets a worker process detect that the manager (parent) process died."""

        def __init__(self):
            self.manager_pid = os.getppid()

            # Resolve the Win32 APIs needed to wait on the parent process handle.
            self.kernel32 = ctypes.WinDLL('kernel32', use_last_error=True)
            self.kernel32.OpenProcess.argtypes = (DWORD, BOOL, DWORD)
            self.kernel32.OpenProcess.restype = HANDLE
            self.kernel32.WaitForSingleObject.argtypes = (HANDLE, DWORD)
            self.kernel32.WaitForSingleObject.restype = DWORD

            # Value obtained from https://msdn.microsoft.com/en-us/library/ms684880.aspx
            SYNCHRONIZE = 0x00100000
            self.manager_handle = self.kernel32.OpenProcess(SYNCHRONIZE, 0, self.manager_pid)

            if not self.manager_handle:
                raise ctypes.WinError(ctypes.get_last_error())

        def is_alive(self):
            # Value obtained from https://msdn.microsoft.com/en-us/library/windows/desktop/ms687032.aspx
            # A zero-timeout wait returns WAIT_TIMEOUT (non-zero) while the
            # parent process is still running.
            return self.kernel32.WaitForSingleObject(self.manager_handle, 0) != 0
else:
    class ManagerWatchdog(object):
        """POSIX variant: once the parent dies the worker is re-parented, so
        getppid() stops matching the pid recorded at construction time."""

        def __init__(self):
            self.manager_pid = os.getppid()

        def is_alive(self):
            return os.getppid() == self.manager_pid
83
+
84
+
85
def _worker_loop(dataset, index_queue, data_queue, collate_fn, init_fn, worker_id):
    """Main loop of a DataLoader worker process.

    Pulls ``(idx, batch_indices)`` work items from ``index_queue``, collates
    the corresponding dataset samples, and puts ``(idx, batch)`` — or
    ``(idx, ExceptionWrapper)`` on failure — onto ``data_queue``.  Exits on a
    ``None`` sentinel or when the manager process is detected dead.
    """
    global _use_shared_memory
    # Enables the shared-memory fast path inside default_collate.
    _use_shared_memory = True

    # Initialize C side signal handlers for SIGBUS and SIGSEGV. Python signal
    # module's handlers are executed after Python returns from C low-level
    # handlers, likely when the same fatal signal happened again already.
    # https://docs.python.org/3/library/signal.html Sec. 18.8.1.1
    _set_worker_signal_handlers()

    # Keep each worker single-threaded to avoid oversubscribing CPU cores.
    torch.set_num_threads(1)

    if init_fn is not None:
        init_fn(worker_id)

    watchdog = ManagerWatchdog()

    while True:
        try:
            r = index_queue.get(timeout=MANAGER_STATUS_CHECK_INTERVAL)
        except queue.Empty:
            # Periodic liveness check: keep waiting while the manager lives,
            # otherwise exit so we don't orphan the worker.
            if watchdog.is_alive():
                continue
            else:
                break
        if r is None:
            # Shutdown sentinel from the manager.
            break
        idx, batch_indices = r
        try:
            samples = collate_fn([dataset[i] for i in batch_indices])
        except Exception:
            # Ship the exception (with formatted traceback) back to the manager
            # instead of crashing the worker silently.
            data_queue.put((idx, ExceptionWrapper(sys.exc_info())))
        else:
            data_queue.put((idx, samples))
            # Drop our reference promptly so the batch's memory can be reclaimed.
            del samples
120
+
121
+
122
def _worker_manager_loop(in_queue, out_queue, done_event, pin_memory, device_id):
    """Thread that forwards worker results, optionally pinning batch memory.

    Runs in the main process: moves ``(idx, batch)`` items from ``in_queue``
    to ``out_queue``, pinning CUDA page-locked memory when ``pin_memory`` is
    set.  Exits on a ``None`` sentinel, or silently once ``done_event`` is set
    and the queue starts erroring during shutdown.
    """
    if pin_memory:
        # Pin against the device the loader was created on.
        torch.cuda.set_device(device_id)

    while True:
        try:
            r = in_queue.get()
        except Exception:
            if done_event.is_set():
                # Shutdown in progress — queue errors are expected here.
                return
            raise
        if r is None:
            break
        if isinstance(r[1], ExceptionWrapper):
            # Propagate worker-side failures to the consumer unchanged.
            out_queue.put(r)
            continue
        idx, batch = r
        try:
            if pin_memory:
                batch = pin_memory_batch(batch)
        except Exception:
            out_queue.put((idx, ExceptionWrapper(sys.exc_info())))
        else:
            out_queue.put((idx, batch))
146
+
147
+ numpy_type_map = {
148
+ 'float64': torch.DoubleTensor,
149
+ 'float32': torch.FloatTensor,
150
+ 'float16': torch.HalfTensor,
151
+ 'int64': torch.LongTensor,
152
+ 'int32': torch.IntTensor,
153
+ 'int16': torch.ShortTensor,
154
+ 'int8': torch.CharTensor,
155
+ 'uint8': torch.ByteTensor,
156
+ }
157
+
158
+
159
def default_collate(batch):
    r"""Puts each data field into a tensor with outer dimension batch size"""

    error_msg = "batch must contain tensors, numbers, dicts or lists; found {}"
    elem_type = type(batch[0])
    if isinstance(batch[0], torch.Tensor):
        out = None
        if _use_shared_memory:
            # If we're in a background process, concatenate directly into a
            # shared memory tensor to avoid an extra copy
            numel = sum([x.numel() for x in batch])
            storage = batch[0].storage()._new_shared(numel)
            out = batch[0].new(storage)
        return torch.stack(batch, 0, out=out)
    elif elem_type.__module__ == 'numpy' and elem_type.__name__ != 'str_' \
            and elem_type.__name__ != 'string_':
        elem = batch[0]
        if elem_type.__name__ == 'ndarray':
            # array of string classes and object
            # (dtype kinds S/a/U/O cannot be converted to tensors)
            if re.search('[SaUO]', elem.dtype.str) is not None:
                raise TypeError(error_msg.format(elem.dtype))

            return torch.stack([torch.from_numpy(b) for b in batch], 0)
        if elem.shape == ():  # scalars
            # Map the numpy scalar dtype onto the matching torch tensor type
            # via the module-level numpy_type_map table.
            py_type = float if elem.dtype.name.startswith('float') else int
            return numpy_type_map[elem.dtype.name](list(map(py_type, batch)))
    elif isinstance(batch[0], int):
        return torch.LongTensor(batch)
    elif isinstance(batch[0], float):
        return torch.DoubleTensor(batch)
    elif isinstance(batch[0], string_classes):
        # Strings pass through as a plain list; they are not tensorized.
        return batch
    elif isinstance(batch[0], collections.abc.Mapping):
        # Collate each key across the batch independently.
        return {key: default_collate([d[key] for d in batch]) for key in batch[0]}
    elif isinstance(batch[0], collections.abc.Sequence):
        # Transpose list-of-sequences into per-field groups and recurse.
        transposed = zip(*batch)
        return [default_collate(samples) for samples in transposed]

    raise TypeError((error_msg.format(type(batch[0]))))
198
+
199
+
200
def pin_memory_batch(batch):
    """Recursively pin the memory of every tensor in a (possibly nested) batch.

    Tensors are pinned; mappings and sequences are rebuilt with their elements
    pinned; strings and any other type pass through unchanged.
    """
    if isinstance(batch, torch.Tensor):
        return batch.pin_memory()
    if isinstance(batch, string_classes):
        # Strings are Sequences too — return them before the sequence branch.
        return batch
    if isinstance(batch, collections.abc.Mapping):
        return {key: pin_memory_batch(value) for key, value in batch.items()}
    if isinstance(batch, collections.abc.Sequence):
        return [pin_memory_batch(item) for item in batch]
    return batch
211
+
212
+
213
+ _SIGCHLD_handler_set = False
214
+ r"""Whether SIGCHLD handler is set for DataLoader worker failures. Only one
215
+ handler needs to be set for all DataLoaders in a process."""
216
+
217
+
218
def _set_SIGCHLD_handler():
    """Install a SIGCHLD handler that raises if any DataLoader worker dies.

    Installed at most once per process (guarded by the module-global
    ``_SIGCHLD_handler_set``) and chains any previously installed handler.
    """
    # Windows doesn't support SIGCHLD handler
    if sys.platform == 'win32':
        return
    # can't set signal in child threads
    if not isinstance(threading.current_thread(), threading._MainThread):
        return
    global _SIGCHLD_handler_set
    if _SIGCHLD_handler_set:
        return
    previous_handler = signal.getsignal(signal.SIGCHLD)
    if not callable(previous_handler):
        # SIG_DFL / SIG_IGN / None — nothing to chain.
        previous_handler = None

    def handler(signum, frame):
        # This following call uses `waitid` with WNOHANG from C side. Therefore,
        # Python can still get and update the process status successfully.
        _error_if_any_worker_fails()
        if previous_handler is not None:
            previous_handler(signum, frame)

    signal.signal(signal.SIGCHLD, handler)
    _SIGCHLD_handler_set = True
241
+
242
+
243
+ class _SequentialDataLoaderIter(object):
244
+ r"""Iterates once over the DataLoader's dataset, as specified by the sampler"""
245
+
246
+ def __init__(self, loader):
247
+ self.dataset = loader.dataset
248
+ self.collate_fn = loader.collate_fn
249
+ self.batch_sampler = loader.batch_sampler
250
+ self.num_workers = loader.num_workers
251
+ self.pin_memory = loader.pin_memory and torch.cuda.is_available()
252
+ self.timeout = loader.timeout
253
+ self.done_event = threading.Event()
254
+
255
+ self.sample_iter = iter(self.batch_sampler)
256
+
257
+ if self.num_workers > 0:
258
+ self.worker_init_fn = loader.worker_init_fn
259
+ self.index_queues = [multiprocessing.Queue() for _ in range(self.num_workers)]
260
+ self.worker_queue_idx = 0
261
+ self.worker_result_queue = multiprocessing.SimpleQueue()
262
+ self.batches_outstanding = 0
263
+ self.worker_pids_set = False
264
+ self.shutdown = False
265
+ self.send_idx = 0
266
+ self.rcvd_idx = 0
267
+ self.reorder_dict = {}
268
+
269
+ self.workers = [
270
+ multiprocessing.Process(
271
+ target=_worker_loop,
272
+ args=(self.dataset, self.index_queues[i],
273
+ self.worker_result_queue, self.collate_fn, self.worker_init_fn, i))
274
+ for i in range(self.num_workers)]
275
+
276
+ if self.pin_memory or self.timeout > 0:
277
+ self.data_queue = queue.Queue()
278
+ if self.pin_memory:
279
+ maybe_device_id = torch.cuda.current_device()
280
+ else:
281
+ # do not initialize cuda context if not necessary
282
+ maybe_device_id = None
283
+ self.worker_manager_thread = threading.Thread(
284
+ target=_worker_manager_loop,
285
+ args=(self.worker_result_queue, self.data_queue, self.done_event, self.pin_memory,
286
+ maybe_device_id))
287
+ self.worker_manager_thread.daemon = True
288
+ self.worker_manager_thread.start()
289
+ else:
290
+ self.data_queue = self.worker_result_queue
291
+
292
+ for w in self.workers:
293
+ w.daemon = True # ensure that the worker exits on process exit
294
+ w.start()
295
+
296
+ _set_worker_pids(id(self), tuple(w.pid for w in self.workers))
297
+ _set_SIGCHLD_handler()
298
+ self.worker_pids_set = True
299
+
300
+ # prime the prefetch loop
301
+ for _ in range(2 * self.num_workers):
302
+ self._put_indices()
303
+
304
+ def __len__(self):
305
+ return len(self.batch_sampler)
306
+
307
+ def _get_batch(self):
308
+ if self.timeout > 0:
309
+ try:
310
+ return self.data_queue.get(timeout=self.timeout)
311
+ except queue.Empty:
312
+ raise RuntimeError('DataLoader timed out after {} seconds'.format(self.timeout))
313
+ else:
314
+ return self.data_queue.get()
315
+
316
+ def __next__(self):
317
+ if self.num_workers == 0: # same-process loading
318
+ indices = next(self.sample_iter) # may raise StopIteration
319
+ batch = self.collate_fn([self.dataset[i] for i in indices])
320
+ if self.pin_memory:
321
+ batch = pin_memory_batch(batch)
322
+ return batch
323
+
324
+ # check if the next sample has already been generated
325
+ if self.rcvd_idx in self.reorder_dict:
326
+ batch = self.reorder_dict.pop(self.rcvd_idx)
327
+ return self._process_next_batch(batch)
328
+
329
+ if self.batches_outstanding == 0:
330
+ self._shutdown_workers()
331
+ raise StopIteration
332
+
333
+ while True:
334
+ assert (not self.shutdown and self.batches_outstanding > 0)
335
+ idx, batch = self._get_batch()
336
+ self.batches_outstanding -= 1
337
+ if idx != self.rcvd_idx:
338
+ # store out-of-order samples
339
+ self.reorder_dict[idx] = batch
340
+ continue
341
+ return self._process_next_batch(batch)
342
+
343
+ next = __next__ # Python 2 compatibility
344
+
345
+ def __iter__(self):
346
+ return self
347
+
348
+ def _put_indices(self):
349
+ assert self.batches_outstanding < 2 * self.num_workers
350
+ indices = next(self.sample_iter, None)
351
+ if indices is None:
352
+ return
353
+ self.index_queues[self.worker_queue_idx].put((self.send_idx, indices))
354
+ self.worker_queue_idx = (self.worker_queue_idx + 1) % self.num_workers
355
+ self.batches_outstanding += 1
356
+ self.send_idx += 1
357
+
358
+ def _process_next_batch(self, batch):
359
+ self.rcvd_idx += 1
360
+ self._put_indices()
361
+ if isinstance(batch, ExceptionWrapper):
362
+ raise batch.exc_type(batch.exc_msg)
363
+ return batch
364
+
365
+ def __getstate__(self):
366
+ # TODO: add limited pickling support for sharing an iterator
367
+ # across multiple threads for HOGWILD.
368
+ # Probably the best way to do this is by moving the sample pushing
369
+ # to a separate thread and then just sharing the data queue
370
+ # but signalling the end is tricky without a non-blocking API
371
+ raise NotImplementedError("_SequentialDataLoaderIter cannot be pickled")
372
+
373
def _shutdown_workers(self):
    """Tear down worker processes and helper queues exactly once.

    Safe to call repeatedly: the `self.shutdown` flag guards re-entry.
    Registered worker pids are always cleared, even if queue cleanup fails.
    """
    try:
        if not self.shutdown:
            self.shutdown = True
            self.done_event.set()
            # A `None` sentinel on each index queue tells the worker loop to exit.
            for q in self.index_queues:
                q.put(None)
            # if some workers are waiting to put, make place for them
            try:
                while not self.worker_result_queue.empty():
                    self.worker_result_queue.get()
            except (FileNotFoundError, ImportError):
                # Many weird errors can happen here due to Python
                # shutting down. These are more like obscure Python bugs.
                # FileNotFoundError can happen when we rebuild the fd
                # fetched from the queue but the socket is already closed
                # from the worker side.
                # ImportError can happen when the unpickler loads the
                # resource from `get`.
                pass
            # done_event should be sufficient to exit worker_manager_thread,
            # but be safe here and put another None
            self.worker_result_queue.put(None)
    finally:
        # removes pids no matter what
        if self.worker_pids_set:
            _remove_worker_pids(id(self))
            self.worker_pids_set = False
401
+
402
def __del__(self):
    """Best-effort cleanup when the iterator is garbage collected."""
    # Only multi-process loaders own worker processes that need teardown.
    if self.num_workers > 0:
        self._shutdown_workers()
405
+
406
+
407
class SequentialDataLoader(object):
    r"""
    Sequential Data loader. Combines a dataset and a sampler, and provides
    single- or multi-process iterators over the dataset.
    This is modified from Pytorch.DataLoader by disabling random state touching, since for
    sequential data loading we don't want it to touch any random state.
    Arguments:
        dataset (Dataset): dataset from which to load the data.
        batch_size (int, optional): how many samples per batch to load
            (default: 1).
        shuffle (bool, optional): set to ``True`` to have the data reshuffled
            at every epoch (default: False).
        sampler (Sampler, optional): defines the strategy to draw samples from
            the dataset. If specified, ``shuffle`` must be False.
        batch_sampler (Sampler, optional): like sampler, but returns a batch of
            indices at a time. Mutually exclusive with batch_size, shuffle,
            sampler, and drop_last.
        num_workers (int, optional): how many subprocesses to use for data
            loading. 0 means that the data will be loaded in the main process.
            (default: 0)
        collate_fn (callable, optional): merges a list of samples to form a mini-batch.
        pin_memory (bool, optional): If ``True``, the data loader will copy tensors
            into CUDA pinned memory before returning them.
        drop_last (bool, optional): set to ``True`` to drop the last incomplete batch,
            if the dataset size is not divisible by the batch size. If ``False`` and
            the size of dataset is not divisible by the batch size, then the last batch
            will be smaller. (default: False)
        timeout (numeric, optional): if positive, the timeout value for collecting a batch
            from workers. Should always be non-negative. (default: 0)
        worker_init_fn (callable, optional): If not None, this will be called on each
            worker subprocess with the worker id (an int in ``[0, num_workers - 1]``) as
            input, after seeding and before data loading. (default: None)

    .. note:: By default, each worker will have its PyTorch seed set to
              ``base_seed + worker_id``, where ``base_seed`` is a long generated
              by main process using its RNG. However, seeds for other libraries
              may be duplicated upon initializing workers (e.g., NumPy), causing
              each worker to return identical random numbers. (See
              :ref:`dataloader-workers-random-seed` section in FAQ.) You may
              use ``torch.initial_seed()`` to access the PyTorch seed for each
              worker in :attr:`worker_init_fn`, and use it to set other seeds
              before data loading.

    .. warning:: If ``spawn`` start method is used, :attr:`worker_init_fn` cannot be an
                 unpicklable object, e.g., a lambda function.
    """

    # Class-level default so __setattr__ can consult it while __init__ is still
    # running (name-mangled to _SequentialDataLoader__initialized).
    __initialized = False

    def __init__(self, dataset, batch_size=1, shuffle=False, sampler=None, batch_sampler=None,
                 num_workers=0, collate_fn=default_collate, pin_memory=False, drop_last=False,
                 timeout=0, worker_init_fn=None):
        self.dataset = dataset
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.collate_fn = collate_fn
        self.pin_memory = pin_memory
        self.drop_last = drop_last
        self.timeout = timeout
        self.worker_init_fn = worker_init_fn

        if timeout < 0:
            raise ValueError('timeout option should be non-negative')

        if batch_sampler is not None:
            # A batch_sampler fully determines batching; reject conflicting options.
            if batch_size > 1 or shuffle or sampler is not None or drop_last:
                raise ValueError('batch_sampler option is mutually exclusive '
                                 'with batch_size, shuffle, sampler, and '
                                 'drop_last')
            # Neutralize the per-sample options when batch_sampler drives batching.
            self.batch_size = None
            self.drop_last = None

        if sampler is not None and shuffle:
            raise ValueError('sampler option is mutually exclusive with '
                             'shuffle')

        if self.num_workers < 0:
            raise ValueError('num_workers option cannot be negative; '
                             'use num_workers=0 to disable multiprocessing.')

        if batch_sampler is None:
            # Build a default sampler, then wrap it into a BatchSampler.
            if sampler is None:
                if shuffle:
                    sampler = RandomSampler(dataset)
                else:
                    sampler = SequentialSampler(dataset)
            batch_sampler = BatchSampler(sampler, batch_size, drop_last)

        self.sampler = sampler
        self.batch_sampler = batch_sampler
        # From here on, __setattr__ rejects changes to the batching attributes.
        self.__initialized = True

    def __setattr__(self, attr, val):
        # Freeze the attributes that the (possibly already created) iterator
        # depends on once construction has finished.
        if self.__initialized and attr in ('batch_size', 'sampler', 'drop_last'):
            raise ValueError('{} attribute should not be set after {} is '
                             'initialized'.format(attr, self.__class__.__name__))

        super(SequentialDataLoader, self).__setattr__(attr, val)

    def __iter__(self):
        return _SequentialDataLoaderIter(self)

    def __len__(self):
        # Number of batches, as determined by the batch sampler.
        return len(self.batch_sampler)
511
+
nlu/DeBERTa/data/dynamic_dataset.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft, Inc. 2020
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+ #
6
+ # Author: penhe@microsoft.com
7
+ # Date: 05/15/2019
8
+ #
9
+
10
+ import pdb
11
+ from torch.utils.data import Dataset
12
+ import random
13
+ import mmap
14
+ import numpy as np
15
+ from bisect import bisect
16
+ from ..utils import get_logger
17
+ logger=get_logger()
18
+
19
+ __all__ = ['DynamicDataset']
20
+
21
class DynamicDataset(Dataset):
    """Dataset that converts corpus examples to features on the fly.

    The (optionally shuffled) index table is stored in an anonymous mmap
    buffer so it can be shared cheaply across dataloader worker processes.
    """

    def __init__(self, corpus, feature_fn, dataset_size=None, shuffle=False, **kwargs):
        # corpus must support len() and tuple indexing:
        # corpus[(example_idx, rng, ext_params)] — see __getitem__.
        self.corpus = corpus
        self.ds_len = len(self.corpus)
        logger.info(f'Total corpus examples: {self.ds_len}')
        # feature_fn(example, rng, ext_params=...) builds the model features.
        self.feature_fn = feature_fn

        if not dataset_size:
            self.dataset_size = self.ds_len
        else:
            # dataset_size may exceed ds_len; indices wrap modulo ds_len below.
            self.dataset_size = int(dataset_size)

        self.shuffle = shuffle
        # Anonymous shared mmap backing the index table; 8 bytes per entry
        # matches a 64-bit `int` dtype (NOTE(review): numpy's `int` is
        # platform dependent — confirm a 64-bit target).
        index_buf = mmap.mmap(-1, self.dataset_size*8)
        shuffle_idx = np.ndarray(shape=(self.dataset_size, ), buffer=index_buf, dtype=int)
        shuffle_idx[:] = np.arange(self.dataset_size)[:]
        if self.shuffle:
            #rng = np.random.RandomState(0)
            # Fixed seed: every worker process derives the identical permutation.
            rng = random.Random(0)
            rng.shuffle(shuffle_idx)
        self.shuffle_idx = shuffle_idx
        # index_offset shifts the logical index, e.g. to resume mid-epoch.
        self.index_offset = 0
        if 'index_offset' in kwargs:
            self.index_offset = kwargs['index_offset']

    def __len__(self):
        return self.dataset_size

    def __getitem__(self, idx):
        # idx may be (index, ext_params) when the sampler carries extra state.
        if isinstance(idx, tuple) or isinstance(idx, list):
            idx, ext_params = idx
        else:
            ext_params = None
        idx += self.index_offset
        # Per-example RNG seeded by the absolute index: feature generation is
        # deterministic for a given position.
        seed = idx
        rng = random.Random(seed)
        # get seq length
        example_idx = self.shuffle_idx[idx%self.dataset_size]%self.ds_len
        example = self.corpus[example_idx, rng, ext_params]
        return self.feature_fn(example, rng, ext_params = ext_params)
nlu/DeBERTa/data/example.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import os
3
+ from collections import OrderedDict
4
+ import numpy as np
5
+ import tempfile
6
+ import numpy as np
7
+ import mmap
8
+ import pickle
9
+ import signal
10
+ import sys
11
+ import pdb
12
+
13
+ from ..utils import xtqdm as tqdm
14
+
15
+ __all__=['ExampleInstance', 'example_to_feature', 'ExampleSet']
16
+
17
class ExampleInstance:
    """A single training example: a list of text segments plus an optional label.

    Any extra keyword arguments are attached to the instance as attributes.
    """

    def __init__(self, segments, label=None, **attrs):
        self.segments = segments
        self.label = label
        self.__dict__.update(attrs)

    def __repr__(self):
        return 'segments: {}\nlabel: {}'.format(self.segments, self.label)

    def __getitem__(self, i):
        # Index straight into the underlying segment list.
        return self.segments[i]

    def __len__(self):
        return len(self.segments)
31
+
32
class ExampleSet:
    """Stores examples as pickled blobs inside a numpy array.

    Pickling up front keeps per-example Python object overhead low; items are
    deserialized lazily on access.
    """

    def __init__(self, pairs):
        serialized = [pickle.dumps(item) for item in pairs]
        self._data = np.array(serialized)
        self.total = len(self._data)

    def __getitem__(self, idx):
        """Return the example at `idx`; an (idx, rng, ext_params) tuple is also accepted."""
        if isinstance(idx, tuple):
            idx, _rng, _ext_params = idx
        return pickle.loads(self._data[idx])

    def __len__(self):
        return self.total

    def __iter__(self):
        return (self[i] for i in range(self.total))
55
+
56
def _truncate_segments(segments, max_num_tokens, rng):
    """Truncate `segments` in place until their total length fits `max_num_tokens`.

    Follows the original BERT implementation:
    https://github.com/google-research/bert/blob/master/create_pretraining_data.py#L391
    — repeatedly remove one token from the currently longest segment, randomly
    from the front or the back, to avoid positional bias.

    Bug fix: the previous code re-sorted `segments` by length on every
    iteration, which permanently reordered the returned segments whenever
    truncation occurred (corrupting segment order, and hence type ids, for the
    caller). The longest segment is now selected with `max`, preserving the
    caller's ordering; ties resolve to the earliest segment, exactly as the
    old stable sort did.

    Args:
        segments: list of mutable token lists; mutated in place.
        max_num_tokens: maximum allowed total token count.
        rng: random source with a `random()` method.

    Returns:
        The same `segments` list, in original order, total length <= max_num_tokens.
    """
    while sum(len(s) for s in segments) > max_num_tokens:
        trunc_tokens = max(segments, key=len)
        assert len(trunc_tokens) >= 1
        # Drop from the front or the back with equal probability.
        if rng.random() < 0.5:
            trunc_tokens.pop(0)
        else:
            trunc_tokens.pop()
    return segments
75
+
76
def example_to_feature(tokenizer, example, max_seq_len=512, rng=None, mask_generator = None, ext_params=None, label_type='int', **kwargs):
    """Convert an ExampleInstance into padded model-input features.

    Args:
        tokenizer: provides `tokenize(str)` and `convert_tokens_to_ids(list)`.
        example: object with `segments` (list of strings) and optional `label`.
        max_seq_len: total sequence length including [CLS] and one [SEP] per segment.
        rng: random source for truncation/masking; defaults to the stdlib
            `random` module.
        mask_generator: optional MLM masker; when given, adds 'lm_labels'.
        ext_params: unused here; accepted for interface compatibility.
        label_type: 'int' or 'float' — dtype of the 'labels' tensor.

    Returns:
        OrderedDict of int tensors (input_ids, type_ids, position_ids,
        input_mask, optionally lm_labels), each zero-padded to max_seq_len,
        plus 'labels' when the example carries a label.
    """
    if rng is None:
        # Bug fix: this module never imported `random`, so the old fallback
        # (`rng = random`) raised NameError. Import locally instead.
        import random
        rng = random
    # Reserve room for [CLS] plus one [SEP] per segment.
    max_num_tokens = max_seq_len - len(example.segments) - 1
    segments = _truncate_segments([tokenizer.tokenize(s) for s in example.segments], max_num_tokens, rng)
    tokens = ['[CLS]']
    type_ids = [0]
    for i, s in enumerate(segments):
        tokens.extend(s)
        tokens.append('[SEP]')
        # Segment id i covers the segment's tokens and its trailing [SEP].
        type_ids.extend([i] * (len(s) + 1))
    if mask_generator:
        tokens, lm_labels = mask_generator.mask_tokens(tokens, rng)
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    pos_ids = list(range(len(token_ids)))
    input_mask = [1] * len(token_ids)
    features = OrderedDict(input_ids = token_ids,
        type_ids = type_ids,
        position_ids = pos_ids,
        input_mask = input_mask)
    if mask_generator:
        features['lm_labels'] = lm_labels
    # Zero-pad every sequence feature to max_seq_len, then tensorize.
    padding_size = max(0, max_seq_len - len(token_ids))
    for f in features:
        features[f].extend([0] * padding_size)
        features[f] = torch.tensor(features[f], dtype=torch.int)
    # Use a separate local for the tensor dtype instead of shadowing the
    # `label_type` argument (clarity fix; behavior unchanged).
    tensor_label_type = torch.int if label_type == 'int' else torch.float
    if example.label is not None:
        features['labels'] = torch.tensor(example.label, dtype=tensor_label_type)
    return features
nlu/DeBERTa/deberta/__init__.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Author: penhe@microsoft.com
3
+ # Date: 04/25/2019
4
+ #
5
+
6
+ """ Components for NN
7
+ """
8
+
9
+ from __future__ import absolute_import
10
+ from __future__ import division
11
+ from __future__ import print_function
12
+
13
+ from .tokenizers import *
14
+ from .pooling import *
15
+ from .mlm import MLMPredictionHead
16
+ from .nnmodule import NNModule
17
+ from .deberta import *
18
+ from .disentangled_attention import *
19
+ from .ops import *
20
+ from .bert import *
21
+ from .config import *
22
+ from .cache_utils import *
nlu/DeBERTa/deberta/bert.py ADDED
@@ -0,0 +1,308 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
2
+ # Copyright (c) Microsoft, Inc. 2020
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # This piece of code is modified based on https://github.com/huggingface/transformers
8
+
9
+ import copy
10
+ import torch
11
+ from torch import nn
12
+ from collections.abc import Sequence
13
+ from packaging import version
14
+ import numpy as np
15
+ import math
16
+ import os
17
+ import pdb
18
+
19
+ import json
20
+ from .ops import *
21
+ from .disentangled_attention import *
22
+ from .da_utils import *
23
+
24
+ from adapterlib import adapter_dict
25
+
26
+ __all__ = ['BertEncoder', 'BertEmbeddings', 'ACT2FN', 'LayerNorm', 'BertLMPredictionHead']
27
+
28
class BertSelfOutput(nn.Module):
    """Self-attention output block: projection, dropout, residual add, LayerNorm."""

    def __init__(self, config):
        super().__init__()
        hidden = config.hidden_size
        if config.inject_adapter == 'linear':
            self.dense = nn.Linear(hidden, hidden)
        else:
            # An adapter module replaces the plain linear projection.
            self.dense = adapter_dict[config.inject_adapter](hidden, hidden, config=config)

        self.LayerNorm = LayerNorm(hidden, config.layer_norm_eps)
        self.dropout = StableDropout(config.hidden_dropout_prob)
        self.config = config

    def forward(self, hidden_states, input_states, mask=None):
        out = self.dropout(self.dense(hidden_states))
        out += input_states
        return MaskedLayerNorm(self.LayerNorm, out)
47
+
48
class BertAttention(nn.Module):
    """Disentangled self-attention followed by the output projection/residual block."""

    def __init__(self, config):
        super().__init__()
        self.self = DisentangledSelfAttention(config)
        self.output = BertSelfOutput(config)
        self.config = config

    def forward(self, hidden_states, attention_mask, return_att=False, query_states=None, relative_pos=None, rel_embeddings=None):
        # DisentangledSelfAttention returns a dict; unpack its three outputs.
        output = self.self(hidden_states, attention_mask, return_att, query_states=query_states, relative_pos=relative_pos, rel_embeddings=rel_embeddings)
        self_output, att_matrix, att_logits_ = output['hidden_states'], output['attention_probs'], output['attention_logits']
        if query_states is None:
            query_states = hidden_states
        # The residual connection in BertSelfOutput uses the query stream.
        attention_output = self.output(self_output, query_states, attention_mask)

        if return_att:
            return (attention_output, att_matrix)
        else:
            return attention_output
66
+
67
class BertIntermediate(nn.Module):
    """Position-wise feed-forward expansion: hidden_size -> intermediate_size, then activation."""

    def __init__(self, config):
        super().__init__()
        if config.inject_adapter == 'linear':
            self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        else:
            # An adapter module replaces the plain linear projection.
            self.dense = adapter_dict[config.inject_adapter](config.hidden_size, config.intermediate_size, config=config)

        act = config.hidden_act
        self.intermediate_act_fn = ACT2FN[act] if isinstance(act, str) else act

    def forward(self, hidden_states):
        return self.intermediate_act_fn(self.dense(hidden_states))
83
+
84
class BertOutput(nn.Module):
    """FFN output block: project back to hidden_size, dropout, residual add, LayerNorm."""

    def __init__(self, config):
        super(BertOutput, self).__init__()
        if config.inject_adapter == 'linear':
            self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        else:
            # An adapter module replaces the plain linear projection.
            self.dense = adapter_dict[config.inject_adapter](config.intermediate_size, config.hidden_size, config=config)

        self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps)
        self.dropout = StableDropout(config.hidden_dropout_prob)
        self.config = config

    def forward(self, hidden_states, input_states, mask=None):
        out = self.dropout(self.dense(hidden_states))
        out += input_states
        return MaskedLayerNorm(self.LayerNorm, out)
103
+
104
class BertLayer(nn.Module):
    """One transformer block: attention -> intermediate FFN -> output projection."""

    def __init__(self, config):
        super(BertLayer, self).__init__()
        self.attention = BertAttention(config)
        self.intermediate = BertIntermediate(config)
        self.output = BertOutput(config)

    def forward(self, hidden_states, attention_mask, return_att=False, query_states=None, relative_pos=None, rel_embeddings=None):
        attention_output = self.attention(hidden_states, attention_mask, return_att=return_att, \
            query_states=query_states, relative_pos=relative_pos, rel_embeddings=rel_embeddings)
        if return_att:
            # When attention matrices are requested, attention returns a pair.
            attention_output, att_matrix = attention_output
        intermediate_output = self.intermediate(attention_output)
        # The FFN residual connects back to the attention output.
        layer_output = self.output(intermediate_output, attention_output, attention_mask)
        if return_att:
            return (layer_output, att_matrix)
        else:
            return layer_output
122
+
123
class ConvLayer(nn.Module):
    """1D convolution branch applied after the first encoder layer.

    Output = MaskedLayerNorm(residual + act(dropout(conv(x)))), with padded
    positions zeroed out before the activation.
    """

    def __init__(self, config):
        super().__init__()
        kernel_size = getattr(config, 'conv_kernel_size', 3)
        groups = getattr(config, 'conv_groups', 1)
        self.conv_act = getattr(config, 'conv_act', 'tanh')
        # (kernel_size-1)//2 padding keeps the sequence length unchanged for odd kernels.
        self.conv = torch.nn.Conv1d(config.hidden_size, config.hidden_size, kernel_size, padding = (kernel_size-1)//2, groups = groups)
        self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps)
        self.dropout = StableDropout(config.hidden_dropout_prob)
        self.config = config

    def forward(self, hidden_states, residual_states, input_mask):
        # Conv1d expects (B, C, L): permute in, convolve, permute back.
        out = self.conv(hidden_states.permute(0,2,1).contiguous()).permute(0,2,1).contiguous()
        # masked_fill_ requires a bool mask on torch >= 1.2; byte on older versions.
        if version.Version(torch.__version__) >= version.Version('1.2.0a'):
            rmask = (1-input_mask).bool()
        else:
            rmask = (1-input_mask).byte()
        # Zero out padded positions before applying the activation.
        out.masked_fill_(rmask.unsqueeze(-1).expand(out.size()), 0)
        out = ACT2FN[self.conv_act](self.dropout(out))
        output_states = MaskedLayerNorm(self.LayerNorm, residual_states + out, input_mask)

        return output_states
145
+
146
class BertEncoder(nn.Module):
    """ Modified BertEncoder with relative position bias support
    """
    def __init__(self, config):
        super().__init__()
        #layer = BertLayer(config)
        self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)])
        self.relative_attention = getattr(config, 'relative_attention', False)
        if self.relative_attention:
            self.max_relative_positions = getattr(config, 'max_relative_positions', -1)
            if self.max_relative_positions <1:
                self.max_relative_positions = config.max_position_embeddings
            self.position_buckets = getattr(config, 'position_buckets', -1)
            # Embedding table covers both directions: 2 * (max positions or buckets).
            pos_ebd_size = self.max_relative_positions*2
            if self.position_buckets>0:
                pos_ebd_size = self.position_buckets*2
            self.rel_embeddings = nn.Embedding(pos_ebd_size, config.hidden_size)

        # '|'-separated options, e.g. 'layer_norm' normalizes the rel-pos embeddings.
        self.norm_rel_ebd = [x.strip() for x in getattr(config, 'norm_rel_ebd', 'none').lower().split('|')]
        if 'layer_norm' in self.norm_rel_ebd:
            self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps, elementwise_affine = True)
        # A positive conv_kernel_size enables the ConvLayer branch after layer 0.
        kernel_size = getattr(config, 'conv_kernel_size', 0)
        self.with_conv = False
        if kernel_size > 0:
            self.with_conv = True
            self.conv = ConvLayer(config)

    def get_rel_embedding(self):
        """Relative-position embedding weights (LayerNormed when configured); None when disabled."""
        rel_embeddings = self.rel_embeddings.weight if self.relative_attention else None
        if rel_embeddings is not None and ('layer_norm' in self.norm_rel_ebd):
            rel_embeddings = self.LayerNorm(rel_embeddings)
        return rel_embeddings

    def get_attention_mask(self, attention_mask):
        """Expand a 2D (B x S) or 3D (B x S x S) mask to 4D (B x 1 x S x S)."""
        if attention_mask.dim()<=2:
            extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
            # Outer product of the per-token mask with itself: pairwise visibility.
            attention_mask = extended_attention_mask*extended_attention_mask.squeeze(-2).unsqueeze(-1)
            attention_mask = attention_mask.byte()
        elif attention_mask.dim()==3:
            attention_mask = attention_mask.unsqueeze(1)

        return attention_mask

    def get_rel_pos(self, hidden_states, query_states=None, relative_pos=None):
        """Build the relative-position matrix lazily when relative attention is enabled."""
        if self.relative_attention and relative_pos is None:
            q = query_states.size(-2) if query_states is not None else hidden_states.size(-2)
            relative_pos = build_relative_position(q, hidden_states.size(-2), bucket_size = self.position_buckets, \
                max_position=self.max_relative_positions, device = hidden_states.device)
        return relative_pos

    def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True, return_att=False, query_states = None, relative_pos=None):
        """Run all layers; returns {'hidden_states': [...], 'attention_matrices': [...]}."""
        # Derive the per-token mask from a pairwise mask when one was provided.
        if attention_mask.dim()<=2:
            input_mask = attention_mask
        else:
            input_mask = (attention_mask.sum(-2)>0).byte()
        attention_mask = self.get_attention_mask(attention_mask)
        relative_pos = self.get_rel_pos(hidden_states, query_states, relative_pos)

        all_encoder_layers = []
        att_matrices = []
        # A Sequence of hidden states supplies per-layer key/value inputs.
        if isinstance(hidden_states, Sequence):
            next_kv = hidden_states[0]
        else:
            next_kv = hidden_states
        rel_embeddings = self.get_rel_embedding()
        for i, layer_module in enumerate(self.layer):
            output_states = layer_module(next_kv, attention_mask, return_att, query_states = query_states, relative_pos=relative_pos, rel_embeddings=rel_embeddings)
            if return_att:
                output_states, att_m = output_states

            # The optional conv branch mixes the raw input back in after layer 0.
            if i == 0 and self.with_conv:
                prenorm = output_states #output['prenorm_states']
                output_states = self.conv(hidden_states, prenorm, input_mask)

            if query_states is not None:
                query_states = output_states
            if isinstance(hidden_states, Sequence):
                next_kv = hidden_states[i+1] if i+1 < len(self.layer) else None
            else:
                next_kv = output_states

            if output_all_encoded_layers:
                all_encoder_layers.append(output_states)
                if return_att:
                    att_matrices.append(att_m)
        # Otherwise only the final layer's output is returned.
        if not output_all_encoded_layers:
            all_encoder_layers.append(output_states)
            if return_att:
                att_matrices.append(att_m)
        return {
            'hidden_states': all_encoder_layers,
            'attention_matrices': att_matrices
            }
239
+
240
class BertEmbeddings(nn.Module):
    """Construct the embeddings from word, position and token_type embeddings.
    """
    def __init__(self, config):
        super(BertEmbeddings, self).__init__()
        padding_idx = getattr(config, 'padding_idx', 0)
        # Embedding width may differ from hidden_size; projected below when it does.
        self.embedding_size = getattr(config, 'embedding_size', config.hidden_size)
        self.word_embeddings = nn.Embedding(config.vocab_size, self.embedding_size, padding_idx = padding_idx)
        # When False, position embeddings are still computed and returned, but
        # not added into the token embeddings.
        self.position_biased_input = getattr(config, 'position_biased_input', True)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, self.embedding_size)

        if config.type_vocab_size>0:
            self.token_type_embeddings = nn.Embedding(config.type_vocab_size, self.embedding_size)

        if self.embedding_size != config.hidden_size:
            self.embed_proj = nn.Linear(self.embedding_size, config.hidden_size, bias=False)
        self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps)
        self.dropout = StableDropout(config.hidden_dropout_prob)
        self.output_to_half = False
        self.config = config

    def forward(self, input_ids, token_type_ids=None, position_ids=None, mask = None):
        """Return {'embeddings': B x S x hidden, 'position_embeddings': B x S x emb}."""
        seq_length = input_ids.size(1)
        if position_ids is None:
            # Default positions: 0..seq_length-1, broadcast over the batch.
            position_ids = torch.arange(0, seq_length, dtype=torch.long, device=input_ids.device)
            position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
        if token_type_ids is None:
            token_type_ids = torch.zeros_like(input_ids)

        words_embeddings = self.word_embeddings(input_ids)
        position_embeddings = self.position_embeddings(position_ids.long())

        embeddings = words_embeddings
        if self.config.type_vocab_size>0:
            token_type_embeddings = self.token_type_embeddings(token_type_ids)
            embeddings += token_type_embeddings

        if self.position_biased_input:
            embeddings += position_embeddings

        # Project to hidden_size when the embedding width differs.
        if self.embedding_size != self.config.hidden_size:
            embeddings = self.embed_proj(embeddings)
        embeddings = MaskedLayerNorm(self.LayerNorm, embeddings, mask)
        embeddings = self.dropout(embeddings)
        return {
            'embeddings': embeddings,
            'position_embeddings': position_embeddings}
287
+
288
class BertLMPredictionHead(nn.Module):
    """MLM output head: transform hidden states and score against the embedding matrix."""

    def __init__(self, config, vocab_size):
        super().__init__()
        self.embedding_size = getattr(config, 'embedding_size', config.hidden_size)
        # Project hidden states back to the embedding width so weights can be tied.
        self.dense = nn.Linear(config.hidden_size, self.embedding_size)
        self.transform_act_fn = ACT2FN[config.hidden_act] \
            if isinstance(config.hidden_act, str) else config.hidden_act

        self.LayerNorm = LayerNorm(self.embedding_size, config.layer_norm_eps, elementwise_affine=True)

        # Per-token output bias (the tied embedding matrix carries no bias).
        self.bias = nn.Parameter(torch.zeros(vocab_size))

    def forward(self, hidden_states, embeding_weight):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.transform_act_fn(hidden_states)
        # b x s x d
        hidden_states = MaskedLayerNorm(self.LayerNorm, hidden_states)

        # b x s x v — weight tying: logits come from the transposed embedding matrix.
        logits = torch.matmul(hidden_states, embeding_weight.t().to(hidden_states)) + self.bias
        return logits
nlu/DeBERTa/deberta/cache_utils.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft, Inc. 2020
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+ #
6
+ # Author: penhe@microsoft.com
7
+ # Date: 05/15/2020
8
+ #
9
+
10
+ import pdb
11
+ import torch
12
+ import os
13
+ import requests
14
+ from .config import ModelConfig
15
+ import pathlib
16
+ from ..utils import xtqdm as tqdm
17
+ from zipfile import ZipFile
18
+ from ..utils import get_logger
19
+ logger = get_logger()
20
+
21
+ __all__ = ['pretrained_models', 'load_model_state', 'load_vocab']
22
+
23
class PretrainedModel:
    """Descriptor for a released DeBERTa checkpoint hosted under huggingface.co/microsoft.

    Builds the download URLs for the model weights, config and vocabulary from
    the checkpoint name; extra keyword arguments become instance attributes.
    """

    def __init__(self, name, vocab, vocab_type, model='pytorch_model.bin', config='config.json', **kwargs):
        self.__dict__.update(kwargs)
        base = f'https://huggingface.co/microsoft/{name}/resolve/main/'
        self.name = name
        self.vocab_type = vocab_type
        self.model_url = f'{base}{model}'
        self.config_url = f'{base}{config}'
        self.vocab_url = f'{base}{vocab}'
32
+
33
# Registry of released checkpoints, keyed by the short id accepted by
# load_model_state / load_vocab. v1 checkpoints use the GPT-2 BPE vocabulary
# ('bpe_encoder.bin'); v2/v3 checkpoints use a SentencePiece model ('spm.model').
pretrained_models= {
    'base': PretrainedModel('deberta-base', 'bpe_encoder.bin', 'gpt2'),
    'large': PretrainedModel('deberta-large', 'bpe_encoder.bin', 'gpt2'),
    'xlarge': PretrainedModel('deberta-xlarge', 'bpe_encoder.bin', 'gpt2'),
    'base-mnli': PretrainedModel('deberta-base-mnli', 'bpe_encoder.bin', 'gpt2'),
    'large-mnli': PretrainedModel('deberta-large-mnli', 'bpe_encoder.bin', 'gpt2'),
    'xlarge-mnli': PretrainedModel('deberta-xlarge-mnli', 'bpe_encoder.bin', 'gpt2'),
    'xlarge-v2': PretrainedModel('deberta-v2-xlarge', 'spm.model', 'spm'),
    'xxlarge-v2': PretrainedModel('deberta-v2-xxlarge', 'spm.model', 'spm'),
    'xlarge-v2-mnli': PretrainedModel('deberta-v2-xlarge-mnli', 'spm.model', 'spm'),
    'xxlarge-v2-mnli': PretrainedModel('deberta-v2-xxlarge-mnli', 'spm.model', 'spm'),
    'deberta-v3-small': PretrainedModel('deberta-v3-small', 'spm.model', 'spm'),
    'deberta-v3-base': PretrainedModel('deberta-v3-base', 'spm.model', 'spm'),
    'deberta-v3-large': PretrainedModel('deberta-v3-large', 'spm.model', 'spm'),
    'mdeberta-v3-base': PretrainedModel('mdeberta-v3-base', 'spm.model', 'spm'),
    'deberta-v3-xsmall': PretrainedModel('deberta-v3-xsmall', 'spm.model', 'spm'),
}
50
+
51
def download_asset(url, name, tag=None, no_cache=False, cache_dir=None):
    """Download `url` into the local asset cache and return the cached file path.

    Args:
        url: full URL of the asset to fetch.
        name: file name to store the asset under inside `cache_dir`.
        tag: cache namespace; defaults to 'latest' when None.
        no_cache: when True, re-download even if a cached copy exists.
        cache_dir: cache directory; defaults to ~/.~DeBERTa/assets/<tag>/.

    Raises:
        Exception: when the server responds with a non-200 status.
    """
    _tag = tag
    if _tag is None:
        _tag = 'latest'
    if not cache_dir:
        cache_dir = os.path.join(pathlib.Path.home(), f'.~DeBERTa/assets/{_tag}/')
    os.makedirs(cache_dir, exist_ok=True)
    output = os.path.join(cache_dir, name)
    if os.path.exists(output) and (not no_cache):
        # Cache hit.
        return output

    headers = {}
    headers['Accept'] = 'application/octet-stream'
    resp = requests.get(url, stream=True, headers=headers)
    if resp.status_code != 200:
        raise Exception(f'Request for {url} return {resp.status_code}, {resp.text}')

    try:
        with open(output, 'wb') as fs:
            total = int(resp.headers['Content-Length']) if 'Content-Length' in resp.headers else -1
            progress = tqdm(total=total, ncols=80, desc=f'Downloading {name}')
            try:
                for c in resp.iter_content(chunk_size=1024*1024):
                    fs.write(c)
                    progress.update(len(c))
            finally:
                # Fix: close the progress bar even when the download fails mid-stream.
                progress.close()
    except BaseException:
        # Remove the partial file so a later call doesn't treat it as a cache hit;
        # explicit BaseException (not a bare except) keeps the re-raise intent clear.
        os.remove(output)
        raise

    return output
81
+
82
def load_model_state(path_or_pretrained_id, tag=None, no_cache=False, cache_dir=None):
    """Resolve a model path or pretrained id and load its state dict and config.

    Args:
        path_or_pretrained_id: local checkpoint file/directory, or a key of
            `pretrained_models` (the checkpoint is then downloaded and cached).
        tag: cache namespace; defaults to 'latest'.
        no_cache: force re-download of cached assets.
        cache_dir: override the default cache directory.

    Returns:
        (model_state, model_config); model_config is None when no config can be
        found, and (None, None) when no path/id was given at all.
    """
    model_path = path_or_pretrained_id
    if model_path and (not os.path.exists(model_path)) and (path_or_pretrained_id.lower() in pretrained_models):
        _tag = tag
        # NOTE(review): any id containing 'deberta-v3-base' is force-mapped to
        # the base v3 checkpoint — preserved from the original code; confirm intent.
        if 'deberta-v3-base' in path_or_pretrained_id:
            pretrained = pretrained_models['deberta-v3-base']
        else:
            pretrained = pretrained_models[path_or_pretrained_id.lower()]
        if _tag is None:
            _tag = 'latest'
        if not cache_dir:
            cache_dir = os.path.join(pathlib.Path.home(), f'.~DeBERTa/assets/{_tag}/{pretrained.name}')
        os.makedirs(cache_dir, exist_ok=True)
        model_path = os.path.join(cache_dir, 'pytorch_model.bin')
        if (not os.path.exists(model_path)) or no_cache:
            asset = download_asset(pretrained.model_url, 'pytorch_model.bin', tag=tag, no_cache=no_cache, cache_dir=cache_dir)
            asset = download_asset(pretrained.config_url, 'model_config.json', tag=tag, no_cache=no_cache, cache_dir=cache_dir)
    elif not model_path:
        return None,None

    # Bug fix: only append the file name when a directory was supplied. The
    # previous unconditional join turned the pretrained-cache path into
    # '<cache>/pytorch_model.bin/pytorch_model.bin', which can never exist.
    if os.path.isdir(model_path):
        model_path = os.path.join(model_path, 'pytorch_model.bin')
    config_path = os.path.join(os.path.dirname(model_path), 'model_config.json')
    model_state = torch.load(model_path, map_location='cpu')
    logger.info("Loaded pretrained model file {}".format(model_path))
    # Prefer the config embedded in the checkpoint; fall back to the sidecar file.
    if 'config' in model_state:
        model_config = ModelConfig.from_dict(model_state['config'])
    elif os.path.exists(config_path):
        model_config = ModelConfig.from_json_file(config_path)
    else:
        model_config = None
    return model_state, model_config
113
+
114
def load_vocab(vocab_path=None, vocab_type=None, pretrained_id=None, tag=None, no_cache=False, cache_dir=None):
    """Resolve (vocab_path, vocab_type), downloading the vocab for a known pretrained id.

    When `pretrained_id` matches an entry in `pretrained_models`, the vocab file
    is fetched into the asset cache and both path and type come from that entry;
    otherwise the arguments are passed through. A missing type defaults to 'spm'.
    """
    if pretrained_id and (pretrained_id.lower() in pretrained_models):
        resolved_tag = 'latest' if tag is None else tag

        entry = pretrained_models[pretrained_id.lower()]
        if not cache_dir:
            cache_dir = os.path.join(pathlib.Path.home(), f'.~DeBERTa/assets/{resolved_tag}/{entry.name}')
        os.makedirs(cache_dir, exist_ok=True)
        vocab_type = entry.vocab_type
        url = entry.vocab_url
        filename = os.path.basename(url)
        vocab_path = os.path.join(cache_dir, filename)
        if no_cache or not os.path.exists(vocab_path):
            download_asset(url, filename, tag=tag, no_cache=no_cache, cache_dir=cache_dir)
    if vocab_type is None:
        vocab_type = 'spm'
    return vocab_path, vocab_type
133
+
134
def test_download():
    """Smoke test: resolving the default vocabulary should not raise."""
    vocab = load_vocab()
nlu/DeBERTa/deberta/config.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import copy
3
+
4
+ __all__=['AbsModelConfig', 'ModelConfig']
5
+
6
class AbsModelConfig(object):
  """Base class for model configurations.

  Attributes are populated dynamically from a dictionary or JSON file;
  nested dictionaries become nested `AbsModelConfig` instances. Instances
  serialize back to JSON via `to_json_string`.
  """

  def __init__(self):
    pass

  @classmethod
  def from_dict(cls, json_object):
    """Constructs a `ModelConfig` from a Python dictionary of parameters."""
    config = cls()
    for key in json_object:
      value = json_object[key]
      # Nested dicts are wrapped so they support attribute access too.
      if isinstance(value, dict):
        value = AbsModelConfig.from_dict(value)
      setattr(config, key, value)
    return config

  @classmethod
  def from_json_file(cls, json_file):
    """Constructs a `ModelConfig` from a json file of parameters."""
    with open(json_file, "r", encoding='utf-8') as reader:
      return cls.from_dict(json.load(reader))

  def __repr__(self):
    return str(self.to_json_string())

  def to_dict(self):
    """Serializes this instance to a Python dictionary."""
    return copy.deepcopy(self.__dict__)

  def to_json_string(self):
    """Serializes this instance to a JSON string (ending with a newline)."""
    def _encode(obj):
      # Nested configs serialize through their attribute dictionary.
      if isinstance(obj, AbsModelConfig):
        return obj.__dict__
    return json.dumps(self.__dict__, indent=2, sort_keys=True, default=_encode) + "\n"
41
+
42
class ModelConfig(AbsModelConfig):
  """Configuration for a :class:`~DeBERTa.deberta.DeBERTa` model.

  Attributes:
    hidden_size (int): Size of the encoder layers and the pooler layer, default: `768`.
    num_hidden_layers (int): Number of hidden layers in the Transformer encoder, default: `12`.
    num_attention_heads (int): Number of attention heads per attention layer, default: `12`.
    intermediate_size (int): Size of the "intermediate" (feed-forward) layer, default: `3072`.
    hidden_act (str): The non-linear activation function in the encoder and pooler;
      "gelu", "relu" and "swish" are supported, default: `gelu`.
    hidden_dropout_prob (float): Dropout probability for all fully connected layers
      in the embeddings, encoder, and pooler, default: `0.1`.
    attention_probs_dropout_prob (float): Dropout ratio for attention probabilities, default: `0.1`.
    max_position_embeddings (int): The maximum sequence length the model may be used with, default: `512`.
    type_vocab_size (int): Vocabulary size of `token_type_ids`, default: `0`.
    initializer_range (float): Stddev of the normal initializer for weight matrices, default: `0.02`.
    layer_norm_eps (float): Epsilon used by layer normalization, default: `1e-7`.
    padding_idx (int): The value used to pad input_ids, default: `0`.
    vocab_size (int): Token vocabulary size; `-1` means unset, default: `-1`.

  Additional fields such as `relative_attention`, `max_relative_positions`,
  `position_biased_input` and `pos_att_type` may be supplied through
  `from_dict`/`from_json_file`.
  """

  def __init__(self):
    """Initialize every hyper-parameter to its DeBERTa-base default."""
    self.__dict__.update({
        'hidden_size': 768,
        'num_hidden_layers': 12,
        'num_attention_heads': 12,
        'hidden_act': "gelu",
        'intermediate_size': 3072,
        'hidden_dropout_prob': 0.1,
        'attention_probs_dropout_prob': 0.1,
        'max_position_embeddings': 512,
        'type_vocab_size': 0,
        'initializer_range': 0.02,
        'layer_norm_eps': 1e-7,
        'padding_idx': 0,
        'vocab_size': -1,
    })
nlu/DeBERTa/deberta/da_utils.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import pdb
3
+ from functools import lru_cache
4
+ import numpy as np
5
+ import math
6
+
7
+ __all__=['build_relative_position', 'make_log_bucket_position']
8
+
9
@lru_cache(maxsize=128)
def make_log_bucket_dict(bucket_size, max_position, device=None):
  """Build a lookup table mapping relative positions to log-bucket ids.

  Index `i` of the returned long tensor corresponds to relative position
  `i - max_position`, covering [-max_position, max_position). Distances
  within the linear region keep their value; farther distances are
  compressed logarithmically, preserving sign. Results are memoized per
  (bucket_size, max_position, device).
  """
  rel = torch.arange(-max_position, max_position, device=device)
  sign = torch.sign(rel)
  mid = bucket_size // 2
  # Replace in-range distances by a dummy (mid-1) so the log below stays
  # finite; those entries take the linear branch of the final where anyway.
  dist = torch.where((rel < mid) & (rel > -mid), torch.tensor(mid - 1).to(rel), rel.abs())
  log_pos = torch.ceil(torch.log(dist / mid) / math.log((max_position - 1) / mid) * (mid - 1)) + mid
  return torch.where(dist <= mid, rel, (log_pos * sign).to(rel)).to(torch.long)
18
+
19
# Faster version
def make_log_bucket_position(relative_pos, bucket_size, max_position):
  """Map a tensor of relative positions to log-bucket ids via a cached table."""
  shifted = torch.clamp(relative_pos, -max_position + 1, max_position - 1) + max_position
  table = make_log_bucket_dict(bucket_size, max_position, shifted.device)
  # Give the 1-D lookup table one leading axis per batch-like dim of the input
  # so it can be broadcast-expanded and gathered along the last dim.
  while table.dim() < shifted.dim():
    table = table.unsqueeze(0)
  expanded = table.expand(list(shifted.size())[:-1] + [table.size(-1)])
  return torch.gather(expanded, index=shifted.long(), dim=-1)
27
+
28
@lru_cache(maxsize=128)
def build_relative_position(query_size, key_size, bucket_size=-1, max_position=-1, device=None):
  """Return relative position ids of shape [1, query_size, key_size].

  Entry (q, k) holds q - k. When both `bucket_size` and `max_position` are
  positive, raw distances are compressed into log buckets. Results are
  memoized, so all arguments must be hashable.
  """
  q_ids = torch.arange(0, query_size)
  k_ids = torch.arange(0, key_size)
  if device is not None:
    q_ids = q_ids.to(device)
    k_ids = k_ids.to(device)
  rel_pos_ids = q_ids[:, None] - k_ids[None, :]
  if bucket_size > 0 and max_position > 0:
    rel_pos_ids = make_log_bucket_position(rel_pos_ids, bucket_size, max_position)
  # Add a leading batch axis expected by the attention code.
  return rel_pos_ids[:query_size, :].unsqueeze(0)
43
+
44
def build_relative_position_from_abs(query_pos, key_pos, bucket_size=-1, max_position=-1, device=None):
  """Compute relative position ids from explicit absolute positions.

  `query_pos`/`key_pos` may be tensors or tuples of ints. The result gains a
  trailing key axis: entry (..., q, k) = query_pos[..., q] - key_pos[..., k].
  Log bucketing applies when both `bucket_size` and `max_position` are
  positive.
  """
  q_ids = torch.tensor(query_pos) if isinstance(query_pos, tuple) else query_pos
  k_ids = torch.tensor(key_pos) if isinstance(key_pos, tuple) else key_pos

  if device is not None:
    q_ids = q_ids.to(device)
    k_ids = k_ids.to(device)
  rel_pos_ids = q_ids.unsqueeze(-1) - k_ids.unsqueeze(-2)
  if bucket_size > 0 and max_position > 0:
    rel_pos_ids = make_log_bucket_position(rel_pos_ids, bucket_size, max_position)
  return rel_pos_ids
63
+
64
def test_log_bucket():
  """Manual sanity check for log-bucketed relative positions.

  Bug fix: the original built `x` with `np.arange`, but
  `make_log_bucket_position` uses torch ops (`torch.clamp`, `torch.gather`)
  which raise TypeError on numpy input; build a torch tensor instead. Drops
  into pdb so the resulting buckets can be inspected interactively.
  """
  x = torch.arange(-511, 511)
  y = make_log_bucket_position(x, 128, 512)
  pdb.set_trace()
68
+
nlu/DeBERTa/deberta/deberta.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft, Inc. 2020
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+ #
6
+ # Author: penhe@microsoft.com
7
+ # Date: 01/15/2020
8
+ #
9
+
10
+ import copy
11
+ import torch
12
+ import os
13
+
14
+ import json
15
+ from .ops import *
16
+ from .bert import *
17
+ from .config import ModelConfig
18
+ from .cache_utils import load_model_state
19
+ import pdb
20
+
21
+ __all__ = ['DeBERTa']
22
+
23
class DeBERTa(torch.nn.Module):
  """ DeBERTa encoder
  This module is composed of the input embedding layer with stacked transformer layers with disentangled attention.

  Parameters:
    config:
      A model config class instance with the configuration to build a new model. The schema is similar to `BertConfig`, \
          for more details, please refer :class:`~DeBERTa.deberta.ModelConfig`

    pre_trained:
      The pre-trained DeBERTa model, it can be a physical path of a pre-trained DeBERTa model or a released configurations, \
          i.e. [**base, large, base_mnli, large_mnli**]

  """

  def __init__(self, config=None, pre_trained=None):
    super().__init__()
    state = None
    if pre_trained is not None:
      state, model_config = load_model_state(pre_trained)
      # Merge configs: keep the checkpoint's structural sizes (which must
      # match the stored weights) and let the caller-supplied config
      # override every non-structural field.
      if config is not None and model_config is not None:
        for k in config.__dict__:
          if k not in ['hidden_size',
            'intermediate_size',
            'num_attention_heads',
            'num_hidden_layers',
            'vocab_size',
            'max_position_embeddings']:
            model_config.__dict__[k] = config.__dict__[k]
      config = copy.copy(model_config)
    self.embeddings = BertEmbeddings(config)
    self.encoder = BertEncoder(config)
    self.config = config
    self.pre_trained = pre_trained
    # Initialize weights from the loaded checkpoint; no-op when no
    # pretrained state is available.
    self.apply_state(state)

  def forward(self, input_ids, attention_mask=None, token_type_ids=None, output_all_encoded_layers=True, position_ids = None, return_att = False):
    """
    Args:
      input_ids:
        a torch.LongTensor of shape [batch_size, sequence_length] \
      with the word token indices in the vocabulary

      attention_mask:
        an optional parameter for input mask or attention mask.

        - If it's an input mask, then it will be torch.LongTensor of shape [batch_size, sequence_length] with indices \
      selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max \
      input sequence length in the current batch. It's the mask that we typically use for attention when \
      a batch has varying length sentences.

        - If it's an attention mask then it will be torch.LongTensor of shape [batch_size, sequence_length, sequence_length]. \
      In this case, it's a mask indicate which tokens in the sequence should be attended by other tokens in the sequence.

      token_type_ids:
        an optional torch.LongTensor of shape [batch_size, sequence_length] with the token \
      types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to \
      a `sentence B` token (see BERT paper for more details).

      output_all_encoded_layers:
        whether to output results of all encoder layers, default, True

    Returns:

      - The output of the stacked transformer layers if `output_all_encoded_layers=True`, else \
      the last layer of stacked transformer layers

      - Attention matrix of self-attention layers if `return_att=True`


    Example::

      # Batch of wordPiece token ids.
      # Each sample was padded with zero to the maxium length of the batch
      input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
      # Mask of valid input ids
      attention_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])

      # DeBERTa model initialized with pretrained base model
      bert = DeBERTa(pre_trained='base')

      encoder_layers = bert(input_ids, attention_mask=attention_mask)

    """

    if attention_mask is None:
      # Default: every position is a valid token.
      attention_mask = torch.ones_like(input_ids)
    if token_type_ids is None:
      # Default: single-segment input (all type 0).
      token_type_ids = torch.zeros_like(input_ids)

    ebd_output = self.embeddings(input_ids.to(torch.long), token_type_ids.to(torch.long), position_ids, attention_mask)
    embedding_output = ebd_output['embeddings']
    encoder_output = self.encoder(embedding_output,
                  attention_mask,
                  output_all_encoded_layers=output_all_encoded_layers, return_att = return_att)
    # Expose embedding-layer outputs alongside encoder outputs in one dict.
    encoder_output.update(ebd_output)
    return encoder_output

  def apply_state(self, state = None):
    """ Load state from previous loaded model state dictionary.

    Args:
      state (:obj:`dict`, optional): State dictionary as the state returned by torch.module.state_dict(), default: `None`. \
          If it's `None`, then will use the pre-trained state loaded via the constructor to re-initialize \
          the `DeBERTa` model
    """
    if self.pre_trained is None and state is None:
      return
    if state is None:
      state, config = load_model_state(self.pre_trained)
      self.config = config

    # Checkpoints may prefix parameter names (e.g. 'bert.'); detect the
    # prefix from the first embedding key so checkpoint keys line up with
    # this module's parameter names.
    prefix = ''
    for k in state:
      if 'embeddings.' in k:
        if not k.startswith('embeddings.'):
          prefix = k[:k.index('embeddings.')]
        break

    missing_keys = []
    unexpected_keys = []
    error_msgs = []
    self._load_from_state_dict(state, prefix = prefix, local_metadata=None, strict=True, missing_keys=missing_keys, unexpected_keys=unexpected_keys, error_msgs=error_msgs)
nlu/DeBERTa/deberta/disentangled_attention.py ADDED
@@ -0,0 +1,221 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft, Inc. 2020
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+ #
6
+ # Author: penhe@microsoft.com
7
+ # Date: 01/15/2020
8
+ #
9
+
10
+ """
11
+ Disentangled SelfAttention module
12
+ """
13
+
14
+ import numpy as np
15
+ import math
16
+ import torch
17
+ from torch import nn
18
+ import functools
19
+ import pdb
20
+
21
+ from .ops import *
22
+ from .da_utils import build_relative_position
23
+
24
+ from ..utils import get_logger
25
+ logger=get_logger()
26
+
27
+ from adapterlib import adapter_dict
28
+
29
+ __all__=['DisentangledSelfAttention']
30
class DisentangledSelfAttention(nn.Module):
  """Disentangled self-attention (DeBERTa).

  Attention scores are the sum of content-to-content attention and optional
  relative-position terms selected by `pos_att_type` ('c2p', 'p2c', 'p2p'),
  each computed from separate projections of content states and relative
  position embeddings. The Q/K/V projections may be wrapped by adapter
  layers chosen via `config.inject_adapter`.
  """
  def __init__(self, config):
    super().__init__()
    self.num_attention_heads = config.num_attention_heads
    _attention_head_size = int(config.hidden_size / config.num_attention_heads)
    self.attention_head_size = getattr(config, 'attention_head_size', _attention_head_size)
    self.all_head_size = self.num_attention_heads * self.attention_head_size

    # -----------------------------------------------------------------------------------------------------------------------
    # Q/K/V projections: 'linear' keeps a plain nn.Linear; any other value
    # looks up an adapter-wrapped projection in adapter_dict.
    if config.inject_adapter != 'linear':
      self.query_proj = adapter_dict[config.inject_adapter](config.hidden_size, self.all_head_size, config=config)
    else:
      self.query_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=True)

    # self.key_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=True)
    if config.inject_adapter != 'linear':
      self.key_proj = adapter_dict[config.inject_adapter](config.hidden_size, self.all_head_size, config=config)
    else:
      self.key_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=True)

    if config.inject_adapter != 'linear':
      self.value_proj = adapter_dict[config.inject_adapter](config.hidden_size, self.all_head_size, config=config)
    else:
      self.value_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=True)

    # -----------------------------------------------------------------------------------------------------------------------

    # self.query_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=True)
    # self.key_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=True)
    # self.value_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=True)

    self.share_att_key = getattr(config, 'share_att_key', False)
    self.pos_att_type = [x.strip() for x in getattr(config, 'pos_att_type', 'c2p').lower().split('|')] # c2p|p2c
    self.relative_attention = getattr(config, 'relative_attention', False)

    if self.relative_attention:
      self.position_buckets = getattr(config, 'position_buckets', -1)
      self.max_relative_positions = getattr(config, 'max_relative_positions', -1)
      if self.max_relative_positions <1:
        self.max_relative_positions = config.max_position_embeddings
      # Size of the relative-position embedding table half-window: bucketed
      # size when log buckets are enabled, raw distance range otherwise.
      self.pos_ebd_size = self.max_relative_positions
      if self.position_buckets>0:
        self.pos_ebd_size = self.position_buckets
        # For backward compatibility

      self.pos_dropout = StableDropout(config.hidden_dropout_prob)

      # Without shared attention keys, position embeddings get their own
      # projections (only the ones the enabled score terms need).
      if (not self.share_att_key):
        if 'c2p' in self.pos_att_type or 'p2p' in self.pos_att_type:
          self.pos_key_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=True)
        if 'p2c' in self.pos_att_type or 'p2p' in self.pos_att_type:
          self.pos_query_proj = nn.Linear(config.hidden_size, self.all_head_size)

    self.dropout = StableDropout(config.attention_probs_dropout_prob)
    # Remap DeBERTa-v1 checkpoint keys (in_proj/q_bias/...) on load.
    self._register_load_state_dict_pre_hook(self._pre_load_hook)

  def transpose_for_scores(self, x, attention_heads):
    # [B, L, H*D] -> [B*H, L, D] so heads can be processed with bmm.
    new_x_shape = x.size()[:-1] + (attention_heads, -1)
    x = x.view(*new_x_shape)
    return x.permute(0, 2, 1, 3).contiguous().view(-1, x.size(1), x.size(-1))

  def forward(self, hidden_states, attention_mask, return_att=False, query_states=None, relative_pos=None, rel_embeddings=None):
    """Compute disentangled attention.

    Returns a dict with 'hidden_states' (context), 'attention_probs'
    (post-softmax, pre-dropout) and 'attention_logits' (masked raw scores).
    """
    if query_states is None:
      query_states = hidden_states
    query_layer = self.transpose_for_scores(self.query_proj(query_states), self.num_attention_heads).float()
    key_layer = self.transpose_for_scores(self.key_proj(hidden_states), self.num_attention_heads).float()
    value_layer = self.transpose_for_scores(self.value_proj(hidden_states), self.num_attention_heads)

    rel_att = None
    # Take the dot product between "query" and "key" to get the raw attention scores.
    # The 1/sqrt(d * scale_factor) scaling accounts for each enabled score
    # term contributing to the total (content + each relative term).
    scale_factor = 1
    if 'c2p' in self.pos_att_type:
      scale_factor += 1
    if 'p2c' in self.pos_att_type:
      scale_factor += 1
    if 'p2p' in self.pos_att_type:
      scale_factor += 1
    scale = 1/math.sqrt(query_layer.size(-1)*scale_factor)
    attention_scores = torch.bmm(query_layer, key_layer.transpose(-1, -2)*scale)
    if self.relative_attention:
      rel_embeddings = self.pos_dropout(rel_embeddings)
      rel_att = self.disentangled_attention_bias(query_layer, key_layer, relative_pos, rel_embeddings, scale_factor)

    if rel_att is not None:
      attention_scores = (attention_scores + rel_att)
    # Subtract the (detached) row max for numerical stability before softmax.
    attention_scores = (attention_scores - attention_scores.max(dim=-1, keepdim=True).values.detach()).to(hidden_states)
    attention_scores = attention_scores.view(-1, self.num_attention_heads, attention_scores.size(-2), attention_scores.size(-1))

    # bxhxlxd
    _attention_probs = XSoftmax.apply(attention_scores, attention_mask, -1)
    attention_probs = self.dropout(_attention_probs)
    context_layer = torch.bmm(attention_probs.view(-1, attention_probs.size(-2), attention_probs.size(-1)), value_layer)
    context_layer = context_layer.view(-1, self.num_attention_heads, context_layer.size(-2), context_layer.size(-1)).permute(0, 2, 1, 3).contiguous()
    new_context_layer_shape = context_layer.size()[:-2] + (-1,)
    context_layer = context_layer.view(*new_context_layer_shape)

    return {
      'hidden_states': context_layer,
      'attention_probs': _attention_probs,
      'attention_logits': attention_scores
      }

  def disentangled_attention_bias(self, query_layer, key_layer, relative_pos, rel_embeddings, scale_factor):
    """Compute the relative-position score terms (c2p/p2c/p2p) to add to the
    content-to-content attention scores."""
    if relative_pos is None:
      q = query_layer.size(-2)
      relative_pos = build_relative_position(q, key_layer.size(-2), bucket_size = self.position_buckets, \
          max_position = self.max_relative_positions, device=query_layer.device)
    if relative_pos.dim()==2:
      relative_pos = relative_pos.unsqueeze(0).unsqueeze(0)
    elif relative_pos.dim()==3:
      relative_pos = relative_pos.unsqueeze(1)
    # bxhxqxk
    elif relative_pos.dim()!=4:
      raise ValueError(f'Relative postion ids must be of dim 2 or 3 or 4. {relative_pos.dim()}')

    att_span = self.pos_ebd_size
    relative_pos = relative_pos.long().to(query_layer.device)

    # NOTE(review): with att_span == pos_ebd_size this slice is
    # rel_embeddings[0 : 2*pos_ebd_size] — confirm the embedding table has
    # 2*pos_ebd_size rows.
    rel_embeddings = rel_embeddings[self.pos_ebd_size - att_span:self.pos_ebd_size + att_span, :].unsqueeze(0) #.repeat(query_layer.size(0)//self.num_attention_heads, 1, 1)
    if self.share_att_key:
      # Reuse the content Q/K projections for the position embeddings.
      pos_query_layer = self.transpose_for_scores(self.query_proj(rel_embeddings), self.num_attention_heads)\
          .repeat(query_layer.size(0)//self.num_attention_heads, 1, 1) #.split(self.all_head_size, dim=-1)
      pos_key_layer = self.transpose_for_scores(self.key_proj(rel_embeddings), self.num_attention_heads)\
          .repeat(query_layer.size(0)//self.num_attention_heads, 1, 1) #.split(self.all_head_size, dim=-1)
    else:
      if 'c2p' in self.pos_att_type or 'p2p' in self.pos_att_type:
        pos_key_layer = self.transpose_for_scores(self.pos_key_proj(rel_embeddings), self.num_attention_heads)\
            .repeat(query_layer.size(0)//self.num_attention_heads, 1, 1) #.split(self.all_head_size, dim=-1)
      if 'p2c' in self.pos_att_type or 'p2p' in self.pos_att_type:
        pos_query_layer = self.transpose_for_scores(self.pos_query_proj(rel_embeddings), self.num_attention_heads)\
            .repeat(query_layer.size(0)//self.num_attention_heads, 1, 1) #.split(self.all_head_size, dim=-1)

    score = 0
    # content->position
    if 'c2p' in self.pos_att_type:
      scale = 1/math.sqrt(pos_key_layer.size(-1)*scale_factor)
      c2p_att = torch.bmm(query_layer, pos_key_layer.transpose(-1, -2).to(query_layer)*scale)
      # Shift relative positions into [0, 2*att_span) to index the table.
      c2p_pos = torch.clamp(relative_pos + att_span, 0, att_span*2-1).squeeze(0).expand([query_layer.size(0), query_layer.size(1), relative_pos.size(-1)])
      c2p_att = torch.gather(c2p_att, dim=-1, index=c2p_pos)
      score += c2p_att

    # position->content
    if 'p2c' in self.pos_att_type or 'p2p' in self.pos_att_type:
      scale = 1/math.sqrt(pos_query_layer.size(-1)*scale_factor)

      if 'p2c' in self.pos_att_type:
        p2c_att = torch.bmm(pos_query_layer.to(key_layer)*scale, key_layer.transpose(-1, -2))
        # NOTE(review): this reuses c2p_pos from the c2p branch; if
        # pos_att_type ever contains 'p2c' without 'c2p', c2p_pos is
        # undefined here (NameError) — confirm supported configs.
        p2c_att = torch.gather(p2c_att, dim=-2, index=c2p_pos)
        score += p2c_att

    # position->position
    if 'p2p' in self.pos_att_type:
      pos_query = pos_query_layer[:,:,att_span:,:]
      p2p_att = torch.matmul(pos_query, pos_key_layer.transpose(-1, -2))
      p2p_att = p2p_att.expand(query_layer.size()[:2] + p2p_att.size()[2:])
      if query_layer.size(-2) != key_layer.size(-2):
        # NOTE(review): pos_index is not defined in this method/class, so
        # this path (query/key length mismatch with p2p) would raise
        # NameError — confirm whether it is ever exercised.
        p2p_att = torch.gather(p2p_att, dim=-2, index=pos_index.expand(query_layer.size()[:2] + (pos_index.size(-2), p2p_att.size(-1))))
      p2p_att = torch.gather(p2p_att, dim=-1, index=c2p_pos.expand([query_layer.size(0), query_layer.size(1), query_layer.size(2), relative_pos.size(-1)]))
      score += p2p_att

    return score

  def _pre_load_hook(self, state_dict, prefix, local_metadata, strict,
            missing_keys, unexpected_keys, error_msgs):
    """Rewrite DeBERTa-v1 checkpoint keys to the v2 projection layout.

    v1 stored a fused QKV weight ('in_proj.weight') plus separate q/v biases;
    this hook splits them into query/key/value projections and renames the
    position projections, then deletes the old keys.
    """
    self_state = self.state_dict()
    if ((prefix + 'query_proj.weight') not in state_dict) and ((prefix + 'in_proj.weight') in state_dict):
      # Split the fused weight per head into Q, K, V chunks.
      v1_proj = state_dict[prefix+'in_proj.weight']
      v1_proj = v1_proj.unsqueeze(0).reshape(self.num_attention_heads, -1, v1_proj.size(-1))
      q,k,v=v1_proj.chunk(3, dim=1)
      state_dict[prefix + 'query_proj.weight'] = q.reshape(-1, v1_proj.size(-1))
      state_dict[prefix + 'key_proj.weight'] = k.reshape(-1, v1_proj.size(-1))
      # v1 had no key bias; keep this module's freshly-initialized one.
      state_dict[prefix + 'key_proj.bias'] = self_state['key_proj.bias']
      state_dict[prefix + 'value_proj.weight'] = v.reshape(-1, v1_proj.size(-1))
      v1_query_bias = state_dict[prefix + 'q_bias']
      state_dict[prefix + 'query_proj.bias'] = v1_query_bias
      v1_value_bias = state_dict[prefix +'v_bias']
      state_dict[prefix + 'value_proj.bias'] = v1_value_bias

      v1_pos_key_proj = state_dict[prefix + 'pos_proj.weight']
      state_dict[prefix + 'pos_key_proj.weight'] = v1_pos_key_proj
      v1_pos_query_proj = state_dict[prefix + 'pos_q_proj.weight']
      state_dict[prefix + 'pos_query_proj.weight'] = v1_pos_query_proj
      v1_pos_query_proj_bias = state_dict[prefix + 'pos_q_proj.bias']
      state_dict[prefix + 'pos_query_proj.bias'] = v1_pos_query_proj_bias
      state_dict[prefix + 'pos_key_proj.bias'] = self_state['pos_key_proj.bias']

      del state_dict[prefix + 'in_proj.weight']
      del state_dict[prefix + 'q_bias']
      del state_dict[prefix + 'v_bias']
      del state_dict[prefix + 'pos_proj.weight']
      del state_dict[prefix + 'pos_q_proj.weight']
      del state_dict[prefix + 'pos_q_proj.bias']
nlu/DeBERTa/deberta/gpt2_bpe_utils.py ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Byte pair encoding utilities from GPT-2.
3
+
4
+ Original source: https://github.com/openai/gpt-2/blob/master/src/encoder.py
5
+ Original license: MIT
6
+ """
7
+
8
+ from functools import lru_cache
9
+ import json
10
+ import random
11
+ import unicodedata
12
+
13
+ try:
14
+ import regex as re
15
+ except ImportError:
16
+ raise ImportError('Please install regex with: pip install regex')
17
+
18
@lru_cache()
def bytes_to_unicode():
  """Return a dict mapping each of the 256 byte values to a unicode character.

  Printable, non-whitespace latin-1 bytes map to themselves; every remaining
  byte gets a codepoint >= 256, so the reversible BPE alphabet never
  contains whitespace or control characters (which the BPE code chokes on).
  The mapping is a bijection over all 256 byte values.
  """
  keep = (
      list(range(ord("!"), ord("~") + 1))
      + list(range(ord("¡"), ord("¬") + 1))
      + list(range(ord("®"), ord("ÿ") + 1))
  )
  byte_vals = list(keep)
  char_vals = list(keep)
  offset = 0
  for b in range(2 ** 8):
    if b not in keep:
      byte_vals.append(b)
      # Shift the remaining bytes above the latin-1 range.
      char_vals.append(2 ** 8 + offset)
      offset += 1
  return {b: chr(c) for b, c in zip(byte_vals, char_vals)}
39
+
40
def get_pairs(word):
  """Return the set of adjacent symbol pairs in a word.

  Word is represented as a tuple of symbols (symbols being variable-length
  strings). Robustness fix: an empty word now yields an empty set instead of
  raising IndexError on `word[0]`.
  """
  return set(zip(word, word[1:]))
50
+
51
class Encoder:
  """GPT-2 byte-pair encoder.

  Maps text to BPE token ids and back. `encoder` maps BPE token strings to
  ids; `bpe_merges` is an ordered list of merge pairs (earlier entries have
  higher merge priority).
  """

  def __init__(self, encoder, bpe_merges, errors='replace'):
    self.encoder = encoder
    self.decoder = {v:k for k,v in self.encoder.items()}
    self.errors = errors # how to handle errors in decoding
    self.byte_encoder = bytes_to_unicode()
    self.byte_decoder = {v:k for k, v in self.byte_encoder.items()}
    # Earlier merges get lower ranks, i.e. higher priority in the BPE loop.
    self.bpe_ranks = dict(zip([tuple(k) for k in bpe_merges], range(len(bpe_merges))))
    self.cache = {}
    self.random = random.Random(0)

    # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
    self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")

  def bpe(self, token):
    """Apply BPE merges to one pre-tokenized word; returns space-joined symbols."""
    if token in self.cache:
      return self.cache[token]
    word = tuple(token)
    pairs = get_pairs(word)

    if not pairs:
      return token

    while True:
      # Merge the highest-priority (lowest-rank) adjacent pair present.
      bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))
      if bigram not in self.bpe_ranks:
        break
      first, second = bigram
      new_word = []
      i = 0
      while i < len(word):
        # Bug fix: catch only ValueError from tuple.index — the original
        # bare `except:` also swallowed KeyboardInterrupt/SystemExit.
        try:
          j = word.index(first, i)
          new_word.extend(word[i:j])
          i = j
        except ValueError:
          # `first` no longer occurs; copy the tail and stop.
          new_word.extend(word[i:])
          break

        if word[i] == first and i < len(word)-1 and word[i+1] == second:
          new_word.append(first+second)
          i += 2
        else:
          new_word.append(word[i])
          i += 1
      new_word = tuple(new_word)
      word = new_word
      if len(word) == 1:
        break
      else:
        pairs = get_pairs(word)
    word = ' '.join(word)
    self.cache[token] = word
    return word

  def split_to_words(self, text):
    """Split raw text into GPT-2 pre-tokenization chunks."""
    return list(re.findall(self.pat, text))

  def encode(self, text):
    """Encode `text` into a list of BPE token ids."""
    bpe_tokens = []
    for token in self.split_to_words(text):
      # Map raw bytes into the reversible unicode alphabet before BPE.
      token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
      bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
    return bpe_tokens

  def decode(self, tokens):
    """Decode a list of BPE token ids back into text."""
    text = ''.join([self.decoder[token] for token in tokens])
    text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors)
    return text
121
+
122
def get_encoder(encoder, vocab):
  """Build an `Encoder` from a token-to-id map and a list of BPE merges."""
  return Encoder(encoder=encoder, bpe_merges=vocab)
127
+
128
+ def _is_whitespace(char):
129
+ """Checks whether `chars` is a whitespace character."""
130
+ # \t, \n, and \r are technically contorl characters but we treat them
131
+ # as whitespace since they are generally considered as such.
132
+ if char == " " or char == "\t" or char == "\n" or char == "\r":
133
+ return True
134
+ cat = unicodedata.category(char)
135
+ if cat == "Zs":
136
+ return True
137
+ return False
138
+
139
+ def _is_control(char):
140
+ """Checks whether `chars` is a control character."""
141
+ # These are technically control characters but we count them as whitespace
142
+ # characters.
143
+ if char == "\t" or char == "\n" or char == "\r":
144
+ return False
145
+ cat = unicodedata.category(char)
146
+ if cat.startswith("C"):
147
+ return True
148
+ return False
149
+
150
+ def _is_punctuation(char):
151
+ """Checks whether `chars` is a punctuation character."""
152
+ cp = ord(char)
153
+ # We treat all non-letter/number ASCII as punctuation.
154
+ # Characters such as "^", "$", and "`" are not in the Unicode
155
+ # Punctuation class but we treat them as punctuation anyways, for
156
+ # consistency.
157
+ if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
158
+ (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
159
+ return True
160
+ cat = unicodedata.category(char)
161
+ if cat.startswith("P"):
162
+ return True
163
+ return False
nlu/DeBERTa/deberta/gpt2_tokenizer.py ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ # Copyright (c) Microsoft, Inc. 2020
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ #
7
+ # Author: penhe@microsoft.com
8
+ # Date: 01/15/2020
9
+ #
10
+
11
+ # This piece of code is derived from https://github.com/pytorch/fairseq/blob/master/fairseq/data/encoders/gpt2_bpe.py
12
+
13
+ import torch
14
+ import unicodedata
15
+ import os
16
+ from .gpt2_bpe_utils import get_encoder,_is_control,_is_whitespace,_is_punctuation
17
+ from .cache_utils import load_vocab
18
+
19
+ __all__ = ['GPT2Tokenizer']
20
+
21
+ class GPT2Tokenizer(object):
22
+ """ A wrapper of GPT2 tokenizer with similar interface as BERT tokenizer
23
+
24
+ Args:
25
+
26
+ vocab_file (:obj:`str`, optional):
27
+ The local path of vocabulary package or the release name of vocabulary in `DeBERTa GitHub releases <https://github.com/microsoft/DeBERTa/releases>`_, \
28
+ e.g. "bpe_encoder", default: `None`.
29
+
30
+ If it's `None`, then it will download the vocabulary in the latest release from GitHub. The vocabulary file is a \
31
+ state dictionary with three items, "dict_map", "vocab", "encoder" which correspond to three files used in `RoBERTa`, i.e. `dict.txt`, `vocab.txt` and `encoder.json`. \
32
+
33
+ The difference between our wrapped GPT2 tokenizer and RoBERTa wrapped tokenizer are,
34
+
35
+ - Special tokens, unlike `RoBERTa` which use `<s>`, `</s>` as the `start` token and `end` token of a sentence. We use `[CLS]` and `[SEP]` as the `start` and `end`\
36
+ token of input sentence which is the same as `BERT`.
37
+
38
+ - We remapped the token ids in our dictionary with regarding to the new special tokens, `[PAD]` => 0, `[CLS]` => 1, `[SEP]` => 2, `[UNK]` => 3, `[MASK]` => 50264
39
+
40
+ do_lower_case (:obj:`bool`, optional):
41
+ Whether to convert inputs to lower case. **Not used in GPT2 tokenizer**.
42
+
43
+ special_tokens (:obj:`list`, optional):
44
+ List of special tokens to be added to the end of the vocabulary.
45
+
46
+
47
+ """
48
+ def __init__(self, vocab_file=None, do_lower_case=True, special_tokens=None):
49
+ self.pad_token='[PAD]'
50
+ self.sep_token='[SEP]'
51
+ self.unk_token='[UNK]'
52
+ self.cls_token='[CLS]'
53
+
54
+ self.symbols = []
55
+ self.count = []
56
+ self.indices = {}
57
+ self.pad_token_id = self.add_symbol(self.pad_token)
58
+ self.cls_token_id = self.add_symbol(self.cls_token)
59
+ self.sep_token_id = self.add_symbol(self.sep_token)
60
+ self.unk_token_id = self.add_symbol(self.unk_token)
61
+
62
+ self.gpt2_encoder = torch.load(vocab_file)
63
+ self.bpe = get_encoder(self.gpt2_encoder['encoder'], self.gpt2_encoder['vocab'])
64
+ for w,n in self.gpt2_encoder['dict_map']:
65
+ self.add_symbol(w, n)
66
+
67
+ self.mask_token='[MASK]'
68
+ self.mask_id = self.add_symbol(self.mask_token)
69
+ self.special_tokens = ['[MASK]', '[SEP]', '[PAD]', '[UNK]', '[CLS]']
70
+ if special_tokens is not None:
71
+ for t in special_tokens:
72
+ self.add_special_token(t)
73
+
74
+ self.vocab = self.indices
75
+ self.ids_to_tokens = self.symbols
76
+
77
+ def tokenize(self, text):
78
+ """ Convert an input text to tokens.
79
+
80
+ Args:
81
+
82
+ text (:obj:`str`): input text to be tokenized.
83
+
84
+ Returns:
85
+ A list of byte tokens where each token represent the byte id in GPT2 byte dictionary
86
+
87
+ Example::
88
+
89
+ >>> tokenizer = GPT2Tokenizer()
90
+ >>> text = "Hello world!"
91
+ >>> tokens = tokenizer.tokenize(text)
92
+ >>> print(tokens)
93
+ ['15496', '995', '0']
94
+
95
+ """
96
+ bpe = self._encode(text)
97
+
98
+ return [t for t in bpe.split(' ') if t]
99
+
100
+ def convert_tokens_to_ids(self, tokens):
101
+ """ Convert list of tokens to ids.
102
+
103
+ Args:
104
+
105
+ tokens (:obj:`list<str>`): list of tokens
106
+
107
+ Returns:
108
+
109
+ List of ids
110
+ """
111
+
112
+ return [self.vocab[t] for t in tokens]
113
+
114
+ def convert_ids_to_tokens(self, ids):
115
+ """ Convert list of ids to tokens.
116
+
117
+ Args:
118
+
119
+ ids (:obj:`list<int>`): list of ids
120
+
121
+ Returns:
122
+
123
+ List of tokens
124
+ """
125
+
126
+ tokens = []
127
+ for i in ids:
128
+ tokens.append(self.ids_to_tokens[i])
129
+ return tokens
130
+
131
+ def split_to_words(self, text):
132
+ return self.bpe.split_to_words(text)
133
+
134
+ def decode(self, tokens):
135
+ """ Decode list of tokens to text strings.
136
+
137
+ Args:
138
+
139
+ tokens (:obj:`list<str>`): list of tokens.
140
+
141
+ Returns:
142
+
143
+ Text string corresponds to the input tokens.
144
+
145
+ Example::
146
+
147
+ >>> tokenizer = GPT2Tokenizer()
148
+ >>> text = "Hello world!"
149
+ >>> tokens = tokenizer.tokenize(text)
150
+ >>> print(tokens)
151
+ ['15496', '995', '0']
152
+
153
+ >>> tokenizer.decode(tokens)
154
+ 'Hello world!'
155
+
156
+ """
157
+ return self.bpe.decode([int(t) for t in tokens if t not in self.special_tokens])
158
+
159
+ def add_special_token(self, token):
160
+ """Adds a special token to the dictionary.
161
+
162
+ Args:
163
+ token (:obj:`str`): Tthe new token/word to be added to the vocabulary.
164
+
165
+ Returns:
166
+ The id of new token in the vocabulary.
167
+
168
+ """
169
+ self.special_tokens.append(token)
170
+ return self.add_symbol(token)
171
+
172
+ def part_of_whole_word(self, token, is_bos=False):
173
+ if is_bos:
174
+ return True
175
+ s = self._decode(token)
176
+ if (len(s)==1 and (_is_whitespace(list(s)[0]) or _is_control(list(s)[0]) or _is_punctuation(list(s)[0]))):
177
+ return False
178
+
179
+ return not s.startswith(' ')
180
+
181
+ def sym(self, id):
182
+ return self.ids_to_tokens[id]
183
+
184
+ def id(self, sym):
185
+ return self.vocab[sym]
186
+
187
+ def _encode(self, x: str) -> str:
188
+ return ' '.join(map(str, self.bpe.encode(x)))
189
+
190
+ def _decode(self, x: str) -> str:
191
+ return self.bpe.decode(map(int, x.split()))
192
+
193
+ def add_symbol(self, word, n=1):
194
+ """Adds a word to the dictionary.
195
+
196
+ Args:
197
+ word (:obj:`str`): Tthe new token/word to be added to the vocabulary.
198
+ n (int, optional): The frequency of the word.
199
+
200
+ Returns:
201
+ The id of the new word.
202
+
203
+ """
204
+ if word in self.indices:
205
+ idx = self.indices[word]
206
+ self.count[idx] = self.count[idx] + n
207
+ return idx
208
+ else:
209
+ idx = len(self.symbols)
210
+ self.indices[word] = idx
211
+ self.symbols.append(word)
212
+ self.count.append(n)
213
+ return idx
214
+
215
+ def save_pretrained(self, path: str):
216
+ torch.save(self.gpt2_encoder, path)
nlu/DeBERTa/deberta/mlm.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
2
+ # Copyright (c) Microsoft, Inc. 2020
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # This piece of code is modified based on https://github.com/huggingface/transformers
8
+
9
+ import torch
10
+ from torch import nn
11
+ import pdb
12
+
13
+ from .bert import LayerNorm,ACT2FN
14
+
15
+ __all__ = ['MLMPredictionHead']
16
+
17
+ class MLMPredictionHead(nn.Module):
18
+ def __init__(self, config, vocab_size):
19
+ super().__init__()
20
+ self.embedding_size = getattr(config, 'embedding_size', config.hidden_size)
21
+ self.dense = nn.Linear(config.hidden_size, self.embedding_size)
22
+ self.transform_act_fn = ACT2FN[config.hidden_act] \
23
+ if isinstance(config.hidden_act, str) else config.hidden_act
24
+
25
+ self.LayerNorm = LayerNorm(self.embedding_size, config.layer_norm_eps)
26
+ self.bias = nn.Parameter(torch.zeros(vocab_size))
27
+ self.pre_norm = PreLayerNorm(config)
28
+
29
+ def forward(self, hidden_states, embeding_weight):
30
+ hidden_states = self.pre_norm(hidden_states)
31
+ hidden_states = self.dense(hidden_states)
32
+ hidden_states = self.transform_act_fn(hidden_states)
33
+ # b x s x d
34
+ hidden_states = MaskedLayerNorm(self.LayerNorm, hidden_states)
35
+
36
+ # b x s x v
37
+ logits = torch.matmul(hidden_states, embeding_weight.t().to(hidden_states)) + self.bias
38
+ return logits
nlu/DeBERTa/deberta/nnmodule.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pdb
2
+ import os
3
+ import torch
4
+ import copy
5
+ from torch import nn
6
+ from .config import ModelConfig
7
+ from ..utils import xtqdm as tqdm
8
+ from .cache_utils import load_model_state
9
+
10
+ from ..utils import get_logger
11
+ logger = get_logger()
12
+
13
+ __all__ = ['NNModule']
14
+
15
+ class NNModule(nn.Module):
16
+ """ An abstract class to handle weights initialization and \
17
+ a simple interface for dowloading and loading pretrained models.
18
+
19
+ Args:
20
+
21
+ config (:obj:`~DeBERTa.deberta.ModelConfig`): The model config to the module
22
+
23
+ """
24
+
25
+ def __init__(self, config, *inputs, **kwargs):
26
+ super().__init__()
27
+ self.config = config
28
+
29
+ def init_weights(self, module):
30
+ """ Apply Gaussian(mean=0, std=`config.initializer_range`) initialization to the module.
31
+
32
+ Args:
33
+
34
+ module (:obj:`torch.nn.Module`): The module to apply the initialization.
35
+
36
+ Example::
37
+
38
+ class MyModule(NNModule):
39
+ def __init__(self, config):
40
+ # Add construction instructions
41
+ self.bert = DeBERTa(config)
42
+
43
+ # Add other modules
44
+ ...
45
+
46
+ # Apply initialization
47
+ self.apply(self.init_weights)
48
+
49
+ """
50
+ if isinstance(module, (nn.Linear, nn.Embedding)):
51
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
52
+ if isinstance(module, nn.Linear) and module.bias is not None:
53
+ module.bias.data.zero_()
54
+
55
+ def export_onnx(self, onnx_path, input):
56
+ raise NotImplementedError
57
+
58
+ @classmethod
59
+ def load_model(cls, model_path, model_config=None, tag=None, no_cache=False, cache_dir=None , *inputs, **kwargs):
60
+ """ Instantiate a sub-class of NNModule from a pre-trained model file.
61
+
62
+ Args:
63
+
64
+ model_path (:obj:`str`): Path or name of the pre-trained model which can be either,
65
+
66
+ - The path of pre-trained model
67
+
68
+ - The pre-trained DeBERTa model name in `DeBERTa GitHub releases <https://github.com/microsoft/DeBERTa/releases>`_, i.e. [**base, base_mnli, large, large_mnli**].
69
+
70
+ If `model_path` is `None` or `-`, then the method will create a new sub-class without initialing from pre-trained models.
71
+
72
+ model_config (:obj:`str`): The path of model config file. If it's `None`, then the method will try to find the the config in order:
73
+
74
+ 1. ['config'] in the model state dictionary.
75
+
76
+ 2. `model_config.json` aside the `model_path`.
77
+
78
+ If it failed to find a config the method will fail.
79
+
80
+ tag (:obj:`str`, optional): The release tag of DeBERTa, default: `None`.
81
+
82
+ no_cache (:obj:`bool`, optional): Disable local cache of downloaded models, default: `False`.
83
+
84
+ cache_dir (:obj:`str`, optional): The cache directory used to save the downloaded models, default: `None`. If it's `None`, then the models will be saved at `$HOME/.~DeBERTa`
85
+
86
+ Return:
87
+
88
+ :obj:`NNModule` : The sub-class object.
89
+
90
+ """
91
+ # Load config
92
+ if model_config:
93
+ config = ModelConfig.from_json_file(model_config)
94
+ else:
95
+ config = None
96
+ model_config = None
97
+ model_state = None
98
+ if (model_path is not None) and (model_path.strip() == '-' or model_path.strip()==''):
99
+ model_path = None
100
+ try:
101
+ model_state, model_config = load_model_state(model_path, tag=tag, no_cache=no_cache, cache_dir=cache_dir)
102
+ except Exception as exp:
103
+ raise Exception(f'Failed to get model {model_path}. Exception: {exp}')
104
+
105
+ if config is not None and model_config is not None:
106
+ for k in config.__dict__:
107
+ if k not in ['hidden_size',
108
+ 'intermediate_size',
109
+ 'num_attention_heads',
110
+ 'num_hidden_layers',
111
+ 'vocab_size',
112
+ 'max_position_embeddings'] or (k not in model_config.__dict__) or (model_config.__dict__[k] < 0):
113
+ model_config.__dict__[k] = config.__dict__[k]
114
+ if model_config is not None:
115
+ config = copy.copy(model_config)
116
+ vocab_size = config.vocab_size
117
+ # Instantiate model.
118
+ model = cls(config, *inputs, **kwargs)
119
+ if not model_state:
120
+ return model
121
+ # copy state_dict so _load_from_state_dict can modify it
122
+ state_dict = model_state.copy()
123
+
124
+ missing_keys = []
125
+ unexpected_keys = []
126
+ error_msgs = []
127
+ metadata = getattr(state_dict, '_metadata', None)
128
+ def load(module, prefix=''):
129
+ local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
130
+ module._load_from_state_dict(
131
+ state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs)
132
+ for name, child in module._modules.items():
133
+ if child is not None:
134
+ load(child, prefix + name + '.')
135
+ load(model)
136
+ logger.warning(f'Missing keys: {missing_keys}, unexpected_keys: {unexpected_keys}, error_msgs: {error_msgs}')
137
+ return model
nlu/DeBERTa/deberta/ops.py ADDED
@@ -0,0 +1,228 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft, Inc. 2020
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+ #
6
+ # Author: penhe@microsoft.com
7
+ # Date: 01/15/2020
8
+ #
9
+
10
+ import pdb
11
+ import math
12
+ from packaging import version
13
+ import torch
14
+ from torch.nn import LayerNorm
15
+ from ..utils.jit_tracing import traceable
16
+
17
+ if version.Version(torch.__version__) >= version.Version('1.0.0'):
18
+ from torch import _softmax_backward_data as _softmax_backward_data
19
+ else:
20
+ from torch import softmax_backward_data as _softmax_backward_data
21
+
22
+ __all__ = ['StableDropout', 'MaskedLayerNorm', 'XSoftmax', 'ACT2FN', 'LayerNorm']
23
+
24
+ @traceable
25
+ class XSoftmax(torch.autograd.Function):
26
+ """ Masked Softmax which is optimized for saving memory
27
+
28
+ Args:
29
+
30
+ input (:obj:`torch.tensor`): The input tensor that will apply softmax.
31
+ mask (:obj:`torch.IntTensor`): The mask matrix where 0 indicate that element will be ignored in the softmax caculation.
32
+ dim (int): The dimenssion that will apply softmax.
33
+
34
+ Example::
35
+
36
+ import torch
37
+ from DeBERTa.deberta import XSoftmax
38
+ # Make a tensor
39
+ x = torch.randn([4,20,100])
40
+ # Create a mask
41
+ mask = (x>0).int()
42
+ y = XSoftmax.apply(x, mask, dim=-1)
43
+
44
+ """
45
+
46
+ @staticmethod
47
+ def forward(self, input, mask, dim):
48
+ """
49
+ """
50
+
51
+ self.dim = dim
52
+ if version.Version(torch.__version__) >= version.Version('1.2.0a'):
53
+ rmask = ~(mask.bool())
54
+ else:
55
+ rmask = (1-mask).byte() # This line is not supported by Onnx tracing.
56
+
57
+ output = input.masked_fill(rmask, float('-inf'))
58
+ output = torch.softmax(output, self.dim)
59
+ output.masked_fill_(rmask, 0)
60
+ self.save_for_backward(output)
61
+ return output
62
+
63
+ @staticmethod
64
+ def backward(self, grad_output):
65
+ """
66
+ """
67
+
68
+ output, = self.saved_tensors
69
+ if version.Version(torch.__version__) >= version.Version('1.11.0a'):
70
+ inputGrad = _softmax_backward_data(grad_output, output, self.dim, output.dtype)
71
+ else:
72
+ inputGrad = _softmax_backward_data(grad_output, output, self.dim, output)
73
+ return inputGrad, None, None
74
+
75
+ @staticmethod
76
+ def symbolic(g, self, mask, dim):
77
+ import torch.onnx.symbolic_helper as sym_help
78
+ from torch.onnx.symbolic_opset9 import masked_fill, softmax
79
+
80
+ mask_cast_value = g.op("Cast", mask, to_i=sym_help.cast_pytorch_to_onnx['Long'])
81
+ r_mask = g.op("Cast", g.op("Sub", g.op("Constant", value_t=torch.tensor(1, dtype=torch.int64)), mask_cast_value), to_i=sym_help.cast_pytorch_to_onnx['Byte'])
82
+ output = masked_fill(g, self, r_mask, g.op("Constant", value_t=torch.tensor(float('-inf'))))
83
+ output = softmax(g, output, dim)
84
+ return masked_fill(g, output, r_mask, g.op("Constant", value_t=torch.tensor(0, dtype=torch.uint8)))
85
+
86
+ class DropoutContext(object):
87
+ def __init__(self):
88
+ self.dropout = 0
89
+ self.mask = None
90
+ self.scale = 1
91
+ self.reuse_mask = True
92
+
93
+ def get_mask(input, local_context):
94
+ if not isinstance(local_context, DropoutContext):
95
+ dropout = local_context
96
+ mask = None
97
+ else:
98
+ dropout = local_context.dropout
99
+ dropout *= local_context.scale
100
+ mask = local_context.mask if local_context.reuse_mask else None
101
+
102
+ if dropout>0 and mask is None:
103
+ if version.Version(torch.__version__) >= version.Version('1.2.0a'):
104
+ mask=(1-torch.empty_like(input).bernoulli_(1-dropout)).bool()
105
+ else:
106
+ mask=(1-torch.empty_like(input).bernoulli_(1-dropout)).byte()
107
+
108
+ if isinstance(local_context, DropoutContext):
109
+ if local_context.mask is None:
110
+ local_context.mask = mask
111
+
112
+ return mask, dropout
113
+
114
+ @traceable
115
+ class XDropout(torch.autograd.Function):
116
+ @staticmethod
117
+ def forward(ctx, input, local_ctx):
118
+ mask, dropout = get_mask(input, local_ctx)
119
+ ctx.scale=1.0/(1-dropout)
120
+ if dropout>0:
121
+ ctx.save_for_backward(mask)
122
+ return input.masked_fill(mask, 0)*ctx.scale
123
+ else:
124
+ return input
125
+
126
+ @staticmethod
127
+ def backward(ctx, grad_output):
128
+ if ctx.scale > 1:
129
+ mask, = ctx.saved_tensors
130
+ return grad_output.masked_fill(mask, 0)*ctx.scale, None
131
+ else:
132
+ return grad_output, None
133
+
134
+ class StableDropout(torch.nn.Module):
135
+ """ Optimized dropout module for stabilizing the training
136
+
137
+ Args:
138
+
139
+ drop_prob (float): the dropout probabilities
140
+
141
+ """
142
+
143
+ def __init__(self, drop_prob):
144
+ super().__init__()
145
+ self.drop_prob = drop_prob
146
+ self.count = 0
147
+ self.context_stack = None
148
+
149
+ def forward(self, x):
150
+ """ Call the module
151
+
152
+ Args:
153
+
154
+ x (:obj:`torch.tensor`): The input tensor to apply dropout
155
+
156
+
157
+ """
158
+ if self.training and self.drop_prob>0:
159
+ return XDropout.apply(x, self.get_context())
160
+ return x
161
+
162
+ def clear_context(self):
163
+ self.count = 0
164
+ self.context_stack = None
165
+
166
+ def init_context(self, reuse_mask=True, scale = 1):
167
+ if self.context_stack is None:
168
+ self.context_stack = []
169
+ self.count = 0
170
+ for c in self.context_stack:
171
+ c.reuse_mask = reuse_mask
172
+ c.scale = scale
173
+
174
+ def get_context(self):
175
+ if self.context_stack is not None:
176
+ if self.count >= len(self.context_stack):
177
+ self.context_stack.append(DropoutContext())
178
+ ctx = self.context_stack[self.count]
179
+ ctx.dropout = self.drop_prob
180
+ self.count += 1
181
+ return ctx
182
+ else:
183
+ return self.drop_prob
184
+
185
+ def MaskedLayerNorm(layerNorm, input, mask = None):
186
+ """ Masked LayerNorm which will apply mask over the output of LayerNorm to avoid inaccurate updatings to the LayerNorm module.
187
+
188
+ Args:
189
+ layernorm (:obj:`~DeBERTa.deberta.LayerNorm`): LayerNorm module or function
190
+ input (:obj:`torch.tensor`): The input tensor
191
+ mask (:obj:`torch.IntTensor`): The mask to applied on the output of LayerNorm where `0` indicate the output of that element will be ignored, i.e. set to `0`
192
+
193
+ Example::
194
+
195
+ # Create a tensor b x n x d
196
+ x = torch.randn([1,10,100])
197
+ m = torch.tensor([[1,1,1,0,0,0,0,0,0,0]], dtype=torch.int)
198
+ LayerNorm = DeBERTa.deberta.LayerNorm(100)
199
+ y = MaskedLayerNorm(LayerNorm, x, m)
200
+
201
+ """
202
+ output = layerNorm(input).to(input)
203
+ if mask is None:
204
+ return output
205
+ if mask.dim()!=input.dim():
206
+ if mask.dim()==4:
207
+ mask=mask.squeeze(1).squeeze(1)
208
+ mask = mask.unsqueeze(2)
209
+ mask = mask.to(output.dtype)
210
+ return output*mask
211
+
212
+ def gelu(x):
213
+ """Implementation of the gelu activation function.
214
+ For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
215
+ 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
216
+ """
217
+ return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
218
+
219
+
220
+ def swish(x):
221
+ return x * torch.sigmoid(x)
222
+
223
+ def linear_act(x):
224
+ return x
225
+
226
+ ACT2FN = {"gelu": torch.nn.functional.gelu, "relu": torch.nn.functional.relu, "swish": swish, "tanh": torch.tanh, "linear": linear_act, 'sigmoid': torch.sigmoid}
227
+
228
+
nlu/DeBERTa/deberta/pooling.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Author: penhe@microsoft.com
3
+ # Date: 01/25/2019
4
+ #
5
+ """
6
+ Pooling functions
7
+ """
8
+
9
+ from torch import nn
10
+ import copy
11
+ import json
12
+ import pdb
13
+ from .bert import ACT2FN
14
+ from .ops import StableDropout
15
+ from .config import AbsModelConfig
16
+
17
+ __all__ = ['PoolConfig', 'ContextPooler']
18
+
19
+ class PoolConfig(AbsModelConfig):
20
+ """Configuration class to store the configuration of `pool layer`.
21
+
22
+ Parameters:
23
+
24
+ config (:class:`~DeBERTa.deberta.ModelConfig`): The model config. The field of pool config will be initalized with the `pooling` field in model config.
25
+
26
+ Attributes:
27
+
28
+ hidden_size (int): Size of the encoder layers and the pooler layer, default: `768`.
29
+
30
+ dropout (float): The dropout rate applied on the output of `[CLS]` token,
31
+
32
+ hidden_act (:obj:`str`): The activation function of the projection layer, it can be one of ['gelu', 'tanh'].
33
+
34
+ Example::
35
+
36
+ # Here is the content of an exmple model config file in json format
37
+
38
+ {
39
+ "hidden_size": 768,
40
+ "num_hidden_layers" 12,
41
+ "num_attention_heads": 12,
42
+ "intermediate_size": 3072,
43
+ ...
44
+ "pooling": {
45
+ "hidden_size": 768,
46
+ "hidden_act": "gelu",
47
+ "dropout": 0.1
48
+ }
49
+ }
50
+
51
+ """
52
+ def __init__(self, config=None):
53
+ """Constructs PoolConfig.
54
+
55
+ Args:
56
+ `config`: the config of the model. The field of pool config will be initalized with the 'pooling' field in model config.
57
+ """
58
+
59
+ self.hidden_size = 768
60
+ self.dropout = 0
61
+ self.hidden_act = 'gelu'
62
+ if config:
63
+ pool_config = getattr(config, 'pooling', config)
64
+ if isinstance(pool_config, dict):
65
+ pool_config = AbsModelConfig.from_dict(pool_config)
66
+ self.hidden_size = getattr(pool_config, 'hidden_size', config.hidden_size)
67
+ self.dropout = getattr(pool_config, 'dropout', 0)
68
+ self.hidden_act = getattr(pool_config, 'hidden_act', 'gelu')
69
+
70
+ class ContextPooler(nn.Module):
71
+ def __init__(self, config):
72
+ super().__init__()
73
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
74
+ self.dropout = StableDropout(config.dropout)
75
+ self.config = config
76
+
77
+ def forward(self, hidden_states, mask = None):
78
+ # We "pool" the model by simply taking the hidden state corresponding
79
+ # to the first token.
80
+
81
+ context_token = hidden_states[:, 0]
82
+ context_token = self.dropout(context_token)
83
+ pooled_output = self.dense(context_token)
84
+ pooled_output = ACT2FN[self.config.hidden_act](pooled_output)
85
+ return pooled_output
86
+
87
+ def output_dim(self):
88
+ return self.config.hidden_size
nlu/DeBERTa/deberta/pretrained_models.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+
2
+
nlu/DeBERTa/deberta/spm_tokenizer.py ADDED
@@ -0,0 +1,322 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft, Inc. 2020
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+ #
6
+ # Author: penhe@microsoft.com
7
+ # Date: 11/15/2020
8
+ #
9
+
10
+
11
+ import sentencepiece as sp
12
+ import six
13
+ import unicodedata
14
+ import os
15
+ import regex as re
16
+ from .cache_utils import load_vocab
17
+ from ..utils import get_logger
18
+ logger=get_logger()
19
+
20
+
21
+ import pdb
22
+
23
+ __all__ = ['SPMTokenizer']
24
+
25
+ class SPMTokenizer:
26
+ def __init__(self, vocab_file, do_lower_case=False, special_tokens=None, bpe_dropout=0, split_by_punct=False):
27
+ self.split_by_punct = split_by_punct
28
+ spm = sp.SentencePieceProcessor()
29
+ assert os.path.exists(vocab_file)
30
+ spm.load(vocab_file)
31
+ bpe_vocab_size = spm.GetPieceSize()
32
+ # Token map
33
+ # <unk> 0+1
34
+ # <s> 1+1
35
+ # </s> 2+1
36
+ self.vocab = {spm.IdToPiece(i):i for i in range(bpe_vocab_size)}
37
+ self.id_to_tokens = [spm.IdToPiece(i) for i in range(bpe_vocab_size)]
38
+ #self.vocab['[PAD]'] = 0
39
+ #self.vocab['[CLS]'] = 1
40
+ #self.vocab['[SEP]'] = 2
41
+ #self.vocab['[UNK]'] = 3
42
+
43
+ _special_tokens = ['[MASK]', '[SEP]', '[PAD]', '[UNK]', '[CLS]']
44
+ self.special_tokens = []
45
+ if special_tokens is not None:
46
+ _special_tokens.extend(special_tokens)
47
+ for t in _special_tokens:
48
+ self.add_special_token(t)
49
+
50
+ self.spm = spm
51
+ self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
52
+
53
+ def tokenize(self, text):
54
+ pieces = self._encode_as_pieces(text)
55
+ def _norm(x):
56
+ if x not in self.vocab or x=='<unk>':
57
+ return '[UNK]'
58
+ else:
59
+ return x
60
+ pieces = [_norm(p) for p in pieces]
61
+ return pieces
62
+
63
+ def convert_tokens_to_ids(self, tokens):
64
+ return [self.vocab[t] if t in self.vocab else 1 for t in tokens]
65
+
66
+ def convert_ids_to_tokens(self, ids):
67
+ tokens = []
68
+ for i in ids:
69
+ tokens.append(self.ids_to_tokens[i])
70
+ return tokens
71
+
72
+ def decode(self, tokens, start=-1, end=-1, raw_text=None):
73
+ if raw_text is None:
74
+ return self.spm.decode_pieces([t for t in tokens if t not in self.special_tokens])
75
+ else:
76
+ words = self.split_to_words(raw_text)
77
+ word_tokens = [self.tokenize(w) for w in words]
78
+ wt = [w for t in word_tokens for w in t]
79
+ #assert tokens == wt, f'{tokens} || {wt}'
80
+ if wt!=tokens:
81
+ for a,b in zip(wt, tokens):
82
+ if a!=b:
83
+ pdb.set_trace()
84
+ token2words = [0]*len(tokens)
85
+ tid = 0
86
+ for i,w in enumerate(word_tokens):
87
+ for k,t in enumerate(w):
88
+ token2words[tid] = i
89
+ tid += 1
90
+ word_start = token2words[start]
91
+ word_end = token2words[end] if end <len(tokens) else len(words)
92
+ text = ''.join(words[word_start:word_end])
93
+ return text
94
+
95
+ def add_special_token(self, token):
96
+ if token not in self.special_tokens:
97
+ self.special_tokens.append(token)
98
+ if token not in self.vocab:
99
+ self.vocab[token] = len(self.vocab)
100
+ self.id_to_tokens.append(token)
101
+ return self.id(token)
102
+
103
+ def part_of_whole_word(self, token, is_bos=False):
104
+ if is_bos:
105
+ return True
106
+ if (len(token)==1 and (_is_whitespace(list(token)[0]) or _is_control(list(token)[0]) or _is_punctuation(list(token)[0]))) or token in self.special_tokens:
107
+ return False
108
+
109
+ word_start = b'\xe2\x96\x81'.decode('utf-8')
110
+ return not token.startswith(word_start)
111
+
112
+ def pad(self):
113
+ return '[PAD]'
114
+
115
+ def bos(self):
116
+ return '[CLS]'
117
+
118
+ def eos(self):
119
+ return '[SEP]'
120
+
121
+ def unk(self):
122
+ return '[UNK]'
123
+
124
+ def mask(self):
125
+ return '[MASK]'
126
+
127
+ def sym(self, id):
128
+ return self.ids_to_tokens[id]
129
+
130
+ def id(self, sym):
131
+ return self.vocab[sym] if sym in self.vocab else 1
132
+
133
+ def _encode_as_pieces(self, text):
134
+ text = convert_to_unicode(text)
135
+ if self.split_by_punct:
136
+ words = self._run_split_on_punc(text)
137
+ pieces = [self.spm.encode_as_pieces(w) for w in words]
138
+ return [p for w in pieces for p in w]
139
+ else:
140
+ return self.spm.encode_as_pieces(text)
141
+
142
+ def split_to_words(self, text):
143
+ pieces = self._encode_as_pieces(text)
144
+ word_start = b'\xe2\x96\x81'.decode('utf-8')
145
+ words = []
146
+ offset = 0
147
+ prev_end = 0
148
+ for i,p in enumerate(pieces):
149
+ if p.startswith(word_start):
150
+ if offset>prev_end:
151
+ words.append(text[prev_end:offset])
152
+ prev_end = offset
153
+ w = p.replace(word_start, '')
154
+ else:
155
+ w = p
156
+ try:
157
+ s = text.index(w, offset)
158
+ pn = ""
159
+ k = i+1
160
+ while k < len(pieces):
161
+ pn = pieces[k].replace(word_start, '')
162
+ if len(pn)>0:
163
+ break
164
+ k += 1
165
+
166
+ if len(pn)>0 and pn in text[offset:s]:
167
+ offset = offset + 1
168
+ else:
169
+ offset = s + len(w)
170
+ except:
171
+ offset = offset + 1
172
+
173
+ if prev_end< offset:
174
+ words.append(text[prev_end:offset])
175
+
176
+ return words
177
+
178
+ def _run_strip_accents(self, text):
179
+ """Strips accents from a piece of text."""
180
+ text = unicodedata.normalize("NFD", text)
181
+ output = []
182
+ for char in text:
183
+ cat = unicodedata.category(char)
184
+ if cat == "Mn":
185
+ continue
186
+ output.append(char)
187
+ return "".join(output)
188
+
189
+ def _run_split_on_punc(self, text):
190
+ """Splits punctuation on a piece of text."""
191
+ #words = list(re.findall(self.pat, text))
192
+ chars = list(text)
193
+ i = 0
194
+ start_new_word = True
195
+ output = []
196
+ while i < len(chars):
197
+ char = chars[i]
198
+ if _is_punctuation(char):
199
+ output.append([char])
200
+ start_new_word = True
201
+ else:
202
+ if start_new_word:
203
+ output.append([])
204
+ start_new_word = False
205
+ output[-1].append(char)
206
+ i += 1
207
+
208
+ return ["".join(x) for x in output]
209
+
210
+ def _tokenize_chinese_chars(self, text):
211
+ """Adds whitespace around any CJK character."""
212
+ output = []
213
+ for char in text:
214
+ cp = ord(char)
215
+ if self._is_chinese_char(cp):
216
+ output.append(" ")
217
+ output.append(char)
218
+ output.append(" ")
219
+ else:
220
+ output.append(char)
221
+ return "".join(output)
222
+
223
+ def _is_chinese_char(self, cp):
224
+ """Checks whether CP is the codepoint of a CJK character."""
225
+ # This defines a "chinese character" as anything in the CJK Unicode block:
226
+ # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
227
+ #
228
+ # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
229
+ # despite its name. The modern Korean Hangul alphabet is a different block,
230
+ # as is Japanese Hiragana and Katakana. Those alphabets are used to write
231
+ # space-separated words, so they are not treated specially and handled
232
+ # like the all of the other languages.
233
+ if ((cp >= 0x4E00 and cp <= 0x9FFF) or #
234
+ (cp >= 0x3400 and cp <= 0x4DBF) or #
235
+ (cp >= 0x20000 and cp <= 0x2A6DF) or #
236
+ (cp >= 0x2A700 and cp <= 0x2B73F) or #
237
+ (cp >= 0x2B740 and cp <= 0x2B81F) or #
238
+ (cp >= 0x2B820 and cp <= 0x2CEAF) or
239
+ (cp >= 0xF900 and cp <= 0xFAFF) or #
240
+ (cp >= 0x2F800 and cp <= 0x2FA1F)): #
241
+ return True
242
+
243
+ return False
244
+
245
+ def _clean_text(self, text):
246
+ """Performs invalid character removal and whitespace cleanup on text."""
247
+ output = []
248
+ for char in text:
249
+ cp = ord(char)
250
+ if cp == 0 or cp == 0xfffd or _is_control(char):
251
+ continue
252
+ if _is_whitespace(char):
253
+ output.append(" ")
254
+ else:
255
+ output.append(char)
256
+ return "".join(output)
257
+
258
+
259
+ def _is_whitespace(char):
260
+ """Checks whether `chars` is a whitespace character."""
261
+ # \t, \n, and \r are technically contorl characters but we treat them
262
+ # as whitespace since they are generally considered as such.
263
+ if char == " " or char == "\t" or char == "\n" or char == "\r":
264
+ return True
265
+ cat = unicodedata.category(char)
266
+ if cat == "Zs":
267
+ return True
268
+ return False
269
+
270
+ def _is_control(char):
271
+ """Checks whether `chars` is a control character."""
272
+ # These are technically control characters but we count them as whitespace
273
+ # characters.
274
+ if char == "\t" or char == "\n" or char == "\r":
275
+ return False
276
+ cat = unicodedata.category(char)
277
+ if cat.startswith("C"):
278
+ return True
279
+ return False
280
+
281
+ def _is_punctuation(char):
282
+ """Checks whether `chars` is a punctuation character."""
283
+ cp = ord(char)
284
+ # We treat all non-letter/number ASCII as punctuation.
285
+ # Characters such as "^", "$", and "`" are not in the Unicode
286
+ # Punctuation class but we treat them as punctuation anyways, for
287
+ # consistency.
288
+ if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
289
+ (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
290
+ return True
291
+ cat = unicodedata.category(char)
292
+ if cat.startswith("P"):
293
+ return True
294
+ return False
295
+
296
+ def whitespace_tokenize(text):
297
+ """Runs basic whitespace cleaning and splitting on a peice of text."""
298
+ text = text.strip()
299
+ if not text:
300
+ return []
301
+ tokens = text.split()
302
+ return tokens
303
+
304
+ def convert_to_unicode(text):
305
+ """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
306
+ if six.PY3:
307
+ if isinstance(text, str):
308
+ return text
309
+ elif isinstance(text, bytes):
310
+ return text.decode("utf-8", "ignore")
311
+ else:
312
+ raise ValueError("Unsupported string type: %s" % (type(text)))
313
+ elif six.PY2:
314
+ if isinstance(text, str):
315
+ return text.decode("utf-8", "ignore")
316
+ elif isinstance(text, unicode):
317
+ return text
318
+ else:
319
+ raise ValueError("Unsupported string type: %s" % (type(text)))
320
+ else:
321
+ raise ValueError("Not running on Python2 or Python 3?")
322
+
nlu/DeBERTa/deberta/tokenizers.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#
# Author: penhe@microsoft.com
# Date: 04/25/2019
#

""" tokenizers
"""

from .spm_tokenizer import *
from .gpt2_tokenizer import GPT2Tokenizer

# Only the registry dict is part of the declared public API; the tokenizer
# classes themselves are re-exported by the star import above.
__all__ = ['tokenizers']
# Registry mapping vocab-type key -> tokenizer class.
# NOTE(review): SPMTokenizer is presumably provided by spm_tokenizer's star
# import — confirm it is listed in that module's __all__.
tokenizers={
  'gpt2': GPT2Tokenizer,
  'spm': SPMTokenizer
  }
nlu/DeBERTa/optims/__init__.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
# Author: Pengcheng He (penhe@microsoft.com)
# Date: 05/15/2019
#

""" optimizers
"""

# Public surface of the optims package: the XAdam optimizer, the fp16
# optimizer wrappers (whatever fp16_optimizer exports via *), the LR
# schedule table and the argparse builder for optimizer hyper-parameters.
from .xadam import XAdam
from .fp16_optimizer import *
from .lr_schedulers import SCHEDULES
from .args import get_args
16
+
nlu/DeBERTa/optims/args.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
# Author: Pengcheng He (penhe@microsoft.com)
# Date: 05/15/2019
#

""" Arguments for optimizer
"""
import argparse
from ..utils import boolean_string

__all__ = ['get_args']
def get_args():
  """Return an ``argparse.ArgumentParser`` carrying the optimizer argument group.

  The parser is built with ``add_help=False`` so it can be used as a
  ``parents=`` parser by the application-level argument parser.
  """
  parser=argparse.ArgumentParser(add_help=False, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  group = parser.add_argument_group(title='Optimizer', description='Parameters for the distributed optimizer')
  group.add_argument('--fp16',
      default=False,
      type=boolean_string,
      help="Whether to use 16-bit float precision instead of 32-bit")

  group.add_argument('--loss_scale',
      type=float, default=16384,
      help='Loss scaling, positive power of 2 values can improve fp16 convergence.')

  group.add_argument('--scale_steps',
      type=int, default=250,
      help='The steps to wait to increase the loss scale.')

  group.add_argument('--lookahead_k',
      default=-1,
      type=int,
      help="lookahead k parameter")

  group.add_argument('--lookahead_alpha',
      default=0.5,
      type=float,
      help="lookahead alpha parameter")

  group.add_argument('--with_radam',
      default=False,
      type=boolean_string,
      help="whether to use RAdam")

  group.add_argument('--opt_type',
      type=str.lower,
      default='adam',
      choices=['adam', 'admax'],
      help="The optimizer to be used.")

  group.add_argument("--warmup_proportion",
      default=0.1,
      type=float,
      help="Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")

  group.add_argument("--lr_schedule_ends",
      default=0,
      type=float,
      help="The ended learning rate scale for learning rate scheduling")

  # BUG FIX: help text said "traning".
  group.add_argument("--lr_schedule",
      default='warmup_linear',
      type=str,
      help="The learning rate scheduler used for training. " +
        "E.g. warmup_linear, warmup_linear_shift, warmup_cosine, warmup_constant. Default, warmup_linear")

  group.add_argument("--max_grad_norm",
      default=1,
      type=float,
      help="The clip threshold of global gradient norm")

  group.add_argument("--learning_rate",
      default=5e-5,
      type=float,
      help="The initial learning rate for Adam.")

  group.add_argument("--epsilon",
      default=1e-6,
      type=float,
      help="epsilon setting for Adam.")

  group.add_argument("--adam_beta1",
      default=0.9,
      type=float,
      help="The beta1 parameter for Adam.")

  group.add_argument("--adam_beta2",
      default=0.999,
      type=float,
      help="The beta2 parameter for Adam.")

  group.add_argument('--weight_decay',
      type=float,
      default=0.01,
      help="The weight decay rate")

  return parser
100
+
nlu/DeBERTa/optims/fp16_optimizer.py ADDED
@@ -0,0 +1,301 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # This source code is licensed under the MIT license found in the
3
+ # LICENSE file in the root directory of this source tree.
4
+ #
5
+ # Author: Pengcheng He (penhe@microsoft.com)
6
+ # Date: 05/15/2019
7
+ #
8
+
9
+ """ FP16 optimizer wrapper
10
+ """
11
+
12
+ from collections import defaultdict
13
+ import numpy as np
14
+ import math
15
+ import torch
16
+ import pdb
17
+ import torch.distributed as dist
18
+ from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
19
+ import ctypes
20
+
21
+ from ..utils import get_logger,boolean_string
22
+ logger=get_logger()
23
+
24
+ __all__ = ['Fp16Optimizer', 'ExpLossScaler', 'get_world_size']
25
+
26
def get_world_size():
  """Return the distributed world size, or 1 in single-process runs.

  `dist.get_world_size()` raises when the default process group is not
  initialized; that case is mapped to a world size of 1.
  """
  try:
    return dist.get_world_size()
  except Exception:
    # BUG FIX: was a bare `except:`, which also swallowed
    # SystemExit/KeyboardInterrupt. `except Exception` keeps the
    # best-effort fallback without hiding interpreter-exit signals.
    return 1
32
+
33
def fused_norm(input):
  """Return the L2 norm of `input`, accumulated in float32 regardless of dtype."""
  return input.norm(p=2, dtype=torch.float32)
35
+
36
class OptParameter(torch.Tensor):
  """Tensor subclass used as the optimizer-facing "master" parameter.

  Bundles the master data with an optional `out_data` tensor (set by
  Fp16Optimizer to the flattened fp16 storage backing the model's real
  parameters) and a name, and overrides `grad` with a plain attribute so
  a flattened gradient tensor can be attached directly.
  """
  def __new__(cls, data, out_data=None, grad=None, name=None):
    # _make_subclass re-wraps `data`'s storage as an OptParameter without
    # copying it.
    param = torch.Tensor._make_subclass(cls, data)
    param._xgrad = grad
    param.out_data = out_data
    param._name = name
    return param

  @property
  def name(self):
    # Read-only identifier (e.g. 'master').
    return self._name

  @property
  def grad(self):
    # Shadows torch.Tensor.grad: gradients are managed explicitly by
    # Fp16Optimizer._grad_scale rather than by autograd.
    return self._xgrad

  @grad.setter
  def grad(self, grad):
    self._xgrad = grad
55
+
56
class Fp16Optimizer(object):
  """Mixed-precision wrapper around a base optimizer.

  Each incoming parameter group is flattened into one contiguous tensor;
  for fp16 groups an fp32 "master" copy is created and handed to the
  wrapped optimizer (as an OptParameter whose `out_data` is the fp16
  storage), while fp32 groups are passed through flattened. Also performs
  bucketed gradient all-reduce, global-norm clipping, dynamic loss-scale
  bookkeeping and optional "lookahead" slow/fast weight averaging.
  """

  def __init__(self, param_groups, optimizer_fn, loss_scaler=None, grad_clip_norm = 1.0, lookahead_k = -1, lookahead_alpha = 0.5, rank=-1, distributed=False):
    # all parameters should be on the same device
    groups = []
    original_groups = []
    self.rank = rank
    self.distributed = distributed
    if self.rank<0:
      # A negative rank means single-process mode regardless of the flag.
      self.distributed = False
    for group in param_groups:
      if 'offset' not in group:
        group['offset'] = None
      if ('rank' not in group) or (not self.distributed):
        group['rank'] = -1
        assert group['offset'] is None, f"{group['names']}: {group['offset']}"
      group_rank = group['rank']
      params = group['params'] # parameter
      if len(params) > 1:
        # Flatten the group into one tensor and re-point each parameter's
        # .data at the matching view, so model and optimizer share storage.
        flattened_params = _flatten_dense_tensors([p.data for p in params])
        unflattend_params = _unflatten_dense_tensors(flattened_params, [p.data for p in params])
        for uf,p in zip(unflattend_params, params):
          p.data = uf
      else:
        flattened_params = params[0].data.view(-1)
        if group['offset'] is not None:
          # (start, length): the shard of this tensor owned by this group.
          start, length = group['offset']
          flattened_params = flattened_params.narrow(0, start, length)

      if params[0].dtype==torch.half:
        if self.rank == group_rank or (not self.distributed):
          master_params = flattened_params.clone().to(torch.float).detach_().to(flattened_params.device)
        else:
          # Groups owned by another rank keep their fp32 master on CPU.
          master_params = flattened_params.clone().to(torch.float).detach_().cpu()
        group['params'] = [OptParameter(master_params, flattened_params, name='master')]
      else:
        group['params'] = [OptParameter(flattened_params, None, name='master')]

      # Keep the original (unflattened) group around for gradient
      # collection and zero_grad.
      o_group = defaultdict(list)
      o_group['names'] = group['names']
      o_group['params'] = params
      o_group['rank'] = group_rank
      o_group['offset'] = group['offset']

      group['names'] = ['master']

      original_groups.append(o_group)
      groups.append(group)
    self.param_groups = groups
    self.loss_scaler = loss_scaler
    self.optimizer = optimizer_fn(self.param_groups)
    self.original_param_groups = original_groups
    self.max_grad_norm = grad_clip_norm
    self.lookahead_k = lookahead_k
    self.lookahead_alpha = lookahead_alpha

  def backward(self, loss):
    """Scale `loss` (when a loss scaler is set) and run backward.

    Returns (loss_scale, step_loss) where step_loss is the unscaled
    python-float loss value.
    """
    if self.loss_scaler:
      loss_scale, loss, step_loss = self.loss_scaler.scale(loss)
    else:
      loss_scale = 1
      step_loss = loss.item()

    loss.backward()
    return loss_scale, step_loss

  def step(self, lr_scale, loss_scale = 1):
    """Reduce/clip gradients and apply one optimizer step.

    Returns False (and skips the update, backing off the loss scale)
    when the global gradient norm is non-finite.
    """
    grad_scale = self._grad_scale(loss_scale)
    if grad_scale is None or math.isinf(grad_scale):
      self.loss_scaler.update(False)
      return False

    if self.lookahead_k > 0:
      for p in self.param_groups:
        if 'la_count' not in p:
          # First use: snapshot the "slow" lookahead weights.
          p['la_count'] = 0
          p['slow_params'] = [x.data.detach().clone().requires_grad_(False) for x in p['params']]
    self.optimizer.step(grad_scale, lr_scale)

    # NOTE(review): the master->fp16 copy below appears to be handled
    # inside the wrapped optimizer's step(); confirm before re-enabling.
    # for group in self.param_groups:
    #   for p in group['params']:
    #     # p.data : master fp32
    #     # p.out_data : fp16 tensor backing model nn.Parameters
    #     if hasattr(p, 'out_data') and p.out_data is not None:
    #       p.out_data.copy_(p.data, non_blocking=True)

    if self.lookahead_k > 0:
      for p in self.param_groups:
        p['la_count'] += 1
        if p['la_count'] == self.lookahead_k:
          # Every k steps: slow = (1-alpha)*slow + alpha*fast; fast = slow.
          p['la_count'] = 0
          for s,f in zip(p['slow_params'], p['params']):
            s.mul_(1-self.lookahead_alpha)
            s.add_(f.data.detach()*self.lookahead_alpha)
            f.data.copy_(s, non_blocking=True)
            if hasattr(f, 'out_data') and f.out_data is not None:
              f.out_data.copy_(f.data, non_blocking=True)

    if self.loss_scaler:
      self.loss_scaler.update(True)
    return True

  def zero_grad(self):
    """Clear gradients on both the master and the original parameters."""
    for group, o_group in zip(self.param_groups, self.original_param_groups):
      for p in group['params']:
        p.grad = None
      for p in o_group['params']:
        p.grad = None

  def _grad_scale(self, loss_scale = 1):
    """Bucketed all-reduce of gradients, global-norm computation and
    attachment of flattened gradients to the master parameters.

    Returns the combined scale to divide gradients by (loss scale times
    any clipping factor), or None when the norm is non-finite.
    """
    named_params = {}
    named_grads = {}
    for g in self.original_param_groups:
      for n,p in zip(g['names'], g['params']):
        named_params[n] = p
        # Parameters that received no gradient contribute zeros.
        named_grads[n] = p.grad if p.grad is not None else torch.zeros_like(p.data)

    wd = get_world_size()
    def _reduce(group):
      # Flatten one bucket of gradients and kick off an async all-reduce.
      grads = [named_grads[n] for n in group]
      if len(grads)>1:
        flattened_grads = _flatten_dense_tensors(grads)
      else:
        # BUG FIX: was `grads[0],view(-1)` (comma instead of dot), which
        # made a tuple and raised NameError on `view` in the
        # single-tensor bucket path.
        flattened_grads = grads[0].view(-1)

      if wd > 1:
        # Pre-divide so the all-reduce sum yields the mean.
        flattened_grads /= wd
        handle = dist.all_reduce(flattened_grads, async_op=True)
      else:
        handle = None
      return flattened_grads, handle

    def _process_grad(group, flattened_grads, max_grad, norm):
      # Accumulate the squared norm and scatter the reduced bucket back
      # into named_grads.
      grads = [named_grads[n] for n in group]
      norm = norm.to(flattened_grads.device)
      norm = norm + fused_norm(flattened_grads)**2

      if len(grads) > 1:
        unflattend_grads = _unflatten_dense_tensors(flattened_grads, grads)
      else:
        unflattend_grads = [flattened_grads]

      for n,ug in zip(group, unflattend_grads):
        named_grads[n] = ug #.to(named_params[n].data)

      return max_grad, norm

    group_size = 0
    group = []
    max_size = 32*1024*1024  # elements per all-reduce bucket
    norm = torch.zeros(1, dtype=torch.float)
    max_grad = 0

    all_grads = []
    # Sorted iteration keeps bucket composition identical across ranks;
    # the key normalizes 'deberta.' prefixes to 'bert.' so both layouts
    # bucket in the same order.
    for name in sorted(named_params.keys(), key=lambda x:x.replace('deberta.', 'bert.')):
      group.append(name)
      group_size += named_params[name].data.numel()
      if group_size>=max_size:
        flatten, handle = _reduce(group)
        all_grads.append([handle, flatten, group])
        group = []
        group_size = 0
    if group_size>0:
      flatten, handle = _reduce(group)
      all_grads.append([handle, flatten, group])
      group = []
      group_size = 0
    for h,fg,group in all_grads:
      if h is not None:
        h.wait()
      max_grad, norm = _process_grad(group, fg, max_grad, norm)

    norm = norm**0.5
    if torch.isnan(norm) or torch.isinf(norm) :#in ['-inf', 'inf', 'nan']:
      return None

    # (removed unused local `scaled_norm = norm.item()/loss_scale`)
    grad_scale = loss_scale

    if self.max_grad_norm>0:
      # Grow the divisor when the unscaled norm exceeds the clip
      # threshold, i.e. clip by global norm.
      scale = norm/(loss_scale*self.max_grad_norm)
      if scale>1:
        grad_scale *= scale

    # Attach the flattened (optionally sharded) gradients to the master
    # parameter of each group this rank owns.
    for group, o_g in zip(self.param_groups, self.original_param_groups):
      grads = [named_grads[n] for n in o_g['names']]

      if len(grads) > 1:
        flattened_grads = _flatten_dense_tensors(grads)
      else:
        flattened_grads = grads[0].view(-1)
      if group['offset'] is not None:
        start, length = group['offset']
        flattened_grads = flattened_grads.narrow(0, start, length)
      if group['rank'] == self.rank or (not self.distributed):
        group['params'][0].grad = flattened_grads

    return grad_scale
255
+
256
class ExpLossScaler:
  """Dynamic (exponential) loss scaler for fp16 training.

  The scale is halved after an invalid (overflowed) step and doubled
  once `scale_interval` consecutive steps pass without an adjustment.
  """
  def __init__(self, init_scale=2**16, scale_interval=1000):
    # BUG FIX: scale() asserts with `self.init_scale`, which was never
    # stored, so a failing assertion raised AttributeError instead of
    # showing the intended message. Keep the initial scale around.
    self.init_scale = init_scale
    self.cur_scale = init_scale
    self.scale_interval = scale_interval
    self.invalid_cnt = 0
    self.last_scale = 0
    self.steps = 0
    self.down_scale_smooth = 0  # invalid steps tolerated before down-scaling

  def scale(self, loss):
    """Return (loss_scale, scaled_loss, unscaled_step_loss).

    A zero or non-finite loss is passed through with scale 1 so the
    overflow is detected downstream instead of being amplified.
    """
    assert self.cur_scale > 0, self.init_scale
    step_loss = loss.float().detach().item()
    if step_loss != 0 and math.isfinite(step_loss):
      loss_scale = self.cur_scale
    else:
      loss_scale = 1
    loss = loss.float()*loss_scale
    return (loss_scale, loss, step_loss)

  def update(self, is_valid = True):
    """Record the outcome of a step and adjust the scale.

    Invalid steps halve the scale (floored at 1); `scale_interval`
    quiet steps double it.
    """
    if not is_valid:
      self.invalid_cnt += 1
      if self.invalid_cnt>self.down_scale_smooth:
        self.cur_scale /= 2
        self.cur_scale = max(self.cur_scale, 1)
        self.last_scale = self.steps
    else:
      self.invalid_cnt = 0
      if self.steps - self.last_scale>self.scale_interval:
        self.cur_scale *= 2
        self.last_scale = self.steps
    self.steps += 1

  def state_dict(self):
    """Return the scaler state for checkpointing."""
    state = defaultdict(float)
    state['steps'] = self.steps
    state['invalid_cnt'] = self.invalid_cnt
    state['cur_scale'] = self.cur_scale
    state['last_scale'] = self.last_scale
    return state

  def load_state_dict(self, state):
    """Restore the scaler state saved by `state_dict`."""
    self.steps = state['steps']
    self.invalid_cnt = state['invalid_cnt']
    self.cur_scale = state['cur_scale']
    self.last_scale = state['last_scale']
+ self.last_scale = state['last_scale']