diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..61c3f168be1b5a7ad45e2abe56f997d33dee65bc 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -33,3 +33,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+assets/control.png filter=lfs diff=lfs merge=lfs -text
+assets/subject.png filter=lfs diff=lfs merge=lfs -text
+generation/control/ControlNet/font/DejaVuSans.ttf filter=lfs diff=lfs merge=lfs -text
+generation/control/ControlNet/ldm/modules/image_degradation/utils/test.png filter=lfs diff=lfs merge=lfs -text
+llama/data/MetaMathQA-40K.json filter=lfs diff=lfs merge=lfs -text
+llama/data/MetaMathQA.json filter=lfs diff=lfs merge=lfs -text
diff --git a/assets/control.png b/assets/control.png
new file mode 100644
index 0000000000000000000000000000000000000000..1afa338bd3be23650e50d92e1f691e6a86c5dd1f
--- /dev/null
+++ b/assets/control.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b1943c7d2d2042fd1f5455f7c85509c7fc2299221d3118caf8369807b99ff451
+size 1046367
diff --git a/assets/subject.png b/assets/subject.png
new file mode 100644
index 0000000000000000000000000000000000000000..a26f201248d2be6b1a12608ca5f1b8c6a48eb3f2
--- /dev/null
+++ b/assets/subject.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d115037067258634d251581e308b6509fd9b8190b6084d00a211b6886dd379c7
+size 966400
diff --git a/generation/control/ControlNet/font/DejaVuSans.ttf b/generation/control/ControlNet/font/DejaVuSans.ttf
new file mode 100644
index 0000000000000000000000000000000000000000..356575d14731ad077bde1fb0aac44f88bb51f5c4
--- /dev/null
+++ b/generation/control/ControlNet/font/DejaVuSans.ttf
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7da195a74c55bef988d0d48f9508bd5d849425c1770dba5d7bfc6ce9ed848954
+size 757076
diff --git a/generation/control/ControlNet/ldm/modules/image_degradation/utils/test.png b/generation/control/ControlNet/ldm/modules/image_degradation/utils/test.png
new file mode 100644
index 0000000000000000000000000000000000000000..e720ed04ac7e1e7938d367e692fb6a742c54a24c
--- /dev/null
+++ b/generation/control/ControlNet/ldm/modules/image_degradation/utils/test.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:92e516278f0d3e85e84cfb55b43338e12d5896a0ee3833aafdf378025457d753
+size 441072
diff --git a/llama/data/MetaMathQA-40K.json b/llama/data/MetaMathQA-40K.json
new file mode 100644
index 0000000000000000000000000000000000000000..4f19d6fafbb73a7ea9d677ed38be2be0c58c3d0b
--- /dev/null
+++ b/llama/data/MetaMathQA-40K.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c884f10e8aa1229a6e73a6bba2c9134ee0c7b7de92a02a7b8c9459085a59e117
+size 31076207
diff --git a/llama/data/MetaMathQA.json b/llama/data/MetaMathQA.json
new file mode 100644
index 0000000000000000000000000000000000000000..b7419df6d428ff02158986b096f444c37fdd4eab
--- /dev/null
+++ b/llama/data/MetaMathQA.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fb39a5d8c05c042ece92eae37dfd5ea414a5979df2bf3ad3b86411bef8205725
+size 395626321
diff --git a/llama/output/cp1e4/ft/adapter_model.safetensors b/llama/output/cp1e4/ft/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..a09665ee5196fce110ad253798bb2f72e2c9ed8f
--- /dev/null
+++ b/llama/output/cp1e4/ft/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7e1c2fceb4f91331d69364aa56d01dd2103d4e59066f1519f1242a62ecca387a
+size 1082171824
diff --git a/llama/output/cp1e4/ft/tokenizer.model b/llama/output/cp1e4/ft/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899
--- /dev/null
+++ b/llama/output/cp1e4/ft/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723
diff --git a/llama/output/cp1e5/ft/adapter_model.safetensors b/llama/output/cp1e5/ft/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..35e96c42e0e006be658eb2dda85727d7159f42d9
--- /dev/null
+++ b/llama/output/cp1e5/ft/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f6121d3f7682fd21f70fc78ab9097b22ede67191507c54d44a9bd9c30adf44de
+size 592928
diff --git a/llama/output/cp1e5N/ft/adapter_model.safetensors b/llama/output/cp1e5N/ft/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..01948e41303160d582479263b0a8ae80571ce40c
--- /dev/null
+++ b/llama/output/cp1e5N/ft/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d85146aea100acda2fd5bb5a011f8d1e14983756bb0c102bf85efe04ac176479
+size 1082171824
diff --git a/llama/output/cp1e5N/ft/tokenizer.model b/llama/output/cp1e5N/ft/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899
--- /dev/null
+++ b/llama/output/cp1e5N/ft/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723
diff --git a/llama/output/cp3e5/ft/adapter_model.safetensors b/llama/output/cp3e5/ft/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..25c7e034ff8d777506f369efbce304800e56b3ce
--- /dev/null
+++ b/llama/output/cp3e5/ft/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1945e74d818ded53f08bc892bb458dd0e6addcd548b2f864dbd16a476a8954ef
+size 1082171824
diff --git a/llama/output/cp3e5N/ft/adapter_model.safetensors b/llama/output/cp3e5N/ft/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..18d2c70bada6242bbef820c9c93a5355721da5b2
--- /dev/null
+++ b/llama/output/cp3e5N/ft/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a2396d96c0a301cceddf424fbdf7c7f3518311f90140fa9aad9053706288e9fc
+size 1082171824
diff --git a/llama/output/cp3e5N/ft/tokenizer.model b/llama/output/cp3e5N/ft/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899
--- /dev/null
+++ b/llama/output/cp3e5N/ft/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723
diff --git a/llama/output/cpr1/ft/adapter_model.safetensors b/llama/output/cpr1/ft/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..a59518f2e704d2a09d1a26811354d7f1511d0419
--- /dev/null
+++ b/llama/output/cpr1/ft/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:617c715b246fae47190ca1f8e304e9dbdadf6ac70bbfdd0f3bc3c4b1cd783c0d
+size 1049665904
diff --git a/llama/output/cpr1/ft/tokenizer.model b/llama/output/cpr1/ft/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899
--- /dev/null
+++ b/llama/output/cpr1/ft/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723
diff --git a/llama/output/cpr2/ft/adapter_model.safetensors b/llama/output/cpr2/ft/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..07a241d0c9940c7d4288d073c846602a7d25d681
--- /dev/null
+++ b/llama/output/cpr2/ft/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:daede58d9fd4806298d90f9af12ba478c119afab844244f355f35ab3829eb029
+size 1049665904
diff --git a/llama/output/cpr2/ft/tokenizer.model b/llama/output/cpr2/ft/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899
--- /dev/null
+++ b/llama/output/cpr2/ft/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723
diff --git a/nlu/DeBERTa.egg-info/PKG-INFO b/nlu/DeBERTa.egg-info/PKG-INFO
new file mode 100644
index 0000000000000000000000000000000000000000..ae8f91cda55784e426fad70a946ec26bb1cf6bd9
--- /dev/null
+++ b/nlu/DeBERTa.egg-info/PKG-INFO
@@ -0,0 +1,39 @@
+Metadata-Version: 2.1
+Name: DeBERTa
+Version: 0.1.13
+Summary: Decoding enhanced BERT with Disentangled Attention
+Home-page: https://github.com/microsoft/DeBERTa
+Author: penhe
+Author-email: penhe@microsoft.com
+License: MIT
+Keywords: NLP deep learning transformer pytorch Attention BERT RoBERTa DeBERTa
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.6
+Classifier: Programming Language :: Python :: 3.7
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Requires-Python: >=3.6
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: nltk
+Requires-Dist: spacy
+Requires-Dist: numpy
+Requires-Dist: pytest
+Requires-Dist: regex
+Requires-Dist: scipy
+Requires-Dist: scikit-learn
+Requires-Dist: tqdm
+Requires-Dist: ujson
+Requires-Dist: seqeval
+Requires-Dist: psutil
+Requires-Dist: sentencepiece
+Requires-Dist: torch
+Provides-Extra: docs
+Requires-Dist: recommonmark; extra == "docs"
+Requires-Dist: sphinx; extra == "docs"
+Requires-Dist: sphinx-markdown-tables; extra == "docs"
+Requires-Dist: sphinx-rtd-theme; extra == "docs"
+
+deberta long des
diff --git a/nlu/DeBERTa.egg-info/SOURCES.txt b/nlu/DeBERTa.egg-info/SOURCES.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d3dd8df8925fc2b9148b942ad6bea302696b0af3
--- /dev/null
+++ b/nlu/DeBERTa.egg-info/SOURCES.txt
@@ -0,0 +1,73 @@
+LICENSE
+setup.cfg
+setup.py
+DeBERTa/__init__.py
+DeBERTa.egg-info/PKG-INFO
+DeBERTa.egg-info/SOURCES.txt
+DeBERTa.egg-info/dependency_links.txt
+DeBERTa.egg-info/requires.txt
+DeBERTa.egg-info/top_level.txt
+DeBERTa/apps/__init__.py
+DeBERTa/apps/_utils.py
+DeBERTa/apps/run.py
+DeBERTa/apps/models/__init__.py
+DeBERTa/apps/models/masked_language_model.py
+DeBERTa/apps/models/multi_choice.py
+DeBERTa/apps/models/ner.py
+DeBERTa/apps/models/record_qa.py
+DeBERTa/apps/models/replaced_token_detection_model.py
+DeBERTa/apps/models/sequence_classification.py
+DeBERTa/apps/tasks/__init__.py
+DeBERTa/apps/tasks/glue_tasks.py
+DeBERTa/apps/tasks/metrics.py
+DeBERTa/apps/tasks/mlm_task.py
+DeBERTa/apps/tasks/ner_task.py
+DeBERTa/apps/tasks/race_task.py
+DeBERTa/apps/tasks/record_eval.py
+DeBERTa/apps/tasks/rtd_task.py
+DeBERTa/apps/tasks/superglue_tasks.py
+DeBERTa/apps/tasks/task.py
+DeBERTa/apps/tasks/task_registry.py
+DeBERTa/data/__init__.py
+DeBERTa/data/async_data.py
+DeBERTa/data/data_sampler.py
+DeBERTa/data/dataloader.py
+DeBERTa/data/dynamic_dataset.py
+DeBERTa/data/example.py
+DeBERTa/deberta/__init__.py
+DeBERTa/deberta/bert.py
+DeBERTa/deberta/cache_utils.py
+DeBERTa/deberta/config.py
+DeBERTa/deberta/da_utils.py
+DeBERTa/deberta/deberta.py
+DeBERTa/deberta/disentangled_attention.py
+DeBERTa/deberta/gpt2_bpe_utils.py
+DeBERTa/deberta/gpt2_tokenizer.py
+DeBERTa/deberta/mlm.py
+DeBERTa/deberta/nnmodule.py
+DeBERTa/deberta/ops.py
+DeBERTa/deberta/pooling.py
+DeBERTa/deberta/pretrained_models.py
+DeBERTa/deberta/spm_tokenizer.py
+DeBERTa/deberta/tokenizers.py
+DeBERTa/optims/__init__.py
+DeBERTa/optims/args.py
+DeBERTa/optims/fp16_optimizer.py
+DeBERTa/optims/lr_schedulers.py
+DeBERTa/optims/xadam.py
+DeBERTa/sift/__init__.py
+DeBERTa/sift/sift.py
+DeBERTa/training/__init__.py
+DeBERTa/training/_utils.py
+DeBERTa/training/args.py
+DeBERTa/training/dist_launcher.py
+DeBERTa/training/optimizer_utils.py
+DeBERTa/training/trainer.py
+DeBERTa/utils/__init__.py
+DeBERTa/utils/argument_types.py
+DeBERTa/utils/jit_tracing.py
+DeBERTa/utils/logger_util.py
+DeBERTa/utils/xtqdm.py
+adapterlib/__init__.py
+adapterlib/layers.py
+adapterlib/utils.py
\ No newline at end of file
diff --git a/nlu/DeBERTa.egg-info/dependency_links.txt b/nlu/DeBERTa.egg-info/dependency_links.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc
--- /dev/null
+++ b/nlu/DeBERTa.egg-info/dependency_links.txt
@@ -0,0 +1 @@
+
diff --git a/nlu/DeBERTa.egg-info/requires.txt b/nlu/DeBERTa.egg-info/requires.txt
new file mode 100644
index 0000000000000000000000000000000000000000..cf4bbbbe4b81ce8b58b5398037643cbbd437e923
--- /dev/null
+++ b/nlu/DeBERTa.egg-info/requires.txt
@@ -0,0 +1,19 @@
+nltk
+spacy
+numpy
+pytest
+regex
+scipy
+scikit-learn
+tqdm
+ujson
+seqeval
+psutil
+sentencepiece
+torch
+
+[docs]
+recommonmark
+sphinx
+sphinx-markdown-tables
+sphinx-rtd-theme
diff --git a/nlu/DeBERTa.egg-info/top_level.txt b/nlu/DeBERTa.egg-info/top_level.txt
new file mode 100644
index 0000000000000000000000000000000000000000..ae1fd650b677ed356341dc53b488086cec2128b2
--- /dev/null
+++ b/nlu/DeBERTa.egg-info/top_level.txt
@@ -0,0 +1,2 @@
+DeBERTa
+adapterlib
diff --git a/nlu/DeBERTa/apps/tasks/task_registry.py b/nlu/DeBERTa/apps/tasks/task_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..6b251bd4481ff9900902641ddb5975730acfd8d8
--- /dev/null
+++ b/nlu/DeBERTa/apps/tasks/task_registry.py
@@ -0,0 +1,70 @@
+# Copyright (c) Microsoft, Inc. 2020
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+#
+# Author: penhe@microsoft.com
+# Date: 01/25/2019
+#
+
+from glob import glob
+import os
+import importlib
+import pdb
+import sys
+from ...utils import get_logger
+from .task import Task
+
+__all__ = ['load_tasks', 'register_task', 'get_task']
+tasks={}
+
+logger=get_logger()
+
+def register_task(name=None, desc=None):
+  def register_task_x(cls):
+    _name = name
+    if _name is None:
+      _name = cls.__name__
+
+    _desc = desc
+    if _desc is None:
+      _desc = _name
+
+    _name = _name.lower()
+    if _name in tasks:
+      logger.warning(f'{_name} already registered in the registry: {tasks[_name]}.')
+    assert issubclass(cls, Task), 'Registered class must be a subclass of Task.'
+    tasks[_name] = cls
+    cls._meta = {
+        'name': _name,
+        'desc': _desc}
+    return cls
+
+  if type(name)==type:
+    cls = name
+    name = None
+    return register_task_x(cls)
+  return register_task_x
+
+def load_tasks(task_dir = None):
+  script_dir = os.path.dirname(os.path.abspath(__file__))
+  sys_tasks = glob(os.path.join(script_dir, "*.py"))
+  for t in sys_tasks:
+    m = os.path.splitext(os.path.basename(t))[0]
+    if not m.startswith('_'):
+      importlib.import_module(f'DeBERTa.apps.tasks.{m}')
+
+  if task_dir:
+    assert os.path.exists(task_dir), f"{task_dir} must be a valid directory."
+    custom_tasks = glob(os.path.join(task_dir, "*.py"))
+    sys.path.append(task_dir)
+    for t in custom_tasks:
+      m = os.path.splitext(os.path.basename(t))[0]
+      if not m.startswith('_'):
+        importlib.import_module(f'{m}')
+
+def get_task(name=None):
+  if name is None:
+    return tasks
+
+  return tasks[name.lower()]
diff --git a/nlu/DeBERTa/data/__init__.py b/nlu/DeBERTa/data/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1783518c5fe2f7f2c8376d6cce41b5a6a44e47fa
--- /dev/null
+++ b/nlu/DeBERTa/data/__init__.py
@@ -0,0 +1,5 @@
+from .example import ExampleInstance,ExampleSet,example_to_feature
+from .dataloader import SequentialDataLoader
+from .dynamic_dataset import *
+from .data_sampler import *
+from .async_data import *
diff --git a/nlu/DeBERTa/data/async_data.py b/nlu/DeBERTa/data/async_data.py
new file mode 100644
index 0000000000000000000000000000000000000000..9cbf986a0a4bee4d45209b73a00055e94d6b2f06
--- /dev/null
+++ b/nlu/DeBERTa/data/async_data.py
@@ -0,0 +1,38 @@
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+#
+# Author: Pengcheng He (penhe@microsoft.com)
+# Date: 05/15/2019
+#
+
+from queue import Queue,Empty
+from threading import Thread
+class AsyncDataLoader(object):
+  def __init__(self, dataloader, buffer_size=100):
+    self.buffer_size = buffer_size
+    self.dataloader = dataloader
+
+  def __iter__(self):
+    queue = Queue(self.buffer_size)
+    dl=iter(self.dataloader)
+    def _worker():
+      while True:
+        try:
+          queue.put(next(dl))
+        except StopIteration:
+          break
+      queue.put(None)
+    t=Thread(target=_worker)
+    t.start()
+    while True:
+      d = queue.get()
+      if d is None:
+        break
+      yield d
+    del t
+    del queue
+
+  def __len__(self):
+    return len(self.dataloader)
+
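`AsyncDataLoader` is a thin prefetcher: a background thread drains the wrapped loader into a bounded queue, and iteration stops once the `None` sentinel is dequeued. A minimal usage sketch, assuming `train_loader` is any iterable loader and `train_step` is a hypothetical stand-in for the training step:

    from DeBERTa.data import AsyncDataLoader
    loader = AsyncDataLoader(train_loader, buffer_size=100)  # prefetch up to 100 batches ahead
    for batch in loader:
      train_step(batch)  # batches arrive in their original order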
diff --git a/nlu/DeBERTa/data/data_sampler.py b/nlu/DeBERTa/data/data_sampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..1aec3c2b4298503556aef1b8d4f0b2abb934f5fa
--- /dev/null
+++ b/nlu/DeBERTa/data/data_sampler.py
@@ -0,0 +1,76 @@
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+#
+# Author: Pengcheng He (penhe@microsoft.com)
+# Date: 05/15/2019
+#
+
+import os
+import numpy as np
+import math
+import sys
+from torch.utils.data import Sampler
+
+__all__=['BatchSampler', 'DistributedBatchSampler', 'RandomSampler', 'SequentialSampler']
+class BatchSampler(Sampler):
+  def __init__(self, sampler, batch_size):
+    self.sampler = sampler
+    self.batch_size = batch_size
+
+  def __iter__(self):
+    batch = []
+    for idx in self.sampler:
+      batch.append(idx)
+      if len(batch)==self.batch_size:
+        yield batch
+        batch = []
+    if len(batch)>0:
+      yield batch
+
+  def __len__(self):
+    return (len(self.sampler) + self.batch_size - 1)//self.batch_size
+
+class DistributedBatchSampler(Sampler):
+  def __init__(self, sampler, rank=0, world_size = 1, drop_last = False):
+    self.sampler = sampler
+    self.rank = rank
+    self.world_size = world_size
+    self.drop_last = drop_last
+
+  def __iter__(self):
+    for b in self.sampler:
+      if len(b)%self.world_size != 0:
+        if self.drop_last:
+          break
+        else:
+          b.extend([b[0] for _ in range(self.world_size-len(b)%self.world_size)])
+      chunk_size = len(b)//self.world_size
+      yield b[self.rank*chunk_size:(self.rank+1)*chunk_size]
+
+  def __len__(self):
+    return len(self.sampler)
+
+class RandomSampler(Sampler):
+  def __init__(self, total_samples:int, data_seed:int = 0):
+    self.indices = np.array(np.arange(total_samples))
+    self.rng = np.random.RandomState(data_seed)
+
+  def __iter__(self):
+    self.rng.shuffle(self.indices)
+    for i in self.indices:
+      yield i
+
+  def __len__(self):
+    return len(self.indices)
+
+class SequentialSampler(Sampler):
+  def __init__(self, total_samples:int):
+    self.indices = np.array(np.arange(total_samples))
+
+  def __iter__(self):
+    for i in self.indices:
+      yield i
+
+  def __len__(self):
+    return len(self.indices)
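These samplers are designed to compose: `RandomSampler`/`SequentialSampler` yield example indices, `BatchSampler` groups them into lists, and `DistributedBatchSampler` slices each batch across ranks, padding with the batch's first index when the batch does not divide evenly. A sketch of the intended wiring, where `n_samples`, `rank`, and `world_size` are assumed to come from the training setup:

    from DeBERTa.data import RandomSampler, BatchSampler, DistributedBatchSampler
    batches = BatchSampler(RandomSampler(n_samples, data_seed=0), batch_size=32)
    sharded = DistributedBatchSampler(batches, rank=rank, world_size=world_size)
    for batch_indices in sharded:  # each rank receives 32 // world_size indices per step
      ...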
diff --git a/nlu/DeBERTa/data/dataloader.py b/nlu/DeBERTa/data/dataloader.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd4e63cb134c70937a9dee131aebc07496da5f0b
--- /dev/null
+++ b/nlu/DeBERTa/data/dataloader.py
@@ -0,0 +1,511 @@
+import random
+import torch
+import torch.multiprocessing as multiprocessing
+from torch._C import _set_worker_signal_handlers, \
+  _remove_worker_pids, _error_if_any_worker_fails
+
+from packaging import version
+
+if version.Version(torch.__version__) >= version.Version('1.0.0'):
+  from torch._C import _set_worker_pids
+else:
+  from torch._C import _update_worker_pids as _set_worker_pids
+
+from torch.utils.data import SequentialSampler, RandomSampler, BatchSampler, Sampler
+import signal
+import functools
+import collections.abc
+import re
+import sys
+import threading
+import traceback
+import os
+import time
+# from torch._six import string_classes
+string_classes = str
+
+IS_WINDOWS = sys.platform == "win32"
+if IS_WINDOWS:
+  import ctypes
+  from ctypes.wintypes import DWORD, BOOL, HANDLE
+
+if sys.version_info[0] == 2:
+  import Queue as queue
+else:
+  import queue
+
+__all__ = ['SequentialDataLoader']
+
+class ExceptionWrapper(object):
+  r"""Wraps an exception plus traceback to communicate across threads"""
+
+  def __init__(self, exc_info):
+    self.exc_type = exc_info[0]
+    self.exc_msg = "".join(traceback.format_exception(*exc_info))
+
+
+_use_shared_memory = False
+r"""Whether to use shared memory in default_collate"""
+
+MANAGER_STATUS_CHECK_INTERVAL = 5.0
+
+if IS_WINDOWS:
+  # On Windows, the parent ID of the worker process remains unchanged when the manager process
+  # is gone, and the only way to check it through the OS is to let the worker have a process
+  # handle of the manager and ask if the process status has changed.
+  class ManagerWatchdog(object):
+    def __init__(self):
+      self.manager_pid = os.getppid()
+
+      self.kernel32 = ctypes.WinDLL('kernel32', use_last_error=True)
+      self.kernel32.OpenProcess.argtypes = (DWORD, BOOL, DWORD)
+      self.kernel32.OpenProcess.restype = HANDLE
+      self.kernel32.WaitForSingleObject.argtypes = (HANDLE, DWORD)
+      self.kernel32.WaitForSingleObject.restype = DWORD
+
+      # Value obtained from https://msdn.microsoft.com/en-us/library/ms684880.aspx
+      SYNCHRONIZE = 0x00100000
+      self.manager_handle = self.kernel32.OpenProcess(SYNCHRONIZE, 0, self.manager_pid)
+
+      if not self.manager_handle:
+        raise ctypes.WinError(ctypes.get_last_error())
+
+    def is_alive(self):
+      # Value obtained from https://msdn.microsoft.com/en-us/library/windows/desktop/ms687032.aspx
+      return self.kernel32.WaitForSingleObject(self.manager_handle, 0) != 0
+else:
+  class ManagerWatchdog(object):
+    def __init__(self):
+      self.manager_pid = os.getppid()
+
+    def is_alive(self):
+      return os.getppid() == self.manager_pid
+
+
+def _worker_loop(dataset, index_queue, data_queue, collate_fn, init_fn, worker_id):
+  global _use_shared_memory
+  _use_shared_memory = True
+
+  # Initialize C side signal handlers for SIGBUS and SIGSEGV. Python signal
+  # module's handlers are executed after Python returns from C low-level
+  # handlers, likely when the same fatal signal happened again already.
+  # https://docs.python.org/3/library/signal.html Sec. 18.8.1.1
+  _set_worker_signal_handlers()
+
+  torch.set_num_threads(1)
+
+  if init_fn is not None:
+    init_fn(worker_id)
+
+  watchdog = ManagerWatchdog()
+
+  while True:
+    try:
+      r = index_queue.get(timeout=MANAGER_STATUS_CHECK_INTERVAL)
+    except queue.Empty:
+      if watchdog.is_alive():
+        continue
+      else:
+        break
+    if r is None:
+      break
+    idx, batch_indices = r
+    try:
+      samples = collate_fn([dataset[i] for i in batch_indices])
+    except Exception:
+      data_queue.put((idx, ExceptionWrapper(sys.exc_info())))
+    else:
+      data_queue.put((idx, samples))
+      del samples
+
+
+def _worker_manager_loop(in_queue, out_queue, done_event, pin_memory, device_id):
+  if pin_memory:
+    torch.cuda.set_device(device_id)
+
+  while True:
+    try:
+      r = in_queue.get()
+    except Exception:
+      if done_event.is_set():
+        return
+      raise
+    if r is None:
+      break
+    if isinstance(r[1], ExceptionWrapper):
+      out_queue.put(r)
+      continue
+    idx, batch = r
+    try:
+      if pin_memory:
+        batch = pin_memory_batch(batch)
+    except Exception:
+      out_queue.put((idx, ExceptionWrapper(sys.exc_info())))
+    else:
+      out_queue.put((idx, batch))
+
+numpy_type_map = {
+  'float64': torch.DoubleTensor,
+  'float32': torch.FloatTensor,
+  'float16': torch.HalfTensor,
+  'int64': torch.LongTensor,
+  'int32': torch.IntTensor,
+  'int16': torch.ShortTensor,
+  'int8': torch.CharTensor,
+  'uint8': torch.ByteTensor,
+}
+
+
+def default_collate(batch):
+  r"""Puts each data field into a tensor with outer dimension batch size"""
+
+  error_msg = "batch must contain tensors, numbers, dicts or lists; found {}"
+  elem_type = type(batch[0])
+  if isinstance(batch[0], torch.Tensor):
+    out = None
+    if _use_shared_memory:
+      # If we're in a background process, concatenate directly into a
+      # shared memory tensor to avoid an extra copy
+      numel = sum([x.numel() for x in batch])
+      storage = batch[0].storage()._new_shared(numel)
+      out = batch[0].new(storage)
+    return torch.stack(batch, 0, out=out)
+  elif elem_type.__module__ == 'numpy' and elem_type.__name__ != 'str_' \
+      and elem_type.__name__ != 'string_':
+    elem = batch[0]
+    if elem_type.__name__ == 'ndarray':
+      # array of string classes and object
+      if re.search('[SaUO]', elem.dtype.str) is not None:
+        raise TypeError(error_msg.format(elem.dtype))
+
+      return torch.stack([torch.from_numpy(b) for b in batch], 0)
+    if elem.shape == ():  # scalars
+      py_type = float if elem.dtype.name.startswith('float') else int
+      return numpy_type_map[elem.dtype.name](list(map(py_type, batch)))
+  elif isinstance(batch[0], int):
+    return torch.LongTensor(batch)
+  elif isinstance(batch[0], float):
+    return torch.DoubleTensor(batch)
+  elif isinstance(batch[0], string_classes):
+    return batch
+  elif isinstance(batch[0], collections.abc.Mapping):
+    return {key: default_collate([d[key] for d in batch]) for key in batch[0]}
+  elif isinstance(batch[0], collections.abc.Sequence):
+    transposed = zip(*batch)
+    return [default_collate(samples) for samples in transposed]
+
+  raise TypeError((error_msg.format(type(batch[0]))))
+
+
+def pin_memory_batch(batch):
+  if isinstance(batch, torch.Tensor):
+    return batch.pin_memory()
+  elif isinstance(batch, string_classes):
+    return batch
+  elif isinstance(batch, collections.abc.Mapping):
+    return {k: pin_memory_batch(sample) for k, sample in batch.items()}
+  elif isinstance(batch, collections.abc.Sequence):
+    return [pin_memory_batch(sample) for sample in batch]
+  else:
+    return batch
+
+
+_SIGCHLD_handler_set = False
+r"""Whether SIGCHLD handler is set for DataLoader worker failures. Only one
+handler needs to be set for all DataLoaders in a process."""
+
+
+def _set_SIGCHLD_handler():
+  # Windows doesn't support SIGCHLD handler
+  if sys.platform == 'win32':
+    return
+  # can't set signal in child threads
+  if not isinstance(threading.current_thread(), threading._MainThread):
+    return
+  global _SIGCHLD_handler_set
+  if _SIGCHLD_handler_set:
+    return
+  previous_handler = signal.getsignal(signal.SIGCHLD)
+  if not callable(previous_handler):
+    previous_handler = None
+
+  def handler(signum, frame):
+    # The following call uses `waitid` with WNOHANG from the C side. Therefore,
+    # Python can still get and update the process status successfully.
+    _error_if_any_worker_fails()
+    if previous_handler is not None:
+      previous_handler(signum, frame)
+
+  signal.signal(signal.SIGCHLD, handler)
+  _SIGCHLD_handler_set = True
+
+
+class _SequentialDataLoaderIter(object):
+  r"""Iterates once over the DataLoader's dataset, as specified by the sampler"""
+
+  def __init__(self, loader):
+    self.dataset = loader.dataset
+    self.collate_fn = loader.collate_fn
+    self.batch_sampler = loader.batch_sampler
+    self.num_workers = loader.num_workers
+    self.pin_memory = loader.pin_memory and torch.cuda.is_available()
+    self.timeout = loader.timeout
+    self.done_event = threading.Event()
+
+    self.sample_iter = iter(self.batch_sampler)
+
+    if self.num_workers > 0:
+      self.worker_init_fn = loader.worker_init_fn
+      self.index_queues = [multiprocessing.Queue() for _ in range(self.num_workers)]
+      self.worker_queue_idx = 0
+      self.worker_result_queue = multiprocessing.SimpleQueue()
+      self.batches_outstanding = 0
+      self.worker_pids_set = False
+      self.shutdown = False
+      self.send_idx = 0
+      self.rcvd_idx = 0
+      self.reorder_dict = {}
+
+      self.workers = [
+        multiprocessing.Process(
+          target=_worker_loop,
+          args=(self.dataset, self.index_queues[i],
+              self.worker_result_queue, self.collate_fn, self.worker_init_fn, i))
+        for i in range(self.num_workers)]
+
+      if self.pin_memory or self.timeout > 0:
+        self.data_queue = queue.Queue()
+        if self.pin_memory:
+          maybe_device_id = torch.cuda.current_device()
+        else:
+          # do not initialize cuda context if not necessary
+          maybe_device_id = None
+        self.worker_manager_thread = threading.Thread(
+          target=_worker_manager_loop,
+          args=(self.worker_result_queue, self.data_queue, self.done_event, self.pin_memory,
+              maybe_device_id))
+        self.worker_manager_thread.daemon = True
+        self.worker_manager_thread.start()
+      else:
+        self.data_queue = self.worker_result_queue
+
+      for w in self.workers:
+        w.daemon = True  # ensure that the worker exits on process exit
+        w.start()
+
+      _set_worker_pids(id(self), tuple(w.pid for w in self.workers))
+      _set_SIGCHLD_handler()
+      self.worker_pids_set = True
+
+      # prime the prefetch loop
+      for _ in range(2 * self.num_workers):
+        self._put_indices()
+
+  def __len__(self):
+    return len(self.batch_sampler)
+
+  def _get_batch(self):
+    if self.timeout > 0:
+      try:
+        return self.data_queue.get(timeout=self.timeout)
+      except queue.Empty:
+        raise RuntimeError('DataLoader timed out after {} seconds'.format(self.timeout))
+    else:
+      return self.data_queue.get()
+
+  def __next__(self):
+    if self.num_workers == 0:  # same-process loading
+      indices = next(self.sample_iter)  # may raise StopIteration
+      batch = self.collate_fn([self.dataset[i] for i in indices])
+      if self.pin_memory:
+        batch = pin_memory_batch(batch)
+      return batch
+
+    # check if the next sample has already been generated
+    if self.rcvd_idx in self.reorder_dict:
+      batch = self.reorder_dict.pop(self.rcvd_idx)
+      return self._process_next_batch(batch)
+
+    if self.batches_outstanding == 0:
+      self._shutdown_workers()
+      raise StopIteration
+
+    while True:
+      assert (not self.shutdown and self.batches_outstanding > 0)
+      idx, batch = self._get_batch()
+      self.batches_outstanding -= 1
+      if idx != self.rcvd_idx:
+        # store out-of-order samples
+        self.reorder_dict[idx] = batch
+        continue
+      return self._process_next_batch(batch)
+
+  next = __next__  # Python 2 compatibility
+
+  def __iter__(self):
+    return self
+
+  def _put_indices(self):
+    assert self.batches_outstanding < 2 * self.num_workers
+    indices = next(self.sample_iter, None)
+    if indices is None:
+      return
+    self.index_queues[self.worker_queue_idx].put((self.send_idx, indices))
+    self.worker_queue_idx = (self.worker_queue_idx + 1) % self.num_workers
+    self.batches_outstanding += 1
+    self.send_idx += 1
+
+  def _process_next_batch(self, batch):
+    self.rcvd_idx += 1
+    self._put_indices()
+    if isinstance(batch, ExceptionWrapper):
+      raise batch.exc_type(batch.exc_msg)
+    return batch
+
+  def __getstate__(self):
+    # TODO: add limited pickling support for sharing an iterator
+    # across multiple threads for HOGWILD.
+    # Probably the best way to do this is by moving the sample pushing
+    # to a separate thread and then just sharing the data queue
+    # but signalling the end is tricky without a non-blocking API
+    raise NotImplementedError("_SequentialDataLoaderIter cannot be pickled")
+
+  def _shutdown_workers(self):
+    try:
+      if not self.shutdown:
+        self.shutdown = True
+        self.done_event.set()
+        for q in self.index_queues:
+          q.put(None)
+        # if some workers are waiting to put, make room for them
+        try:
+          while not self.worker_result_queue.empty():
+            self.worker_result_queue.get()
+        except (FileNotFoundError, ImportError):
+          # Many weird errors can happen here due to Python
+          # shutting down. These are more like obscure Python bugs.
+          # FileNotFoundError can happen when we rebuild the fd
+          # fetched from the queue but the socket is already closed
+          # from the worker side.
+          # ImportError can happen when the unpickler loads the
+          # resource from `get`.
+          pass
+        # done_event should be sufficient to exit worker_manager_thread,
+        # but be safe here and put another None
+        self.worker_result_queue.put(None)
+    finally:
+      # removes pids no matter what
+      if self.worker_pids_set:
+        _remove_worker_pids(id(self))
+        self.worker_pids_set = False
+
+  def __del__(self):
+    if self.num_workers > 0:
+      self._shutdown_workers()
+
+
+class SequentialDataLoader(object):
+  r"""
+  Sequential data loader. Combines a dataset and a sampler, and provides
+  single- or multi-process iterators over the dataset.
+  This is modified from PyTorch's DataLoader: sequential data loading should not
+  perturb any random state, so this loader never touches the global RNGs.
+  Arguments:
+    dataset (Dataset): dataset from which to load the data.
+    batch_size (int, optional): how many samples per batch to load
+      (default: 1).
+    shuffle (bool, optional): set to ``True`` to have the data reshuffled
+      at every epoch (default: False).
+    sampler (Sampler, optional): defines the strategy to draw samples from
+      the dataset. If specified, ``shuffle`` must be False.
+    batch_sampler (Sampler, optional): like sampler, but returns a batch of
+      indices at a time. Mutually exclusive with batch_size, shuffle,
+      sampler, and drop_last.
+    num_workers (int, optional): how many subprocesses to use for data
+      loading. 0 means that the data will be loaded in the main process.
+      (default: 0)
+    collate_fn (callable, optional): merges a list of samples to form a mini-batch.
+    pin_memory (bool, optional): If ``True``, the data loader will copy tensors
+      into CUDA pinned memory before returning them.
+    drop_last (bool, optional): set to ``True`` to drop the last incomplete batch,
+      if the dataset size is not divisible by the batch size. If ``False`` and
+      the size of dataset is not divisible by the batch size, then the last batch
+      will be smaller. (default: False)
+    timeout (numeric, optional): if positive, the timeout value for collecting a batch
+      from workers. Should always be non-negative. (default: 0)
+    worker_init_fn (callable, optional): If not None, this will be called on each
+      worker subprocess with the worker id (an int in ``[0, num_workers - 1]``) as
+      input, after seeding and before data loading. (default: None)
+
+  .. note:: By default, each worker will have its PyTorch seed set to
+        ``base_seed + worker_id``, where ``base_seed`` is a long generated
+        by the main process using its RNG. However, seeds for other libraries
+        may be duplicated upon initializing workers (e.g., NumPy), causing
+        each worker to return identical random numbers. (See
+        :ref:`dataloader-workers-random-seed` section in FAQ.) You may
+        use ``torch.initial_seed()`` to access the PyTorch seed for each
+        worker in :attr:`worker_init_fn`, and use it to set other seeds
+        before data loading.
+
+  .. warning:: If the ``spawn`` start method is used, :attr:`worker_init_fn` cannot be an
+        unpicklable object, e.g., a lambda function.
+  """
+
+  __initialized = False
+
+  def __init__(self, dataset, batch_size=1, shuffle=False, sampler=None, batch_sampler=None,
+         num_workers=0, collate_fn=default_collate, pin_memory=False, drop_last=False,
+         timeout=0, worker_init_fn=None):
+    self.dataset = dataset
+    self.batch_size = batch_size
+    self.num_workers = num_workers
+    self.collate_fn = collate_fn
+    self.pin_memory = pin_memory
+    self.drop_last = drop_last
+    self.timeout = timeout
+    self.worker_init_fn = worker_init_fn
+
+    if timeout < 0:
+      raise ValueError('timeout option should be non-negative')
+
+    if batch_sampler is not None:
+      if batch_size > 1 or shuffle or sampler is not None or drop_last:
+        raise ValueError('batch_sampler option is mutually exclusive '
+                 'with batch_size, shuffle, sampler, and '
+                 'drop_last')
+      self.batch_size = None
+      self.drop_last = None
+
+    if sampler is not None and shuffle:
+      raise ValueError('sampler option is mutually exclusive with '
+               'shuffle')
+
+    if self.num_workers < 0:
+      raise ValueError('num_workers option cannot be negative; '
+               'use num_workers=0 to disable multiprocessing.')
+
+    if batch_sampler is None:
+      if sampler is None:
+        if shuffle:
+          sampler = RandomSampler(dataset)
+        else:
+          sampler = SequentialSampler(dataset)
+      batch_sampler = BatchSampler(sampler, batch_size, drop_last)
+
+    self.sampler = sampler
+    self.batch_sampler = batch_sampler
+    self.__initialized = True
+
+  def __setattr__(self, attr, val):
+    if self.__initialized and attr in ('batch_size', 'sampler', 'drop_last'):
+      raise ValueError('{} attribute should not be set after {} is '
+               'initialized'.format(attr, self.__class__.__name__))
+
+    super(SequentialDataLoader, self).__setattr__(attr, val)
+
+  def __iter__(self):
+    return _SequentialDataLoaderIter(self)
+
+  def __len__(self):
+    return len(self.batch_sampler)
+
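`SequentialDataLoader` keeps the pre-1.0 PyTorch worker protocol (per-worker index queues, an optional pin-memory thread, ordered reassembly through `reorder_dict`) while, per its docstring, never touching global random state, so evaluation passes cannot perturb training randomness. A minimal usage sketch, assuming `ds` is a map-style dataset whose items `default_collate` understands:

    from DeBERTa.data import SequentialDataLoader, SequentialSampler, BatchSampler
    batch_sampler = BatchSampler(SequentialSampler(len(ds)), batch_size=16)
    loader = SequentialDataLoader(ds, batch_sampler=batch_sampler, num_workers=2, pin_memory=True)
    for batch in loader:
      ...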
diff --git a/nlu/DeBERTa/data/dynamic_dataset.py b/nlu/DeBERTa/data/dynamic_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..4d269a37b54dc905ce047fa5bdc74ed835217434
--- /dev/null
+++ b/nlu/DeBERTa/data/dynamic_dataset.py
@@ -0,0 +1,60 @@
+# Copyright (c) Microsoft, Inc. 2020
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+#
+# Author: penhe@microsoft.com
+# Date: 05/15/2019
+#
+
+import pdb
+from torch.utils.data import Dataset
+import random
+import mmap
+import numpy as np
+from bisect import bisect
+from ..utils import get_logger
+logger=get_logger()
+
+__all__ = ['DynamicDataset']
+
+class DynamicDataset(Dataset):
+  def __init__(self, corpus, feature_fn, dataset_size=None, shuffle=False, **kwargs):
+    self.corpus = corpus
+    self.ds_len = len(self.corpus)
+    logger.info(f'Total corpus examples: {self.ds_len}')
+    self.feature_fn = feature_fn
+
+    if not dataset_size:
+      self.dataset_size = self.ds_len
+    else:
+      self.dataset_size = int(dataset_size)
+
+    self.shuffle = shuffle
+    index_buf = mmap.mmap(-1, self.dataset_size*8)
+    shuffle_idx = np.ndarray(shape=(self.dataset_size,), buffer=index_buf, dtype=int)
+    shuffle_idx[:] = np.arange(self.dataset_size)[:]
+    if self.shuffle:
+      #rng = np.random.RandomState(0)
+      rng = random.Random(0)
+      rng.shuffle(shuffle_idx)
+    self.shuffle_idx = shuffle_idx
+    self.index_offset = 0
+    if 'index_offset' in kwargs:
+      self.index_offset = kwargs['index_offset']
+
+  def __len__(self):
+    return self.dataset_size
+
+  def __getitem__(self, idx):
+    if isinstance(idx, tuple) or isinstance(idx, list):
+      idx, ext_params = idx
+    else:
+      ext_params = None
+    idx += self.index_offset
+    seed = idx
+    rng = random.Random(seed)
+    # get seq length
+    example_idx = self.shuffle_idx[idx%self.dataset_size]%self.ds_len
+    example = self.corpus[example_idx, rng, ext_params]
+    return self.feature_fn(example, rng, ext_params = ext_params)
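`DynamicDataset` defers featurization to access time: `__getitem__` seeds a `random.Random` with the (offset) index, maps it through the shared-memory `shuffle_idx`, and hands the example to `feature_fn`, so epochs are reproducible without materializing features up front. A sketch of the contract: the corpus must support `corpus[idx, rng, ext_params]` tuple indexing, as `ExampleSet` below does, and `to_features` here is a hypothetical feature function:

    from DeBERTa.data import DynamicDataset
    ds = DynamicDataset(corpus, feature_fn=to_features, dataset_size=100000, shuffle=True)
    features = ds[0]  # == to_features(example, random.Random(0), ext_params=None)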
diff --git a/nlu/DeBERTa/data/example.py b/nlu/DeBERTa/data/example.py
new file mode 100644
index 0000000000000000000000000000000000000000..1da9d19b1dd1d4c64f2ab28e8a690ac4d6ca4780
--- /dev/null
+++ b/nlu/DeBERTa/data/example.py
@@ -0,0 +1,105 @@
+import random
+import torch
+import os
+from collections import OrderedDict
+import numpy as np
+import tempfile
+import mmap
+import pickle
+import signal
+import sys
+import pdb
+
+from ..utils import xtqdm as tqdm
+
+__all__=['ExampleInstance', 'example_to_feature', 'ExampleSet']
+
+class ExampleInstance:
+  def __init__(self, segments, label=None, **kwv):
+    self.segments = segments
+    self.label = label
+    self.__dict__.update(kwv)
+
+  def __repr__(self):
+    return f'segments: {self.segments}\nlabel: {self.label}'
+
+  def __getitem__(self, i):
+    return self.segments[i]
+
+  def __len__(self):
+    return len(self.segments)
+
+class ExampleSet:
+  def __init__(self, pairs):
+    self._data = np.array([pickle.dumps(p) for p in pairs])
+    self.total = len(self._data)
+
+  def __getitem__(self, idx):
+    """
+    return pair
+    """
+    if isinstance(idx, tuple):
+      idx, rng, ext_params = idx
+    else:
+      rng, ext_params = None, None
+    content = self._data[idx]
+    example = pickle.loads(content)
+    return example
+
+  def __len__(self):
+    return self.total
+
+  def __iter__(self):
+    for i in range(self.total):
+      yield self[i]
+
+def _truncate_segments(segments, max_num_tokens, rng):
+  """
+  Truncate sequence pair according to original BERT implementation:
+  https://github.com/google-research/bert/blob/master/create_pretraining_data.py#L391
+  """
+  while True:
+    if sum(len(s) for s in segments)<=max_num_tokens:
+      break
+
+    segments = sorted(segments, key=lambda s:len(s), reverse=True)
+    trunc_tokens = segments[0]
+
+    assert len(trunc_tokens) >= 1
+
+    if rng.random() < 0.5:
+      trunc_tokens.pop(0)
+    else:
+      trunc_tokens.pop()
+  return segments
+
+def example_to_feature(tokenizer, example, max_seq_len=512, rng=None, mask_generator=None, ext_params=None, label_type='int', **kwargs):
+  if not rng:
+    rng = random
+  max_num_tokens = max_seq_len - len(example.segments) - 1
+  segments = _truncate_segments([tokenizer.tokenize(s) for s in example.segments], max_num_tokens, rng)
+  tokens = ['[CLS]']
+  type_ids = [0]
+  for i,s in enumerate(segments):
+    tokens.extend(s)
+    tokens.append('[SEP]')
+    type_ids.extend([i]*(len(s)+1))
+  if mask_generator:
+    tokens, lm_labels = mask_generator.mask_tokens(tokens, rng)
+  token_ids = tokenizer.convert_tokens_to_ids(tokens)
+  pos_ids = list(range(len(token_ids)))
+  input_mask = [1]*len(token_ids)
+  features = OrderedDict(input_ids = token_ids,
+      type_ids = type_ids,
+      position_ids = pos_ids,
+      input_mask = input_mask)
+  if mask_generator:
+    features['lm_labels'] = lm_labels
+  padding_size = max(0, max_seq_len - len(token_ids))
+  for f in features:
+    features[f].extend([0]*padding_size)
+    features[f] = torch.tensor(features[f], dtype=torch.int)
+  label_type = torch.int if label_type=='int' else torch.float
+  if example.label is not None:
+    features['labels'] = torch.tensor(example.label, dtype=label_type)
+  return features
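`example_to_feature` builds the standard `[CLS] seg0 [SEP] seg1 [SEP] ...` packing, truncating the longest segment first, then zero-pads every field to `max_seq_len` and returns integer tensors (`input_ids`, `type_ids`, `position_ids`, `input_mask`, plus `labels`/`lm_labels` when applicable). A usage sketch, assuming a loaded `tokenizer` exposing `tokenize`/`convert_tokens_to_ids`:

    import random
    example = ExampleInstance(segments=['first sentence', 'second sentence'], label=1)
    features = example_to_feature(tokenizer, example, max_seq_len=128, rng=random.Random(0))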
diff --git a/nlu/DeBERTa/deberta/__init__.py b/nlu/DeBERTa/deberta/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5b968269bc3a6c05c57c9a6ab09c70918ba496f9
--- /dev/null
+++ b/nlu/DeBERTa/deberta/__init__.py
@@ -0,0 +1,22 @@
+#
+# Author: penhe@microsoft.com
+# Date: 04/25/2019
+#
+
+""" Components for NN
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from .tokenizers import *
+from .pooling import *
+from .mlm import MLMPredictionHead
+from .nnmodule import NNModule
+from .deberta import *
+from .disentangled_attention import *
+from .ops import *
+from .bert import *
+from .config import *
+from .cache_utils import *
diff --git a/nlu/DeBERTa/deberta/bert.py b/nlu/DeBERTa/deberta/bert.py
new file mode 100644
index 0000000000000000000000000000000000000000..249fb0141c36b7c7d269c2f74f2ab2d68c7f4e2c
--- /dev/null
+++ b/nlu/DeBERTa/deberta/bert.py
@@ -0,0 +1,308 @@
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) Microsoft, Inc. 2020
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+# This piece of code is modified based on https://github.com/huggingface/transformers
+
+import copy
+import torch
+from torch import nn
+from collections.abc import Sequence
+from packaging import version
+import numpy as np
+import math
+import os
+import pdb
+
+import json
+from .ops import *
+from .disentangled_attention import *
+from .da_utils import *
+
+from adapterlib import adapter_dict
+
+__all__ = ['BertEncoder', 'BertEmbeddings', 'ACT2FN', 'LayerNorm', 'BertLMPredictionHead']
+
+class BertSelfOutput(nn.Module):
+  def __init__(self, config):
+    super().__init__()
+    # self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+    if config.inject_adapter != 'linear':
+      self.dense = adapter_dict[config.inject_adapter](config.hidden_size, config.hidden_size, config=config)
+    else:
+      self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+
+    self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps)
+    self.dropout = StableDropout(config.hidden_dropout_prob)
+    self.config = config
+
+  def forward(self, hidden_states, input_states, mask=None):
+    hidden_states = self.dense(hidden_states)
+    hidden_states = self.dropout(hidden_states)
+    hidden_states += input_states
+    hidden_states = MaskedLayerNorm(self.LayerNorm, hidden_states)
+    return hidden_states
+
+class BertAttention(nn.Module):
+  def __init__(self, config):
+    super().__init__()
+    self.self = DisentangledSelfAttention(config)
+    self.output = BertSelfOutput(config)
+    self.config = config
+
+  def forward(self, hidden_states, attention_mask, return_att=False, query_states=None, relative_pos=None, rel_embeddings=None):
+    output = self.self(hidden_states, attention_mask, return_att, query_states=query_states, relative_pos=relative_pos, rel_embeddings=rel_embeddings)
+    self_output, att_matrix, att_logits_ = output['hidden_states'], output['attention_probs'], output['attention_logits']
+    if query_states is None:
+      query_states = hidden_states
+    attention_output = self.output(self_output, query_states, attention_mask)
+
+    if return_att:
+      return (attention_output, att_matrix)
+    else:
+      return attention_output
+
+class BertIntermediate(nn.Module):
+  def __init__(self, config):
+    super().__init__()
+    # self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
+    if config.inject_adapter != 'linear':
+      self.dense = adapter_dict[config.inject_adapter](config.hidden_size, config.intermediate_size, config=config)
+    else:
+      self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
+
+    self.intermediate_act_fn = ACT2FN[config.hidden_act] \
+      if isinstance(config.hidden_act, str) else config.hidden_act
+
+  def forward(self, hidden_states):
+    hidden_states = self.dense(hidden_states)
+    hidden_states = self.intermediate_act_fn(hidden_states)
+    return hidden_states
+
+class BertOutput(nn.Module):
+  def __init__(self, config):
+    super(BertOutput, self).__init__()
+    # self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
+    if config.inject_adapter != 'linear':
+      self.dense = adapter_dict[config.inject_adapter](config.intermediate_size, config.hidden_size, config=config)
+    else:
+      self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
+
+    self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps)
+    self.dropout = StableDropout(config.hidden_dropout_prob)
+    self.config = config
+
+  def forward(self, hidden_states, input_states, mask=None):
+    hidden_states = self.dense(hidden_states)
+    hidden_states = self.dropout(hidden_states)
+    hidden_states += input_states
+    hidden_states = MaskedLayerNorm(self.LayerNorm, hidden_states)
+    return hidden_states
+
+class BertLayer(nn.Module):
+  def __init__(self, config):
+    super(BertLayer, self).__init__()
+    self.attention = BertAttention(config)
+    self.intermediate = BertIntermediate(config)
+    self.output = BertOutput(config)
+
+  def forward(self, hidden_states, attention_mask, return_att=False, query_states=None, relative_pos=None, rel_embeddings=None):
+    attention_output = self.attention(hidden_states, attention_mask, return_att=return_att, \
+      query_states=query_states, relative_pos=relative_pos, rel_embeddings=rel_embeddings)
+    if return_att:
+      attention_output, att_matrix = attention_output
+    intermediate_output = self.intermediate(attention_output)
+    layer_output = self.output(intermediate_output, attention_output, attention_mask)
+    if return_att:
+      return (layer_output, att_matrix)
+    else:
+      return layer_output
+
+class ConvLayer(nn.Module):
+  def __init__(self, config):
+    super().__init__()
+    kernel_size = getattr(config, 'conv_kernel_size', 3)
+    groups = getattr(config, 'conv_groups', 1)
+    self.conv_act = getattr(config, 'conv_act', 'tanh')
+    self.conv = torch.nn.Conv1d(config.hidden_size, config.hidden_size, kernel_size, padding = (kernel_size-1)//2, groups = groups)
+    self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps)
+    self.dropout = StableDropout(config.hidden_dropout_prob)
+    self.config = config
+
+  def forward(self, hidden_states, residual_states, input_mask):
+    out = self.conv(hidden_states.permute(0,2,1).contiguous()).permute(0,2,1).contiguous()
+    if version.Version(torch.__version__) >= version.Version('1.2.0a'):
+      rmask = (1-input_mask).bool()
+    else:
+      rmask = (1-input_mask).byte()
+    out.masked_fill_(rmask.unsqueeze(-1).expand(out.size()), 0)
+    out = ACT2FN[self.conv_act](self.dropout(out))
+    output_states = MaskedLayerNorm(self.LayerNorm, residual_states + out, input_mask)
+
+    return output_states
+
+class BertEncoder(nn.Module):
+  """ Modified BertEncoder with relative position bias support
+  """
+  def __init__(self, config):
+    super().__init__()
+    #layer = BertLayer(config)
+    self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)])
+    self.relative_attention = getattr(config, 'relative_attention', False)
+    if self.relative_attention:
+      self.max_relative_positions = getattr(config, 'max_relative_positions', -1)
+      if self.max_relative_positions <1:
+        self.max_relative_positions = config.max_position_embeddings
+      self.position_buckets = getattr(config, 'position_buckets', -1)
+      pos_ebd_size = self.max_relative_positions*2
+      if self.position_buckets>0:
+        pos_ebd_size = self.position_buckets*2
+      self.rel_embeddings = nn.Embedding(pos_ebd_size, config.hidden_size)
+
+    self.norm_rel_ebd = [x.strip() for x in getattr(config, 'norm_rel_ebd', 'none').lower().split('|')]
+    if 'layer_norm' in self.norm_rel_ebd:
+      self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps, elementwise_affine = True)
+    kernel_size = getattr(config, 'conv_kernel_size', 0)
+    self.with_conv = False
+    if kernel_size > 0:
+      self.with_conv = True
+      self.conv = ConvLayer(config)
+
+  def get_rel_embedding(self):
+    rel_embeddings = self.rel_embeddings.weight if self.relative_attention else None
+    if rel_embeddings is not None and ('layer_norm' in self.norm_rel_ebd):
+      rel_embeddings = self.LayerNorm(rel_embeddings)
+    return rel_embeddings
+
+  def get_attention_mask(self, attention_mask):
+    if attention_mask.dim()<=2:
+      extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
+      attention_mask = extended_attention_mask*extended_attention_mask.squeeze(-2).unsqueeze(-1)
+      attention_mask = attention_mask.byte()
+    elif attention_mask.dim()==3:
+      attention_mask = attention_mask.unsqueeze(1)
+
+    return attention_mask
+
+  def get_rel_pos(self, hidden_states, query_states=None, relative_pos=None):
+    if self.relative_attention and relative_pos is None:
+      q = query_states.size(-2) if query_states is not None else hidden_states.size(-2)
+      relative_pos = build_relative_position(q, hidden_states.size(-2), bucket_size = self.position_buckets, \
+        max_position=self.max_relative_positions, device = hidden_states.device)
+    return relative_pos
+
+  def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True, return_att=False, query_states = None, relative_pos=None):
+    if attention_mask.dim()<=2:
+      input_mask = attention_mask
+    else:
+      input_mask = (attention_mask.sum(-2)>0).byte()
+    attention_mask = self.get_attention_mask(attention_mask)
+    relative_pos = self.get_rel_pos(hidden_states, query_states, relative_pos)
+
+    all_encoder_layers = []
+    att_matrices = []
+    if isinstance(hidden_states, Sequence):
+      next_kv = hidden_states[0]
+    else:
+      next_kv = hidden_states
+    rel_embeddings = self.get_rel_embedding()
+    for i, layer_module in enumerate(self.layer):
+      output_states = layer_module(next_kv, attention_mask, return_att, query_states = query_states, relative_pos=relative_pos, rel_embeddings=rel_embeddings)
+      if return_att:
+        output_states, att_m = output_states
+
+      if i == 0 and self.with_conv:
+        prenorm = output_states #output['prenorm_states']
+        output_states = self.conv(hidden_states, prenorm, input_mask)
+
+      if query_states is not None:
+        query_states = output_states
+        if isinstance(hidden_states, Sequence):
+          next_kv = hidden_states[i+1] if i+1 < len(self.layer) else None
+      else:
+        next_kv = output_states
+
+      if output_all_encoded_layers:
+        all_encoder_layers.append(output_states)
+        if return_att:
+          att_matrices.append(att_m)
+    if not output_all_encoded_layers:
+      all_encoder_layers.append(output_states)
+      if return_att:
+        att_matrices.append(att_m)
+    return {
+      'hidden_states': all_encoder_layers,
+      'attention_matrices': att_matrices
+    }
+
+class BertEmbeddings(nn.Module):
+  """Construct the embeddings from word, position and token_type embeddings.
+  """
+  def __init__(self, config):
+    super(BertEmbeddings, self).__init__()
+    padding_idx = getattr(config, 'padding_idx', 0)
+    self.embedding_size = getattr(config, 'embedding_size', config.hidden_size)
+    self.word_embeddings = nn.Embedding(config.vocab_size, self.embedding_size, padding_idx = padding_idx)
+    self.position_biased_input = getattr(config, 'position_biased_input', True)
+    self.position_embeddings = nn.Embedding(config.max_position_embeddings, self.embedding_size)
+
+    if config.type_vocab_size>0:
+      self.token_type_embeddings = nn.Embedding(config.type_vocab_size, self.embedding_size)
+
+    if self.embedding_size != config.hidden_size:
+      self.embed_proj = nn.Linear(self.embedding_size, config.hidden_size, bias=False)
+    self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps)
+    self.dropout = StableDropout(config.hidden_dropout_prob)
+    self.output_to_half = False
+    self.config = config
+
+  def forward(self, input_ids, token_type_ids=None, position_ids=None, mask = None):
+    seq_length = input_ids.size(1)
+    if position_ids is None:
+      position_ids = torch.arange(0, seq_length, dtype=torch.long, device=input_ids.device)
+      position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
+    if token_type_ids is None:
+      token_type_ids = torch.zeros_like(input_ids)
+
+    words_embeddings = self.word_embeddings(input_ids)
+    position_embeddings = self.position_embeddings(position_ids.long())
+
+    embeddings = words_embeddings
+    if self.config.type_vocab_size>0:
+      token_type_embeddings = self.token_type_embeddings(token_type_ids)
+      embeddings += token_type_embeddings
+
+    if self.position_biased_input:
+      embeddings += position_embeddings
+
+    if self.embedding_size != self.config.hidden_size:
+      embeddings = self.embed_proj(embeddings)
+    embeddings = MaskedLayerNorm(self.LayerNorm, embeddings, mask)
+    embeddings = self.dropout(embeddings)
+    return {
+      'embeddings': embeddings,
+      'position_embeddings': position_embeddings}
+
+class BertLMPredictionHead(nn.Module):
+  def __init__(self, config, vocab_size):
+    super().__init__()
+    self.embedding_size = getattr(config, 'embedding_size', config.hidden_size)
+    self.dense = nn.Linear(config.hidden_size, self.embedding_size)
+    self.transform_act_fn = ACT2FN[config.hidden_act] \
+      if isinstance(config.hidden_act, str) else config.hidden_act
+
+    self.LayerNorm = LayerNorm(self.embedding_size, config.layer_norm_eps, elementwise_affine=True)
+
+    self.bias = nn.Parameter(torch.zeros(vocab_size))
+
+  def forward(self, hidden_states, embedding_weight):
+    hidden_states = self.dense(hidden_states)
+    hidden_states = self.transform_act_fn(hidden_states)
+    # b x s x d
+    hidden_states = MaskedLayerNorm(self.LayerNorm, hidden_states)
+
+    # b x s x v
+    logits = torch.matmul(hidden_states, embedding_weight.t().to(hidden_states)) + self.bias
+    return logits
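The only change relative to upstream DeBERTa in `bert.py` is the adapter hook: the projection in each of `BertSelfOutput`, `BertIntermediate`, and `BertOutput` is replaced by `adapterlib.adapter_dict[config.inject_adapter]` unless `inject_adapter` is `'linear'`. From the call sites, a registered adapter must be constructible as `adapter_dict[name](in_features, out_features, config=config)` and act as a drop-in for `nn.Linear`; the concrete entries live in `adapterlib/layers.py`, which is not shown in this diff. A hypothetical sketch of that contract:

    import torch.nn as nn
    class MyAdapter(nn.Module):  # hypothetical entry for adapterlib.adapter_dict
      def __init__(self, in_features, out_features, config=None):
        super().__init__()
        self.proj = nn.Linear(in_features, out_features)  # plus whatever adapter path is added
      def forward(self, x):
        return self.proj(x)  # must behave as a drop-in replacement for nn.Linear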
diff --git a/nlu/DeBERTa/deberta/cache_utils.py b/nlu/DeBERTa/deberta/cache_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..143a7e693561158e99bc0a86f2814f99ce383240
--- /dev/null
+++ b/nlu/DeBERTa/deberta/cache_utils.py
@@ -0,0 +1,135 @@
+# Copyright (c) Microsoft, Inc. 2020
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+#
+# Author: penhe@microsoft.com
+# Date: 05/15/2020
+#
+
+import pdb
+import torch
+import os
+import requests
+from .config import ModelConfig
+import pathlib
+from ..utils import xtqdm as tqdm
+from zipfile import ZipFile
+from ..utils import get_logger
+logger = get_logger()
+
+__all__ = ['pretrained_models', 'load_model_state', 'load_vocab']
+
+class PretrainedModel:
+  def __init__(self, name, vocab, vocab_type, model='pytorch_model.bin', config='config.json', **kwargs):
+    self.__dict__.update(kwargs)
+    host = f'https://huggingface.co/microsoft/{name}/resolve/main/'
+    self.name = name
+    self.model_url = host + model
+    self.config_url = host + config
+    self.vocab_url = host + vocab
+    self.vocab_type = vocab_type
+
+pretrained_models= {
+  'base': PretrainedModel('deberta-base', 'bpe_encoder.bin', 'gpt2'),
+  'large': PretrainedModel('deberta-large', 'bpe_encoder.bin', 'gpt2'),
+  'xlarge': PretrainedModel('deberta-xlarge', 'bpe_encoder.bin', 'gpt2'),
+  'base-mnli': PretrainedModel('deberta-base-mnli', 'bpe_encoder.bin', 'gpt2'),
+  'large-mnli': PretrainedModel('deberta-large-mnli', 'bpe_encoder.bin', 'gpt2'),
+  'xlarge-mnli': PretrainedModel('deberta-xlarge-mnli', 'bpe_encoder.bin', 'gpt2'),
+  'xlarge-v2': PretrainedModel('deberta-v2-xlarge', 'spm.model', 'spm'),
+  'xxlarge-v2': PretrainedModel('deberta-v2-xxlarge', 'spm.model', 'spm'),
+  'xlarge-v2-mnli': PretrainedModel('deberta-v2-xlarge-mnli', 'spm.model', 'spm'),
+  'xxlarge-v2-mnli': PretrainedModel('deberta-v2-xxlarge-mnli', 'spm.model', 'spm'),
+  'deberta-v3-small': PretrainedModel('deberta-v3-small', 'spm.model', 'spm'),
+  'deberta-v3-base': PretrainedModel('deberta-v3-base', 'spm.model', 'spm'),
+  'deberta-v3-large': PretrainedModel('deberta-v3-large', 'spm.model', 'spm'),
+  'mdeberta-v3-base': PretrainedModel('mdeberta-v3-base', 'spm.model', 'spm'),
+  'deberta-v3-xsmall': PretrainedModel('deberta-v3-xsmall', 'spm.model', 'spm'),
+  }
+
+def download_asset(url, name, tag=None, no_cache=False, cache_dir=None):
+  _tag = tag
+  if _tag is None:
+    _tag = 'latest'
+  if not cache_dir:
+    cache_dir = os.path.join(pathlib.Path.home(), f'.~DeBERTa/assets/{_tag}/')
+  os.makedirs(cache_dir, exist_ok=True)
+  output=os.path.join(cache_dir, name)
+  if os.path.exists(output) and (not no_cache):
+    return output
+
+  #repo=f'https://huggingface.co/microsoft/deberta-{name}/blob/main/bpe_encoder.bin'
+  headers = {}
+  headers['Accept'] = 'application/octet-stream'
+  resp = requests.get(url, stream=True, headers=headers)
+  if resp.status_code != 200:
+    raise Exception(f'Request for {url} returned {resp.status_code}, {resp.text}')
+
+  try:
+    with open(output, 'wb') as fs:
+      progress = tqdm(total=int(resp.headers['Content-Length']) if 'Content-Length' in resp.headers else -1, ncols=80, desc=f'Downloading {name}')
+      for c in resp.iter_content(chunk_size=1024*1024):
+        fs.write(c)
+        progress.update(len(c))
+      progress.close()
+  except:
+    os.remove(output)
+    raise
+
+  return output
+
+def load_model_state(path_or_pretrained_id, tag=None, no_cache=False, cache_dir=None):
+  model_path = path_or_pretrained_id
+  if model_path and (not os.path.exists(model_path)) and (path_or_pretrained_id.lower() in pretrained_models):
+    _tag = tag
+    if 'deberta-v3-base' in path_or_pretrained_id:
+      pretrained = pretrained_models['deberta-v3-base']
+    else:
+      pretrained = pretrained_models[path_or_pretrained_id.lower()]
+    if _tag is None:
+      _tag = 'latest'
+    if not cache_dir:
+      cache_dir = os.path.join(pathlib.Path.home(), f'.~DeBERTa/assets/{_tag}/{pretrained.name}')
+    os.makedirs(cache_dir, exist_ok=True)
+    model_path = os.path.join(cache_dir, 'pytorch_model.bin')
+    if (not os.path.exists(model_path)) or no_cache:
+      asset = download_asset(pretrained.model_url, 'pytorch_model.bin', tag=tag, no_cache=no_cache, cache_dir=cache_dir)
+      asset = download_asset(pretrained.config_url, 'model_config.json', tag=tag, no_cache=no_cache, cache_dir=cache_dir)
+  elif not model_path:
+    return None,None
+
+  model_path = os.path.join(model_path, 'pytorch_model.bin')
+  config_path = os.path.join(os.path.dirname(model_path), 'model_config.json')
+  model_state = torch.load(model_path, map_location='cpu')
+  logger.info("Loaded pretrained model file {}".format(model_path))
+  if 'config' in model_state:
+    model_config = ModelConfig.from_dict(model_state['config'])
+  elif os.path.exists(config_path):
+    model_config = ModelConfig.from_json_file(config_path)
+  else:
+    model_config = None
+  return model_state, model_config
+
+def load_vocab(vocab_path=None, vocab_type=None, pretrained_id=None, tag=None, no_cache=False, cache_dir=None):
+  if pretrained_id and (pretrained_id.lower() in pretrained_models):
+    _tag = tag
+    if _tag is None:
+      _tag = 'latest'
+
+    pretrained = pretrained_models[pretrained_id.lower()]
+    if not cache_dir:
+      cache_dir = os.path.join(pathlib.Path.home(), f'.~DeBERTa/assets/{_tag}/{pretrained.name}')
+    os.makedirs(cache_dir, exist_ok=True)
+    vocab_type = pretrained.vocab_type
+    url = pretrained.vocab_url
+    outname = os.path.basename(url)
+    vocab_path = os.path.join(cache_dir, outname)
+    if (not os.path.exists(vocab_path)) or no_cache:
+      asset = download_asset(url, outname, tag=tag, no_cache=no_cache, cache_dir=cache_dir)
+  if vocab_type is None:
+    vocab_type = 'spm'
+  return vocab_path, vocab_type
+
+def test_download():
+  vocab = load_vocab()
+ num_attention_heads (int): Number of attention heads for each attention layer in + the Transformer encoder, default: `12`. + intermediate_size (int): The size of the "intermediate" (i.e., feed-forward) + layer in the Transformer encoder, default: `3072`. + hidden_act (str): The non-linear activation function (function or string) in the + encoder and pooler. If string, "gelu", "relu" and "swish" are supported, default: `gelu`. + hidden_dropout_prob (float): The dropout probability for all fully connected + layers in the embeddings, encoder, and pooler, default: `0.1`. + attention_probs_dropout_prob (float): The dropout ratio for the attention + probabilities, default: `0.1`. + max_position_embeddings (int): The maximum sequence length that this model might + ever be used with. Typically set this to something large just in case + (e.g., 512 or 1024 or 2048), default: `512`. + type_vocab_size (int): The vocabulary size of the `token_type_ids` passed into + the `DeBERTa` model, default: `-1`. + initializer_range (float): The stddev of the _normal_initializer for + initializing all weight matrices, default: `0.02`. + relative_attention (:obj:`bool`): Whether to use relative position encoding, default: `False`. + max_relative_positions (int): The range of relative positions [`-max_position_embeddings`, `max_position_embeddings`], default: `-1`, which means using the same value as `max_position_embeddings`. + padding_idx (int): The value used to pad input_ids, default: `0`. + position_biased_input (:obj:`bool`): Whether to add absolute position embedding to content embedding, default: `True`. + pos_att_type (:obj:`str`): The type of relative position attention, it can be a combination of [`p2c`, `c2p`, `p2p`], e.g. "p2c", "p2c|c2p", "p2c|c2p|p2p", default: `None`. + + + """ + def __init__(self): + """Constructs ModelConfig.
+ + """ + + self.hidden_size = 768 + self.num_hidden_layers = 12 + self.num_attention_heads = 12 + self.hidden_act = "gelu" + self.intermediate_size = 3072 + self.hidden_dropout_prob = 0.1 + self.attention_probs_dropout_prob = 0.1 + self.max_position_embeddings = 512 + self.type_vocab_size = 0 + self.initializer_range = 0.02 + self.layer_norm_eps = 1e-7 + self.padding_idx = 0 + self.vocab_size = -1 diff --git a/nlu/DeBERTa/deberta/da_utils.py b/nlu/DeBERTa/deberta/da_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..81bc977fa7c1cef0a67ed9ad56ec619743c6b533 --- /dev/null +++ b/nlu/DeBERTa/deberta/da_utils.py @@ -0,0 +1,68 @@ +import torch +import pdb +from functools import lru_cache +import numpy as np +import math + +__all__=['build_relative_position', 'make_log_bucket_position'] + +@lru_cache(maxsize=128) +def make_log_bucket_dict(bucket_size, max_position, device=None): + relative_pos = torch.arange(-max_position, max_position, device=device) + sign = torch.sign(relative_pos) + mid = bucket_size//2 + abs_pos = torch.where((relative_pos -mid), torch.tensor(mid-1).to(relative_pos), torch.abs(relative_pos)) + log_pos = torch.ceil(torch.log(abs_pos/mid)/math.log((max_position-1)/mid) * (mid-1)) + mid + bucket_pos = torch.where(abs_pos<=mid, relative_pos, (log_pos*sign).to(relative_pos)).to(torch.long) + return bucket_pos + +# Faster version +def make_log_bucket_position(relative_pos, bucket_size, max_position): + relative_pos = torch.clamp(relative_pos,-max_position+1, max_position-1) + max_position + bucket_dict = make_log_bucket_dict(bucket_size, max_position, relative_pos.device) + for d in range(relative_pos.dim()-1): + bucket_dict = bucket_dict.unsqueeze(0) + bucket_pos = torch.gather(bucket_dict.expand(list(relative_pos.size())[:-1] + [bucket_dict.size(-1)]), index=relative_pos.long(), dim=-1) + return bucket_pos + +@lru_cache(maxsize=128) +def build_relative_position(query_size, key_size, bucket_size=-1, max_position=-1, device=None): + q_ids = torch.arange(0, query_size) + k_ids = torch.arange(0, key_size) + if device is not None: + q_ids = q_ids.to(device) + k_ids = k_ids.to(device) + rel_pos_ids = q_ids.view(-1,1) - k_ids.view(1,-1) + #q_ids[:, None] - np.tile(k_ids, (q_ids.shape[0],1)) + if bucket_size>0 and max_position > 0: + rel_pos_ids = make_log_bucket_position(rel_pos_ids, bucket_size, max_position) + #rel_pos_ids = torch.tensor(rel_pos_ids, dtype=torch.long) + rel_pos_ids = rel_pos_ids[:query_size, :] + rel_pos_ids = rel_pos_ids.unsqueeze(0) + return rel_pos_ids + +def build_relative_position_from_abs(query_pos, key_pos, bucket_size=-1, max_position=-1, device=None): + if isinstance(query_pos, tuple): + q_ids = torch.tensor(query_pos) + else: + q_ids = query_pos + if isinstance(key_pos, tuple): + k_ids = torch.tensor(key_pos) + else: + k_ids = key_pos + + if device is not None: + q_ids = q_ids.to(device) + k_ids = k_ids.to(device) + rel_pos_ids = q_ids.unsqueeze(-1) - k_ids.unsqueeze(-2) + #q_ids[:, None] - np.tile(k_ids, (q_ids.shape[0],1)) + if bucket_size>0 and max_position > 0: + rel_pos_ids = make_log_bucket_position(rel_pos_ids, bucket_size, max_position) + #rel_pos_ids = torch.tensor(rel_pos_ids, dtype=torch.long) + return rel_pos_ids + +def test_log_bucket(): + x=np.arange(-511,511) + y=make_log_bucket_position(x, 128, 512) + pdb.set_trace() + diff --git a/nlu/DeBERTa/deberta/deberta.py b/nlu/DeBERTa/deberta/deberta.py new file mode 100644 index 0000000000000000000000000000000000000000..b3f84eca9c50696595814e6e846d291e87ea0832 --- 
/dev/null +++ b/nlu/DeBERTa/deberta/deberta.py @@ -0,0 +1,145 @@ +# Copyright (c) Microsoft, Inc. 2020 +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +# +# Author: penhe@microsoft.com +# Date: 01/15/2020 +# + +import copy +import torch +import os + +import json +from .ops import * +from .bert import * +from .config import ModelConfig +from .cache_utils import load_model_state +import pdb + +__all__ = ['DeBERTa'] + +class DeBERTa(torch.nn.Module): + """ DeBERTa encoder + This module is composed of the input embedding layer with stacked transformer layers with disentangled attention. + + Parameters: + config: + A model config class instance with the configuration to build a new model. The schema is similar to `BertConfig`, \ + for more details, please refer :class:`~DeBERTa.deberta.ModelConfig` + + pre_trained: + The pre-trained DeBERTa model, it can be a physical path of a pre-trained DeBERTa model or a released configurations, \ + i.e. [**base, large, base_mnli, large_mnli**] + + """ + + def __init__(self, config=None, pre_trained=None): + super().__init__() + state = None + if pre_trained is not None: + state, model_config = load_model_state(pre_trained) + if config is not None and model_config is not None: + for k in config.__dict__: + if k not in ['hidden_size', + 'intermediate_size', + 'num_attention_heads', + 'num_hidden_layers', + 'vocab_size', + 'max_position_embeddings']: + model_config.__dict__[k] = config.__dict__[k] + config = copy.copy(model_config) + self.embeddings = BertEmbeddings(config) + self.encoder = BertEncoder(config) + self.config = config + self.pre_trained = pre_trained + self.apply_state(state) + + def forward(self, input_ids, attention_mask=None, token_type_ids=None, output_all_encoded_layers=True, position_ids = None, return_att = False): + """ + Args: + input_ids: + a torch.LongTensor of shape [batch_size, sequence_length] \ + with the word token indices in the vocabulary + + attention_mask: + an optional parameter for input mask or attention mask. + + - If it's an input mask, then it will be torch.LongTensor of shape [batch_size, sequence_length] with indices \ + selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max \ + input sequence length in the current batch. It's the mask that we typically use for attention when \ + a batch has varying length sentences. + + - If it's an attention mask then it will be torch.LongTensor of shape [batch_size, sequence_length, sequence_length]. \ + In this case, it's a mask indicate which tokens in the sequence should be attended by other tokens in the sequence. + + token_type_ids: + an optional torch.LongTensor of shape [batch_size, sequence_length] with the token \ + types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to \ + a `sentence B` token (see BERT paper for more details). + + output_all_encoded_layers: + whether to output results of all encoder layers, default, True + + Returns: + + - The output of the stacked transformer layers if `output_all_encoded_layers=True`, else \ + the last layer of stacked transformer layers + + - Attention matrix of self-attention layers if `return_att=True` + + + Example:: + + # Batch of wordPiece token ids. 
+ # Each sample was padded with zero to the maxium length of the batch + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + # Mask of valid input ids + attention_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) + + # DeBERTa model initialized with pretrained base model + bert = DeBERTa(pre_trained='base') + + encoder_layers = bert(input_ids, attention_mask=attention_mask) + + """ + + if attention_mask is None: + attention_mask = torch.ones_like(input_ids) + if token_type_ids is None: + token_type_ids = torch.zeros_like(input_ids) + + ebd_output = self.embeddings(input_ids.to(torch.long), token_type_ids.to(torch.long), position_ids, attention_mask) + embedding_output = ebd_output['embeddings'] + encoder_output = self.encoder(embedding_output, + attention_mask, + output_all_encoded_layers=output_all_encoded_layers, return_att = return_att) + encoder_output.update(ebd_output) + return encoder_output + + def apply_state(self, state = None): + """ Load state from previous loaded model state dictionary. + + Args: + state (:obj:`dict`, optional): State dictionary as the state returned by torch.module.state_dict(), default: `None`. \ + If it's `None`, then will use the pre-trained state loaded via the constructor to re-initialize \ + the `DeBERTa` model + """ + if self.pre_trained is None and state is None: + return + if state is None: + state, config = load_model_state(self.pre_trained) + self.config = config + + prefix = '' + for k in state: + if 'embeddings.' in k: + if not k.startswith('embeddings.'): + prefix = k[:k.index('embeddings.')] + break + + missing_keys = [] + unexpected_keys = [] + error_msgs = [] + self._load_from_state_dict(state, prefix = prefix, local_metadata=None, strict=True, missing_keys=missing_keys, unexpected_keys=unexpected_keys, error_msgs=error_msgs) diff --git a/nlu/DeBERTa/deberta/disentangled_attention.py b/nlu/DeBERTa/deberta/disentangled_attention.py new file mode 100644 index 0000000000000000000000000000000000000000..42215c787c901eb8f73d25d88e3375a635126078 --- /dev/null +++ b/nlu/DeBERTa/deberta/disentangled_attention.py @@ -0,0 +1,221 @@ +# Copyright (c) Microsoft, Inc. 2020 +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
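# A small sketch of wiring the ModelConfig from config.py into the DeBERTa encoder above
# without loading a pre-trained checkpoint. The numeric values are illustrative, and
# 'inject_adapter': 'linear' is an assumption that selects the plain nn.Linear projections
# used by this repo's DisentangledSelfAttention (see disentangled_attention.py below);
# any remaining fields are assumed to fall back to the ModelConfig defaults.
import torch
from DeBERTa.deberta.config import ModelConfig
from DeBERTa.deberta.deberta import DeBERTa

config = ModelConfig.from_dict({
    'vocab_size': 128100, 'hidden_size': 768, 'num_hidden_layers': 12,
    'num_attention_heads': 12, 'intermediate_size': 3072,
    'max_position_embeddings': 512, 'relative_attention': True,
    'pos_att_type': 'p2c|c2p', 'layer_norm_eps': 1e-7,
    'inject_adapter': 'linear'})

model = DeBERTa(config=config)                      # randomly initialized encoder
input_ids = torch.LongTensor([[1, 31, 51, 99, 2]])
attention_mask = torch.ones_like(input_ids)
outputs = model(input_ids, attention_mask=attention_mask,
                output_all_encoded_layers=False)
# `outputs` is a dict of encoder outputs merged with the 'embeddings' and
# 'position_embeddings' entries returned by BertEmbeddings.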
+# +# Author: penhe@microsoft.com +# Date: 01/15/2020 +# + +""" + Disentangled SelfAttention module +""" + +import numpy as np +import math +import torch +from torch import nn +import functools +import pdb + +from .ops import * +from .da_utils import build_relative_position + +from ..utils import get_logger +logger=get_logger() + +from adapterlib import adapter_dict + +__all__=['DisentangledSelfAttention'] +class DisentangledSelfAttention(nn.Module): + def __init__(self, config): + super().__init__() + self.num_attention_heads = config.num_attention_heads + _attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.attention_head_size = getattr(config, 'attention_head_size', _attention_head_size) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + # ----------------------------------------------------------------------------------------------------------------------- + if config.inject_adapter != 'linear': + self.query_proj = adapter_dict[config.inject_adapter](config.hidden_size, self.all_head_size, config=config) + else: + self.query_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=True) + + # self.key_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=True) + if config.inject_adapter != 'linear': + self.key_proj = adapter_dict[config.inject_adapter](config.hidden_size, self.all_head_size, config=config) + else: + self.key_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=True) + + if config.inject_adapter != 'linear': + self.value_proj = adapter_dict[config.inject_adapter](config.hidden_size, self.all_head_size, config=config) + else: + self.value_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=True) + + # ----------------------------------------------------------------------------------------------------------------------- + + # self.query_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=True) + # self.key_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=True) + # self.value_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=True) + + self.share_att_key = getattr(config, 'share_att_key', False) + self.pos_att_type = [x.strip() for x in getattr(config, 'pos_att_type', 'c2p').lower().split('|')] # c2p|p2c + self.relative_attention = getattr(config, 'relative_attention', False) + + if self.relative_attention: + self.position_buckets = getattr(config, 'position_buckets', -1) + self.max_relative_positions = getattr(config, 'max_relative_positions', -1) + if self.max_relative_positions <1: + self.max_relative_positions = config.max_position_embeddings + self.pos_ebd_size = self.max_relative_positions + if self.position_buckets>0: + self.pos_ebd_size = self.position_buckets + # For backward compitable + + self.pos_dropout = StableDropout(config.hidden_dropout_prob) + + if (not self.share_att_key): + if 'c2p' in self.pos_att_type or 'p2p' in self.pos_att_type: + self.pos_key_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=True) + if 'p2c' in self.pos_att_type or 'p2p' in self.pos_att_type: + self.pos_query_proj = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = StableDropout(config.attention_probs_dropout_prob) + self._register_load_state_dict_pre_hook(self._pre_load_hook) + + def transpose_for_scores(self, x, attention_heads): + new_x_shape = x.size()[:-1] + (attention_heads, -1) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3).contiguous().view(-1, x.size(1), x.size(-1)) + + def forward(self, hidden_states, 
attention_mask, return_att=False, query_states=None, relative_pos=None, rel_embeddings=None): + if query_states is None: + query_states = hidden_states + query_layer = self.transpose_for_scores(self.query_proj(query_states), self.num_attention_heads).float() + key_layer = self.transpose_for_scores(self.key_proj(hidden_states), self.num_attention_heads).float() + value_layer = self.transpose_for_scores(self.value_proj(hidden_states), self.num_attention_heads) + + rel_att = None + # Take the dot product between "query" and "key" to get the raw attention scores. + scale_factor = 1 + if 'c2p' in self.pos_att_type: + scale_factor += 1 + if 'p2c' in self.pos_att_type: + scale_factor += 1 + if 'p2p' in self.pos_att_type: + scale_factor += 1 + scale = 1/math.sqrt(query_layer.size(-1)*scale_factor) + attention_scores = torch.bmm(query_layer, key_layer.transpose(-1, -2)*scale) + if self.relative_attention: + rel_embeddings = self.pos_dropout(rel_embeddings) + rel_att = self.disentangled_attention_bias(query_layer, key_layer, relative_pos, rel_embeddings, scale_factor) + + if rel_att is not None: + attention_scores = (attention_scores + rel_att) + attention_scores = (attention_scores - attention_scores.max(dim=-1, keepdim=True).values.detach()).to(hidden_states) + attention_scores = attention_scores.view(-1, self.num_attention_heads, attention_scores.size(-2), attention_scores.size(-1)) + + # bxhxlxd + _attention_probs = XSoftmax.apply(attention_scores, attention_mask, -1) + attention_probs = self.dropout(_attention_probs) + context_layer = torch.bmm(attention_probs.view(-1, attention_probs.size(-2), attention_probs.size(-1)), value_layer) + context_layer = context_layer.view(-1, self.num_attention_heads, context_layer.size(-2), context_layer.size(-1)).permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (-1,) + context_layer = context_layer.view(*new_context_layer_shape) + + return { + 'hidden_states': context_layer, + 'attention_probs': _attention_probs, + 'attention_logits': attention_scores + } + + def disentangled_attention_bias(self, query_layer, key_layer, relative_pos, rel_embeddings, scale_factor): + if relative_pos is None: + q = query_layer.size(-2) + relative_pos = build_relative_position(q, key_layer.size(-2), bucket_size = self.position_buckets, \ + max_position = self.max_relative_positions, device=query_layer.device) + if relative_pos.dim()==2: + relative_pos = relative_pos.unsqueeze(0).unsqueeze(0) + elif relative_pos.dim()==3: + relative_pos = relative_pos.unsqueeze(1) + # bxhxqxk + elif relative_pos.dim()!=4: + raise ValueError(f'Relative postion ids must be of dim 2 or 3 or 4. 
{relative_pos.dim()}') + + att_span = self.pos_ebd_size + relative_pos = relative_pos.long().to(query_layer.device) + + rel_embeddings = rel_embeddings[self.pos_ebd_size - att_span:self.pos_ebd_size + att_span, :].unsqueeze(0) #.repeat(query_layer.size(0)//self.num_attention_heads, 1, 1) + if self.share_att_key: + pos_query_layer = self.transpose_for_scores(self.query_proj(rel_embeddings), self.num_attention_heads)\ + .repeat(query_layer.size(0)//self.num_attention_heads, 1, 1) #.split(self.all_head_size, dim=-1) + pos_key_layer = self.transpose_for_scores(self.key_proj(rel_embeddings), self.num_attention_heads)\ + .repeat(query_layer.size(0)//self.num_attention_heads, 1, 1) #.split(self.all_head_size, dim=-1) + else: + if 'c2p' in self.pos_att_type or 'p2p' in self.pos_att_type: + pos_key_layer = self.transpose_for_scores(self.pos_key_proj(rel_embeddings), self.num_attention_heads)\ + .repeat(query_layer.size(0)//self.num_attention_heads, 1, 1) #.split(self.all_head_size, dim=-1) + if 'p2c' in self.pos_att_type or 'p2p' in self.pos_att_type: + pos_query_layer = self.transpose_for_scores(self.pos_query_proj(rel_embeddings), self.num_attention_heads)\ + .repeat(query_layer.size(0)//self.num_attention_heads, 1, 1) #.split(self.all_head_size, dim=-1) + + score = 0 + # content->position + if 'c2p' in self.pos_att_type: + scale = 1/math.sqrt(pos_key_layer.size(-1)*scale_factor) + c2p_att = torch.bmm(query_layer, pos_key_layer.transpose(-1, -2).to(query_layer)*scale) + c2p_pos = torch.clamp(relative_pos + att_span, 0, att_span*2-1).squeeze(0).expand([query_layer.size(0), query_layer.size(1), relative_pos.size(-1)]) + c2p_att = torch.gather(c2p_att, dim=-1, index=c2p_pos) + score += c2p_att + + # position->content + if 'p2c' in self.pos_att_type or 'p2p' in self.pos_att_type: + scale = 1/math.sqrt(pos_query_layer.size(-1)*scale_factor) + + if 'p2c' in self.pos_att_type: + p2c_att = torch.bmm(pos_query_layer.to(key_layer)*scale, key_layer.transpose(-1, -2)) + p2c_att = torch.gather(p2c_att, dim=-2, index=c2p_pos) + score += p2c_att + + # position->position + if 'p2p' in self.pos_att_type: + pos_query = pos_query_layer[:,:,att_span:,:] + p2p_att = torch.matmul(pos_query, pos_key_layer.transpose(-1, -2)) + p2p_att = p2p_att.expand(query_layer.size()[:2] + p2p_att.size()[2:]) + if query_layer.size(-2) != key_layer.size(-2): + p2p_att = torch.gather(p2p_att, dim=-2, index=pos_index.expand(query_layer.size()[:2] + (pos_index.size(-2), p2p_att.size(-1)))) + p2p_att = torch.gather(p2p_att, dim=-1, index=c2p_pos.expand([query_layer.size(0), query_layer.size(1), query_layer.size(2), relative_pos.size(-1)])) + score += p2p_att + + return score + + def _pre_load_hook(self, state_dict, prefix, local_metadata, strict, + missing_keys, unexpected_keys, error_msgs): + self_state = self.state_dict() + if ((prefix + 'query_proj.weight') not in state_dict) and ((prefix + 'in_proj.weight') in state_dict): + v1_proj = state_dict[prefix+'in_proj.weight'] + v1_proj = v1_proj.unsqueeze(0).reshape(self.num_attention_heads, -1, v1_proj.size(-1)) + q,k,v=v1_proj.chunk(3, dim=1) + state_dict[prefix + 'query_proj.weight'] = q.reshape(-1, v1_proj.size(-1)) + state_dict[prefix + 'key_proj.weight'] = k.reshape(-1, v1_proj.size(-1)) + state_dict[prefix + 'key_proj.bias'] = self_state['key_proj.bias'] + state_dict[prefix + 'value_proj.weight'] = v.reshape(-1, v1_proj.size(-1)) + v1_query_bias = state_dict[prefix + 'q_bias'] + state_dict[prefix + 'query_proj.bias'] = v1_query_bias + v1_value_bias = state_dict[prefix +'v_bias'] + 
state_dict[prefix + 'value_proj.bias'] = v1_value_bias + + v1_pos_key_proj = state_dict[prefix + 'pos_proj.weight'] + state_dict[prefix + 'pos_key_proj.weight'] = v1_pos_key_proj + v1_pos_query_proj = state_dict[prefix + 'pos_q_proj.weight'] + state_dict[prefix + 'pos_query_proj.weight'] = v1_pos_query_proj + v1_pos_query_proj_bias = state_dict[prefix + 'pos_q_proj.bias'] + state_dict[prefix + 'pos_query_proj.bias'] = v1_pos_query_proj_bias + state_dict[prefix + 'pos_key_proj.bias'] = self_state['pos_key_proj.bias'] + + del state_dict[prefix + 'in_proj.weight'] + del state_dict[prefix + 'q_bias'] + del state_dict[prefix + 'v_bias'] + del state_dict[prefix + 'pos_proj.weight'] + del state_dict[prefix + 'pos_q_proj.weight'] + del state_dict[prefix + 'pos_q_proj.bias'] diff --git a/nlu/DeBERTa/deberta/gpt2_bpe_utils.py b/nlu/DeBERTa/deberta/gpt2_bpe_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..b089f6a4d7a9e7d275a2ddf4f11268d97a6ba890 --- /dev/null +++ b/nlu/DeBERTa/deberta/gpt2_bpe_utils.py @@ -0,0 +1,163 @@ +""" +Byte pair encoding utilities from GPT-2. + +Original source: https://github.com/openai/gpt-2/blob/master/src/encoder.py +Original license: MIT +""" + +from functools import lru_cache +import json +import random +import unicodedata + +try: + import regex as re +except ImportError: + raise ImportError('Please install regex with: pip install regex') + +@lru_cache() +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a signficant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. + """ + bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8+n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + +def get_pairs(word): + """Return set of symbol pairs in a word. + Word is represented as tuple of symbols (symbols being variable-length strings). 
+ """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + +class Encoder: + + def __init__(self, encoder, bpe_merges, errors='replace'): + self.encoder = encoder + self.decoder = {v:k for k,v in self.encoder.items()} + self.errors = errors # how to handle errors in decoding + self.byte_encoder = bytes_to_unicode() + self.byte_decoder = {v:k for k, v in self.byte_encoder.items()} + self.bpe_ranks = dict(zip([tuple(k) for k in bpe_merges], range(len(bpe_merges)))) + self.cache = {} + self.random = random.Random(0) + + # Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions + self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") + + def bpe(self, token): + if token in self.cache: + return self.cache[token] + word = tuple(token) + pairs = get_pairs(word) + + if not pairs: + return token + + while True: + bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf'))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + new_word.extend(word[i:j]) + i = j + except: + new_word.extend(word[i:]) + break + + if word[i] == first and i < len(word)-1 and word[i+1] == second: + new_word.append(first+second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = ' '.join(word) + self.cache[token] = word + return word + + def split_to_words(self, text): + return list(re.findall(self.pat, text)) + + def encode(self, text): + bpe_tokens = [] + for token in self.split_to_words(text): + token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) + bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' ')) + return bpe_tokens + + def decode(self, tokens): + text = ''.join([self.decoder[token] for token in tokens]) + text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors) + return text + +def get_encoder(encoder, vocab): + return Encoder( + encoder=encoder, + bpe_merges=vocab, + ) + +def _is_whitespace(char): + """Checks whether `chars` is a whitespace character.""" + # \t, \n, and \r are technically contorl characters but we treat them + # as whitespace since they are generally considered as such. + if char == " " or char == "\t" or char == "\n" or char == "\r": + return True + cat = unicodedata.category(char) + if cat == "Zs": + return True + return False + +def _is_control(char): + """Checks whether `chars` is a control character.""" + # These are technically control characters but we count them as whitespace + # characters. + if char == "\t" or char == "\n" or char == "\r": + return False + cat = unicodedata.category(char) + if cat.startswith("C"): + return True + return False + +def _is_punctuation(char): + """Checks whether `chars` is a punctuation character.""" + cp = ord(char) + # We treat all non-letter/number ASCII as punctuation. + # Characters such as "^", "$", and "`" are not in the Unicode + # Punctuation class but we treat them as punctuation anyways, for + # consistency. 
+ if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or + (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): + return True + cat = unicodedata.category(char) + if cat.startswith("P"): + return True + return False diff --git a/nlu/DeBERTa/deberta/gpt2_tokenizer.py b/nlu/DeBERTa/deberta/gpt2_tokenizer.py new file mode 100644 index 0000000000000000000000000000000000000000..8d7072082891310346f9f51389a950352f078c14 --- /dev/null +++ b/nlu/DeBERTa/deberta/gpt2_tokenizer.py @@ -0,0 +1,216 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# Copyright (c) Microsoft, Inc. 2020 +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +# +# Author: penhe@microsoft.com +# Date: 01/15/2020 +# + +# This piece of code is derived from https://github.com/pytorch/fairseq/blob/master/fairseq/data/encoders/gpt2_bpe.py + +import torch +import unicodedata +import os +from .gpt2_bpe_utils import get_encoder,_is_control,_is_whitespace,_is_punctuation +from .cache_utils import load_vocab + +__all__ = ['GPT2Tokenizer'] + +class GPT2Tokenizer(object): + """ A wrapper of GPT2 tokenizer with similar interface as BERT tokenizer + + Args: + + vocab_file (:obj:`str`, optional): + The local path of vocabulary package or the release name of vocabulary in `DeBERTa GitHub releases `_, \ + e.g. "bpe_encoder", default: `None`. + + If it's `None`, then it will download the vocabulary in the latest release from GitHub. The vocabulary file is a \ + state dictionary with three items, "dict_map", "vocab", "encoder" which correspond to three files used in `RoBERTa`, i.e. `dict.txt`, `vocab.txt` and `encoder.json`. \ + + The difference between our wrapped GPT2 tokenizer and RoBERTa wrapped tokenizer are, + + - Special tokens, unlike `RoBERTa` which use ``, `` as the `start` token and `end` token of a sentence. We use `[CLS]` and `[SEP]` as the `start` and `end`\ + token of input sentence which is the same as `BERT`. + + - We remapped the token ids in our dictionary with regarding to the new special tokens, `[PAD]` => 0, `[CLS]` => 1, `[SEP]` => 2, `[UNK]` => 3, `[MASK]` => 50264 + + do_lower_case (:obj:`bool`, optional): + Whether to convert inputs to lower case. **Not used in GPT2 tokenizer**. + + special_tokens (:obj:`list`, optional): + List of special tokens to be added to the end of the vocabulary. + + + """ + def __init__(self, vocab_file=None, do_lower_case=True, special_tokens=None): + self.pad_token='[PAD]' + self.sep_token='[SEP]' + self.unk_token='[UNK]' + self.cls_token='[CLS]' + + self.symbols = [] + self.count = [] + self.indices = {} + self.pad_token_id = self.add_symbol(self.pad_token) + self.cls_token_id = self.add_symbol(self.cls_token) + self.sep_token_id = self.add_symbol(self.sep_token) + self.unk_token_id = self.add_symbol(self.unk_token) + + self.gpt2_encoder = torch.load(vocab_file) + self.bpe = get_encoder(self.gpt2_encoder['encoder'], self.gpt2_encoder['vocab']) + for w,n in self.gpt2_encoder['dict_map']: + self.add_symbol(w, n) + + self.mask_token='[MASK]' + self.mask_id = self.add_symbol(self.mask_token) + self.special_tokens = ['[MASK]', '[SEP]', '[PAD]', '[UNK]', '[CLS]'] + if special_tokens is not None: + for t in special_tokens: + self.add_special_token(t) + + self.vocab = self.indices + self.ids_to_tokens = self.symbols + + def tokenize(self, text): + """ Convert an input text to tokens. + + Args: + + text (:obj:`str`): input text to be tokenized. 
+ + Returns: + A list of byte tokens where each token represent the byte id in GPT2 byte dictionary + + Example:: + + >>> tokenizer = GPT2Tokenizer() + >>> text = "Hello world!" + >>> tokens = tokenizer.tokenize(text) + >>> print(tokens) + ['15496', '995', '0'] + + """ + bpe = self._encode(text) + + return [t for t in bpe.split(' ') if t] + + def convert_tokens_to_ids(self, tokens): + """ Convert list of tokens to ids. + + Args: + + tokens (:obj:`list`): list of tokens + + Returns: + + List of ids + """ + + return [self.vocab[t] for t in tokens] + + def convert_ids_to_tokens(self, ids): + """ Convert list of ids to tokens. + + Args: + + ids (:obj:`list`): list of ids + + Returns: + + List of tokens + """ + + tokens = [] + for i in ids: + tokens.append(self.ids_to_tokens[i]) + return tokens + + def split_to_words(self, text): + return self.bpe.split_to_words(text) + + def decode(self, tokens): + """ Decode list of tokens to text strings. + + Args: + + tokens (:obj:`list`): list of tokens. + + Returns: + + Text string corresponds to the input tokens. + + Example:: + + >>> tokenizer = GPT2Tokenizer() + >>> text = "Hello world!" + >>> tokens = tokenizer.tokenize(text) + >>> print(tokens) + ['15496', '995', '0'] + + >>> tokenizer.decode(tokens) + 'Hello world!' + + """ + return self.bpe.decode([int(t) for t in tokens if t not in self.special_tokens]) + + def add_special_token(self, token): + """Adds a special token to the dictionary. + + Args: + token (:obj:`str`): Tthe new token/word to be added to the vocabulary. + + Returns: + The id of new token in the vocabulary. + + """ + self.special_tokens.append(token) + return self.add_symbol(token) + + def part_of_whole_word(self, token, is_bos=False): + if is_bos: + return True + s = self._decode(token) + if (len(s)==1 and (_is_whitespace(list(s)[0]) or _is_control(list(s)[0]) or _is_punctuation(list(s)[0]))): + return False + + return not s.startswith(' ') + + def sym(self, id): + return self.ids_to_tokens[id] + + def id(self, sym): + return self.vocab[sym] + + def _encode(self, x: str) -> str: + return ' '.join(map(str, self.bpe.encode(x))) + + def _decode(self, x: str) -> str: + return self.bpe.decode(map(int, x.split())) + + def add_symbol(self, word, n=1): + """Adds a word to the dictionary. + + Args: + word (:obj:`str`): Tthe new token/word to be added to the vocabulary. + n (int, optional): The frequency of the word. + + Returns: + The id of the new word. + + """ + if word in self.indices: + idx = self.indices[word] + self.count[idx] = self.count[idx] + n + return idx + else: + idx = len(self.symbols) + self.indices[word] = idx + self.symbols.append(word) + self.count.append(n) + return idx + + def save_pretrained(self, path: str): + torch.save(self.gpt2_encoder, path) diff --git a/nlu/DeBERTa/deberta/mlm.py b/nlu/DeBERTa/deberta/mlm.py new file mode 100644 index 0000000000000000000000000000000000000000..be00b2d7e9c8d2165ade05c050661baf93ba39d7 --- /dev/null +++ b/nlu/DeBERTa/deberta/mlm.py @@ -0,0 +1,38 @@ +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) Microsoft, Inc. 2020 +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
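# A short end-to-end sketch of the GPT2Tokenizer defined above, using load_vocab() from
# cache_utils.py to fetch the bpe_encoder.bin package. The pretrained id 'base', the import
# paths, and the sample sentence are assumptions for illustration.
from DeBERTa.deberta.cache_utils import load_vocab
from DeBERTa.deberta.gpt2_tokenizer import GPT2Tokenizer

vocab_path, vocab_type = load_vocab(pretrained_id='base')   # vocab_type == 'gpt2'
tokenizer = GPT2Tokenizer(vocab_path)

tokens = tokenizer.tokenize("Hello world!")        # byte-level BPE ids kept as strings
ids = tokenizer.convert_tokens_to_ids(tokens)      # remapped ids: [PAD]=0, [CLS]=1, [SEP]=2, [UNK]=3
text = tokenizer.decode(tokens)                    # -> "Hello world!"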
+ +# This piece of code is modified based on https://github.com/huggingface/transformers + +import torch +from torch import nn +import pdb + +from .bert import LayerNorm,ACT2FN + +__all__ = ['MLMPredictionHead'] + +class MLMPredictionHead(nn.Module): + def __init__(self, config, vocab_size): + super().__init__() + self.embedding_size = getattr(config, 'embedding_size', config.hidden_size) + self.dense = nn.Linear(config.hidden_size, self.embedding_size) + self.transform_act_fn = ACT2FN[config.hidden_act] \ + if isinstance(config.hidden_act, str) else config.hidden_act + + self.LayerNorm = LayerNorm(self.embedding_size, config.layer_norm_eps) + self.bias = nn.Parameter(torch.zeros(vocab_size)) + self.pre_norm = PreLayerNorm(config) + + def forward(self, hidden_states, embeding_weight): + hidden_states = self.pre_norm(hidden_states) + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + # b x s x d + hidden_states = MaskedLayerNorm(self.LayerNorm, hidden_states) + + # b x s x v + logits = torch.matmul(hidden_states, embeding_weight.t().to(hidden_states)) + self.bias + return logits diff --git a/nlu/DeBERTa/deberta/nnmodule.py b/nlu/DeBERTa/deberta/nnmodule.py new file mode 100644 index 0000000000000000000000000000000000000000..4497c75a98d720fe866944465e4b1686b1cdfdc4 --- /dev/null +++ b/nlu/DeBERTa/deberta/nnmodule.py @@ -0,0 +1,137 @@ +import pdb +import os +import torch +import copy +from torch import nn +from .config import ModelConfig +from ..utils import xtqdm as tqdm +from .cache_utils import load_model_state + +from ..utils import get_logger +logger = get_logger() + +__all__ = ['NNModule'] + +class NNModule(nn.Module): + """ An abstract class to handle weights initialization and \ + a simple interface for dowloading and loading pretrained models. + + Args: + + config (:obj:`~DeBERTa.deberta.ModelConfig`): The model config to the module + + """ + + def __init__(self, config, *inputs, **kwargs): + super().__init__() + self.config = config + + def init_weights(self, module): + """ Apply Gaussian(mean=0, std=`config.initializer_range`) initialization to the module. + + Args: + + module (:obj:`torch.nn.Module`): The module to apply the initialization. + + Example:: + + class MyModule(NNModule): + def __init__(self, config): + # Add construction instructions + self.bert = DeBERTa(config) + + # Add other modules + ... + + # Apply initialization + self.apply(self.init_weights) + + """ + if isinstance(module, (nn.Linear, nn.Embedding)): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + def export_onnx(self, onnx_path, input): + raise NotImplementedError + + @classmethod + def load_model(cls, model_path, model_config=None, tag=None, no_cache=False, cache_dir=None , *inputs, **kwargs): + """ Instantiate a sub-class of NNModule from a pre-trained model file. + + Args: + + model_path (:obj:`str`): Path or name of the pre-trained model which can be either, + + - The path of pre-trained model + + - The pre-trained DeBERTa model name in `DeBERTa GitHub releases `_, i.e. [**base, base_mnli, large, large_mnli**]. + + If `model_path` is `None` or `-`, then the method will create a new sub-class without initialing from pre-trained models. + + model_config (:obj:`str`): The path of model config file. If it's `None`, then the method will try to find the the config in order: + + 1. ['config'] in the model state dictionary. + + 2. 
`model_config.json` aside the `model_path`. + + If it failed to find a config the method will fail. + + tag (:obj:`str`, optional): The release tag of DeBERTa, default: `None`. + + no_cache (:obj:`bool`, optional): Disable local cache of downloaded models, default: `False`. + + cache_dir (:obj:`str`, optional): The cache directory used to save the downloaded models, default: `None`. If it's `None`, then the models will be saved at `$HOME/.~DeBERTa` + + Return: + + :obj:`NNModule` : The sub-class object. + + """ + # Load config + if model_config: + config = ModelConfig.from_json_file(model_config) + else: + config = None + model_config = None + model_state = None + if (model_path is not None) and (model_path.strip() == '-' or model_path.strip()==''): + model_path = None + try: + model_state, model_config = load_model_state(model_path, tag=tag, no_cache=no_cache, cache_dir=cache_dir) + except Exception as exp: + raise Exception(f'Failed to get model {model_path}. Exception: {exp}') + + if config is not None and model_config is not None: + for k in config.__dict__: + if k not in ['hidden_size', + 'intermediate_size', + 'num_attention_heads', + 'num_hidden_layers', + 'vocab_size', + 'max_position_embeddings'] or (k not in model_config.__dict__) or (model_config.__dict__[k] < 0): + model_config.__dict__[k] = config.__dict__[k] + if model_config is not None: + config = copy.copy(model_config) + vocab_size = config.vocab_size + # Instantiate model. + model = cls(config, *inputs, **kwargs) + if not model_state: + return model + # copy state_dict so _load_from_state_dict can modify it + state_dict = model_state.copy() + + missing_keys = [] + unexpected_keys = [] + error_msgs = [] + metadata = getattr(state_dict, '_metadata', None) + def load(module, prefix=''): + local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {}) + module._load_from_state_dict( + state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs) + for name, child in module._modules.items(): + if child is not None: + load(child, prefix + name + '.') + load(model) + logger.warning(f'Missing keys: {missing_keys}, unexpected_keys: {unexpected_keys}, error_msgs: {error_msgs}') + return model diff --git a/nlu/DeBERTa/deberta/ops.py b/nlu/DeBERTa/deberta/ops.py new file mode 100644 index 0000000000000000000000000000000000000000..be10d3c9afd4b13c67ad767ff32163f1f30a4e55 --- /dev/null +++ b/nlu/DeBERTa/deberta/ops.py @@ -0,0 +1,228 @@ +# Copyright (c) Microsoft, Inc. 2020 +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +# +# Author: penhe@microsoft.com +# Date: 01/15/2020 +# + +import pdb +import math +from packaging import version +import torch +from torch.nn import LayerNorm +from ..utils.jit_tracing import traceable + +if version.Version(torch.__version__) >= version.Version('1.0.0'): + from torch import _softmax_backward_data as _softmax_backward_data +else: + from torch import softmax_backward_data as _softmax_backward_data + +__all__ = ['StableDropout', 'MaskedLayerNorm', 'XSoftmax', 'ACT2FN', 'LayerNorm'] + +@traceable +class XSoftmax(torch.autograd.Function): + """ Masked Softmax which is optimized for saving memory + + Args: + + input (:obj:`torch.tensor`): The input tensor that will apply softmax. + mask (:obj:`torch.IntTensor`): The mask matrix where 0 indicate that element will be ignored in the softmax caculation. + dim (int): The dimenssion that will apply softmax. 
+ + Example:: + + import torch + from DeBERTa.deberta import XSoftmax + # Make a tensor + x = torch.randn([4,20,100]) + # Create a mask + mask = (x>0).int() + y = XSoftmax.apply(x, mask, dim=-1) + + """ + + @staticmethod + def forward(self, input, mask, dim): + """ + """ + + self.dim = dim + if version.Version(torch.__version__) >= version.Version('1.2.0a'): + rmask = ~(mask.bool()) + else: + rmask = (1-mask).byte() # This line is not supported by Onnx tracing. + + output = input.masked_fill(rmask, float('-inf')) + output = torch.softmax(output, self.dim) + output.masked_fill_(rmask, 0) + self.save_for_backward(output) + return output + + @staticmethod + def backward(self, grad_output): + """ + """ + + output, = self.saved_tensors + if version.Version(torch.__version__) >= version.Version('1.11.0a'): + inputGrad = _softmax_backward_data(grad_output, output, self.dim, output.dtype) + else: + inputGrad = _softmax_backward_data(grad_output, output, self.dim, output) + return inputGrad, None, None + + @staticmethod + def symbolic(g, self, mask, dim): + import torch.onnx.symbolic_helper as sym_help + from torch.onnx.symbolic_opset9 import masked_fill, softmax + + mask_cast_value = g.op("Cast", mask, to_i=sym_help.cast_pytorch_to_onnx['Long']) + r_mask = g.op("Cast", g.op("Sub", g.op("Constant", value_t=torch.tensor(1, dtype=torch.int64)), mask_cast_value), to_i=sym_help.cast_pytorch_to_onnx['Byte']) + output = masked_fill(g, self, r_mask, g.op("Constant", value_t=torch.tensor(float('-inf')))) + output = softmax(g, output, dim) + return masked_fill(g, output, r_mask, g.op("Constant", value_t=torch.tensor(0, dtype=torch.uint8))) + +class DropoutContext(object): + def __init__(self): + self.dropout = 0 + self.mask = None + self.scale = 1 + self.reuse_mask = True + +def get_mask(input, local_context): + if not isinstance(local_context, DropoutContext): + dropout = local_context + mask = None + else: + dropout = local_context.dropout + dropout *= local_context.scale + mask = local_context.mask if local_context.reuse_mask else None + + if dropout>0 and mask is None: + if version.Version(torch.__version__) >= version.Version('1.2.0a'): + mask=(1-torch.empty_like(input).bernoulli_(1-dropout)).bool() + else: + mask=(1-torch.empty_like(input).bernoulli_(1-dropout)).byte() + + if isinstance(local_context, DropoutContext): + if local_context.mask is None: + local_context.mask = mask + + return mask, dropout + +@traceable +class XDropout(torch.autograd.Function): + @staticmethod + def forward(ctx, input, local_ctx): + mask, dropout = get_mask(input, local_ctx) + ctx.scale=1.0/(1-dropout) + if dropout>0: + ctx.save_for_backward(mask) + return input.masked_fill(mask, 0)*ctx.scale + else: + return input + + @staticmethod + def backward(ctx, grad_output): + if ctx.scale > 1: + mask, = ctx.saved_tensors + return grad_output.masked_fill(mask, 0)*ctx.scale, None + else: + return grad_output, None + +class StableDropout(torch.nn.Module): + """ Optimized dropout module for stabilizing the training + + Args: + + drop_prob (float): the dropout probabilities + + """ + + def __init__(self, drop_prob): + super().__init__() + self.drop_prob = drop_prob + self.count = 0 + self.context_stack = None + + def forward(self, x): + """ Call the module + + Args: + + x (:obj:`torch.tensor`): The input tensor to apply dropout + + + """ + if self.training and self.drop_prob>0: + return XDropout.apply(x, self.get_context()) + return x + + def clear_context(self): + self.count = 0 + self.context_stack = None + + def init_context(self, 
reuse_mask=True, scale = 1): + if self.context_stack is None: + self.context_stack = [] + self.count = 0 + for c in self.context_stack: + c.reuse_mask = reuse_mask + c.scale = scale + + def get_context(self): + if self.context_stack is not None: + if self.count >= len(self.context_stack): + self.context_stack.append(DropoutContext()) + ctx = self.context_stack[self.count] + ctx.dropout = self.drop_prob + self.count += 1 + return ctx + else: + return self.drop_prob + +def MaskedLayerNorm(layerNorm, input, mask = None): + """ Masked LayerNorm which will apply mask over the output of LayerNorm to avoid inaccurate updatings to the LayerNorm module. + + Args: + layernorm (:obj:`~DeBERTa.deberta.LayerNorm`): LayerNorm module or function + input (:obj:`torch.tensor`): The input tensor + mask (:obj:`torch.IntTensor`): The mask to applied on the output of LayerNorm where `0` indicate the output of that element will be ignored, i.e. set to `0` + + Example:: + + # Create a tensor b x n x d + x = torch.randn([1,10,100]) + m = torch.tensor([[1,1,1,0,0,0,0,0,0,0]], dtype=torch.int) + LayerNorm = DeBERTa.deberta.LayerNorm(100) + y = MaskedLayerNorm(LayerNorm, x, m) + + """ + output = layerNorm(input).to(input) + if mask is None: + return output + if mask.dim()!=input.dim(): + if mask.dim()==4: + mask=mask.squeeze(1).squeeze(1) + mask = mask.unsqueeze(2) + mask = mask.to(output.dtype) + return output*mask + +def gelu(x): + """Implementation of the gelu activation function. + For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): + 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) + """ + return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) + + +def swish(x): + return x * torch.sigmoid(x) + +def linear_act(x): + return x + +ACT2FN = {"gelu": torch.nn.functional.gelu, "relu": torch.nn.functional.relu, "swish": swish, "tanh": torch.tanh, "linear": linear_act, 'sigmoid': torch.sigmoid} + + diff --git a/nlu/DeBERTa/deberta/pooling.py b/nlu/DeBERTa/deberta/pooling.py new file mode 100644 index 0000000000000000000000000000000000000000..c08ed79d92efecd50d97adbabd8cae7776ba421d --- /dev/null +++ b/nlu/DeBERTa/deberta/pooling.py @@ -0,0 +1,88 @@ +# +# Author: penhe@microsoft.com +# Date: 01/25/2019 +# +""" +Pooling functions +""" + +from torch import nn +import copy +import json +import pdb +from .bert import ACT2FN +from .ops import StableDropout +from .config import AbsModelConfig + +__all__ = ['PoolConfig', 'ContextPooler'] + +class PoolConfig(AbsModelConfig): + """Configuration class to store the configuration of `pool layer`. + + Parameters: + + config (:class:`~DeBERTa.deberta.ModelConfig`): The model config. The field of pool config will be initalized with the `pooling` field in model config. + + Attributes: + + hidden_size (int): Size of the encoder layers and the pooler layer, default: `768`. + + dropout (float): The dropout rate applied on the output of `[CLS]` token, + + hidden_act (:obj:`str`): The activation function of the projection layer, it can be one of ['gelu', 'tanh']. + + Example:: + + # Here is the content of an exmple model config file in json format + + { + "hidden_size": 768, + "num_hidden_layers" 12, + "num_attention_heads": 12, + "intermediate_size": 3072, + ... + "pooling": { + "hidden_size": 768, + "hidden_act": "gelu", + "dropout": 0.1 + } + } + + """ + def __init__(self, config=None): + """Constructs PoolConfig. + + Args: + `config`: the config of the model. 
The field of pool config will be initalized with the 'pooling' field in model config. + """ + + self.hidden_size = 768 + self.dropout = 0 + self.hidden_act = 'gelu' + if config: + pool_config = getattr(config, 'pooling', config) + if isinstance(pool_config, dict): + pool_config = AbsModelConfig.from_dict(pool_config) + self.hidden_size = getattr(pool_config, 'hidden_size', config.hidden_size) + self.dropout = getattr(pool_config, 'dropout', 0) + self.hidden_act = getattr(pool_config, 'hidden_act', 'gelu') + +class ContextPooler(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.dropout = StableDropout(config.dropout) + self.config = config + + def forward(self, hidden_states, mask = None): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + + context_token = hidden_states[:, 0] + context_token = self.dropout(context_token) + pooled_output = self.dense(context_token) + pooled_output = ACT2FN[self.config.hidden_act](pooled_output) + return pooled_output + + def output_dim(self): + return self.config.hidden_size diff --git a/nlu/DeBERTa/deberta/pretrained_models.py b/nlu/DeBERTa/deberta/pretrained_models.py new file mode 100644 index 0000000000000000000000000000000000000000..139597f9cb07c5d48bed18984ec4747f4b4f3438 --- /dev/null +++ b/nlu/DeBERTa/deberta/pretrained_models.py @@ -0,0 +1,2 @@ + + diff --git a/nlu/DeBERTa/deberta/spm_tokenizer.py b/nlu/DeBERTa/deberta/spm_tokenizer.py new file mode 100644 index 0000000000000000000000000000000000000000..db8fbdabe135e170b5f7ade94130881cacc64e13 --- /dev/null +++ b/nlu/DeBERTa/deberta/spm_tokenizer.py @@ -0,0 +1,322 @@ +# Copyright (c) Microsoft, Inc. 2020 +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
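# A minimal sketch of the ContextPooler defined above: it projects the hidden state of the
# first ([CLS]) token through a dense layer and the configured activation. The PoolConfig
# values here are illustrative assumptions.
import torch
from DeBERTa.deberta.config import AbsModelConfig
from DeBERTa.deberta.pooling import PoolConfig, ContextPooler

model_config = AbsModelConfig.from_dict({
    'hidden_size': 768,
    'pooling': {'hidden_size': 768, 'dropout': 0.1, 'hidden_act': 'gelu'}})
pooler = ContextPooler(PoolConfig(model_config))

hidden_states = torch.randn(2, 16, 768)      # [batch, seq_len, hidden]
pooled = pooler(hidden_states)               # pooled from position 0
print(pooled.shape)                          # torch.Size([2, 768])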
+# +# Author: penhe@microsoft.com +# Date: 11/15/2020 +# + + +import sentencepiece as sp +import six +import unicodedata +import os +import regex as re +from .cache_utils import load_vocab +from ..utils import get_logger +logger=get_logger() + + +import pdb + +__all__ = ['SPMTokenizer'] + +class SPMTokenizer: + def __init__(self, vocab_file, do_lower_case=False, special_tokens=None, bpe_dropout=0, split_by_punct=False): + self.split_by_punct = split_by_punct + spm = sp.SentencePieceProcessor() + assert os.path.exists(vocab_file) + spm.load(vocab_file) + bpe_vocab_size = spm.GetPieceSize() + # Token map + # 0+1 + # 1+1 + # 2+1 + self.vocab = {spm.IdToPiece(i):i for i in range(bpe_vocab_size)} + self.id_to_tokens = [spm.IdToPiece(i) for i in range(bpe_vocab_size)] + #self.vocab['[PAD]'] = 0 + #self.vocab['[CLS]'] = 1 + #self.vocab['[SEP]'] = 2 + #self.vocab['[UNK]'] = 3 + + _special_tokens = ['[MASK]', '[SEP]', '[PAD]', '[UNK]', '[CLS]'] + self.special_tokens = [] + if special_tokens is not None: + _special_tokens.extend(special_tokens) + for t in _special_tokens: + self.add_special_token(t) + + self.spm = spm + self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") + + def tokenize(self, text): + pieces = self._encode_as_pieces(text) + def _norm(x): + if x not in self.vocab or x=='': + return '[UNK]' + else: + return x + pieces = [_norm(p) for p in pieces] + return pieces + + def convert_tokens_to_ids(self, tokens): + return [self.vocab[t] if t in self.vocab else 1 for t in tokens] + + def convert_ids_to_tokens(self, ids): + tokens = [] + for i in ids: + tokens.append(self.ids_to_tokens[i]) + return tokens + + def decode(self, tokens, start=-1, end=-1, raw_text=None): + if raw_text is None: + return self.spm.decode_pieces([t for t in tokens if t not in self.special_tokens]) + else: + words = self.split_to_words(raw_text) + word_tokens = [self.tokenize(w) for w in words] + wt = [w for t in word_tokens for w in t] + #assert tokens == wt, f'{tokens} || {wt}' + if wt!=tokens: + for a,b in zip(wt, tokens): + if a!=b: + pdb.set_trace() + token2words = [0]*len(tokens) + tid = 0 + for i,w in enumerate(word_tokens): + for k,t in enumerate(w): + token2words[tid] = i + tid += 1 + word_start = token2words[start] + word_end = token2words[end] if end prev_end: + words.append(text[prev_end:offset]) + prev_end = offset + w = p.replace(word_start, '') + else: + w = p + try: + s = text.index(w, offset) + pn = "" + k = i+1 + while k < len(pieces): + pn = pieces[k].replace(word_start, '') + if len(pn)>0: + break + k += 1 + + if len(pn)>0 and pn in text[offset:s]: + offset = offset + 1 + else: + offset = s + len(w) + except: + offset = offset + 1 + + if prev_end< offset: + words.append(text[prev_end:offset]) + + return words + + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize("NFD", text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == "Mn": + continue + output.append(char) + return "".join(output) + + def _run_split_on_punc(self, text): + """Splits punctuation on a piece of text.""" + #words = list(re.findall(self.pat, text)) + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return ["".join(x) for x in output] + + def 
_tokenize_chinese_chars(self, text): + """Adds whitespace around any CJK character.""" + output = [] + for char in text: + cp = ord(char) + if self._is_chinese_char(cp): + output.append(" ") + output.append(char) + output.append(" ") + else: + output.append(char) + return "".join(output) + + def _is_chinese_char(self, cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. + if ((cp >= 0x4E00 and cp <= 0x9FFF) or # + (cp >= 0x3400 and cp <= 0x4DBF) or # + (cp >= 0x20000 and cp <= 0x2A6DF) or # + (cp >= 0x2A700 and cp <= 0x2B73F) or # + (cp >= 0x2B740 and cp <= 0x2B81F) or # + (cp >= 0x2B820 and cp <= 0x2CEAF) or + (cp >= 0xF900 and cp <= 0xFAFF) or # + (cp >= 0x2F800 and cp <= 0x2FA1F)): # + return True + + return False + + def _clean_text(self, text): + """Performs invalid character removal and whitespace cleanup on text.""" + output = [] + for char in text: + cp = ord(char) + if cp == 0 or cp == 0xfffd or _is_control(char): + continue + if _is_whitespace(char): + output.append(" ") + else: + output.append(char) + return "".join(output) + + +def _is_whitespace(char): + """Checks whether `chars` is a whitespace character.""" + # \t, \n, and \r are technically contorl characters but we treat them + # as whitespace since they are generally considered as such. + if char == " " or char == "\t" or char == "\n" or char == "\r": + return True + cat = unicodedata.category(char) + if cat == "Zs": + return True + return False + +def _is_control(char): + """Checks whether `chars` is a control character.""" + # These are technically control characters but we count them as whitespace + # characters. + if char == "\t" or char == "\n" or char == "\r": + return False + cat = unicodedata.category(char) + if cat.startswith("C"): + return True + return False + +def _is_punctuation(char): + """Checks whether `chars` is a punctuation character.""" + cp = ord(char) + # We treat all non-letter/number ASCII as punctuation. + # Characters such as "^", "$", and "`" are not in the Unicode + # Punctuation class but we treat them as punctuation anyways, for + # consistency. 
+ if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or + (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): + return True + cat = unicodedata.category(char) + if cat.startswith("P"): + return True + return False + +def whitespace_tokenize(text): + """Runs basic whitespace cleaning and splitting on a peice of text.""" + text = text.strip() + if not text: + return [] + tokens = text.split() + return tokens + +def convert_to_unicode(text): + """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" + if six.PY3: + if isinstance(text, str): + return text + elif isinstance(text, bytes): + return text.decode("utf-8", "ignore") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + elif six.PY2: + if isinstance(text, str): + return text.decode("utf-8", "ignore") + elif isinstance(text, unicode): + return text + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + else: + raise ValueError("Not running on Python2 or Python 3?") + diff --git a/nlu/DeBERTa/deberta/tokenizers.py b/nlu/DeBERTa/deberta/tokenizers.py new file mode 100644 index 0000000000000000000000000000000000000000..fca7a8a71d50c6dafa93101e73db263e2cc0ffa3 --- /dev/null +++ b/nlu/DeBERTa/deberta/tokenizers.py @@ -0,0 +1,16 @@ +# +# Author: penhe@microsoft.com +# Date: 04/25/2019 +# + +""" tokenizers +""" + +from .spm_tokenizer import * +from .gpt2_tokenizer import GPT2Tokenizer + +__all__ = ['tokenizers'] +tokenizers={ + 'gpt2': GPT2Tokenizer, + 'spm': SPMTokenizer + } diff --git a/nlu/DeBERTa/optims/__init__.py b/nlu/DeBERTa/optims/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e63497baeab8603a50e6d98f028c81a3a392e6f2 --- /dev/null +++ b/nlu/DeBERTa/optims/__init__.py @@ -0,0 +1,16 @@ +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +# +# Author: Pengcheng He (penhe@microsoft.com) +# Date: 05/15/2019 +# + +""" optimizers +""" + +from .xadam import XAdam +from .fp16_optimizer import * +from .lr_schedulers import SCHEDULES +from .args import get_args + diff --git a/nlu/DeBERTa/optims/args.py b/nlu/DeBERTa/optims/args.py new file mode 100644 index 0000000000000000000000000000000000000000..b89c5c35f16d0b5a3b3ef59244ef391fa9ada04a --- /dev/null +++ b/nlu/DeBERTa/optims/args.py @@ -0,0 +1,100 @@ +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+# +# Author: Pengcheng He (penhe@microsoft.com) +# Date: 05/15/2019 +# + +""" Arguments for optimizer +""" +import argparse +from ..utils import boolean_string + +__all__ = ['get_args'] +def get_args(): + parser=argparse.ArgumentParser(add_help=False, formatter_class=argparse.ArgumentDefaultsHelpFormatter) + group = parser.add_argument_group(title='Optimizer', description='Parameters for the distributed optimizer') + group.add_argument('--fp16', + default=False, + type=boolean_string, + help="Whether to use 16-bit float precision instead of 32-bit") + + group.add_argument('--loss_scale', + type=float, default=16384, + help='Loss scaling, positive power of 2 values can improve fp16 convergence.') + + group.add_argument('--scale_steps', + type=int, default=250, + help='The steps to wait to increase the loss scale.') + + group.add_argument('--lookahead_k', + default=-1, + type=int, + help="lookahead k parameter") + + group.add_argument('--lookahead_alpha', + default=0.5, + type=float, + help="lookahead alpha parameter") + + group.add_argument('--with_radam', + default=False, + type=boolean_string, + help="whether to use RAdam") + + group.add_argument('--opt_type', + type=str.lower, + default='adam', + choices=['adam', 'admax'], + help="The optimizer to be used.") + + group.add_argument("--warmup_proportion", + default=0.1, + type=float, + help="Proportion of training to perform linear learning rate warmup for. " + "E.g., 0.1 = 10%% of training.") + + group.add_argument("--lr_schedule_ends", + default=0, + type=float, + help="The ended learning rate scale for learning rate scheduling") + + group.add_argument("--lr_schedule", + default='warmup_linear', + type=str, + help="The learning rate scheduler used for traning. " + + "E.g. warmup_linear, warmup_linear_shift, warmup_cosine, warmup_constant. Default, warmup_linear") + + group.add_argument("--max_grad_norm", + default=1, + type=float, + help="The clip threshold of global gradient norm") + + group.add_argument("--learning_rate", + default=5e-5, + type=float, + help="The initial learning rate for Adam.") + + group.add_argument("--epsilon", + default=1e-6, + type=float, + help="epsilon setting for Adam.") + + group.add_argument("--adam_beta1", + default=0.9, + type=float, + help="The beta1 parameter for Adam.") + + group.add_argument("--adam_beta2", + default=0.999, + type=float, + help="The beta2 parameter for Adam.") + + group.add_argument('--weight_decay', + type=float, + default=0.01, + help="The weight decay rate") + + return parser + diff --git a/nlu/DeBERTa/optims/fp16_optimizer.py b/nlu/DeBERTa/optims/fp16_optimizer.py new file mode 100644 index 0000000000000000000000000000000000000000..044a8de0311847684dbc8313d12497def5155fa6 --- /dev/null +++ b/nlu/DeBERTa/optims/fp16_optimizer.py @@ -0,0 +1,301 @@ +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+# +# Author: Pengcheng He (penhe@microsoft.com) +# Date: 05/15/2019 +# + +""" FP16 optimizer wrapper +""" + +from collections import defaultdict +import numpy as np +import math +import torch +import pdb +import torch.distributed as dist +from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors +import ctypes + +from ..utils import get_logger,boolean_string +logger=get_logger() + +__all__ = ['Fp16Optimizer', 'ExpLossScaler', 'get_world_size'] + +def get_world_size(): + try: + wd = dist.get_world_size() + return wd + except: + return 1 + +def fused_norm(input): + return torch.norm(input, p=2, dtype=torch.float32) + +class OptParameter(torch.Tensor): + def __new__(cls, data, out_data=None, grad=None, name=None): + param = torch.Tensor._make_subclass(cls, data) + param._xgrad = grad + param.out_data = out_data + param._name = name + return param + + @property + def name(self): + return self._name + + @property + def grad(self): + return self._xgrad + + @grad.setter + def grad(self, grad): + self._xgrad = grad + +class Fp16Optimizer(object): + def __init__(self, param_groups, optimizer_fn, loss_scaler=None, grad_clip_norm = 1.0, lookahead_k = -1, lookahead_alpha = 0.5, rank=-1, distributed=False): + # all parameters should on the same device + groups = [] + original_groups = [] + self.rank = rank + self.distributed = distributed + if self.rank<0: + self.distributed = False + for group in param_groups: + if 'offset' not in group: + group['offset'] = None + if ('rank' not in group) or (not self.distributed): + group['rank'] = -1 + assert group['offset'] is None, f"{group['names']}: {group['offset']}" + group_rank = group['rank'] + params = group['params'] # parameter + if len(params) > 1: + flattened_params = _flatten_dense_tensors([p.data for p in params]) + unflattend_params = _unflatten_dense_tensors(flattened_params, [p.data for p in params]) + for uf,p in zip(unflattend_params, params): + p.data = uf + else: + flattened_params = params[0].data.view(-1) + if group['offset'] is not None: + start, length = group['offset'] + flattened_params = flattened_params.narrow(0, start, length) + + if params[0].dtype==torch.half: + if self.rank == group_rank or (not self.distributed): + master_params = flattened_params.clone().to(torch.float).detach_().to(flattened_params.device) + else: + master_params = flattened_params.clone().to(torch.float).detach_().cpu() + group['params'] = [OptParameter(master_params, flattened_params, name='master')] + else: + group['params'] = [OptParameter(flattened_params, None, name='master')] + + o_group = defaultdict(list) + o_group['names'] = group['names'] + o_group['params'] = params + o_group['rank'] = group_rank + o_group['offset'] = group['offset'] + + group['names'] = ['master'] + + original_groups.append(o_group) + groups.append(group) + self.param_groups = groups + self.loss_scaler = loss_scaler + self.optimizer = optimizer_fn(self.param_groups) + self.original_param_groups = original_groups + self.max_grad_norm = grad_clip_norm + self.lookahead_k = lookahead_k + self.lookahead_alpha = lookahead_alpha + + def backward(self, loss): + if self.loss_scaler: + loss_scale, loss, step_loss = self.loss_scaler.scale(loss) + else: + loss_scale = 1 + step_loss = loss.item() + + loss.backward() + return loss_scale, step_loss + + def step(self, lr_scale, loss_scale = 1): + grad_scale = self._grad_scale(loss_scale) + if grad_scale is None or math.isinf(grad_scale): + self.loss_scaler.update(False) + return False + + if self.lookahead_k > 0: + for p in 
self.param_groups: + if 'la_count' not in p: + # init + #make old copy + p['la_count'] = 0 + p['slow_params'] = [x.data.detach().clone().requires_grad_(False) for x in p['params']] + self.optimizer.step(grad_scale, lr_scale) + + # for group in self.param_groups: + # for p in group['params']: + # # p.data : master fp32 + # # p.out_data : fp16 tensor backing model nn.Parameters + # if hasattr(p, 'out_data') and p.out_data is not None: + # p.out_data.copy_(p.data, non_blocking=True) + + if self.lookahead_k > 0: + for p in self.param_groups: + p['la_count'] += 1 + if p['la_count'] == self.lookahead_k: + p['la_count'] = 0 + for s,f in zip(p['slow_params'], p['params']): + s.mul_(1-self.lookahead_alpha) + s.add_(f.data.detach()*self.lookahead_alpha) + f.data.copy_(s, non_blocking=True) + if hasattr(f, 'out_data') and f.out_data is not None: + f.out_data.copy_(f.data, non_blocking=True) + + if self.loss_scaler: + self.loss_scaler.update(True) + return True + + def zero_grad(self): + for group, o_group in zip(self.param_groups, self.original_param_groups): + for p in group['params']: + p.grad = None + for p in o_group['params']: + p.grad = None + + def _grad_scale(self, loss_scale = 1): + named_params = {} + named_grads = {} + for g in self.original_param_groups: + for n,p in zip(g['names'], g['params']): + named_params[n] = p + named_grads[n] = p.grad if p.grad is not None else torch.zeros_like(p.data) + + wd = get_world_size() + def _reduce(group): + grads = [named_grads[n] for n in group] + if len(grads)>1: + flattened_grads = _flatten_dense_tensors(grads) + else: + flattened_grads = grads[0].view(-1) + + if wd > 1: + flattened_grads /= wd + handle = dist.all_reduce(flattened_grads, async_op=True) + else: + handle = None + return flattened_grads, handle + + def _process_grad(group, flattened_grads, max_grad, norm): + grads = [named_grads[n] for n in group] + norm = norm.to(flattened_grads.device) + norm = norm + fused_norm(flattened_grads)**2 + + if len(grads) > 1: + unflattend_grads = _unflatten_dense_tensors(flattened_grads, grads) + else: + unflattend_grads = [flattened_grads] + + for n,ug in zip(group, unflattend_grads): + named_grads[n] = ug #.to(named_params[n].data) + + return max_grad, norm + + group_size = 0 + group = [] + max_size = 32*1024*1024 + norm = torch.zeros(1, dtype=torch.float) + max_grad = 0 + + all_grads = [] + for name in sorted(named_params.keys(), key=lambda x:x.replace('deberta.', 'bert.')): + group.append(name) + group_size += named_params[name].data.numel() + if group_size>=max_size: + flatten, handle = _reduce(group) + all_grads.append([handle, flatten, group]) + group = [] + group_size = 0 + if group_size>0: + flatten, handle = _reduce(group) + all_grads.append([handle, flatten, group]) + group = [] + group_size = 0 + for h,fg,group in all_grads: + if h is not None: + h.wait() + max_grad, norm = _process_grad(group, fg, max_grad, norm) + + norm = norm**0.5 + if torch.isnan(norm) or torch.isinf(norm): #in ['-inf', 'inf', 'nan']: + return None + + scaled_norm = norm.detach().item()/loss_scale + grad_scale = loss_scale + + if self.max_grad_norm>0: + scale = norm/(loss_scale*self.max_grad_norm) + if scale>1: + grad_scale *= scale + + for group, o_g in zip(self.param_groups, self.original_param_groups): + grads = [named_grads[n] for n in o_g['names']] + + if len(grads) > 1: + flattened_grads = _flatten_dense_tensors(grads) + else: + flattened_grads = grads[0].view(-1) + if group['offset'] is not None: + start, length = group['offset'] + flattened_grads = 
flattened_grads.narrow(0, start, length) + if group['rank'] == self.rank or (not self.distributed): + group['params'][0].grad = flattened_grads + + return grad_scale + +class ExpLossScaler: + def __init__(self, init_scale=2**16, scale_interval=1000): + self.cur_scale = init_scale + self.scale_interval = scale_interval + self.invalid_cnt = 0 + self.last_scale = 0 + self.steps = 0 + self.down_scale_smooth = 0 + + def scale(self, loss): + assert self.cur_scale > 0, self.init_scale + step_loss = loss.float().detach().item() + if step_loss != 0 and math.isfinite(step_loss): + loss_scale = self.cur_scale + else: + loss_scale = 1 + loss = loss.float()*loss_scale + return (loss_scale, loss, step_loss) + + def update(self, is_valid = True): + if not is_valid: + self.invalid_cnt += 1 + if self.invalid_cnt>self.down_scale_smooth: + self.cur_scale /= 2 + self.cur_scale = max(self.cur_scale, 1) + self.last_scale = self.steps + else: + self.invalid_cnt = 0 + if self.steps - self.last_scale>self.scale_interval: + self.cur_scale *= 2 + self.last_scale = self.steps + self.steps += 1 + + def state_dict(self): + state = defaultdict(float) + state['steps'] = self.steps + state['invalid_cnt'] = self.invalid_cnt + state['cur_scale'] = self.cur_scale + state['last_scale'] = self.last_scale + return state + + def load_state_dict(self, state): + self.steps = state['steps'] + self.invalid_cnt = state['invalid_cnt'] + self.cur_scale = state['cur_scale'] + self.last_scale = state['last_scale'] diff --git a/nlu/DeBERTa/optims/lr_schedulers.py b/nlu/DeBERTa/optims/lr_schedulers.py new file mode 100644 index 0000000000000000000000000000000000000000..51be4c02f1d4cd8ff59c7f092efa8ef6a0011126 --- /dev/null +++ b/nlu/DeBERTa/optims/lr_schedulers.py @@ -0,0 +1,63 @@ +""" Learning rate schedulers +""" + +import math +import torch +from torch.optim import Optimizer +from torch.nn.utils import clip_grad_norm_ + +def warmup_cosine(step, total, warmup=0.002, ends = 0): + x = step/total + x = x-int(x) + if x < warmup: + return x/warmup + return 0.5 * (1.0 + math.cos(math.pi * x)) + +def warmup_constant(step, total, warmup=0.002, ends = 0): + x = step/total + x = x-int(x) + if x < warmup: + return x/warmup + return 1.0 + +def warmup_linear(step, total, warmup=0.002, ends = 0): + x = step/total + x = x-int(x) + if x < warmup: + return x/warmup + return (1-ends)*(1.0 - x) + ends + +def warmup_linear_cosine(step, total, warmup=0.002, ends = 0): + x = step/total + x = x-int(x) + if x < warmup: + return x/warmup + return (1-ends)*max(0.5*(1+math.cos(math.pi*(x-warmup)/(1-warmup))), 0) + ends + +def warmup_cyclic_linear_cosine(step, total, warmup=0.002, ends = 0): + x = step/total + if x < warmup: + return x/warmup + total = total - int(total*warmup) + step = step - int(total*warmup) + n_epoch = 4 + period = total//n_epoch + k = step//period + s = 1-k/n_epoch + 1/(2*n_epoch)*(math.pow(-1, k)*math.cos(math.pi*step/period)-1) + return (1-ends)*max(s, 0) + ends + +def warmup_linear_shift(step, total, warmup=0.002, ends = 0): + x = step/total + x = x-int(x) + if x < warmup: + return x/warmup + return (1-ends)*(1.0 - (x-warmup)/(1-warmup)) + ends + +SCHEDULES = { + 'warmup_cosine':warmup_cosine, + 'warmup_constant':warmup_constant, + 'warmup_linear':warmup_linear, + 'warmup_linear_cosine':warmup_linear_cosine, + 'warmup_cyclic_linear_cosine':warmup_cyclic_linear_cosine, + 'warmup_linear_shift':warmup_linear_shift, +} diff --git a/nlu/DeBERTa/optims/xadam.py b/nlu/DeBERTa/optims/xadam.py new file mode 100644 index 
0000000000000000000000000000000000000000..373075c306c13e6d137e939d3db4eaa0099d2a6f --- /dev/null +++ b/nlu/DeBERTa/optims/xadam.py @@ -0,0 +1,214 @@ +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +# +# Author: Pengcheng He (penhe@microsoft.com) +# Date: 05/15/2019 +# + +""" Optimizer +""" + +import math +import torch +from torch.optim import Optimizer +from torch.nn.utils import clip_grad_norm_ +from torch import distributed as dist +import pdb +from .lr_schedulers import SCHEDULES +from ..utils import get_logger + +def adamw(data, + out_data, + next_m, + next_v, + grad, + lr, + beta1, + beta2, + eps, + grad_scale, #combined_scale, g = g/scale + step, + eps_mode = 1, #self.eps_mode, esp inside sqrt:0, outside: 1, only update with momentum: 2 + bias_correction = 0, + weight_decay = 0): + if bias_correction > 0: + lr *= bias_correction + beta1_ = 1 - beta1 + beta2_ = 1 - beta2 + grad = grad.float() + if grad_scale != 1: + grad *= 1/grad_scale + next_m.mul_(beta1).add_(beta1_, grad) + # admax + admax = eps_mode>>4 + eps_mode = eps_mode&0xF + if admax > 0: + torch.max(next_v.mul_(beta2), grad.abs().to(next_v), out=next_v) + update = next_m/(next_v+eps) + else: + next_v.mul_(beta2).addcmul_(beta2_, grad, grad) + if eps_mode == 0: + update = (next_m)*(next_v+eps).rsqrt() + elif eps_mode == 1: + update = (next_m)/(next_v.sqrt()+eps) + else: #=2 + update = next_m.clone() + if weight_decay>0: + update.add_(weight_decay, data) + + data.add_(-lr, update) + if (out_data is not None) and len(out_data)>0: + out_data.copy_(data) + +class XAdam(Optimizer): + """Implements optimized version of Adam algorithm with weight decay fix. + Params: + lr: learning rate + warmup: portion of t_total for the warmup, -1 means no warmup. Default: -1 + t_total: total number of training steps for the learning + rate schedule, -1 means constant learning rate. Default: -1 + schedule: schedule to use for the warmup (see above). Default: 'warmup_linear' + b1: Adams b1. Default: 0.9 + b2: Adams b2. Default: 0.999 + e: Adams epsilon. Default: 1e-6 + weight_decay_rate: Weight decay. Default: 0.01 + max_grad_norm: Maximum norm for the gradients (-1 means no clipping). Default: 1.0 + with_radam: Whether to enable radam. Default: False + radam_th: RAdam threshold for tractable variance. 
Default: 4 + opt_type: The type of optimizer, [adam, admax], default: adam + """ + def __init__(self, params, lr, warmup=-1, t_total=-1, schedule='warmup_linear', + b1=0.9, b2=0.999, e=1e-8, weight_decay_rate=0.01, + lr_ends = 0, + max_grad_norm = 1.0, + with_radam = False, + radam_th = 4, + opt_type=None, + rank = -1): + if not lr >= 0.0: + raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr)) + if schedule not in SCHEDULES: + raise ValueError("Invalid schedule parameter: {}".format(schedule)) + if not 0.0 <= warmup < 1.0 and not warmup == -1: + raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup)) + if not 0.0 <= b1 < 1.0: + raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1)) + if not 0.0 <= b2 < 1.0: + raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2)) + if not e >= 0.0: + raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e)) + self.defaults = dict(lr=lr, schedule=schedule, warmup=warmup, t_total=t_total, + b1=b1, b2=b2, e=e, weight_decay_rate=weight_decay_rate, + lr_ends = lr_ends, + max_grad_norm=max_grad_norm, + with_radam = with_radam, radam_th = radam_th) + self.opt_type = opt_type.lower() if opt_type is not None else "" + self.rank = rank + super().__init__(params, self.defaults) + + def step(self, grad_scale = 1, lr_scale = 1): + """Performs a single optimization step. + + Arguments: + grad_scale: divid grad by grad_scale + lr_scale: scale learning rate by bs_scale + """ + if 'global_step' not in self.state: + self.state['global_step'] = 0 + for group in self.param_groups: + lr_sch = self.get_group_lr_sch(group, self.state['global_step']) + if group['rank'] == self.rank or group['rank']<0 or self.rank<0: + for param in group['params']: + self.update_param(group, param, grad_scale, lr_scale) + + self.state['global_step'] += 1 + self.last_grad_scale = grad_scale + handels = [] + for group in self.param_groups: + if group['rank']>=0 and self.rank>=0: + # sync + for param in group['params']: + out_p = param.out_data if hasattr(param, 'out_data') and (param.out_data is not None) else None + if out_p is not None: + h = torch.distributed.broadcast(out_p, group['rank'], async_op=True) + else: + h = torch.distributed.broadcast(param.data, group['rank'], async_op=True) + handels.append(h) + + for h in handels: + if h is not None: + h.wait() + + return lr_sch + + def get_group_lr_sch(self, group, steps): + if group['t_total'] > 0: + schedule_fct = SCHEDULES[group['schedule']] + lr_scheduled = schedule_fct(steps, group['t_total'], group['warmup'], group['lr_ends']) + else: + lr_scheduled = 1 + return lr_scheduled + + def update_param(self, group, param, grad_scale, lr_scale): + grad = param.grad + if grad.is_sparse: + raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') + state = self.get_state(param) + lr_sch = self.get_group_lr_sch(group, state['step']) + lr = group['lr'] * lr_scale *lr_sch + next_m, next_v = state['next_m'], state['next_v'] + beta1, beta2 = group['b1'], group['b2'] + state['step'] += 1 + + # Support for RAdam + t = (state['step']-1) + 1 + eps_mode = 1 + if group['with_radam']: + rou_ = 2/(1-beta2) - 1 + rou_t = rou_ - 2*t/(beta2**-t - 1) + bias_c = 1/(1-beta1**t) + if rou_t > group['radam_th']: + bias_c *= math.sqrt(1 - beta2**t) + bias_c *= math.sqrt(((rou_t - 4)*(rou_t - 2)*rou_)/((rou_ - 4)*(rou_ - 2)*rou_t)) + else: + eps_mode = 2 + bias_c = 0 + lr *= bias_c + + if self.opt_type == 'admax': 
+ eps_mode |= 0x10 + + with torch.cuda.device(param.device.index): + out_p = param.out_data if hasattr(param, 'out_data') and (param.out_data is not None) else None + if out_p is None or out_p.dtype != grad.dtype: + out_p = torch.tensor([], dtype=torch.float).to(param.data) + + weight_decay = group['weight_decay_rate'] + adamw(param.data, + out_p, + next_m, + next_v, + grad, + lr, + beta1, + beta2, + group['e'], + grad_scale, #combined_scale, g = g/scale + state['step'], + eps_mode, #self.eps_mode, esp inside sqrt:0, outside: 1, only update with momentum: 2 + 0, #bias_correction, + weight_decay) + + out_p = param.out_data if hasattr(param, 'out_data') and (param.out_data is not None) else None + if out_p is not None and out_p.dtype != grad.dtype: + out_p.copy_(param.data) + + def get_state(self, param): + state = self.state[param] + # State initialization + if len(state) == 0: + state['step'] = 0 + state['next_m'] = torch.zeros_like(param.data) + state['next_v'] = torch.zeros_like(param.data) + return state diff --git a/nlu/DeBERTa/sift/README.md b/nlu/DeBERTa/sift/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e82b8083db75c7b77d1673b467f117e44e0b87a8 --- /dev/null +++ b/nlu/DeBERTa/sift/README.md @@ -0,0 +1,53 @@ +# SiFT (Scale Invariant Fine-Tuning) + +## Usage + +For example to try SiFT in DeBERTa, please check `experiments/glue/mnli.sh base-sift` or `experiments/glue/mnli.sh xxlarge-v2-sift` + + +Here is an example to consume SiFT in your existing code, + + ```python + # Create DeBERTa model + adv_modules = hook_sift_layer(model, hidden_size=768) + adv = AdversarialLearner(model, adv_modules) + def logits_fn(model, *wargs, **kwargs): + logits,_ = model(*wargs, **kwargs) + return logits + logits,loss = model(**data) + + loss = loss + adv.loss(logits, logits_fn, **data) + # Other steps is the same as general training. 
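+  # Illustrative continuation only (assumes you already have an `optimizer`
+  # for the model; it is not part of the SiFT API):
+  # loss.backward()
+  # optimizer.step()
+  # optimizer.zero_grad()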
+ + ``` + +## Ablation study results + + +| Model | MNLI-m/mm | SST-2 | QNLI | CoLA | RTE | MRPC | QQP |STS-B | +|---------------------------|-------------|-------|------|------|--------|-------|-------|------| +| | Acc | Acc | Acc | MCC | Acc |Acc/F1 |Acc/F1 |P/S | +|**[DeBERTa-V2-XXLarge](https://huggingface.co/microsoft/deberta-v2-xxlarge)1,2**|91.7/91.9|97.2|96.0|72.0| 93.5| **93.1/94.9**|92.7/90.3 |93.2/93.1 | +|**[DeBERTa-V2-XXLarge+SiFT](https://huggingface.co/microsoft/deberta-v2-xxlarge)1,2**|**92.0/92.1**|97.5|**96.5**|**73.5**| **96.5**| - |**93.0/90.7** | - | + +# Citation +``` +@misc{he2020deberta, + title={DeBERTa: Decoding-enhanced BERT with Disentangled Attention}, + author={Pengcheng He and Xiaodong Liu and Jianfeng Gao and Weizhu Chen}, + year={2020}, + eprint={2006.03654}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} + +@article{Jiang_2020, + title={SMART: Robust and Efficient Fine-Tuning for Pre-trained Natural Language Models through Principled Regularized Optimization}, + url={http://dx.doi.org/10.18653/v1/2020.acl-main.197}, + DOI={10.18653/v1/2020.acl-main.197}, + journal={Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics}, + publisher={Association for Computational Linguistics}, + author={Jiang, Haoming and He, Pengcheng and Chen, Weizhu and Liu, Xiaodong and Gao, Jianfeng and Zhao, Tuo}, + year={2020} +} +``` diff --git a/nlu/DeBERTa/sift/__init__.py b/nlu/DeBERTa/sift/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..53c8e522655a235f120c2659e2f2959ff7709640 --- /dev/null +++ b/nlu/DeBERTa/sift/__init__.py @@ -0,0 +1 @@ +from .sift import * diff --git a/nlu/DeBERTa/sift/sift.py b/nlu/DeBERTa/sift/sift.py new file mode 100644 index 0000000000000000000000000000000000000000..e7b1de8e6c3159922b1df4698aecf43e78d9ddb6 --- /dev/null +++ b/nlu/DeBERTa/sift/sift.py @@ -0,0 +1,210 @@ +# Copyright (c) Microsoft, Inc. 2020 +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+# +# Author: penhe@microsoft.com +# Date: 01/25/2021 +# + +import torch +import torch.nn.functional as F + +__all__ = ['PerturbationLayer', 'AdversarialLearner', 'hook_sift_layer'] + +class PerturbationLayer(torch.nn.Module): + def __init__(self, hidden_size, learning_rate=1e-4, init_perturbation=1e-2): + super().__init__() + self.learning_rate = learning_rate + self.init_perturbation = init_perturbation + self.delta = None + self.LayerNorm = torch.nn.LayerNorm(hidden_size, 1e-7, elementwise_affine=False) + self.adversarial_mode = False + + def adversarial_(self, adversarial = True): + self.adversarial_mode = adversarial + if not adversarial: + self.delta = None + + def forward(self, input): + if not self.adversarial_mode: + self.input = self.LayerNorm(input) + return self.input + else: + if self.delta is None: + self.update_delta(requires_grad=True) + return self.perturbated_input + + def update_delta(self, requires_grad = False): + if not self.adversarial_mode: + return True + if self.delta is None: + delta = torch.clamp(self.input.new(self.input.size()).normal_(0, self.init_perturbation).float(), -2*self.init_perturbation, 2*self.init_perturbation) + else: + grad = self.delta.grad + self.delta.grad = None + delta = self.delta + norm = grad.norm() + if torch.isnan(norm) or torch.isinf(norm): + return False + eps = self.learning_rate + with torch.no_grad(): + delta = delta + eps*grad/(1e-6 + grad.abs().max(-1, keepdim=True)[0]) + self.delta = delta.float().detach().requires_grad_(requires_grad) + self.perturbated_input = (self.input.to(delta).detach() + self.delta).to(self.input) + return True + +def hook_sift_layer(model, hidden_size, learning_rate=1e-4, init_perturbation=1e-2, target_module = 'embeddings.LayerNorm'): + """ + Hook the sift perturbation layer to and existing model. With this method, you can apply adversarial training + without changing the existing model implementation. + + Params: + `model`: The model instance to apply adversarial training + `hidden_size`: The dimmension size of the perturbated embedding + `learning_rate`: The learning rate to update the perturbation + `init_perturbation`: The initial range of perturbation + `target_module`: The module to apply perturbation. It can be the name of the sub-module of the model or the sub-module instance. + The perturbation layer will be inserted before the sub-module. + + Outputs: + The perturbation layers. + + """ + + if isinstance(target_module, str): + _modules = [k for n,k in model.named_modules() if target_module in n] + else: + assert isinstance(target_module, torch.nn.Module), f'{type(target_module)} is not an instance of torch.nn.Module' + _modules = [target_module] + adv_modules = [] + for m in _modules: + adv = PerturbationLayer(hidden_size, learning_rate, init_perturbation) + def adv_hook(module, inputs): + return adv(inputs[0]) + for h in list(m._forward_pre_hooks.keys()): + if m._forward_pre_hooks[h].__name__ == 'adv_hook': + del m._forward_pre_hooks[h] + m.register_forward_pre_hook(adv_hook) + adv_modules.append(adv) + return adv_modules + +class AdversarialLearner: + """ Adversarial Learner + This class is the helper class for adversarial training. + + Params: + `model`: The model instance to apply adversarial training + `perturbation_modules`: The sub modules in the model that will generate perturbations. If it's `None`, + the constructor will detect sub-modules of type `PerturbationLayer` in the model. 
+ + Example usage: + ```python + # Create DeBERTa model + adv_modules = hook_sift_layer(model, hidden_size=768) + adv = AdversarialLearner(model, adv_modules) + def logits_fn(model, *wargs, **kwargs): + logits,_ = model(*wargs, **kwargs) + return logits + logits,loss = model(**data) + + loss = loss + adv.loss(logits, logits_fn, **data) + # Other steps is the same as general training. + + ``` + + """ + def __init__(self, model, adv_modules=None): + if adv_modules is None: + self.adv_modules = [m for m in model.modules() if isinstance(m, PerturbationLayer)] + else: + self.adv_modules = adv_modules + self.parameters = [p for p in model.parameters()] + self.model = model + + def loss(self, target, logits_fn, loss_fn = 'symmetric-kl', *wargs, **kwargs): + """ + Calculate the adversarial loss based on the given logits fucntion and loss function. + Inputs: + `target`: the logits from original inputs. + `logits_fn`: the function that produces logits based on perturbated inputs. E.g., + ```python + def logits_fn(model, *wargs, **kwargs): + logits = model(*wargs, **kwargs) + return logits + ``` + `loss_fn`: the function that caclulate the loss from perturbated logits and target logits. + - If it's a string, it can be pre-built loss functions, i.e. kl, symmetric_kl, mse. + - If it's a function, it will be called to calculate the loss, the signature of the function will be, + ```python + def loss_fn(source_logits, target_logits): + # Calculate the loss + return loss + ``` + `*wargs`: the positional arguments that will be passed to the model + `**kwargs`: the key-word arguments that will be passed to the model + Outputs: + The loss based on pertubated inputs. + """ + self.prepare() + if isinstance(loss_fn, str): + loss_fn = perturbation_loss_fns[loss_fn] + pert_logits = logits_fn(self.model, *wargs, **kwargs) + pert_loss = loss_fn(pert_logits, target.detach()).sum() + pert_loss.backward() + for m in self.adv_modules: + ok = m.update_delta(True) + + for r,p in zip(self.prev, self.parameters): + p.requires_grad_(r) + pert_logits = logits_fn(self.model, *wargs, **kwargs) + pert_loss = symmetric_kl(pert_logits, target) + + self.cleanup() + return pert_loss.mean() + + def prepare(self): + self.prev = [p.requires_grad for p in self.parameters] + for p in self.parameters: + p.requires_grad_(False) + for m in self.adv_modules: + m.adversarial_(True) + + def cleanup(self): + for r,p in zip(self.prev, self.parameters): + p.requires_grad_(r) + + for m in self.adv_modules: + m.adversarial_(False) + +def symmetric_kl(logits, target): + logit_stu = logits.view(-1, logits.size(-1)).float() + logit_tea = target.view(-1, target.size(-1)).float() + logprob_stu = F.log_softmax(logit_stu, -1) + logprob_tea = F.log_softmax(logit_tea, -1) + prob_tea = logprob_tea.exp().detach() + prob_stu = logprob_stu.exp().detach() + floss = ((prob_tea*(-logprob_stu)).sum(-1)) # Cross Entropy + bloss = ((prob_stu*(-logprob_tea)).sum(-1)) # Cross Entropy + loss = floss + bloss + return loss + +def kl(logits, target): + logit_stu = logits.view(-1, logits.size(-1)).float() + logit_tea = target.view(-1, target.size(-1)).float() + logprob_stu = F.log_softmax(logit_stu, -1) + logprob_tea = F.log_softmax(logit_tea.detach(), -1) + prob_tea = logprob_tea.exp() + loss = ((prob_tea*(-logprob_stu)).sum(-1)) # Cross Entropy + return loss + +def mse(logits, target): + logit_stu = logits.view(-1, logits.size(-1)).float() + logit_tea = target.view(-1, target.size(-1)).float() + return F.mse_loss(logit_stu.view(-1),logit_tea.view(-1)) + 
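+# Added note: a minimal sketch of a user-defined perturbation loss. Any callable
+# with the signature loss_fn(perturbed_logits, target_logits) can be passed to
+# AdversarialLearner.loss(...) instead of the names registered below; `smoothed_l1`
+# is a hypothetical example, not part of the original code.
+#
+# def smoothed_l1(logits, target):
+#   logit_stu = logits.view(-1, logits.size(-1)).float()
+#   logit_tea = target.view(-1, target.size(-1)).float().detach()
+#   return F.smooth_l1_loss(logit_stu, logit_tea, reduction='none').sum(-1)
+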
+perturbation_loss_fns = { + 'symmetric-kl': symmetric_kl, + 'kl': kl, + 'mse': mse + } diff --git a/nlu/DeBERTa/training/__init__.py b/nlu/DeBERTa/training/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..457c4574297d2c1927c524e2b8230d57a6585307 --- /dev/null +++ b/nlu/DeBERTa/training/__init__.py @@ -0,0 +1,4 @@ +from .trainer import DistributedTrainer, set_random_seed +from .args import get_args +from .dist_launcher import initialize_distributed,kill_children +from ._utils import batch_to,batch_apply diff --git a/nlu/DeBERTa/training/_utils.py b/nlu/DeBERTa/training/_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..5a54e53c72225b81f5495e631903b124929009bf --- /dev/null +++ b/nlu/DeBERTa/training/_utils.py @@ -0,0 +1,16 @@ +import torch +from collections.abc import Sequence, Mapping + +def batch_apply(batch, fn): + if isinstance(batch, torch.Tensor): + return fn(batch) + elif isinstance(batch, Sequence): + return [batch_apply(x, fn) for x in batch] + elif isinstance(batch, Mapping): + return {x:batch_apply(batch[x], fn) for x in batch} + else: + raise NotImplementedError(f'Type of {type(batch)} are not supported in batch_apply') + +def batch_to(batch, device): + return batch_apply(batch, lambda x: x.to(device)) + diff --git a/nlu/DeBERTa/training/args.py b/nlu/DeBERTa/training/args.py new file mode 100644 index 0000000000000000000000000000000000000000..771ee12f585dfb7123d7e5961641684db55230ce --- /dev/null +++ b/nlu/DeBERTa/training/args.py @@ -0,0 +1,72 @@ +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +# +# Author: Pengcheng He (penhe@microsoft.com) +# Date: 05/15/2019 +# + +import argparse +from ..utils import boolean_string + +__all__ = ['get_args'] + +def get_args(): + parser=argparse.ArgumentParser(add_help=False, formatter_class=argparse.ArgumentDefaultsHelpFormatter) + group = parser.add_argument_group(title='Trainer', description='Parameters for the distributed trainer') + group.add_argument('--accumulative_update', + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.") + + group.add_argument("--dump_interval", + default=1000, + type=int, + help="Interval steps for generating checkpoint.") + + group.add_argument("--local_rank", + type=int, + default=-1, + help="local_rank for distributed training on gpus") + + group.add_argument('--workers', + type=int, + default=2, + help="The workers to load data.") + + group.add_argument("--num_train_epochs", + default=3.0, + type=float, + help="Total number of training epochs to perform.") + + group.add_argument('--seed', + type=int, + default=1234, + help="random seed for initialization") + + group.add_argument("--train_batch_size", + default=64, + type=int, + help="Total batch size for training.") + + group.add_argument("--world_size", + type=int, + default=-1, + help="[Internal] The world size of distributed training. Internal usage only!! To the world size of the program, you need to use environment. 'WORLD_SIZE'") + + group.add_argument("--rank", + type=int, + default=-1, + help="[Internal] The rank id of current process. Internal usage only!! To the rank of the program, you need to use environment. 'RANK'") + + group.add_argument("--master_ip", + type=str, + default=None, + help="[Internal] The ip address of master node. Internal usage only!! To the master IP of the program, you need to use environment. 
'MASTER_ADDR'") + + group.add_argument("--master_port", + type=str, + default=None, + help="[Internal] The port of master node. Internal usage only!! To the master IP of the program, you need to use environment. 'MASTER_PORT'") + + return parser diff --git a/nlu/DeBERTa/training/dist_launcher.py b/nlu/DeBERTa/training/dist_launcher.py new file mode 100644 index 0000000000000000000000000000000000000000..bdeba141349ee311dd362453cddbe035af2b0125 --- /dev/null +++ b/nlu/DeBERTa/training/dist_launcher.py @@ -0,0 +1,163 @@ +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +# +# Author: Pengcheng He (penhe@microsoft.com) +# Date: 05/15/2019 +# + +import os +import time +import pdb +import signal +import torch +from multiprocessing import Process,Pool +from collections import defaultdict +import sys +import psutil +from ..utils import set_logger, get_logger +logger = get_logger() + +def kill_children(proc=None, recursive = True): + if proc is None: + proc = psutil.Process() + _children = proc.children(recursive=False) + for c in _children: + try: + if recursive: + kill_children(c, recursive=recursive) + os.kill(c.pid, signal.SIGKILL) + except: + pass + + for c in _children: + try: + c.wait(1) + except: + pass + +def gc(i): + return torch.cuda.device_count() + +def get_ngpu(): + with Pool(1) as p: + return p.map(gc, range(1))[0] + +def _setup_distributed_group(args): + """Initialize torch.distributed.""" + + torch.backends.cudnn.enabled = False + if args.world_size == 1: + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + else: + set_logger(args.task_name, os.path.join(args.output_dir, f'training_{args.task_name}_{args.rank}.log'), rank=args.rank, verbose=1 if args.local_rank==0 else 0) + device_id = args.rank % args.n_gpu + if args.local_rank >= 0: + device_id = args.local_rank + device = torch.device("cuda", device_id) + init_method = 'tcp://' + init_method += args.master_ip + ':' + args.master_port + distributed_backend = getattr(args, 'distributed_backend', 'nccl') + torch.distributed.init_process_group( + backend=distributed_backend, + world_size=args.world_size, rank=args.rank, + init_method=init_method) + torch.cuda.set_device(device) + n_gpu = torch.cuda.device_count() + logger.info("device=%s, n_gpu=%d, distributed training=%r, world_size=%d", device, n_gpu, bool(args.world_size != 1), args.world_size) + return device + +def _get_world_size(args): + world_size = int(os.getenv("WORLD_SIZE", '1')) + if not hasattr(args, 'n_gpu') or args.n_gpu is None: + n_gpu = get_ngpu() + return n_gpu * world_size + +def initialize_distributed(args, join=True): + args.world_size = int(os.getenv("WORLD_SIZE", '1')) + args.rank = int(os.getenv('RANK', '0')) + args.master_ip = os.getenv('MASTER_ADDR', 'localhost') + args.master_port = os.getenv('MASTER_PORT', '17006') + + if args.world_size == 1: + args.rank = 0 + args.master_ip = 'localhost' + + if not hasattr(args, 'n_gpu') or args.n_gpu is None: + args.n_gpu = get_ngpu() + + args.node_rank = args.rank + args.world_size = args.n_gpu * args.world_size + seed = args.seed + is_child = False + if args.world_size>1: + children = [] + for r in range(args.n_gpu): + args.rank = r + args.n_gpu*args.node_rank + args.local_rank = r + args.seed = seed + args.rank + child = os.fork() + if child>0: + children.append(child) + else: + signal.signal(signal.SIGINT, signal.SIG_IGN) + is_child = True + break + else: + is_child = True + + if is_child: + return 
_setup_distributed_group(args) + else: + if join: + try: + for c in children: + cid, ccode = os.waitpid(0,0) + logger.debug(f'Worker {c} done with code {ccode}') + if ccode != 0: + logger.error(f'Worker {c} : {cid} failed with code {ccode}') + kill_children() + raise ValueError(f'Job failed. {cid}:{ccode}') + except (KeyboardInterrupt, SystemExit): + logger.warning('Keybord interrupt by user. Terminate all processes') + kill_children(None) + return children + +def test_dist_launch(): + def test_functions(args): + global logger + set_logger(args.task_name, os.path.join(args.output_dir, f'training_{args.task_name}_{args.node_rank}.log'), rank=args.rank) + logger.info(args) + + class Args: + def __init__(self): + pass + def __repr__(self): + return str(self.__dict__) + + args = Args() + args.task_name = 'test' + args.seed = 0 + args.n_gpu = None + args.no_cuda=False + args.output_dir = '/tmp' + distributed_launch(args, test_functions, (args,)) + +def test_init_dist(): + class Args: + def __init__(self): + pass + def __repr__(self): + return str(self.__dict__) + + args = Args() + args.task_name = 'test' + args.seed = 0 + args.n_gpu = None + args.no_cuda=False + args.output_dir = '/tmp' + device = initialize_distributed(args) + if isinstance(device, torch.device): + return 0 + else: + return 1 diff --git a/nlu/DeBERTa/training/optimizer_utils.py b/nlu/DeBERTa/training/optimizer_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..5a7279a4259e6b131f6bd9022c975215105219ee --- /dev/null +++ b/nlu/DeBERTa/training/optimizer_utils.py @@ -0,0 +1,181 @@ +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +# +# Author: Pengcheng He (penhe@microsoft.com) +# Date: 05/15/2019 +# + +from collections import defaultdict +import numpy as np +import pdb +from functools import cmp_to_key +import torch +import re +from ..optims import Fp16Optimizer,XAdam,ExpLossScaler,get_world_size +from ..utils import get_logger +logger=get_logger() + + + +def xadam_factory(args, training_steps=None): + def optimizer_fn(param_groups, max_grad_norm=None): + with_radam = getattr(args, 'with_radam', False) + opt_type = getattr(args, 'opt_type', None) + optimizer = XAdam(param_groups, + lr=args.learning_rate, + b1=args.adam_beta1, + b2=args.adam_beta2, + lr_ends=args.lr_schedule_ends, + e=args.epsilon, + warmup=args.warmup_proportion if args.warmup_proportion<1 else args.warmup_proportion/training_steps, + t_total=training_steps, + schedule=args.lr_schedule, + max_grad_norm = args.max_grad_norm if max_grad_norm is None else max_grad_norm, + weight_decay_rate = args.weight_decay, + with_radam = with_radam, + opt_type = opt_type, + rank = args.rank) + return optimizer + + return optimizer_fn + +def create_xoptimizer(model, args, num_train_steps=None, no_decay=['bias', 'LayerNorm.weight']): + if args.fp16: + loss_scaler = ExpLossScaler(scale_interval = args.scale_steps, init_scale=args.loss_scale) + else: + loss_scaler = None + + distributed_optimizer = getattr(args, 'distributed_optimizer', True) + max_distributed_groups = getattr(args, 'max_distributed_groups', 1000000) + world_size = get_world_size() + if world_size<=1: + distributed_optimizer = False + + _no_decay = [x.strip() for x in getattr(args, 'no_decay', '').split('|') if len(x.strip())>0] + if len(_no_decay)>0: + no_decay = _no_decay + + opt_fn = xadam_factory(args, num_train_steps) + + named_params = [(n,p) for n,p in model.named_parameters() if p.requires_grad] + 
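+  # Added note: the code below appears to pack parameters into roughly equal-sized,
+  # flattened groups; when distributed_optimizer is enabled each group is assigned an
+  # owner 'rank', so only that rank keeps the fp32 master copy and optimizer state and
+  # broadcasts the updated weights back to the other ranks (see Fp16Optimizer and
+  # XAdam.step).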
param_size = [p.numel() for n,p in named_params] + type_groups = defaultdict(list) + if distributed_optimizer: + num_groups = min(world_size, max_distributed_groups) + max_group_size = (sum(param_size)+num_groups-1)//num_groups + #max_group_size = max(64*1024*1024, max_group_size) + #max_group_size = max_group_size//2 + max_group_size = (max_group_size//32)*32 + group_sizes = [0 for _ in range(num_groups)] + group_ranks = [g*(world_size//num_groups) for g in range(num_groups)] + else: + # TODO: Fix inconsistent results with different group size + max_group_size = max(64*1024*1024, max(param_size)) + num_groups = (sum(param_size)+max_group_size-1)//max_group_size + group_sizes = [0 for _ in range(num_groups)] + + def get_smallest_group(group_sizes): + return np.argmin([g+i/10000 for i,g in enumerate(group_sizes)]) + + def chunk_into_pieces(param, max_size): + num_chunks = param.numel()//max_size + if num_chunks<2: + return [param], [None] + + flat = param.view(-1) + chunks=[] + offsets = [] + for i in range(num_chunks-1): + chunks.append(flat.narrow(0, i*max_size, max_size)) + offsets.append([i*max_size, max_size]) + i += 1 + chunks.append(flat.narrow(0, i*max_size, flat.size(0)-i*max_size)) + offsets.append([i*max_size, flat.size(0)-i*max_size]) + assert sum([c.numel() for c in chunks])==param.numel(), f'{param.numel()}: {offsets}' + return chunks, offsets + + def param_cmp(x,y): + n1,p1 = x + n2,p2 = y + if p1.numel() == p2.numel(): + if n1n2: + return 1 + else: + return 0 + else: + return p1.numel() - p2.numel() + + def add_group(param_groups, group, group_id): + if distributed_optimizer: + group['rank'] = group_ranks[group_id] + param_groups.append(group.copy()) + group['params'] = [] + group['names'] = [] + group['offset'] = None + return get_smallest_group(group_sizes),group + + hard_reset = getattr(args, 'hard_reset', False) + group_id = 0 + for n,p in named_params: + key = '' + if any(re.search(nd,n) for nd in no_decay): + key += f'{str(p.dtype)}-nd' + else: + key += f'{str(p.dtype)}-d' + type_groups[key].append((n,p)) + param_groups = [] + for key, params in type_groups.items(): + wd_theta = 0 + weight_decay = args.weight_decay + _hard_reset = False + if key.endswith('-nd'): + weight_decay = 0 + else: + _hard_reset = hard_reset + + group = dict(params=[], + weight_decay_rate=weight_decay, + wd_theta = wd_theta, + hard_reset = hard_reset, + names=[], + offset=None) + params = sorted(params, key=cmp_to_key(param_cmp)) + for (n,p) in params: + if p.numel() >= max_group_size: + if len(group['params'])>0: + group_id,group = add_group(param_groups, group, group_id) + chunks, offsets = chunk_into_pieces(p, max_group_size) + for chk, off in zip(chunks, offsets): + group['params'].append(p) + group['names'].append(n) + group['offset'] = off + group_sizes[group_id] += chk.numel() + group_id,group = add_group(param_groups, group, group_id) + else: + group['params'].append(p) + group['names'].append(n) + group['offset'] = None + group_sizes[group_id] += p.numel() + if group_sizes[group_id]>=max_group_size: + group_id,group = add_group(param_groups, group, group_id) + if len(group['params'])>0: + group_id,group = add_group(param_groups, group, group_id) + + lookahead_k = getattr(args, 'lookahead_k', -1) + lookahead_alpha = getattr(args, 'lookahead_alpha', 0.5) + optimizer = Fp16Optimizer(param_groups, opt_fn, loss_scaler, args.max_grad_norm, lookahead_k = lookahead_k,\ + lookahead_alpha = lookahead_alpha, rank=args.rank, distributed=distributed_optimizer) + + # if args.fp16: + # # FP16 + # 
optimizer = Fp16Optimizer(param_groups, opt_fn, loss_scaler, args.max_grad_norm, lookahead_k = lookahead_k,\ # lookahead_alpha = lookahead_alpha, rank=args.rank, distributed=distributed_optimizer) + # else: + # # FP32: use the Optimizer (XAdam) directly + # logger.info("FP32 Detected: Bypassing Fp16Optimizer wrapper and using XAdam directly.") + # optimizer = opt_fn(param_groups) + + return optimizer diff --git a/nlu/DeBERTa/training/trainer.py b/nlu/DeBERTa/training/trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..3b6616f32c1726dd9565cbf778c3a86cb3961015 --- /dev/null +++ b/nlu/DeBERTa/training/trainer.py @@ -0,0 +1,302 @@ +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +# +# Author: Pengcheng He (penhe@microsoft.com) +# Date: 05/15/2019 +# + +import os +import torch +import random +import time +import numpy as np +import pdb +from collections import defaultdict, OrderedDict +from collections.abc import Mapping, Sequence +from torch.utils.data import DataLoader +from ..data import BatchSampler, DistributedBatchSampler,RandomSampler,SequentialSampler, AsyncDataLoader +from ..utils import get_logger +logger = get_logger() + +from .dist_launcher import get_ngpu +from .optimizer_utils import create_xoptimizer +from ._utils import batch_to + +__all__ = ['DistributedTrainer', 'set_random_seed'] + +def set_random_seed(seed, cpu_only=False): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + n_gpu = get_ngpu() + if n_gpu > 0 and not cpu_only: + torch.cuda.manual_seed_all(seed) + +class TrainerState: + def __init__(self, training_steps, name=None): + self.__dict__ = defaultdict(float) + self.loss = 0.0 + self.examples = 0 + self.steps = 0 + self._last_report_step = 0 + self.epochs = 0 + self.next_batch = 0 + self.num_training_steps = training_steps + self._last_report_time = time.time() + self.best_steps = 0 + self.best_metric = -1e9 + self.best_steps_2 = 0 + self.best_metric_2 = -1e9 + self.name = name + self.run_id = None + + def update_step(self, loss, examples, loss_scale): + self.examples += examples + self.loss += loss + self.steps += 1 + self.next_batch += 1 + self.loss_scale = loss_scale + + def report_state(self): + if self.steps <= self._last_report_step: + return + + end = time.time() + start = self._last_report_time + if self.name is not None: + tag = f'[{self.name}]' + else: + tag = '' + logger.info('{}[{:0.1f}%][{:0.2f}h] Steps={}, loss={}, examples={}, loss_scale={:0.1f}, {:0.1f}s'.format(tag, 100*self.steps/self.num_training_steps, \ + (self.num_training_steps - self.steps)*(end-start)/((self.steps-self._last_report_step)*3600), self.steps, self.loss/self.steps, self.examples, self.loss_scale, end-start)) + self._last_report_time = end + self._last_report_step = self.steps + +class DistributedTrainer: + def __init__(self, args, output_dir, model, device, data_fn, loss_fn=None, optimizer_fn=None, eval_fn=None, init_fn=None, update_fn=None, dump_interval = 10000, name=None, **kwargs): + """ + data_fn returns a tuple (training_dataset, training_steps, train_sampler); training_dataset is required + loss_fn returns the loss of the current mini-batch and the size of the batch + optimizer_fn returns the created optimizer + eval_fn returns metrics for model selection + """ + self.__dict__.update(kwargs) + self.args = args + self.device = device + self.eval_fn = eval_fn + self.accumulative_update = 1 + if hasattr(args, 'accumulative_update'): + 
self.accumulative_update = args.accumulative_update + + train_data, training_steps, train_sampler = data_fn(self) + self.train_data = train_data + self.train_sampler = train_sampler if train_sampler is not None else RandomSampler(len(train_data)) + self.training_epochs = int(getattr(args, 'num_train_epochs', 1)) + + if training_steps is None: + training_steps = getattr(args, 'training_steps', (len(training_data) + self.args.train_batch_size-1)//self.args.train_batch_size*self.training_epochs) + self.training_steps = training_steps + + self.output_dir = output_dir + self.init_fn = init_fn + self.trainer_state = TrainerState(self.training_steps, name = name) + self.dump_interval = dump_interval + + self.model = self._setup_model(args, model) + self.post_loss_fn = None + + def _opt_fn(trainer, model, training_steps): + return create_xoptimizer(model, args, num_train_steps = training_steps) + optimizer_fn = optimizer_fn if optimizer_fn is not None else _opt_fn + + self.optimizer = optimizer_fn(self, model, training_steps) + + def _loss_fn(trainer, model, batch): + _,loss = model(**batch) + batch_size = batch['input_ids'].size(0) + return loss.mean(), batch_size + self.loss_fn = loss_fn if loss_fn is not None else _loss_fn + + self.initialized = False + self.update_fn = update_fn + + def initialize(self): + set_random_seed(self.args.seed) + + if self.args.world_size>1: + torch.distributed.barrier() + self.initialized = True + + def train(self): + if not self.initialized: + self.initialize() + + rank = self.args.rank + world_size = self.args.world_size + + for n_epoch in range(self.trainer_state.epochs, self.training_epochs): + batch_sampler = BatchSampler(self.train_sampler, self.args.train_batch_size) + batch_sampler = DistributedBatchSampler(batch_sampler, rank = rank, world_size = world_size) + batch_sampler.next = self.trainer_state.next_batch + num_workers = getattr(self.args, 'workers', 2) + train_dataloader = DataLoader(self.train_data, batch_sampler=batch_sampler, num_workers=num_workers, worker_init_fn=self.init_fn, + pin_memory=True,persistent_workers=(num_workers>0)) + torch.cuda.empty_cache() + for step, batch in enumerate(AsyncDataLoader(train_dataloader, 100)): + if self.trainer_state.steps >= self.training_steps: + break + bs_scale = 1 + batch = batch_to(batch, self.device) + self._train_step(batch, bs_scale) + + # Save model + self.trainer_state.epochs += 1 + self.trainer_state.next_batch = 0 + self.trainer_state.report_state() + self._eval_model() + + if n_epoch == self.training_epochs - 1: + self.dump_interval = min(1000, self.dump_interval) + # for n,v in self.model.named_parameters(): + # if n == 'deberta.encoder.layer.0.attention.self.query_proj.hra_u.0': + # print(v[:5]) + # print((v/v.norm())[:5]) + + def save_model(self, args, checkpoint_dir, chk_postfix, model, optimizer): + save_path= os.path.join(checkpoint_dir, f'pytorch.model-{chk_postfix}.bin') + if hasattr(model, 'module'): + model_state = OrderedDict([(n,p) for n,p in model.module.state_dict().items()]) + else: + model_state = OrderedDict([(n,p) for n,p in model.state_dict().items()]) + if args.rank < 1: + torch.save(model_state, save_path) + return save_path + + def _eval_model(self, with_checkpoint=True): + if with_checkpoint: + checkpoint_dir = getattr(self.args, 'checkpoint_dir', None) + checkpoint_dir = checkpoint_dir if checkpoint_dir is not None else self.output_dir + chk_postfix = f'{self.trainer_state.steps:06}' + self.save_model(self.args, checkpoint_dir, chk_postfix, self.model, self.optimizer) + + 
_metric = self.trainer_state.best_metric + _steps = self.trainer_state.best_steps + if self.args.task_name == 'MNLI': + _metric_2 = self.trainer_state.best_metric_2 + _steps_2 = self.trainer_state.best_steps_2 + if self.eval_fn is not None: + metric = self.eval_fn(self, self.model, self.device, tag=f'{self.trainer_state.steps:06}-{self.training_steps}') + if self.args.task_name == 'MNLI': + if metric[0] > _metric: + _metric = metric[0] + _steps = self.trainer_state.steps + if metric[1] > _metric_2: + _metric_2 = metric[1] + _steps_2 = self.trainer_state.steps + else: + if metric > _metric: + _metric = metric + _steps = self.trainer_state.steps + + if self.args.task_name == 'MNLI': + logger.info(f'Best matched metric: {_metric}@{_steps}') + logger.info(f'Best mismatched metric: {_metric_2}@{_steps_2}') + else: + logger.info(f'Best metric: {_metric}@{_steps}') + + self.trainer_state.best_metric, self.trainer_state.best_steps = _metric, _steps + if self.args.task_name == 'MNLI': + self.trainer_state.best_metric_2, self.trainer_state.best_steps_2 = _metric_2, _steps_2 + + def _train_step(self, data, bs_scale): + self.model.train() + go_next=False + + def split(batch, parts): + sub_batches = [{} for _ in range(parts)] + for k in batch.keys(): + b = batch[k].size(0) + s = (b + parts - 1)//parts + v = batch[k].split(s) + for i,z in enumerate(v): + sub_batches[i][k]=z + chunks = [b for b in sub_batches if len(b)>0] + return chunks + + if self.accumulative_update>1: + data_chunks = split(data, self.accumulative_update) + else: + data_chunks = [data] + + while not go_next: + step_loss = 0 + batch_size = 0 + self.optimizer.zero_grad() + forward_outputs = [] + for i, sub in enumerate(data_chunks): + output = self.loss_fn(self, self.model, sub) + if isinstance(output, dict): + loss, sub_size = output['loss'], output['batch_size'] + else: + loss, sub_size = output + forward_outputs.append(output) + loss = loss/len(data_chunks) + # ------------------------------------------------------------------------------ + # for name, param in self.model.named_parameters(): + # if 'hra_u' in name: + # device = param.device + # hra_u_norm = param / param.norm(dim=0) + # orth_loss = torch.norm(torch.eye(8, device=device) - hra_u_norm.t() @ hra_u_norm) + # loss = loss + 1e-6 * orth_loss + # ------------------------------------------------------------------------------ + if i == 0: + loss_scale, _loss = self.optimizer.backward(loss) + else: + _loss = loss.float().detach().item() + loss = loss.float() * loss_scale + loss.backward() + step_loss += _loss + batch_size += sub_size + + ### + check_param = None + for n, p in self.model.named_parameters(): + if "hra_" in n and p.requires_grad: + check_param = p + break + val_before = check_param.data.clone().cpu().float().numpy()[0,0] # Take first element + if not self.optimizer.step(bs_scale, loss_scale): + self.optimizer.zero_grad() + continue + + #Check value after update + # val_after = check_param.data.clone().cpu().float().numpy()[0,0] + + # if val_before == val_after: + # print(f"[CRITICAL WARNING] HRA Param {n} did NOT change! Optimizer is broken.") + # print(f" Before: {val_before:.6f} | After: {val_after:.6f} | Grad: {check_param.grad.norm().item()}") + # else: + # print(f"[SUCCESS] HRA Param updated. 
Delta: {val_after - val_before}") + # exit() + + go_next = True + self.trainer_state.update_step(step_loss, batch_size , loss_scale) + if self.update_fn is not None: + self.update_fn(self, self.model, loss_scale) + self.optimizer.zero_grad() + + if self.post_loss_fn is not None: + self.post_loss_fn(forward_outputs) + + if self.trainer_state.steps%100 == 0: + self.trainer_state.report_state() + if self.trainer_state.steps%self.dump_interval == 0: + self._eval_model() + + def _setup_model(self, args, model): + if args.world_size > 1: + for p in model.parameters(): + torch.distributed.broadcast(p.data, 0) + torch.cuda.synchronize() + return model diff --git a/nlu/DeBERTa/utils/__init__.py b/nlu/DeBERTa/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d8a97482bc86684b0a27fa53ed3568a125bf1638 --- /dev/null +++ b/nlu/DeBERTa/utils/__init__.py @@ -0,0 +1,8 @@ +""" +utils +@Author: penhe@microsoft.com +""" + +from .logger_util import * +from .argument_types import * +from .xtqdm import * diff --git a/nlu/DeBERTa/utils/argument_types.py b/nlu/DeBERTa/utils/argument_types.py new file mode 100644 index 0000000000000000000000000000000000000000..ce2df4e84ba4eeb5fa8e11318b7c39b72dfd9aad --- /dev/null +++ b/nlu/DeBERTa/utils/argument_types.py @@ -0,0 +1,5 @@ + +def boolean_string(s): + if s.lower() not in {'false', 'true'}: + raise ValueError('Not a valid boolean string') + return s.lower() == 'true' diff --git a/nlu/DeBERTa/utils/jit_tracing.py b/nlu/DeBERTa/utils/jit_tracing.py new file mode 100644 index 0000000000000000000000000000000000000000..9bc043a9f599a4cc4538f29e973cb03af3504def --- /dev/null +++ b/nlu/DeBERTa/utils/jit_tracing.py @@ -0,0 +1,44 @@ +""" +Logging util +@Author: penhe@microsoft.com +""" + +""" Utils for torch jit tracing customer operators/functions +""" +import os + +def traceable(cls): + """ Decorator over customer functions + There is an issue for tracing customer python torch Function, using this decorator to work around it. + e.g. 
+ @traceable + class MyOp(torch.autograd.Function): + xxx + """ + + class _Function(object): + @staticmethod + def apply(*args): + jit_trace = (os.getenv('JIT_TRACE', 'False').lower() == 'true') + if jit_trace: + return cls.forward(_Function, *args) + else: + return cls.apply(*args) + + @staticmethod + def save_for_backward(*args): + pass + + _Function.__name__ = cls.__name__ + _Function.__doc__ = cls.__doc__ + return _Function + +class TraceMode(): + """ Trace context used when tracing modules contains customer operators/Functions + """ + def __enter__(self): + os.environ['JIT_TRACE'] = 'True' + return self + + def __exit__(self, exp_value, exp_type, trace): + del os.environ['JIT_TRACE'] diff --git a/nlu/DeBERTa/utils/logger_util.py b/nlu/DeBERTa/utils/logger_util.py new file mode 100644 index 0000000000000000000000000000000000000000..83d7cc647d96269227fe37b669a5486511f01109 --- /dev/null +++ b/nlu/DeBERTa/utils/logger_util.py @@ -0,0 +1,54 @@ +""" +Logging util +@Author: penhe@microsoft.com +""" + +__all__ = ['get_logger', 'set_logger'] +import logging +import os +import pdb + +logging.basicConfig(format = '%(asctime)s|%(levelname)s|%(name)s| %(message)s', + datefmt = '%m%d%Y %H:%M:%S', + level = logging.INFO) +logger=None +def set_logger(name, file_log=None, rank=0, verbose=1): + global logger + if not logger: + logger = logging.getLogger(name) + else: + logger.name = name + + dirty_handlers = [h for h in logger.handlers] + + if rank >= 0: + formatter = logging.Formatter(f'%(asctime)s|%(levelname)s|%(name)s|{rank:02}| %(message)s', datefmt='%m/%d/%Y %H:%M:%S') + else: + formatter = logging.Formatter(f'%(asctime)s|%(levelname)s|%(name)s| %(message)s', datefmt='%m/%d/%Y %H:%M:%S') + if file_log: + fh = logging.FileHandler(file_log) + fh.setLevel(logging.DEBUG) + fh.setFormatter(formatter) + logger.addHandler(fh) + + # Stdout + # create console handler with a higher log level + ch = logging.StreamHandler() + if verbose > 0: + ch.setLevel(logging.INFO) + else: + ch.setLevel(logging.WARN) + ch.setFormatter(formatter) + logger.addHandler(ch) + + for h in dirty_handlers: + logger.removeHandler(h) + logger.propagate=False + return logger + +def get_logger(name='logging', file_log=None, rank=0, verbose=1): + global logger + if not logger: + logger = set_logger(name, file_log, rank, verbose) + return logger + diff --git a/nlu/DeBERTa/utils/xtqdm.py b/nlu/DeBERTa/utils/xtqdm.py new file mode 100644 index 0000000000000000000000000000000000000000..2908d0c1ce9683afe2ca1e1050b351fbd3a03f0a --- /dev/null +++ b/nlu/DeBERTa/utils/xtqdm.py @@ -0,0 +1,30 @@ + +from tqdm import tqdm +import os + +__all__=['xtqdm'] + +class dummy_tqdm(): + def __init__(self, iterable=None, *wargs, **kwargs): + self.iterable = iterable + + def __iter__(self): + for d in self.iterable: + yield d + + def update(self, *wargs, **kwargs): + pass + + def close(self): + pass + +def xtqdm(iterable=None, *wargs, **kwargs): + disable = False + if 'disable' in kwargs: + disable = kwargs['disable'] + if 'NO_TQDM' in os.environ: + disable = True if os.getenv('NO_TQDM', '0')!='0' else False + if disable: + return dummy_tqdm(iterable, *wargs, **kwargs) + else: + return tqdm(iterable, *wargs, **kwargs) diff --git a/nlu/adapterlib/__init__.py b/nlu/adapterlib/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7fd9ad25a3ec6359639dfe7fa786dcc2c3c3e080 --- /dev/null +++ b/nlu/adapterlib/__init__.py @@ -0,0 +1,4 @@ +name = "lora" + +from .layers import * +from .utils import * \ No newline at end of file diff --git 
a/nlu/adapterlib/layers.py b/nlu/adapterlib/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..e387e404adcba109fbe3bd17c5c197ba257507c5 --- /dev/null +++ b/nlu/adapterlib/layers.py @@ -0,0 +1,507 @@ +# ------------------------------------------------------------------------------------------ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information. +# ------------------------------------------------------------------------------------------ +import torch +import torch.nn as nn +import torch.nn.functional as F + +import math +from typing import Optional, List + +class HRALinear(nn.Linear): + def __init__( + self, + in_features: int, + out_features: int, + config: dict, + **kwargs + ): + nn.Linear.__init__(self, in_features, out_features, **kwargs) + config = config.hra + self.r = config.r + self.apply_GS = config.apply_GS + + half_u = torch.zeros(self.in_features, self.r // 2) + nn.init.kaiming_uniform_(half_u, a=math.sqrt(5)) + self.hra_u = nn.Parameter(torch.repeat_interleave(half_u, 2, dim=1), requires_grad=True) + + self.weight.requires_grad = False + + self.register_buffer( + "eye", + torch.eye(self.in_features) + ) + self.alpha = getattr(config, "alpha", 16.0) + self.scale = self.alpha / self.r + + nn.Linear.reset_parameters(self) + + def train(self, mode: bool = True): + nn.Linear.train(self, mode) + + # def forward(self, x): + # orig_weight = self.weight + # if self.apply_GS: + # weight = [(self.hra_u[:, 0] / self.hra_u[:, 0].norm()).view(-1, 1)] + # for i in range(1, self.r): + # ui = self.hra_u[:, i].view(-1, 1) + # for j in range(i): + # ui = ui - (weight[j].t() @ ui) * weight[j] + # weight.append((ui / ui.norm()).view(-1, 1)) + # weight = torch.cat(weight, dim=1) + # new_weight = torch.mm(orig_weight, torch.eye(self.in_features, device=x.device, dtype=x.dtype) - 2 * weight @ weight.t()) + + # else: + # new_weight = orig_weight + # hra_u_norm = self.hra_u / self.hra_u.norm(dim=0) + # for i in range(self.r): + # ui = hra_u_norm[:, i].view(-1, 1) + # new_weight = torch.mm(new_weight, torch.eye(self.in_features, device=x.device, dtype=x.dtype) - 2 * ui @ ui.t()) + + # out = F.linear(input=x, weight=new_weight, bias=self.bias) + # return out + + def forward(self, x): + # KHÔNG dùng .data + W = self.weight # frozen weight, requires_grad=False + + # ===== build orthogonal Q ===== + if self.apply_GS: + U = [] + for i in range(self.r): + ui = self.hra_u[:, i] + for uj in U: + ui = ui - torch.dot(uj, ui) * uj + ui = ui / (ui.norm() + 1e-6) + U.append(ui) + U = torch.stack(U, dim=1) # [in_features, r] + Q = self.eye - 2.0 * (U @ U.t()) + else: + hra_u_norm = self.hra_u / (self.hra_u.norm(dim=0, keepdim=True) + 1e-6) + Q = self.eye + for i in range(self.r): + ui = hra_u_norm[:, i:i+1] + Q = Q @ (self.eye - 2.0 * ui @ ui.t()) + + # ===== HRA residual (CRITICAL) ===== + deltaW = self.scale * (W @ (Q - self.eye)) + W_eff = W + deltaW + + return F.linear(x, W_eff, self.bias) + + +def project(R, eps): + I = torch.zeros((R.size(0), R.size(0)), dtype=R.dtype, device=R.device) + diff = R - I + norm_diff = torch.norm(diff) + if norm_diff <= eps: + return R + else: + return I + eps * (diff / norm_diff) + +def project_batch(R, eps=1e-5): + # scaling factor for each of the smaller block matrix + eps = eps * 1 / torch.sqrt(torch.tensor(R.shape[0])) + I = torch.zeros((R.size(1), R.size(1)), device=R.device, dtype=R.dtype).unsqueeze(0).expand_as(R) + diff = R - I + norm_diff = 
torch.norm(R - I, dim=(1, 2), keepdim=True) + mask = (norm_diff <= eps).bool() + out = torch.where(mask, R, I + eps * (diff / norm_diff)) + return out + +class OFTLinear(nn.Linear): + # LoRA implemented in a dense layer + def __init__( + self, + in_features: int, + out_features: int, + config: dict, + fan_in_fan_out: bool = False, # Set this to True if the layer to replace stores weight like (fan_in, fan_out) + # 不是fan_in_fan_out的问题,因为没有一个module设它为true + **kwargs + ): + nn.Linear.__init__(self, in_features, out_features, **kwargs) + config = config.oft + self.block_size = config.block_size + self.r = in_features // self.block_size + self.is_coft = config.is_coft + self.block_share = config.block_share + self.eps = config.eps + + # Actual trainable parameters + if self.block_share: + # Initialized as an identity matrix + R_shape = [self.block_size, self.block_size] + self.oft_R = nn.Parameter(self.weight.new_zeros(R_shape[0], R_shape[0])) + self.eps = self.eps * R_shape[0] * R_shape[0] + else: + R_shape = [self.r, self.block_size, self.block_size] + self.oft_R = self.weight.new_zeros(R_shape[1], R_shape[1]) + self.oft_R = torch.stack([self.oft_R] * self.r) + self.oft_R = nn.Parameter(self.oft_R) + self.eps = self.eps * R_shape[1] * R_shape[1] + + self.weight.requires_grad = False + # self.reset_parameters() + + def reset_parameters(self): + nn.Linear.reset_parameters(self) + if hasattr(self, 'R'): + nn.init.kaiming_uniform_(self.oft_R, a=math.sqrt(5)) + + def forward(self, x): + if self.block_share: + if self.is_coft: + with torch.no_grad(): + self.oft_R.copy_(project(self.oft_R, eps=self.eps)) + orth_rotate = self.cayley(self.oft_R) + else: + if self.is_coft: + with torch.no_grad(): + self.oft_R.copy_(project_batch(self.oft_R, eps=self.eps)) + orth_rotate = self.cayley_batch(self.oft_R) + + # Block-diagonal parametrization + block_diagonal_matrix = self.block_diagonal(orth_rotate) + out = F.linear(input=x, weight=self.weight @ block_diagonal_matrix.to(x.dtype).t(), bias=self.bias) + + return out + + def cayley(self, data): + r, c = list(data.shape) + # Ensure the input matrix is skew-symmetric + skew = 0.5 * (data - data.t()) + I = torch.eye(r, device=data.device) + + # Perform the Cayley parametrization + Q = torch.mm(I + skew, torch.inverse(I - skew)) + return Q + + def cayley_batch(self, data): + b, r, c = data.shape + # Ensure the input matrix is skew-symmetric + skew = 0.5 * (data - data.transpose(1, 2)) + I = torch.eye(r, device=data.device).unsqueeze(0).expand(b, r, c) + + # Perform the Cayley parametrization + Q = torch.bmm(I - skew, torch.inverse(I + skew)) + + return Q + + def block_diagonal(self, R): + if self.block_share: + # Create a list of R repeated block_count times + blocks = [R] * self.r + else: + # Create a list of R slices along the third dimension + blocks = [R[i, ...] 
for i in range(self.r)] + + # Use torch.block_diag to create the block diagonal matrix + A = torch.block_diag(*blocks) + + return A + +class LoRALayer(): + def __init__( + self, + r: int, + lora_alpha: int, + lora_dropout: float, + merge_weights: bool, + ): + self.r = r + self.lora_alpha = lora_alpha + # Optional dropout + if lora_dropout > 0.: + self.lora_dropout = nn.Dropout(p=lora_dropout) + else: + self.lora_dropout = lambda x: x + # Mark the weight as unmerged + self.merged = False + self.merge_weights = merge_weights + + +class Embedding(nn.Embedding, LoRALayer): + # LoRA implemented in a dense layer + def __init__( + self, + num_embeddings: int, + embedding_dim: int, + r: int = 0, + lora_alpha: int = 1, + merge_weights: bool = True, + **kwargs + ): + nn.Embedding.__init__(self, num_embeddings, embedding_dim, **kwargs) + LoRALayer.__init__(self, r=r, lora_alpha=lora_alpha, lora_dropout=0, + merge_weights=merge_weights) + # Actual trainable parameters + if r > 0: + self.lora_A = nn.Parameter(self.weight.new_zeros((r, num_embeddings))) + self.lora_B = nn.Parameter(self.weight.new_zeros((embedding_dim, r))) + self.scaling = self.lora_alpha / self.r + # Freezing the pre-trained weight matrix + self.weight.requires_grad = False + self.reset_parameters() + + def reset_parameters(self): + nn.Embedding.reset_parameters(self) + if hasattr(self, 'lora_A'): + # initialize A the same way as the default for nn.Linear and B to zero + nn.init.zeros_(self.lora_A) + nn.init.normal_(self.lora_B) + + def train(self, mode: bool = True): + nn.Embedding.train(self, mode) + if mode: + if self.merge_weights and self.merged: + # Make sure that the weights are not merged + if self.r > 0: + self.weight.data -= (self.lora_B @ self.lora_A).transpose(0, 1) * self.scaling + self.merged = False + else: + if self.merge_weights and not self.merged: + # Merge the weights and mark it + if self.r > 0: + self.weight.data += (self.lora_B @ self.lora_A).transpose(0, 1) * self.scaling + self.merged = True + + def forward(self, x: torch.Tensor): + if self.r > 0 and not self.merged: + result = nn.Embedding.forward(self, x) + after_A = F.embedding( + x, self.lora_A.transpose(0, 1), self.padding_idx, self.max_norm, + self.norm_type, self.scale_grad_by_freq, self.sparse + ) + result += (after_A @ self.lora_B.transpose(0, 1)) * self.scaling + return result + else: + return nn.Embedding.forward(self, x) + + +class LoRALinear(nn.Linear, LoRALayer): + # LoRA implemented in a dense layer + def __init__( + self, + in_features: int, + out_features: int, + config: dict, + fan_in_fan_out: bool = False, # Set this to True if the layer to replace stores weight like (fan_in, fan_out) + **kwargs + ): + nn.Linear.__init__(self, in_features, out_features, **kwargs) + config = config.lora + LoRALayer.__init__(self, r=config.lora_r, lora_alpha=config.lora_alpha, lora_dropout=config.lora_dropout, + merge_weights=config.merge_weights) + + self.fan_in_fan_out = fan_in_fan_out + # Actual trainable parameters + if self.r > 0: + self.lora_A = nn.Parameter(self.weight.new_zeros((self.r, in_features))) + self.lora_B = nn.Parameter(self.weight.new_zeros((out_features, self.r))) + self.scaling = self.lora_alpha / self.r + # Freezing the pre-trained weight matrix + self.weight.requires_grad = False + self.reset_parameters() + if fan_in_fan_out: + self.weight.data = self.weight.data.transpose(0, 1) + + def reset_parameters(self): + nn.Linear.reset_parameters(self) + if hasattr(self, 'lora_A'): + # initialize B the same way as the default for nn.Linear and A 
to zero + # this is different than what is described in the paper but should not affect performance + nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5)) + nn.init.zeros_(self.lora_B) + + def train(self, mode: bool = True): + def T(w): + return w.transpose(0, 1) if self.fan_in_fan_out else w + nn.Linear.train(self, mode) + if mode: + if self.merge_weights and self.merged: + # Make sure that the weights are not merged + if self.r > 0: + self.weight.data -= T(self.lora_B @ self.lora_A) * self.scaling + self.merged = False + else: + if self.merge_weights and not self.merged: + # Merge the weights and mark it + if self.r > 0: + self.weight.data += T(self.lora_B @ self.lora_A) * self.scaling + self.merged = True + + def forward(self, x: torch.Tensor): + def T(w): + return w.transpose(0, 1) if self.fan_in_fan_out else w + if self.r > 0 and not self.merged: + result = F.linear(x, T(self.weight), bias=self.bias) + result += (self.lora_dropout(x) @ self.lora_A.transpose(0, 1) @ self.lora_B.transpose(0, 1)) * self.scaling + return result + else: + return F.linear(x, T(self.weight), bias=self.bias) + + +class MergedLinear(nn.Linear, LoRALayer): + # LoRA implemented in a dense layer + def __init__( + self, + in_features: int, + out_features: int, + r: int = 0, + lora_alpha: int = 1, + lora_dropout: float = 0., + enable_lora: List[bool] = [False], + fan_in_fan_out: bool = False, + merge_weights: bool = True, + **kwargs + ): + nn.Linear.__init__(self, in_features, out_features, **kwargs) + LoRALayer.__init__(self, r=r, lora_alpha=lora_alpha, lora_dropout=lora_dropout, + merge_weights=merge_weights) + assert out_features % len(enable_lora) == 0, \ + 'The length of enable_lora must divide out_features' + self.enable_lora = enable_lora + self.fan_in_fan_out = fan_in_fan_out + # Actual trainable parameters + if r > 0 and any(enable_lora): + self.lora_A = nn.Parameter( + self.weight.new_zeros((r * sum(enable_lora), in_features))) + self.lora_B = nn.Parameter( + self.weight.new_zeros((out_features // len(enable_lora) * sum(enable_lora), r)) + ) # weights for Conv1D with groups=sum(enable_lora) + self.scaling = self.lora_alpha / self.r + # Freezing the pre-trained weight matrix + self.weight.requires_grad = False + # Compute the indices + self.lora_ind = self.weight.new_zeros( + (out_features, ), dtype=torch.bool + ).view(len(enable_lora), -1) + self.lora_ind[enable_lora, :] = True + self.lora_ind = self.lora_ind.view(-1) + self.reset_parameters() + if fan_in_fan_out: + self.weight.data = self.weight.data.transpose(0, 1) + + def reset_parameters(self): + nn.Linear.reset_parameters(self) + if hasattr(self, 'lora_A'): + # initialize A the same way as the default for nn.Linear and B to zero + nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5)) + nn.init.zeros_(self.lora_B) + + def zero_pad(self, x): + result = x.new_zeros((len(self.lora_ind), *x.shape[1:])) + result[self.lora_ind] = x + return result + + def merge_AB(self): + def T(w): + return w.transpose(0, 1) if self.fan_in_fan_out else w + delta_w = F.conv1d( + self.lora_A.unsqueeze(0), + self.lora_B.unsqueeze(-1), + groups=sum(self.enable_lora) + ).squeeze(0) + return T(self.zero_pad(delta_w)) + + def train(self, mode: bool = True): + def T(w): + return w.transpose(0, 1) if self.fan_in_fan_out else w + nn.Linear.train(self, mode) + if mode: + if self.merge_weights and self.merged: + # Make sure that the weights are not merged + if self.r > 0 and any(self.enable_lora): + self.weight.data -= self.merge_AB() * self.scaling + self.merged = False + else: + if 
self.merge_weights and not self.merged: + # Merge the weights and mark it + if self.r > 0 and any(self.enable_lora): + self.weight.data += self.merge_AB() * self.scaling + self.merged = True + + def forward(self, x: torch.Tensor): + def T(w): + return w.transpose(0, 1) if self.fan_in_fan_out else w + if self.merged: + return F.linear(x, T(self.weight), bias=self.bias) + else: + result = F.linear(x, T(self.weight), bias=self.bias) + if self.r > 0: + result += self.lora_dropout(x) @ T(self.merge_AB().T) * self.scaling + return result + +class ConvLoRA(nn.Module, LoRALayer): + def __init__(self, conv_module, in_channels, out_channels, kernel_size, r=0, lora_alpha=1, lora_dropout=0., merge_weights=True, **kwargs): + super(ConvLoRA, self).__init__() + self.conv = conv_module(in_channels, out_channels, kernel_size, **kwargs) + LoRALayer.__init__(self, r=r, lora_alpha=lora_alpha, lora_dropout=lora_dropout, merge_weights=merge_weights) + assert isinstance(kernel_size, int) + # Actual trainable parameters + if r > 0: + self.lora_A = nn.Parameter( + self.conv.weight.new_zeros((r * kernel_size, in_channels * kernel_size)) + ) + self.lora_B = nn.Parameter( + self.conv.weight.new_zeros((out_channels//self.conv.groups*kernel_size, r*kernel_size)) + ) + self.scaling = self.lora_alpha / self.r + # Freezing the pre-trained weight matrix + self.conv.weight.requires_grad = False + self.reset_parameters() + self.merged = False + + def reset_parameters(self): + self.conv.reset_parameters() + if hasattr(self, 'lora_A'): + # initialize A the same way as the default for nn.Linear and B to zero + nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5)) + nn.init.zeros_(self.lora_B) + + def train(self, mode=True): + super(ConvLoRA, self).train(mode) + if mode: + if self.merge_weights and self.merged: + if self.r > 0: + # Make sure that the weights are not merged + self.conv.weight.data -= (self.lora_B @ self.lora_A).view(self.conv.weight.shape) * self.scaling + self.merged = False + else: + if self.merge_weights and not self.merged: + if self.r > 0: + # Merge the weights and mark it + self.conv.weight.data += (self.lora_B @ self.lora_A).view(self.conv.weight.shape) * self.scaling + self.merged = True + + def forward(self, x): + if self.r > 0 and not self.merged: + return self.conv._conv_forward( + x, + self.conv.weight + (self.lora_B @ self.lora_A).view(self.conv.weight.shape) * self.scaling, + self.conv.bias + ) + return self.conv(x) + +class Conv2d(ConvLoRA): + def __init__(self, *args, **kwargs): + super(Conv2d, self).__init__(nn.Conv2d, *args, **kwargs) + +class Conv1d(ConvLoRA): + def __init__(self, *args, **kwargs): + super(Conv1d, self).__init__(nn.Conv1d, *args, **kwargs) + +# Can Extend to other ones like this + +class Conv3d(ConvLoRA): + def __init__(self, *args, **kwargs): + super(Conv3d, self).__init__(nn.Conv3d, *args, **kwargs) + + +adapter_dict = { + 'lora': LoRALinear, + 'oft': OFTLinear, + 'hra': HRALinear, +} diff --git a/nlu/adapterlib/utils.py b/nlu/adapterlib/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..0d121c9bbd6a424d8f4233fac17335aab5141d9b --- /dev/null +++ b/nlu/adapterlib/utils.py @@ -0,0 +1,49 @@ +# ------------------------------------------------------------------------------------------ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information. 
+# ------------------------------------------------------------------------------------------ +import torch +import torch.nn as nn + +from typing import Dict + +from .layers import LoRALayer + + +def mark_only_lora_as_trainable(model: nn.Module, bias: str = 'none') -> None: + for n, p in model.named_parameters(): + if 'lora_' not in n: + p.requires_grad = False + if bias == 'none': + return + elif bias == 'all': + for n, p in model.named_parameters(): + if 'bias' in n: + p.requires_grad = True + elif bias == 'lora_only': + for m in model.modules(): + if isinstance(m, LoRALayer) and \ + hasattr(m, 'bias') and \ + m.bias is not None: + m.bias.requires_grad = True + else: + raise NotImplementedError + + +def lora_state_dict(model: nn.Module, bias: str = 'none') -> Dict[str, torch.Tensor]: + my_state_dict = model.state_dict() + if bias == 'none': + return {k: my_state_dict[k] for k in my_state_dict if 'lora_' in k} + elif bias == 'all': + return {k: my_state_dict[k] for k in my_state_dict if 'lora_' in k or 'bias' in k} + elif bias == 'lora_only': + to_return = {} + for k in my_state_dict: + if 'lora_' in k: + to_return[k] = my_state_dict[k] + bias_name = k.split('lora_')[0]+'bias' + if bias_name in my_state_dict: + to_return[bias_name] = my_state_dict[bias_name] + return to_return + else: + raise NotImplementedError diff --git a/nlu/base_model/.gitattributes b/nlu/base_model/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..6d34772f5ca361021038b404fb913ec8dc0b1a5a --- /dev/null +++ b/nlu/base_model/.gitattributes @@ -0,0 +1,27 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bin.* filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zstandard filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/nlu/base_model/README.md b/nlu/base_model/README.md new file mode 100644 index 0000000000000000000000000000000000000000..340cb56cdc9bb884bb1857d8fb5013d8f17a6a82 --- /dev/null +++ b/nlu/base_model/README.md @@ -0,0 +1,96 @@ +--- +language: en +tags: + - deberta + - deberta-v3 + - fill-mask +thumbnail: https://huggingface.co/front/thumbnails/microsoft.png +license: mit +--- + +## DeBERTaV3: Improving DeBERTa using ELECTRA-Style Pre-Training with Gradient-Disentangled Embedding Sharing + +[DeBERTa](https://arxiv.org/abs/2006.03654) improves the BERT and RoBERTa models using disentangled attention and enhanced mask decoder. With those two improvements, DeBERTa out perform RoBERTa on a majority of NLU tasks with 80GB training data. 
+ +In [DeBERTa V3](https://arxiv.org/abs/2111.09543), we further improved the efficiency of DeBERTa using ELECTRA-Style pre-training with Gradient Disentangled Embedding Sharing. Compared to DeBERTa, our V3 version significantly improves the model performance on downstream tasks. You can find more technique details about the new model from our [paper](https://arxiv.org/abs/2111.09543). + +Please check the [official repository](https://github.com/microsoft/DeBERTa) for more implementation details and updates. + +The DeBERTa V3 base model comes with 12 layers and a hidden size of 768. It has only 86M backbone parameters with a vocabulary containing 128K tokens which introduces 98M parameters in the Embedding layer. This model was trained using the 160GB data as DeBERTa V2. + + +#### Fine-tuning on NLU tasks + +We present the dev results on SQuAD 2.0 and MNLI tasks. + +| Model |Vocabulary(K)|Backbone #Params(M)| SQuAD 2.0(F1/EM) | MNLI-m/mm(ACC)| +|-------------------|----------|-------------------|-----------|----------| +| RoBERTa-base |50 |86 | 83.7/80.5 | 87.6/- | +| XLNet-base |32 |92 | -/80.2 | 86.8/- | +| ELECTRA-base |30 |86 | -/80.5 | 88.8/ | +| DeBERTa-base |50 |100 | 86.2/83.1| 88.8/88.5| +| DeBERTa-v3-base |128|86 | **88.4/85.4** | **90.6/90.7**| +| DeBERTa-v3-base + SiFT |128|86 | -/- | 91.0/-| + +We present the dev results on SQuAD 1.1/2.0 and MNLI tasks. + +#### Fine-tuning with HF transformers + +```bash +#!/bin/bash + +cd transformers/examples/pytorch/text-classification/ + +pip install datasets +export TASK_NAME=mnli + +output_dir="ds_results" + +num_gpus=8 + +batch_size=8 + +python -m torch.distributed.launch --nproc_per_node=${num_gpus} \ + run_glue.py \ + --model_name_or_path microsoft/deberta-v3-base \ + --task_name $TASK_NAME \ + --do_train \ + --do_eval \ + --evaluation_strategy steps \ + --max_seq_length 256 \ + --warmup_steps 500 \ + --per_device_train_batch_size ${batch_size} \ + --learning_rate 2e-5 \ + --num_train_epochs 3 \ + --output_dir $output_dir \ + --overwrite_output_dir \ + --logging_steps 1000 \ + --logging_dir $output_dir + +``` + +### Citation + +If you find DeBERTa useful for your work, please cite the following papers: + +``` latex +@misc{he2021debertav3, + title={DeBERTaV3: Improving DeBERTa using ELECTRA-Style Pre-Training with Gradient-Disentangled Embedding Sharing}, + author={Pengcheng He and Jianfeng Gao and Weizhu Chen}, + year={2021}, + eprint={2111.09543}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +``` + +``` latex +@inproceedings{ +he2021deberta, +title={DEBERTA: DECODING-ENHANCED BERT WITH DISENTANGLED ATTENTION}, +author={Pengcheng He and Xiaodong Liu and Jianfeng Gao and Weizhu Chen}, +booktitle={International Conference on Learning Representations}, +year={2021}, +url={https://openreview.net/forum?id=XPZIaotutsD} +} +``` diff --git a/nlu/base_model/config.json b/nlu/base_model/config.json new file mode 100644 index 0000000000000000000000000000000000000000..904b7ba900060d5ff61289fb29bee393189716c7 --- /dev/null +++ b/nlu/base_model/config.json @@ -0,0 +1,22 @@ +{ + "model_type": "deberta-v2", + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "max_position_embeddings": 512, + "relative_attention": true, + "position_buckets": 256, + "norm_rel_ebd": "layer_norm", + "share_att_key": true, + "pos_att_type": "p2c|c2p", + "layer_norm_eps": 1e-7, + "max_relative_positions": -1, + "position_biased_input": false, + 
"num_attention_heads": 12, + "num_hidden_layers": 12, + "type_vocab_size": 0, + "vocab_size": 128100 +} diff --git a/nlu/base_model/pytorch_model.bin b/nlu/base_model/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..cc09775b38deaff27b9d8f8d0a4326f21688ce4d --- /dev/null +++ b/nlu/base_model/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:691d48a2800b926a19e3051def466fc2cca4f59a15e42ce4a0cf7f1b380b5e33 +size 371146213 diff --git a/nlu/base_model/rust_model.ot b/nlu/base_model/rust_model.ot new file mode 100644 index 0000000000000000000000000000000000000000..75939b30452f101e080cf56c6ae057de3c2894a4 --- /dev/null +++ b/nlu/base_model/rust_model.ot @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:276aadc323988709f076fb489790103d28e64b80c72e9a3e19043d28f4c7c31a +size 742218621 diff --git a/nlu/base_model/spm.model b/nlu/base_model/spm.model new file mode 100644 index 0000000000000000000000000000000000000000..b1b95e5b0fef33623979511f423eaeee465c46f0 --- /dev/null +++ b/nlu/base_model/spm.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c679fbf93643d19aab7ee10c0b99e460bdbc02fedf34b92b05af343b4af586fd +size 2464616 diff --git a/nlu/base_model/tf_model.h5 b/nlu/base_model/tf_model.h5 new file mode 100644 index 0000000000000000000000000000000000000000..d151654ec2e9ac84ab44fe34038dc8164ff04a2f --- /dev/null +++ b/nlu/base_model/tf_model.h5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01ad1b35cac509fb00b9873c670d824363ef884d1aa2758471c47b26cc2948f0 +size 735589384 diff --git a/nlu/base_model/tokenizer_config.json b/nlu/base_model/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..acfd94e399c5659e4bed75f91b4ee24b111fc7a6 --- /dev/null +++ b/nlu/base_model/tokenizer_config.json @@ -0,0 +1,4 @@ +{ + "do_lower_case": false, + "vocab_type": "spm" +} diff --git a/nlu/experiments/glue/README.md b/nlu/experiments/glue/README.md new file mode 100644 index 0000000000000000000000000000000000000000..506adeea2ff4d316de24b625ed951e8a2b7f9818 --- /dev/null +++ b/nlu/experiments/glue/README.md @@ -0,0 +1,30 @@ +# GLUE fine-tuning task +To run the experiment, you need to + +run `./mnli.sh` for fine-tuning mnli base model, + +run `./mnli.sh` for fine-tuning mnli large model. + +run `./cola.sh` for fine-tuning cola large model. + +run `./sst2.sh` for fine-tuning sst2 large model. + +run `./stsb.sh` for fine-tuning stsb large model. + +run `./rte.sh` for fine-tuning rte large model. + +run `./qqp.sh` for fine-tuning qqp large model. + +run `./qnli.sh` for fine-tuning qnli large model. + +run `./mrpc.sh` for fine-tuning mrpc large model. + +## Export model to ONNX format and quantization + +To export model to onnx format during evaluation, use argument `--export_ort_model True`. +To export quantized model, use `--fp16 False --export_ort_model True`. +The exported model will be under output folder, and end with +`__onnx_fp16.bin` if fp16 is True, otherwise the outputs will be `__onnx_fp32.bin` and `__onnx_qt.bin`. + + +Please check [ONNX document](https://onnxruntime.ai/docs/performance/quantization.html) for more details. 
diff --git a/nlu/experiments/glue/ax.sh b/nlu/experiments/glue/ax.sh new file mode 100644 index 0000000000000000000000000000000000000000..bdf21fd30ca2049f64b00f94ffa5036a72e9650a --- /dev/null +++ b/nlu/experiments/glue/ax.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +cache_dir=/tmp/DeBERTa/ +base_model=deberta-v3-base +task=AX + +export CUDA_MPS_PIPE_DIRECTORY="${HOME}/mps_pipe" +export CUDA_MPS_LOG_DIRECTORY="${HOME}/mps_log" + +# Optional: Print to verify +echo "MPS Pipe at: $CUDA_MPS_PIPE_DIRECTORY" + +python -m DeBERTa.apps.run --model_config config.json \ + --tag $base_model \ + --do_train \ + --do_eval \ + --do_predict \ + --max_seq_len 64 \ + --dump_interval 100 \ + --num_train_epochs 28 \ + --fp16 True \ + --warmup 100 \ + --learning_rate 8e-4 \ + --train_batch_size 32 \ + --cls_drop_out 0.1 \ + --task_name $task \ + --data_dir $cache_dir/glue_tasks/$task \ + --init_model $base_model \ + --output_dir $cache_dir/outputs/$base_model/$task \ + --eval_batch_size 256 \ + --predict_batch_size 256 \ \ No newline at end of file diff --git a/nlu/experiments/glue/cola.sh b/nlu/experiments/glue/cola.sh new file mode 100644 index 0000000000000000000000000000000000000000..429294384126e814fa7c4b5cc9bba46909a3a543 --- /dev/null +++ b/nlu/experiments/glue/cola.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +cache_dir=/tmp/DeBERTa/ +base_model=deberta-v3-base +task=CoLA + +export CUDA_MPS_PIPE_DIRECTORY="${HOME}/mps_pipe" +export CUDA_MPS_LOG_DIRECTORY="${HOME}/mps_log" + +# Optional: Print to verify +echo "MPS Pipe at: $CUDA_MPS_PIPE_DIRECTORY" + +python -m DeBERTa.apps.run --model_config config.json \ + --tag $base_model \ + --do_train \ + --do_eval \ + --do_predict \ + --max_seq_len 64 \ + --dump_interval 100 \ + --num_train_epochs 34 \ + --fp16 True \ + --warmup 100 \ + --learning_rate 9e-3 \ + --train_batch_size 32 \ + --cls_drop_out 0.1 \ + --task_name $task \ + --data_dir $cache_dir/glue_tasks/$task \ + --init_model $base_model \ + --output_dir $cache_dir/outputs/$base_model/$task \ + --eval_batch_size 256 \ + --predict_batch_size 256 \ diff --git a/nlu/experiments/glue/config.json b/nlu/experiments/glue/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ed5298c61e233b732b91527f7b8c29d78fb2a755 --- /dev/null +++ b/nlu/experiments/glue/config.json @@ -0,0 +1,27 @@ +{ + "pooling": { + "dropout": 0, + "hidden_act": "gelu" + }, + "inject_adapter": "hra", + "hra": { + "r": 8, + "apply_GS": false, + "suffix": ["hra_u"] + }, + "oft": { + "block_size": 16, + "is_coft": true, + "block_share": false, + "eps": 1e-5, + "suffix": ["oft_R"] + }, + "lora": { + "lora_r": 8, + "lora_alpha": 32, + "merge_weights": false, + "lora_dropout": 0, + "suffix": ["lora_A", "lora_B"] + }, + "vocab_size": 128100 +} diff --git a/nlu/experiments/glue/download_data.sh b/nlu/experiments/glue/download_data.sh new file mode 100644 index 0000000000000000000000000000000000000000..200a852b7664b7d1412ba9f1981dca867eaed35c --- /dev/null +++ b/nlu/experiments/glue/download_data.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +cache_dir=$1 +task=$2 +if [[ -z $cache_dir ]]; then + cache_dir=/tmp/DeBERTa/glue +fi + + +mkdir -p $cache_dir +curl -s -J -L https://raw.githubusercontent.com/nyu-mll/jiant/v1.3.2/scripts/download_glue_data.py -o $cache_dir/glue.py +patch $cache_dir/glue.py patch.diff +if [[ -z $task ]]; then + python3 $cache_dir/glue.py --data_dir $cache_dir/ +else + python3 $cache_dir/glue.py --data_dir $cache_dir/ --tasks $task +fi \ No newline at end of file diff --git a/nlu/experiments/glue/glue_submission/AX.tsv 
b/nlu/experiments/glue/glue_submission/AX.tsv new file mode 100644 index 0000000000000000000000000000000000000000..f3c5ef88bc2cd336a8cb621afc2b6b561b57d341 --- /dev/null +++ b/nlu/experiments/glue/glue_submission/AX.tsv @@ -0,0 +1,1105 @@ +index prediction +0 contradiction +1 neutral +2 neutral +3 entailment +4 contradiction +5 entailment +6 contradiction +7 contradiction +8 contradiction +9 neutral +10 neutral +11 entailment +12 contradiction +13 neutral +14 neutral +15 neutral +16 entailment +17 neutral +18 entailment +19 neutral +20 contradiction +21 contradiction +22 entailment +23 contradiction +24 neutral +25 neutral +26 entailment +27 neutral +28 entailment +29 contradiction +30 contradiction +31 neutral +32 contradiction +33 neutral +34 neutral +35 entailment +36 neutral +37 contradiction +38 contradiction +39 contradiction +40 contradiction +41 entailment +42 neutral +43 neutral +44 contradiction +45 contradiction +46 contradiction +47 entailment +48 contradiction +49 entailment +50 contradiction +51 contradiction +52 contradiction +53 neutral +54 entailment +55 contradiction +56 neutral +57 contradiction +58 contradiction +59 entailment +60 contradiction +61 entailment +62 contradiction +63 contradiction +64 contradiction +65 neutral +66 neutral +67 neutral +68 neutral +69 contradiction +70 entailment +71 neutral +72 contradiction +73 contradiction +74 entailment +75 contradiction +76 entailment +77 entailment +78 neutral +79 contradiction +80 contradiction +81 contradiction +82 entailment +83 neutral +84 contradiction +85 neutral +86 entailment +87 entailment +88 neutral +89 neutral +90 contradiction +91 contradiction +92 entailment +93 entailment +94 contradiction +95 contradiction +96 entailment +97 neutral +98 contradiction +99 contradiction +100 contradiction +101 entailment +102 neutral +103 neutral +104 entailment +105 contradiction +106 entailment +107 entailment +108 contradiction +109 entailment +110 entailment +111 entailment +112 entailment +113 neutral +114 entailment +115 entailment +116 neutral +117 entailment +118 neutral +119 contradiction +120 entailment +121 entailment +122 entailment +123 contradiction +124 entailment +125 contradiction +126 contradiction +127 entailment +128 entailment +129 neutral +130 neutral +131 contradiction +132 entailment +133 entailment +134 neutral +135 neutral +136 contradiction +137 neutral +138 entailment +139 contradiction +140 entailment +141 neutral +142 entailment +143 neutral +144 contradiction +145 contradiction +146 neutral +147 entailment +148 contradiction +149 neutral +150 neutral +151 contradiction +152 neutral +153 neutral +154 contradiction +155 entailment +156 neutral +157 neutral +158 entailment +159 neutral +160 entailment +161 entailment +162 entailment +163 entailment +164 entailment +165 neutral +166 entailment +167 entailment +168 entailment +169 neutral +170 neutral +171 contradiction +172 entailment +173 contradiction +174 contradiction +175 entailment +176 contradiction +177 neutral +178 entailment +179 neutral +180 neutral +181 entailment +182 neutral +183 contradiction +184 entailment +185 entailment +186 entailment +187 contradiction +188 contradiction +189 entailment +190 contradiction +191 contradiction +192 entailment +193 neutral +194 contradiction +195 entailment +196 neutral +197 neutral +198 contradiction +199 entailment +200 entailment +201 neutral +202 contradiction +203 neutral +204 contradiction +205 neutral +206 neutral +207 entailment +208 contradiction +209 entailment +210 entailment +211 
neutral +212 entailment +213 neutral +214 contradiction +215 neutral +216 neutral +217 neutral +218 entailment +219 entailment +220 entailment +221 neutral +222 neutral +223 contradiction +224 neutral +225 contradiction +226 entailment +227 neutral +228 neutral +229 entailment +230 neutral +231 contradiction +232 neutral +233 contradiction +234 entailment +235 contradiction +236 contradiction +237 neutral +238 contradiction +239 neutral +240 entailment +241 contradiction +242 entailment +243 entailment +244 entailment +245 entailment +246 neutral +247 entailment +248 neutral +249 entailment +250 entailment +251 entailment +252 neutral +253 neutral +254 neutral +255 contradiction +256 neutral +257 contradiction +258 contradiction +259 entailment +260 neutral +261 neutral +262 entailment +263 contradiction +264 contradiction +265 neutral +266 contradiction +267 entailment +268 neutral +269 contradiction +270 neutral +271 contradiction +272 neutral +273 entailment +274 contradiction +275 neutral +276 contradiction +277 neutral +278 neutral +279 contradiction +280 entailment +281 entailment +282 neutral +283 contradiction +284 contradiction +285 neutral +286 neutral +287 entailment +288 neutral +289 entailment +290 entailment +291 entailment +292 neutral +293 neutral +294 entailment +295 contradiction +296 contradiction +297 entailment +298 neutral +299 neutral +300 neutral +301 contradiction +302 neutral +303 neutral +304 entailment +305 neutral +306 neutral +307 entailment +308 contradiction +309 contradiction +310 entailment +311 neutral +312 neutral +313 contradiction +314 entailment +315 entailment +316 neutral +317 neutral +318 entailment +319 entailment +320 neutral +321 entailment +322 contradiction +323 contradiction +324 entailment +325 contradiction +326 contradiction +327 entailment +328 entailment +329 contradiction +330 neutral +331 neutral +332 contradiction +333 entailment +334 neutral +335 contradiction +336 contradiction +337 contradiction +338 entailment +339 entailment +340 neutral +341 entailment +342 contradiction +343 contradiction +344 neutral +345 contradiction +346 contradiction +347 contradiction +348 entailment +349 neutral +350 entailment +351 neutral +352 contradiction +353 contradiction +354 contradiction +355 contradiction +356 contradiction +357 contradiction +358 entailment +359 neutral +360 entailment +361 entailment +362 neutral +363 entailment +364 neutral +365 entailment +366 neutral +367 neutral +368 entailment +369 entailment +370 contradiction +371 neutral +372 neutral +373 neutral +374 neutral +375 entailment +376 contradiction +377 neutral +378 neutral +379 entailment +380 neutral +381 neutral +382 neutral +383 entailment +384 contradiction +385 neutral +386 contradiction +387 contradiction +388 entailment +389 contradiction +390 contradiction +391 entailment +392 entailment +393 contradiction +394 entailment +395 neutral +396 entailment +397 neutral +398 entailment +399 entailment +400 contradiction +401 contradiction +402 neutral +403 neutral +404 entailment +405 entailment +406 neutral +407 entailment +408 contradiction +409 contradiction +410 entailment +411 neutral +412 contradiction +413 contradiction +414 entailment +415 contradiction +416 contradiction +417 entailment +418 entailment +419 contradiction +420 contradiction +421 neutral +422 entailment +423 contradiction +424 entailment +425 neutral +426 neutral +427 neutral +428 entailment +429 entailment +430 contradiction +431 neutral +432 entailment +433 entailment +434 contradiction +435 
contradiction +436 neutral +437 contradiction +438 entailment +439 contradiction +440 neutral +441 contradiction +442 neutral +443 contradiction +444 neutral +445 entailment +446 entailment +447 neutral +448 contradiction +449 neutral +450 neutral +451 contradiction +452 neutral +453 entailment +454 entailment +455 contradiction +456 contradiction +457 contradiction +458 contradiction +459 contradiction +460 contradiction +461 neutral +462 neutral +463 contradiction +464 neutral +465 contradiction +466 contradiction +467 entailment +468 entailment +469 contradiction +470 neutral +471 neutral +472 entailment +473 contradiction +474 entailment +475 contradiction +476 entailment +477 neutral +478 neutral +479 entailment +480 entailment +481 contradiction +482 contradiction +483 entailment +484 contradiction +485 neutral +486 neutral +487 neutral +488 contradiction +489 entailment +490 neutral +491 entailment +492 entailment +493 neutral +494 entailment +495 contradiction +496 contradiction +497 neutral +498 entailment +499 neutral +500 contradiction +501 entailment +502 entailment +503 entailment +504 entailment +505 contradiction +506 contradiction +507 neutral +508 neutral +509 entailment +510 contradiction +511 contradiction +512 neutral +513 contradiction +514 entailment +515 entailment +516 neutral +517 entailment +518 contradiction +519 neutral +520 contradiction +521 neutral +522 neutral +523 entailment +524 neutral +525 contradiction +526 entailment +527 contradiction +528 entailment +529 contradiction +530 contradiction +531 contradiction +532 neutral +533 neutral +534 contradiction +535 contradiction +536 entailment +537 neutral +538 contradiction +539 contradiction +540 contradiction +541 neutral +542 neutral +543 neutral +544 entailment +545 contradiction +546 neutral +547 contradiction +548 neutral +549 entailment +550 neutral +551 contradiction +552 entailment +553 neutral +554 entailment +555 contradiction +556 entailment +557 neutral +558 entailment +559 entailment +560 entailment +561 entailment +562 entailment +563 entailment +564 neutral +565 entailment +566 contradiction +567 neutral +568 contradiction +569 contradiction +570 neutral +571 entailment +572 entailment +573 entailment +574 neutral +575 contradiction +576 neutral +577 contradiction +578 contradiction +579 contradiction +580 contradiction +581 contradiction +582 contradiction +583 entailment +584 contradiction +585 neutral +586 neutral +587 entailment +588 entailment +589 neutral +590 entailment +591 contradiction +592 contradiction +593 entailment +594 entailment +595 contradiction +596 contradiction +597 contradiction +598 entailment +599 contradiction +600 entailment +601 neutral +602 neutral +603 entailment +604 contradiction +605 contradiction +606 entailment +607 neutral +608 contradiction +609 contradiction +610 entailment +611 neutral +612 entailment +613 neutral +614 entailment +615 neutral +616 entailment +617 contradiction +618 contradiction +619 neutral +620 contradiction +621 entailment +622 neutral +623 neutral +624 contradiction +625 entailment +626 entailment +627 entailment +628 neutral +629 contradiction +630 contradiction +631 contradiction +632 neutral +633 contradiction +634 neutral +635 entailment +636 entailment +637 contradiction +638 contradiction +639 contradiction +640 entailment +641 entailment +642 neutral +643 entailment +644 neutral +645 neutral +646 neutral +647 neutral +648 neutral +649 contradiction +650 contradiction +651 neutral +652 contradiction +653 neutral +654 neutral 
+655 contradiction +656 contradiction +657 entailment +658 entailment +659 contradiction +660 entailment +661 contradiction +662 neutral +663 contradiction +664 entailment +665 contradiction +666 contradiction +667 entailment +668 contradiction +669 contradiction +670 neutral +671 neutral +672 contradiction +673 contradiction +674 entailment +675 neutral +676 contradiction +677 entailment +678 neutral +679 entailment +680 contradiction +681 neutral +682 entailment +683 contradiction +684 entailment +685 neutral +686 entailment +687 contradiction +688 neutral +689 contradiction +690 contradiction +691 entailment +692 contradiction +693 contradiction +694 contradiction +695 neutral +696 contradiction +697 neutral +698 contradiction +699 contradiction +700 entailment +701 neutral +702 contradiction +703 contradiction +704 entailment +705 entailment +706 contradiction +707 contradiction +708 entailment +709 neutral +710 entailment +711 entailment +712 contradiction +713 contradiction +714 entailment +715 neutral +716 neutral +717 entailment +718 neutral +719 neutral +720 neutral +721 contradiction +722 entailment +723 entailment +724 neutral +725 neutral +726 contradiction +727 entailment +728 entailment +729 contradiction +730 contradiction +731 neutral +732 contradiction +733 neutral +734 entailment +735 contradiction +736 entailment +737 contradiction +738 contradiction +739 neutral +740 neutral +741 entailment +742 entailment +743 entailment +744 contradiction +745 neutral +746 neutral +747 neutral +748 neutral +749 neutral +750 neutral +751 neutral +752 entailment +753 contradiction +754 entailment +755 neutral +756 contradiction +757 contradiction +758 entailment +759 contradiction +760 neutral +761 contradiction +762 entailment +763 neutral +764 contradiction +765 neutral +766 neutral +767 entailment +768 contradiction +769 entailment +770 neutral +771 contradiction +772 contradiction +773 entailment +774 contradiction +775 entailment +776 contradiction +777 contradiction +778 entailment +779 contradiction +780 entailment +781 neutral +782 entailment +783 entailment +784 contradiction +785 entailment +786 entailment +787 entailment +788 entailment +789 contradiction +790 contradiction +791 entailment +792 entailment +793 contradiction +794 contradiction +795 contradiction +796 neutral +797 entailment +798 contradiction +799 neutral +800 neutral +801 entailment +802 contradiction +803 contradiction +804 entailment +805 contradiction +806 neutral +807 entailment +808 neutral +809 entailment +810 contradiction +811 entailment +812 entailment +813 entailment +814 entailment +815 neutral +816 neutral +817 neutral +818 neutral +819 entailment +820 neutral +821 entailment +822 entailment +823 contradiction +824 neutral +825 contradiction +826 entailment +827 entailment +828 neutral +829 neutral +830 entailment +831 neutral +832 contradiction +833 entailment +834 entailment +835 neutral +836 entailment +837 contradiction +838 entailment +839 contradiction +840 entailment +841 contradiction +842 entailment +843 neutral +844 entailment +845 entailment +846 contradiction +847 entailment +848 entailment +849 neutral +850 contradiction +851 neutral +852 contradiction +853 contradiction +854 contradiction +855 contradiction +856 contradiction +857 entailment +858 contradiction +859 contradiction +860 contradiction +861 contradiction +862 contradiction +863 contradiction +864 entailment +865 entailment +866 neutral +867 neutral +868 entailment +869 contradiction +870 neutral +871 entailment +872 
neutral +873 contradiction +874 entailment +875 contradiction +876 contradiction +877 contradiction +878 contradiction +879 neutral +880 neutral +881 contradiction +882 contradiction +883 neutral +884 contradiction +885 entailment +886 entailment +887 contradiction +888 contradiction +889 neutral +890 neutral +891 neutral +892 entailment +893 entailment +894 entailment +895 entailment +896 entailment +897 entailment +898 contradiction +899 entailment +900 entailment +901 entailment +902 contradiction +903 entailment +904 entailment +905 entailment +906 entailment +907 contradiction +908 entailment +909 entailment +910 entailment +911 contradiction +912 entailment +913 neutral +914 entailment +915 neutral +916 entailment +917 neutral +918 contradiction +919 contradiction +920 neutral +921 neutral +922 entailment +923 neutral +924 entailment +925 contradiction +926 contradiction +927 contradiction +928 contradiction +929 entailment +930 entailment +931 entailment +932 contradiction +933 contradiction +934 contradiction +935 neutral +936 neutral +937 contradiction +938 entailment +939 neutral +940 entailment +941 neutral +942 entailment +943 contradiction +944 contradiction +945 neutral +946 neutral +947 neutral +948 contradiction +949 contradiction +950 contradiction +951 contradiction +952 entailment +953 neutral +954 entailment +955 neutral +956 neutral +957 neutral +958 neutral +959 neutral +960 neutral +961 contradiction +962 entailment +963 neutral +964 contradiction +965 neutral +966 contradiction +967 entailment +968 entailment +969 neutral +970 contradiction +971 contradiction +972 entailment +973 neutral +974 contradiction +975 contradiction +976 contradiction +977 entailment +978 entailment +979 contradiction +980 neutral +981 entailment +982 contradiction +983 contradiction +984 contradiction +985 neutral +986 neutral +987 contradiction +988 neutral +989 entailment +990 entailment +991 neutral +992 neutral +993 contradiction +994 neutral +995 contradiction +996 neutral +997 contradiction +998 neutral +999 neutral +1000 neutral +1001 neutral +1002 contradiction +1003 contradiction +1004 neutral +1005 neutral +1006 neutral +1007 entailment +1008 contradiction +1009 entailment +1010 entailment +1011 neutral +1012 neutral +1013 neutral +1014 contradiction +1015 neutral +1016 contradiction +1017 contradiction +1018 entailment +1019 entailment +1020 contradiction +1021 contradiction +1022 contradiction +1023 entailment +1024 contradiction +1025 contradiction +1026 entailment +1027 neutral +1028 contradiction +1029 contradiction +1030 neutral +1031 neutral +1032 neutral +1033 entailment +1034 neutral +1035 entailment +1036 entailment +1037 entailment +1038 contradiction +1039 contradiction +1040 entailment +1041 neutral +1042 entailment +1043 contradiction +1044 entailment +1045 entailment +1046 contradiction +1047 contradiction +1048 neutral +1049 neutral +1050 entailment +1051 contradiction +1052 neutral +1053 entailment +1054 neutral +1055 neutral +1056 neutral +1057 contradiction +1058 contradiction +1059 contradiction +1060 entailment +1061 contradiction +1062 neutral +1063 entailment +1064 neutral +1065 contradiction +1066 contradiction +1067 neutral +1068 contradiction +1069 neutral +1070 entailment +1071 contradiction +1072 neutral +1073 entailment +1074 contradiction +1075 entailment +1076 neutral +1077 neutral +1078 entailment +1079 entailment +1080 neutral +1081 entailment +1082 contradiction +1083 entailment +1084 neutral +1085 contradiction +1086 contradiction +1087 
contradiction +1088 neutral +1089 neutral +1090 neutral +1091 entailment +1092 neutral +1093 neutral +1094 entailment +1095 contradiction +1096 contradiction +1097 neutral +1098 neutral +1099 neutral +1100 contradiction +1101 contradiction +1102 contradiction +1103 neutral diff --git a/nlu/experiments/glue/glue_submission/WNLI.tsv b/nlu/experiments/glue/glue_submission/WNLI.tsv new file mode 100644 index 0000000000000000000000000000000000000000..eb860c9ad02cb109de8c6dbb12eb7e4dbae60090 --- /dev/null +++ b/nlu/experiments/glue/glue_submission/WNLI.tsv @@ -0,0 +1,147 @@ +index prediction +0 1 +1 1 +2 0 +3 0 +4 0 +5 1 +6 1 +7 1 +8 1 +9 1 +10 1 +11 1 +12 1 +13 1 +14 0 +15 0 +16 0 +17 0 +18 1 +19 1 +20 1 +21 1 +22 1 +23 0 +24 1 +25 1 +26 1 +27 0 +28 1 +29 0 +30 0 +31 1 +32 0 +33 1 +34 1 +35 0 +36 0 +37 1 +38 0 +39 1 +40 1 +41 0 +42 0 +43 1 +44 1 +45 1 +46 1 +47 0 +48 0 +49 1 +50 0 +51 1 +52 1 +53 0 +54 1 +55 0 +56 1 +57 1 +58 0 +59 1 +60 0 +61 0 +62 1 +63 0 +64 0 +65 1 +66 0 +67 0 +68 1 +69 0 +70 1 +71 0 +72 1 +73 0 +74 0 +75 0 +76 1 +77 1 +78 0 +79 1 +80 0 +81 1 +82 0 +83 1 +84 1 +85 0 +86 1 +87 0 +88 1 +89 1 +90 1 +91 1 +92 1 +93 0 +94 0 +95 1 +96 1 +97 0 +98 0 +99 1 +100 0 +101 1 +102 0 +103 0 +104 1 +105 0 +106 0 +107 0 +108 0 +109 1 +110 1 +111 1 +112 0 +113 1 +114 0 +115 1 +116 1 +117 1 +118 0 +119 1 +120 0 +121 1 +122 0 +123 1 +124 1 +125 1 +126 0 +127 0 +128 0 +129 0 +130 1 +131 1 +132 0 +133 1 +134 1 +135 1 +136 0 +137 0 +138 0 +139 1 +140 1 +141 1 +142 0 +143 1 +144 0 +145 0 diff --git a/nlu/experiments/glue/mnli.sh b/nlu/experiments/glue/mnli.sh new file mode 100644 index 0000000000000000000000000000000000000000..df08fdb62c0666003f6481c4384d58b12e591016 --- /dev/null +++ b/nlu/experiments/glue/mnli.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +cache_dir=/tmp/DeBERTa/ +base_model=deberta-v3-base +task=MNLI + +python -m DeBERTa.apps.run --model_config config.json \ + --tag $base_model \ + --do_train \ + --do_eval \ + --do_predict \ + --max_seq_len 256 \ + --dump_interval 1000 \ + --num_train_epochs 8 \ + --fp16 True \ + --warmup 1000 \ + --learning_rate 2e-3 \ + --train_batch_size 32 \ + --cls_drop_out 0.1 \ + --task_name $task \ + --data_dir $cache_dir/glue_tasks/$task \ + --init_model $base_model \ + --output_dir $cache_dir/outputs/$base_model/$task \ + --eval_batch_size 256 \ + --predict_batch_size 256 \ + diff --git a/nlu/experiments/glue/mrpc.sh b/nlu/experiments/glue/mrpc.sh new file mode 100644 index 0000000000000000000000000000000000000000..9c6ccb5315eb388a06d668b70af9a13ae6c2dee2 --- /dev/null +++ b/nlu/experiments/glue/mrpc.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +cache_dir=/tmp/DeBERTa/ +base_model=deberta-v3-base +task=MRPC + +python -m DeBERTa.apps.run --model_config config.json \ + --tag $base_model \ + --do_train \ + --do_eval \ + --do_predict \ + --max_seq_len 320 \ + --dump_interval 100 \ + --num_train_epochs 60 \ + --fp16 True \ + --warmup 50 \ + --learning_rate 6e-3 \ + --train_batch_size 32 \ + --cls_drop_out 0.1 \ + --task_name $task \ + --data_dir $cache_dir/glue_tasks/$task \ + --init_model $base_model \ + --output_dir $cache_dir/outputs/$base_model/$task \ + --eval_batch_size 256 \ + --predict_batch_size 256 \ diff --git a/nlu/experiments/glue/patch.diff b/nlu/experiments/glue/patch.diff new file mode 100644 index 0000000000000000000000000000000000000000..3dcd78717b1962d7c43634acaa8f8aa30c2c1f11 --- /dev/null +++ b/nlu/experiments/glue/patch.diff @@ -0,0 +1,32 @@ +--- download_glue_data.py 2021-02-01 18:22:04.664290174 -0500 ++++ download_glue_data_fixed.py 2021-02-01 
18:21:13.399941815 -0500 +@@ -31,18 +31,18 @@ + + TASKS = ["CoLA", "SST", "MRPC", "QQP", "STS", "MNLI", "SNLI", "QNLI", "RTE", "WNLI", "diagnostic"] + TASK2PATH = { +- "CoLA": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FCoLA.zip?alt=media&token=46d5e637-3411-4188-bc44-5809b5bfb5f4", # noqa +- "SST": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSST-2.zip?alt=media&token=aabc5f6b-e466-44a2-b9b4-cf6337f84ac8", # noqa +- "MRPC": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2Fmrpc_dev_ids.tsv?alt=media&token=ec5c0836-31d5-48f4-b431-7480817f1adc", # noqa +- "QQP": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FQQP-clean.zip?alt=media&token=11a647cb-ecd3-49c9-9d31-79f8ca8fe277", # noqa +- "STS": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSTS-B.zip?alt=media&token=bddb94a7-8706-4e0d-a694-1109e12273b5", # noqa +- "MNLI": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FMNLI.zip?alt=media&token=50329ea1-e339-40e2-809c-10c40afff3ce", # noqa +- "SNLI": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSNLI.zip?alt=media&token=4afcfbb2-ff0c-4b2d-a09a-dbf07926f4df", # noqa +- "QNLI": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FQNLIv2.zip?alt=media&token=6fdcf570-0fc5-4631-8456-9505272d1601", # noqa +- "RTE": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FRTE.zip?alt=media&token=5efa7e85-a0bb-4f19-8ea2-9e1840f077fb", # noqa +- "WNLI": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FWNLI.zip?alt=media&token=068ad0a0-ded7-4bd7-99a5-5e00222e0faf", # noqa ++ "CoLA": "https://dl.fbaipublicfiles.com/glue/data/CoLA.zip", ++ "SST": "https://dl.fbaipublicfiles.com/glue/data/SST-2.zip", ++ "MRPC": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2Fmrpc_dev_ids.tsv?alt=media&token=ec5c0836-31d5-48f4-b431-7480817f1adc", ++ "QQP": "https://dl.fbaipublicfiles.com/glue/data/QQP-clean.zip", ++ "STS": "https://dl.fbaipublicfiles.com/glue/data/STS-B.zip", ++ "MNLI": "https://dl.fbaipublicfiles.com/glue/data/MNLI.zip", ++ "SNLI": "https://dl.fbaipublicfiles.com/glue/data/SNLI.zip", ++ "QNLI": "https://dl.fbaipublicfiles.com/glue/data/QNLIv2.zip", ++ "RTE": "https://dl.fbaipublicfiles.com/glue/data/RTE.zip", ++ "WNLI": "https://dl.fbaipublicfiles.com/glue/data/WNLI.zip", + "diagnostic": [ +- "https://storage.googleapis.com/mtl-sentence-representations.appspot.com/tsvsWithoutLabels%2FAX.tsv?GoogleAccessId=firebase-adminsdk-0khhl@mtl-sentence-representations.iam.gserviceaccount.com&Expires=2498860800&Signature=DuQ2CSPt2Yfre0C%2BiISrVYrIFaZH1Lc7hBVZDD4ZyR7fZYOMNOUGpi8QxBmTNOrNPjR3z1cggo7WXFfrgECP6FBJSsURv8Ybrue8Ypt%2FTPxbuJ0Xc2FhDi%2BarnecCBFO77RSbfuz%2Bs95hRrYhTnByqu3U%2FYZPaj3tZt5QdfpH2IUROY8LiBXoXS46LE%2FgOQc%2FKN%2BA9SoscRDYsnxHfG0IjXGwHN%2Bf88q6hOmAxeNPx6moDulUF6XMUAaXCSFU%2BnRO2RDL9CapWxj%2BDl7syNyHhB7987hZ80B%2FwFkQ3MEs8auvt5XW1%2Bd4aCU7ytgM69r8JDCwibfhZxpaa4gd50QXQ%3D%3D", # noqa ++ "https://dl.fbaipublicfiles.com/glue/data/AX.tsv", + "https://www.dropbox.com/s/ju7d95ifb072q9f/diagnostic-full.tsv?dl=1", + ], + } diff --git a/nlu/experiments/glue/pseudo.ipynb b/nlu/experiments/glue/pseudo.ipynb new 
file mode 100644 index 0000000000000000000000000000000000000000..f450c32396b7fe99cd863e1aebbc927a0ab3467a --- /dev/null +++ b/nlu/experiments/glue/pseudo.ipynb @@ -0,0 +1,110 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "4dce6d03", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Generating glue_submission/AX.tsv with 1104 samples...\n", + "Generating glue_submission/WNLI.tsv with 146 samples...\n", + "Done! Dummy files correspond to GLUE submission standards.\n" + ] + } + ], + "source": [ + "import os\n", + "import csv\n", + "import random\n", + "\n", + "def generate_glue_dummy_files(output_dir=\"glue_submission\"):\n", + " \"\"\"\n", + " Generates dummy submission files for AX and WNLI tasks \n", + " to satisfy GLUE benchmark submission requirements.\n", + " \"\"\"\n", + " \n", + " # Create the directory if it doesn't exist\n", + " if not os.path.exists(output_dir):\n", + " os.makedirs(output_dir)\n", + " # Log: Directory created\n", + " print(f\"Created directory: {output_dir}\")\n", + "\n", + " # ---------------------------------------------------------\n", + " # 1. Generate AX.tsv (Diagnostic Dataset)\n", + " # Specs: 1104 samples.\n", + " # Format: index (int), prediction (string: entailment/neutral/contradiction)\n", + " # ---------------------------------------------------------\n", + " ax_filename = os.path.join(output_dir, \"AX.tsv\")\n", + " ax_count = 1104\n", + " # NLI labels for submission are typically strings\n", + " ax_labels = [\"entailment\", \"neutral\", \"contradiction\"]\n", + "\n", + " print(f\"Generating {ax_filename} with {ax_count} samples...\")\n", + "\n", + " with open(ax_filename, mode='w', newline='', encoding='utf-8') as f:\n", + " writer = csv.writer(f, delimiter='\\t')\n", + " \n", + " # Write header\n", + " writer.writerow([\"index\", \"prediction\"])\n", + " \n", + " for i in range(ax_count):\n", + " # Pick a random label since we are not actually testing this task\n", + " pred = random.choice(ax_labels)\n", + " writer.writerow([i, pred])\n", + "\n", + " # ---------------------------------------------------------\n", + " # 2. Generate WNLI.tsv (Winograd NLI)\n", + " # Specs: 146 samples.\n", + " # Format: index (int), prediction (int: 0 or 1)\n", + " # ---------------------------------------------------------\n", + " wnli_filename = os.path.join(output_dir, \"WNLI.tsv\")\n", + " wnli_count = 146\n", + " # WNLI labels are typically 0 (not entailment) or 1 (entailment)\n", + " wnli_labels = [0, 1]\n", + "\n", + " print(f\"Generating {wnli_filename} with {wnli_count} samples...\")\n", + "\n", + " with open(wnli_filename, mode='w', newline='', encoding='utf-8') as f:\n", + " writer = csv.writer(f, delimiter='\\t')\n", + " \n", + " # Write header\n", + " writer.writerow([\"index\", \"prediction\"])\n", + " \n", + " for i in range(wnli_count):\n", + " # Pick a random label\n", + " pred = random.choice(wnli_labels)\n", + " writer.writerow([i, pred])\n", + "\n", + " print(\"Done! 
Dummy files correspond to GLUE submission standards.\")\n", + "\n", + "if __name__ == \"__main__\":\n", + " generate_glue_dummy_files()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "allm", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/nlu/experiments/glue/qnli.sh b/nlu/experiments/glue/qnli.sh new file mode 100644 index 0000000000000000000000000000000000000000..8b5ef126c7d42c3730ddab1e442f2da6141190f9 --- /dev/null +++ b/nlu/experiments/glue/qnli.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +cache_dir=/tmp/DeBERTa/ +base_model=deberta-v3-base +task=QNLI + +python -m DeBERTa.apps.run --model_config config.json \ + --tag $base_model \ + --do_train \ + --do_eval \ + --do_predict \ + --max_seq_len 512 \ + --dump_interval 100 \ + --num_train_epochs 12 \ + --fp16 True \ + --warmup 500 \ + --learning_rate 1e-3 \ + --train_batch_size 32 \ + --cls_drop_out 0.1 \ + --task_name $task \ + --data_dir $cache_dir/glue_tasks/$task \ + --init_model $base_model \ + --output_dir $cache_dir/outputs/$base_model/$task \ + --eval_batch_size 256 \ + --predict_batch_size 256 \ \ No newline at end of file diff --git a/nlu/experiments/glue/qqp.sh b/nlu/experiments/glue/qqp.sh new file mode 100644 index 0000000000000000000000000000000000000000..28461ea30575ccdae9df7403edb611a24c9f105f --- /dev/null +++ b/nlu/experiments/glue/qqp.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +cache_dir=/tmp/DeBERTa/ +base_model=deberta-v3-base +task=QQP + +python -m DeBERTa.apps.run --model_config config.json \ + --tag $base_model \ + --do_train \ + --do_eval \ + --do_predict \ + --max_seq_len 320 \ + --dump_interval 500 \ + --num_train_epochs 10 \ + --fp16 True \ + --warmup 1000 \ + --learning_rate 9e-4 \ + --train_batch_size 32 \ + --cls_drop_out 0.1 \ + --task_name $task \ + --data_dir $cache_dir/glue_tasks/$task \ + --init_model $base_model \ + --output_dir $cache_dir/outputs/$base_model/$task \ + --eval_batch_size 256 \ + --predict_batch_size 256 \ + diff --git a/nlu/experiments/glue/rte.sh b/nlu/experiments/glue/rte.sh new file mode 100644 index 0000000000000000000000000000000000000000..e22697a3936935d607ae4a77678bca2934545feb --- /dev/null +++ b/nlu/experiments/glue/rte.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +cache_dir=/tmp/DeBERTa/ +base_model=deberta-v3-base +task=RTE + +python -m DeBERTa.apps.run --model_config config.json \ + --tag $base_model \ + --do_train \ + --do_eval \ + --do_predict \ + --num_train_epochs 4 \ + --dump_interval 100 \ + --fp16 False \ + --warmup 50 \ + --learning_rate 5e-3 \ + --train_batch_size 32 \ + --max_seq_len 320 \ + --cls_drop_out 0.0 \ + --task_name $task \ + --data_dir $cache_dir/glue_tasks/$task \ + --init_model $base_model \ + --output_dir $cache_dir/outputs/$base_model/$task \ + --eval_batch_size 256 \ + --predict_batch_size 256 \ diff --git a/nlu/experiments/glue/sst2.sh b/nlu/experiments/glue/sst2.sh new file mode 100644 index 0000000000000000000000000000000000000000..cb95a88ab9413179ee4ea1517292e597d1500a2c --- /dev/null +++ b/nlu/experiments/glue/sst2.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +cache_dir=/tmp/DeBERTa/ +base_model=deberta-v3-base +task=SST-2 + +python -m DeBERTa.apps.run --model_config config.json \ + --tag $base_model \ + --do_train \ + --do_eval \ + --do_predict \ + 
--max_seq_len 128 \ + --dump_interval 100 \ + --num_train_epochs 16 \ + --fp16 True \ + --warmup 500 \ + --learning_rate 1e-3 \ + --train_batch_size 32 \ + --cls_drop_out 0.1 \ + --task_name $task \ + --data_dir $cache_dir/glue_tasks/$task \ + --init_model $base_model \ + --output_dir $cache_dir/outputs/$base_model/$task \ + --eval_batch_size 256 \ + --predict_batch_size 256 \ \ No newline at end of file diff --git a/nlu/experiments/glue/stsb.sh b/nlu/experiments/glue/stsb.sh new file mode 100644 index 0000000000000000000000000000000000000000..fffe8953538071249c1b84a4bfcca961ab22999b --- /dev/null +++ b/nlu/experiments/glue/stsb.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +cache_dir=/tmp/DeBERTa/ +base_model=deberta-v3-base +task=STS-B + +python -m DeBERTa.apps.run --model_config config.json \ + --tag $base_model \ + --do_train \ + --do_eval \ + --do_predict \ + --max_seq_len 128 \ + --dump_interval 100 \ + --num_train_epochs 39 \ + --fp16 True \ + --warmup 50 \ + --learning_rate 5e-3 \ + --train_batch_size 32 \ + --cls_drop_out 0.1 \ + --task_name $task \ + --data_dir $cache_dir/glue_tasks/$task \ + --init_model $base_model \ + --output_dir $cache_dir/outputs/$base_model/$task \ + --eval_batch_size 256 \ + --predict_batch_size 256 \
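Taken together, the per-task scripts above (mnli.sh through stsb.sh) fine-tune deberta-v3-base on each GLUE task, while pseudo.ipynb fills in the AX and WNLI slots of a submission with dummy predictions. As a rough illustration of how these pieces might be tied together, the sketch below shows an assumed driver script (here called run_glue_all.sh, a name not present in this diff); the use of zip and the expectation that the GLUE site accepts a flat archive of per-task TSVs are likewise assumptions, not something the diff itself specifies.

#!/bin/bash
# run_glue_all.sh -- hypothetical driver, an illustrative sketch only (not part of this diff).
# Assumed to be launched from nlu/experiments/glue/.
set -e

# Fine-tune, evaluate, and predict for each GLUE task with the scripts added above.
for task_script in mnli.sh mrpc.sh qnli.sh qqp.sh rte.sh sst2.sh stsb.sh; do
    bash "$task_script"
done

# Per-task outputs land under /tmp/DeBERTa/outputs/deberta-v3-base/<TASK>/ (per the scripts'
# --output_dir) and would still need to be collected into index/prediction TSVs in
# glue_submission/; how DeBERTa names its prediction files is not shown in this diff.
# The dummy AX.tsv and WNLI.tsv in glue_submission/ come from pseudo.ipynb. Once
# glue_submission/ holds one TSV per task, package it for upload (the flat-zip layout is an
# assumption about the GLUE submission format).
( cd glue_submission && zip ../submission.zip *.tsv )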