diff --git a/.gitattributes b/.gitattributes
index 4ba88ff626c37291ecfec397725f7150be6a5a42..0c703e813ea35a94b633c99a159d1d360399b5f3 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -40,3 +40,5 @@ fairseq/data/data_utils_fast.cpython-38-darwin.so filter=lfs diff=lfs merge=lfs
 fairseq/data/token_block_utils_fast.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
 fairseq/data/token_block_utils_fast.cpython-36m-darwin.so filter=lfs diff=lfs merge=lfs -text
 fairseq/data/token_block_utils_fast.cpython-38-darwin.so filter=lfs diff=lfs merge=lfs -text
+fairseq/data/data_utils_fast.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
+fairseq/data/token_block_utils_fast.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
diff --git a/fairseq/__init__.py b/fairseq/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3dd29637af82124c6f90f23808908c8a884cf88e
--- /dev/null
+++ b/fairseq/__init__.py
@@ -0,0 +1,26 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+__all__ = ['pdb']
+__version__ = '0.9.0'
+
+import sys
+
+# Backwards compatibility: alias the fairseq.logging submodules in sys.modules so that e.g. `from fairseq.meters import AverageMeter` keeps working.
+from fairseq.logging import meters, metrics, progress_bar  # noqa
+sys.modules['fairseq.meters'] = meters
+sys.modules['fairseq.metrics'] = metrics
+sys.modules['fairseq.progress_bar'] = progress_bar
+
+import fairseq.criterions  # noqa
+import fairseq.models  # noqa
+import fairseq.modules  # noqa
+import fairseq.optim  # noqa
+import fairseq.optim.lr_scheduler  # noqa
+import fairseq.pdb  # noqa
+import fairseq.tasks  # noqa
+
+import fairseq.benchmark  # noqa
+import fairseq.model_parallel  # noqa
diff --git a/fairseq/__pycache__/__init__.cpython-310.pyc b/fairseq/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b8241ad57ba17aea696541e222165f0790728443
Binary files /dev/null and b/fairseq/__pycache__/__init__.cpython-310.pyc differ
diff --git a/fairseq/__pycache__/binarizer.cpython-310.pyc b/fairseq/__pycache__/binarizer.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8ea1931f58fd8c52e625c0779d761e8a7b36b7b7
Binary files /dev/null and b/fairseq/__pycache__/binarizer.cpython-310.pyc differ
diff --git a/fairseq/__pycache__/checkpoint_utils.cpython-310.pyc b/fairseq/__pycache__/checkpoint_utils.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e92332763ecbb6a3bfd414775e19138e9f75f8b3
Binary files /dev/null and b/fairseq/__pycache__/checkpoint_utils.cpython-310.pyc differ
diff --git a/fairseq/__pycache__/distributed_utils.cpython-310.pyc b/fairseq/__pycache__/distributed_utils.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2b41f7e49a7312bd8db387d16228a4f640788e30
Binary files /dev/null and b/fairseq/__pycache__/distributed_utils.cpython-310.pyc differ
diff --git a/fairseq/__pycache__/file_io.cpython-310.pyc b/fairseq/__pycache__/file_io.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e2a673b2d85b90376a2a9f92df8c6c7ccaac7703
Binary files /dev/null and b/fairseq/__pycache__/file_io.cpython-310.pyc differ
diff --git a/fairseq/__pycache__/file_utils.cpython-310.pyc b/fairseq/__pycache__/file_utils.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9f88dd582b19fb3111e2fbc19e4f16b1ce9dc6c0
Binary files /dev/null and b/fairseq/__pycache__/file_utils.cpython-310.pyc differ
diff --git a/fairseq/__pycache__/incremental_decoding_utils.cpython-310.pyc b/fairseq/__pycache__/incremental_decoding_utils.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8e0294f6eb2024b858edc021f1866e6997be497d
Binary files /dev/null and b/fairseq/__pycache__/incremental_decoding_utils.cpython-310.pyc differ
diff --git a/fairseq/__pycache__/iterative_refinement_generator.cpython-310.pyc b/fairseq/__pycache__/iterative_refinement_generator.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..61134cc3cbc4d592b71d47b469c11ea527bed980
Binary files /dev/null and b/fairseq/__pycache__/iterative_refinement_generator.cpython-310.pyc differ
diff --git a/fairseq/__pycache__/legacy_distributed_data_parallel.cpython-310.pyc b/fairseq/__pycache__/legacy_distributed_data_parallel.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ec36601deb4d0c64e423d3c6982e7ab57ec2c0fb
Binary files /dev/null and b/fairseq/__pycache__/legacy_distributed_data_parallel.cpython-310.pyc differ
diff --git a/fairseq/__pycache__/options.cpython-310.pyc b/fairseq/__pycache__/options.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..08c0f766d505ea734bc5a5c1e54767f0ab621b33
Binary files /dev/null and b/fairseq/__pycache__/options.cpython-310.pyc differ
diff --git a/fairseq/__pycache__/pdb.cpython-310.pyc b/fairseq/__pycache__/pdb.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f3b9177cc9438cbf2c9fe93865fcd4bf07ebe966
Binary files /dev/null and b/fairseq/__pycache__/pdb.cpython-310.pyc differ
diff --git a/fairseq/__pycache__/registry.cpython-310.pyc b/fairseq/__pycache__/registry.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..941bc2769852ba09c756b44050b4c31d65555772
Binary files /dev/null and b/fairseq/__pycache__/registry.cpython-310.pyc differ
diff --git a/fairseq/__pycache__/search.cpython-310.pyc b/fairseq/__pycache__/search.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..03f9b5355d4a116eca6be5523a898a80de56b324
Binary files /dev/null and b/fairseq/__pycache__/search.cpython-310.pyc differ
diff --git a/fairseq/__pycache__/sequence_generator.cpython-310.pyc b/fairseq/__pycache__/sequence_generator.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..56f558e5a3f3fe68973956bd0f2ae1b7d51d46ad
Binary files /dev/null and b/fairseq/__pycache__/sequence_generator.cpython-310.pyc differ
diff --git a/fairseq/__pycache__/tokenizer.cpython-310.pyc b/fairseq/__pycache__/tokenizer.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a404b3458b631f92381cb511102f523546e410d8
Binary files /dev/null and b/fairseq/__pycache__/tokenizer.cpython-310.pyc differ
diff --git a/fairseq/__pycache__/utils.cpython-310.pyc b/fairseq/__pycache__/utils.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3555960e34025d6077ac0fa271e452c209693662
Binary files /dev/null and b/fairseq/__pycache__/utils.cpython-310.pyc differ
diff --git a/fairseq/benchmark/__init__.py b/fairseq/benchmark/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..926f3ce73975745fbe7e15f307b44761228ec46e
--- /dev/null
+++ b/fairseq/benchmark/__init__.py
@@ -0,0 +1,12 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+# import models/tasks to register them
+from . import (  # noqa
+    dummy_lm,
+    dummy_masked_lm,
+    dummy_model,
+    dummy_mt,
+)
diff --git a/fairseq/benchmark/__pycache__/__init__.cpython-310.pyc b/fairseq/benchmark/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ad26a8b829742ee2cc12f10eea6150c29e3a62e0
Binary files /dev/null and b/fairseq/benchmark/__pycache__/__init__.cpython-310.pyc differ
diff --git a/fairseq/benchmark/__pycache__/dummy_lm.cpython-310.pyc b/fairseq/benchmark/__pycache__/dummy_lm.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cf38e95297004a51b01faa88add542a56961dea3
Binary files /dev/null and b/fairseq/benchmark/__pycache__/dummy_lm.cpython-310.pyc differ
diff --git a/fairseq/benchmark/__pycache__/dummy_masked_lm.cpython-310.pyc b/fairseq/benchmark/__pycache__/dummy_masked_lm.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..91012312f5e6fd43b716aee6f87f35db92341a00
Binary files /dev/null and b/fairseq/benchmark/__pycache__/dummy_masked_lm.cpython-310.pyc differ
diff --git a/fairseq/benchmark/__pycache__/dummy_model.cpython-310.pyc b/fairseq/benchmark/__pycache__/dummy_model.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ca36f6ca77c7303cd4c725c739f67dfa3881d420
Binary files /dev/null and b/fairseq/benchmark/__pycache__/dummy_model.cpython-310.pyc differ
diff --git a/fairseq/benchmark/__pycache__/dummy_mt.cpython-310.pyc b/fairseq/benchmark/__pycache__/dummy_mt.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6b126e3e7b989effaa4fe4b32f9d2033a9cc0025
Binary files /dev/null and b/fairseq/benchmark/__pycache__/dummy_mt.cpython-310.pyc differ
diff --git a/fairseq/benchmark/dummy_lm.py b/fairseq/benchmark/dummy_lm.py
new file mode 100644
index 0000000000000000000000000000000000000000..92e9dc8df556ee41761065b476a7f017d1a2fe45
--- /dev/null
+++ b/fairseq/benchmark/dummy_lm.py
@@ -0,0 +1,118 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+
+import numpy as np
+import torch
+
+from fairseq.data import Dictionary, FairseqDataset
+from fairseq.tasks import FairseqTask, register_task
+
+
+logger = logging.getLogger(__name__)
+
+
+@register_task('dummy_lm')
+class DummyLMTask(FairseqTask):
+    """Benchmark task that serves one precomputed language-modeling batch."""
+    @staticmethod
+    def add_args(parser):
+        """Register the options that size the synthetic vocabulary and data."""
+        parser.add_argument('--dict-size', type=int, default=49996)
+        parser.add_argument('--dataset-size', type=int, default=100000)
+        parser.add_argument('--tokens-per-sample', type=int, default=512,
+                            help='max number of total tokens over all segments '
+                                 'per sample for BERT dataset')
+
+    def __init__(self, args, dictionary):
+        super().__init__(args)
+        self.seed = args.seed
+        self.dictionary = dictionary
+        # often faster if the dictionary size is divisible by 8
+        dictionary.pad_to_multiple_(8)
+        # ids form a ramp that starts right after the pad index; one extra
+        # element is generated so source and target can be offset by one.
+        first_id = dictionary.pad() + 1
+        ramp = torch.arange(args.tokens_per_sample + 1) + first_id
+        self.dummy_src, self.dummy_tgt = ramp[:-1], ramp[1:]
+
+    @classmethod
+    def setup_task(cls, args, **kwargs):
+        """Fill a Dictionary with `args.dict_size` placeholder words; build the task."""
+        vocab = Dictionary()
+        for word_id in range(args.dict_size):
+            vocab.add_symbol('word{}'.format(word_id))
+        logger.info('dictionary: {} types'.format(len(vocab)))
+        return cls(args, vocab)
+
+    def load_dataset(self, split, epoch=1, combine=False, **kwargs):
+        """Store a DummyDataset built around one fixed batch under `split`.
+
+        Batch size is `max_sentences` if set, else `max_tokens` divided by
+        `tokens_per_sample` (floored, at least 1).
+        """
+        opts = self.args
+        seq_len = opts.tokens_per_sample
+        bsz = opts.max_sentences
+        if bsz is None:
+            bsz = max(1, opts.max_tokens // seq_len)
+        net_input = {
+            'src_tokens': torch.stack([self.dummy_src] * bsz),
+            'src_lengths': torch.full((bsz, ), seq_len, dtype=torch.long),
+        }
+        batch = {
+            'id': 1,
+            'net_input': net_input,
+            'target': torch.stack([self.dummy_tgt] * bsz),
+            'nsentences': bsz,
+            'ntokens': bsz * seq_len,
+        }
+        self.datasets[split] = DummyDataset(
+            batch, num_items=opts.dataset_size, item_size=seq_len,
+        )
+
+    @property
+    def source_dictionary(self):  # inputs use the task's single dictionary
+        return self.dictionary
+
+    @property
+    def target_dictionary(self):  # targets use that same dictionary
+        return self.dictionary
+
+
+class DummyDataset(FairseqDataset):
+    """Dataset stub: `num_items` placeholder items and one fixed collated batch."""
+    def __init__(self, batch, num_items, item_size):
+        super().__init__()
+        self.batch = batch  # the prebuilt batch handed out by every collater() call
+        self.num_items = num_items  # length reported by __len__
+        self.item_size = item_size  # size reported for every item
+
+    def __getitem__(self, index):  # items carry no data; the index is returned as-is
+        return index
+
+    def __len__(self):
+        return self.num_items
+
+    def collater(self, samples):  # `samples` is ignored; the stored batch is returned
+        return self.batch
+
+    @property
+    def sizes(self):  # array of length num_items, every entry equal to item_size
+        return np.full(self.num_items, self.item_size)  # no intermediate Python list
+
+    def num_tokens(self, index):  # identical for every index
+        return self.item_size
+
+    def size(self, index):  # identical for every index
+        return self.item_size
+
+    def ordered_indices(self):  # natural order 0 .. num_items - 1
+        return np.arange(self.num_items)
+
+    @property
+    def supports_prefetch(self):
+        return False
diff --git a/fairseq/benchmark/dummy_masked_lm.py b/fairseq/benchmark/dummy_masked_lm.py
new file mode 100644
index 0000000000000000000000000000000000000000..f2e459caa2c247e7babce1c0d0f9390106ac307c
--- /dev/null
+++ b/fairseq/benchmark/dummy_masked_lm.py
@@ -0,0 +1,127 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+
+import numpy as np
+import torch
+
+from fairseq.data import Dictionary, FairseqDataset
+from fairseq.tasks import FairseqTask, register_task
+
+
+logger = logging.getLogger(__name__)
+
+
+@register_task('dummy_masked_lm')
+class DummyMaskedLMTask(FairseqTask):
+    """Benchmark task that serves one precomputed masked-LM batch."""
+    @staticmethod
+    def add_args(parser):
+        """Register the options that size the synthetic vocabulary and data."""
+        parser.add_argument('--dict-size', type=int, default=49995)
+        parser.add_argument('--dataset-size', type=int, default=100000)
+        parser.add_argument('--tokens-per-sample', type=int, default=512,
+                            help='max number of total tokens over all segments '
+                                 'per sample for BERT dataset')
+
+    def __init__(self, args, dictionary):
+        super().__init__(args)
+        self.seed = args.seed
+        self.dictionary = dictionary
+        self.mask_idx = dictionary.add_symbol('<mask>')  # add mask token
+        dictionary.pad_to_multiple_(8)  # often faster if divisible by 8
+
+        # NOTE(review): the batch below uses the literal ids 0 and 1, not
+        # self.mask_idx / dictionary.pad() -- confirm this is intended.
+        masked_id, ignored_id = 0, 1
+        length = args.tokens_per_sample
+        ramp = torch.arange(length) + ignored_id + 1
+        picked = torch.arange(2, length, 7)  # every 7th position, ~15%
+
+        # Source: the ramp with the picked positions overwritten by masked_id.
+        self.dummy_src = ramp.clone()
+        self.dummy_src[picked] = masked_id
+        # Target: ignored_id everywhere except the original ids at picked.
+        self.dummy_tgt = torch.full_like(ramp, ignored_id)
+        self.dummy_tgt[picked] = ramp[picked]
+
+    @classmethod
+    def setup_task(cls, args, **kwargs):
+        """Fill a Dictionary with `args.dict_size` placeholder words; build the task."""
+        vocab = Dictionary()
+        for word_id in range(args.dict_size):
+            vocab.add_symbol('word{}'.format(word_id))
+        logger.info('dictionary: {} types'.format(len(vocab)))
+        return cls(args, vocab)
+
+    def load_dataset(self, split, epoch=1, combine=False, **kwargs):
+        """Store a DummyDataset built around one fixed batch under `split`.
+
+        Batch size is `max_sentences` if set, else `max_tokens` divided by
+        `tokens_per_sample` (floored, at least 1).
+        """
+        opts = self.args
+        seq_len = opts.tokens_per_sample
+        bsz = opts.max_sentences
+        if bsz is None:
+            bsz = max(1, opts.max_tokens // seq_len)
+        net_input = {
+            'src_tokens': torch.stack([self.dummy_src] * bsz),
+            'src_lengths': torch.full((bsz, ), seq_len, dtype=torch.long),
+        }
+        batch = {
+            'id': 1,
+            'net_input': net_input,
+            'target': torch.stack([self.dummy_tgt] * bsz),
+            'nsentences': bsz,
+            'ntokens': bsz * seq_len,
+        }
+        self.datasets[split] = DummyDataset(
+            batch, num_items=opts.dataset_size, item_size=seq_len,
+        )
+
+    @property
+    def source_dictionary(self):  # inputs use the task's single dictionary
+        return self.dictionary
+
+    @property
+    def target_dictionary(self):  # targets use that same dictionary
+        return self.dictionary
+
+
+class DummyDataset(FairseqDataset):
+    """Dataset stub: `num_items` placeholder items and one fixed collated batch."""
+    def __init__(self, batch, num_items, item_size):
+        super().__init__()
+        self.batch = batch  # the prebuilt batch handed out by every collater() call
+        self.num_items = num_items  # length reported by __len__
+        self.item_size = item_size  # size reported for every item
+
+    def __getitem__(self, index):  # items carry no data; the index is returned as-is
+        return index
+
+    def __len__(self):
+        return self.num_items
+
+    def collater(self, samples):  # `samples` is ignored; the stored batch is returned
+        return self.batch
+
+    @property
+    def sizes(self):  # array of length num_items, every entry equal to item_size
+        return np.full(self.num_items, self.item_size)  # no intermediate Python list
+
+    def num_tokens(self, index):  # identical for every index
+        return self.item_size
+
+    def size(self, index):  # identical for every index
+        return self.item_size
+
+    def ordered_indices(self):  # natural order 0 .. num_items - 1
+        return np.arange(self.num_items)
+
+    @property
+    def supports_prefetch(self):
+        return False
diff --git a/fairseq/benchmark/dummy_model.py b/fairseq/benchmark/dummy_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..817cdb34bb2dcdc8a49dd13328c9d020b4e9d03c
--- /dev/null
+++ b/fairseq/benchmark/dummy_model.py
@@ -0,0 +1,95 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch.nn as nn
+import torch.nn.functional as F
+
+from fairseq.data import Dictionary
+from fairseq.models import (
+    FairseqDecoder,
+    FairseqLanguageModel,
+    register_model,
+    register_model_architecture,
+)
+
+
+@register_model('dummy_model')
+class DummyModel(FairseqLanguageModel):
+    """Benchmark language model built around a DummyEncoder layer stack."""
+    def __init__(self, args, encoder):
+        super().__init__(encoder)
+        self.args = args
+
+    @staticmethod
+    def add_args(parser):  # depth and width options of the dummy network
+        for flag, value in (('--num-layers', 24), ('--embed-dim', 1024)):
+            parser.add_argument(flag, type=int, default=value)
+
+    @classmethod
+    def build_model(cls, args, task):
+        """Size a DummyEncoder from the task's target dictionary and `args`."""
+        network = DummyEncoder(
+            num_embed=len(task.target_dictionary),
+            embed_dim=args.embed_dim, num_layers=args.num_layers,
+        )
+        return cls(args, network)
+
+    def forward(self, src_tokens, masked_tokens=None, **kwargs):  # extra kwargs are not forwarded
+        return self.decoder(src_tokens, masked_tokens=masked_tokens)
+
+
+class DummyEncoder(FairseqDecoder):  # embedding -> num_layers pairs of residual blocks -> vocabulary projection
+
+    def __init__(self, num_embed=50000, embed_dim=1024, num_layers=24):
+        super().__init__(Dictionary())  # the base class is given a newly constructed Dictionary
+        self.embed = nn.Embedding(
+            num_embeddings=num_embed, embedding_dim=embed_dim, padding_idx=0
+        )
+        self.layers_a = nn.ModuleList([  # attention-shaped blocks: projections only, no attention op
+            nn.Sequential(
+                nn.LayerNorm(embed_dim),
+                nn.Linear(embed_dim, 3*embed_dim),  # q, k, v input projection
+                nn.Linear(3*embed_dim, embed_dim),  # skip self-attention
+                nn.Linear(embed_dim, embed_dim),    # output projection
+                nn.Dropout(),  # NOTE(review): no rate given here, while layers_b uses 0.1 -- confirm intended
+            )
+            for i in range(num_layers)
+        ])
+        self.layers_b = nn.ModuleList([  # feed-forward blocks with a 4x wider hidden layer
+            nn.Sequential(
+                nn.LayerNorm(embed_dim),
+                nn.Linear(embed_dim, 4*embed_dim),  # FFN
+                nn.ReLU(),
+                nn.Linear(4*embed_dim, embed_dim),  # FFN
+                nn.Dropout(0.1),
+            )
+            for i in range(num_layers)
+        ])
+        self.out_proj = nn.Linear(embed_dim, num_embed)  # maps features to one score per embedding row
+
+    def forward(self, tokens, masked_tokens=None):  # returns a 1-tuple holding the projected output
+        x = self.embed(tokens)
+        for layer_a, layer_b in zip(self.layers_a, self.layers_b):
+            x = x + layer_a(x)  # residual connection around each block
+            x = x + layer_b(x)
+        x = self.out_proj(x)
+        if masked_tokens is not None:
+            x = x[masked_tokens]  # keep only the entries selected by masked_tokens
+        return (x,)
+
+    def max_positions(self):  # fixed limit, independent of the constructor arguments
+        return 1024
+
+    def get_normalized_probs(self, net_output, log_probs, sample=None):  # `sample` is unused
+        logits = net_output[0].float()  # first element of the forward() tuple, cast to float32
+        if log_probs:
+            return F.log_softmax(logits, dim=-1)
+        else:
+            return F.softmax(logits, dim=-1)
+
+
+@register_model_architecture('dummy_model', 'dummy_model')
+def base_architecture(args):
+    """Leave `args` unchanged; this architecture sets no values of its own."""
diff --git a/fairseq/benchmark/dummy_mt.py b/fairseq/benchmark/dummy_mt.py
new file mode 100644
index 0000000000000000000000000000000000000000..09f2f0c1192636d402bfd1a4d7416e08a74fad83
--- /dev/null
+++ b/fairseq/benchmark/dummy_mt.py
@@ -0,0 +1,120 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+
+import numpy as np
+import torch
+
+from fairseq.data import Dictionary, FairseqDataset
+from fairseq.tasks import FairseqTask, register_task
+
+
+logger = logging.getLogger(__name__)
+
+
+@register_task('dummy_mt')
+class DummyMTTask(FairseqTask):
+    """Benchmark task that serves one precomputed translation-style batch."""
+    @staticmethod
+    def add_args(parser):
+        """Register the options that size the synthetic vocabulary and data."""
+        parser.add_argument('--dict-size', type=int, default=49996)
+        parser.add_argument('--dataset-size', type=int, default=100000)
+        parser.add_argument('--tokens-per-sample', type=int, default=512,
+                            help='max number of total tokens over all segments '
+                                 'per sample for BERT dataset')
+
+    def __init__(self, args, dictionary):
+        super().__init__(args)
+        self.seed = args.seed
+        self.dictionary = dictionary
+        # often faster if the dictionary size is divisible by 8
+        dictionary.pad_to_multiple_(8)
+        # ids form a ramp that starts right after the pad index; one extra
+        # element is generated so source and target can be offset by one.
+        first_id = dictionary.pad() + 1
+        ramp = torch.arange(args.tokens_per_sample + 1) + first_id
+        self.dummy_src, self.dummy_tgt = ramp[:-1], ramp[1:]
+
+    @classmethod
+    def setup_task(cls, args, **kwargs):
+        """Fill a Dictionary with `args.dict_size` placeholder words; build the task."""
+        vocab = Dictionary()
+        for word_id in range(args.dict_size):
+            vocab.add_symbol('word{}'.format(word_id))
+        logger.info('dictionary: {} types'.format(len(vocab)))
+        return cls(args, vocab)
+
+    def load_dataset(self, split, epoch=1, combine=False, **kwargs):
+        """Store a DummyDataset built around one fixed batch under `split`.
+
+        Batch size is `max_sentences` if set, else `max_tokens` divided by
+        `tokens_per_sample` (floored, at least 1).
+        """
+        opts = self.args
+        seq_len = opts.tokens_per_sample
+        bsz = opts.max_sentences
+        if bsz is None:
+            bsz = max(1, opts.max_tokens // seq_len)
+        target = torch.stack([self.dummy_tgt] * bsz)
+        net_input = {
+            'src_tokens': torch.stack([self.dummy_src] * bsz),
+            'src_lengths': torch.full((bsz, ), seq_len, dtype=torch.long),
+            'prev_output_tokens': target.clone(),  # a copy of the target ids
+        }
+        batch = {
+            'id': 1,
+            'net_input': net_input,
+            'target': target,
+            'nsentences': bsz,
+            'ntokens': bsz * seq_len,
+        }
+        self.datasets[split] = DummyDataset(
+            batch, num_items=opts.dataset_size, item_size=seq_len,
+        )
+
+    @property
+    def source_dictionary(self):  # inputs use the task's single dictionary
+        return self.dictionary
+
+    @property
+    def target_dictionary(self):  # targets use that same dictionary
+        return self.dictionary
+
+
+class DummyDataset(FairseqDataset):
+    """Dataset stub: `num_items` placeholder items and one fixed collated batch."""
+    def __init__(self, batch, num_items, item_size):
+        super().__init__()
+        self.batch = batch  # the prebuilt batch handed out by every collater() call
+        self.num_items = num_items  # length reported by __len__
+        self.item_size = item_size  # size reported for every item
+
+    def __getitem__(self, index):  # items carry no data; the index is returned as-is
+        return index
+
+    def __len__(self):
+        return self.num_items
+
+    def collater(self, samples):  # `samples` is ignored; the stored batch is returned
+        return self.batch
+
+    @property
+    def sizes(self):  # array of length num_items, every entry equal to item_size
+        return np.full(self.num_items, self.item_size)  # no intermediate Python list
+
+    def num_tokens(self, index):  # identical for every index
+        return self.item_size
+
+    def size(self, index):  # identical for every index
+        return self.item_size
+
+    def ordered_indices(self):  # natural order 0 .. num_items - 1
+        return np.arange(self.num_items)
+
+    @property
+    def supports_prefetch(self):
+        return False
diff --git a/fairseq/binarizer.py b/fairseq/binarizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec3b90f211af7a8308f076389944f8a7184279ea
--- /dev/null
+++ b/fairseq/binarizer.py
@@ -0,0 +1,104 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import os
+from collections import Counter
+
+from fairseq.tokenizer import tokenize_line
+import torch
+from fairseq.file_io import PathManager
+
+def safe_readline(f):  # readline() that retries from earlier positions on UnicodeDecodeError
+    pos = f.tell()  # position the read was first attempted from
+    while True:
+        try:
+            return f.readline()
+        except UnicodeDecodeError:
+            pos -= 1  # back up one position per failed attempt
+            f.seek(pos)  # search where this character begins
+
+
+class Binarizer:  # static helpers that turn text files (or ranges of them) into id sequences
+    @staticmethod
+    def binarize(
+        filename, dict, consumer, tokenize=tokenize_line, append_eos=True,
+        reverse_order=False, offset=0, end=-1, already_numberized=False,
+    ):
+        """Turn each line of `filename` into a tensor of ids and pass it to `consumer`.
+
+        Reading starts at position `offset`; if `end` > 0, a line whose end lies
+        past `end` is not processed and reading stops. With `already_numberized`
+        a line is parsed as whitespace-separated ints, else via `dict.encode_line`.
+        Returns a dict with "nseq", "nunk", "ntok" and the "replaced" Counter.
+        """
+        nseq, ntok = 0, 0
+        replaced = Counter()  # words that were encoded as the unknown index
+
+        def replaced_consumer(word, idx):
+            if idx == dict.unk_index and word != dict.unk_word:
+                replaced.update([word])
+
+        with open(PathManager.get_local_path(filename), "r", encoding="utf-8") as f:
+            f.seek(offset)
+            # next(f) breaks f.tell(), hence readline() must be used
+            line = safe_readline(f)
+            while line:
+                if end > 0 and f.tell() > end:
+                    break
+                if already_numberized:
+                    id_strings = line.strip().split()
+                    id_list = [int(id_string) for id_string in id_strings]
+                    if reverse_order:
+                        id_list.reverse()
+                    if append_eos:
+                        id_list.append(dict.eos())
+                    ids = torch.IntTensor(id_list)
+                else:
+                    ids = dict.encode_line(
+                        line=line,
+                        line_tokenizer=tokenize,
+                        add_if_not_exist=False,
+                        consumer=replaced_consumer,
+                        append_eos=append_eos,
+                        reverse_order=reverse_order,
+                    )
+                nseq += 1
+                ntok += len(ids)
+                consumer(ids)
+                line = f.readline()
+        return {
+            "nseq": nseq,
+            "nunk": sum(replaced.values()),
+            "ntok": ntok,
+            "replaced": replaced,
+        }
+
+    @staticmethod
+    def binarize_alignments(filename, alignment_parser, consumer, offset=0, end=-1):
+        """Pass `alignment_parser(line)` to `consumer` per line of the range; return {"nseq": n}."""
+        nseq = 0
+        with open(PathManager.get_local_path(filename), "r", encoding="utf-8") as f:  # same encoding as find_offsets()
+            f.seek(offset)
+            line = safe_readline(f)
+            while line:
+                if end > 0 and f.tell() > end:
+                    break
+                ids = alignment_parser(line)
+                nseq += 1
+                consumer(ids)
+                line = f.readline()
+        return {"nseq": nseq}
+
+    @staticmethod
+    def find_offsets(filename, num_chunks):
+        """Return `num_chunks + 1` offsets; the inner ones are advanced past the line they fall in."""
+        with open(PathManager.get_local_path(filename), "r", encoding="utf-8") as f:
+            chunk_size = os.fstat(f.fileno()).st_size // num_chunks
+            offsets = [0] * (num_chunks + 1)  # first and last entries stay 0
+            for i in range(1, num_chunks):
+                f.seek(chunk_size * i)
+                safe_readline(f)  # consume the remainder of the current line
+                offsets[i] = f.tell()
+            return offsets
diff --git a/fairseq/checkpoint_utils.py b/fairseq/checkpoint_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..af21db929ffef0c01599b87fd19431c316d6d26b
--- /dev/null
+++ b/fairseq/checkpoint_utils.py
@@ -0,0 +1,522 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import collections
+import logging
+import os
+import re
+import traceback
+from collections import OrderedDict
+from typing import Union
+
+import torch
+from fairseq.file_io import PathManager
+from fairseq.models import FairseqDecoder, FairseqEncoder
+from torch.serialization import default_restore_location
+
+
+logger = logging.getLogger(__name__)
+
+
+def save_checkpoint(args, trainer, epoch_itr, val_loss):
+    """Write the checkpoint files selected by `args` for this step and prune old ones."""
+    from fairseq import distributed_utils, meters
+    # only one worker should attempt to create the required dir
+    if args.distributed_rank == 0:
+        os.makedirs(args.save_dir, exist_ok=True)
+
+    prev_best = getattr(save_checkpoint, "best", val_loss)  # best score lives on the function object
+    if val_loss is not None:
+        best_function = max if args.maximize_best_checkpoint_metric else min
+        save_checkpoint.best = best_function(val_loss, prev_best)
+
+    if args.no_save or not trainer.is_data_parallel_master:  # nothing below runs on other workers
+        return
+
+    def is_better(a, b):
+        return a >= b if args.maximize_best_checkpoint_metric else a <= b
+
+    write_timer = meters.StopwatchMeter()
+    write_timer.start()
+
+    epoch = epoch_itr.epoch
+    end_of_epoch = epoch_itr.end_of_epoch()
+    updates = trainer.get_num_updates()
+
+    suffix = getattr(args, "checkpoint_suffix", "")
+    checkpoint_conds = collections.OrderedDict()  # file name -> whether to write it now
+    checkpoint_conds["checkpoint{}{}.pt".format(epoch, suffix)] = (
+        end_of_epoch
+        and not args.no_epoch_checkpoints
+        and epoch % args.save_interval == 0
+    )
+    checkpoint_conds["checkpoint_{}_{}{}.pt".format(epoch, updates, suffix)] = (
+        not end_of_epoch
+        and args.save_interval_updates > 0
+        and updates % args.save_interval_updates == 0
+    )
+    checkpoint_conds["checkpoint_best{}.pt".format(suffix)] = val_loss is not None and (
+        not hasattr(save_checkpoint, "best")
+        or is_better(val_loss, save_checkpoint.best)
+    )
+    if val_loss is not None and args.keep_best_checkpoints > 0:
+        checkpoint_conds["checkpoint.best_{}_{:.2f}.pt".format(
+            args.best_checkpoint_metric, val_loss)] = (
+            not hasattr(save_checkpoint, "best")
+            or is_better(val_loss, save_checkpoint.best)
+        )
+    checkpoint_conds["checkpoint_last{}.pt".format(suffix)] = not args.no_last_checkpoints
+
+    extra_state = {"train_iterator": epoch_itr.state_dict(), "val_loss": val_loss}
+    if hasattr(save_checkpoint, "best"):
+        extra_state.update({"best": save_checkpoint.best})
+
+    checkpoints = [
+        os.path.join(args.save_dir, fn) for fn, cond in checkpoint_conds.items() if cond
+    ]
+    if len(checkpoints) > 0:
+        trainer.save_checkpoint(checkpoints[0], extra_state)  # written once ...
+        for cp in checkpoints[1:]:
+            PathManager.copy(checkpoints[0], cp, overwrite=True)  # ... then copied to the other names
+
+        write_timer.stop()
+        logger.info(
+            "saved checkpoint {} (epoch {} @ {} updates, score {}) (writing took {} seconds)".format(
+                checkpoints[0], epoch, updates, val_loss, write_timer.sum
+            )
+        )
+
+    if not end_of_epoch and args.keep_interval_updates > 0:
+        # remove old checkpoints; checkpoints are sorted in descending order
+        checkpoints = checkpoint_paths(
+            args.save_dir, pattern=r"checkpoint_\d+_(\d+)\.pt"
+        )
+        for old_chk in checkpoints[args.keep_interval_updates :]:
+            if os.path.lexists(old_chk):
+                os.remove(old_chk)
+
+    if args.keep_last_epochs > 0:
+        # remove old epoch checkpoints; checkpoints are sorted in descending order
+        checkpoints = checkpoint_paths(args.save_dir, pattern=r"checkpoint(\d+)\.pt")
+        for old_chk in checkpoints[args.keep_last_epochs :]:
+            if os.path.lexists(old_chk):
+                os.remove(old_chk)
+
+    if args.keep_best_checkpoints > 0:
+        # only keep the best N checkpoints according to validation metric (pattern matches non-negative scores only)
+        checkpoints = checkpoint_paths(
+            args.save_dir, pattern=r"checkpoint\.best_{}_(\d+\.?\d*)\.pt".format(args.best_checkpoint_metric))
+        if not args.maximize_best_checkpoint_metric:
+            checkpoints = checkpoints[::-1]
+        for old_chk in checkpoints[args.keep_best_checkpoints:]:
+            if os.path.lexists(old_chk):
+                os.remove(old_chk)
+
+
+def load_checkpoint(args, trainer, **passthrough_args):
+    """
+    Load a checkpoint and restore the training iterator.
+
+    *passthrough_args* will be passed through to
+    ``trainer.get_train_iterator``.
+    """
+    reset_optimizer = args.reset_optimizer
+    reset_lr_scheduler = args.reset_lr_scheduler
+    optimizer_overrides = eval(args.optimizer_overrides)  # NOTE(review): eval() of a CLI string; consider ast.literal_eval
+    reset_meters = args.reset_meters
+    reset_dataloader = args.reset_dataloader
+
+    if getattr(args, 'finetune_from_model', None) is not None \
+       and (reset_optimizer or reset_lr_scheduler or reset_meters or reset_dataloader):
+        raise ValueError("--finetune-from-model can not be set together with either --reset-optimizer"
+                         " or reset_lr_scheduler or reset_meters or reset_dataloader")
+
+    suffix = getattr(args, "checkpoint_suffix", "")
+    if args.restore_file == "checkpoint_last.pt":  # default value of restore_file is 'checkpoint_last.pt'
+        checkpoint_path = os.path.join(args.save_dir, "checkpoint_last{}.pt".format(suffix))
+        first_launch = not PathManager.exists(checkpoint_path)
+        if getattr(args, 'finetune_from_model', None) is not None and first_launch:
+            # if there is no last checkpoint to restore, start the finetune from pretrained model
+            # else just use usual logic to load checkpoint, e.g. restart from last checkpoint and etc.
+            if PathManager.exists(args.finetune_from_model):
+                checkpoint_path = args.finetune_from_model
+                reset_optimizer = True
+                reset_lr_scheduler = True
+                reset_meters = True
+                reset_dataloader = True
+                logger.info(f'loading pretrained model from {checkpoint_path}: '
+                            'optimizer, lr scheduler, meters, dataloader will be reset')
+            else:
+                raise ValueError(f'--finetune-from-model {args.finetune_from_model} does not exist')
+    elif getattr(args, "model_parallel_size", 1) > 1:
+        checkpoint_path = args.restore_file.replace(".pt", suffix + ".pt")
+    else:
+        checkpoint_path = args.restore_file
+
+    if args.restore_file != "checkpoint_last.pt" and getattr(args, 'finetune_from_model', None):
+        raise ValueError(
+            '--finetune-from-model and --restore-file (non-default value) '
+            'can not be specified together: ' + str(args))
+
+    extra_state = trainer.load_checkpoint(
+        checkpoint_path,
+        reset_optimizer,
+        reset_lr_scheduler,
+        optimizer_overrides,
+        reset_meters=reset_meters,
+    )
+
+    if (
+        extra_state is not None
+        and "best" in extra_state
+        and not reset_optimizer
+        and not reset_meters
+    ):
+        save_checkpoint.best = extra_state["best"]  # resume tracking of the best validation score
+
+    if extra_state is not None and not reset_dataloader:
+        # restore iterator from checkpoint
+        itr_state = extra_state["train_iterator"]
+        epoch_itr = trainer.get_train_iterator(
+            epoch=itr_state["epoch"], load_dataset=True, **passthrough_args
+        )
+        epoch_itr.load_state_dict(itr_state)
+    else:
+        epoch_itr = trainer.get_train_iterator(
+            epoch=1, load_dataset=True, **passthrough_args
+        )
+
+    trainer.lr_step(epoch_itr.epoch)
+
+    return extra_state, epoch_itr
+
+
+def load_checkpoint_to_cpu(path, arg_overrides=None):
+    """Read a checkpoint onto the CPU, apply arg overrides and upgrade old layouts."""
+    def _to_cpu(storage, _location):
+        return default_restore_location(storage, "cpu")
+
+    with PathManager.open(path, "rb") as f:
+        state = torch.load(f, map_location=_to_cpu)
+
+    saved_args = state["args"]
+    if arg_overrides is not None:
+        for name, value in arg_overrides.items():
+            setattr(saved_args, name, value)
+    return _upgrade_state_dict(state)
+
+
+def load_model_ensemble(filenames, arg_overrides=None, task=None, strict=True, suffix=''):
+    """Load the given checkpoints as an ensemble and return (models, args).
+
+    Args:
+        filenames (List[str]): paths of the checkpoint files to load
+        arg_overrides (Dict[str,Any], optional): values replacing the model
+            args that were stored at training time
+        task (fairseq.tasks.FairseqTask, optional): task to use for loading
+        strict, suffix: passed on to ``load_model_ensemble_and_task``
+    """
+    models, model_args, _ = load_model_ensemble_and_task(
+        filenames, arg_overrides, task, strict, suffix)
+    return models, model_args
+
+
+def load_model_ensemble_and_task(filenames, arg_overrides=None, task=None, strict=True, suffix=''):
+    """Load each checkpoint into a model; return (ensemble, args of the last one, task)."""
+    from fairseq import tasks
+
+    ensemble = []
+    args = None  # stays None for empty `filenames` instead of being unbound at the return
+    for filename in filenames:
+        filename = filename.replace(".pt", suffix + ".pt")
+        if not PathManager.exists(filename):
+            raise IOError("Model file not found: {}".format(filename))
+        state = load_checkpoint_to_cpu(filename, arg_overrides)
+        args = state["args"]
+        if task is None:
+            task = tasks.setup_task(args)
+        # build model for ensemble
+        model = task.build_model(args)
+        model.load_state_dict(state["model"], strict=strict, args=args)
+        ensemble.append(model)
+    return ensemble, args, task
+
+
+def checkpoint_paths(path, pattern=r"checkpoint(\d+)\.pt"):
+    """Return the files in directory `path` whose names match `pattern`.
+
+    A file qualifies when its whole name matches the pattern. When the pattern
+    has groups, results are ordered by the first group (as a number), largest
+    first.
+    """
+    matcher = re.compile(pattern)
+
+    def sort_key(position, match):
+        # without groups, fall back to the position in the directory listing
+        return float(match.group(1)) if match.groups() else position
+
+    candidates = ((n, matcher.fullmatch(name)) for n, name in enumerate(os.listdir(path)))
+    ranked = sorted(
+        ((sort_key(n, m), m.group(0)) for n, m in candidates if m is not None), reverse=True)
+    return [os.path.join(path, name) for _, name in ranked]
+
+
+def torch_persistent_save(*args, **kwargs):
+    for attempt in (1, 2, 3):  # torch.save gets up to three tries
+        try:
+            return torch.save(*args, **kwargs)
+        except Exception:
+            if attempt == 3:  # final failure: log the traceback, the error is not re-raised
+                logger.error(traceback.format_exc())
+
+
+def save_state(
+    filename,
+    args,
+    model_state_dict,
+    criterion,
+    optimizer,
+    lr_scheduler,
+    num_updates,
+    optim_history=None,
+    extra_state=None,
+):
+    """Assemble the checkpoint dict, move it to CPU and write it to `filename`.
+
+    `optim_history` and `extra_state` default to an empty list / dict.
+    """
+    from fairseq import utils
+
+    # entry describing the current run, appended after any earlier history
+    latest_entry = {
+        "criterion_name": criterion.__class__.__name__,
+        "optimizer_name": optimizer.__class__.__name__,
+        "lr_scheduler_state": lr_scheduler.state_dict(),
+        "num_updates": num_updates,
+    }
+    state_dict = {
+        "args": args,
+        "model": model_state_dict or {},
+        "optimizer_history": ([] if optim_history is None else optim_history) + [latest_entry],
+        "extra_state": {} if extra_state is None else extra_state,
+    }
+    # criterion / optimizer state are optional parts of the checkpoint
+    if utils.has_parameters(criterion):
+        state_dict["criterion"] = criterion.state_dict()
+    if not args.no_save_optimizer_state:
+        state_dict["last_optimizer_state"] = optimizer.state_dict()
+
+    # convert all state to CPU
+    state_dict = utils.move_to_cpu(state_dict)
+
+    with PathManager.open(filename, "wb") as f:
+        torch_persistent_save(state_dict, f)
+
+
+def _upgrade_state_dict(state):
+    """Upgrade an old checkpoint `state` in place to the current layout and return it."""
+    from fairseq import models, registry, tasks
+
+    # add optimizer_history
+    if "optimizer_history" not in state:
+        state["optimizer_history"] = [
+            {"criterion_name": "CrossEntropyCriterion", "best_loss": state["best_loss"]}
+        ]
+        state["last_optimizer_state"] = state["optimizer"]
+        del state["optimizer"]
+        del state["best_loss"]
+    # move extra_state into sub-dictionary
+    if "epoch" in state and "extra_state" not in state:
+        state["extra_state"] = {
+            "epoch": state["epoch"],
+            "batch_offset": state["batch_offset"],
+            "val_loss": state["val_loss"],
+        }
+        del state["epoch"]
+        del state["batch_offset"]
+        del state["val_loss"]
+    # reduce optimizer history's memory usage (only keep the last state)
+    if "optimizer" in state["optimizer_history"][-1]:
+        state["last_optimizer_state"] = state["optimizer_history"][-1]["optimizer"]
+        for optim_hist in state["optimizer_history"]:
+            del optim_hist["optimizer"]
+    # record the optimizer class name
+    if "optimizer_name" not in state["optimizer_history"][-1]:
+        state["optimizer_history"][-1]["optimizer_name"] = "FairseqNAG"
+    # move best_loss into lr_scheduler_state
+    if "lr_scheduler_state" not in state["optimizer_history"][-1]:
+        state["optimizer_history"][-1]["lr_scheduler_state"] = {
+            "best": state["optimizer_history"][-1]["best_loss"]
+        }
+        del state["optimizer_history"][-1]["best_loss"]
+    # keep track of number of updates
+    if "num_updates" not in state["optimizer_history"][-1]:
+        state["optimizer_history"][-1]["num_updates"] = 0
+    # old model checkpoints may not have separate source/target positions
+    if hasattr(state["args"], "max_positions") and not hasattr(
+        state["args"], "max_source_positions"
+    ):
+        state["args"].max_source_positions = state["args"].max_positions
+        state["args"].max_target_positions = state["args"].max_positions
+    # use stateful training data iterator
+    if "train_iterator" not in state["extra_state"]:
+        state["extra_state"]["train_iterator"] = {
+            "epoch": state["extra_state"]["epoch"],
+            "iterations_in_epoch": state["extra_state"].get("batch_offset", 0),
+        }
+    # default to translation task
+    if not hasattr(state["args"], "task"):
+        state["args"].task = "translation"
+    # --raw-text and --lazy-load are deprecated
+    if getattr(state["args"], "raw_text", False):
+        state["args"].dataset_impl = "raw"
+    elif getattr(state["args"], "lazy_load", False):
+        state["args"].dataset_impl = "lazy"
+    # epochs start at 1
+    if state["extra_state"]["train_iterator"] is not None:
+        state["extra_state"]["train_iterator"]["epoch"] = max(
+            state["extra_state"]["train_iterator"].get("epoch", 1),
+            1,
+        )
+
+    # set any missing default values in the task, model or other registries
+    registry.set_defaults(state["args"], tasks.TASK_REGISTRY[state["args"].task])
+    registry.set_defaults(state["args"], models.ARCH_MODEL_REGISTRY[state["args"].arch])
+    for registry_name, REGISTRY in registry.REGISTRIES.items():
+        choice = getattr(state["args"], registry_name, None)
+        if choice is not None:  # only registries for which the args name a choice
+            cls = REGISTRY["registry"][choice]
+            registry.set_defaults(state["args"], cls)
+
+    return state
+
+
+def prune_state_dict(state_dict, args):
+    """Drop and renumber layers of `state_dict` as requested for LayerDrop
+    (https://arxiv.org/abs/1909.11556).
+
+    A model trained with LayerDrop tolerates pruning at inference time, so a
+    smaller model can be loaded from a larger checkpoint. The layers listed in
+    ``args.encoder_layers_to_keep`` / ``args.decoder_layers_to_keep`` are kept
+    and renumbered from 0. Keys that carry a layer number but are remapped by
+    no pruning pass are left out of the result.
+
+    Functions that load models from checkpoints call this; it does not need
+    to be called directly.
+    """
+    if not args or args.arch == "ptt_transformer":
+        # args should not be none, but don't crash if it is.
+        return state_dict
+
+    arg_values = vars(args)
+    encoder_layers_to_keep = None
+    if "encoder_layers_to_keep" in arg_values:
+        encoder_layers_to_keep = args.encoder_layers_to_keep
+    decoder_layers_to_keep = None
+    if "decoder_layers_to_keep" in arg_values:
+        decoder_layers_to_keep = args.decoder_layers_to_keep
+
+    if not (encoder_layers_to_keep or decoder_layers_to_keep):
+        return state_dict
+
+    # apply pruning
+    logger.info(
+        "Pruning model to specified layer configuration - this works best if the model was trained with LayerDrop"
+    )
+
+    def create_pruning_pass(layers_to_keep, layer_name):
+        kept = sorted(int(layer_string) for layer_string in layers_to_keep.split(","))
+        # old layer number -> new, densely packed layer number
+        mapping_dict = {str(old): str(new) for new, old in enumerate(kept)}
+        regex = re.compile(r"^{layer}.*\.layers\.(\d+)".format(layer=layer_name))
+        return {"substitution_regex": regex, "mapping_dict": mapping_dict}
+
+    # one pass per component that has layers to keep
+    pruning_passes = [
+        create_pruning_pass(layers_to_keep, component)
+        for layers_to_keep, component in (
+            (encoder_layers_to_keep, "encoder"),
+            (decoder_layers_to_keep, "decoder"),
+        )
+        if layers_to_keep
+    ]
+
+    layer_number_re = re.compile(r"\.layers\.(\d+)\.")
+    new_state_dict = {}
+    for layer_name, value in state_dict.items():
+        match = layer_number_re.search(layer_name)
+        if not match:
+            # no layer number: a supporting layer such as an embedding
+            new_state_dict[layer_name] = value
+            continue
+
+        # numbered layer: keep it only under its remapped number
+        original_layer_number = match.group(1)
+        for pruning_pass in pruning_passes:
+            # a pass applies when it lists this number and its regex matches the key
+            if original_layer_number not in pruning_pass["mapping_dict"]:
+                continue
+            substitution_match = pruning_pass["substitution_regex"].search(layer_name)
+            if not substitution_match:
+                continue
+            new_state_key = (
+                layer_name[: substitution_match.start(1)]
+                + pruning_pass["mapping_dict"][original_layer_number]
+                + layer_name[substitution_match.end(1) :]
+            )
+            new_state_dict[new_state_key] = value
+
+    # Since layers are now pruned, *_layers_to_keep are no longer needed.
+    # This is more of "It would make it work fix" rather than a proper fix.
+    for attr in ("encoder_layers_to_keep", "decoder_layers_to_keep"):
+        if attr in vars(args):
+            setattr(args, attr, None)
+
+    return new_state_dict
+
+
+def load_pretrained_component_from_model(
+    component: Union[FairseqEncoder, FairseqDecoder], checkpoint: str
+):
+    """
+    Copy the encoder or decoder weights stored in `checkpoint` into
+    `component` and return it. If the state_dict fails to load, `component`
+    and the matching part of the checkpoint may differ in architecture.
+    """
+    if not PathManager.exists(checkpoint):
+        raise IOError("Model file not found: {}".format(checkpoint))
+    state = load_checkpoint_to_cpu(checkpoint)
+    if isinstance(component, FairseqEncoder):
+        component_type = "encoder"
+    elif isinstance(component, FairseqDecoder):
+        component_type = "decoder"
+    else:
+        raise ValueError(
+            "component to load must be either a FairseqEncoder or "
+            "FairseqDecoder. Loading other component types are not supported."
+        )
+    # encoder.input_layers.0.0.weight --> input_layers.0.0.weight
+    skip = len(component_type) + 1
+    component_state_dict = OrderedDict(
+        (key[skip:], weights)
+        for key, weights in state["model"].items()
+        if key.startswith(component_type)
+    )
+    component.load_state_dict(component_state_dict, strict=True)
+    return component
+
+
+def verify_checkpoint_directory(save_dir: str) -> None:
+    """Create `save_dir` if needed and check it is writable via a temporary probe file."""
+    if not os.path.exists(save_dir):
+        os.makedirs(save_dir, exist_ok=True)
+    probe_path = os.path.join(save_dir, "dummy")
+    try:
+        with open(probe_path, "w"):
+            pass
+    except OSError as e:
+        logger.warning("Unable to access checkpoint save directory: {}".format(save_dir))
+        raise e
+    os.remove(probe_path)  # reached only when the probe file could be created
diff --git a/fairseq/clib/libbleu/libbleu.cpp b/fairseq/clib/libbleu/libbleu.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3cf2d65b6d16e19ea299ebe43c9c25e3481d4524
--- /dev/null
+++ b/fairseq/clib/libbleu/libbleu.cpp
@@ -0,0 +1,141 @@
+/**
+ * Copyright 2017-present, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <map>
+#include <array>
+#include <cstring>
+#include <cstdio>
+
+typedef struct
+{
+    size_t reflen;   // sum of trimmed reference lengths (see bleu_add)
+    size_t predlen;  // sum of trimmed prediction lengths (see bleu_add)
+    size_t match1;   // matchN: matched n-grams of order N
+    size_t count1;   // countN: n-grams of order N found in the predictions
+    size_t match2;
+    size_t count2;
+    size_t match3;
+    size_t count3;
+    size_t match4;
+    size_t count4;
+} bleu_stat;
+
+// left trim (remove pad)
+void bleu_ltrim(size_t* len, int** sent, int pad) {
+  // count the leading pad tokens, then move the start pointer past them
+  size_t skipped = 0;
+  const int* tokens = *sent;
+  for (; skipped < *len && tokens[skipped] == pad; ++skipped) {
+  }
+  *sent += skipped;
+  *len -= skipped;
+}
+
+// right trim (remove trailing eos / pad); the first token is never removed
+void bleu_rtrim(size_t* len, int** sent, int pad, int eos) {
+  // `end` is a length, not an index, so *len == 0 cannot wrap to SIZE_MAX
+  size_t end = *len;
+  while (end > 1 && ((*sent)[end - 1] == eos || (*sent)[end - 1] == pad)) {
+    end--;
+  }
+  *len = end;
+}
+
+// trim both ends in place: leading pad first, then trailing eos / pad
+void bleu_trim(size_t* len, int** sent, int pad, int eos) {
+  bleu_ltrim(len, sent, pad);
+  bleu_rtrim(len, sent, pad, eos);
+}
+
+size_t bleu_hash(int len, int* data) {
+  // FNV-1a-like mix over the raw bytes of the `len` ints: xor, then multiply
+  const size_t prime = 0x100000001b3;
+  const char* bytes  = (const char*) data;
+  const size_t total = sizeof(int) * len;
+  size_t h           = 14695981039346656037ul;
+
+  for (size_t k = 0; k < total; ++k) {
+    h ^= bytes[k];
+    h *= prime;
+  }
+  return h;
+}
+
+void bleu_addngram(
+    size_t *ntotal, size_t *nmatch, size_t n,
+    size_t reflen, int* ref, size_t predlen, int* pred) {
+  // every n-gram of the prediction is added to *ntotal; an n-gram of the
+  // reference adds to *nmatch at most as often as its hash occurs in the
+  // prediction
+
+  if (predlen < n) { return; }
+  const size_t npred = predlen - n + 1;
+  *ntotal += npred;
+
+  if (reflen < n) { return; }
+  const size_t nref = reflen - n + 1;
+
+  // n-gram hash -> occurrences in the prediction that are still unmatched
+  std::map<size_t, size_t> remaining;
+  for (size_t k = 0; k < npred; ++k) {
+    remaining[bleu_hash(n, pred + k)]++;
+  }
+
+  for (size_t k = 0; k < nref; ++k) {
+    std::map<size_t, size_t>::iterator hit =
+        remaining.find(bleu_hash(n, ref + k));
+    if (hit != remaining.end() && hit->second > 0) {
+      hit->second--;
+      (*nmatch)++;
+    }
+  }
+}
+
+extern "C" {
+
+#ifdef _WIN64
+__declspec(dllexport) 
+#endif
+void bleu_zero_init(bleu_stat* stat) {
+  std::memset(stat, 0, sizeof(bleu_stat));  // reset every field of *stat to 0
+}
+
+#ifdef _WIN64
+__declspec(dllexport) 
+#endif
+void bleu_one_init(bleu_stat* stat) {
+  bleu_zero_init(stat);  // lengths and all counters start from 0
+  stat->count1 = 0;      // unigram count and match stay at 0 ...
+  stat->count2 = 1;      // ... while the 2- to 4-gram counts and matches start at 1
+  stat->count3 = 1;
+  stat->count4 = 1;
+  stat->match1 = 0;
+  stat->match2 = 1;
+  stat->match3 = 1;
+  stat->match4 = 1;
+}
+
+#ifdef _WIN64
+__declspec(dllexport) 
+#endif
+void bleu_add(
+    bleu_stat* stat,
+    size_t reflen, int* ref, size_t predlen, int* pred, int pad, int eos) {
+  // trim leading pad and trailing eos / pad from both sentences first
+  bleu_trim(&reflen, &ref, pad, eos);
+  bleu_trim(&predlen, &pred, pad, eos);
+  stat->reflen += reflen;
+  stat->predlen += predlen;
+  // accumulate n-gram totals and matches for n = 1..4
+  bleu_addngram(&stat->count1, &stat->match1, 1, reflen, ref, predlen, pred);
+  bleu_addngram(&stat->count2, &stat->match2, 2, reflen, ref, predlen, pred);
+  bleu_addngram(&stat->count3, &stat->match3, 3, reflen, ref, predlen, pred);
+  bleu_addngram(&stat->count4, &stat->match4, 4, reflen, ref, predlen, pred);
+}
+
+}
diff --git a/fairseq/clib/libbleu/module.cpp b/fairseq/clib/libbleu/module.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8ed9a84b1c028bfe9ed1d45be6857b6e79b3459f
--- /dev/null
+++ b/fairseq/clib/libbleu/module.cpp
@@ -0,0 +1,37 @@
+/**
+ * Copyright 2017-present, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <Python.h>
+
+
+static PyMethodDef method_def[] = {
+  {NULL, NULL, 0, NULL}  // sentinel only: no Python-level methods are registered
+};
+
+static struct PyModuleDef module_def = {  // definition passed to PyModule_Create below
+   PyModuleDef_HEAD_INIT,
+   "libbleu",   /* name of module */
+   NULL,     /* module documentation, may be NULL */
+   -1,       /* size of per-interpreter state of the module,
+                or -1 if the module keeps state in global variables. */
+   method_def  /* method table (contains only the sentinel entry) */
+};
+
+
+#if PY_MAJOR_VERSION == 2
+PyMODINIT_FUNC init_libbleu()
+#else
+PyMODINIT_FUNC PyInit_libbleu()
+#endif
+{
+  PyObject *m = PyModule_Create(&module_def);  // create the module from module_def
+  if (!m) {
+    return NULL;  // creation failed: hand NULL back to the caller
+  }
+  return m;
+}
diff --git a/fairseq/clib/libnat/edit_dist.cpp b/fairseq/clib/libnat/edit_dist.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6bc6a937d6abde0cd49769c4def69ac0560096bc
--- /dev/null
+++ b/fairseq/clib/libnat/edit_dist.cpp
@@ -0,0 +1,231 @@
+/**
+ * Copyright 2017-present, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <torch/torch.h> // @manual=//caffe2:torch_extension
+#include <pybind11/detail/common.h>
+#include <pybind11/pybind11.h>
+#include <vector>
+#include <algorithm>
+#include <cstdint>
+#include <iosfwd>
+#include <memory>
+#include <new>
+#include <string>
+#include <utility>
+
+using namespace ::std;
+
+vector<vector<uint32_t>> edit_distance2_with_dp(
+    vector<uint32_t>& x,
+    vector<uint32_t>& y) {
+  // d[i][j]: cost of turning the first i tokens of x into the first j tokens
+  // of y; an insertion or deletion costs 1, a mismatching pair costs 2
+  const uint32_t rows = x.size() + 1;
+  const uint32_t cols = y.size() + 1;
+  vector<vector<uint32_t>> d(rows, vector<uint32_t>(cols));
+  for (uint32_t j = 0; j < cols; j++) {
+    d[0][j] = j;
+  }
+  for (uint32_t i = 1; i < rows; i++) {
+    d[i][0] = i;
+    for (uint32_t j = 1; j < cols; j++) {
+      const uint32_t edit = min(d[i - 1][j], d[i][j - 1]) + 1;
+      const uint32_t diag = d[i - 1][j - 1] + (x.at(i - 1) == y.at(j - 1) ? 0 : 2);
+      d[i][j] = min(edit, diag);
+    }
+  }
+  return d;
+}
+
+vector<vector<uint32_t>> edit_distance2_backtracking(
+    vector<vector<uint32_t>>& d,
+    vector<uint32_t>& x,
+    vector<uint32_t>& y,
+    uint32_t terminal_symbol) {
+  vector<uint32_t> seq;
+  vector<vector<uint32_t>> edit_seqs(x.size() + 2, vector<uint32_t>());
+  /*
+  edit_seqs (returned):
+  cells 0~x.size() hold the tokens of y to insert, grouped by position in x
+  the last cell holds one flag per token of x: 1 = delete, 0 = keep
+  */
+
+  if (x.size() == 0) {
+    edit_seqs.at(0) = y;
+    return edit_seqs;
+  }
+
+  uint32_t i = d.size() - 1;
+  uint32_t j = d.at(0).size() - 1;
+
+  // i and j are unsigned, so a test like `i >= 0 && j >= 0` is always true;
+  // the walk from the bottom-right cell really ends at the table origin.
+  // seq collects (op, token) pairs in reverse order: 1 = insert, 2 = delete,
+  // 3 = keep.
+  while ((i > 0) || (j > 0)) {
+    if ((j > 0) && (d.at(i).at(j - 1) < d.at(i).at(j))) {
+      seq.push_back(1); // insert
+      seq.push_back(y.at(j - 1));
+      j--;
+    } else if ((i > 0) && (d.at(i - 1).at(j) < d.at(i).at(j))) {
+      seq.push_back(2); // delete
+      seq.push_back(x.at(i - 1));
+      i--;
+    } else {
+      seq.push_back(3); // keep
+      seq.push_back(x.at(i - 1));
+      i--;
+      j--;
+    }
+  }
+
+  uint32_t prev_op, op, s, word;
+  prev_op = 0, s = 0;
+  for (uint32_t k = 0; k < seq.size() / 2; k++) {
+    op = seq.at(seq.size() - 2 * k - 2);
+    word = seq.at(seq.size() - 2 * k - 1);
+    if (prev_op != 1) {
+      s++;
+    }
+    if (op == 1) // insert
+    {
+      edit_seqs.at(s - 1).push_back(word);
+    } else if (op == 2) // delete
+    {
+      edit_seqs.at(x.size() + 1).push_back(1);
+    } else {
+      edit_seqs.at(x.size() + 1).push_back(0);
+    }
+
+    prev_op = op;
+  }
+
+  for (uint32_t k = 0; k < edit_seqs.size(); k++) {
+    if (edit_seqs[k].size() == 0) {
+      edit_seqs[k].push_back(terminal_symbol);
+    }
+  }
+  return edit_seqs;
+}
+
+vector<vector<uint32_t>> edit_distance2_backtracking_with_delete(
+    vector<vector<uint32_t>>& d,
+    vector<uint32_t>& x,
+    vector<uint32_t>& y,
+    uint32_t terminal_symbol,
+    uint32_t deletion_symbol) {
+  vector<uint32_t> seq;
+  vector<vector<uint32_t>> edit_seqs(x.size() + 1, vector<uint32_t>());
+  /*
+  edit_seqs (returned):
+  cells 0~x.size() hold the tokens of y to insert, grouped by position in x
+  a deletion is recorded in those same cells as deletion_symbol
+  */
+
+  if (x.size() == 0) {
+    edit_seqs.at(0) = y;
+    return edit_seqs;
+  }
+
+  uint32_t i = d.size() - 1;
+  uint32_t j = d.at(0).size() - 1;
+
+  // i and j are unsigned, so a test like `i >= 0 && j >= 0` is always true;
+  // the walk from the bottom-right cell really ends at the table origin.
+  // seq collects (op, token) pairs in reverse order: 1 = insert, 2 = delete,
+  // 3 = keep.
+  while ((i > 0) || (j > 0)) {
+    if ((j > 0) && (d.at(i).at(j - 1) < d.at(i).at(j))) {
+      seq.push_back(1); // insert
+      seq.push_back(y.at(j - 1));
+      j--;
+    } else if ((i > 0) && (d.at(i - 1).at(j) < d.at(i).at(j))) {
+      seq.push_back(2); // delete
+      seq.push_back(x.at(i - 1));
+      i--;
+    } else {
+      seq.push_back(3); // keep
+      seq.push_back(x.at(i - 1));
+      i--;
+      j--;
+    }
+  }
+
+  uint32_t prev_op, op, s, word;
+  prev_op = 0, s = 0;
+  for (uint32_t k = 0; k < seq.size() / 2; k++) {
+    op = seq.at(seq.size() - 2 * k - 2);
+    word = seq.at(seq.size() - 2 * k - 1);
+    if (prev_op != 1) {
+      s++;
+    }
+    if (op == 1) // insert
+    {
+      edit_seqs.at(s - 1).push_back(word);
+    } else if (op == 2) // delete
+    {
+      edit_seqs.at(s - 1).push_back(deletion_symbol);
+    }
+
+    prev_op = op;
+  }
+
+  for (uint32_t k = 0; k < edit_seqs.size(); k++) {
+    if (edit_seqs.at(k).size() == 0) {
+      edit_seqs.at(k).push_back(terminal_symbol);
+    }
+  }
+  return edit_seqs;
+}
+
+vector<uint32_t> compute_ed2(
+    vector<vector<uint32_t>>& xs,
+    vector<vector<uint32_t>>& ys) {
+  // one distance per pair (xs[k], ys[k]): the bottom-right cell of its DP table
+  vector<uint32_t> distances;
+  for (uint32_t k = 0; k < xs.size(); k++) {
+    distances.push_back(edit_distance2_with_dp(xs.at(k), ys.at(k)).back().back());
+  }
+  return distances;
+}
+
+vector<vector<vector<uint32_t>>> suggested_ed2_path(
+    vector<vector<uint32_t>>& xs,
+    vector<vector<uint32_t>>& ys,
+    uint32_t terminal_symbol) {
+  // for each pair (xs[k], ys[k]): build the DP table, then backtrack through it
+  vector<vector<vector<uint32_t>>> paths;
+  for (uint32_t k = 0; k < xs.size(); k++) {
+    vector<vector<uint32_t>> table = edit_distance2_with_dp(xs.at(k), ys.at(k));
+    paths.push_back(edit_distance2_backtracking(table, xs.at(k), ys.at(k), terminal_symbol));
+  }
+  return paths;
+}
+
+vector<vector<vector<uint32_t>>> suggested_ed2_path_with_delete(
+    vector<vector<uint32_t>>& xs,
+    vector<vector<uint32_t>>& ys,
+    uint32_t terminal_symbol,
+    uint32_t deletion_symbol) {
+  vector<vector<vector<uint32_t>>> paths;  // one edit path per pair (xs[k], ys[k])
+  for (uint32_t k = 0; k < xs.size(); k++) {
+    vector<vector<uint32_t>> table = edit_distance2_with_dp(xs.at(k), ys.at(k));
+    paths.push_back(edit_distance2_backtracking_with_delete(
+        table, xs.at(k), ys.at(k), terminal_symbol, deletion_symbol));
+  }
+  return paths;
+}
+
+PYBIND11_MODULE(libnat, m) {  // registers the three batch functions under their C++ names
+  m.def("compute_ed2", &compute_ed2, "compute_ed2");
+  m.def("suggested_ed2_path", &suggested_ed2_path, "suggested_ed2_path");
+  m.def(
+      "suggested_ed2_path_with_delete",
+      &suggested_ed2_path_with_delete,
+      "suggested_ed2_path_with_delete");
+}
diff --git a/fairseq/clib/libnat_cuda/binding.cpp b/fairseq/clib/libnat_cuda/binding.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..aaa6244d5c6819acfae5f408280205661a3389ae
--- /dev/null
+++ b/fairseq/clib/libnat_cuda/binding.cpp
@@ -0,0 +1,60 @@
+/**
+ * Copyright 2017-present, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/*
+ This code is partially adopted from https://github.com/1ytic/pytorch-edit-distance
+ */
+
+#include "edit_dist.h"
+#include <torch/types.h>
+
+#ifndef TORCH_CHECK
+#define TORCH_CHECK AT_CHECK
+#endif
+
+#define CHECK_CUDA(x) TORCH_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor")
+#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
+#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
+
+
+torch::Tensor LevenshteinDistance(
+        torch::Tensor source,
+        torch::Tensor target,
+        torch::Tensor source_length,
+        torch::Tensor target_length) {
+    // Require every argument to be a contiguous CUDA tensor, then forward to the CUDA implementation.
+    CHECK_INPUT(source);
+    CHECK_INPUT(target);
+    CHECK_INPUT(source_length);
+    CHECK_INPUT(target_length);
+    return LevenshteinDistanceCuda(source, target, source_length, target_length);
+}
+
+torch::Tensor GenerateDeletionLabel(
+        torch::Tensor source,
+        torch::Tensor operations) {
+    // Require both arguments to be contiguous CUDA tensors, then forward to the CUDA implementation.
+    CHECK_INPUT(source);
+    CHECK_INPUT(operations);
+    return GenerateDeletionLabelCuda(source, operations);
+}
+
+std::pair<torch::Tensor, torch::Tensor> GenerateInsertionLabel(
+        torch::Tensor target,
+        torch::Tensor operations) {
+    // Require both arguments to be contiguous CUDA tensors, then forward to the CUDA implementation.
+    CHECK_INPUT(target);
+    CHECK_INPUT(operations);
+    return GenerateInsertionLabelCuda(target, operations);
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {  // Python-facing names for the three checked wrappers in this file
+    m.def("levenshtein_distance", &LevenshteinDistance, "Levenshtein distance");
+    m.def("generate_deletion_labels", &GenerateDeletionLabel, "Generate Deletion Label");
+    m.def("generate_insertion_labels", &GenerateInsertionLabel, "Generate Insertion Label");
+}
diff --git a/fairseq/clib/libnat_cuda/edit_dist.cu b/fairseq/clib/libnat_cuda/edit_dist.cu
new file mode 100644
index 0000000000000000000000000000000000000000..b6486a8c22f0dc138ce80c8936082824a80a17e7
--- /dev/null
+++ b/fairseq/clib/libnat_cuda/edit_dist.cu
@@ -0,0 +1,332 @@
+/**
+* Copyright 2017-present, Facebook, Inc.
+* All rights reserved.
+*
+* This source code is licensed under the license found in the
+* LICENSE file in the root directory of this source tree.
+*/
+
+#include "edit_dist.h"
+#include <THC/THC.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <device_launch_parameters.h>
+#include <utility>      // std::pair
+
+template <typename scalar_t>
+__global__ void generate_deletion_label_kernel(
+        const scalar_t* __restrict__ source,
+        const size_t source_size,
+        const size_t operation_size,
+        int* __restrict__ operations,
+        int* __restrict__ labels) {
+    // Each block fills row blockIdx.x of `labels` from the same row of `operations`; `source` is never read.
+    const int index = blockIdx.x;
+    const int offset = index * operation_size;
+    const int offset_label = index * source_size;
+    // Start from an all-zero label row.
+    for (int i = 0; i < source_size; i++) {
+        labels[offset_label + i] = 0;
+    }
+    // k is the next label slot; it advances only for operation codes other than 0 and 1.
+    int k = 0;
+    for (int i = 0; i < operation_size; i++){
+        if (operations[offset + i] == 0){
+            break;  // code 0 ends the scan of this row
+        } else if (operations[offset + i] == 1){
+            continue;  // code 1 produces no label
+        } else {
+            labels[offset_label + k] = 3 - operations[offset + i];  // code 2 -> 1, code 3 -> 0
+            k++;
+        }
+    }
+}
+
+template <typename scalar_t>
+__global__ void generate_insertion_label_kernel(
+        const scalar_t* __restrict__ target,
+        const size_t target_size,
+        const size_t operation_size,
+        int* __restrict__ operations,
+        int* __restrict__ labels,
+        int* __restrict__ masks) {
+    // Each block fills row blockIdx.x of `labels` and `masks` from its `operations` row; `target` is never read.
+    const int index = blockIdx.x;
+    const int offset = index * operation_size;
+    const int offset_label = index * target_size;
+    // k: next `labels` slot, m: next `masks` slot, u: code-1 entries seen since the last label was written.
+    int k = 0;
+    int u = 0;
+    int m = 0;
+    // Start from all-zero output rows.
+    for (int i = 0; i < target_size; i++) {
+        labels[offset_label + i] = 0;
+        masks[offset_label + i] = 0;
+    }
+    // NOTE(review): the bound is operation_size-1, so the last operation slot is never visited -- confirm intent.
+    for (int i = 0; i < operation_size-1; i++){
+        if (operations[offset + i] == 0){
+            break;  // code 0 ends the scan of this row
+        } else if (operations[offset + i] == 2){
+            continue;  // code 2 touches neither output
+        } else if (operations[offset + i] == 1){
+            masks[offset_label + m] = 1;
+            u++; m++;
+        } else {
+            labels[offset_label + k] = u;  // number of code-1 entries since the previous label
+            masks[offset_label + m] = 0;
+            k++; m++;
+            u = 0;
+        }
+    }
+}
+
+template <typename scalar_t>
+__global__ void levenshtein_distance_kernel(
+        const scalar_t* __restrict__ source,
+        const scalar_t* __restrict__ target,
+        const int* __restrict__ source_length,
+        const int* __restrict__ target_length,
+        const size_t source_size,
+        const size_t target_size,
+        int* __restrict__ operations,
+        int* __restrict__ errors_curr) {
+    // Each block handles row blockIdx.x: it fills that row's cost table in `errors_curr`, then writes operation codes.
+    const int index = blockIdx.x;
+    const int offset = index * (source_size + target_size);
+    const int d = index * (source_size + 1) * (target_size + 1);  // start of this row's table in errors_curr
+    const int t = target_size + 1;  // table row stride
+    // err_idx(i, j) addresses table cell (i, j) of this row; opt_idx(k) addresses this row's k-th operation slot.
+    auto err_idx = [d, t](int i, int j) { return d + i * t + j; };
+    auto opt_idx = [offset](int k) { return offset + k; };
+
+    const int hyp_len = source_length[index];
+    const int ref_len = target_length[index];
+    const scalar_t* hyp_begin = source + index * source_size;
+    const scalar_t* ref_begin = target + index * target_size;
+
+    // dynamic programming: a step along either axis costs 1; the diagonal costs 0 on equal symbols, else 2
+    for (int i = 0; i <= hyp_len; i++){
+        errors_curr[err_idx(i, 0)] = i;
+    }
+    for (int j = 0; j <= ref_len; j++){
+        errors_curr[err_idx(0, j)] = j;
+    }
+    for (int i = 1; i <= hyp_len; i++){
+        for (int j = 1; j <= ref_len; j++){
+            errors_curr[err_idx(i, j)] = min(
+                min(
+                    errors_curr[err_idx(i-1, j)],
+                    errors_curr[err_idx(i, j-1)]
+                ) + 1,
+                errors_curr[err_idx(i-1, j-1)] + 2 * (
+                    *(hyp_begin+i-1) == *(ref_begin+j-1) ? 0 : 1
+                )
+            );
+        }
+    }
+
+    // back-tracing from cell (hyp_len, ref_len); codes are written right-to-left, starting below slot hyp_len + ref_len
+    int i = hyp_len;
+    int j = ref_len;
+    int o = hyp_len + ref_len;
+    // Clear every operation slot of this row before any code is written.
+    for (int k = 0; k < source_size + target_size; k++) {
+        operations[opt_idx(k)] = 0;
+    }
+
+    while ((i >= 0) && (j >= 0)) {
+        if ((i == 0) && (j == 0)) {
+        break;
+        }
+
+        if ((j > 0) && (errors_curr[err_idx(i, j-1)] < errors_curr[err_idx(i, j)])) {
+            o--; operations[opt_idx(o)] = 1; j--;  // insertion
+        } else if ((i > 0) && (errors_curr[err_idx(i-1, j)] < errors_curr[err_idx(i, j)])) {
+            o--; operations[opt_idx(o)] = 2; i--;  // deletion
+        } else {
+            o--; operations[opt_idx(o)] = 3; i--; j--;  // do nothing
+        }
+    }
+
+    // moving to the left: shift the written codes down by o so they start at slot 0, zero-filling the tail
+    for (int k = 0; k < hyp_len + ref_len; k++) {
+        if (k + o < hyp_len + ref_len){
+            operations[opt_idx(k)] = operations[opt_idx(k+o)];
+        } else{
+            operations[opt_idx(k)] = 0;  // padding
+        }
+    }
+
+}
+
+template <typename scalar_t>
+__global__ void faster_levenshtein_distance_kernel(
+        const scalar_t* __restrict__ source,
+        const scalar_t* __restrict__ target,
+        const int* __restrict__ source_length,
+        const int* __restrict__ target_length,
+        const size_t source_size,
+        const size_t target_size,
+        int* __restrict__ operations) {
+    // Same steps as levenshtein_distance_kernel, but the cost table is a `short` array in shared memory.
+    extern __shared__ short errors[];  // sized at launch (third launch parameter in LevenshteinDistanceCuda)
+    auto errors_curr = errors;
+
+    const int index = blockIdx.x;
+    const int offset = index * (source_size + target_size);
+    const int t = target_size + 1;  // table row stride
+    // err_idx has no per-row base offset here: each block indexes its own shared array from 0.
+    auto err_idx = [t](int i, int j) { return i * t + j; };
+    auto opt_idx = [offset](int k) { return offset + k; };
+
+    const int hyp_len = source_length[index];
+    const int ref_len = target_length[index];
+    const scalar_t* hyp_begin = source + index * source_size;
+    const scalar_t* ref_begin = target + index * target_size;
+
+    // dynamic programming: a step along either axis costs 1; the diagonal costs 0 on equal symbols, else 2
+    for (int i = 0; i <= hyp_len; i++){
+        errors_curr[err_idx(i, 0)] = i;
+    }
+    for (int j = 0; j <= ref_len; j++){
+        errors_curr[err_idx(0, j)] = j;
+    }
+    for (int i = 1; i <= hyp_len; i++){
+        for (int j = 1; j <= ref_len; j++){
+            errors_curr[err_idx(i, j)] = min(
+                min(
+                    errors_curr[err_idx(i-1, j)],
+                    errors_curr[err_idx(i, j-1)]
+                ) + 1,
+                errors_curr[err_idx(i-1, j-1)] + 2 * (
+                    *(hyp_begin+i-1) == *(ref_begin+j-1) ? 0 : 1
+                )
+            );
+        }
+    }
+
+    // back-tracing from cell (hyp_len, ref_len); codes are written right-to-left, starting below slot hyp_len + ref_len
+    int i = hyp_len;
+    int j = ref_len;
+    int o = hyp_len + ref_len;
+    // Clear every operation slot of this row before any code is written.
+    for (int k = 0; k < source_size + target_size; k++) {
+        operations[opt_idx(k)] = 0;
+    }
+
+    while ((i >= 0) && (j >= 0)) {
+        if ((i == 0) && (j == 0)) {
+        break;
+        }
+
+        if ((j > 0) && (errors_curr[err_idx(i, j-1)] < errors_curr[err_idx(i, j)])) {
+            o--; operations[opt_idx(o)] = 1; j--;  // insertion
+        } else if ((i > 0) && (errors_curr[err_idx(i-1, j)] < errors_curr[err_idx(i, j)])) {
+            o--; operations[opt_idx(o)] = 2; i--;  // deletion
+        } else {
+            o--; operations[opt_idx(o)] = 3; i--; j--;  // do nothing
+        }
+    }
+
+    // moving to the left: shift the written codes down by o so they start at slot 0, zero-filling the tail
+    for (int k = 0; k < hyp_len + ref_len; k++) {
+        if (k + o < hyp_len + ref_len){
+            operations[opt_idx(k)] = operations[opt_idx(k+o)];
+        } else{
+            operations[opt_idx(k)] = 0;  // padding
+        }
+    }
+
+}
+
+
+torch::Tensor GenerateDeletionLabelCuda(
+        torch::Tensor source,
+        torch::Tensor operations) {
+    // Launches one single-thread block per batch row. The returned labels are an
+    // Int tensor of shape (source.size(0), source.size(1)) on source's device.
+    const auto num_rows = source.size(0);
+    const auto label_opts = at::TensorOptions(source.device()).dtype(at::ScalarType::Int);
+    auto labels = torch::empty({num_rows, source.size(1)}, label_opts);
+    auto stream = at::cuda::getCurrentCUDAStream(source.device().index());
+
+    AT_DISPATCH_ALL_TYPES(source.scalar_type(), "generate_deletion_labels", ([&] {
+        const scalar_t* src_ptr = source.data<scalar_t>();
+        int* op_ptr = operations.data<int>();
+        int* label_ptr = labels.data<int>();
+        generate_deletion_label_kernel<scalar_t><<<num_rows, 1, 0, stream>>>(
+            src_ptr, source.size(1), operations.size(1),
+            op_ptr, label_ptr);
+    }));
+
+    return labels;
+}
+
+std::pair<torch::Tensor, torch::Tensor> GenerateInsertionLabelCuda(
+    torch::Tensor target,
+    torch::Tensor operations) {
+    // Launches one single-thread block per batch row. `labels` and `masks` are Int
+    // tensors of shape (target.size(0), target.size(1)) on target's device.
+    const auto num_rows = target.size(0);
+    const auto row_width = target.size(1);
+    const auto out_opts = at::TensorOptions(target.device()).dtype(at::ScalarType::Int);
+    auto labels = torch::empty({num_rows, row_width}, out_opts);
+    auto masks = torch::empty({num_rows, row_width}, out_opts);
+    auto stream = at::cuda::getCurrentCUDAStream(target.device().index());
+
+    AT_DISPATCH_ALL_TYPES(target.scalar_type(), "generate_insertion_labels", ([&] {
+        const scalar_t* tgt_ptr = target.data<scalar_t>();
+        int* op_ptr = operations.data<int>();
+        int* label_ptr = labels.data<int>();
+        int* mask_ptr = masks.data<int>();
+        generate_insertion_label_kernel<scalar_t><<<num_rows, 1, 0, stream>>>(
+            tgt_ptr, row_width, operations.size(1), op_ptr, label_ptr, mask_ptr);
+    }));
+
+    return std::make_pair(labels, masks);
+}
+
+
+torch::Tensor LevenshteinDistanceCuda(
+        torch::Tensor source,
+        torch::Tensor target,
+        torch::Tensor source_length,
+        torch::Tensor target_length) {
+    // Chooses a kernel by cost-table size. A table of `short`s needing at most 40000
+    // bytes is kept in dynamic shared memory; a larger one uses an Int scratch tensor.
+    // Either way one single-thread block is launched per batch row.
+    const auto num_rows = source.size(0);
+    const auto src_width = source.size(1);
+    const auto tgt_width = target.size(1);
+    const auto table_cells = (src_width + 1) * (tgt_width + 1);
+    const auto shared_bytes = table_cells * sizeof(short);
+
+    const auto int_opts = at::TensorOptions(source.device()).dtype(at::ScalarType::Int);
+    auto operations = torch::empty({num_rows, src_width + tgt_width}, int_opts);
+    auto stream = at::cuda::getCurrentCUDAStream(source.device().index());
+
+    if (shared_bytes <= 40000) {
+        AT_DISPATCH_ALL_TYPES(source.scalar_type(), "faster_levenshtein_distance", ([&] {
+            faster_levenshtein_distance_kernel<scalar_t><<<num_rows, 1, shared_bytes, stream>>>(
+                source.data<scalar_t>(), target.data<scalar_t>(),
+                source_length.data<int>(), target_length.data<int>(),
+                src_width, tgt_width,
+                operations.data<int>());
+        }));
+    } else {
+        // Scratch holds one (src_width + 1) * (tgt_width + 1) table per batch row.
+        // NOTE(review): 40000 is presumably a per-block shared-memory budget -- confirm.
+        auto scratch = torch::empty({num_rows, table_cells}, int_opts);
+        AT_DISPATCH_ALL_TYPES(source.scalar_type(), "levenshtein_distance", ([&] {
+            levenshtein_distance_kernel<scalar_t><<<num_rows, 1, 0, stream>>>(
+                source.data<scalar_t>(), target.data<scalar_t>(),
+                source_length.data<int>(), target_length.data<int>(),
+                src_width, tgt_width,
+                operations.data<int>(), scratch.data<int>());
+        }));
+    }
+
+    // Nothing here waits on the launch or inspects it for errors.
+    return operations;
+}
diff --git a/fairseq/clib/libnat_cuda/edit_dist.h b/fairseq/clib/libnat_cuda/edit_dist.h
new file mode 100644
index 0000000000000000000000000000000000000000..e3506cd34ddaa35bb724fe64a459bad8046b9a34
--- /dev/null
+++ b/fairseq/clib/libnat_cuda/edit_dist.h
@@ -0,0 +1,25 @@
+/**
+ * Copyright 2017-present, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <torch/extension.h>
+
+torch::Tensor LevenshteinDistanceCuda(  // defined in edit_dist.cu; returns the operations tensor
+        torch::Tensor source,
+        torch::Tensor target,
+        torch::Tensor source_length,
+        torch::Tensor target_length);
+
+torch::Tensor GenerateDeletionLabelCuda(  // defined in edit_dist.cu; returns the labels tensor
+        torch::Tensor source,
+        torch::Tensor operations);
+
+std::pair<torch::Tensor, torch::Tensor> GenerateInsertionLabelCuda(  // returns (labels, masks)
+        torch::Tensor target,  // named to match the definition in edit_dist.cu
+        torch::Tensor operations);
diff --git a/fairseq/criterions/__init__.py b/fairseq/criterions/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1c28780111e6015e8c274024325d4c73b3c6c84d
--- /dev/null
+++ b/fairseq/criterions/__init__.py
@@ -0,0 +1,24 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import importlib
+import os
+
+from fairseq import registry
+from fairseq.criterions.fairseq_criterion import FairseqCriterion, LegacyFairseqCriterion
+
+
+build_criterion, register_criterion, CRITERION_REGISTRY = registry.setup_registry(
+    '--criterion',
+    base_class=FairseqCriterion,
+    default='cross_entropy',
+)  # three objects come back; criterion modules apply register_criterion as a class decorator
+
+# Automatically import every Python file in the criterions/ directory whose name does
+# not start with an underscore, as the submodule fairseq.criterions.<name>.
+for file in os.listdir(os.path.dirname(__file__)):
+    if file.endswith('.py') and not file.startswith('_'):
+        module = file[:file.find('.py')]  # keep the text before the first '.py'
+        importlib.import_module('fairseq.criterions.' + module)
diff --git a/fairseq/criterions/__pycache__/__init__.cpython-310.pyc b/fairseq/criterions/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..62ede6bb72bf14120440823b21b2551c44d87d3f
Binary files /dev/null and b/fairseq/criterions/__pycache__/__init__.cpython-310.pyc differ
diff --git a/fairseq/criterions/__pycache__/adaptive_loss.cpython-310.pyc b/fairseq/criterions/__pycache__/adaptive_loss.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..20b19904d968b2a4bd571b3580546398efba81d4
Binary files /dev/null and b/fairseq/criterions/__pycache__/adaptive_loss.cpython-310.pyc differ
diff --git a/fairseq/criterions/__pycache__/composite_loss.cpython-310.pyc b/fairseq/criterions/__pycache__/composite_loss.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f40665cbdb4b02c4991ae0569caaf3543f391258
Binary files /dev/null and b/fairseq/criterions/__pycache__/composite_loss.cpython-310.pyc differ
diff --git a/fairseq/criterions/__pycache__/cross_entropy.cpython-310.pyc b/fairseq/criterions/__pycache__/cross_entropy.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fb1ca95b5e9090a4d7925e74d69fbcce07318d6c
Binary files /dev/null and b/fairseq/criterions/__pycache__/cross_entropy.cpython-310.pyc differ
diff --git a/fairseq/criterions/__pycache__/ctc.cpython-310.pyc b/fairseq/criterions/__pycache__/ctc.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b4fb18ca67d42b0f2ff3203c1539c252b0ad2edc
Binary files /dev/null and b/fairseq/criterions/__pycache__/ctc.cpython-310.pyc differ
diff --git a/fairseq/criterions/__pycache__/fairseq_criterion.cpython-310.pyc b/fairseq/criterions/__pycache__/fairseq_criterion.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5729dbcf9d0306170b752dbb47361f7765d86004
Binary files /dev/null and b/fairseq/criterions/__pycache__/fairseq_criterion.cpython-310.pyc differ
diff --git a/fairseq/criterions/__pycache__/label_smoothed_cross_entropy.cpython-310.pyc b/fairseq/criterions/__pycache__/label_smoothed_cross_entropy.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5dc0d09562c28e2282b353d1e654eb35f9b8bdc8
Binary files /dev/null and b/fairseq/criterions/__pycache__/label_smoothed_cross_entropy.cpython-310.pyc differ
diff --git a/fairseq/criterions/__pycache__/label_smoothed_cross_entropy_with_alignment.cpython-310.pyc b/fairseq/criterions/__pycache__/label_smoothed_cross_entropy_with_alignment.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0d594d3e8c6b25993806c059404ed0dff07f8a0d
Binary files /dev/null and b/fairseq/criterions/__pycache__/label_smoothed_cross_entropy_with_alignment.cpython-310.pyc differ
diff --git a/fairseq/criterions/__pycache__/legacy_masked_lm.cpython-310.pyc b/fairseq/criterions/__pycache__/legacy_masked_lm.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c348ab7b5fc64177f53b478a53631fd0b41cbf09
Binary files /dev/null and b/fairseq/criterions/__pycache__/legacy_masked_lm.cpython-310.pyc differ
diff --git a/fairseq/criterions/__pycache__/masked_lm.cpython-310.pyc b/fairseq/criterions/__pycache__/masked_lm.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..60686443f3a70397392570b43c279a612119e80e
Binary files /dev/null and b/fairseq/criterions/__pycache__/masked_lm.cpython-310.pyc differ
diff --git a/fairseq/criterions/__pycache__/nat_loss.cpython-310.pyc b/fairseq/criterions/__pycache__/nat_loss.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6490ac0dd643073634a95712bc5af09181133ac4
Binary files /dev/null and b/fairseq/criterions/__pycache__/nat_loss.cpython-310.pyc differ
diff --git a/fairseq/criterions/__pycache__/sentence_prediction.cpython-310.pyc b/fairseq/criterions/__pycache__/sentence_prediction.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d615526f8f540d39dd347d3dd77cffce00100578
Binary files /dev/null and b/fairseq/criterions/__pycache__/sentence_prediction.cpython-310.pyc differ
diff --git a/fairseq/criterions/__pycache__/sentence_ranking.cpython-310.pyc b/fairseq/criterions/__pycache__/sentence_ranking.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e0fd7b406e575fd7668e8874982e9e33da92e13f
Binary files /dev/null and b/fairseq/criterions/__pycache__/sentence_ranking.cpython-310.pyc differ
diff --git a/fairseq/criterions/__pycache__/wav2vec_criterion.cpython-310.pyc b/fairseq/criterions/__pycache__/wav2vec_criterion.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8aa015f1147a952453db5c632137bdaaa5920b36
Binary files /dev/null and b/fairseq/criterions/__pycache__/wav2vec_criterion.cpython-310.pyc differ
diff --git a/fairseq/criterions/adaptive_loss.py b/fairseq/criterions/adaptive_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..1916131bb133e0b00d21730c4a386ba93483978d
--- /dev/null
+++ b/fairseq/criterions/adaptive_loss.py
@@ -0,0 +1,101 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+
+import torch.nn.functional as F
+
+from fairseq import metrics, utils
+from fairseq.criterions import FairseqCriterion, register_criterion
+
+
+@register_criterion('adaptive_loss')
+class AdaptiveLoss(FairseqCriterion):
+    """This is an implementation of the loss function accompanying the adaptive softmax approximation for
+    graphical processing units (GPU), described in the paper "Efficient softmax approximation for GPUs"
+    (http://arxiv.org/abs/1609.04309)."""
+
+    def __init__(self, task, sentence_avg):
+        super().__init__(task)
+        self.sentence_avg = sentence_avg
+
+    @classmethod
+    def build_criterion(cls, args, task):
+        if getattr(args, 'ddp_backend', None) == 'c10d':
+            raise Exception(
+                'AdaptiveLoss is not compatible with the c10d '
+                'version of DistributedDataParallel. Please use '
+                '`--ddp-backend=no_c10d` instead.'
+            )
+        return cls(task, args.sentence_avg)
+
+    def forward(self, model, sample, reduce=True):
+        """Compute the loss for the given sample.
+
+        Returns a tuple with three elements:
+        1) the loss
+        2) the sample size, which is used as the denominator for the gradient
+        3) logging outputs to display while training
+        """
+
+        assert hasattr(model.decoder, 'adaptive_softmax') and model.decoder.adaptive_softmax is not None
+        adaptive_softmax = model.decoder.adaptive_softmax
+
+        net_output = model(**sample['net_input'])
+        orig_target = model.get_targets(sample, net_output)
+
+        nsentences = orig_target.size(0)
+        orig_target = orig_target.view(-1)
+
+        bsz = orig_target.size(0)
+
+        logits, target = adaptive_softmax(net_output[0], orig_target)
+        assert len(target) == len(logits)
+
+        loss = net_output[0].new(1 if reduce else bsz).zero_()
+
+        for i in range(len(target)):
+            if target[i] is not None:
+                assert (target[i].min() >= 0 and target[i].max() <= logits[i].size(1))
+                loss += F.cross_entropy(
+                    logits[i],
+                    target[i],
+                    ignore_index=self.padding_idx,
+                    reduction='sum' if reduce else 'none',
+                )
+
+        orig = utils.strip_pad(orig_target, self.padding_idx)
+        ntokens = orig.numel()
+        sample_size = sample['target'].size(0) if self.sentence_avg else ntokens
+        logging_output = {
+            'loss': loss.data,
+            'ntokens': ntokens,
+            'nsentences': nsentences,
+            'sample_size': sample_size,
+        }
+        return loss, sample_size, logging_output
+
+    @staticmethod
+    def reduce_metrics(logging_outputs) -> None:
+        """Aggregate logging outputs from data parallel training."""
+        loss_sum = utils.item(sum(log.get('loss', 0) for log in logging_outputs))
+        ntokens = utils.item(sum(log.get('ntokens', 0) for log in logging_outputs))
+        sample_size = utils.item(sum(log.get('sample_size', 0) for log in logging_outputs))
+
+        metrics.log_scalar('loss', loss_sum / sample_size / math.log(2), sample_size, round=3)
+        if sample_size != ntokens:
+            metrics.log_scalar('nll_loss', loss_sum / ntokens / math.log(2), ntokens, round=3)
+            metrics.log_derived('ppl', lambda meters: utils.get_perplexity(meters['nll_loss'].avg))
+        else:
+            metrics.log_derived('ppl', lambda meters: utils.get_perplexity(meters['loss'].avg))
+
+    @staticmethod
+    def logging_outputs_can_be_summed() -> bool:
+        """
+        Whether the logging outputs returned by `forward` can be summed
+        across workers prior to calling `reduce_metrics`. Setting this
+        to True will improves distributed training speed.
+        """
+        return True
diff --git a/fairseq/criterions/composite_loss.py b/fairseq/criterions/composite_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..6671c696e97b9f9240b96d9f9da4157bcdbd74de
--- /dev/null
+++ b/fairseq/criterions/composite_loss.py
@@ -0,0 +1,99 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from torch import nn
+
+from fairseq import utils
+from fairseq.criterions import FairseqCriterion, register_criterion
+
+
+@register_criterion('composite_loss')
+class CompositeLoss(FairseqCriterion):
+    """This is a composite loss that, given a list of model outputs and a list of targets,
+    computes an average of losses for each output-target pair"""
+
+    def __init__(self, task, underlying_criterion):
+        super().__init__(task)
+        self.underlying_criterion = underlying_criterion
+
+    @staticmethod
+    def add_args(parser):
+        """Add criterion-specific arguments to the parser."""
+        # fmt: off
+        parser.add_argument('--underlying-criterion', type=str, metavar='VAL', required=True,
+                            help='underlying criterion to use for the composite loss')
+        # fmt: on
+
+    @staticmethod
+    def build_underlying_criterion(args, task):
+        saved_criterion = args.criterion
+        args.criterion = args.underlying_criterion
+        assert saved_criterion != args.underlying_criterion
+        underlying_criterion = task.build_criterion(args)
+        args.criterion = saved_criterion
+        return underlying_criterion
+
+    @classmethod
+    def build_criterion(cls, args, task):
+        underlying_criterion = CompositeLoss.build_underlying_criterion(args, task)
+
+        class FakeModel(nn.Module):
+
+            def __init__(self, model, net_out, target):
+                super().__init__()
+                self.model = model
+                self.net_out = net_out
+                self.target = target
+
+            def forward(self, **unused):
+                return self.net_out
+
+            def get_normalized_probs(self, net_output, log_probs, sample=None):
+                return self.model.get_normalized_probs(net_output, log_probs, sample=sample)
+
+            def get_targets(self, *unused):
+                return self.target
+
+            @property
+            def decoder(self):
+                return self.model.decoder
+
+        class _CompositeLoss(FairseqCriterion):
+
+            def __init__(self, task, underlying_criterion):
+                super().__init__(task)
+                self.underlying_criterion = underlying_criterion
+
+            def forward(self, model, sample, reduce=True):
+                net_outputs = model(**sample['net_input'])
+                targets = sample['target']
+
+                bsz = targets[0].size(0)
+                loss = net_outputs[0][0].new(1 if reduce else bsz).float().zero_()
+
+                sample_size = 0
+                logging_output = {}
+                for o, t in zip(net_outputs[0], targets):
+                    m = FakeModel(model, (o, net_outputs[1]), t)
+                    sample['target'] = t
+                    l, ss, logging_output = self.underlying_criterion(m, sample, reduce)
+                    loss += l
+                    sample_size += ss
+
+                loss.div_(len(targets))
+                sample_size /= len(targets)
+
+                logging_output['loss'] = utils.item(loss.data) if reduce else loss.data
+                return loss, sample_size, logging_output
+
+            @staticmethod
+            def aggregate_logging_outputs(logging_outputs):
+                return underlying_criterion.__class__.aggregate_logging_outputs(logging_outputs)
+
+            @staticmethod
+            def reduce_metrics(logging_outputs) -> None:
+                underlying_criterion.__class__.reduce_metrics(logging_outputs)
+
+        return _CompositeLoss(task, underlying_criterion)
diff --git a/fairseq/criterions/cross_entropy.py b/fairseq/criterions/cross_entropy.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e750f62e3f252e1ad4e780bd5168bf1c9924de5
--- /dev/null
+++ b/fairseq/criterions/cross_entropy.py
@@ -0,0 +1,73 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+
+import torch.nn.functional as F
+
+from fairseq import metrics, utils
+from fairseq.criterions import FairseqCriterion, register_criterion
+
+
+@register_criterion('cross_entropy')
+class CrossEntropyCriterion(FairseqCriterion):
+
+    def __init__(self, task, sentence_avg):
+        super().__init__(task)
+        self.sentence_avg = sentence_avg
+
+    def forward(self, model, sample, reduce=True):
+        """Compute the loss for the given sample.
+
+        Returns a tuple with three elements:
+        1) the loss
+        2) the sample size, which is used as the denominator for the gradient
+        3) logging outputs to display while training
+        """
+        net_output = model(**sample['net_input'])
+        loss, _ = self.compute_loss(model, net_output, sample, reduce=reduce)
+        sample_size = sample['target'].size(0) if self.sentence_avg else sample['ntokens']
+        logging_output = {
+            'loss': loss.data,
+            'ntokens': sample['ntokens'],
+            'nsentences': sample['target'].size(0),
+            'sample_size': sample_size,
+        }
+        return loss, sample_size, logging_output
+
+    def compute_loss(self, model, net_output, sample, reduce=True):
+        lprobs = model.get_normalized_probs(net_output, log_probs=True)
+        lprobs = lprobs.view(-1, lprobs.size(-1))
+        target = model.get_targets(sample, net_output).view(-1)
+        loss = F.nll_loss(
+            lprobs,
+            target,
+            ignore_index=self.padding_idx,
+            reduction='sum' if reduce else 'none',
+        )
+        return loss, loss
+
+    @staticmethod
+    def reduce_metrics(logging_outputs) -> None:
+        """Aggregate logging outputs from data parallel training."""
+        loss_sum = sum(log.get('loss', 0) for log in logging_outputs)
+        ntokens = sum(log.get('ntokens', 0) for log in logging_outputs)
+        sample_size = sum(log.get('sample_size', 0) for log in logging_outputs)
+
+        metrics.log_scalar('loss', loss_sum / sample_size / math.log(2), sample_size, round=3)
+        if sample_size != ntokens:
+            metrics.log_scalar('nll_loss', loss_sum / ntokens / math.log(2), ntokens, round=3)
+            metrics.log_derived('ppl', lambda meters: utils.get_perplexity(meters['nll_loss'].avg))
+        else:
+            metrics.log_derived('ppl', lambda meters: utils.get_perplexity(meters['loss'].avg))
+
+    @staticmethod
+    def logging_outputs_can_be_summed() -> bool:
+        """
+        Whether the logging outputs returned by `forward` can be summed
+        across workers prior to calling `reduce_metrics`. Setting this
+        to True will improves distributed training speed.
+        """
+        return True
diff --git a/fairseq/criterions/ctc.py b/fairseq/criterions/ctc.py
new file mode 100644
index 0000000000000000000000000000000000000000..cbf712c69d3b990ce31a6031a0b248b2423e5817
--- /dev/null
+++ b/fairseq/criterions/ctc.py
@@ -0,0 +1,247 @@
+# All rights reserved.
+#
+# This source code is licensed under the license found in the LICENSE file in
+# the root directory of this source tree. An additional grant of patent rights
+# can be found in the PATENTS file in the same directory.
+
+from argparse import Namespace
+import math
+
+import torch
+import torch.nn.functional as F
+from fairseq import metrics, utils
+from fairseq.data.data_utils import post_process
+from fairseq.criterions import FairseqCriterion, register_criterion
+from fairseq.logging.meters import safe_round
+
+
+@register_criterion("ctc")
+class CtcCriterion(FairseqCriterion):
+    def __init__(self, task, wer_args, zero_infinity, sentence_avg, remove_bpe):
+        """Cache the special-symbol indices and optionally build a KenLM decoder.
+
+        ``wer_args``, when not None, is a string evaluating to the 4-tuple
+        ``(kenlm model, lexicon, lm weight, word score)``.
+        """
+        super().__init__(task)
+        tgt_dict = task.target_dictionary
+        self.blank_idx = tgt_dict.bos()  # the BOS index is used as the CTC blank
+        self.pad_idx = tgt_dict.pad()
+        self.eos_idx = tgt_dict.eos()
+        self.post_process = remove_bpe or "letter"
+
+        if wer_args is None:
+            self.w2l_decoder = None
+        else:
+            from examples.speech_recognition.w2l_decoder import W2lKenLMDecoder
+
+            # NOTE(review): eval() runs arbitrary code; only pass trusted --wer-args.
+            kenlm_path, lexicon_path, lm_weight, word_score = eval(wer_args)
+            vocab_cap = min(50, len(tgt_dict))
+            decoder_args = Namespace(
+                nbest=1, criterion="ctc", kenlm_model=kenlm_path, lexicon=lexicon_path,
+                beam=50, beam_size_token=vocab_cap, beam_threshold=vocab_cap,
+                lm_weight=lm_weight, word_score=word_score,
+                unk_weight=-math.inf, sil_weight=0,
+            )
+            self.w2l_decoder = W2lKenLMDecoder(decoder_args, tgt_dict)
+
+        self.zero_infinity = zero_infinity
+        self.sentence_avg = sentence_avg
+
+    @staticmethod
+    def add_args(parser):
+        """Add criterion-specific arguments to the parser.
+
+        ``--remove-bpe`` may already be registered (e.g. by the eval args);
+        only that argparse conflict is tolerated, other errors propagate.
+        """
+        from argparse import ArgumentError  # the module only imports Namespace
+
+        parser.add_argument("--zero-infinity", action="store_true", help="zero inf loss")
+        try:
+            parser.add_argument(
+                "--remove-bpe", "--post-process", default="letter",
+                help="remove BPE tokens before scoring (can be set to sentencepiece, letter, and more)",
+            )
+        except ArgumentError:
+            pass  # this option might have been added from eval args
+        parser.add_argument(
+            "--wer-args", type=str, default=None,
+            help="options for wer computation on valid set using 4 gram lm. this should be a tuple of 4 elements: path to 4-gram lm, \
+            path to lexicon, lm score, word score",
+        )
+
+    def forward(self, model, sample, reduce=True):  # -> (loss, sample_size, logging_output)
+        net_output = model(**sample["net_input"])
+        lprobs = model.get_normalized_probs(
+            net_output, log_probs=True
+        ).contiguous()  # (T, B, C) from the encoder
+        # Input lengths come from the batch when given, else from the padding mask.
+        if "src_lengths" in sample["net_input"]:
+            input_lengths = sample["net_input"]["src_lengths"]
+        else:
+            non_padding_mask = ~net_output["padding_mask"]
+            input_lengths = non_padding_mask.long().sum(-1)
+        # Drop pad and eos symbols, then flatten the remaining targets.
+        pad_mask = (sample["target"] != self.pad_idx) & (
+            sample["target"] != self.eos_idx
+        )
+        targets_flat = sample["target"].masked_select(pad_mask)
+        target_lengths = sample["target_lengths"]
+        # cuDNN is disabled around ctc_loss; `reduce` is not consulted (always summed).
+        with torch.backends.cudnn.flags(enabled=False):
+            loss = F.ctc_loss(
+                lprobs,
+                targets_flat,
+                input_lengths,
+                target_lengths,
+                blank=self.blank_idx,
+                reduction="sum",
+                zero_infinity=self.zero_infinity,
+            )
+        # Prefer the batch's token count; otherwise sum the target lengths.
+        ntokens = (
+            sample["ntokens"] if "ntokens" in sample else target_lengths.sum().item()
+        )
+
+        sample_size = sample["target"].size(0) if self.sentence_avg else ntokens
+        logging_output = {
+            "loss": utils.item(loss.data),  # * sample['ntokens'],
+            "ntokens": ntokens,
+            "nsentences": sample["id"].numel(),
+            "sample_size": sample_size,
+        }
+        # Outside training mode, also count unit- and word-level edit errors.
+        if not model.training:
+            import editdistance
+
+            with torch.no_grad():
+                lprobs_t = lprobs.transpose(0, 1).float().cpu()
+
+                c_err = 0
+                c_len = 0
+                w_errs = 0
+                w_len = 0
+                wv_errs = 0
+                for lp, t, inp_l in zip(
+                    lprobs_t,
+                    sample["target_label"]
+                    if "target_label" in sample
+                    else sample["target"],
+                    input_lengths,
+                ):
+                    lp = lp[:inp_l].unsqueeze(0)
+                    # Optional decoder pass: keep decoded[0][0], or None when either level is empty.
+                    decoded = None
+                    if self.w2l_decoder is not None:
+                        decoded = self.w2l_decoder.decode(lp)
+                        if len(decoded) < 1:
+                            decoded = None
+                        else:
+                            decoded = decoded[0]
+                            if len(decoded) < 1:
+                                decoded = None
+                            else:
+                                decoded = decoded[0]
+                    # Reference units with pad/eos removed.
+                    p = (t != self.task.target_dictionary.pad()) & (
+                        t != self.task.target_dictionary.eos()
+                    )
+                    targ = t[p]
+                    targ_units = self.task.target_dictionary.string(targ)
+                    targ_units_arr = targ.tolist()
+                    # Greedy path: per-frame argmax, collapse repeats, then drop blanks.
+                    toks = lp.argmax(dim=-1).unique_consecutive()
+                    pred_units_arr = toks[toks != self.blank_idx].tolist()
+                    # Unit-level edit distance against the reference.
+                    c_err += editdistance.eval(pred_units_arr, targ_units_arr)
+                    c_len += len(targ_units_arr)
+
+                    targ_words = post_process(targ_units, self.post_process).split()
+
+                    pred_units = self.task.target_dictionary.string(pred_units_arr)
+                    pred_words_raw = post_process(pred_units, self.post_process).split()
+                    # w_errs uses the decoder's words when available; wv_errs always uses the greedy words.
+                    if decoded is not None and "words" in decoded:
+                        pred_words = decoded["words"]
+                        w_errs += editdistance.eval(pred_words, targ_words)
+                        wv_errs += editdistance.eval(pred_words_raw, targ_words)
+                    else:
+                        dist = editdistance.eval(pred_words_raw, targ_words)
+                        w_errs += dist
+                        wv_errs += dist
+
+                    w_len += len(targ_words)
+
+                logging_output["wv_errors"] = wv_errs
+                logging_output["w_errors"] = w_errs
+                logging_output["w_total"] = w_len
+                logging_output["c_errors"] = c_err
+                logging_output["c_total"] = c_len
+
+        return loss, sample_size, logging_output
+
+    @staticmethod
+    def reduce_metrics(logging_outputs) -> None:
+        """Sum the workers' logging outputs and publish them as metrics.
+
+        Logs the base-2 loss, token/sentence counts, the raw error counters
+        (underscore-prefixed) and, when totals are positive, the derived
+        ``uer``, ``wer`` and ``raw_wer`` percentages.
+        """
+
+        def total(key):
+            return sum(entry.get(key, 0) for entry in logging_outputs)
+
+        def percentage(errors_name, total_name):
+            # Derived meter: 100 * errors / total, or NaN when nothing was counted.
+            def derive(meters):
+                denominator = meters[total_name].sum
+                if denominator > 0:
+                    return safe_round(
+                        meters[errors_name].sum * 100.0 / denominator, 3
+                    )
+                return float("nan")
+
+            return derive
+
+        loss_sum = utils.item(total("loss"))
+        ntokens = utils.item(total("ntokens"))
+        nsentences = utils.item(total("nsentences"))
+        sample_size = utils.item(total("sample_size"))
+        log2 = math.log(2)
+
+        metrics.log_scalar("loss", loss_sum / sample_size / log2, sample_size, round=3)
+        metrics.log_scalar("ntokens", ntokens)
+        metrics.log_scalar("nsentences", nsentences)
+        if sample_size != ntokens:
+            metrics.log_scalar("nll_loss", loss_sum / ntokens / log2, ntokens, round=3)
+
+        # Raw counters go under underscore-prefixed meter names.
+        counters = {}
+        for key, meter_name in (
+            ("c_errors", "_c_errors"),
+            ("c_total", "_c_total"),
+            ("w_errors", "_w_errors"),
+            ("wv_errors", "_wv_errors"),
+            ("w_total", "_w_total"),
+        ):
+            counters[key] = total(key)
+            metrics.log_scalar(meter_name, counters[key])
+
+        if counters["c_total"] > 0:
+            metrics.log_derived("uer", percentage("_c_errors", "_c_total"))
+
+        if counters["w_total"] > 0:
+            metrics.log_derived("wer", percentage("_w_errors", "_w_total"))
+            metrics.log_derived("raw_wer", percentage("_wv_errors", "_w_total"))
+
+    @staticmethod
+    def logging_outputs_can_be_summed() -> bool:
+        """
+        Whether the logging outputs returned by `forward` can be summed
+        across workers prior to calling `reduce_metrics`. Setting this
+        to True will improve distributed training speed.
+        """
+        return True
diff --git a/fairseq/criterions/fairseq_criterion.py b/fairseq/criterions/fairseq_criterion.py
new file mode 100644
index 0000000000000000000000000000000000000000..9873574d474f03713b75fe22dae4302716cda467
--- /dev/null
+++ b/fairseq/criterions/fairseq_criterion.py
@@ -0,0 +1,119 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import inspect
+from typing import Any, Dict, List
+
+from torch.nn.modules.loss import _Loss
+
+from fairseq import metrics, utils
+
+
+class FairseqCriterion(_Loss):
+
+    def __init__(self, task):
+        super().__init__()
+        self.task = task
+        if hasattr(task, 'target_dictionary'):  # else padding_idx is left unset
+            dictionary = task.target_dictionary
+            self.padding_idx = -100 if dictionary is None else dictionary.pad()
+
+    @staticmethod
+    def add_args(parser):
+        """Add criterion-specific arguments to the parser (the base class adds none)."""
+        pass
+
+    @classmethod
+    def build_criterion(cls, args, task):
+        """Construct a criterion from command-line args.
+
+        Each ``__init__`` parameter is filled from ``task`` (for the parameter
+        named ``task``) or from the same-named attribute of ``args``;
+        parameters with a default may be absent from ``args``.
+
+        Raises:
+            NotImplementedError: for positional-only, ``*args`` or ``**kwargs``
+                parameters, or a required parameter missing from ``args``.
+        """
+        # Criterions can override this; by default argparse.Namespace keys
+        # are mapped by name onto the arguments of __init__.
+        init_args = {}
+        for p in inspect.signature(cls).parameters.values():
+            if p.kind not in (p.POSITIONAL_OR_KEYWORD, p.KEYWORD_ONLY):
+                # we haven't implemented inference for these argument types
+                raise NotImplementedError('{} not supported'.format(p.kind))
+
+            if p.name == 'task':
+                init_args['task'] = task
+            elif hasattr(args, p.name):
+                init_args[p.name] = getattr(args, p.name)
+            elif p.default is p.empty:
+                # `empty` is a sentinel: test identity, never (in)equality.
+                raise NotImplementedError(
+                    'Unable to infer Criterion arguments, please implement '
+                    '{}.build_criterion'.format(cls.__name__)
+                )
+        return cls(**init_args)
+
+    def forward(self, model, sample, reduce=True):
+        """Compute the loss for the given sample (not implemented in the base class).
+
+        Returns a tuple with three elements:
+        1) the loss
+        2) the sample size, which is used as the denominator for the gradient
+        3) logging outputs to display while training
+        """
+        raise NotImplementedError
+
+    @staticmethod
+    def aggregate_logging_outputs(logging_outputs: List[Dict[str, Any]]) -> Dict[str, Any]:
+        """Deprecated aggregation hook: emit a warning, then raise."""
+        deprecation_notice = (
+            'The aggregate_logging_outputs API is deprecated. '
+            'Please use the reduce_metrics API instead.'
+        )
+        utils.deprecation_warning(deprecation_notice)
+        # The base class provides no aggregation of its own.
+        raise NotImplementedError
+
+    @classmethod
+    def reduce_metrics(cls, logging_outputs: List[Dict[str, Any]]) -> None:
+        """Fallback: log every aggregated value except the size counters."""
+        utils.deprecation_warning(
+            'Criterions should implement the reduce_metrics API. '
+            'Falling back to deprecated aggregate_logging_outputs API.'
+        )
+        size_keys = {'nsentences', 'ntokens', 'sample_size'}
+        aggregated = cls.aggregate_logging_outputs(logging_outputs)
+        for name, value in aggregated.items():
+            if name not in size_keys:
+                metrics.log_scalar(name, value)
+
+    @staticmethod
+    def logging_outputs_can_be_summed() -> bool:
+        """
+        Whether the logging outputs returned by `forward` can be summed
+        across workers prior to calling `reduce_metrics`. Setting this
+        to True will improve distributed training speed.
+        """
+        return False
+
+
+class LegacyFairseqCriterion(FairseqCriterion):
+    """Deprecated base for criterions that keep the whole ``args`` namespace."""
+
+    def __init__(self, args, task):
+        super().__init__(task=task)
+        self.args = args
+        utils.deprecation_warning(
+            'Criterions should take explicit arguments instead of an '
+            'argparse.Namespace object, please update your criterion by '
+            'extending FairseqCriterion instead of LegacyFairseqCriterion.'
+        )
+
+    @classmethod
+    def build_criterion(cls, args, task):
+        """Skip argument inference and call ``cls(args, task)`` directly."""
+        return cls(args, task)
diff --git a/fairseq/criterions/label_smoothed_cross_entropy.py b/fairseq/criterions/label_smoothed_cross_entropy.py
new file mode 100644
index 0000000000000000000000000000000000000000..d010c3d03d82fffb6d5d6d0e346b850564534134
--- /dev/null
+++ b/fairseq/criterions/label_smoothed_cross_entropy.py
@@ -0,0 +1,96 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+
+from fairseq import metrics, utils
+from fairseq.criterions import FairseqCriterion, register_criterion
+
+
+def label_smoothed_nll_loss(lprobs, target, epsilon, ignore_index=None, reduce=True):
+    """Return ``(loss, nll_loss)`` for label smoothing with weight ``epsilon``.
+
+    Positions whose target equals ``ignore_index`` contribute zero to both terms.
+    """
+    if target.dim() == lprobs.dim() - 1:
+        target = target.unsqueeze(-1)  # give target a trailing axis for gather
+    nll = lprobs.gather(dim=-1, index=target).neg()
+    smooth = lprobs.sum(dim=-1, keepdim=True).neg()
+    if ignore_index is None:
+        nll, smooth = nll.squeeze(-1), smooth.squeeze(-1)
+    else:
+        ignored = target.eq(ignore_index)
+        nll, smooth = nll.masked_fill(ignored, 0.), smooth.masked_fill(ignored, 0.)
+    if reduce:
+        nll, smooth = nll.sum(), smooth.sum()
+    uniform_weight = epsilon / lprobs.size(-1)
+    return (1. - epsilon) * nll + uniform_weight * smooth, nll
+
+
+@register_criterion('label_smoothed_cross_entropy')
+class LabelSmoothedCrossEntropyCriterion(FairseqCriterion):
+
+    def __init__(self, task, sentence_avg, label_smoothing):
+        super().__init__(task)
+        self.sentence_avg = sentence_avg  # truthy: forward uses the sentence count as sample size
+        self.eps = label_smoothing  # smoothing weight passed to label_smoothed_nll_loss
+
+    @staticmethod
+    def add_args(parser):
+        """Register the ``--label-smoothing`` option (float, default 0.) on the parser."""
+        # fmt: off
+        parser.add_argument('--label-smoothing', default=0., type=float, metavar='D',
+                            help='epsilon for label smoothing, 0 means no label smoothing')
+        # fmt: on
+
+    def forward(self, model, sample, reduce=True):
+        """Run the model on ``sample`` and score it with the smoothed loss.
+
+        Returns ``(loss, sample_size, logging_output)``; ``sample_size`` is the
+        number of sentences under ``sentence_avg`` and the token count
+        otherwise, and serves as the denominator for the gradient.
+        """
+        loss, nll_loss = self.compute_loss(
+            model, model(**sample['net_input']), sample, reduce=reduce,
+        )
+        nsentences = sample['target'].size(0)
+        ntokens = sample['ntokens']
+        sample_size = nsentences if self.sentence_avg else ntokens
+        logging_output = {
+            'loss': loss.data, 'nll_loss': nll_loss.data,
+            'ntokens': ntokens, 'nsentences': nsentences,
+            'sample_size': sample_size,
+        }
+        return loss, sample_size, logging_output
+
+    def compute_loss(self, model, net_output, sample, reduce=True):
+        """Flatten log-probs and targets, then return ``(loss, nll_loss)``."""
+        log_probs = model.get_normalized_probs(net_output, log_probs=True)
+        flat_log_probs = log_probs.view(-1, log_probs.size(-1))
+        column_target = model.get_targets(sample, net_output).view(-1, 1)
+        return label_smoothed_nll_loss(
+            flat_log_probs, column_target, self.eps, ignore_index=self.padding_idx, reduce=reduce,
+        )
+
+    @staticmethod
+    def reduce_metrics(logging_outputs) -> None:
+        """Sum worker outputs; log base-2 ``loss``, ``nll_loss`` and derived ``ppl``."""
+        totals = {
+            key: sum(entry.get(key, 0) for entry in logging_outputs)
+            for key in ('loss', 'nll_loss', 'ntokens', 'sample_size')
+        }
+        ntokens, sample_size = totals['ntokens'], totals['sample_size']
+        metrics.log_scalar('loss', totals['loss'] / sample_size / math.log(2), sample_size, round=3)
+        metrics.log_scalar('nll_loss', totals['nll_loss'] / ntokens / math.log(2), ntokens, round=3)
+        metrics.log_derived('ppl', lambda meters: utils.get_perplexity(meters['nll_loss'].avg))
+
+    @staticmethod
+    def logging_outputs_can_be_summed() -> bool:
+        """
+        Whether the logging outputs returned by `forward` can be summed
+        across workers prior to calling `reduce_metrics`. Setting this
+        to True will improve distributed training speed.
+        """
+        return True
diff --git a/fairseq/criterions/label_smoothed_cross_entropy_with_alignment.py b/fairseq/criterions/label_smoothed_cross_entropy_with_alignment.py
new file mode 100644
index 0000000000000000000000000000000000000000..cfc7e008cd5387cbdcc835cabb1f8eaa18851064
--- /dev/null
+++ b/fairseq/criterions/label_smoothed_cross_entropy_with_alignment.py
@@ -0,0 +1,97 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+
+from fairseq import metrics, utils
+from fairseq.criterions import register_criterion
+
+from .label_smoothed_cross_entropy import LabelSmoothedCrossEntropyCriterion
+
+
+@register_criterion('label_smoothed_cross_entropy_with_alignment')
+class LabelSmoothedCrossEntropyCriterionWithAlignment(LabelSmoothedCrossEntropyCriterion):
+
+    def __init__(self, task, sentence_avg, label_smoothing, alignment_lambda):
+        super().__init__(task, sentence_avg, label_smoothing)
+        self.alignment_lambda = alignment_lambda  # multiplies the alignment loss in forward()
+
+    @staticmethod
+    def add_args(parser):
+        """Add the parent criterion's options plus ``--alignment-lambda`` to the parser."""
+        LabelSmoothedCrossEntropyCriterion.add_args(parser)
+        parser.add_argument('--alignment-lambda', default=0.05, type=float, metavar='D',
+                            help='weight for the alignment loss')
+
+    def forward(self, model, sample, reduce=True):
+        """Score ``sample`` with the smoothed loss plus a weighted alignment term.
+
+        Returns ``(loss, sample_size, logging_output)``. The logged ``loss`` is
+        recorded before the alignment term is added to the returned loss.
+        """
+        net_output = model(**sample['net_input'])
+        loss, nll_loss = self.compute_loss(model, net_output, sample, reduce=reduce)
+        nsentences = sample['target'].size(0)
+        sample_size = nsentences if self.sentence_avg else sample['ntokens']
+        def loggable(tensor):
+            # Reduced losses go through utils.item; unreduced ones are logged as-is.
+            return utils.item(tensor.data) if reduce else tensor.data
+
+        logging_output = {
+            'loss': loggable(loss),
+            'nll_loss': loggable(nll_loss),
+            'ntokens': sample['ntokens'],
+            'nsentences': nsentences,
+            'sample_size': sample_size,
+        }
+
+        # Only samples that carry alignments contribute an alignment loss.
+        if 'alignments' in sample and sample['alignments'] is not None:
+            alignment_loss = self.compute_alignment_loss(sample, net_output)
+            if alignment_loss is not None:
+                logging_output['alignment_loss'] = utils.item(alignment_loss.data)
+                loss += self.alignment_lambda * alignment_loss
+
+        return loss, sample_size, logging_output
+
+    def compute_alignment_loss(self, sample, net_output):
+        """Return the weighted negative log attention over aligned pairs, or None.
+
+        ``sample['alignments']`` has shape [:, 2] (src-tgt index pairs) and
+        ``sample['align_weights']`` has shape [:] (1 / frequency of a tgt index).
+        """
+        attn_prob = net_output[1]['attn'][0]
+        bsz, tgt_sz, src_sz = attn_prob.shape
+        flat_attn = attn_prob.view(bsz * tgt_sz, src_sz)
+        pairs = sample['alignments']
+        weights = sample['align_weights'].float()
+        if len(pairs) == 0:
+            return None  # no aligned pairs in this batch
+        tgt_rows, src_cols = pairs[:, 1][:, None], pairs[:, 0][:, None]
+        log_attn = flat_attn[tgt_rows, src_cols].log()
+        return -(log_attn * weights[:, None]).sum()
+
+    @staticmethod
+    def reduce_metrics(logging_outputs) -> None:
+        """Sum worker outputs; log base-2 losses (incl. alignment) and ``ppl``."""
+        def total(key):
+            return utils.item(sum(entry.get(key, 0) for entry in logging_outputs))
+        loss_sum, nll_loss_sum = total('loss'), total('nll_loss')
+        alignment_loss_sum = total('alignment_loss')
+        ntokens, sample_size = total('ntokens'), total('sample_size')
+        log2 = math.log(2)
+        metrics.log_scalar('loss', loss_sum / sample_size / log2, sample_size, round=3)
+        metrics.log_scalar('nll_loss', nll_loss_sum / ntokens / log2, ntokens, round=3)
+        metrics.log_scalar('alignment_loss', alignment_loss_sum / sample_size / log2, sample_size, round=3)
+        metrics.log_derived('ppl', lambda meters: utils.get_perplexity(meters['nll_loss'].avg))
+
+    @staticmethod
+    def logging_outputs_can_be_summed() -> bool:
+        """
+        Whether the logging outputs returned by `forward` can be summed
+        across workers prior to calling `reduce_metrics`. Setting this
+        to True will improve distributed training speed.
+        """
+        return True
diff --git a/fairseq/criterions/legacy_masked_lm.py b/fairseq/criterions/legacy_masked_lm.py
new file mode 100644
index 0000000000000000000000000000000000000000..10dea76e4b1a6151ff21174e6a0333f636440d45
--- /dev/null
+++ b/fairseq/criterions/legacy_masked_lm.py
@@ -0,0 +1,158 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+
+import torch
+import torch.nn.functional as F
+
+from fairseq import utils
+from fairseq.criterions import FairseqCriterion, register_criterion
+
+
+def compute_cross_entropy_loss(logits, targets, ignore_index=-100):
+    """
+    Return the summed cross entropy of ``logits`` against ``targets``.
+
+    The log-softmax is taken in float32. Targets equal to ``ignore_index``
+    are skipped; its default matches that of F.cross_entropy in pytorch.
+    """
+    rows_match = logits.size(0) == targets.size(-1)
+    assert rows_match, "Logits and Targets tensor shapes don't match up"
+    log_probs = F.log_softmax(logits, -1, dtype=torch.float32)
+    return F.nll_loss(
+        log_probs,
+        targets,
+        reduction="sum",
+        ignore_index=ignore_index,
+    )
+
+
+@register_criterion('legacy_masked_lm_loss')
+class LegacyMaskedLmLoss(FairseqCriterion):
+    """
+    Implementation for the loss used in masked language model (MLM) training.
+    This optionally also computes the next sentence prediction (NSP) loss and
+    adds it to the overall loss based on the specified args. There are three
+    cases to consider:
+        1) Generic MLM training without NSP loss. In this case sentence_targets
+           and sentence_logits are both None.
+        2) BERT training without NSP loss. In this case sentence_targets is
+           not None but sentence_logits is None and we should not be computing
+           a sentence level loss.
+        3) BERT training with NSP loss. In this case both sentence_targets and
+           sentence_logits are not None and we should be computing a sentence
+           level loss. The weight of the sentence level loss is specified as
+           an argument.
+    """
+
+    def __init__(self, task, masked_lm_only, nsp_loss_weight):
+        super().__init__(task)
+        self.masked_lm_only = masked_lm_only  # truthy: forward skips the sentence (NSP) loss
+        self.nsp_loss_weight = nsp_loss_weight  # scales the per-sentence NSP loss in forward
+
+    @staticmethod
+    def add_args(parser):
+        """Register ``--masked-lm-only`` and ``--nsp-loss-weight`` on the parser."""
+        # Default for masked_lm_only is False so as to not break BERT training
+        parser.add_argument('--masked-lm-only', default=False,
+                            action='store_true', help='compute MLM loss only')
+        parser.add_argument('--nsp-loss-weight', default=1.0, type=float,
+                            help='weight for next sentence prediction'
+                                 ' loss (default 1)')
+
+    def forward(self, model, sample, reduce=True):
+        """Compute the masked-LM loss, plus the optional NSP loss, for a sample.
+        Returns a tuple with three elements:
+        1) the loss: per-token LM loss (+ weighted per-sentence NSP loss)
+        2) the sample size, fixed to 1 here and used only for logging
+        3) logging outputs to display while training
+        """
+        lm_logits, output_metadata = model(**sample["net_input"])
+
+        # reshape lm_logits from (N,T,C) to (N*T,C)
+        lm_logits = lm_logits.view(-1, lm_logits.size(-1))
+        lm_targets = sample['lm_target'].view(-1)
+        lm_loss = compute_cross_entropy_loss(
+            lm_logits, lm_targets, self.padding_idx)
+
+        # compute the number of tokens for which loss is computed. This is used
+        # to normalize the loss
+        ntokens = utils.strip_pad(lm_targets, self.padding_idx).numel()
+        loss = lm_loss / ntokens
+        nsentences = sample['nsentences']
+        # (overwritten below when the sentence-loss branch runs)
+
+        # Compute sentence loss if masked_lm_only is False
+        sentence_loss = None
+        if not self.masked_lm_only:
+            sentence_logits = output_metadata['sentence_logits']
+            sentence_targets = sample['sentence_target'].view(-1)
+            # This needs to be recomputed due to some differences between
+            # TokenBlock and BlockPair dataset. This can be resolved with a
+            # refactor of BERTModel which we will do in the future.
+            # TODO: Remove this after refactor of BERTModel
+            nsentences = sentence_targets.size(0)
+
+            # Check for logits being none which can happen when remove_heads
+            # is set to true in the BERT model. Ideally we should set
+            # masked_lm_only to true in this case, but that requires some
+            # refactor in the BERT model.
+            if sentence_logits is not None:
+                sentence_loss = compute_cross_entropy_loss(
+                    sentence_logits, sentence_targets)
+
+                loss += self.nsp_loss_weight * (sentence_loss / nsentences)
+
+        # NOTE: as we are summing up per token mlm loss and per sentence nsp loss
+        # we don't need to use sample_size as denominator for the gradient
+        # here sample_size is just used for logging
+        sample_size = 1
+        logging_output = {  # `reduce` only selects between utils.item(...) and raw .data here
+            'loss': utils.item(loss.data) if reduce else loss.data,
+            'lm_loss': utils.item(lm_loss.data) if reduce else lm_loss.data,
+            # sentence loss is not always computed
+            'sentence_loss': (
+                (
+                    utils.item(sentence_loss.data) if reduce
+                    else sentence_loss.data
+                ) if sentence_loss is not None else 0.0
+            ),
+            'ntokens': ntokens,
+            'nsentences': nsentences,
+            'sample_size': sample_size,
+        }
+        return loss, sample_size, logging_output
+
+    @staticmethod
+    def aggregate_logging_outputs(logging_outputs):
+        """Sum worker outputs and return base-2 averages (0. for empty denominators)."""
+        def total(key):
+            return sum(entry.get(key, 0) for entry in logging_outputs)
+
+        def per_unit(numerator, denominator):
+            # Average in bits; falls back to 0. when the denominator is not positive.
+            return numerator / denominator / math.log(2) if denominator > 0 else 0.
+
+        lm_loss_sum = total('lm_loss')
+        ntokens, nsentences, sample_size = total('ntokens'), total('nsentences'), total('sample_size')
+        return {
+            'loss': per_unit(total('loss'), sample_size),
+            'lm_loss': per_unit(lm_loss_sum, ntokens),
+            'sentence_loss': per_unit(total('sentence_loss'), nsentences),
+            'nll_loss': per_unit(lm_loss_sum, ntokens),
+            'ntokens': ntokens,
+            'nsentences': nsentences,
+            'sample_size': sample_size,
+        }
+
+    @staticmethod
+    def logging_outputs_can_be_summed() -> bool:
+        """
+        Whether the logging outputs returned by `forward` can be summed
+        across workers prior to calling `reduce_metrics`. Setting this
+        to True will improve distributed training speed.
+        """
+        return True
diff --git a/fairseq/criterions/masked_lm.py b/fairseq/criterions/masked_lm.py
new file mode 100644
index 0000000000000000000000000000000000000000..80864693ecf7935443a7f14f6e3f65b4a334d1cb
--- /dev/null
+++ b/fairseq/criterions/masked_lm.py
@@ -0,0 +1,88 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+
+import torch
+import torch.nn.functional as F
+
+from fairseq import metrics, modules, utils
+from fairseq.criterions import FairseqCriterion, register_criterion
+
+
+@register_criterion('masked_lm')
+class MaskedLmLoss(FairseqCriterion):
+    """
+    Implementation for the loss used in masked language model (MLM) training.
+    """
+
+    def __init__(self, task, tpu):
+        super().__init__(task)
+        self.tpu = tpu  # when truthy, forward() projects all tokens and logs `loss` rather than `loss.data`
+
+    def forward(self, model, sample, reduce=True):
+        """Compute the loss for the given sample.
+
+        Returns a tuple with three elements:
+        1) the loss (always summed here; the `reduce` argument is not read)
+        2) the sample size, which is used as the denominator for the gradient
+        3) logging outputs to display while training
+        """
+        masked_tokens = sample['target'].ne(self.padding_idx)  # True where the target is not padding
+        sample_size = masked_tokens.int().sum()  # stays a tensor; no .item() call here
+
+        # Rare: when no tokens are masked (all targets are padding), project all tokens.
+        # We use torch.where to avoid device-to-host transfers,
+        # except on CPU where torch.where is not well supported
+        # (see github.com/pytorch/pytorch/issues/26247).
+        if self.tpu:
+            masked_tokens = None  # always project all tokens on TPU
+        elif masked_tokens.device == torch.device('cpu'):
+            if not masked_tokens.any():
+                masked_tokens = None
+        else:
+            masked_tokens = torch.where(
+                masked_tokens.any(),
+                masked_tokens,
+                masked_tokens.new([True]),
+            )
+
+        logits = model(**sample['net_input'], masked_tokens=masked_tokens)[0]
+        targets = model.get_targets(sample, [logits])
+        if masked_tokens is not None:
+            targets = targets[masked_tokens]  # keep only the selected target positions
+
+        loss = modules.cross_entropy(
+            logits.view(-1, logits.size(-1)),
+            targets.view(-1),
+            reduction='sum',
+            ignore_index=self.padding_idx,
+        )
+
+        logging_output = {
+            'loss': loss if self.tpu else loss.data,
+            'ntokens': sample['ntokens'],
+            'nsentences': sample['nsentences'],
+            'sample_size': sample_size,
+        }
+        return loss, sample_size, logging_output
+
+    @staticmethod
+    def reduce_metrics(logging_outputs) -> None:
+        """Aggregate logging outputs from data parallel training."""
+        loss_sum = sum(log.get('loss', 0) for log in logging_outputs)
+        sample_size = sum(log.get('sample_size', 0) for log in logging_outputs)
+
+        metrics.log_scalar('loss', loss_sum / sample_size / math.log(2), sample_size, round=3)  # base-2 loss
+        metrics.log_derived('ppl', lambda meters: utils.get_perplexity(meters['loss'].avg))
+
+    @staticmethod
+    def logging_outputs_can_be_summed() -> bool:
+        """
+        Whether the logging outputs returned by `forward` can be summed
+        across workers prior to calling `reduce_metrics`. Setting this
+        to True improves distributed training speed.
+        """
+        return True
diff --git a/fairseq/criterions/nat_loss.py b/fairseq/criterions/nat_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..3326734d5535c72f4644ebe0e3a4b2966477dc94
--- /dev/null
+++ b/fairseq/criterions/nat_loss.py
@@ -0,0 +1,173 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+
+import torch.nn.functional as F
+import torch
+from torch import Tensor
+
+from fairseq import metrics, utils
+from fairseq.criterions import FairseqCriterion, register_criterion
+
+
+@register_criterion("nat_loss")
+class LabelSmoothedDualImitationCriterion(FairseqCriterion):
+    """Criterion that sums the named per-output losses returned by the model."""
+
+    def __init__(self, task, label_smoothing):
+        super().__init__(task)
+        self.label_smoothing = label_smoothing  # not read in this class; forward() uses each output's "ls"
+
+    @staticmethod
+    def add_args(parser):
+        """Add criterion-specific arguments to the parser."""
+        parser.add_argument(
+            '--label-smoothing',
+            default=0.,
+            type=float,
+            metavar='D',
+            help='epsilon for label smoothing, 0 means no label smoothing',
+        )
+
+    def _compute_loss(
+        self, outputs, targets, masks=None, label_smoothing=0.0, name="loss", factor=1.0
+    ):
+        """Compute the (optionally label-smoothed) loss for one model output.
+
+            outputs: batch x len x d_model
+            targets: batch x len
+            masks:   batch x len
+
+        Returns a dict with "name", "loss" (scaled by `factor`), "nll_loss" and "factor".
+        """
+        def mean_ds(x: Tensor, dim=None) -> Tensor:
+            return (
+                x.float().mean().type_as(x)
+                if dim is None
+                else x.float().mean(dim).type_as(x)
+            )
+        if masks is not None:
+            outputs, targets = outputs[masks], targets[masks]
+
+        if masks is not None and not masks.any():  # nothing selected by the mask: report a zero loss
+            nll_loss = torch.tensor(0)
+            loss = nll_loss
+        else:
+            logits = F.log_softmax(outputs, dim=-1)
+            if targets.dim() == 1:
+                losses = F.nll_loss(logits, targets.to(logits.device), reduction='none')
+
+            else:  # soft-labels
+                losses = F.kl_div(logits, targets.to(logits.device), reduction='none')
+                losses = losses.sum(-1)
+
+            nll_loss = mean_ds(losses)
+            if label_smoothing > 0:
+                loss = nll_loss * (
+                    1 - label_smoothing) - mean_ds(logits) * label_smoothing
+            else:
+                loss = nll_loss
+
+        loss = loss * factor  # only "loss" is scaled; "nll_loss" is returned unscaled
+        return {"name": name, "loss": loss, "nll_loss": nll_loss, "factor": factor}
+
+    def _custom_loss(self, loss, name="loss", factor=1.0):  # wraps a model-supplied loss; no "nll_loss" key
+        return {"name": name, "loss": loss, "factor": factor}
+
+    def forward(self, model, sample, reduce=True):
+        """Compute the loss for the given sample.
+        Returns a tuple with three elements:
+        1) the loss
+        2) the sample size, which is used as the denominator for the gradient
+        3) logging outputs to display while training
+        """
+        nsentences, ntokens = sample["nsentences"], sample["ntokens"]
+
+        # B x T
+        src_tokens, src_lengths = (
+            sample["net_input"]["src_tokens"],
+            sample["net_input"]["src_lengths"],
+        )
+        tgt_tokens, prev_output_tokens = sample["target"], sample["prev_target"]
+
+        outputs = model(src_tokens, src_lengths, prev_output_tokens, tgt_tokens)
+        losses, nll_loss = [], []
+
+        for obj in outputs:
+            if outputs[obj].get("loss", None) is None:  # no precomputed loss: derive it from out/tgt/mask
+                _losses = self._compute_loss(
+                    outputs[obj].get("out"),
+                    outputs[obj].get("tgt"),
+                    outputs[obj].get("mask", None),
+                    outputs[obj].get("ls", 0.0),
+                    name=obj + '-loss',
+                    factor=outputs[obj].get("factor", 1.0)
+                )
+            else:
+                _losses = self._custom_loss(
+                    outputs[obj].get("loss"),
+                    name=obj + '-loss',
+                    factor=outputs[obj].get("factor", 1.0)
+                )
+
+            losses.append(_losses)
+            if outputs[obj].get("nll_loss", False):  # only outputs flagged "nll_loss" count towards it
+                nll_loss.append(_losses.get("nll_loss", 0.0))
+
+        loss = sum(entry["loss"] for entry in losses)
+        nll_loss = sum(nll_loss) if len(nll_loss) > 0 \
+            else loss.new_tensor(0)
+
+        # NOTE:
+        # we don't need to use sample_size as denominator for the gradient
+        # here sample_size is just used for logging
+        sample_size = 1
+        logging_output = {
+            "loss": loss.data,
+            "nll_loss": nll_loss.data,
+            "ntokens": ntokens,
+            "nsentences": nsentences,
+            "sample_size": sample_size,
+        }
+
+        for entry in losses:  # log each named loss with its scaling factor divided back out
+            logging_output[entry["name"]] = (
+                utils.item(entry["loss"].data / entry["factor"])
+                if reduce
+                else entry["loss"].data / entry["factor"]
+            )
+
+        return loss, sample_size, logging_output
+
+    @staticmethod
+    def reduce_metrics(logging_outputs) -> None:
+        """Aggregate logging outputs from data parallel training."""
+        sample_size = utils.item(sum(log.get("sample_size", 0) for log in logging_outputs))
+        loss = utils.item(sum(log.get("loss", 0) for log in logging_outputs))
+        nll_loss = utils.item(sum(log.get("nll_loss", 0) for log in logging_outputs))
+
+        metrics.log_scalar('loss', loss / sample_size / math.log(2), sample_size, round=3)
+        metrics.log_scalar('nll_loss', nll_loss / sample_size / math.log(2), sample_size, round=3)
+        metrics.log_derived('ppl', lambda meters: utils.get_perplexity(meters['loss'].avg))
+
+        for key in logging_outputs[0]:  # the first entry decides which "<name>-loss" keys are reported
+            if key[-5:] == "-loss":
+                val = sum(log.get(key, 0) for log in logging_outputs)
+                metrics.log_scalar(
+                    key[:-5],
+                    val / sample_size / math.log(2) if sample_size > 0 else 0.0,
+                    sample_size,
+                    round=3,
+                )
+
+    @staticmethod
+    def logging_outputs_can_be_summed() -> bool:
+        """
+        Whether the logging outputs returned by `forward` can be summed
+        across workers prior to calling `reduce_metrics`. Setting this
+        to True improves distributed training speed.
+        """
+        return True
diff --git a/fairseq/criterions/sentence_prediction.py b/fairseq/criterions/sentence_prediction.py
new file mode 100644
index 0000000000000000000000000000000000000000..4ba13178560e1fc8db4435b07dc46033bd478031
--- /dev/null
+++ b/fairseq/criterions/sentence_prediction.py
@@ -0,0 +1,95 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+
+import torch
+import torch.nn.functional as F
+
+from fairseq import metrics, utils
+from fairseq.criterions import FairseqCriterion, register_criterion
+
+
+@register_criterion('sentence_prediction')
+class SentencePredictionCriterion(FairseqCriterion):
+    """Sentence-level criterion: summed NLL for classification, summed MSE for regression."""
+
+    def __init__(self, task, classification_head_name, regression_target):
+        super().__init__(task)
+        self.regression_target = regression_target
+        self.classification_head_name = classification_head_name
+
+    @staticmethod
+    def add_args(parser):
+        # fmt: off
+        parser.add_argument('--classification-head-name',
+                            default='sentence_classification_head',
+                            help='name of the classification head to use')
+        # fmt: on
+
+    def forward(self, model, sample, reduce=True):
+        """Score the batch with the configured classification head.
+
+        Returns a tuple with three elements:
+        1) the summed loss (NLL, or MSE when `regression_target` is set)
+        2) the sample size (number of targets), the gradient denominator
+        3) a dict of logging outputs
+        """
+        head_name = self.classification_head_name
+        assert hasattr(model, 'classification_heads') and head_name in model.classification_heads, (
+            'model must provide sentence classification head for --criterion=sentence_prediction'
+        )
+
+        logits, _ = model(
+            **sample['net_input'],
+            features_only=True,
+            classification_head_name=head_name,
+        )
+        targets = model.get_targets(sample, [logits]).view(-1)
+        sample_size = targets.numel()
+
+        if self.regression_target:
+            loss = F.mse_loss(logits.view(-1).float(), targets.float(), reduction='sum')
+            ncorrect = None  # accuracy is only reported for classification
+        else:
+            lprobs = F.log_softmax(logits, dim=-1, dtype=torch.float32)
+            loss = F.nll_loss(lprobs, targets, reduction='sum')
+            ncorrect = (logits.argmax(dim=1) == targets).sum()
+
+        logging_output = {
+            'loss': loss.data,
+            'ntokens': sample['ntokens'],
+            'nsentences': sample_size,
+            'sample_size': sample_size,
+        }
+        if ncorrect is not None:
+            logging_output['ncorrect'] = ncorrect
+
+        return loss, sample_size, logging_output
+
+    @staticmethod
+    def reduce_metrics(logging_outputs) -> None:
+        """Log the loss (base 2) and, when it was recorded, the accuracy in percent."""
+        def total(key):
+            return sum(entry.get(key, 0) for entry in logging_outputs)
+
+        loss_sum, ntokens = total('loss'), total('ntokens')
+        nsentences, sample_size = total('nsentences'), total('sample_size')
+        ln2 = math.log(2)
+        metrics.log_scalar('loss', loss_sum / sample_size / ln2, sample_size, round=3)
+        if sample_size != ntokens:
+            metrics.log_scalar('nll_loss', loss_sum / ntokens / ln2, ntokens, round=3)
+
+        if len(logging_outputs) > 0 and 'ncorrect' in logging_outputs[0]:
+            metrics.log_scalar('accuracy', 100.0 * total('ncorrect') / nsentences, nsentences, round=1)
+
+    @staticmethod
+    def logging_outputs_can_be_summed() -> bool:
+        """
+        Tells the caller that the logging outputs returned by `forward`
+        may be summed across workers before `reduce_metrics` is called.
+        Returning True improves distributed training speed.
+        """
+        return True
diff --git a/fairseq/criterions/sentence_ranking.py b/fairseq/criterions/sentence_ranking.py
new file mode 100644
index 0000000000000000000000000000000000000000..52a0a177d846553ca736026efc181e713fa6bd97
--- /dev/null
+++ b/fairseq/criterions/sentence_ranking.py
@@ -0,0 +1,116 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+
+import torch
+import torch.nn.functional as F
+
+from fairseq import metrics, utils
+from fairseq.criterions import FairseqCriterion, register_criterion
+
+
+@register_criterion('sentence_ranking')
+class SentenceRankingCriterion(FairseqCriterion):
+    """Ranks `num_classes` candidate inputs per sample with a classification head (summed NLL loss)."""
+
+    def __init__(self, task, ranking_head_name, save_predictions, num_classes):
+        super().__init__(task)
+        self.ranking_head_name = ranking_head_name
+        self.num_classes = num_classes
+        # Optional predictions file; it stays open until __del__ closes it.
+        self.prediction_h = open(save_predictions, 'w') if save_predictions is not None else None
+
+    def __del__(self):
+        # getattr: the attribute does not exist if __init__ raised before assigning it.
+        if getattr(self, 'prediction_h', None) is not None:
+            self.prediction_h.close()
+
+    @staticmethod
+    def add_args(parser):
+        # fmt: off
+        parser.add_argument('--save-predictions', metavar='FILE',
+                            help='file to save predictions to')
+        parser.add_argument('--ranking-head-name',
+                            default='sentence_classification_head',
+                            help='name of the ranking head to use')
+        # fmt: on
+
+    def forward(self, model, sample, reduce=True):
+        """Compute ranking loss for the given sample.
+
+        Returns a tuple with three elements:
+        1) the loss
+        2) the sample size, which is used as the denominator for the gradient
+        3) logging outputs to display while training
+        """
+        assert (
+            hasattr(model, 'classification_heads')
+            and self.ranking_head_name in model.classification_heads
+        ), 'model must provide sentence ranking head for --criterion=sentence_ranking'
+
+        scores = []
+        for idx in range(self.num_classes):  # one model pass per candidate: net_input1 .. net_inputN
+            score, _ = model(
+                **sample['net_input{idx}'.format(idx=idx+1)],
+                classification_head_name=self.ranking_head_name,
+            )
+            scores.append(score)
+
+        logits = torch.cat(scores, dim=1)
+        sample_size = logits.size(0)
+
+        if 'target' in sample:
+            targets = model.get_targets(sample, [logits]).view(-1)
+            lprobs = F.log_softmax(logits, dim=-1, dtype=torch.float32)
+            loss = F.nll_loss(lprobs, targets, reduction='sum')
+        else:
+            targets = None  # no labels in the sample: zero loss, predictions only
+            loss = torch.tensor(0.0, requires_grad=True)
+
+        preds = logits.argmax(dim=1)  # computed once; reused for the predictions file and for ncorrect
+        if self.prediction_h is not None:
+            for i, (sample_id, pred) in enumerate(zip(sample['id'].tolist(), preds.tolist())):
+                if targets is not None:
+                    label = targets[i].item()
+                    print('{}\t{}\t{}'.format(sample_id, pred, label), file=self.prediction_h)
+                else:
+                    print('{}\t{}'.format(sample_id, pred), file=self.prediction_h)
+
+        logging_output = {
+            'loss': loss.data,
+            'ntokens': sample['ntokens'],
+            'nsentences': sample_size,
+            'sample_size': sample_size,
+        }
+        if targets is not None:
+            logging_output['ncorrect'] = (preds == targets).sum()
+
+        return loss, sample_size, logging_output
+
+    @staticmethod
+    def reduce_metrics(logging_outputs) -> None:
+        """Aggregate logging outputs from data parallel training."""
+        loss_sum = sum(log.get('loss', 0) for log in logging_outputs)
+        ntokens = sum(log.get('ntokens', 0) for log in logging_outputs)
+        nsentences = sum(log.get('nsentences', 0) for log in logging_outputs)
+        sample_size = sum(log.get('sample_size', 0) for log in logging_outputs)
+
+        metrics.log_scalar('loss', loss_sum / sample_size / math.log(2), sample_size, round=3)
+        if sample_size != ntokens:
+            metrics.log_scalar('nll_loss', loss_sum / ntokens / math.log(2), ntokens, round=3)
+
+        if len(logging_outputs) > 0 and 'ncorrect' in logging_outputs[0]:
+            ncorrect = sum(log.get('ncorrect', 0) for log in logging_outputs)
+            metrics.log_scalar('accuracy', 100.0 * ncorrect / nsentences, nsentences, round=1)
+
+    @staticmethod
+    def logging_outputs_can_be_summed() -> bool:
+        """
+        Whether the logging outputs returned by `forward` can be summed
+        across workers prior to calling `reduce_metrics`. Setting this
+        to True improves distributed training speed.
+        """
+        return True
diff --git a/fairseq/criterions/wav2vec_criterion.py b/fairseq/criterions/wav2vec_criterion.py
new file mode 100644
index 0000000000000000000000000000000000000000..019db622496cfb4a81170d12157db47a002912fe
--- /dev/null
+++ b/fairseq/criterions/wav2vec_criterion.py
@@ -0,0 +1,157 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+
+import torch
+import torch.nn.functional as F
+
+from fairseq import metrics, utils
+from fairseq.criterions import FairseqCriterion, register_criterion
+
+
+@register_criterion('wav2vec')
+class Wav2vecCriterion(FairseqCriterion):
+    """Cross entropy (`infonce`) or binary cross entropy loss, plus optional weighted extra losses."""
+
+    def __init__(self, task, infonce=False, loss_weights=None, log_keys=None):
+        super().__init__(task)
+        self.infonce = infonce
+        self.loss_weights = None if loss_weights is None else eval(loss_weights)  # NOTE(review): eval() of a CLI string
+        self.log_keys = [] if log_keys is None else eval(log_keys)  # NOTE(review): eval() of a CLI string
+
+    @staticmethod
+    def add_args(parser):
+        """Add criterion-specific arguments to the parser."""
+        # fmt: off
+        parser.add_argument('--infonce', action='store_true',
+                            help='if set, uses cross entropy instead of binary cross entropy (i.e. InfoNCE loss)')
+        parser.add_argument('--loss-weights', type=str, default=None,
+                            help='weights for additional loss terms (not first one)')
+        parser.add_argument('--log-keys', type=str, default=None,
+                            help='output keys to log')
+
+    def forward(self, model, sample, reduce=True, log_pred=False):
+        """Compute the loss for the given sample.
+
+        Returns a tuple with three elements:
+        1) the loss
+        2) the sample size, which is used as the denominator for the gradient
+        3) logging outputs to display while training
+        """
+        net_output = model(**sample['net_input'])
+        logits = model.get_logits(net_output).float()
+        target = model.get_targets(sample, net_output)
+
+        weights = None
+        if hasattr(model, 'get_target_weights') and not self.infonce:
+            weights = model.get_target_weights(target, net_output)
+            if torch.is_tensor(weights):
+                weights = weights.float()
+
+        losses = []
+        if self.infonce:
+            loss = F.cross_entropy(logits, target, reduction="sum" if reduce else "none",)
+        else:
+            loss = F.binary_cross_entropy_with_logits(logits, target.float(), weights, reduction="sum" if reduce else "none",)
+
+        sample_size = target.numel() if self.infonce else target.long().sum().item()
+        # Snapshot the base loss: `loss += p` below is in-place and would otherwise also change losses[0].
+        losses.append(loss.detach().clone())
+
+        if self.loss_weights is not None:
+            assert hasattr(model, "get_extra_losses")
+            extra_losses = model.get_extra_losses(net_output)
+            if torch.is_tensor(extra_losses):
+                extra_losses = [extra_losses]
+            if len(self.loss_weights) == 1 and len(extra_losses) != 1:  # reuse a single weight for every extra loss
+                self.loss_weights = [self.loss_weights[0]] * len(extra_losses)
+            assert len(extra_losses) == len(self.loss_weights), f'{len(extra_losses)}, {len(self.loss_weights)}'
+            for p, coef in zip(extra_losses, self.loss_weights):
+                if coef != 0 and p is not None:
+                    p = coef * p.float() * sample_size
+                    loss += p
+                    losses.append(p)
+
+        logging_output = {
+            'loss': loss.item() if reduce else loss,
+            'ntokens': sample_size,
+            'nsentences': sample['id'].numel(),
+            'sample_size': sample_size,
+        }
+
+        for lk in self.log_keys:
+            if lk in net_output:
+                logging_output[lk] = float((net_output[lk]))
+
+        if len(losses) > 1:  # per-term breakdown: loss_0 is the base loss, loss_1.. the weighted extras
+            for i, term in enumerate(losses):
+                logging_output[f'loss_{i}'] = term.item()
+
+        if self.infonce:
+            with torch.no_grad():
+                if logits.numel() == 0:
+                    corr = 0
+                    count = 0
+                else:
+                    assert logits.dim() > 1, logits.shape
+                    is_max = logits.argmax(-1) == 0
+                    is_min = logits.argmin(-1) == 0
+                    both = is_max & is_min  # index 0 is both argmax and argmin: not counted as correct
+                    corr = is_max.long().sum().item() - both.long().sum().item()
+                    count = is_max.numel()
+
+                logging_output["correct"] = corr
+                logging_output["count"] = count
+
+        if log_pred:
+            logging_output['logits'] = logits.cpu().numpy()
+            logging_output['target'] = target.cpu().numpy()
+        return loss, sample_size, logging_output
+
+    @staticmethod
+    def reduce_metrics(logging_outputs) -> None:
+        """Aggregate logging outputs from data parallel training."""
+        loss_sum = utils.item(sum(log.get('loss', 0) for log in logging_outputs))
+        ntokens = utils.item(sum(log.get('ntokens', 0) for log in logging_outputs))
+        nsentences = utils.item(sum(log.get('nsentences', 0) for log in logging_outputs))
+        sample_size = utils.item(sum(log.get('sample_size', 0) for log in logging_outputs))
+
+        metrics.log_scalar('loss', loss_sum / sample_size / math.log(2), sample_size, round=3)
+        metrics.log_scalar('ntokens', ntokens)
+        metrics.log_scalar('nsentences', nsentences)
+
+        correct = sum(log.get("correct", 0) for log in logging_outputs)
+        metrics.log_scalar("_correct", correct)
+
+        total = sum(log.get("count", 0) for log in logging_outputs)
+        metrics.log_scalar("_total", total)
+
+        if total > 0:
+            metrics.log_derived(
+                "accuracy",
+                lambda meters: round(meters["_correct"].sum / meters["_total"].sum, 5)
+                if meters["_total"].sum > 0
+                else float("nan"),
+            )
+
+        builtin_keys = {'loss', 'ntokens', 'nsentences', 'sample_size', 'correct', 'count'}
+
+        for k in logging_outputs[0]:  # remaining keys are averaged over the logging outputs
+            if k not in builtin_keys:
+                val = sum(log.get(k, 0) for log in logging_outputs) / len(logging_outputs)
+                if k.startswith('loss'):
+                    metrics.log_scalar(k, val / sample_size / math.log(2), sample_size)
+                else:
+                    metrics.log_scalar(k, val, round=3)
+
+    @staticmethod
+    def logging_outputs_can_be_summed() -> bool:
+        """
+        Whether the logging outputs returned by `forward` can be summed
+        across workers prior to calling `reduce_metrics`. Setting this
+        to True improves distributed training speed.
+        """
+        return False
diff --git a/fairseq/data/__init__.py b/fairseq/data/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a99d9280fa1ecab0d8ba86983e0ca72dd012538e
--- /dev/null
+++ b/fairseq/data/__init__.py
@@ -0,0 +1,116 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .dictionary import Dictionary, TruncatedDictionary
+
+from .fairseq_dataset import FairseqDataset, FairseqIterableDataset
+
+from .base_wrapper_dataset import BaseWrapperDataset
+
+from .add_target_dataset import AddTargetDataset
+from .append_token_dataset import AppendTokenDataset
+from .audio.raw_audio_dataset import FileAudioDataset
+from .backtranslation_dataset import BacktranslationDataset
+from .bucket_pad_length_dataset import BucketPadLengthDataset
+from .colorize_dataset import ColorizeDataset
+from .concat_dataset import ConcatDataset
+from .concat_sentences_dataset import ConcatSentencesDataset
+from .denoising_dataset import DenoisingDataset
+from .id_dataset import IdDataset
+from .indexed_dataset import IndexedCachedDataset, IndexedDataset, IndexedRawTextDataset, MMapIndexedDataset
+from .language_pair_dataset import LanguagePairDataset
+from .list_dataset import ListDataset
+from .lm_context_window_dataset import LMContextWindowDataset
+from .lru_cache_dataset import LRUCacheDataset
+from .mask_tokens_dataset import MaskTokensDataset
+from .monolingual_dataset import MonolingualDataset
+from .multi_corpus_sampled_dataset import MultiCorpusSampledDataset
+from .nested_dictionary_dataset import NestedDictionaryDataset
+from .noising import NoisingDataset
+from .numel_dataset import NumelDataset
+from .num_samples_dataset import NumSamplesDataset
+from .offset_tokens_dataset import OffsetTokensDataset
+from .pad_dataset import LeftPadDataset, PadDataset, RightPadDataset
+from .prepend_dataset import PrependDataset
+from .prepend_token_dataset import PrependTokenDataset
+from .raw_label_dataset import RawLabelDataset
+from .replace_dataset import ReplaceDataset
+from .resampling_dataset import ResamplingDataset
+from .roll_dataset import RollDataset
+from .round_robin_zip_datasets import RoundRobinZipDatasets
+from .sort_dataset import SortDataset
+from .strip_token_dataset import StripTokenDataset
+from .subsample_dataset import SubsampleDataset
+from .token_block_dataset import TokenBlockDataset
+from .transform_eos_dataset import TransformEosDataset
+from .transform_eos_lang_pair_dataset import TransformEosLangPairDataset
+from .shorten_dataset import TruncateDataset, RandomCropDataset
+from .multilingual.sampled_multi_dataset import SampledMultiDataset
+from .multilingual.sampled_multi_epoch_dataset import SampledMultiEpochDataset
+from .iterators import (
+    CountingIterator,
+    EpochBatchIterator,
+    GroupedIterator,
+    ShardedIterator,
+)
+
+# Public API of the package: every name imported above must be listed here.
+__all__ = [
+    'AddTargetDataset',
+    'AppendTokenDataset',
+    'BacktranslationDataset',
+    'BaseWrapperDataset',
+    'BucketPadLengthDataset',
+    'ColorizeDataset',
+    'ConcatDataset',
+    'ConcatSentencesDataset',
+    'CountingIterator',
+    'DenoisingDataset',
+    'Dictionary',
+    'EpochBatchIterator',
+    'FairseqDataset',
+    'FairseqIterableDataset',
+    'GroupedIterator',
+    'IdDataset',
+    'IndexedCachedDataset',
+    'IndexedDataset',
+    'IndexedRawTextDataset',
+    'LanguagePairDataset',
+    'LeftPadDataset',
+    'ListDataset',
+    'LMContextWindowDataset',
+    'LRUCacheDataset',
+    'MaskTokensDataset',
+    'MMapIndexedDataset',
+    'MonolingualDataset',
+    'MultiCorpusSampledDataset',
+    'NestedDictionaryDataset',
+    'NoisingDataset',
+    'NumelDataset',
+    'NumSamplesDataset',
+    'OffsetTokensDataset',
+    'PadDataset',
+    'PrependDataset',
+    'PrependTokenDataset',
+    'ReplaceDataset',
+    'RollDataset',
+    'FileAudioDataset',
+    'RandomCropDataset',
+    'RawLabelDataset',
+    'ResamplingDataset',
+    'RightPadDataset',
+    'RoundRobinZipDatasets',
+    'SampledMultiDataset',
+    'SampledMultiEpochDataset',
+    'ShardedIterator',
+    'SortDataset',
+    'StripTokenDataset',
+    'SubsampleDataset',
+    'TokenBlockDataset',
+    'TransformEosDataset',
+    'TransformEosLangPairDataset',
+    'TruncateDataset',
+    'TruncatedDictionary',
+]
diff --git a/fairseq/data/__pycache__/__init__.cpython-310.pyc b/fairseq/data/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ad41218b144dd8376b168030ed6977aa72f6ffb1
Binary files /dev/null and b/fairseq/data/__pycache__/__init__.cpython-310.pyc differ
diff --git a/fairseq/data/__pycache__/add_target_dataset.cpython-310.pyc b/fairseq/data/__pycache__/add_target_dataset.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..835a27526cd7e56cd0550e4743bd6f937f9acc1a
Binary files /dev/null and b/fairseq/data/__pycache__/add_target_dataset.cpython-310.pyc differ
diff --git a/fairseq/data/__pycache__/append_token_dataset.cpython-310.pyc b/fairseq/data/__pycache__/append_token_dataset.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c33c9beaf9e0251846233d56197cd231630e0ff1
Binary files /dev/null and b/fairseq/data/__pycache__/append_token_dataset.cpython-310.pyc differ
diff --git a/fairseq/data/__pycache__/backtranslation_dataset.cpython-310.pyc b/fairseq/data/__pycache__/backtranslation_dataset.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..297488a799d60ce75a7ca9740359a2e1dff93a68
Binary files /dev/null and b/fairseq/data/__pycache__/backtranslation_dataset.cpython-310.pyc differ
diff --git a/fairseq/data/__pycache__/base_wrapper_dataset.cpython-310.pyc b/fairseq/data/__pycache__/base_wrapper_dataset.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..86578ea8c97e281c19e2d7dd259228e65633d12d
Binary files /dev/null and b/fairseq/data/__pycache__/base_wrapper_dataset.cpython-310.pyc differ
diff --git a/fairseq/data/__pycache__/bucket_pad_length_dataset.cpython-310.pyc b/fairseq/data/__pycache__/bucket_pad_length_dataset.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2e2ed3cc85c32a37b1e641e4bf6a347d32438af3
Binary files /dev/null and b/fairseq/data/__pycache__/bucket_pad_length_dataset.cpython-310.pyc differ
diff --git a/fairseq/data/__pycache__/colorize_dataset.cpython-310.pyc b/fairseq/data/__pycache__/colorize_dataset.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..04b98140abf808a6c1f48f74fc34ef01fe2f8376
Binary files /dev/null and b/fairseq/data/__pycache__/colorize_dataset.cpython-310.pyc differ
diff --git a/fairseq/data/__pycache__/concat_dataset.cpython-310.pyc b/fairseq/data/__pycache__/concat_dataset.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..346095dac46e74f06dd4b7d4f8317cf35b0154df
Binary files /dev/null and b/fairseq/data/__pycache__/concat_dataset.cpython-310.pyc differ
diff --git a/fairseq/data/__pycache__/concat_sentences_dataset.cpython-310.pyc b/fairseq/data/__pycache__/concat_sentences_dataset.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..02bae56218104c724543c895da6d5401b3ddac9f
Binary files /dev/null and b/fairseq/data/__pycache__/concat_sentences_dataset.cpython-310.pyc differ
diff --git a/fairseq/data/__pycache__/data_utils.cpython-310.pyc b/fairseq/data/__pycache__/data_utils.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c2d9405f8ca3da8471be892c0cd925afc4d4d2cf
Binary files /dev/null and b/fairseq/data/__pycache__/data_utils.cpython-310.pyc differ
diff --git a/fairseq/data/__pycache__/denoising_dataset.cpython-310.pyc b/fairseq/data/__pycache__/denoising_dataset.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8a4226384d4a48dee3be21b059f1fc7ccb4895ff
Binary files /dev/null and b/fairseq/data/__pycache__/denoising_dataset.cpython-310.pyc differ
diff --git a/fairseq/data/__pycache__/dictionary.cpython-310.pyc b/fairseq/data/__pycache__/dictionary.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8ea1907f215670eeb17f87dd579d6c0f317fb226
Binary files /dev/null and b/fairseq/data/__pycache__/dictionary.cpython-310.pyc differ
diff --git a/fairseq/data/__pycache__/fairseq_dataset.cpython-310.pyc b/fairseq/data/__pycache__/fairseq_dataset.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..127d32b6edde81c67fa2c04c305ffb129a1b5d8e
Binary files /dev/null and b/fairseq/data/__pycache__/fairseq_dataset.cpython-310.pyc differ
diff --git a/fairseq/data/__pycache__/id_dataset.cpython-310.pyc b/fairseq/data/__pycache__/id_dataset.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4d18d3bb4cfff3271f134e0af637b3a42e915fa4
Binary files /dev/null and b/fairseq/data/__pycache__/id_dataset.cpython-310.pyc differ
diff --git a/fairseq/data/__pycache__/indexed_dataset.cpython-310.pyc b/fairseq/data/__pycache__/indexed_dataset.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..149845d065d6184dea2fad6a50eb680e63fb59f2
Binary files /dev/null and b/fairseq/data/__pycache__/indexed_dataset.cpython-310.pyc differ
diff --git a/fairseq/data/__pycache__/iterators.cpython-310.pyc b/fairseq/data/__pycache__/iterators.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..06e3a9998a6a52c1d81b6636e7f0fe168db25202
Binary files /dev/null and b/fairseq/data/__pycache__/iterators.cpython-310.pyc differ
diff --git a/fairseq/data/__pycache__/language_pair_dataset.cpython-310.pyc b/fairseq/data/__pycache__/language_pair_dataset.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b8083e77e134b7bb8edefe9fd3320f04db92cf6f
Binary files /dev/null and b/fairseq/data/__pycache__/language_pair_dataset.cpython-310.pyc differ
diff --git a/fairseq/data/__pycache__/list_dataset.cpython-310.pyc b/fairseq/data/__pycache__/list_dataset.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..271f43896a8ed9cbe067505d25d7e212f42eff8d
Binary files /dev/null and b/fairseq/data/__pycache__/list_dataset.cpython-310.pyc differ
diff --git a/fairseq/data/__pycache__/lm_context_window_dataset.cpython-310.pyc b/fairseq/data/__pycache__/lm_context_window_dataset.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1a3443d7ecf00d157e25790759087bf61813729c
Binary files /dev/null and b/fairseq/data/__pycache__/lm_context_window_dataset.cpython-310.pyc differ
diff --git a/fairseq/data/__pycache__/lru_cache_dataset.cpython-310.pyc b/fairseq/data/__pycache__/lru_cache_dataset.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3e6ec08a167582523bcb25959fde6eaae15d2081
Binary files /dev/null and b/fairseq/data/__pycache__/lru_cache_dataset.cpython-310.pyc differ
diff --git a/fairseq/data/__pycache__/mask_tokens_dataset.cpython-310.pyc b/fairseq/data/__pycache__/mask_tokens_dataset.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..48280876daa2a7505f86512a6586634732b369bf
Binary files /dev/null and b/fairseq/data/__pycache__/mask_tokens_dataset.cpython-310.pyc differ
diff --git a/fairseq/data/__pycache__/monolingual_dataset.cpython-310.pyc b/fairseq/data/__pycache__/monolingual_dataset.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cde3ecbca18bff7283a27d71c3ad55ba7943d06c
Binary files /dev/null and b/fairseq/data/__pycache__/monolingual_dataset.cpython-310.pyc differ
diff --git a/fairseq/data/__pycache__/multi_corpus_sampled_dataset.cpython-310.pyc b/fairseq/data/__pycache__/multi_corpus_sampled_dataset.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..757269577c5f974b0d5e17689dc3dcf23f05d52c
Binary files /dev/null and b/fairseq/data/__pycache__/multi_corpus_sampled_dataset.cpython-310.pyc differ
diff --git a/fairseq/data/__pycache__/nested_dictionary_dataset.cpython-310.pyc b/fairseq/data/__pycache__/nested_dictionary_dataset.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6e68c21e44e62f9c0fe0e5dd7274d6ca40e80763
Binary files /dev/null and b/fairseq/data/__pycache__/nested_dictionary_dataset.cpython-310.pyc differ
diff --git a/fairseq/data/__pycache__/noising.cpython-310.pyc b/fairseq/data/__pycache__/noising.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fe1107260d39856e578b3d2ecfd20314f988a1b2
Binary files /dev/null and b/fairseq/data/__pycache__/noising.cpython-310.pyc differ
diff --git a/fairseq/data/__pycache__/num_samples_dataset.cpython-310.pyc b/fairseq/data/__pycache__/num_samples_dataset.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6ccba5e403e5a73e1b8c36f86efa228260cb8bea
Binary files /dev/null and b/fairseq/data/__pycache__/num_samples_dataset.cpython-310.pyc differ
diff --git a/fairseq/data/__pycache__/numel_dataset.cpython-310.pyc b/fairseq/data/__pycache__/numel_dataset.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7be8cf04ec1a091d56b0878dc3b680e5b370cc5c
Binary files /dev/null and b/fairseq/data/__pycache__/numel_dataset.cpython-310.pyc differ
diff --git a/fairseq/data/__pycache__/offset_tokens_dataset.cpython-310.pyc b/fairseq/data/__pycache__/offset_tokens_dataset.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bc2d683fd0b205a4e8c5605944b517c87a76d317
Binary files /dev/null and b/fairseq/data/__pycache__/offset_tokens_dataset.cpython-310.pyc differ
diff --git a/fairseq/data/__pycache__/pad_dataset.cpython-310.pyc b/fairseq/data/__pycache__/pad_dataset.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8630d10e68c99d2d21040dce659d5974cf4be4b0
Binary files /dev/null and b/fairseq/data/__pycache__/pad_dataset.cpython-310.pyc differ
diff --git a/fairseq/data/__pycache__/plasma_utils.cpython-310.pyc b/fairseq/data/__pycache__/plasma_utils.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d192e6c349ac6de2d0f828668a2f6415019f26f4
Binary files /dev/null and b/fairseq/data/__pycache__/plasma_utils.cpython-310.pyc differ
diff --git a/fairseq/data/__pycache__/prepend_dataset.cpython-310.pyc b/fairseq/data/__pycache__/prepend_dataset.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1f9bd78e89f268efba6fe29cd88a97662733c305
Binary files /dev/null and b/fairseq/data/__pycache__/prepend_dataset.cpython-310.pyc differ
diff --git a/fairseq/data/__pycache__/prepend_token_dataset.cpython-310.pyc b/fairseq/data/__pycache__/prepend_token_dataset.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b5014bffbf45032e3771519a02152b6b4f7863b0
Binary files /dev/null and b/fairseq/data/__pycache__/prepend_token_dataset.cpython-310.pyc differ
diff --git a/fairseq/data/__pycache__/raw_label_dataset.cpython-310.pyc b/fairseq/data/__pycache__/raw_label_dataset.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4899af9aca8155611df0a38c6cd57b739df2ab04
Binary files /dev/null and b/fairseq/data/__pycache__/raw_label_dataset.cpython-310.pyc differ
diff --git a/fairseq/data/__pycache__/replace_dataset.cpython-310.pyc b/fairseq/data/__pycache__/replace_dataset.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4de08000e5eeb458ae1d44084ba9183cd6f9eb4c
Binary files /dev/null and b/fairseq/data/__pycache__/replace_dataset.cpython-310.pyc differ
diff --git a/fairseq/data/__pycache__/resampling_dataset.cpython-310.pyc b/fairseq/data/__pycache__/resampling_dataset.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..13e0318d453f6d07809fb45acbfc383199b23d27
Binary files /dev/null and b/fairseq/data/__pycache__/resampling_dataset.cpython-310.pyc differ
diff --git a/fairseq/data/__pycache__/roll_dataset.cpython-310.pyc b/fairseq/data/__pycache__/roll_dataset.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0069e2fc4fdbb99f0e60bba5050eb3d1a005bee5
Binary files /dev/null and b/fairseq/data/__pycache__/roll_dataset.cpython-310.pyc differ
diff --git a/fairseq/data/__pycache__/round_robin_zip_datasets.cpython-310.pyc b/fairseq/data/__pycache__/round_robin_zip_datasets.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5174abd34829c05d48745f7737f8ff3c6f35cd07
Binary files /dev/null and b/fairseq/data/__pycache__/round_robin_zip_datasets.cpython-310.pyc differ
diff --git a/fairseq/data/__pycache__/shorten_dataset.cpython-310.pyc b/fairseq/data/__pycache__/shorten_dataset.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5e0f2347001583942f1a46daf2a339a831597f1e
Binary files /dev/null and b/fairseq/data/__pycache__/shorten_dataset.cpython-310.pyc differ
diff --git a/fairseq/data/__pycache__/sort_dataset.cpython-310.pyc b/fairseq/data/__pycache__/sort_dataset.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..25db7b3057f8f51e18b4b3566c8f18ba8efed15f
Binary files /dev/null and b/fairseq/data/__pycache__/sort_dataset.cpython-310.pyc differ
diff --git a/fairseq/data/__pycache__/strip_token_dataset.cpython-310.pyc b/fairseq/data/__pycache__/strip_token_dataset.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a2a991e11b6751521f5be59104663e574f61c34f
Binary files /dev/null and b/fairseq/data/__pycache__/strip_token_dataset.cpython-310.pyc differ
diff --git a/fairseq/data/__pycache__/subsample_dataset.cpython-310.pyc b/fairseq/data/__pycache__/subsample_dataset.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0cc376017fbe7a7d29d2c4cb243fb16da78dc757
Binary files /dev/null and b/fairseq/data/__pycache__/subsample_dataset.cpython-310.pyc differ
diff --git a/fairseq/data/__pycache__/token_block_dataset.cpython-310.pyc b/fairseq/data/__pycache__/token_block_dataset.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..913ff9df16bfb459a080a4580d24497dcedc8a2a
Binary files /dev/null and b/fairseq/data/__pycache__/token_block_dataset.cpython-310.pyc differ
diff --git a/fairseq/data/__pycache__/transform_eos_dataset.cpython-310.pyc b/fairseq/data/__pycache__/transform_eos_dataset.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0fc937393158c870601acf92d3e97f92911ee0bf
Binary files /dev/null and b/fairseq/data/__pycache__/transform_eos_dataset.cpython-310.pyc differ
diff --git a/fairseq/data/__pycache__/transform_eos_lang_pair_dataset.cpython-310.pyc b/fairseq/data/__pycache__/transform_eos_lang_pair_dataset.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..491433270eef64b146e07f01bd95fc713755f9a5
Binary files /dev/null and b/fairseq/data/__pycache__/transform_eos_lang_pair_dataset.cpython-310.pyc differ
diff --git a/fairseq/data/add_target_dataset.py b/fairseq/data/add_target_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..3a42dd7a2ef29f1fad139c79509923e684bfa9ad
--- /dev/null
+++ b/fairseq/data/add_target_dataset.py
@@ -0,0 +1,56 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+
+from . import BaseWrapperDataset
+from . import data_utils
+
+
+class AddTargetDataset(BaseWrapperDataset):
+    """Wrap *dataset* so each item and each collated batch also carries a label."""
+
+    def __init__(self, dataset, labels, pad, eos, batch_targets, process_label=None, add_to_input=False):
+        super().__init__(dataset)
+        self.labels, self.process_label = labels, process_label
+        self.pad, self.eos = pad, eos
+        self.batch_targets, self.add_to_input = batch_targets, add_to_input
+
+    def get_label(self, index):
+        """Return ``labels[index]``, run through ``process_label`` when one was given."""
+        raw = self.labels[index]
+        return raw if self.process_label is None else self.process_label(raw)
+
+    def __getitem__(self, index):
+        sample = self.dataset[index]
+        sample["label"] = self.get_label(index)
+        return sample
+
+    def size(self, index):
+        """Return ``(wrapped dataset size, label length)`` for the example at *index*."""
+        return (self.dataset.size(index), len(self.get_label(index)))
+
+    def collater(self, samples):
+        """Collate with the wrapped dataset, then attach ``target`` and ``ntokens``."""
+        batch = self.dataset.collater(samples)
+        if len(batch) == 0:
+            return batch
+        kept_ids = set(batch["id"].tolist())  # ids present in the wrapped collater's batch
+        target = [s["label"] for s in samples if s["id"] in kept_ids]
+        lengths = [len(t) for t in target]
+        if self.batch_targets:
+            batch["target_lengths"] = torch.LongTensor(lengths)
+            target = data_utils.collate_tokens(target, pad_idx=self.pad, left_pad=False)
+            batch["ntokens"] = batch["target_lengths"].sum().item()
+        else:
+            batch["ntokens"] = sum(lengths)
+        batch["target"] = target
+
+        if self.add_to_input:
+            eos = target.new_full((target.size(0), 1), self.eos)
+            batch["target"] = torch.cat([target, eos], dim=-1).long()
+            batch["net_input"]["prev_output_tokens"] = torch.cat([eos, target], dim=-1).long()
+            batch["ntokens"] += target.size(0)
+        return batch
\ No newline at end of file
diff --git a/fairseq/data/append_token_dataset.py b/fairseq/data/append_token_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..7298129f62a0a61fad34045f44154ba6f2d7b864
--- /dev/null
+++ b/fairseq/data/append_token_dataset.py
@@ -0,0 +1,42 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+import torch
+
+from . import BaseWrapperDataset
+
+
+class AppendTokenDataset(BaseWrapperDataset):
+    """Wrapper that appends *token* to every item; a pass-through when *token* is None."""
+
+    def __init__(self, dataset, token=None):
+        super().__init__(dataset)
+        self.token = token
+        # Each example grows by one element once a token is appended.
+        self._sizes = dataset.sizes if token is None else np.array(dataset.sizes) + 1
+
+    def __getitem__(self, idx):
+        tokens = self.dataset[idx]
+        if self.token is None:
+            return tokens
+        return torch.cat([tokens, tokens.new([self.token])])
+
+    @property
+    def sizes(self):
+        """Per-example sizes; one larger than the wrapped dataset's when a token is set."""
+        return self._sizes
+
+    def num_tokens(self, index):
+        count = self.dataset.num_tokens(index)
+        if self.token is not None:
+            count += 1  # account for the appended token
+        return count
+
+    def size(self, index):
+        length = self.dataset.size(index)
+        if self.token is not None:
+            length += 1  # account for the appended token
+        return length
diff --git a/fairseq/data/audio/__init__.py b/fairseq/data/audio/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/fairseq/data/audio/__pycache__/__init__.cpython-310.pyc b/fairseq/data/audio/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..19a86c04da258724d7f56130c5a553498627d0a8
Binary files /dev/null and b/fairseq/data/audio/__pycache__/__init__.cpython-310.pyc differ
diff --git a/fairseq/data/audio/__pycache__/raw_audio_dataset.cpython-310.pyc b/fairseq/data/audio/__pycache__/raw_audio_dataset.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2a798bbc8bb039d5ee9ae4ba9f4c15f1c68de966
Binary files /dev/null and b/fairseq/data/audio/__pycache__/raw_audio_dataset.cpython-310.pyc differ
diff --git a/fairseq/data/audio/raw_audio_dataset.py b/fairseq/data/audio/raw_audio_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..675b09564776e28f2bfaf1a81a0f915d7c34ae7c
--- /dev/null
+++ b/fairseq/data/audio/raw_audio_dataset.py
@@ -0,0 +1,181 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+import os
+import logging
+import numpy as np
+import sys
+
+import torch
+import torch.nn.functional as F
+
+from .. import FairseqDataset
+
+logger = logging.getLogger(__name__)
+
+
+class RawAudioDataset(FairseqDataset):
+    """Base for waveform datasets; subclasses fill ``sizes`` and define ``__getitem__``."""
+
+    def __init__(
+        self,
+        sample_rate,
+        max_sample_size=None,
+        min_sample_size=None,
+        shuffle=True,
+        min_length=0,
+        pad=False,
+        normalize=False,
+    ):
+        super().__init__()
+
+        self.sample_rate = sample_rate
+        self.sizes = []
+        # A missing cap is stored as sys.maxsize so later min() calls need no None check.
+        self.max_sample_size = sys.maxsize if max_sample_size is None else max_sample_size
+        self.min_sample_size = min_sample_size
+        self.min_length = min_length
+        self.pad = pad
+        self.shuffle = shuffle
+        self.normalize = normalize
+
+    def __getitem__(self, index):
+        """Subclasses must return a dict with ``id`` and ``source`` (see ``collater``)."""
+        raise NotImplementedError()
+
+    def __len__(self):
+        return len(self.sizes)
+
+    def postprocess(self, feats, curr_sample_rate):
+        """Average 2-D input over its last dim, check the sample rate, optionally layer-norm."""
+        if feats.dim() == 2:
+            feats = feats.mean(-1)
+
+        if curr_sample_rate != self.sample_rate:
+            raise Exception(f"sample rate: {curr_sample_rate}, need {self.sample_rate}")
+
+        assert feats.dim() == 1, feats.dim()
+
+        if not self.normalize:
+            return feats
+        with torch.no_grad():
+            return F.layer_norm(feats, feats.shape)
+
+    def crop_to_max_size(self, wav, target_size):
+        """Return a random ``target_size``-long slice of *wav*, or *wav* itself if not longer."""
+        excess = len(wav) - target_size
+        if excess <= 0:
+            return wav
+
+        begin = np.random.randint(0, excess + 1)
+        return wav[begin:begin + target_size]
+
+    def collater(self, samples):
+        """Batch the samples' ``source`` tensors at one common length.
+
+        Returns ``{}`` if no sample has a source, else a dict with ``id`` and ``net_input``.
+        """
+        samples = [s for s in samples if s["source"] is not None]
+        if len(samples) == 0:
+            return {}
+
+        sources = [s["source"] for s in samples]
+        sizes = [len(s) for s in sources]
+
+        # With padding the batch grows to the longest clip; otherwise it is cropped to the shortest.
+        pick = max if self.pad else min
+        target_size = min(pick(sizes), self.max_sample_size)
+
+        collated_sources = sources[0].new(len(sources), target_size)
+        padding_mask = None
+        if self.pad:
+            padding_mask = torch.BoolTensor(collated_sources.shape).fill_(False)
+
+        for row, (wav, length) in enumerate(zip(sources, sizes)):
+            shortfall = target_size - length
+            if shortfall == 0:
+                collated_sources[row] = wav
+            elif shortfall > 0:
+                # Zero-fill the tail and flag those positions in the mask.
+                assert self.pad
+                collated_sources[row] = torch.cat([wav, wav.new_full((shortfall,), 0.0)])
+                padding_mask[row, -shortfall:] = True
+            else:
+                collated_sources[row] = self.crop_to_max_size(wav, target_size)
+
+        net_input = {"source": collated_sources}
+        if self.pad:
+            net_input["padding_mask"] = padding_mask
+        ids = torch.LongTensor([s["id"] for s in samples])
+        return {"id": ids, "net_input": net_input}
+
+    def num_tokens(self, index):
+        """Same value as ``size(index)``."""
+        return self.size(index)
+
+    def size(self, index):
+        """Size of example *index*, capped at ``max_sample_size`` unless padding is on.
+        This value is used when filtering a dataset with ``--max-positions``."""
+        raw = self.sizes[index]
+        return raw if self.pad else min(raw, self.max_sample_size)
+
+    def ordered_indices(self):
+        """Return an ordered array of indices. Batches will be constructed based
+        on this order."""
+        if self.shuffle:
+            tiebreak = np.random.permutation(len(self))
+        else:
+            tiebreak = np.arange(len(self))
+        # np.lexsort uses the LAST key as primary, so this sorts by size; [::-1] puts largest first.
+        return np.lexsort([tiebreak, self.sizes])[::-1]
+
+
+class FileAudioDataset(RawAudioDataset):
+    """Audio dataset read from a manifest: root dir on line 1, then ``path<TAB>size`` rows."""
+
+    def __init__(
+        self,
+        manifest_path,
+        sample_rate,
+        max_sample_size=None,
+        min_sample_size=None,
+        shuffle=True,
+        min_length=0,
+        pad=False,
+        normalize=False,
+    ):
+        super().__init__(
+            sample_rate=sample_rate,
+            max_sample_size=max_sample_size,
+            min_sample_size=min_sample_size,
+            shuffle=shuffle,
+            min_length=min_length,
+            pad=pad,
+            normalize=normalize,
+        )
+
+        self.fnames = []
+        skipped = 0
+        with open(manifest_path, "r") as manifest:
+            self.root_dir = manifest.readline().strip()
+            for row in manifest:
+                fields = row.strip().split("\t")
+                assert len(fields) == 2, row
+                rel_path, num_samples = fields[0], int(fields[1])
+                if min_length is not None and num_samples < min_length:
+                    skipped += 1  # shorter than the requested minimum
+                else:
+                    self.fnames.append(rel_path)
+                    self.sizes.append(num_samples)
+        logger.info(f"loaded {len(self.fnames)}, skipped {skipped} samples")
+
+    def __getitem__(self, index):
+        import soundfile as sf  # local import: only needed once audio is actually read
+
+        path = os.path.join(self.root_dir, self.fnames[index])
+        wav, curr_sample_rate = sf.read(path)
+        feats = self.postprocess(torch.from_numpy(wav).float(), curr_sample_rate)
+        return {"id": index, "source": feats}
diff --git a/fairseq/data/backtranslation_dataset.py b/fairseq/data/backtranslation_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..0007a015064cdb025b39201b3ea1647caee04ec9
--- /dev/null
+++ b/fairseq/data/backtranslation_dataset.py
@@ -0,0 +1,165 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+
+from fairseq import utils
+
+from . import FairseqDataset
+
+
+def backtranslate_samples(samples, collate_fn, generate_fn, cuda=True):
+    """Turn monolingual samples into (backtranslation, original) training pairs.
+
+    For example, the input (*samples*)
+
+        [{'id': 1, 'source': 'hallo welt'}]
+
+    comes back as
+
+        [{'id': 1, 'source': 'hello world', 'target': 'hallo welt'}]
+
+    Args:
+        samples (List[dict]): samples to backtranslate; each one needs an
+            'id' and a 'source' key, and its 'source' is returned as the
+            'target' of the resulting pair.
+        collate_fn (callable): builds a mini-batch from *samples*
+        generate_fn (callable): produces hypotheses for a collated mini-batch
+        cuda (bool): move the mini-batch to the GPU first (default: ``True``)
+
+    Returns:
+        List[dict]: one dict per batch entry with 'id', 'source' and 'target'
+    """
+    batch = collate_fn(samples)
+    # only the copy handed to generate_fn is moved; ids are read from the original batch
+    hypotheses = generate_fn(utils.move_to_cuda(batch) if cuda else batch)
+
+    # look up each input sentence by sample id; it becomes the target side of the pair
+    originals = {}
+    for sample in samples:
+        originals[sample['id']] = sample['source']
+
+    pairs = []
+    for sample_id, hypos in zip(batch['id'], hypotheses):
+        key = sample_id.item()
+        # hypos[0] is taken as the backtranslation (assumed to be the best-scoring one)
+        pairs.append({'id': key, 'target': originals[key], 'source': hypos[0]['tokens'].cpu()})
+    return pairs
+
+
+class BacktranslationDataset(FairseqDataset):
+    """
+    Sets up a backtranslation dataset which takes a tgt batch, generates
+    a src using a tgt-src backtranslation function (*backtranslation_fn*),
+    and returns the corresponding `{generated src, input tgt}` batch.
+
+    Args:
+        tgt_dataset (~fairseq.data.FairseqDataset): the dataset to be
+            backtranslated. Only the source side of this dataset will be used.
+            After backtranslation, the source sentences in this dataset will be
+            returned as the targets.
+        src_dict (~fairseq.data.Dictionary): the dictionary of backtranslated
+            sentences.
+        tgt_dict (~fairseq.data.Dictionary, optional): the dictionary of
+            sentences to be backtranslated.
+        backtranslation_fn (callable, optional): function to call to generate
+            backtranslations. This is typically the `generate` method of a
+            :class:`~fairseq.sequence_generator.SequenceGenerator` object.
+            Pass in None when it is not available at initialization time, and
+            use set_backtranslation_fn function to set it when available.
+        output_collater (callable, optional): function to call on the
+            backtranslated samples to create the final batch
+            (default: ``tgt_dataset.collater``).
+        cuda (bool, optional): use GPU for generation; forced off when CUDA is unavailable
+    """
+
+    def __init__(
+        self,
+        tgt_dataset,
+        src_dict,
+        tgt_dict=None,
+        backtranslation_fn=None,
+        output_collater=None,
+        cuda=True,
+        **kwargs  # extra keyword arguments are accepted but not used in this constructor
+    ):
+        self.tgt_dataset = tgt_dataset
+        self.backtranslation_fn = backtranslation_fn
+        self.output_collater = output_collater if output_collater is not None \
+            else tgt_dataset.collater
+        self.cuda = cuda if torch.cuda.is_available() else False  # falls back to CPU without raising
+        self.src_dict = src_dict
+        self.tgt_dict = tgt_dict
+
+    def __getitem__(self, index):
+        """
+        Returns a single sample from *tgt_dataset*. Note that backtranslation is
+        not applied in this step; use :func:`collater` instead to backtranslate
+        a batch of samples.
+        """
+        return self.tgt_dataset[index]
+
+    def __len__(self):
+        return len(self.tgt_dataset)
+
+    def set_backtranslation_fn(self, backtranslation_fn):
+        self.backtranslation_fn = backtranslation_fn
+
+    def collater(self, samples):
+        """Merge and backtranslate a list of samples to form a mini-batch.
+
+        Using the samples from *tgt_dataset*, load a collated target sample to
+        feed to the backtranslation model. Then take the backtranslation with
+        the best score as the source and the original input as the target.
+
+        Note: we expect *tgt_dataset* to provide a function `collater()` that
+        will collate samples into the format expected by *backtranslation_fn*.
+        After backtranslation, we will feed the new list of samples (i.e., the
+        `(backtranslated source, original source)` pairs) to *output_collater*
+        and return the result.
+
+        Args:
+            samples (List[dict]): samples to backtranslate and collate
+
+        Returns:
+            dict: a mini-batch with keys coming from *output_collater*
+        """
+        if samples[0].get('is_dummy', False):  # samples flagged 'is_dummy' are returned as-is, uncollated
+            return samples
+        samples = backtranslate_samples(
+            samples=samples,
+            collate_fn=self.tgt_dataset.collater,
+            generate_fn=(
+                lambda net_input: self.backtranslation_fn(net_input)  # resolved at call time, so a later set_backtranslation_fn() is honoured
+            ),
+            cuda=self.cuda,
+        )
+        return self.output_collater(samples)
+
+    def num_tokens(self, index):
+        """Just use the tgt dataset num_tokens"""
+        return self.tgt_dataset.num_tokens(index)
+
+    def ordered_indices(self):
+        """Just use the tgt dataset ordered_indices"""
+        return self.tgt_dataset.ordered_indices()
+
+    def size(self, index):
+        """Return an example's size as a pair of equal values. This value is used
+        when filtering a dataset with ``--max-positions``.
+
+        Note: we use *tgt_dataset* to approximate the length of the source
+        sentence, since we do not know the actual length until after
+        backtranslation.
+        """
+        tgt_size = self.tgt_dataset.size(index)[0]  # first component of the wrapped dataset's size
+        return (tgt_size, tgt_size)
+
+    @property
+    def supports_prefetch(self):
+        return getattr(self.tgt_dataset, 'supports_prefetch', False)
+
+    def prefetch(self, indices):
+        return self.tgt_dataset.prefetch(indices)
diff --git a/fairseq/data/base_wrapper_dataset.py b/fairseq/data/base_wrapper_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..8b5326a6354641e81a2e6289e5f60e619dbadc90
--- /dev/null
+++ b/fairseq/data/base_wrapper_dataset.py
@@ -0,0 +1,69 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from torch.utils.data.dataloader import default_collate
+
+from . import FairseqDataset
+
+
+class BaseWrapperDataset(FairseqDataset):
+    """Wrap another dataset and forward every dataset operation to it."""
+    def __init__(self, dataset):
+        super().__init__()
+        self.dataset = dataset
+
+    def __getitem__(self, index):
+        return self.dataset[index]
+
+    def __len__(self):
+        return len(self.dataset)
+
+    def collater(self, samples):
+        if hasattr(self.dataset, 'collater'):  # prefer the wrapped dataset's own collater
+            return self.dataset.collater(samples)
+        else:
+            return default_collate(samples)  # otherwise use torch's default_collate
+
+    @property
+    def sizes(self):
+        return self.dataset.sizes
+
+    def num_tokens(self, index):
+        return self.dataset.num_tokens(index)
+
+    def size(self, index):
+        return self.dataset.size(index)
+
+    def ordered_indices(self):
+        return self.dataset.ordered_indices()
+
+    @property
+    def supports_prefetch(self):
+        return getattr(self.dataset, 'supports_prefetch', False)  # False when the wrapped dataset lacks the attribute
+
+    def prefetch(self, indices):
+        self.dataset.prefetch(indices)  # NOTE(review): the wrapped dataset's return value is not propagated
+
+    def get_batch_shapes(self):
+        return self.dataset.get_batch_shapes()
+
+    def batch_by_size(
+        self,
+        indices,
+        max_tokens=None,
+        max_sentences=None,
+        required_batch_size_multiple=1,
+    ):
+        return self.dataset.batch_by_size(
+            indices,
+            max_tokens=max_tokens,
+            max_sentences=max_sentences,
+            required_batch_size_multiple=required_batch_size_multiple,
+        )
+
+    def set_epoch(self, epoch):
+        super().set_epoch(epoch)
+        if hasattr(self.dataset, 'set_epoch'):  # forwarded only when the wrapped object implements it
+            self.dataset.set_epoch(epoch)
diff --git a/fairseq/data/bucket_pad_length_dataset.py b/fairseq/data/bucket_pad_length_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f53d011881b889f2fb22f0cccbca6cee1e309ac
--- /dev/null
+++ b/fairseq/data/bucket_pad_length_dataset.py
@@ -0,0 +1,77 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+import torch.nn.functional as F
+
+from fairseq.data import BaseWrapperDataset
+
+
+class BucketPadLengthDataset(BaseWrapperDataset):
+    """
+    Bucket and pad item lengths to the nearest bucket size. This can be used to
+    reduce the number of unique batch shapes, which is important on TPUs since
+    each new batch shape requires a recompilation.
+
+    Args:
+        dataset (FairseqDataset): dataset to bucket
+        sizes (List[int]): all item sizes
+        num_buckets (int): number of buckets to create
+        pad_idx (int): padding symbol
+        left_pad (bool): if True, pad on the left; otherwise right pad
+    """
+
+    def __init__(
+        self,
+        dataset,
+        sizes,
+        num_buckets,
+        pad_idx,
+        left_pad,
+    ):
+        super().__init__(dataset)
+        self.pad_idx = pad_idx
+        self.left_pad = left_pad
+
+        assert num_buckets > 0
+        self.buckets = np.unique(  # de-duplicated bucket boundaries taken from percentiles of *sizes*
+            np.percentile(
+                sizes,
+                np.linspace(0, 100, num_buckets + 1),
+                interpolation='lower',
+            )[1:]  # the 0th-percentile entry is dropped; the remaining values act as upper bounds
+        )
+
+        def get_bucketed_sizes(orig_sizes, buckets):  # replace each size by the boundary of the bucket it falls in
+            sizes = np.copy(orig_sizes)
+            assert np.min(sizes) >= 0
+            start_val = -1  # sizes are asserted non-negative, so -1 lies below every size
+            for end_val in buckets:
+                mask = (sizes > start_val) & (sizes <= end_val)  # sizes in the half-open range (start_val, end_val]
+                sizes[mask] = end_val
+                start_val = end_val
+            return sizes
+
+        self._bucketed_sizes = get_bucketed_sizes(sizes, self.buckets)
+
+    def __getitem__(self, index):
+        item = self.dataset[index]
+        bucket_size = self._bucketed_sizes[index]
+        num_pad = bucket_size - item.size(-1)  # padding needed along the last dimension
+        return F.pad(
+            item,
+            (num_pad if self.left_pad else 0, 0 if self.left_pad else num_pad),
+            value=self.pad_idx,
+        )
+
+    @property
+    def sizes(self):
+        return self._bucketed_sizes  # bucketed sizes, not the sizes passed to the constructor
+
+    def num_tokens(self, index):
+        return self._bucketed_sizes[index]
+
+    def size(self, index):
+        return self._bucketed_sizes[index]
diff --git a/fairseq/data/colorize_dataset.py b/fairseq/data/colorize_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..89e0e04142ff16aff931e0a0dc876558000db514
--- /dev/null
+++ b/fairseq/data/colorize_dataset.py
@@ -0,0 +1,24 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+
+from . import BaseWrapperDataset
+
+
+class ColorizeDataset(BaseWrapperDataset):
+    """Attach a 'colors' tensor, produced by *color_getter*, to each batch's net input for use by models."""
+    def __init__(self, dataset, color_getter):
+        super().__init__(dataset)
+        self.color_getter = color_getter
+
+    def collater(self, samples):
+        batch = super().collater(samples)
+        if len(batch) == 0:
+            # nothing was collated, so there is no net_input to extend
+            return batch
+        colors = [self.color_getter(self.dataset, sample["id"]) for sample in samples]
+        batch["net_input"]["colors"] = torch.tensor(colors, dtype=torch.long)
+        return batch
diff --git a/fairseq/data/concat_dataset.py b/fairseq/data/concat_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ca80631f0d1e9eeffc3ec2b6782b285feae29f4
--- /dev/null
+++ b/fairseq/data/concat_dataset.py
@@ -0,0 +1,105 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import bisect
+
+import numpy as np
+from torch.utils.data.dataloader import default_collate
+
+from . import FairseqDataset
+
+
+class ConcatDataset(FairseqDataset):  # several datasets exposed as one, each scaled by a sample ratio
+    @staticmethod
+    def cumsum(sequence, sample_ratios):  # running totals of int(ratio * len(dataset))
+        r, s = [], 0
+        for e, ratio in zip(sequence, sample_ratios):
+            curr_len = int(ratio * len(e))
+            r.append(curr_len + s)
+            s += curr_len
+        return r
+
+    def __init__(self, datasets, sample_ratios=1):
+        super(ConcatDataset, self).__init__()
+        assert len(datasets) > 0, "datasets should not be an empty iterable"
+        self.datasets = list(datasets)
+        if isinstance(sample_ratios, int):  # a single int is applied to every dataset
+            sample_ratios = [sample_ratios] * len(self.datasets)
+        self.sample_ratios = sample_ratios
+        self.cumulative_sizes = self.cumsum(self.datasets, sample_ratios)
+        self.real_sizes = [len(d) for d in self.datasets]  # lengths before any ratio is applied
+
+    def __len__(self):
+        return self.cumulative_sizes[-1]
+
+    def __getitem__(self, idx):
+        dataset_idx, sample_idx = self._get_dataset_and_sample_index(idx)
+        return self.datasets[dataset_idx][sample_idx]
+
+    def _get_dataset_and_sample_index(self, idx: int):
+        dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx)  # first dataset whose cumulative size exceeds idx
+        if dataset_idx == 0:
+            sample_idx = idx
+        else:
+            sample_idx = idx - self.cumulative_sizes[dataset_idx - 1]
+        sample_idx = sample_idx % self.real_sizes[dataset_idx]  # wrap around when the scaled length exceeds the real one
+        return dataset_idx, sample_idx
+
+    def collater(self, samples, **extra_args):
+        # For now only supports datasets with same underlying collater implementations
+        if hasattr(self.datasets[0], 'collater'):
+            return self.datasets[0].collater(samples, **extra_args)
+        else:
+            return default_collate(samples, **extra_args)
+
+    def size(self, idx: int):
+        """
+        Return an example's size as a float or tuple.
+        """
+        dataset_idx, sample_idx = self._get_dataset_and_sample_index(idx)
+        return self.datasets[dataset_idx].size(sample_idx)
+
+    def num_tokens(self, index: int):
+        return np.max(self.size(index))  # largest component when size() is a tuple
+
+    def attr(self, attr: str, index: int):
+        dataset_idx = bisect.bisect_right(self.cumulative_sizes, index)
+        return getattr(self.datasets[dataset_idx], attr, None)  # None when the owning dataset lacks the attribute
+
+    @property
+    def sizes(self):
+        _dataset_sizes = []
+        for ds, sr in zip(self.datasets, self.sample_ratios):
+            if isinstance(ds.sizes, np.ndarray):
+                _dataset_sizes.append(np.tile(ds.sizes, sr))  # NOTE(review): np.tile expects an integer repeat count — confirm ratios are ints
+            else:
+                # Only support underlying dataset with single size array.
+                assert isinstance(ds.sizes, list)
+                _dataset_sizes.append(np.tile(ds.sizes[0], sr))
+        return np.concatenate(_dataset_sizes)
+
+    @property
+    def supports_prefetch(self):
+        return all(d.supports_prefetch for d in self.datasets)
+
+    def ordered_indices(self):
+        """
+        Returns indices sorted by length. So less padding is needed.
+        """
+        return np.argsort(self.sizes)
+
+    def prefetch(self, indices):
+        frm = 0
+        for to, ds in zip(self.cumulative_sizes, self.datasets):
+            real_size = len(ds)
+            if getattr(ds, 'supports_prefetch', False):
+                ds.prefetch([(i - frm) % real_size for i in indices if frm <= i < to])  # global indices in [frm, to) mapped to local ones
+            frm = to
+
+    def set_epoch(self, epoch):
+        super().set_epoch(epoch)
+        for ds in self.datasets:
+            if hasattr(ds, 'set_epoch'):
+                ds.set_epoch(epoch)
diff --git a/fairseq/data/concat_sentences_dataset.py b/fairseq/data/concat_sentences_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..55445ee1c789bdacc4acaecd628903ced276a65c
--- /dev/null
+++ b/fairseq/data/concat_sentences_dataset.py
@@ -0,0 +1,56 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+
+from . import FairseqDataset
+
+
+class ConcatSentencesDataset(FairseqDataset):
+    """Concatenate the items found at the same index of several equally long datasets."""
+    def __init__(self, *datasets):
+        super().__init__()
+        self.datasets = datasets
+        assert all(len(ds) == len(datasets[0]) for ds in datasets), \
+            'datasets must have the same length'
+
+    def __getitem__(self, index):
+        return torch.cat([ds[index] for ds in self.datasets])  # items are joined in the order the datasets were given
+
+    def __len__(self):
+        return len(self.datasets[0])
+
+    def collater(self, samples):
+        return self.datasets[0].collater(samples)  # collation is delegated to the first dataset only
+
+    @property
+    def sizes(self):
+        return sum(ds.sizes for ds in self.datasets)  # presumably element-wise over size arrays — TODO confirm ds.sizes type
+
+    def num_tokens(self, index):
+        return sum(ds.num_tokens(index) for ds in self.datasets)
+
+    def size(self, index):
+        return sum(ds.size(index) for ds in self.datasets)
+
+    def ordered_indices(self):
+        return self.datasets[0].ordered_indices()  # ordering comes from the first dataset alone
+
+    @property
+    def supports_prefetch(self):
+        return any(
+            getattr(ds, 'supports_prefetch', False) for ds in self.datasets
+        )
+
+    def prefetch(self, indices):
+        for ds in self.datasets:
+            if getattr(ds, 'supports_prefetch', False):  # datasets without prefetch support are skipped
+                ds.prefetch(indices)
+
+    def set_epoch(self, epoch):
+        super().set_epoch(epoch)
+        for ds in self.datasets:
+            if hasattr(ds, 'set_epoch'):
+                ds.set_epoch(epoch)
diff --git a/fairseq/data/data_utils.py b/fairseq/data/data_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..70d8997b19a879fe520be94ed5475349bfe9c585
--- /dev/null
+++ b/fairseq/data/data_utils.py
@@ -0,0 +1,396 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+try:
+    from collections.abc import Iterable
+except ImportError:
+    from collections import Iterable
+import contextlib
+import itertools
+import logging
+import os
+
+from typing import Tuple, Optional
+
+import numpy as np
+import torch
+
+
+logger = logging.getLogger(__name__)
+
+
+def infer_language_pair(path):
+    """Infer language pair from a filename shaped like <split>.<lang1>-<lang2>.(...).idx"""
+    for entry in os.listdir(path):
+        fields = entry.split('.')
+        langs = fields[1].split('-') if len(fields) >= 3 else []
+        if len(langs) == 2:
+            return langs
+    return None, None
+
+
+def collate_tokens(values, pad_idx, eos_idx=None, left_pad=False, move_eos_to_beginning=False, pad_to_length=None):
+    """Stack a list of 1d tensors into one 2d tensor, filling unused slots with *pad_idx*."""
+    width = max(v.size(0) for v in values)
+    if pad_to_length is not None:
+        width = max(width, pad_to_length)
+    padded = values[0].new(len(values), width).fill_(pad_idx)
+
+    for row, tokens in enumerate(values):
+        n = len(tokens)
+        # left padding puts the tokens at the end of the row, right padding at the start
+        slot = padded[row][width - n:] if left_pad else padded[row][:n]
+        assert slot.numel() == tokens.numel()
+        if not move_eos_to_beginning:
+            slot.copy_(tokens)
+            continue
+        # shift right by one: position 0 gets eos_idx, or the sequence's
+        # own last token when no eos_idx is specified
+        slot[0] = tokens[-1] if eos_idx is None else eos_idx
+        slot[1:] = tokens[:-1]
+
+    return padded
+
+
+def load_indexed_dataset(path, dictionary=None, dataset_impl=None, combine=False, default='cached'):
+    """A helper function for loading indexed datasets.
+
+    Args:
+        path (str): path to indexed dataset (e.g., 'data-bin/train')
+        dictionary (~fairseq.data.Dictionary): data dictionary
+        dataset_impl (str, optional): which dataset implementation to use. If
+            not provided, it will be inferred automatically.
+        combine (bool, optional): automatically load and combine multiple
+            datasets. For example, if *path* is 'data-bin/train', then we will
+            combine 'data-bin/train', 'data-bin/train1', ... and return a
+            single ConcatDataset instance.
+        default (str, optional): implementation used when none is given or inferred, e.g. for legacy indexed data
+    """
+    from fairseq.data.concat_dataset import ConcatDataset
+    import fairseq.data.indexed_dataset as indexed_dataset
+
+    datasets = []
+    for k in itertools.count():
+        path_k = path + (str(k) if k > 0 else '')  # shard 0 has no numeric suffix; later shards append k
+
+        dataset_impl_k = dataset_impl
+        if dataset_impl_k is None:
+            dataset_impl_k = indexed_dataset.infer_dataset_impl(path_k)
+
+        dataset = indexed_dataset.make_dataset(
+            path_k,
+            impl=dataset_impl_k or default,
+            fix_lua_indexing=True,
+            dictionary=dictionary,
+        )
+        if dataset is None:  # treated as "no more shards": stop probing
+            break
+        logger.info('loaded {} examples from: {}'.format(len(dataset), path_k))
+        datasets.append(dataset)
+        if not combine:  # without combine only the first shard is loaded
+            break
+    if len(datasets) == 0:
+        return None
+    elif len(datasets) == 1:
+        return datasets[0]
+    else:
+        return ConcatDataset(datasets)
+
+
+@contextlib.contextmanager
+def numpy_seed(seed, *addl_seeds):
+    """Run the ``with`` body under a freshly seeded NumPy global PRNG, then put
+    the previous PRNG state back. A *seed* of None leaves the PRNG untouched."""
+    if seed is not None:
+        if addl_seeds:
+            seed = int(hash((seed, *addl_seeds)) % 1e6)
+        saved_state = np.random.get_state()
+        np.random.seed(seed)
+        try:
+            yield
+        finally:
+            np.random.set_state(saved_state)
+    else:
+        yield
+
+
+def collect_filtered(function, iterable, filtered):
+    """
+    Lazily yield the elements of *iterable* accepted by *function*, while
+    appending every rejected element to ``filtered`` (compare :func:`filter`).
+
+    Args:
+        function (callable): predicate; a falsy result rejects the element
+        iterable (iterable): elements to examine
+        filtered (list): receives the rejected elements, in iteration order
+    """
+    for item in iterable:
+        if not function(item):
+            filtered.append(item)
+            continue
+        yield item
+
+
+def _filter_by_size_dynamic(indices, size_fn, max_positions, raise_exception=False):  # NOTE(review): raise_exception is never read here
+    def compare_leq(a, b):  # a tuple-valued size is compared through its largest component
+        return a <= b if not isinstance(a, tuple) else max(a) <= b
+
+    def check_size(idx):  # True when the sample at idx fits within max_positions
+        if isinstance(max_positions, float) or isinstance(max_positions, int):
+            return size_fn(idx) <= max_positions
+        elif isinstance(max_positions, dict):
+            idx_size = size_fn(idx)
+            assert isinstance(idx_size, dict)
+            intersect_keys = set(max_positions.keys()) & set(idx_size.keys())  # keys missing on either side are not checked
+            return all(
+                all(a is None or b is None or a <= b
+                    for a, b in zip(idx_size[key], max_positions[key]))
+                for key in intersect_keys
+            )
+        else:
+            # Hacky as heck, for the specific case of multilingual training with RoundRobin.
+            if isinstance(size_fn(idx), dict) and isinstance(max_positions, tuple):
+                return all(
+                    a is None or b is None or compare_leq(a, b)
+                    for a, b in zip(size_fn(idx).values(), max_positions)
+                )
+            # For MultiCorpusSampledDataset, will generalize it later
+            if not isinstance(size_fn(idx), Iterable):
+                return all(size_fn(idx) <= b for b in max_positions)
+            return all(  # component-wise comparison; a None on either side disables that component
+                a is None or b is None or a <= b
+                for a, b in zip(size_fn(idx), max_positions)
+            )
+    ignored = []  # collect_filtered appends every rejected index here
+    itr = collect_filtered(check_size, indices, ignored)
+    indices = np.fromiter(itr, dtype=np.int64, count=-1)  # consuming the generator is what fills `ignored`
+    return indices, ignored
+
+
+def filter_by_size(indices, dataset, max_positions, raise_exception=False):
+    """
+    [deprecated] Filter indices based on their size.
+    Use `FairseqDataset::filter_indices_by_size` instead.
+
+    Args:
+        indices (List[int]): ordered list of dataset indices
+        dataset (FairseqDataset): fairseq dataset instance
+        max_positions (tuple): filter elements larger than this size.
+            Comparisons are done component-wise.
+        raise_exception (bool, optional): if ``True``, raise an exception if
+            any elements are filtered (default: False).
+    """
+    if isinstance(max_positions, float) or isinstance(max_positions, int):
+        if hasattr(dataset, 'sizes') and isinstance(dataset.sizes, np.ndarray):  # fast path: compare the sizes array in one go
+            ignored = indices[dataset.sizes[indices] > max_positions].tolist()
+            indices = indices[dataset.sizes[indices] <= max_positions]
+        elif hasattr(dataset, 'sizes') and isinstance(dataset.sizes, list) and len(dataset.sizes) == 1:  # same, for a one-element list of size arrays
+            ignored = indices[dataset.sizes[0][indices] > max_positions].tolist()
+            indices = indices[dataset.sizes[0][indices] <= max_positions]
+        else:
+            indices, ignored = _filter_by_size_dynamic(indices, dataset.size, max_positions)  # per-index fallback via dataset.size
+    else:
+        indices, ignored = _filter_by_size_dynamic(indices, dataset.size, max_positions)
+
+    if len(ignored) > 0 and raise_exception:  # the error reports only the first offending sample
+        raise Exception((
+            'Size of sample #{} is invalid (={}) since max_positions={}, '
+            'skip this example with --skip-invalid-size-inputs-valid-test'
+        ).format(ignored[0], dataset.size(ignored[0]), max_positions))
+    if len(ignored) > 0:  # reached only when raise_exception is False
+        logger.warning((
+            '{} samples have invalid sizes and will be skipped, '
+            'max_positions={}, first few sample ids={}'
+        ).format(len(ignored), max_positions, ignored[:10]))
+    return indices
+
+
+def batch_by_size(
+    indices, num_tokens_fn, max_tokens=None, max_sentences=None,
+    required_batch_size_multiple=1, fixed_shapes=None,
+):
+    """
+    Yield mini-batches of indices bucketed by size. Batches may contain
+    sequences of different lengths.
+
+    Args:
+        indices (List[int]): ordered list of dataset indices
+        num_tokens_fn (callable): function that returns the number of tokens at
+            a given index
+        max_tokens (int, optional): max number of tokens in each batch
+            (default: None).
+        max_sentences (int, optional): max number of sentences in each
+            batch (default: None).
+        required_batch_size_multiple (int, optional): require batch size to
+            be less than N or a multiple of N (default: 1).
+        fixed_shapes (List[Tuple[int, int]], optional): if given, batches will
+            only be created with the given shapes. *max_sentences* and
+            *required_batch_size_multiple* will be ignored (default: None).
+    """
+    try:
+        from fairseq.data.data_utils_fast import (  # compiled Cython helpers, imported on demand
+            batch_by_size_fast, batch_fixed_shapes_fast,
+        )
+    except ImportError:
+        raise ImportError(
+            'Please build Cython components with: `pip install --editable .` '
+            'or `python setup.py build_ext --inplace`'
+        )
+
+    max_tokens = max_tokens if max_tokens is not None else -1  # None is handed to the helper as -1
+    max_sentences = max_sentences if max_sentences is not None else -1
+    bsz_mult = required_batch_size_multiple
+
+    if not isinstance(indices, np.ndarray):
+        indices = np.fromiter(indices, dtype=np.int64, count=-1)  # materialise any iterable as an int64 array
+
+    if fixed_shapes is None:
+        return batch_by_size_fast(
+            indices, num_tokens_fn, max_tokens, max_sentences, bsz_mult,
+        )
+    else:
+        fixed_shapes = np.array(fixed_shapes, dtype=np.int64)
+        sort_order = np.lexsort([  # NOTE(review): keys are argsort results, not the columns themselves — confirm this is intended
+            fixed_shapes[:, 1].argsort(),  # length
+            fixed_shapes[:, 0].argsort(),  # bsz
+        ])
+        fixed_shapes_sorted = fixed_shapes[sort_order]
+        return batch_fixed_shapes_fast(indices, num_tokens_fn, fixed_shapes_sorted)
+
+
+def post_process(sentence: str, symbol: str):
+    """Undo subword segmentation of *sentence* according to the *symbol* scheme."""
+    # schemes whose marker stands for a word boundary: drop spaces, then turn the marker into a space
+    boundary_markers = {
+        "sentencepiece": "\u2581", 'wordpiece': "_", 'letter': "|", "_EOW": "_EOW",
+    }
+    if symbol in boundary_markers:
+        return sentence.replace(" ", "").replace(boundary_markers[symbol], " ").strip()
+    if symbol is None or symbol == 'none':
+        return sentence
+    # any other symbol is removed wherever it occurs (a space is appended first so a trailing one matches)
+    return (sentence + " ").replace(symbol, "").rstrip()
+
+def compute_mask_indices(
+        shape: Tuple[int, int],
+        padding_mask: Optional[torch.Tensor],
+        mask_prob: float,
+        mask_length: int,
+        mask_type: str = "static",
+        mask_other: float = 0.0,
+        min_masks: int = 0,
+        no_overlap: bool = False,
+        min_space: int = 0,
+) -> np.ndarray:
+    """
+    Computes random mask spans for a given shape
+
+    Args:
+        shape: the shape for which to compute masks.
+            should be of size 2 where first element is batch size and 2nd is timesteps
+        padding_mask: optional padding mask of the same size as shape, which will prevent masking padded elements
+        mask_prob: probability for each token to be chosen as start of the span to be masked. this will be multiplied by
+            number of timesteps divided by length of mask span to mask approximately this percentage of all elements.
+            however due to overlaps, the actual number will be smaller (unless no_overlap is True)
+        mask_type: how to compute mask lengths from mask_length and mask_other
+            static = fixed size
+            uniform = sample from uniform distribution [mask_other, mask_length*2]
+            normal = sample from normal distribution with mean mask_length and stdev mask_other. mask is min 1 element
+            poisson = sample from poisson distribution with lambda = mask_length
+        min_masks: minimum number of masked spans
+        no_overlap: if true, will switch to an alternative recursive algorithm that prevents spans from overlapping
+        min_space: only used if no_overlap is True, this is how many elements to keep unmasked between spans
+    """
+
+    bsz, all_sz = shape
+    mask = np.full((bsz, all_sz), False)
+
+    all_num_mask = int(
+        # add a random number for probabilistic rounding
+        mask_prob * all_sz / float(mask_length)
+        + np.random.rand()
+    )
+
+    all_num_mask = max(min_masks, all_num_mask)
+
+    mask_idcs = []
+    for i in range(bsz):
+        if padding_mask is not None:
+            sz = all_sz - padding_mask[i].long().sum().item()  # row length with padded positions excluded
+            num_mask = int(
+                # add a random number for probabilistic rounding
+                mask_prob * sz / float(mask_length)
+                + np.random.rand()
+            )
+            num_mask = max(min_masks, num_mask)
+        else:
+            sz = all_sz
+            num_mask = all_num_mask
+
+        if mask_type == "static":
+            lengths = np.full(num_mask, mask_length)
+        elif mask_type == "uniform":
+            lengths = np.random.randint(mask_other, mask_length * 2 + 1, size=num_mask)
+        elif mask_type == "normal":
+            lengths = np.random.normal(mask_length, mask_other, size=num_mask)
+            lengths = [max(1, int(round(x))) for x in lengths]
+        elif mask_type == "poisson":
+            lengths = np.random.poisson(mask_length, size=num_mask)
+            lengths = [int(round(x)) for x in lengths]
+        else:
+            raise Exception("unknown mask selection " + mask_type)
+
+        if sum(lengths) == 0:  # all sampled spans are empty: force the first one to a usable length
+            lengths[0] = min(mask_length, sz - 1)
+
+        if no_overlap:
+            mask_idc = []
+            def arrange(s, e, length, keep_length):  # place one span inside [s, e) and return the free parts left over
+                span_start = np.random.randint(s, e-length)
+                mask_idc.extend(span_start + i for i in range(length))
+
+                new_parts = []
+                if span_start - s - min_space >= keep_length:
+                    new_parts.append((s, span_start-min_space+1))
+                if e - span_start - keep_length - min_space > keep_length:
+                    new_parts.append((span_start + length + min_space, e))
+                return new_parts
+
+            parts = [(0, sz)]
+            min_length = min(lengths)
+            for length in sorted(lengths, reverse=True):
+                lens = np.fromiter((e - s if e-s >= length+min_space else 0 for s, e in parts), int)  # builtin int: the np.int alias no longer exists in NumPy
+                l_sum = np.sum(lens)
+                if l_sum == 0:
+                    break
+                probs = lens / np.sum(lens)
+                c = np.random.choice(len(parts), p=probs)
+                s, e = parts.pop(c)
+                parts.extend(arrange(s, e, length, min_length))
+            mask_idc = np.asarray(mask_idc)
+        else:
+            min_len = min(lengths)
+            if sz - min_len <= num_mask:
+                min_len = sz - num_mask - 1
+
+            mask_idc = np.random.choice(sz - min_len, num_mask, replace=False)
+
+            mask_idc = np.asarray(
+                [
+                    mask_idc[j] + offset
+                    for j in range(len(mask_idc))
+                    for offset in range(lengths[j])
+                ]
+            )
+
+        mask_idcs.append(np.unique(mask_idc[mask_idc < sz]))  # drop indices past the row length and de-duplicate overlaps
+
+    min_len = min([len(m) for m in mask_idcs])  # every row ends up with this many masked positions
+    for i, mask_idc in enumerate(mask_idcs):
+        if len(mask_idc) > min_len:
+            mask_idc = np.random.choice(mask_idc, min_len, replace=False)
+        mask[i, mask_idc] = True
+
+    return mask  # boolean array of shape (bsz, all_sz); True marks a masked position
diff --git a/fairseq/data/data_utils_fast.cpp b/fairseq/data/data_utils_fast.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b0acd994ebafcb1d54dab2a5ae2119933f9fc9b4
--- /dev/null
+++ b/fairseq/data/data_utils_fast.cpp
@@ -0,0 +1,31968 @@
+/* Generated by Cython 3.0.12 */
+
+/* BEGIN: Cython Metadata
+{
+    "distutils": {
+        "depends": [
+            "/tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/_core/include/numpy/arrayobject.h",
+            "/tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/_core/include/numpy/arrayscalars.h",
+            "/tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/_core/include/numpy/ndarrayobject.h",
+            "/tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/_core/include/numpy/ndarraytypes.h",
+            "/tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/_core/include/numpy/ufuncobject.h"
+        ],
+        "extra_compile_args": [
+            "-std=c++11",
+            "-O3"
+        ],
+        "include_dirs": [
+            "/tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/_core/include",
+            "/tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/_core/include"
+        ],
+        "language": "c++",
+        "name": "fairseq.data.data_utils_fast",
+        "sources": [
+            "fairseq/data/data_utils_fast.pyx"
+        ]
+    },
+    "module_name": "fairseq.data.data_utils_fast"
+}
+END: Cython Metadata */
+
+#ifndef PY_SSIZE_T_CLEAN
+#define PY_SSIZE_T_CLEAN
+#endif /* PY_SSIZE_T_CLEAN */
+#if defined(CYTHON_LIMITED_API) && 0
+  #ifndef Py_LIMITED_API
+    #if CYTHON_LIMITED_API+0 > 0x03030000
+      #define Py_LIMITED_API CYTHON_LIMITED_API
+    #else
+      #define Py_LIMITED_API 0x03030000
+    #endif
+  #endif
+#endif
+
+#include "Python.h"
+#ifndef Py_PYTHON_H
+    #error Python headers needed to compile C extensions, please install development version of Python.
+#elif PY_VERSION_HEX < 0x02070000 || (0x03000000 <= PY_VERSION_HEX && PY_VERSION_HEX < 0x03030000)
+    #error Cython requires Python 2.7+ or Python 3.3+.
+#else
+#if defined(CYTHON_LIMITED_API) && CYTHON_LIMITED_API
+#define __PYX_EXTRA_ABI_MODULE_NAME "limited"
+#else
+#define __PYX_EXTRA_ABI_MODULE_NAME ""
+#endif
+#define CYTHON_ABI "3_0_12" __PYX_EXTRA_ABI_MODULE_NAME
+#define __PYX_ABI_MODULE_NAME "_cython_" CYTHON_ABI
+#define __PYX_TYPE_MODULE_PREFIX __PYX_ABI_MODULE_NAME "."
+#define CYTHON_HEX_VERSION 0x03000CF0
+#define CYTHON_FUTURE_DIVISION 1
+#include <stddef.h>
+#ifndef offsetof
+  #define offsetof(type, member) ( (size_t) & ((type*)0) -> member )
+#endif
+#if !defined(_WIN32) && !defined(WIN32) && !defined(MS_WINDOWS)
+  #ifndef __stdcall
+    #define __stdcall
+  #endif
+  #ifndef __cdecl
+    #define __cdecl
+  #endif
+  #ifndef __fastcall
+    #define __fastcall
+  #endif
+#endif
+#ifndef DL_IMPORT
+  #define DL_IMPORT(t) t
+#endif
+#ifndef DL_EXPORT
+  #define DL_EXPORT(t) t
+#endif
+#define __PYX_COMMA ,
+#ifndef HAVE_LONG_LONG
+  #define HAVE_LONG_LONG
+#endif
+#ifndef PY_LONG_LONG
+  #define PY_LONG_LONG LONG_LONG
+#endif
+#ifndef Py_HUGE_VAL
+  #define Py_HUGE_VAL HUGE_VAL
+#endif
+#define __PYX_LIMITED_VERSION_HEX PY_VERSION_HEX
+#if defined(GRAALVM_PYTHON)
+  /* For very preliminary testing purposes. Most variables are set the same as PyPy.
+     The existence of this section does not imply that anything works or is even tested */
+  #define CYTHON_COMPILING_IN_PYPY 0
+  #define CYTHON_COMPILING_IN_CPYTHON 0
+  #define CYTHON_COMPILING_IN_LIMITED_API 0
+  #define CYTHON_COMPILING_IN_GRAAL 1
+  #define CYTHON_COMPILING_IN_NOGIL 0
+  #undef CYTHON_USE_TYPE_SLOTS
+  #define CYTHON_USE_TYPE_SLOTS 0
+  #undef CYTHON_USE_TYPE_SPECS
+  #define CYTHON_USE_TYPE_SPECS 0
+  #undef CYTHON_USE_PYTYPE_LOOKUP
+  #define CYTHON_USE_PYTYPE_LOOKUP 0
+  #if PY_VERSION_HEX < 0x03050000
+    #undef CYTHON_USE_ASYNC_SLOTS
+    #define CYTHON_USE_ASYNC_SLOTS 0
+  #elif !defined(CYTHON_USE_ASYNC_SLOTS)
+    #define CYTHON_USE_ASYNC_SLOTS 1
+  #endif
+  #undef CYTHON_USE_PYLIST_INTERNALS
+  #define CYTHON_USE_PYLIST_INTERNALS 0
+  #undef CYTHON_USE_UNICODE_INTERNALS
+  #define CYTHON_USE_UNICODE_INTERNALS 0
+  #undef CYTHON_USE_UNICODE_WRITER
+  #define CYTHON_USE_UNICODE_WRITER 0
+  #undef CYTHON_USE_PYLONG_INTERNALS
+  #define CYTHON_USE_PYLONG_INTERNALS 0
+  #undef CYTHON_AVOID_BORROWED_REFS
+  #define CYTHON_AVOID_BORROWED_REFS 1
+  #undef CYTHON_ASSUME_SAFE_MACROS
+  #define CYTHON_ASSUME_SAFE_MACROS 0
+  #undef CYTHON_UNPACK_METHODS
+  #define CYTHON_UNPACK_METHODS 0
+  #undef CYTHON_FAST_THREAD_STATE
+  #define CYTHON_FAST_THREAD_STATE 0
+  #undef CYTHON_FAST_GIL
+  #define CYTHON_FAST_GIL 0
+  #undef CYTHON_METH_FASTCALL
+  #define CYTHON_METH_FASTCALL 0
+  #undef CYTHON_FAST_PYCALL
+  #define CYTHON_FAST_PYCALL 0
+  #ifndef CYTHON_PEP487_INIT_SUBCLASS
+    #define CYTHON_PEP487_INIT_SUBCLASS (PY_MAJOR_VERSION >= 3)
+  #endif
+  #undef CYTHON_PEP489_MULTI_PHASE_INIT
+  #define CYTHON_PEP489_MULTI_PHASE_INIT 1
+  #undef CYTHON_USE_MODULE_STATE
+  #define CYTHON_USE_MODULE_STATE 0
+  #undef CYTHON_USE_TP_FINALIZE
+  #define CYTHON_USE_TP_FINALIZE 0
+  #undef CYTHON_USE_DICT_VERSIONS
+  #define CYTHON_USE_DICT_VERSIONS 0
+  #undef CYTHON_USE_EXC_INFO_STACK
+  #define CYTHON_USE_EXC_INFO_STACK 0
+  #ifndef CYTHON_UPDATE_DESCRIPTOR_DOC
+    #define CYTHON_UPDATE_DESCRIPTOR_DOC 0
+  #endif
+  #undef CYTHON_USE_FREELISTS
+  #define CYTHON_USE_FREELISTS 0
+#elif defined(PYPY_VERSION)
+  #define CYTHON_COMPILING_IN_PYPY 1
+  #define CYTHON_COMPILING_IN_CPYTHON 0
+  #define CYTHON_COMPILING_IN_LIMITED_API 0
+  #define CYTHON_COMPILING_IN_GRAAL 0
+  #define CYTHON_COMPILING_IN_NOGIL 0
+  #undef CYTHON_USE_TYPE_SLOTS
+  #define CYTHON_USE_TYPE_SLOTS 0
+  #ifndef CYTHON_USE_TYPE_SPECS
+    #define CYTHON_USE_TYPE_SPECS 0
+  #endif
+  #undef CYTHON_USE_PYTYPE_LOOKUP
+  #define CYTHON_USE_PYTYPE_LOOKUP 0
+  #if PY_VERSION_HEX < 0x03050000
+    #undef CYTHON_USE_ASYNC_SLOTS
+    #define CYTHON_USE_ASYNC_SLOTS 0
+  #elif !defined(CYTHON_USE_ASYNC_SLOTS)
+    #define CYTHON_USE_ASYNC_SLOTS 1
+  #endif
+  #undef CYTHON_USE_PYLIST_INTERNALS
+  #define CYTHON_USE_PYLIST_INTERNALS 0
+  #undef CYTHON_USE_UNICODE_INTERNALS
+  #define CYTHON_USE_UNICODE_INTERNALS 0
+  #undef CYTHON_USE_UNICODE_WRITER
+  #define CYTHON_USE_UNICODE_WRITER 0
+  #undef CYTHON_USE_PYLONG_INTERNALS
+  #define CYTHON_USE_PYLONG_INTERNALS 0
+  #undef CYTHON_AVOID_BORROWED_REFS
+  #define CYTHON_AVOID_BORROWED_REFS 1
+  #undef CYTHON_ASSUME_SAFE_MACROS
+  #define CYTHON_ASSUME_SAFE_MACROS 0
+  #undef CYTHON_UNPACK_METHODS
+  #define CYTHON_UNPACK_METHODS 0
+  #undef CYTHON_FAST_THREAD_STATE
+  #define CYTHON_FAST_THREAD_STATE 0
+  #undef CYTHON_FAST_GIL
+  #define CYTHON_FAST_GIL 0
+  #undef CYTHON_METH_FASTCALL
+  #define CYTHON_METH_FASTCALL 0
+  #undef CYTHON_FAST_PYCALL
+  #define CYTHON_FAST_PYCALL 0
+  #ifndef CYTHON_PEP487_INIT_SUBCLASS
+    #define CYTHON_PEP487_INIT_SUBCLASS (PY_MAJOR_VERSION >= 3)
+  #endif
+  #if PY_VERSION_HEX < 0x03090000
+    #undef CYTHON_PEP489_MULTI_PHASE_INIT
+    #define CYTHON_PEP489_MULTI_PHASE_INIT 0
+  #elif !defined(CYTHON_PEP489_MULTI_PHASE_INIT)
+    #define CYTHON_PEP489_MULTI_PHASE_INIT 1
+  #endif
+  #undef CYTHON_USE_MODULE_STATE
+  #define CYTHON_USE_MODULE_STATE 0
+  #undef CYTHON_USE_TP_FINALIZE
+  #define CYTHON_USE_TP_FINALIZE (PY_VERSION_HEX >= 0x030400a1 && PYPY_VERSION_NUM >= 0x07030C00)
+  #undef CYTHON_USE_DICT_VERSIONS
+  #define CYTHON_USE_DICT_VERSIONS 0
+  #undef CYTHON_USE_EXC_INFO_STACK
+  #define CYTHON_USE_EXC_INFO_STACK 0
+  #ifndef CYTHON_UPDATE_DESCRIPTOR_DOC
+    #define CYTHON_UPDATE_DESCRIPTOR_DOC 0
+  #endif
+  #undef CYTHON_USE_FREELISTS
+  #define CYTHON_USE_FREELISTS 0
+#elif defined(CYTHON_LIMITED_API)
+  #ifdef Py_LIMITED_API
+    #undef __PYX_LIMITED_VERSION_HEX
+    #define __PYX_LIMITED_VERSION_HEX Py_LIMITED_API
+  #endif
+  #define CYTHON_COMPILING_IN_PYPY 0
+  #define CYTHON_COMPILING_IN_CPYTHON 0
+  #define CYTHON_COMPILING_IN_LIMITED_API 1
+  #define CYTHON_COMPILING_IN_GRAAL 0
+  #define CYTHON_COMPILING_IN_NOGIL 0
+  #undef CYTHON_CLINE_IN_TRACEBACK
+  #define CYTHON_CLINE_IN_TRACEBACK 0
+  #undef CYTHON_USE_TYPE_SLOTS
+  #define CYTHON_USE_TYPE_SLOTS 0
+  #undef CYTHON_USE_TYPE_SPECS
+  #define CYTHON_USE_TYPE_SPECS 1
+  #undef CYTHON_USE_PYTYPE_LOOKUP
+  #define CYTHON_USE_PYTYPE_LOOKUP 0
+  #undef CYTHON_USE_ASYNC_SLOTS
+  #define CYTHON_USE_ASYNC_SLOTS 0
+  #undef CYTHON_USE_PYLIST_INTERNALS
+  #define CYTHON_USE_PYLIST_INTERNALS 0
+  #undef CYTHON_USE_UNICODE_INTERNALS
+  #define CYTHON_USE_UNICODE_INTERNALS 0
+  #ifndef CYTHON_USE_UNICODE_WRITER
+    #define CYTHON_USE_UNICODE_WRITER 0
+  #endif
+  #undef CYTHON_USE_PYLONG_INTERNALS
+  #define CYTHON_USE_PYLONG_INTERNALS 0
+  #ifndef CYTHON_AVOID_BORROWED_REFS
+    #define CYTHON_AVOID_BORROWED_REFS 0
+  #endif
+  #undef CYTHON_ASSUME_SAFE_MACROS
+  #define CYTHON_ASSUME_SAFE_MACROS 0
+  #undef CYTHON_UNPACK_METHODS
+  #define CYTHON_UNPACK_METHODS 0
+  #undef CYTHON_FAST_THREAD_STATE
+  #define CYTHON_FAST_THREAD_STATE 0
+  #undef CYTHON_FAST_GIL
+  #define CYTHON_FAST_GIL 0
+  #undef CYTHON_METH_FASTCALL
+  #define CYTHON_METH_FASTCALL 0
+  #undef CYTHON_FAST_PYCALL
+  #define CYTHON_FAST_PYCALL 0
+  #ifndef CYTHON_PEP487_INIT_SUBCLASS
+    #define CYTHON_PEP487_INIT_SUBCLASS 1
+  #endif
+  #undef CYTHON_PEP489_MULTI_PHASE_INIT
+  #define CYTHON_PEP489_MULTI_PHASE_INIT 0
+  #undef CYTHON_USE_MODULE_STATE
+  #define CYTHON_USE_MODULE_STATE 1
+  #ifndef CYTHON_USE_TP_FINALIZE
+    #define CYTHON_USE_TP_FINALIZE 0
+  #endif
+  #undef CYTHON_USE_DICT_VERSIONS
+  #define CYTHON_USE_DICT_VERSIONS 0
+  #undef CYTHON_USE_EXC_INFO_STACK
+  #define CYTHON_USE_EXC_INFO_STACK 0
+  #ifndef CYTHON_UPDATE_DESCRIPTOR_DOC
+    #define CYTHON_UPDATE_DESCRIPTOR_DOC 0
+  #endif
+  #undef CYTHON_USE_FREELISTS
+  #define CYTHON_USE_FREELISTS 0
+#elif defined(Py_GIL_DISABLED) || defined(Py_NOGIL)
+  #define CYTHON_COMPILING_IN_PYPY 0
+  #define CYTHON_COMPILING_IN_CPYTHON 0
+  #define CYTHON_COMPILING_IN_LIMITED_API 0
+  #define CYTHON_COMPILING_IN_GRAAL 0
+  #define CYTHON_COMPILING_IN_NOGIL 1
+  #ifndef CYTHON_USE_TYPE_SLOTS
+    #define CYTHON_USE_TYPE_SLOTS 1
+  #endif
+  #ifndef CYTHON_USE_TYPE_SPECS
+    #define CYTHON_USE_TYPE_SPECS 0
+  #endif
+  #undef CYTHON_USE_PYTYPE_LOOKUP
+  #define CYTHON_USE_PYTYPE_LOOKUP 0
+  #ifndef CYTHON_USE_ASYNC_SLOTS
+    #define CYTHON_USE_ASYNC_SLOTS 1
+  #endif
+  #ifndef CYTHON_USE_PYLONG_INTERNALS
+    #define CYTHON_USE_PYLONG_INTERNALS 0
+  #endif
+  #undef CYTHON_USE_PYLIST_INTERNALS
+  #define CYTHON_USE_PYLIST_INTERNALS 0
+  #ifndef CYTHON_USE_UNICODE_INTERNALS
+    #define CYTHON_USE_UNICODE_INTERNALS 1
+  #endif
+  #undef CYTHON_USE_UNICODE_WRITER
+  #define CYTHON_USE_UNICODE_WRITER 0
+  #ifndef CYTHON_AVOID_BORROWED_REFS
+    #define CYTHON_AVOID_BORROWED_REFS 0
+  #endif
+  #ifndef CYTHON_ASSUME_SAFE_MACROS
+    #define CYTHON_ASSUME_SAFE_MACROS 1
+  #endif
+  #ifndef CYTHON_UNPACK_METHODS
+    #define CYTHON_UNPACK_METHODS 1
+  #endif
+  #undef CYTHON_FAST_THREAD_STATE
+  #define CYTHON_FAST_THREAD_STATE 0
+  #undef CYTHON_FAST_GIL
+  #define CYTHON_FAST_GIL 0
+  #ifndef CYTHON_METH_FASTCALL
+    #define CYTHON_METH_FASTCALL 1
+  #endif
+  #undef CYTHON_FAST_PYCALL
+  #define CYTHON_FAST_PYCALL 0
+  #ifndef CYTHON_PEP487_INIT_SUBCLASS
+    #define CYTHON_PEP487_INIT_SUBCLASS 1
+  #endif
+  #ifndef CYTHON_PEP489_MULTI_PHASE_INIT
+    #define CYTHON_PEP489_MULTI_PHASE_INIT 1
+  #endif
+  #ifndef CYTHON_USE_MODULE_STATE
+    #define CYTHON_USE_MODULE_STATE 0
+  #endif
+  #ifndef CYTHON_USE_TP_FINALIZE
+    #define CYTHON_USE_TP_FINALIZE 1
+  #endif
+  #undef CYTHON_USE_DICT_VERSIONS
+  #define CYTHON_USE_DICT_VERSIONS 0
+  #undef CYTHON_USE_EXC_INFO_STACK
+  #define CYTHON_USE_EXC_INFO_STACK 0
+  #ifndef CYTHON_UPDATE_DESCRIPTOR_DOC
+    #define CYTHON_UPDATE_DESCRIPTOR_DOC 1
+  #endif
+  #ifndef CYTHON_USE_FREELISTS
+    #define CYTHON_USE_FREELISTS 0
+  #endif
+#else
+  #define CYTHON_COMPILING_IN_PYPY 0
+  #define CYTHON_COMPILING_IN_CPYTHON 1
+  #define CYTHON_COMPILING_IN_LIMITED_API 0
+  #define CYTHON_COMPILING_IN_GRAAL 0
+  #define CYTHON_COMPILING_IN_NOGIL 0
+  #ifndef CYTHON_USE_TYPE_SLOTS
+    #define CYTHON_USE_TYPE_SLOTS 1
+  #endif
+  #ifndef CYTHON_USE_TYPE_SPECS
+    #define CYTHON_USE_TYPE_SPECS 0
+  #endif
+  #ifndef CYTHON_USE_PYTYPE_LOOKUP
+    #define CYTHON_USE_PYTYPE_LOOKUP 1
+  #endif
+  #if PY_MAJOR_VERSION < 3
+    #undef CYTHON_USE_ASYNC_SLOTS
+    #define CYTHON_USE_ASYNC_SLOTS 0
+  #elif !defined(CYTHON_USE_ASYNC_SLOTS)
+    #define CYTHON_USE_ASYNC_SLOTS 1
+  #endif
+  #ifndef CYTHON_USE_PYLONG_INTERNALS
+    #define CYTHON_USE_PYLONG_INTERNALS 1
+  #endif
+  #ifndef CYTHON_USE_PYLIST_INTERNALS
+    #define CYTHON_USE_PYLIST_INTERNALS 1
+  #endif
+  #ifndef CYTHON_USE_UNICODE_INTERNALS
+    #define CYTHON_USE_UNICODE_INTERNALS 1
+  #endif
+  #if PY_VERSION_HEX < 0x030300F0 || PY_VERSION_HEX >= 0x030B00A2
+    #undef CYTHON_USE_UNICODE_WRITER
+    #define CYTHON_USE_UNICODE_WRITER 0
+  #elif !defined(CYTHON_USE_UNICODE_WRITER)
+    #define CYTHON_USE_UNICODE_WRITER 1
+  #endif
+  #ifndef CYTHON_AVOID_BORROWED_REFS
+    #define CYTHON_AVOID_BORROWED_REFS 0
+  #endif
+  #ifndef CYTHON_ASSUME_SAFE_MACROS
+    #define CYTHON_ASSUME_SAFE_MACROS 1
+  #endif
+  #ifndef CYTHON_UNPACK_METHODS
+    #define CYTHON_UNPACK_METHODS 1
+  #endif
+  #ifndef CYTHON_FAST_THREAD_STATE
+    #define CYTHON_FAST_THREAD_STATE 1
+  #endif
+  #ifndef CYTHON_FAST_GIL
+    #define CYTHON_FAST_GIL (PY_MAJOR_VERSION < 3 || PY_VERSION_HEX >= 0x03060000 && PY_VERSION_HEX < 0x030C00A6)
+  #endif
+  #ifndef CYTHON_METH_FASTCALL
+    #define CYTHON_METH_FASTCALL (PY_VERSION_HEX >= 0x030700A1)
+  #endif
+  #ifndef CYTHON_FAST_PYCALL
+    #define CYTHON_FAST_PYCALL 1
+  #endif
+  #ifndef CYTHON_PEP487_INIT_SUBCLASS
+    #define CYTHON_PEP487_INIT_SUBCLASS 1
+  #endif
+  #if PY_VERSION_HEX < 0x03050000
+    #undef CYTHON_PEP489_MULTI_PHASE_INIT
+    #define CYTHON_PEP489_MULTI_PHASE_INIT 0
+  #elif !defined(CYTHON_PEP489_MULTI_PHASE_INIT)
+    #define CYTHON_PEP489_MULTI_PHASE_INIT 1
+  #endif
+  #ifndef CYTHON_USE_MODULE_STATE
+    #define CYTHON_USE_MODULE_STATE 0
+  #endif
+  #if PY_VERSION_HEX < 0x030400a1
+    #undef CYTHON_USE_TP_FINALIZE
+    #define CYTHON_USE_TP_FINALIZE 0
+  #elif !defined(CYTHON_USE_TP_FINALIZE)
+    #define CYTHON_USE_TP_FINALIZE 1
+  #endif
+  #if PY_VERSION_HEX < 0x030600B1
+    #undef CYTHON_USE_DICT_VERSIONS
+    #define CYTHON_USE_DICT_VERSIONS 0
+  #elif !defined(CYTHON_USE_DICT_VERSIONS)
+    #define CYTHON_USE_DICT_VERSIONS  (PY_VERSION_HEX < 0x030C00A5)
+  #endif
+  #if PY_VERSION_HEX < 0x030700A3
+    #undef CYTHON_USE_EXC_INFO_STACK
+    #define CYTHON_USE_EXC_INFO_STACK 0
+  #elif !defined(CYTHON_USE_EXC_INFO_STACK)
+    #define CYTHON_USE_EXC_INFO_STACK 1
+  #endif
+  #ifndef CYTHON_UPDATE_DESCRIPTOR_DOC
+    #define CYTHON_UPDATE_DESCRIPTOR_DOC 1
+  #endif
+  #ifndef CYTHON_USE_FREELISTS
+    #define CYTHON_USE_FREELISTS 1
+  #endif
+#endif
+#if !defined(CYTHON_FAST_PYCCALL)
+#define CYTHON_FAST_PYCCALL  (CYTHON_FAST_PYCALL && PY_VERSION_HEX >= 0x030600B1)
+#endif
+#if !defined(CYTHON_VECTORCALL)
+#define CYTHON_VECTORCALL  (CYTHON_FAST_PYCCALL && PY_VERSION_HEX >= 0x030800B1)
+#endif
+#define CYTHON_BACKPORT_VECTORCALL (CYTHON_METH_FASTCALL && PY_VERSION_HEX < 0x030800B1)
+#if CYTHON_USE_PYLONG_INTERNALS
+  #if PY_MAJOR_VERSION < 3
+    #include "longintrepr.h"
+  #endif
+  #undef SHIFT
+  #undef BASE
+  #undef MASK
+  #ifdef SIZEOF_VOID_P
+    enum { __pyx_check_sizeof_voidp = 1 / (int)(SIZEOF_VOID_P == sizeof(void*)) };
+  #endif
+#endif
+#ifndef __has_attribute
+  #define __has_attribute(x) 0
+#endif
+#ifndef __has_cpp_attribute
+  #define __has_cpp_attribute(x) 0
+#endif
+#ifndef CYTHON_RESTRICT
+  #if defined(__GNUC__)
+    #define CYTHON_RESTRICT __restrict__
+  #elif defined(_MSC_VER) && _MSC_VER >= 1400
+    #define CYTHON_RESTRICT __restrict
+  #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+    #define CYTHON_RESTRICT restrict
+  #else
+    #define CYTHON_RESTRICT
+  #endif
+#endif
+#ifndef CYTHON_UNUSED
+  #if defined(__cplusplus)
+    /* for clang __has_cpp_attribute(maybe_unused) is true even before C++17
+     * but leads to warnings with -pedantic, since it is a C++17 feature */
+    #if ((defined(_MSVC_LANG) && _MSVC_LANG >= 201703L) || __cplusplus >= 201703L)
+      #if __has_cpp_attribute(maybe_unused)
+        #define CYTHON_UNUSED [[maybe_unused]]
+      #endif
+    #endif
+  #endif
+#endif
+#ifndef CYTHON_UNUSED
+# if defined(__GNUC__)
+#   if !(defined(__cplusplus)) || (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
+#     define CYTHON_UNUSED __attribute__ ((__unused__))
+#   else
+#     define CYTHON_UNUSED
+#   endif
+# elif defined(__ICC) || (defined(__INTEL_COMPILER) && !defined(_MSC_VER))
+#   define CYTHON_UNUSED __attribute__ ((__unused__))
+# else
+#   define CYTHON_UNUSED
+# endif
+#endif
+#ifndef CYTHON_UNUSED_VAR
+#  if defined(__cplusplus)
+     template<class T> void CYTHON_UNUSED_VAR( const T& ) { }
+#  else
+#    define CYTHON_UNUSED_VAR(x) (void)(x)
+#  endif
+#endif
+#ifndef CYTHON_MAYBE_UNUSED_VAR
+  #define CYTHON_MAYBE_UNUSED_VAR(x) CYTHON_UNUSED_VAR(x)
+#endif
+#ifndef CYTHON_NCP_UNUSED
+# if CYTHON_COMPILING_IN_CPYTHON
+#  define CYTHON_NCP_UNUSED
+# else
+#  define CYTHON_NCP_UNUSED CYTHON_UNUSED
+# endif
+#endif
+#ifndef CYTHON_USE_CPP_STD_MOVE
+  #if defined(__cplusplus) && (\
+    __cplusplus >= 201103L || (defined(_MSC_VER) && _MSC_VER >= 1600))
+    #define CYTHON_USE_CPP_STD_MOVE 1
+  #else
+    #define CYTHON_USE_CPP_STD_MOVE 0
+  #endif
+#endif
+#define __Pyx_void_to_None(void_result) ((void)(void_result), Py_INCREF(Py_None), Py_None)
+#ifdef _MSC_VER
+    #ifndef _MSC_STDINT_H_
+        #if _MSC_VER < 1300
+            typedef unsigned char     uint8_t;
+            typedef unsigned short    uint16_t;
+            typedef unsigned int      uint32_t;
+        #else
+            typedef unsigned __int8   uint8_t;
+            typedef unsigned __int16  uint16_t;
+            typedef unsigned __int32  uint32_t;
+        #endif
+    #endif
+    #if _MSC_VER < 1300
+        #ifdef _WIN64
+            typedef unsigned long long  __pyx_uintptr_t;
+        #else
+            typedef unsigned int        __pyx_uintptr_t;
+        #endif
+    #else
+        #ifdef _WIN64
+            typedef unsigned __int64    __pyx_uintptr_t;
+        #else
+            typedef unsigned __int32    __pyx_uintptr_t;
+        #endif
+    #endif
+#else
+    #include <stdint.h>
+    typedef uintptr_t  __pyx_uintptr_t;
+#endif
+#ifndef CYTHON_FALLTHROUGH
+  #if defined(__cplusplus)
+    /* for clang __has_cpp_attribute(fallthrough) is true even before C++17
+     * but leads to warnings with -pedantic, since it is a C++17 feature */
+    #if ((defined(_MSVC_LANG) && _MSVC_LANG >= 201703L) || __cplusplus >= 201703L)
+      #if __has_cpp_attribute(fallthrough)
+        #define CYTHON_FALLTHROUGH [[fallthrough]]
+      #endif
+    #endif
+    #ifndef CYTHON_FALLTHROUGH
+      #if __has_cpp_attribute(clang::fallthrough)
+        #define CYTHON_FALLTHROUGH [[clang::fallthrough]]
+      #elif __has_cpp_attribute(gnu::fallthrough)
+        #define CYTHON_FALLTHROUGH [[gnu::fallthrough]]
+      #endif
+    #endif
+  #endif
+  #ifndef CYTHON_FALLTHROUGH
+    #if __has_attribute(fallthrough)
+      #define CYTHON_FALLTHROUGH __attribute__((fallthrough))
+    #else
+      #define CYTHON_FALLTHROUGH
+    #endif
+  #endif
+  #if defined(__clang__) && defined(__apple_build_version__)
+    #if __apple_build_version__ < 7000000
+      #undef  CYTHON_FALLTHROUGH
+      #define CYTHON_FALLTHROUGH
+    #endif
+  #endif
+#endif
+#ifdef __cplusplus /* __PYX_IS_UNSIGNED(type): constant that is true when converting -1 to `type` gives a value greater than zero. */
+  template <typename T>
+  struct __PYX_IS_UNSIGNED_IMPL {static const bool value = T(0) < T(-1);}; /* C++ form: the comparison is evaluated once per instantiated T */
+  #define __PYX_IS_UNSIGNED(type) (__PYX_IS_UNSIGNED_IMPL<type>::value)
+#else
+  #define __PYX_IS_UNSIGNED(type) (((type)-1) > 0) /* C form: same test written as a cast expression */
+#endif
+#if CYTHON_COMPILING_IN_PYPY == 1
+  #define __PYX_NEED_TP_PRINT_SLOT  (PY_VERSION_HEX >= 0x030800b4 && PY_VERSION_HEX < 0x030A0000)
+#else
+  #define __PYX_NEED_TP_PRINT_SLOT  (PY_VERSION_HEX >= 0x030800b4 && PY_VERSION_HEX < 0x03090000)
+#endif
+#define __PYX_REINTERPRET_FUNCION(func_pointer, other_pointer) ((func_pointer)(void(*)(void))(other_pointer))
+
+#ifndef __cplusplus
+  #error "Cython files generated with the C++ option must be compiled with a C++ compiler."
+#endif
+#ifndef CYTHON_INLINE
+  #if defined(__clang__)
+    #define CYTHON_INLINE __inline__ __attribute__ ((__unused__))
+  #else
+    #define CYTHON_INLINE inline
+  #endif
+#endif
+template<typename T> /* Explicitly invokes the destructor of the object referenced by `x`; this helper itself does not call delete or free. */
+void __Pyx_call_destructor(T& x) {
+    x.~T(); /* direct destructor call on the referenced object */
+}
+template<typename T> /* Pointer-backed stand-in for a T&: default-constructible, and forwards access and comparisons to the pointee. */
+class __Pyx_FakeReference {
+  public:
+    __Pyx_FakeReference() : ptr(NULL) { } /* empty state; the members below return or dereference ptr without a NULL check */
+    __Pyx_FakeReference(const T& ref) : ptr(const_cast<T*>(&ref)) { } /* implicit construction from a T; the constness of ref is cast away */
+    T *operator->() { return ptr; } /* member access goes to the wrapped object */
+    T *operator&() { return ptr; } /* address-of yields the wrapped object's address, not the wrapper's */
+    operator T&() { return *ptr; } /* implicit conversion back to a real reference */
+    template<typename U> bool operator ==(const U& other) const { return *ptr == other; } /* compare pointee with a plain value */
+    template<typename U> bool operator !=(const U& other) const { return *ptr != other; }
+    template<typename U> bool operator==(const __Pyx_FakeReference<U>& other) const { return *ptr == *other.ptr; } /* compare the two pointees, not the pointers */
+    template<typename U> bool operator!=(const __Pyx_FakeReference<U>& other) const { return *ptr != *other.ptr; }
+  private:
+    T *ptr; /* wrapped address; this class never deletes it */
+};
+
+#define __PYX_BUILD_PY_SSIZE_T "n"
+#define CYTHON_FORMAT_SSIZE_T "z"
+#if PY_MAJOR_VERSION < 3
+  #define __Pyx_BUILTIN_MODULE_NAME "__builtin__"
+  #define __Pyx_DefaultClassType PyClass_Type
+  #define __Pyx_PyCode_New(a, p, k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos)\
+          PyCode_New(a+k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos)
+#else
+  #define __Pyx_BUILTIN_MODULE_NAME "builtins"
+  #define __Pyx_DefaultClassType PyType_Type
+#if CYTHON_COMPILING_IN_LIMITED_API
+    static CYTHON_INLINE PyObject* __Pyx_PyCode_New(int a, int p, int k, int l, int s, int f, /* Limited-API variant: builds a code object by calling Python's types.CodeType; returns NULL on failure. */
+                                                    PyObject *code, PyObject *c, PyObject* n, PyObject *v,
+                                                    PyObject *fv, PyObject *cell, PyObject* fn,
+                                                    PyObject *name, int fline, PyObject *lnos) {
+        PyObject *exception_table = NULL; /* only created in the final (minor_version > 10) branch */
+        PyObject *types_module=NULL, *code_type=NULL, *result=NULL;
+        #if __PYX_LIMITED_VERSION_HEX < 0x030B0000
+        PyObject *version_info; /* never DECREF'd in this function */
+        PyObject *py_minor_version = NULL;
+        #endif
+        long minor_version = 0;
+        PyObject *type, *value, *traceback;
+        PyErr_Fetch(&type, &value, &traceback); /* stash any exception that was pending on entry; restored at `end` */
+        #if __PYX_LIMITED_VERSION_HEX >= 0x030B0000
+        minor_version = 11; /* fixed value: always selects the final branch below */
+        #else
+        if (!(version_info = PySys_GetObject("version_info"))) goto end;
+        if (!(py_minor_version = PySequence_GetItem(version_info, 1))) goto end; /* index 1 of sys.version_info */
+        minor_version = PyLong_AsLong(py_minor_version);
+        Py_DECREF(py_minor_version);
+        if (minor_version == -1 && PyErr_Occurred()) goto end; /* -1 with an error set means the conversion failed */
+        #endif
+        if (!(types_module = PyImport_ImportModule("types"))) goto end;
+        if (!(code_type = PyObject_GetAttrString(types_module, "CodeType"))) goto end;
+        if (minor_version <= 7) {
+            (void)p; /* p is not passed in this branch */
+            result = PyObject_CallFunction(code_type, "iiiiiOOOOOOiOO", a, k, l, s, f, code, /* NOTE(review): format appears to list fewer items than the arguments passed - confirm the trailing ones are meant to be unused */
+                          c, n, v, fn, name, fline, lnos, fv, cell);
+        } else if (minor_version <= 10) {
+            result = PyObject_CallFunction(code_type, "iiiiiiOOOOOOiOO", a,p, k, l, s, f, code, /* same as above, with p added after a */
+                          c, n, v, fn, name, fline, lnos, fv, cell);
+        } else {
+            if (!(exception_table = PyBytes_FromStringAndSize(NULL, 0))) goto end; /* zero-length bytes object */
+            result = PyObject_CallFunction(code_type, "iiiiiiOOOOOOOiOO", a,p, k, l, s, f, code, /* name is passed twice in a row here */
+                          c, n, v, fn, name, name, fline, lnos, exception_table, fv, cell);
+        }
+    end: /* shared cleanup for success and every failure path */
+        Py_XDECREF(code_type);
+        Py_XDECREF(exception_table);
+        Py_XDECREF(types_module);
+        if (type) { /* an exception was pending on entry: put it back */
+            PyErr_Restore(type, value, traceback);
+        }
+        return result;
+    }
+    #ifndef CO_OPTIMIZED
+    #define CO_OPTIMIZED 0x0001
+    #endif
+    #ifndef CO_NEWLOCALS
+    #define CO_NEWLOCALS 0x0002
+    #endif
+    #ifndef CO_VARARGS
+    #define CO_VARARGS 0x0004
+    #endif
+    #ifndef CO_VARKEYWORDS
+    #define CO_VARKEYWORDS 0x0008
+    #endif
+    #ifndef CO_ASYNC_GENERATOR
+    #define CO_ASYNC_GENERATOR 0x0200
+    #endif
+    #ifndef CO_GENERATOR
+    #define CO_GENERATOR 0x0020
+    #endif
+    #ifndef CO_COROUTINE
+    #define CO_COROUTINE 0x0080
+    #endif
+#elif PY_VERSION_HEX >= 0x030B0000
+  static CYTHON_INLINE PyCodeObject* __Pyx_PyCode_New(int a, int p, int k, int l, int s, int f, /* PY_VERSION_HEX >= 0x030B0000 variant: forwards to the C-level code constructor; returns NULL on failure. */
+                                                    PyObject *code, PyObject *c, PyObject* n, PyObject *v,
+                                                    PyObject *fv, PyObject *cell, PyObject* fn,
+                                                    PyObject *name, int fline, PyObject *lnos) {
+    PyCodeObject *result;
+    PyObject *empty_bytes = PyBytes_FromStringAndSize("", 0); /* zero-length bytes passed as the constructor's last argument */
+    if (!empty_bytes) return NULL;
+    result =
+      #if PY_VERSION_HEX >= 0x030C0000
+        PyUnstable_Code_NewWithPosOnlyArgs /* constructor name selected from 0x030C0000 on */
+      #else
+        PyCode_NewWithPosOnlyArgs
+      #endif
+        (a, p, k, l, s, f, code, c, n, v, fv, cell, fn, name, name, fline, lnos, empty_bytes); /* name is passed twice in a row */
+    Py_DECREF(empty_bytes); /* drop the local reference whether or not the call succeeded */
+    return result;
+  }
+#elif PY_VERSION_HEX >= 0x030800B2 && !CYTHON_COMPILING_IN_PYPY
+  #define __Pyx_PyCode_New(a, p, k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos)\
+          PyCode_NewWithPosOnlyArgs(a, p, k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos)
+#else
+  #define __Pyx_PyCode_New(a, p, k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos)\
+          PyCode_New(a, k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos)
+#endif
+#endif
+#if PY_VERSION_HEX >= 0x030900A4 || defined(Py_IS_TYPE)
+  #define __Pyx_IS_TYPE(ob, type) Py_IS_TYPE(ob, type)
+#else
+  #define __Pyx_IS_TYPE(ob, type) (((const PyObject*)ob)->ob_type == (type))
+#endif
+#if PY_VERSION_HEX >= 0x030A00B1 || defined(Py_Is)
+  #define __Pyx_Py_Is(x, y)  Py_Is(x, y)
+#else
+  #define __Pyx_Py_Is(x, y) ((x) == (y))
+#endif
+#if PY_VERSION_HEX >= 0x030A00B1 || defined(Py_IsNone)
+  #define __Pyx_Py_IsNone(ob) Py_IsNone(ob)
+#else
+  #define __Pyx_Py_IsNone(ob) __Pyx_Py_Is((ob), Py_None)
+#endif
+#if PY_VERSION_HEX >= 0x030A00B1 || defined(Py_IsTrue)
+  #define __Pyx_Py_IsTrue(ob) Py_IsTrue(ob)
+#else
+  #define __Pyx_Py_IsTrue(ob) __Pyx_Py_Is((ob), Py_True)
+#endif
+#if PY_VERSION_HEX >= 0x030A00B1 || defined(Py_IsFalse)
+  #define __Pyx_Py_IsFalse(ob) Py_IsFalse(ob)
+#else
+  #define __Pyx_Py_IsFalse(ob) __Pyx_Py_Is((ob), Py_False)
+#endif
+#define __Pyx_NoneAsNull(obj)  (__Pyx_Py_IsNone(obj) ? NULL : (obj))
+#if PY_VERSION_HEX >= 0x030900F0 && !CYTHON_COMPILING_IN_PYPY
+  #define __Pyx_PyObject_GC_IsFinalized(o) PyObject_GC_IsFinalized(o)
+#else
+  #define __Pyx_PyObject_GC_IsFinalized(o) _PyGC_FINALIZED(o)
+#endif
+#ifndef CO_COROUTINE
+  #define CO_COROUTINE 0x80
+#endif
+#ifndef CO_ASYNC_GENERATOR
+  #define CO_ASYNC_GENERATOR 0x200
+#endif
+#ifndef Py_TPFLAGS_CHECKTYPES
+  #define Py_TPFLAGS_CHECKTYPES 0
+#endif
+#ifndef Py_TPFLAGS_HAVE_INDEX
+  #define Py_TPFLAGS_HAVE_INDEX 0
+#endif
+#ifndef Py_TPFLAGS_HAVE_NEWBUFFER
+  #define Py_TPFLAGS_HAVE_NEWBUFFER 0
+#endif
+#ifndef Py_TPFLAGS_HAVE_FINALIZE
+  #define Py_TPFLAGS_HAVE_FINALIZE 0
+#endif
+#ifndef Py_TPFLAGS_SEQUENCE
+  #define Py_TPFLAGS_SEQUENCE 0
+#endif
+#ifndef Py_TPFLAGS_MAPPING
+  #define Py_TPFLAGS_MAPPING 0
+#endif
+#ifndef METH_STACKLESS
+  #define METH_STACKLESS 0
+#endif
+#if PY_VERSION_HEX <= 0x030700A3 || !defined(METH_FASTCALL)
+  #ifndef METH_FASTCALL
+     #define METH_FASTCALL 0x80
+  #endif
+  typedef PyObject *(*__Pyx_PyCFunctionFast) (PyObject *self, PyObject *const *args, Py_ssize_t nargs);
+  typedef PyObject *(*__Pyx_PyCFunctionFastWithKeywords) (PyObject *self, PyObject *const *args,
+                                                          Py_ssize_t nargs, PyObject *kwnames);
+#else
+  #if PY_VERSION_HEX >= 0x030d00A4
+  #  define __Pyx_PyCFunctionFast PyCFunctionFast
+  #  define __Pyx_PyCFunctionFastWithKeywords PyCFunctionFastWithKeywords
+  #else
+  #  define __Pyx_PyCFunctionFast _PyCFunctionFast
+  #  define __Pyx_PyCFunctionFastWithKeywords _PyCFunctionFastWithKeywords
+  #endif
+#endif
+#if CYTHON_METH_FASTCALL
+  #define __Pyx_METH_FASTCALL METH_FASTCALL
+  #define __Pyx_PyCFunction_FastCall __Pyx_PyCFunctionFast
+  #define __Pyx_PyCFunction_FastCallWithKeywords __Pyx_PyCFunctionFastWithKeywords
+#else
+  #define __Pyx_METH_FASTCALL METH_VARARGS
+  #define __Pyx_PyCFunction_FastCall PyCFunction
+  #define __Pyx_PyCFunction_FastCallWithKeywords PyCFunctionWithKeywords
+#endif
+#if CYTHON_VECTORCALL
+  #define __pyx_vectorcallfunc vectorcallfunc
+  #define __Pyx_PY_VECTORCALL_ARGUMENTS_OFFSET  PY_VECTORCALL_ARGUMENTS_OFFSET
+  #define __Pyx_PyVectorcall_NARGS(n)  PyVectorcall_NARGS((size_t)(n))
+#elif CYTHON_BACKPORT_VECTORCALL
+  typedef PyObject *(*__pyx_vectorcallfunc)(PyObject *callable, PyObject *const *args,
+                                            size_t nargsf, PyObject *kwnames);
+  #define __Pyx_PY_VECTORCALL_ARGUMENTS_OFFSET  ((size_t)1 << (8 * sizeof(size_t) - 1))
+  #define __Pyx_PyVectorcall_NARGS(n)  ((Py_ssize_t)(((size_t)(n)) & ~__Pyx_PY_VECTORCALL_ARGUMENTS_OFFSET))
+#else
+  #define __Pyx_PY_VECTORCALL_ARGUMENTS_OFFSET  0
+  #define __Pyx_PyVectorcall_NARGS(n)  ((Py_ssize_t)(n))
+#endif
+#if PY_MAJOR_VERSION >= 0x030900B1 /* NOTE(review): PY_MAJOR_VERSION is tested against 3 elsewhere in this file, so this comparison with a PY_VERSION_HEX-style constant looks never true; PY_VERSION_HEX was probably intended - confirm upstream */
+#define __Pyx_PyCFunction_CheckExact(func)  PyCFunction_CheckExact(func)
+#else
+#define __Pyx_PyCFunction_CheckExact(func)  PyCFunction_Check(func) /* fallback: uses PyCFunction_Check in place of the exact check */
+#endif
+#define __Pyx_CyOrPyCFunction_Check(func)  PyCFunction_Check(func)
+#if CYTHON_COMPILING_IN_CPYTHON
+#define __Pyx_CyOrPyCFunction_GET_FUNCTION(func)  (((PyCFunctionObject*)(func))->m_ml->ml_meth)
+#elif !CYTHON_COMPILING_IN_LIMITED_API
+#define __Pyx_CyOrPyCFunction_GET_FUNCTION(func)  PyCFunction_GET_FUNCTION(func)
+#endif
+#if CYTHON_COMPILING_IN_CPYTHON
+#define __Pyx_CyOrPyCFunction_GET_FLAGS(func)  (((PyCFunctionObject*)(func))->m_ml->ml_flags)
+static CYTHON_INLINE PyObject* __Pyx_CyOrPyCFunction_GET_SELF(PyObject *func) { /* Returns func's m_self, or NULL when its ml_flags include METH_STATIC; func is cast to PyCFunctionObject* without a type check. */
+    return (__Pyx_CyOrPyCFunction_GET_FLAGS(func) & METH_STATIC) ? NULL : ((PyCFunctionObject*)func)->m_self;
+}
+#endif
+static CYTHON_INLINE int __Pyx__IsSameCFunction(PyObject *func, void *cfunc) { /* Non-zero when func passes PyCFunction_Check and its C function pointer equals cfunc. */
+#if CYTHON_COMPILING_IN_LIMITED_API
+    return PyCFunction_Check(func) && PyCFunction_GetFunction(func) == (PyCFunction) cfunc; /* function-call accessor in the limited API build */
+#else
+    return PyCFunction_Check(func) && PyCFunction_GET_FUNCTION(func) == (PyCFunction) cfunc; /* upper-case accessor otherwise */
+#endif
+}
+#define __Pyx_IsSameCFunction(func, cfunc)   __Pyx__IsSameCFunction(func, cfunc)
+#if __PYX_LIMITED_VERSION_HEX < 0x030900B1
+  #define __Pyx_PyType_FromModuleAndSpec(m, s, b)  ((void)m, PyType_FromSpecWithBases(s, b))
+  typedef PyObject *(*__Pyx_PyCMethod)(PyObject *, PyTypeObject *, PyObject *const *, size_t, PyObject *);
+#else
+  #define __Pyx_PyType_FromModuleAndSpec(m, s, b)  PyType_FromModuleAndSpec(m, s, b)
+  #define __Pyx_PyCMethod  PyCMethod
+#endif
+#ifndef METH_METHOD
+  #define METH_METHOD 0x200
+#endif
+#if CYTHON_COMPILING_IN_PYPY && !defined(PyObject_Malloc) /* PyPy fallback: map the PyObject_* allocator names onto PyMem_* when PyObject_Malloc is not already a macro */
+  #define PyObject_Malloc(s)   PyMem_Malloc(s)
+  #define PyObject_Free(p)     PyMem_Free(p)
+  #define PyObject_Realloc(p, n)  PyMem_Realloc(p, n) /* realloc takes (pointer, new size); the former one-parameter macro could not expand a valid call */
+#endif
+#if CYTHON_COMPILING_IN_LIMITED_API
+  #define __Pyx_PyCode_HasFreeVars(co)  (PyCode_GetNumFree(co) > 0)
+  #define __Pyx_PyFrame_SetLineNumber(frame, lineno)  /* expands to nothing in this configuration */
+#else
+  #define __Pyx_PyCode_HasFreeVars(co)  (PyCode_GetNumFree(co) > 0)
+  #define __Pyx_PyFrame_SetLineNumber(frame, lineno)  (frame)->f_lineno = (lineno)  /* writes the frame field directly */
+#endif
+#if CYTHON_COMPILING_IN_LIMITED_API  /* choose the expression that yields the current thread state */
+  #define __Pyx_PyThreadState_Current PyThreadState_Get()
+#elif !CYTHON_FAST_THREAD_STATE
+  #define __Pyx_PyThreadState_Current PyThreadState_GET()
+#elif PY_VERSION_HEX >= 0x030d00A1
+  #define __Pyx_PyThreadState_Current PyThreadState_GetUnchecked()
+#elif PY_VERSION_HEX >= 0x03060000
+  #define __Pyx_PyThreadState_Current _PyThreadState_UncheckedGet()
+#elif PY_VERSION_HEX >= 0x03000000
+  #define __Pyx_PyThreadState_Current PyThreadState_GET()
+#else
+  #define __Pyx_PyThreadState_Current _PyThreadState_Current  /* remaining case: read the global variable */
+#endif
+#if CYTHON_COMPILING_IN_LIMITED_API  /* helper is only compiled for Limited API builds */
+static CYTHON_INLINE void *__Pyx_PyModule_GetState(PyObject *op)  /* module state of `op`; a missing state is reported via Py_FatalError */
+{
+    void *state = PyModule_GetState(op);
+    if (state == NULL) {
+        Py_FatalError("Couldn't find the module state");
+    }
+    return state;
+}
+#endif
+#define __Pyx_PyObject_GetSlot(obj, name, func_ctype)  __Pyx_PyType_GetSlot(Py_TYPE(obj), name, func_ctype)  /* slot lookup on the object's type */
+#if CYTHON_COMPILING_IN_LIMITED_API
+  #define __Pyx_PyType_GetSlot(type, name, func_ctype)  ((func_ctype) PyType_GetSlot((type), Py_##name))  /* slot id is formed by pasting Py_ onto the field name */
+#else
+  #define __Pyx_PyType_GetSlot(type, name, func_ctype)  ((type)->name)  /* direct struct field access; func_ctype is unused here */
+#endif
+#if PY_VERSION_HEX < 0x030700A2 && !defined(PyThread_tss_create) && !defined(Py_tss_NEEDS_INIT)  /* backport of PyThread_tss_* on top of the older key-based TLS calls */
+#include "pythread.h"
+#define Py_tss_NEEDS_INIT 0
+typedef int Py_tss_t;
+static CYTHON_INLINE int PyThread_tss_create(Py_tss_t *key) {  /* 0 on success, -1 when no key could be created */
+  int new_key = PyThread_create_key();  /* the legacy call reports failure as -1 */
+  return (new_key == -1) ? -1 : (*key = new_key, 0);  /* on failure *key is left as it was, so it still reads as "not created" */
+}
+static CYTHON_INLINE Py_tss_t * PyThread_tss_alloc(void) {  /* heap-allocates a key in the "needs init" state; NULL if allocation fails */
+  Py_tss_t *key = (Py_tss_t *)PyObject_Malloc(sizeof(Py_tss_t));
+  if (key) *key = Py_tss_NEEDS_INIT;  /* never write through a failed allocation */
+  return key;
+}
+static CYTHON_INLINE void PyThread_tss_free(Py_tss_t *key) {  /* releases storage obtained from PyThread_tss_alloc */
+  PyObject_Free(key);
+}
+static CYTHON_INLINE int PyThread_tss_is_created(Py_tss_t *key) {  /* nonzero once *key differs from Py_tss_NEEDS_INIT */
+  return *key != Py_tss_NEEDS_INIT;
+}
+static CYTHON_INLINE void PyThread_tss_delete(Py_tss_t *key) {  /* deletes the legacy key and resets *key to "needs init" */
+  PyThread_delete_key(*key);
+  *key = Py_tss_NEEDS_INIT;
+}
+static CYTHON_INLINE int PyThread_tss_set(Py_tss_t *key, void *value) {  /* forwards the result of PyThread_set_key_value */
+  return PyThread_set_key_value(*key, value);
+}
+static CYTHON_INLINE void * PyThread_tss_get(Py_tss_t *key) {  /* forwards the result of PyThread_get_key_value */
+  return PyThread_get_key_value(*key);
+}
+#endif
+#if PY_MAJOR_VERSION < 3  /* Python 2 only: supply a PyGILState_Check() */
+    #if CYTHON_COMPILING_IN_PYPY
+        #if PYPY_VERSION_NUM < 0x07030600
+            #if defined(__cplusplus) && __cplusplus >= 201402L  /* pick a compiler-specific deprecation marker for the stub below */
+                [[deprecated("`with nogil:` inside a nogil function will not release the GIL in PyPy2 < 7.3.6")]]
+            #elif defined(__GNUC__) || defined(__clang__)
+                __attribute__ ((__deprecated__("`with nogil:` inside a nogil function will not release the GIL in PyPy2 < 7.3.6")))
+            #elif defined(_MSC_VER)
+                __declspec(deprecated("`with nogil:` inside a nogil function will not release the GIL in PyPy2 < 7.3.6"))
+            #endif
+            static CYTHON_INLINE int PyGILState_Check(void) {
+                return 0;  /* stub: always answers 0 */
+            }
+        #else  // PYPY_VERSION_NUM < 0x07030600
+        #endif  // PYPY_VERSION_NUM < 0x07030600
+    #else
+        static CYTHON_INLINE int PyGILState_Check(void) {
+            PyThreadState * tstate = _PyThreadState_Current;
+            return tstate && (tstate == PyGILState_GetThisThreadState());  /* true iff a current thread state exists and it is this thread's */
+        }
+    #endif
+#endif
+#if CYTHON_COMPILING_IN_CPYTHON && PY_VERSION_HEX < 0x030d0000 || defined(_PyDict_NewPresized)
+#define __Pyx_PyDict_NewPresized(n)  ((n <= 8) ? PyDict_New() : _PyDict_NewPresized(n))  /* sizes up to 8 use a plain PyDict_New() */
+#else
+#define __Pyx_PyDict_NewPresized(n)  PyDict_New()  /* size hint is ignored in this configuration */
+#endif
+#if PY_MAJOR_VERSION >= 3 || CYTHON_FUTURE_DIVISION  /* "/" maps to true division here */
+  #define __Pyx_PyNumber_Divide(x,y)         PyNumber_TrueDivide(x,y)
+  #define __Pyx_PyNumber_InPlaceDivide(x,y)  PyNumber_InPlaceTrueDivide(x,y)
+#else
+  #define __Pyx_PyNumber_Divide(x,y)         PyNumber_Divide(x,y)
+  #define __Pyx_PyNumber_InPlaceDivide(x,y)  PyNumber_InPlaceDivide(x,y)
+#endif
+#if CYTHON_COMPILING_IN_CPYTHON && PY_VERSION_HEX > 0x030600B4 && PY_VERSION_HEX < 0x030d0000 && CYTHON_USE_UNICODE_INTERNALS
+#define __Pyx_PyDict_GetItemStrWithError(dict, name)  _PyDict_GetItem_KnownHash(dict, name, ((PyASCIIObject *) name)->hash)  /* passes the hash field stored in the string object */
+static CYTHON_INLINE PyObject * __Pyx_PyDict_GetItemStr(PyObject *dict, PyObject *name) {  /* same lookup, but PyErr_Clear() is called whenever NULL comes back */
+    PyObject *res = __Pyx_PyDict_GetItemStrWithError(dict, name);
+    if (res == NULL) PyErr_Clear();
+    return res;
+}
+#elif PY_MAJOR_VERSION >= 3 && (!CYTHON_COMPILING_IN_PYPY || PYPY_VERSION_NUM >= 0x07020000)
+#define __Pyx_PyDict_GetItemStrWithError  PyDict_GetItemWithError
+#define __Pyx_PyDict_GetItemStr           PyDict_GetItem
+#else
+static CYTHON_INLINE PyObject * __Pyx_PyDict_GetItemStrWithError(PyObject *dict, PyObject *name) {  /* fallback for the remaining configurations */
+#if CYTHON_COMPILING_IN_PYPY
+    return PyDict_GetItem(dict, name);
+#else
+    PyDictEntry *ep;
+    PyDictObject *mp = (PyDictObject*) dict;
+    long hash = ((PyStringObject *) name)->ob_shash;
+    assert(hash != -1);  /* the string's stored hash is expected to be set already */
+    ep = (mp->ma_lookup)(mp, name, hash);  /* calls the dict's own lookup function pointer */
+    if (ep == NULL) {
+        return NULL;
+    }
+    return ep->me_value;  /* value field of the entry the lookup returned */
+#endif
+}
+#define __Pyx_PyDict_GetItemStr           PyDict_GetItem
+#endif
+#if CYTHON_USE_TYPE_SLOTS  /* direct PyTypeObject field access is permitted */
+  #define __Pyx_PyType_GetFlags(tp)   (((PyTypeObject *)tp)->tp_flags)
+  #define __Pyx_PyType_HasFeature(type, feature)  ((__Pyx_PyType_GetFlags(type) & (feature)) != 0)
+  #define __Pyx_PyObject_GetIterNextFunc(obj)  (Py_TYPE(obj)->tp_iternext)
+#else
+  #define __Pyx_PyType_GetFlags(tp)   (PyType_GetFlags((PyTypeObject *)tp))
+  #define __Pyx_PyType_HasFeature(type, feature)  PyType_HasFeature(type, feature)
+  #define __Pyx_PyObject_GetIterNextFunc(obj)  PyIter_Next  /* obj is ignored; the generic PyIter_Next is used */
+#endif
+#if CYTHON_COMPILING_IN_LIMITED_API
+  #define __Pyx_SetItemOnTypeDict(tp, k, v) PyObject_GenericSetAttr((PyObject*)tp, k, v)
+#else
+  #define __Pyx_SetItemOnTypeDict(tp, k, v) PyDict_SetItem(tp->tp_dict, k, v)  /* stores straight into tp_dict */
+#endif
+#if CYTHON_USE_TYPE_SPECS && PY_VERSION_HEX >= 0x03080000  /* in this configuration the type is released together with the instance */
+#define __Pyx_PyHeapTypeObject_GC_Del(obj)  {\
+    PyTypeObject *type = Py_TYPE((PyObject*)obj);\
+    assert(__Pyx_PyType_HasFeature(type, Py_TPFLAGS_HEAPTYPE));\
+    PyObject_GC_Del(obj);\
+    Py_DECREF(type);\
+}  /* frees the object first, then drops one reference to its type */
+#else
+#define __Pyx_PyHeapTypeObject_GC_Del(obj)  PyObject_GC_Del(obj)
+#endif
+#if CYTHON_COMPILING_IN_LIMITED_API  /* variant 1: function-call accessors only */
+  #define CYTHON_PEP393_ENABLED 1
+  #define __Pyx_PyUnicode_READY(op)       (0)
+  #define __Pyx_PyUnicode_GET_LENGTH(u)   PyUnicode_GetLength(u)
+  #define __Pyx_PyUnicode_READ_CHAR(u, i) PyUnicode_ReadChar(u, i)
+  #define __Pyx_PyUnicode_MAX_CHAR_VALUE(u)   ((void)u, 1114111U)  /* constant upper bound; u is only evaluated */
+  #define __Pyx_PyUnicode_KIND(u)         ((void)u, (0))
+  #define __Pyx_PyUnicode_DATA(u)         ((void*)u)  /* the object pointer itself stands in for the data pointer */
+  #define __Pyx_PyUnicode_READ(k, d, i)   ((void)k, PyUnicode_ReadChar((PyObject*)(d), i))  /* d is cast back to the object, matching the DATA macro above */
+  #define __Pyx_PyUnicode_IS_TRUE(u)      (0 != PyUnicode_GetLength(u))
+#elif PY_VERSION_HEX > 0x03030000 && defined(PyUnicode_KIND)  /* variant 2: PyUnicode_KIND is available as a macro */
+  #define CYTHON_PEP393_ENABLED 1
+  #if PY_VERSION_HEX >= 0x030C0000
+    #define __Pyx_PyUnicode_READY(op)       (0)
+  #else
+    #define __Pyx_PyUnicode_READY(op)       (likely(PyUnicode_IS_READY(op)) ?\
+                                                0 : _PyUnicode_Ready((PyObject *)(op)))
+  #endif
+  #define __Pyx_PyUnicode_GET_LENGTH(u)   PyUnicode_GET_LENGTH(u)
+  #define __Pyx_PyUnicode_READ_CHAR(u, i) PyUnicode_READ_CHAR(u, i)
+  #define __Pyx_PyUnicode_MAX_CHAR_VALUE(u)   PyUnicode_MAX_CHAR_VALUE(u)
+  #define __Pyx_PyUnicode_KIND(u)         ((int)PyUnicode_KIND(u))
+  #define __Pyx_PyUnicode_DATA(u)         PyUnicode_DATA(u)
+  #define __Pyx_PyUnicode_READ(k, d, i)   PyUnicode_READ(k, d, i)
+  #define __Pyx_PyUnicode_WRITE(k, d, i, ch)  PyUnicode_WRITE(k, d, i, (Py_UCS4) ch)
+  #if PY_VERSION_HEX >= 0x030C0000
+    #define __Pyx_PyUnicode_IS_TRUE(u)      (0 != PyUnicode_GET_LENGTH(u))
+  #else
+    #if CYTHON_COMPILING_IN_CPYTHON && PY_VERSION_HEX >= 0x03090000
+    #define __Pyx_PyUnicode_IS_TRUE(u)      (0 != (likely(PyUnicode_IS_READY(u)) ? PyUnicode_GET_LENGTH(u) : ((PyCompactUnicodeObject *)(u))->wstr_length))  /* not-ready strings: length comes from wstr_length */
+    #else
+    #define __Pyx_PyUnicode_IS_TRUE(u)      (0 != (likely(PyUnicode_IS_READY(u)) ? PyUnicode_GET_LENGTH(u) : PyUnicode_GET_SIZE(u)))
+    #endif
+  #endif
+#else  /* variant 3: Py_UNICODE buffer access */
+  #define CYTHON_PEP393_ENABLED 0
+  #define PyUnicode_1BYTE_KIND  1
+  #define PyUnicode_2BYTE_KIND  2
+  #define PyUnicode_4BYTE_KIND  4
+  #define __Pyx_PyUnicode_READY(op)       (0)
+  #define __Pyx_PyUnicode_GET_LENGTH(u)   PyUnicode_GET_SIZE(u)
+  #define __Pyx_PyUnicode_READ_CHAR(u, i) ((Py_UCS4)(PyUnicode_AS_UNICODE(u)[i]))
+  #define __Pyx_PyUnicode_MAX_CHAR_VALUE(u)   ((sizeof(Py_UNICODE) == 2) ? 65535U : 1114111U)
+  #define __Pyx_PyUnicode_KIND(u)         ((int)sizeof(Py_UNICODE))  /* "kind" is simply the width of Py_UNICODE */
+  #define __Pyx_PyUnicode_DATA(u)         ((void*)PyUnicode_AS_UNICODE(u))
+  #define __Pyx_PyUnicode_READ(k, d, i)   ((void)(k), (Py_UCS4)(((Py_UNICODE*)d)[i]))
+  #define __Pyx_PyUnicode_WRITE(k, d, i, ch)  (((void)(k)), ((Py_UNICODE*)d)[i] = (Py_UNICODE) ch)
+  #define __Pyx_PyUnicode_IS_TRUE(u)      (0 != PyUnicode_GET_SIZE(u))
+#endif
+#if CYTHON_COMPILING_IN_PYPY  /* PyPy: both concat flavours go through the generic PyNumber_Add */
+  #define __Pyx_PyUnicode_Concat(a, b)      PyNumber_Add(a, b)
+  #define __Pyx_PyUnicode_ConcatSafe(a, b)  PyNumber_Add(a, b)
+#else
+  #define __Pyx_PyUnicode_Concat(a, b)      PyUnicode_Concat(a, b)
+  #define __Pyx_PyUnicode_ConcatSafe(a, b)  ((unlikely((a) == Py_None) || unlikely((b) == Py_None)) ?\
+      PyNumber_Add(a, b) : __Pyx_PyUnicode_Concat(a, b))  /* a None operand is routed to PyNumber_Add instead of PyUnicode_Concat */
+#endif
+#if CYTHON_COMPILING_IN_PYPY  /* substitutes for API names that are not defined as macros under PyPy */
+  #if !defined(PyUnicode_DecodeUnicodeEscape)
+    #define PyUnicode_DecodeUnicodeEscape(s, size, errors)  PyUnicode_Decode(s, size, "unicode_escape", errors)
+  #endif
+  #if !defined(PyUnicode_Contains) || (PY_MAJOR_VERSION == 2 && PYPY_VERSION_NUM < 0x07030500)
+    #undef PyUnicode_Contains
+    #define PyUnicode_Contains(u, s)  PySequence_Contains(u, s)
+  #endif
+  #if !defined(PyByteArray_Check)
+    #define PyByteArray_Check(obj)  PyObject_TypeCheck(obj, &PyByteArray_Type)
+  #endif
+  #if !defined(PyObject_Format)
+    #define PyObject_Format(obj, fmt)  PyObject_CallMethod(obj, "__format__", "O", fmt)  /* calls the __format__ method by name */
+  #endif
+#endif
+#define __Pyx_PyString_FormatSafe(a, b)   ((unlikely((a) == Py_None || (PyString_Check(b) && !PyString_CheckExact(b)))) ? PyNumber_Remainder(a, b) : __Pyx_PyString_Format(a, b))  /* None on the left or a string subclass on the right goes through generic % */
+#define __Pyx_PyUnicode_FormatSafe(a, b)  ((unlikely((a) == Py_None || (PyUnicode_Check(b) && !PyUnicode_CheckExact(b)))) ? PyNumber_Remainder(a, b) : PyUnicode_Format(a, b))  /* same rule for unicode operands */
+#if PY_MAJOR_VERSION >= 3
+  #define __Pyx_PyString_Format(a, b)  PyUnicode_Format(a, b)
+#else
+  #define __Pyx_PyString_Format(a, b)  PyString_Format(a, b)
+#endif
+#if PY_MAJOR_VERSION < 3 && !defined(PyObject_ASCII)
+  #define PyObject_ASCII(o)            PyObject_Repr(o)
+#endif
+#if PY_MAJOR_VERSION >= 3  /* map the PyString_* names onto their PyUnicode_* counterparts */
+  #define PyBaseString_Type            PyUnicode_Type
+  #define PyStringObject               PyUnicodeObject
+  #define PyString_Type                PyUnicode_Type
+  #define PyString_Check               PyUnicode_Check
+  #define PyString_CheckExact          PyUnicode_CheckExact
+#ifndef PyObject_Unicode
+  #define PyObject_Unicode             PyObject_Str
+#endif
+#endif
+#if PY_MAJOR_VERSION >= 3
+  #define __Pyx_PyBaseString_Check(obj) PyUnicode_Check(obj)
+  #define __Pyx_PyBaseString_CheckExact(obj) PyUnicode_CheckExact(obj)
+#else
+  #define __Pyx_PyBaseString_Check(obj) (PyString_Check(obj) || PyUnicode_Check(obj))  /* either string type counts */
+  #define __Pyx_PyBaseString_CheckExact(obj) (PyString_CheckExact(obj) || PyUnicode_CheckExact(obj))
+#endif
+#if CYTHON_COMPILING_IN_CPYTHON
+  #define __Pyx_PySequence_ListKeepNew(obj)\
+    (likely(PyList_CheckExact(obj) && Py_REFCNT(obj) == 1) ? __Pyx_NewRef(obj) : PySequence_List(obj))  /* an exact list with refcount 1 is returned itself (new reference) rather than copied */
+#else
+  #define __Pyx_PySequence_ListKeepNew(obj)  PySequence_List(obj)
+#endif
+#ifndef PySet_CheckExact
+  #define PySet_CheckExact(obj)        __Pyx_IS_TYPE(obj, &PySet_Type)
+#endif
+#if PY_VERSION_HEX >= 0x030900A4
+  #define __Pyx_SET_REFCNT(obj, refcnt) Py_SET_REFCNT(obj, refcnt)
+  #define __Pyx_SET_SIZE(obj, size) Py_SET_SIZE(obj, size)
+#else
+  #define __Pyx_SET_REFCNT(obj, refcnt) Py_REFCNT(obj) = (refcnt)  /* older headers: assign through the accessor macro */
+  #define __Pyx_SET_SIZE(obj, size) Py_SIZE(obj) = (size)
+#endif
+#if CYTHON_ASSUME_SAFE_MACROS  /* macro accessors */
+  #define __Pyx_PySequence_ITEM(o, i) PySequence_ITEM(o, i)
+  #define __Pyx_PySequence_SIZE(seq)  Py_SIZE(seq)
+  #define __Pyx_PyTuple_SET_ITEM(o, i, v) (PyTuple_SET_ITEM(o, i, v), (0))  /* yields 0 so the expression has an int result like PyTuple_SetItem in the other branch */
+  #define __Pyx_PyList_SET_ITEM(o, i, v) (PyList_SET_ITEM(o, i, v), (0))
+  #define __Pyx_PyTuple_GET_SIZE(o) PyTuple_GET_SIZE(o)
+  #define __Pyx_PyList_GET_SIZE(o) PyList_GET_SIZE(o)
+  #define __Pyx_PySet_GET_SIZE(o) PySet_GET_SIZE(o)
+  #define __Pyx_PyBytes_GET_SIZE(o) PyBytes_GET_SIZE(o)
+  #define __Pyx_PyByteArray_GET_SIZE(o) PyByteArray_GET_SIZE(o)
+#else  /* function-call equivalents */
+  #define __Pyx_PySequence_ITEM(o, i) PySequence_GetItem(o, i)
+  #define __Pyx_PySequence_SIZE(seq)  PySequence_Size(seq)
+  #define __Pyx_PyTuple_SET_ITEM(o, i, v) PyTuple_SetItem(o, i, v)
+  #define __Pyx_PyList_SET_ITEM(o, i, v) PyList_SetItem(o, i, v)
+  #define __Pyx_PyTuple_GET_SIZE(o) PyTuple_Size(o)
+  #define __Pyx_PyList_GET_SIZE(o) PyList_Size(o)
+  #define __Pyx_PySet_GET_SIZE(o) PySet_Size(o)
+  #define __Pyx_PyBytes_GET_SIZE(o) PyBytes_Size(o)
+  #define __Pyx_PyByteArray_GET_SIZE(o) PyByteArray_Size(o)
+#endif
+#if __PYX_LIMITED_VERSION_HEX >= 0x030d00A1
+  #define __Pyx_PyImport_AddModuleRef(name) PyImport_AddModuleRef(name)
+#else
+  static CYTHON_INLINE PyObject *__Pyx_PyImport_AddModuleRef(const char *name) {  /* PyImport_AddModule result with one extra reference taken; NULL is passed through */
+      PyObject *mod = PyImport_AddModule(name);
+      if (mod != NULL) Py_INCREF(mod);
+      return mod;
+  }
+#endif
+#if PY_MAJOR_VERSION >= 3  /* map the PyInt_* names onto their PyLong_* counterparts */
+  #define PyIntObject                  PyLongObject
+  #define PyInt_Type                   PyLong_Type
+  #define PyInt_Check(op)              PyLong_Check(op)
+  #define PyInt_CheckExact(op)         PyLong_CheckExact(op)
+  #define __Pyx_Py3Int_Check(op)       PyLong_Check(op)
+  #define __Pyx_Py3Int_CheckExact(op)  PyLong_CheckExact(op)
+  #define PyInt_FromString             PyLong_FromString
+  #define PyInt_FromUnicode            PyLong_FromUnicode
+  #define PyInt_FromLong               PyLong_FromLong
+  #define PyInt_FromSize_t             PyLong_FromSize_t
+  #define PyInt_FromSsize_t            PyLong_FromSsize_t
+  #define PyInt_AsLong                 PyLong_AsLong
+  #define PyInt_AS_LONG                PyLong_AS_LONG
+  #define PyInt_AsSsize_t              PyLong_AsSsize_t
+  #define PyInt_AsUnsignedLongMask     PyLong_AsUnsignedLongMask
+  #define PyInt_AsUnsignedLongLongMask PyLong_AsUnsignedLongLongMask
+  #define PyNumber_Int                 PyNumber_Long
+#else
+  #define __Pyx_Py3Int_Check(op)       (PyLong_Check(op) || PyInt_Check(op))  /* either integer type counts */
+  #define __Pyx_Py3Int_CheckExact(op)  (PyLong_CheckExact(op) || PyInt_CheckExact(op))
+#endif
+#if PY_MAJOR_VERSION >= 3
+  #define PyBoolObject                 PyLongObject
+#endif
+#if PY_MAJOR_VERSION >= 3 && CYTHON_COMPILING_IN_PYPY
+  #ifndef PyUnicode_InternFromString
+    #define PyUnicode_InternFromString(s) PyUnicode_FromString(s)  /* substitute used when PyPy defines no macro of this name */
+  #endif
+#endif
+#if PY_VERSION_HEX < 0x030200A4  /* headers this old get a local Py_hash_t typedef */
+  typedef long Py_hash_t;
+  #define __Pyx_PyInt_FromHash_t PyInt_FromLong
+  #define __Pyx_PyInt_AsHash_t   __Pyx_PyIndex_AsHash_t
+#else
+  #define __Pyx_PyInt_FromHash_t PyInt_FromSsize_t
+  #define __Pyx_PyInt_AsHash_t   __Pyx_PyIndex_AsSsize_t
+#endif
+#if CYTHON_USE_ASYNC_SLOTS
+  #if PY_VERSION_HEX >= 0x030500B1
+    #define __Pyx_PyAsyncMethodsStruct PyAsyncMethods
+    #define __Pyx_PyType_AsAsync(obj) (Py_TYPE(obj)->tp_as_async)
+  #else
+    #define __Pyx_PyType_AsAsync(obj) ((__Pyx_PyAsyncMethodsStruct*) (Py_TYPE(obj)->tp_reserved))  /* older layout: the tp_reserved field is reinterpreted */
+  #endif
+#else
+  #define __Pyx_PyType_AsAsync(obj) NULL
+#endif
+#ifndef __Pyx_PyAsyncMethodsStruct  /* no alias was defined above, so declare a local struct */
+    typedef struct {
+        unaryfunc am_await;
+        unaryfunc am_aiter;
+        unaryfunc am_anext;
+    } __Pyx_PyAsyncMethodsStruct;
+#endif
+
+#if defined(_WIN32) || defined(WIN32) || defined(MS_WINDOWS)
+  #if !defined(_USE_MATH_DEFINES)
+    #define _USE_MATH_DEFINES  /* defined before <math.h> is included below */
+  #endif
+#endif
+#include <math.h>
+#ifdef NAN
+#define __PYX_NAN() ((float) NAN)
+#else
+static CYTHON_INLINE float __PYX_NAN() {  /* fallback when <math.h> provides no NAN macro */
+  float value;
+  memset(&value, 0xFF, sizeof(value));  /* fills every byte of the float with 0xFF */
+  return value;
+}
+#endif
+#if defined(__CYGWIN__) && defined(_LDBL_EQ_DBL)
+#define __Pyx_truncl trunc  /* Cygwin with _LDBL_EQ_DBL: use the double version */
+#else
+#define __Pyx_truncl truncl
+#endif
+
+#define __PYX_MARK_ERR_POS(f_index, lineno) \
+    { __pyx_filename = __pyx_f[f_index]; (void)__pyx_filename; __pyx_lineno = lineno; (void)__pyx_lineno; __pyx_clineno = __LINE__;  (void)__pyx_clineno; }  /* records source file, source line and the C line of the expansion site */
+#define __PYX_ERR(f_index, lineno, Ln_error) \
+    { __PYX_MARK_ERR_POS(f_index, lineno) goto Ln_error; }  /* records the position, then jumps to the given label */
+
+#ifdef CYTHON_EXTERN_C
+    #undef __PYX_EXTERN_C
+    #define __PYX_EXTERN_C CYTHON_EXTERN_C  /* a user-supplied CYTHON_EXTERN_C takes precedence */
+#elif defined(__PYX_EXTERN_C)
+    #ifdef _MSC_VER
+    #pragma message ("Please do not define the '__PYX_EXTERN_C' macro externally. Use 'CYTHON_EXTERN_C' instead.")
+    #else
+    #warning Please do not define the '__PYX_EXTERN_C' macro externally. Use 'CYTHON_EXTERN_C' instead.
+    #endif
+#else
+    #define __PYX_EXTERN_C extern "C++"  /* default linkage specifier */
+#endif
+
+#define __PYX_HAVE__fairseq__data__data_utils_fast
+#define __PYX_HAVE_API__fairseq__data__data_utils_fast
+/* Early includes */
+#include <string.h>
+#include <stdio.h>
+
+    /* Using NumPy API declarations from "numpy/__init__.cython-30.pxd" */
+    
+#include "numpy/arrayobject.h"
+#include "numpy/ndarrayobject.h"
+#include "numpy/ndarraytypes.h"
+#include "numpy/arrayscalars.h"
+#include "numpy/ufuncobject.h"
+#include "pythread.h"
+#include <stdlib.h>
+#ifdef _OPENMP
+#include <omp.h>
+#endif /* _OPENMP */
+
+#if defined(PYREX_WITHOUT_ASSERTIONS) && !defined(CYTHON_WITHOUT_ASSERTIONS)
+#define CYTHON_WITHOUT_ASSERTIONS
+#endif
+
+typedef struct {PyObject **p; const char *s; const Py_ssize_t n; const char* encoding;
+                const char is_unicode; const char is_str; const char intern; } __Pyx_StringTabEntry;  /* NOTE(review): presumably one row of the module's string-constant table; the consumer is outside this chunk */
+
+#define __PYX_DEFAULT_STRING_ENCODING_IS_ASCII 0
+#define __PYX_DEFAULT_STRING_ENCODING_IS_UTF8 0
+#define __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT (PY_MAJOR_VERSION >= 3 && __PYX_DEFAULT_STRING_ENCODING_IS_UTF8)
+#define __PYX_DEFAULT_STRING_ENCODING ""
+#define __Pyx_PyObject_FromString __Pyx_PyBytes_FromString  /* generic char* to object conversion yields bytes */
+#define __Pyx_PyObject_FromStringAndSize __Pyx_PyBytes_FromStringAndSize
+#define __Pyx_uchar_cast(c) ((unsigned char)c)
+#define __Pyx_long_cast(x) ((long)x)
+#define __Pyx_fits_Py_ssize_t(v, type, is_signed)  (\
+    (sizeof(type) < sizeof(Py_ssize_t))  ||\
+    (sizeof(type) > sizeof(Py_ssize_t) &&\
+          likely(v < (type)PY_SSIZE_T_MAX ||\
+                 v == (type)PY_SSIZE_T_MAX)  &&\
+          (!is_signed || likely(v > (type)PY_SSIZE_T_MIN ||\
+                                v == (type)PY_SSIZE_T_MIN)))  ||\
+    (sizeof(type) == sizeof(Py_ssize_t) &&\
+          (is_signed || likely(v < (type)PY_SSIZE_T_MAX ||\
+                               v == (type)PY_SSIZE_T_MAX)))  )  /* narrower types always fit; wider ones are range-checked; same-width unsigned only against the maximum */
+static CYTHON_INLINE int __Pyx_is_valid_index(Py_ssize_t i, Py_ssize_t limit) {  /* nonzero iff i, viewed as size_t, is below limit viewed as size_t */
+    return ((size_t) limit > (size_t) i) ? 1 : 0;  /* both operands are compared as unsigned, so one test covers the lower and upper bound */
+}
+#if defined (__cplusplus) && __cplusplus >= 201103L  /* pick an absolute-value routine for Py_ssize_t-sized values */
+    #include <cstdlib>
+    #define __Pyx_sst_abs(value) std::abs(value)
+#elif SIZEOF_INT >= SIZEOF_SIZE_T
+    #define __Pyx_sst_abs(value) abs(value)
+#elif SIZEOF_LONG >= SIZEOF_SIZE_T
+    #define __Pyx_sst_abs(value) labs(value)
+#elif defined (_MSC_VER)
+    #define __Pyx_sst_abs(value) ((Py_ssize_t)_abs64(value))
+#elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+    #define __Pyx_sst_abs(value) llabs(value)
+#elif defined (__GNUC__)
+    #define __Pyx_sst_abs(value) __builtin_llabs(value)
+#else
+    #define __Pyx_sst_abs(value) (((value) < 0) ? -(value) : (value))  /* argument parenthesised so compound expressions negate correctly */
+#endif
+static CYTHON_INLINE Py_ssize_t __Pyx_ssize_strlen(const char *s);  /* declarations only; the definitions are not in this part of the file */
+static CYTHON_INLINE const char* __Pyx_PyObject_AsString(PyObject*);
+static CYTHON_INLINE const char* __Pyx_PyObject_AsStringAndSize(PyObject*, Py_ssize_t* length);
+static CYTHON_INLINE PyObject* __Pyx_PyByteArray_FromString(const char*);
+#define __Pyx_PyByteArray_FromStringAndSize(s, l) PyByteArray_FromStringAndSize((const char*)s, l)
+#define __Pyx_PyBytes_FromString        PyBytes_FromString
+#define __Pyx_PyBytes_FromStringAndSize PyBytes_FromStringAndSize
+static CYTHON_INLINE PyObject* __Pyx_PyUnicode_FromString(const char*);
+#if PY_MAJOR_VERSION < 3  /* "Str" means bytes on Python 2 and unicode on Python 3 */
+    #define __Pyx_PyStr_FromString        __Pyx_PyBytes_FromString
+    #define __Pyx_PyStr_FromStringAndSize __Pyx_PyBytes_FromStringAndSize
+#else
+    #define __Pyx_PyStr_FromString        __Pyx_PyUnicode_FromString
+    #define __Pyx_PyStr_FromStringAndSize __Pyx_PyUnicode_FromStringAndSize
+#endif
+#define __Pyx_PyBytes_AsWritableString(s)     ((char*) PyBytes_AS_STRING(s))
+#define __Pyx_PyBytes_AsWritableSString(s)    ((signed char*) PyBytes_AS_STRING(s))
+#define __Pyx_PyBytes_AsWritableUString(s)    ((unsigned char*) PyBytes_AS_STRING(s))
+#define __Pyx_PyBytes_AsString(s)     ((const char*) PyBytes_AS_STRING(s))
+#define __Pyx_PyBytes_AsSString(s)    ((const signed char*) PyBytes_AS_STRING(s))
+#define __Pyx_PyBytes_AsUString(s)    ((const unsigned char*) PyBytes_AS_STRING(s))
+#define __Pyx_PyObject_AsWritableString(s)    ((char*)(__pyx_uintptr_t) __Pyx_PyObject_AsString(s))  /* const is removed by casting through an integer type */
+#define __Pyx_PyObject_AsWritableSString(s)    ((signed char*)(__pyx_uintptr_t) __Pyx_PyObject_AsString(s))
+#define __Pyx_PyObject_AsWritableUString(s)    ((unsigned char*)(__pyx_uintptr_t) __Pyx_PyObject_AsString(s))
+#define __Pyx_PyObject_AsSString(s)    ((const signed char*) __Pyx_PyObject_AsString(s))
+#define __Pyx_PyObject_AsUString(s)    ((const unsigned char*) __Pyx_PyObject_AsString(s))
+#define __Pyx_PyObject_FromCString(s)  __Pyx_PyObject_FromString((const char*)s)
+#define __Pyx_PyBytes_FromCString(s)   __Pyx_PyBytes_FromString((const char*)s)
+#define __Pyx_PyByteArray_FromCString(s)   __Pyx_PyByteArray_FromString((const char*)s)
+#define __Pyx_PyStr_FromCString(s)     __Pyx_PyStr_FromString((const char*)s)
+#define __Pyx_PyUnicode_FromCString(s) __Pyx_PyUnicode_FromString((const char*)s)
+#define __Pyx_PyUnicode_FromOrdinal(o)       PyUnicode_FromOrdinal((int)o)
+#define __Pyx_PyUnicode_AsUnicode            PyUnicode_AsUnicode
+#define __Pyx_NewRef(obj) (Py_INCREF(obj), obj)  /* increments, then yields obj; note that obj is expanded twice */
+#define __Pyx_Owned_Py_None(b) __Pyx_NewRef(Py_None)  /* b is ignored */
+static CYTHON_INLINE PyObject * __Pyx_PyBool_FromLong(long b);
+static CYTHON_INLINE int __Pyx_PyObject_IsTrue(PyObject*);
+static CYTHON_INLINE int __Pyx_PyObject_IsTrueAndDecref(PyObject*);
+static CYTHON_INLINE PyObject* __Pyx_PyNumber_IntOrLong(PyObject* x);
+#define __Pyx_PySequence_Tuple(obj)\
+    (likely(PyTuple_CheckExact(obj)) ? __Pyx_NewRef(obj) : PySequence_Tuple(obj))  /* an exact tuple is returned itself (new reference) instead of being copied */
+static CYTHON_INLINE Py_ssize_t __Pyx_PyIndex_AsSsize_t(PyObject*);
+static CYTHON_INLINE PyObject * __Pyx_PyInt_FromSize_t(size_t);
+static CYTHON_INLINE Py_hash_t __Pyx_PyIndex_AsHash_t(PyObject*);
+#if CYTHON_ASSUME_SAFE_MACROS
+#define __pyx_PyFloat_AsDouble(x) (PyFloat_CheckExact(x) ? PyFloat_AS_DOUBLE(x) : PyFloat_AsDouble(x))  /* exact floats are read via the macro accessor */
+#else
+#define __pyx_PyFloat_AsDouble(x) PyFloat_AsDouble(x)
+#endif
+#define __pyx_PyFloat_AsFloat(x) ((float) __pyx_PyFloat_AsDouble(x))
+#if PY_MAJOR_VERSION >= 3
+#define __Pyx_PyNumber_Int(x) (PyLong_CheckExact(x) ? __Pyx_NewRef(x) : PyNumber_Long(x))  /* an exact int is returned itself (new reference) */
+#else
+#define __Pyx_PyNumber_Int(x) (PyInt_CheckExact(x) ? __Pyx_NewRef(x) : PyNumber_Int(x))
+#endif
+#if CYTHON_USE_PYLONG_INTERNALS  /* accessors that read PyLongObject fields directly */
+  #if PY_VERSION_HEX >= 0x030C00A7  /* layout with long_value.lv_tag */
+  #ifndef _PyLong_SIGN_MASK
+    #define _PyLong_SIGN_MASK 3
+  #endif
+  #ifndef _PyLong_NON_SIZE_BITS
+    #define _PyLong_NON_SIZE_BITS 3
+  #endif
+  #define __Pyx_PyLong_Sign(x)  (((PyLongObject*)x)->long_value.lv_tag & _PyLong_SIGN_MASK)  /* low tag bits selected by the sign mask */
+  #define __Pyx_PyLong_IsNeg(x)  ((__Pyx_PyLong_Sign(x) & 2) != 0)
+  #define __Pyx_PyLong_IsNonNeg(x)  (!__Pyx_PyLong_IsNeg(x))
+  #define __Pyx_PyLong_IsZero(x)  (__Pyx_PyLong_Sign(x) & 1)
+  #define __Pyx_PyLong_IsPos(x)  (__Pyx_PyLong_Sign(x) == 0)
+  #define __Pyx_PyLong_CompactValueUnsigned(x)  (__Pyx_PyLong_Digits(x)[0])
+  #define __Pyx_PyLong_DigitCount(x)  ((Py_ssize_t) (((PyLongObject*)x)->long_value.lv_tag >> _PyLong_NON_SIZE_BITS))  /* digit count is the tag with the non-size bits shifted out */
+  #define __Pyx_PyLong_SignedDigitCount(x)\
+        ((1 - (Py_ssize_t) __Pyx_PyLong_Sign(x)) * __Pyx_PyLong_DigitCount(x))
+  #if defined(PyUnstable_Long_IsCompact) && defined(PyUnstable_Long_CompactValue)
+    #define __Pyx_PyLong_IsCompact(x)     PyUnstable_Long_IsCompact((PyLongObject*) x)
+    #define __Pyx_PyLong_CompactValue(x)  PyUnstable_Long_CompactValue((PyLongObject*) x)
+  #else
+    #define __Pyx_PyLong_IsCompact(x)     (((PyLongObject*)x)->long_value.lv_tag < (2 << _PyLong_NON_SIZE_BITS))  /* tag below the encoding of two digits */
+    #define __Pyx_PyLong_CompactValue(x)  ((1 - (Py_ssize_t) __Pyx_PyLong_Sign(x)) * (Py_ssize_t) __Pyx_PyLong_Digits(x)[0])  /* sign factor (1 - sign bits) times the first digit */
+  #endif
+  typedef Py_ssize_t  __Pyx_compact_pylong;
+  typedef size_t  __Pyx_compact_upylong;
+  #else  /* older layout: sign and digit count both come from Py_SIZE */
+  #define __Pyx_PyLong_IsNeg(x)  (Py_SIZE(x) < 0)
+  #define __Pyx_PyLong_IsNonNeg(x)  (Py_SIZE(x) >= 0)
+  #define __Pyx_PyLong_IsZero(x)  (Py_SIZE(x) == 0)
+  #define __Pyx_PyLong_IsPos(x)  (Py_SIZE(x) > 0)
+  #define __Pyx_PyLong_CompactValueUnsigned(x)  ((Py_SIZE(x) == 0) ? 0 : __Pyx_PyLong_Digits(x)[0])
+  #define __Pyx_PyLong_DigitCount(x)  __Pyx_sst_abs(Py_SIZE(x))
+  #define __Pyx_PyLong_SignedDigitCount(x)  Py_SIZE(x)
+  #define __Pyx_PyLong_IsCompact(x)  (Py_SIZE(x) == 0 || Py_SIZE(x) == 1 || Py_SIZE(x) == -1)
+  #define __Pyx_PyLong_CompactValue(x)\
+        ((Py_SIZE(x) == 0) ? (sdigit) 0 : ((Py_SIZE(x) < 0) ? -(sdigit)__Pyx_PyLong_Digits(x)[0] : (sdigit)__Pyx_PyLong_Digits(x)[0]))
+  typedef sdigit  __Pyx_compact_pylong;
+  typedef digit  __Pyx_compact_upylong;
+  #endif
+  #if PY_VERSION_HEX >= 0x030C00A5  /* where the digit array lives depends on the version */
+  #define __Pyx_PyLong_Digits(x)  (((PyLongObject*)x)->long_value.ob_digit)
+  #else
+  #define __Pyx_PyLong_Digits(x)  (((PyLongObject*)x)->ob_digit)
+  #endif
+#endif
+#if PY_MAJOR_VERSION < 3 && __PYX_DEFAULT_STRING_ENCODING_IS_ASCII  /* Python 2 with c_string_encoding=ascii only */
+#include <string.h>
+static int __Pyx_sys_getdefaultencoding_not_ascii;  /* set to 0 or 1 by the function below */
+static int __Pyx_init_sys_getdefaultencoding_params(void) {  /* checks sys.getdefaultencoding(); returns 0 on success, -1 on failure */
+    PyObject* sys;
+    PyObject* default_encoding = NULL;
+    PyObject* ascii_chars_u = NULL;
+    PyObject* ascii_chars_b = NULL;
+    const char* default_encoding_c;
+    sys = PyImport_ImportModule("sys");
+    if (!sys) goto bad;
+    default_encoding = PyObject_CallMethod(sys, (char*) "getdefaultencoding", NULL);
+    Py_DECREF(sys);  /* the module reference is released right after the call, before the result is checked */
+    if (!default_encoding) goto bad;
+    default_encoding_c = PyBytes_AsString(default_encoding);
+    if (!default_encoding_c) goto bad;
+    if (strcmp(default_encoding_c, "ascii") == 0) {
+        __Pyx_sys_getdefaultencoding_not_ascii = 0;
+    } else {
+        char ascii_chars[128];
+        int c;
+        for (c = 0; c < 128; c++) {  /* build the byte sequence 0..127 */
+            ascii_chars[c] = (char) c;
+        }
+        __Pyx_sys_getdefaultencoding_not_ascii = 1;
+        ascii_chars_u = PyUnicode_DecodeASCII(ascii_chars, 128, NULL);
+        if (!ascii_chars_u) goto bad;
+        ascii_chars_b = PyUnicode_AsEncodedString(ascii_chars_u, default_encoding_c, NULL);  /* re-encode with the default encoding */
+        if (!ascii_chars_b || !PyBytes_Check(ascii_chars_b) || memcmp(ascii_chars, PyBytes_AS_STRING(ascii_chars_b), 128) != 0) {  /* the round trip must reproduce the same 128 bytes */
+            PyErr_Format(
+                PyExc_ValueError,
+                "This module compiled with c_string_encoding=ascii, but default encoding '%.200s' is not a superset of ascii.",
+                default_encoding_c);
+            goto bad;
+        }
+        Py_DECREF(ascii_chars_u);
+        Py_DECREF(ascii_chars_b);
+    }
+    Py_DECREF(default_encoding);
+    return 0;
+bad:  /* shared failure exit: release whatever was acquired so far */
+    Py_XDECREF(default_encoding);
+    Py_XDECREF(ascii_chars_u);
+    Py_XDECREF(ascii_chars_b);
+    return -1;
+}
+#endif
+#if __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT && PY_MAJOR_VERSION >= 3
+#define __Pyx_PyUnicode_FromStringAndSize(c_str, size) PyUnicode_DecodeUTF8(c_str, size, NULL)
+#else
+#define __Pyx_PyUnicode_FromStringAndSize(c_str, size) PyUnicode_Decode(c_str, size, __PYX_DEFAULT_STRING_ENCODING, NULL)  /* decodes with the configured encoding name */
+#if __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT
+#include <string.h>
+static char* __PYX_DEFAULT_STRING_ENCODING;  /* heap copy of the encoding name, assigned by the function below */
+static int __Pyx_init_sys_getdefaultencoding_params(void) {  /* stores a copy of sys.getdefaultencoding(); returns 0 on success, -1 on failure */
+    PyObject* sys;
+    PyObject* default_encoding = NULL;
+    char* default_encoding_c;
+    sys = PyImport_ImportModule("sys");
+    if (!sys) goto bad;
+    default_encoding = PyObject_CallMethod(sys, (char*) (const char*) "getdefaultencoding", NULL);
+    Py_DECREF(sys);
+    if (!default_encoding) goto bad;
+    default_encoding_c = PyBytes_AsString(default_encoding);
+    if (!default_encoding_c) goto bad;
+    __PYX_DEFAULT_STRING_ENCODING = (char*) malloc(strlen(default_encoding_c) + 1);  /* +1 for the terminating NUL */
+    if (!__PYX_DEFAULT_STRING_ENCODING) goto bad;
+    strcpy(__PYX_DEFAULT_STRING_ENCODING, default_encoding_c);
+    Py_DECREF(default_encoding);
+    return 0;
+bad:
+    Py_XDECREF(default_encoding);
+    return -1;
+}
+#endif
+#endif
+
+
+/* Test for GCC > 2.95 */
+#if defined(__GNUC__)     && (__GNUC__ > 2 || (__GNUC__ == 2 && (__GNUC_MINOR__ > 95)))
+  #define likely(x)   __builtin_expect(!!(x), 1)  /* !! normalises the condition to 0 or 1 before it is passed as a hint */
+  #define unlikely(x) __builtin_expect(!!(x), 0)
+#else /* !__GNUC__ or GCC < 2.95 */
+  #define likely(x)   (x)  /* no hint available: plain pass-through */
+  #define unlikely(x) (x)
+#endif /* __GNUC__ */
+static CYTHON_INLINE void __Pyx_pretend_to_initialize(void* ptr) { (void)ptr; }  /* does nothing with the pointer; the (void) cast only marks it as used */
+
+#if !CYTHON_USE_MODULE_STATE
+static PyObject *__pyx_m = NULL;  /* NOTE(review): presumably the module object; only declared when module state is not used */
+#endif
+static int __pyx_lineno;  /* written by __PYX_MARK_ERR_POS */
+static int __pyx_clineno = 0;  /* written by __PYX_MARK_ERR_POS (receives __LINE__) */
+static const char * __pyx_cfilenm = __FILE__;  /* name of this translation unit */
+static const char *__pyx_filename;  /* written by __PYX_MARK_ERR_POS from the __pyx_f table */
+
+/* Header.proto */
+#if !defined(CYTHON_CCOMPLEX)  /* decide on native complex support unless the build already did */
+  #if defined(__cplusplus)
+    #define CYTHON_CCOMPLEX 1
+  #elif (defined(_Complex_I) && !defined(_MSC_VER)) || ((defined (__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) && !defined(__STDC_NO_COMPLEX__) && !defined(_MSC_VER))
+    #define CYTHON_CCOMPLEX 1
+  #else
+    #define CYTHON_CCOMPLEX 0
+  #endif
+#endif
+#if CYTHON_CCOMPLEX
+  #ifdef __cplusplus
+    #include <complex>
+  #else
+    #include <complex.h>
+  #endif
+#endif
+#if CYTHON_CCOMPLEX && !defined(__cplusplus) && defined(__sun__) && defined(__GNUC__)
+  #undef _Complex_I
+  #define _Complex_I 1.0fj  /* replaces the header's _Complex_I for GCC on __sun__ */
+#endif
+
+/* #### Code section: filename_table ### */
+
+static const char *__pyx_f[] = {  /* source file names, indexed by the f_index argument of __PYX_MARK_ERR_POS */
+  "fairseq/data/data_utils_fast.pyx",
+  "<stringsource>",
+  "__init__.cython-30.pxd",
+  "type.pxd",
+};
+/* #### Code section: utility_code_proto_before_types ### */
+/* ForceInitThreads.proto */
+#ifndef __PYX_FORCE_INIT_THREADS
+  #define __PYX_FORCE_INIT_THREADS 0  /* defaults to 0 unless the build defines it */
+#endif
+
+/* NoFastGil.proto */
+#define __Pyx_PyGILState_Ensure PyGILState_Ensure  /* plain aliases of the CPython GIL-state calls */
+#define __Pyx_PyGILState_Release PyGILState_Release
+#define __Pyx_FastGIL_Remember()  /* the three FastGIL hooks expand to nothing here */
+#define __Pyx_FastGIL_Forget()
+#define __Pyx_FastGilFuncInit()
+
+/* BufferFormatStructs.proto */
+struct __Pyx_StructField_;  /* forward declaration: __Pyx_TypeInfo points at fields, and each field points back at a type */
+#define __PYX_BUF_FLAGS_PACKED_STRUCT (1 << 0)  /* bit 0; NOTE(review): presumably stored in __Pyx_TypeInfo.flags - confirm */
+typedef struct {
+  const char* name;
+  struct __Pyx_StructField_* fields;
+  size_t size;
+  size_t arraysize[8];
+  int ndim;
+  char typegroup;
+  char is_unsigned;
+  int flags;
+} __Pyx_TypeInfo;
+typedef struct __Pyx_StructField_ {
+  __Pyx_TypeInfo* type;
+  const char* name;
+  size_t offset;
+} __Pyx_StructField;
+typedef struct {
+  __Pyx_StructField* field;
+  size_t parent_offset;
+} __Pyx_BufFmt_StackElem;
+typedef struct {  /* NOTE(review): looks like the state of a buffer format-string parser; its users are outside this chunk */
+  __Pyx_StructField root;
+  __Pyx_BufFmt_StackElem* head;
+  size_t fmt_offset;
+  size_t new_count, enc_count;
+  size_t struct_alignment;
+  int is_complex;
+  char enc_type;
+  char new_packmode;
+  char enc_packmode;
+  char is_valid_array;
+} __Pyx_BufFmt_Context;
+
+/* Atomics.proto */
+#include <pythread.h>
+#ifndef CYTHON_ATOMICS
+    #define CYTHON_ATOMICS 1  /* on by default; a build can predefine it */
+#endif
+#define __PYX_CYTHON_ATOMICS_ENABLED() CYTHON_ATOMICS
+#define __pyx_atomic_int_type int  /* starting choice; the conditional that follows may #undef and redefine it */
+#define __pyx_nonatomic_int_type int
+#if CYTHON_ATOMICS && (defined(__STDC_VERSION__) &&\
+                        (__STDC_VERSION__ >= 201112L) &&\
+                        !defined(__STDC_NO_ATOMICS__))
+    #include <stdatomic.h>
+#elif CYTHON_ATOMICS && (defined(__cplusplus) && (\
+                    (__cplusplus >= 201103L) ||\
+                    (defined(_MSC_VER) && _MSC_VER >= 1700)))
+    #include <atomic>
+#endif  /* only the header is pulled in here; the atomic operations are selected separately */
+#if CYTHON_ATOMICS && (defined(__STDC_VERSION__) &&\
+                        (__STDC_VERSION__ >= 201112L) &&\
+                        !defined(__STDC_NO_ATOMICS__) &&\
+                       ATOMIC_INT_LOCK_FREE == 2)
+    #undef __pyx_atomic_int_type
+    #define __pyx_atomic_int_type atomic_int
+    #define __pyx_atomic_incr_aligned(value) atomic_fetch_add_explicit(value, 1, memory_order_relaxed)
+    #define __pyx_atomic_decr_aligned(value) atomic_fetch_sub_explicit(value, 1, memory_order_acq_rel)
+    #if defined(__PYX_DEBUG_ATOMICS) && defined(_MSC_VER)
+        #pragma message ("Using standard C atomics")
+    #elif defined(__PYX_DEBUG_ATOMICS)
+        #warning "Using standard C atomics"
+    #endif
+#elif CYTHON_ATOMICS && (defined(__cplusplus) && (\
+                    (__cplusplus >= 201103L) ||\
+\
+                    (defined(_MSC_VER) && _MSC_VER >= 1700)) &&\
+                    ATOMIC_INT_LOCK_FREE == 2)
+    #undef __pyx_atomic_int_type
+    #define __pyx_atomic_int_type std::atomic_int
+    #define __pyx_atomic_incr_aligned(value) std::atomic_fetch_add_explicit(value, 1, std::memory_order_relaxed)
+    #define __pyx_atomic_decr_aligned(value) std::atomic_fetch_sub_explicit(value, 1, std::memory_order_acq_rel)
+    #if defined(__PYX_DEBUG_ATOMICS) && defined(_MSC_VER)
+        #pragma message ("Using standard C++ atomics")
+    #elif defined(__PYX_DEBUG_ATOMICS)
+        #warning "Using standard C++ atomics"
+    #endif
+#elif CYTHON_ATOMICS && (__GNUC__ >= 5 || (__GNUC__ == 4 &&\
+                    (__GNUC_MINOR__ > 1 ||\
+                    (__GNUC_MINOR__ == 1 && __GNUC_PATCHLEVEL__ >= 2))))
+    #define __pyx_atomic_incr_aligned(value) __sync_fetch_and_add(value, 1)
+    #define __pyx_atomic_decr_aligned(value) __sync_fetch_and_sub(value, 1)
+    #ifdef __PYX_DEBUG_ATOMICS
+        #warning "Using GNU atomics"
+    #endif
+#elif CYTHON_ATOMICS && defined(_MSC_VER)
+    #include <intrin.h>
+    #undef __pyx_atomic_int_type
+    #define __pyx_atomic_int_type long
+    #undef __pyx_nonatomic_int_type
+    #define __pyx_nonatomic_int_type long
+    #pragma intrinsic (_InterlockedExchangeAdd)
+    #define __pyx_atomic_incr_aligned(value) _InterlockedExchangeAdd(value, 1)
+    #define __pyx_atomic_decr_aligned(value) _InterlockedExchangeAdd(value, -1)
+    #ifdef __PYX_DEBUG_ATOMICS
+        #pragma message ("Using MSVC atomics")
+    #endif
+#else
+    #undef CYTHON_ATOMICS
+    #define CYTHON_ATOMICS 0
+    #ifdef __PYX_DEBUG_ATOMICS
+        #warning "Not using atomics"
+    #endif
+#endif
+#if CYTHON_ATOMICS
+    #define __pyx_add_acquisition_count(memview)\
+             __pyx_atomic_incr_aligned(__pyx_get_slice_count_pointer(memview))
+    #define __pyx_sub_acquisition_count(memview)\
+            __pyx_atomic_decr_aligned(__pyx_get_slice_count_pointer(memview))
+#else
+    #define __pyx_add_acquisition_count(memview)\
+            __pyx_add_acquisition_count_locked(__pyx_get_slice_count_pointer(memview), memview->lock)
+    #define __pyx_sub_acquisition_count(memview)\
+            __pyx_sub_acquisition_count_locked(__pyx_get_slice_count_pointer(memview), memview->lock)
+#endif
+
+/* MemviewSliceStruct.proto: by-value slice record that points at a memoryview object */
+struct __pyx_memoryview_obj;  /* forward declaration; the full struct appears later in the file */
+typedef struct {
+  struct __pyx_memoryview_obj *memview;
+  char *data;
+  Py_ssize_t shape[8];  /* shape, strides and suboffsets each have a fixed capacity of 8 entries */
+  Py_ssize_t strides[8];
+  Py_ssize_t suboffsets[8];
+} __Pyx_memviewslice;
+#define __Pyx_MemoryView_Len(m)  (m.shape[0])  /* first shape entry; m is used with '.', so it must be a slice value, and it is not parenthesised */
+
+/* #### Code section: numeric_typedefs ### */
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":787
+ * # in Cython to enable them only on the right systems.
+ * 
+ * ctypedef npy_int8       int8_t             # <<<<<<<<<<<<<<
+ * ctypedef npy_int16      int16_t
+ * ctypedef npy_int32      int32_t
+ */
+typedef npy_int8 __pyx_t_5numpy_int8_t;  /* C name = "__pyx_t_" + "5numpy" (length-prefixed module name) + "_" + the ctypedef name int8_t */
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":788
+ * 
+ * ctypedef npy_int8       int8_t
+ * ctypedef npy_int16      int16_t             # <<<<<<<<<<<<<<
+ * ctypedef npy_int32      int32_t
+ * ctypedef npy_int64      int64_t
+ */
+typedef npy_int16 __pyx_t_5numpy_int16_t;  /* numpy int16_t */
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":789
+ * ctypedef npy_int8       int8_t
+ * ctypedef npy_int16      int16_t
+ * ctypedef npy_int32      int32_t             # <<<<<<<<<<<<<<
+ * ctypedef npy_int64      int64_t
+ * #ctypedef npy_int96      int96_t
+ */
+typedef npy_int32 __pyx_t_5numpy_int32_t;  /* numpy int32_t */
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":790
+ * ctypedef npy_int16      int16_t
+ * ctypedef npy_int32      int32_t
+ * ctypedef npy_int64      int64_t             # <<<<<<<<<<<<<<
+ * #ctypedef npy_int96      int96_t
+ * #ctypedef npy_int128     int128_t
+ */
+typedef npy_int64 __pyx_t_5numpy_int64_t;  /* numpy int64_t; the module's own DTYPE_t typedef aliases this name */
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":794
+ * #ctypedef npy_int128     int128_t
+ * 
+ * ctypedef npy_uint8      uint8_t             # <<<<<<<<<<<<<<
+ * ctypedef npy_uint16     uint16_t
+ * ctypedef npy_uint32     uint32_t
+ */
+typedef npy_uint8 __pyx_t_5numpy_uint8_t;  /* numpy uint8_t */
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":795
+ * 
+ * ctypedef npy_uint8      uint8_t
+ * ctypedef npy_uint16     uint16_t             # <<<<<<<<<<<<<<
+ * ctypedef npy_uint32     uint32_t
+ * ctypedef npy_uint64     uint64_t
+ */
+typedef npy_uint16 __pyx_t_5numpy_uint16_t;  /* numpy uint16_t */
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":796
+ * ctypedef npy_uint8      uint8_t
+ * ctypedef npy_uint16     uint16_t
+ * ctypedef npy_uint32     uint32_t             # <<<<<<<<<<<<<<
+ * ctypedef npy_uint64     uint64_t
+ * #ctypedef npy_uint96     uint96_t
+ */
+typedef npy_uint32 __pyx_t_5numpy_uint32_t;  /* numpy uint32_t */
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":797
+ * ctypedef npy_uint16     uint16_t
+ * ctypedef npy_uint32     uint32_t
+ * ctypedef npy_uint64     uint64_t             # <<<<<<<<<<<<<<
+ * #ctypedef npy_uint96     uint96_t
+ * #ctypedef npy_uint128    uint128_t
+ */
+typedef npy_uint64 __pyx_t_5numpy_uint64_t;  /* numpy uint64_t */
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":801
+ * #ctypedef npy_uint128    uint128_t
+ * 
+ * ctypedef npy_float32    float32_t             # <<<<<<<<<<<<<<
+ * ctypedef npy_float64    float64_t
+ * #ctypedef npy_float80    float80_t
+ */
+typedef npy_float32 __pyx_t_5numpy_float32_t;  /* numpy float32_t */
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":802
+ * 
+ * ctypedef npy_float32    float32_t
+ * ctypedef npy_float64    float64_t             # <<<<<<<<<<<<<<
+ * #ctypedef npy_float80    float80_t
+ * #ctypedef npy_float128   float128_t
+ */
+typedef npy_float64 __pyx_t_5numpy_float64_t;  /* numpy float64_t */
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":809
+ * ctypedef double complex complex128_t
+ * 
+ * ctypedef npy_longlong   longlong_t             # <<<<<<<<<<<<<<
+ * ctypedef npy_ulonglong  ulonglong_t
+ * 
+ */
+typedef npy_longlong __pyx_t_5numpy_longlong_t;  /* numpy longlong_t */
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":810
+ * 
+ * ctypedef npy_longlong   longlong_t
+ * ctypedef npy_ulonglong  ulonglong_t             # <<<<<<<<<<<<<<
+ * 
+ * ctypedef npy_intp       intp_t
+ */
+typedef npy_ulonglong __pyx_t_5numpy_ulonglong_t;  /* numpy ulonglong_t */
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":812
+ * ctypedef npy_ulonglong  ulonglong_t
+ * 
+ * ctypedef npy_intp       intp_t             # <<<<<<<<<<<<<<
+ * ctypedef npy_uintp      uintp_t
+ * 
+ */
+typedef npy_intp __pyx_t_5numpy_intp_t;  /* numpy intp_t */
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":813
+ * 
+ * ctypedef npy_intp       intp_t
+ * ctypedef npy_uintp      uintp_t             # <<<<<<<<<<<<<<
+ * 
+ * ctypedef npy_double     float_t
+ */
+typedef npy_uintp __pyx_t_5numpy_uintp_t;  /* numpy uintp_t */
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":815
+ * ctypedef npy_uintp      uintp_t
+ * 
+ * ctypedef npy_double     float_t             # <<<<<<<<<<<<<<
+ * ctypedef npy_double     double_t
+ * ctypedef npy_longdouble longdouble_t
+ */
+typedef npy_double __pyx_t_5numpy_float_t;  /* note: float_t aliases npy_double, the same underlying type as double_t below */
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":816
+ * 
+ * ctypedef npy_double     float_t
+ * ctypedef npy_double     double_t             # <<<<<<<<<<<<<<
+ * ctypedef npy_longdouble longdouble_t
+ * 
+ */
+typedef npy_double __pyx_t_5numpy_double_t;  /* numpy double_t */
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":817
+ * ctypedef npy_double     float_t
+ * ctypedef npy_double     double_t
+ * ctypedef npy_longdouble longdouble_t             # <<<<<<<<<<<<<<
+ * 
+ * ctypedef float complex       cfloat_t
+ */
+typedef npy_longdouble __pyx_t_5numpy_longdouble_t;  /* numpy longdouble_t */
+
+/* "fairseq/data/data_utils_fast.pyx":13
+ * 
+ * DTYPE = np.int64
+ * ctypedef np.int64_t DTYPE_t             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+typedef __pyx_t_5numpy_int64_t __pyx_t_7fairseq_4data_15data_utils_fast_DTYPE_t;  /* DTYPE_t of module fairseq.data.data_utils_fast; each dotted-path component is length-prefixed (7fairseq, 4data, 15data_utils_fast) */
+/* #### Code section: complex_type_declarations ### */
+/* Declarations.proto: float complex -- std::complex in C++, _Complex in C, or a {real, imag} struct when the native-complex condition is false */
+#if CYTHON_CCOMPLEX && (1) && (!0 || __cplusplus)
+  #ifdef __cplusplus
+    typedef ::std::complex< float > __pyx_t_float_complex;
+  #else
+    typedef float _Complex __pyx_t_float_complex;
+  #endif
+#else
+    typedef struct { float real, imag; } __pyx_t_float_complex;  /* struct fallback */
+#endif
+static CYTHON_INLINE __pyx_t_float_complex __pyx_t_float_complex_from_parts(float, float);  /* prototype only; defined elsewhere */
+
+/* Declarations.proto: double complex -- same three-way selection as the float variant */
+#if CYTHON_CCOMPLEX && (1) && (!0 || __cplusplus)
+  #ifdef __cplusplus
+    typedef ::std::complex< double > __pyx_t_double_complex;
+  #else
+    typedef double _Complex __pyx_t_double_complex;
+  #endif
+#else
+    typedef struct { double real, imag; } __pyx_t_double_complex;  /* struct fallback */
+#endif
+static CYTHON_INLINE __pyx_t_double_complex __pyx_t_double_complex_from_parts(double, double);  /* prototype only; defined elsewhere */
+
+/* Declarations.proto: long double complex -- same three-way selection as the float variant */
+#if CYTHON_CCOMPLEX && (1) && (!0 || __cplusplus)
+  #ifdef __cplusplus
+    typedef ::std::complex< long double > __pyx_t_long_double_complex;
+  #else
+    typedef long double _Complex __pyx_t_long_double_complex;
+  #endif
+#else
+    typedef struct { long double real, imag; } __pyx_t_long_double_complex;  /* struct fallback */
+#endif
+static CYTHON_INLINE __pyx_t_long_double_complex __pyx_t_long_double_complex_from_parts(long double, long double);  /* prototype only; defined elsewhere */
+
+/* #### Code section: type_declarations ### */
+
+/*--- Type declarations: forward declarations first, so the structs below can refer to one another ---*/
+struct __pyx_array_obj;
+struct __pyx_MemviewEnum_obj;
+struct __pyx_memoryview_obj;
+struct __pyx_memoryviewslice_obj;
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1113
+ * 
+ * # Iterator API added in v1.6
+ * ctypedef int (*NpyIter_IterNextFunc)(NpyIter* it) noexcept nogil             # <<<<<<<<<<<<<<
+ * ctypedef void (*NpyIter_GetMultiIndexFunc)(NpyIter* it, npy_intp* outcoords) noexcept nogil
+ * 
+ */
+typedef int (*__pyx_t_5numpy_NpyIter_IterNextFunc)(NpyIter *);  /* the pxd's "noexcept nogil" qualifiers have no counterpart in the C typedef */
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1114
+ * # Iterator API added in v1.6
+ * ctypedef int (*NpyIter_IterNextFunc)(NpyIter* it) noexcept nogil
+ * ctypedef void (*NpyIter_GetMultiIndexFunc)(NpyIter* it, npy_intp* outcoords) noexcept nogil             # <<<<<<<<<<<<<<
+ * 
+ * cdef extern from "numpy/arrayobject.h":
+ */
+typedef void (*__pyx_t_5numpy_NpyIter_GetMultiIndexFunc)(NpyIter *, npy_intp *);  /* parameter names from the pxd (it, outcoords) are dropped */
+
+/* "View.MemoryView":114
+ * @cython.collection_type("sequence")
+ * @cname("__pyx_array")
+ * cdef class array:             # <<<<<<<<<<<<<<
+ * 
+ *     cdef:
+ */
+struct __pyx_array_obj {  /* instance layout of the cdef class "array" quoted above */
+  PyObject_HEAD
+  struct __pyx_vtabstruct_array *__pyx_vtab;  /* pointer to the method table struct declared further down */
+  char *data;
+  Py_ssize_t len;
+  char *format;
+  int ndim;
+  Py_ssize_t *_shape;
+  Py_ssize_t *_strides;
+  Py_ssize_t itemsize;
+  PyObject *mode;
+  PyObject *_format;
+  void (*callback_free_data)(void *);  /* function pointer taking a single void* */
+  int free_data;
+  int dtype_is_object;
+};
+
+
+/* "View.MemoryView":302
+ * 
+ * @cname('__pyx_MemviewEnum')
+ * cdef class Enum(object):             # <<<<<<<<<<<<<<
+ *     cdef object name
+ *     def __init__(self, name):
+ */
+struct __pyx_MemviewEnum_obj {  /* instance layout of the cdef class "Enum" quoted above */
+  PyObject_HEAD
+  PyObject *name;  /* the "cdef object name" attribute */
+};
+
+
+/* "View.MemoryView":337
+ * 
+ * @cname('__pyx_memoryview')
+ * cdef class memoryview:             # <<<<<<<<<<<<<<
+ * 
+ *     cdef object obj
+ */
+struct __pyx_memoryview_obj {  /* instance layout of the cdef class "memoryview" quoted above */
+  PyObject_HEAD
+  struct __pyx_vtabstruct_memoryview *__pyx_vtab;
+  PyObject *obj;
+  PyObject *_size;
+  PyObject *_array_interface;
+  PyThread_type_lock lock;  /* passed as memview->lock to the *_locked counter helpers when atomics are off */
+  __pyx_atomic_int_type acquisition_count;  /* type is whatever the Atomics section selected (int, atomic_int, std::atomic_int or long) */
+  Py_buffer view;  /* embedded by value */
+  int flags;
+  int dtype_is_object;
+  __Pyx_TypeInfo *typeinfo;
+};
+
+
+/* "View.MemoryView":952
+ * @cython.collection_type("sequence")
+ * @cname('__pyx_memoryviewslice')
+ * cdef class _memoryviewslice(memoryview):             # <<<<<<<<<<<<<<
+ *     "Internal class for passing memoryview slices to Python"
+ * 
+ */
+struct __pyx_memoryviewslice_obj {
+  struct __pyx_memoryview_obj __pyx_base;  /* base-class struct embedded as the first member, mirroring _memoryviewslice(memoryview) */
+  __Pyx_memviewslice from_slice;  /* embedded by value */
+  PyObject *from_object;
+  PyObject *(*to_object_func)(char *);  /* function pointer: char* -> PyObject* */
+  int (*to_dtype_func)(char *, PyObject *);  /* function pointer: (char*, PyObject*) -> int */
+};
+
+
+
+/* "View.MemoryView":114
+ * @cython.collection_type("sequence")
+ * @cname("__pyx_array")
+ * cdef class array:             # <<<<<<<<<<<<<<
+ * 
+ *     cdef:
+ */
+
+struct __pyx_vtabstruct_array {  /* method table for "array": a single slot */
+  PyObject *(*get_memview)(struct __pyx_array_obj *);
+};
+static struct __pyx_vtabstruct_array *__pyx_vtabptr_array;  /* file-local pointer; no initialiser at this point */
+
+
+/* "View.MemoryView":337
+ * 
+ * @cname('__pyx_memoryview')
+ * cdef class memoryview:             # <<<<<<<<<<<<<<
+ * 
+ *     cdef object obj
+ */
+
+struct __pyx_vtabstruct_memoryview {  /* method table for "memoryview": eight slots, each taking the instance as first argument */
+  char *(*get_item_pointer)(struct __pyx_memoryview_obj *, PyObject *);  /* the only slot returning char* rather than PyObject* */
+  PyObject *(*is_slice)(struct __pyx_memoryview_obj *, PyObject *);
+  PyObject *(*setitem_slice_assignment)(struct __pyx_memoryview_obj *, PyObject *, PyObject *);
+  PyObject *(*setitem_slice_assign_scalar)(struct __pyx_memoryview_obj *, struct __pyx_memoryview_obj *, PyObject *);
+  PyObject *(*setitem_indexed)(struct __pyx_memoryview_obj *, PyObject *, PyObject *);
+  PyObject *(*convert_item_to_object)(struct __pyx_memoryview_obj *, char *);
+  PyObject *(*assign_item_from_object)(struct __pyx_memoryview_obj *, char *, PyObject *);
+  PyObject *(*_get_base)(struct __pyx_memoryview_obj *);
+};
+static struct __pyx_vtabstruct_memoryview *__pyx_vtabptr_memoryview;  /* file-local pointer; no initialiser at this point */
+
+
+/* "View.MemoryView":952
+ * @cython.collection_type("sequence")
+ * @cname('__pyx_memoryviewslice')
+ * cdef class _memoryviewslice(memoryview):             # <<<<<<<<<<<<<<
+ *     "Internal class for passing memoryview slices to Python"
+ * 
+ */
+
+struct __pyx_vtabstruct__memoryviewslice {  /* adds no slots of its own */
+  struct __pyx_vtabstruct_memoryview __pyx_base;  /* the base table embedded by value as the only member */
+};
+static struct __pyx_vtabstruct__memoryviewslice *__pyx_vtabptr__memoryviewslice;  /* file-local pointer; no initialiser at this point */
+/* #### Code section: utility_code_proto ### */
+
+/* --- Runtime support code (head) --- */
+/* Refnanny.proto: reference-count bookkeeping macros. With CYTHON_REFNANNY off (the default set here) they reduce to Py_INCREF/Py_DECREF or to nothing; with it on, each operation goes through the __Pyx_RefNanny function table. */
+#ifndef CYTHON_REFNANNY
+  #define CYTHON_REFNANNY 0
+#endif
+#if CYTHON_REFNANNY
+  typedef struct {  /* function table: the ref operations take (context, object, line); SetupContext takes (name, line, file) */
+    void (*INCREF)(void*, PyObject*, Py_ssize_t);
+    void (*DECREF)(void*, PyObject*, Py_ssize_t);
+    void (*GOTREF)(void*, PyObject*, Py_ssize_t);
+    void (*GIVEREF)(void*, PyObject*, Py_ssize_t);
+    void* (*SetupContext)(const char*, Py_ssize_t, const char*);
+    void (*FinishContext)(void**);
+  } __Pyx_RefNannyAPIStruct;
+  static __Pyx_RefNannyAPIStruct *__Pyx_RefNanny = NULL;
+  static __Pyx_RefNannyAPIStruct *__Pyx_RefNannyImportAPI(const char *modname);
+  #define __Pyx_RefNannyDeclarations void *__pyx_refnanny = NULL;  /* per-function context handle used by every macro below */
+#ifdef WITH_THREAD
+  #define __Pyx_RefNannySetupContext(name, acquire_gil)\
+          if (acquire_gil) {\
+              PyGILState_STATE __pyx_gilstate_save = PyGILState_Ensure();\
+              __pyx_refnanny = __Pyx_RefNanny->SetupContext((name), (__LINE__), (__FILE__));\
+              PyGILState_Release(__pyx_gilstate_save);\
+          } else {\
+              __pyx_refnanny = __Pyx_RefNanny->SetupContext((name), (__LINE__), (__FILE__));\
+          }
+  #define __Pyx_RefNannyFinishContextNogil() {\
+              PyGILState_STATE __pyx_gilstate_save = PyGILState_Ensure();\
+              __Pyx_RefNannyFinishContext();\
+              PyGILState_Release(__pyx_gilstate_save);\
+          }
+#else
+  #define __Pyx_RefNannySetupContext(name, acquire_gil)\
+          __pyx_refnanny = __Pyx_RefNanny->SetupContext((name), (__LINE__), (__FILE__))
+  #define __Pyx_RefNannyFinishContextNogil() __Pyx_RefNannyFinishContext()
+#endif
+  /* __Pyx_RefNannyFinishContextNogil() is defined exactly once, by whichever
+   * branch of the WITH_THREAD conditional above is active. It must not be
+   * defined again here: a second, GIL-acquiring definition would be an
+   * incompatible macro redefinition in builds without WITH_THREAD.
+   */
+  #define __Pyx_RefNannyFinishContext()\
+          __Pyx_RefNanny->FinishContext(&__pyx_refnanny)
+  #define __Pyx_INCREF(r)  __Pyx_RefNanny->INCREF(__pyx_refnanny, (PyObject *)(r), (__LINE__))
+  #define __Pyx_DECREF(r)  __Pyx_RefNanny->DECREF(__pyx_refnanny, (PyObject *)(r), (__LINE__))
+  #define __Pyx_GOTREF(r)  __Pyx_RefNanny->GOTREF(__pyx_refnanny, (PyObject *)(r), (__LINE__))
+  #define __Pyx_GIVEREF(r) __Pyx_RefNanny->GIVEREF(__pyx_refnanny, (PyObject *)(r), (__LINE__))
+  #define __Pyx_XINCREF(r)  do { if((r) == NULL); else {__Pyx_INCREF(r); }} while(0)  /* X variants skip NULL */
+  #define __Pyx_XDECREF(r)  do { if((r) == NULL); else {__Pyx_DECREF(r); }} while(0)
+  #define __Pyx_XGOTREF(r)  do { if((r) == NULL); else {__Pyx_GOTREF(r); }} while(0)
+  #define __Pyx_XGIVEREF(r) do { if((r) == NULL); else {__Pyx_GIVEREF(r);}} while(0)
+#else  /* CYTHON_REFNANNY off: context macros vanish and ref operations map onto the CPython macros */
+  #define __Pyx_RefNannyDeclarations
+  #define __Pyx_RefNannySetupContext(name, acquire_gil)
+  #define __Pyx_RefNannyFinishContextNogil()
+  #define __Pyx_RefNannyFinishContext()
+  #define __Pyx_INCREF(r) Py_INCREF(r)
+  #define __Pyx_DECREF(r) Py_DECREF(r)
+  #define __Pyx_GOTREF(r)  /* empty expansion */
+  #define __Pyx_GIVEREF(r)  /* empty expansion */
+  #define __Pyx_XINCREF(r) Py_XINCREF(r)
+  #define __Pyx_XDECREF(r) Py_XDECREF(r)
+  #define __Pyx_XGOTREF(r)  /* empty expansion */
+  #define __Pyx_XGIVEREF(r)  /* empty expansion */
+#endif
+#define __Pyx_Py_XDECREF_SET(r, v) do {\
+        PyObject *tmp = (PyObject *) r;\
+        r = v; Py_XDECREF(tmp);\
+    } while (0)  /* stores v into r before releasing the old value; uses Py_XDECREF directly, bypassing the refnanny macros */
+#define __Pyx_XDECREF_SET(r, v) do {\
+        PyObject *tmp = (PyObject *) r;\
+        r = v; __Pyx_XDECREF(tmp);\
+    } while (0)  /* same ordering; the old value may be NULL */
+#define __Pyx_DECREF_SET(r, v) do {\
+        PyObject *tmp = (PyObject *) r;\
+        r = v; __Pyx_DECREF(tmp);\
+    } while (0)  /* same ordering; uses the non-X decref, so the old value is not NULL-checked */
+#define __Pyx_CLEAR(r)    do { PyObject* tmp = ((PyObject*)(r)); r = NULL; __Pyx_DECREF(tmp);} while(0)  /* r is set to NULL before the decref */
+#define __Pyx_XCLEAR(r)   do { if((r) != NULL) {PyObject* tmp = ((PyObject*)(r)); r = NULL; __Pyx_DECREF(tmp);}} while(0)  /* as __Pyx_CLEAR, but does nothing when r is already NULL */
+
+/* PyErrExceptionMatches.proto: the fast-thread-state form expands to a use of __pyx_tstate, so that name must be in scope where the macro is used */
+#if CYTHON_FAST_THREAD_STATE
+#define __Pyx_PyErr_ExceptionMatches(err) __Pyx_PyErr_ExceptionMatchesInState(__pyx_tstate, err)
+static CYTHON_INLINE int __Pyx_PyErr_ExceptionMatchesInState(PyThreadState* tstate, PyObject* err);
+#else  /* otherwise defer to the CPython API call */
+#define __Pyx_PyErr_ExceptionMatches(err)  PyErr_ExceptionMatches(err)
+#endif
+
+/* PyThreadStateGet.proto: declares/assigns the __pyx_tstate local and reads the pending exception from it when CYTHON_FAST_THREAD_STATE is set */
+#if CYTHON_FAST_THREAD_STATE
+#define __Pyx_PyThreadState_declare  PyThreadState *__pyx_tstate;
+#define __Pyx_PyThreadState_assign  __pyx_tstate = __Pyx_PyThreadState_Current;
+#if PY_VERSION_HEX >= 0x030C00A6  /* 3.12.0a6 and later: read the current_exception field */
+#define __Pyx_PyErr_Occurred()  (__pyx_tstate->current_exception != NULL)
+#define __Pyx_PyErr_CurrentExceptionType()  (__pyx_tstate->current_exception ? (PyObject*) Py_TYPE(__pyx_tstate->current_exception) : (PyObject*) NULL)
+#else  /* older versions: read the curexc_type field */
+#define __Pyx_PyErr_Occurred()  (__pyx_tstate->curexc_type != NULL)
+#define __Pyx_PyErr_CurrentExceptionType()  (__pyx_tstate->curexc_type)
+#endif
+#else  /* no fast thread state: declare/assign expand to nothing and PyErr_Occurred() is used */
+#define __Pyx_PyThreadState_declare
+#define __Pyx_PyThreadState_assign
+#define __Pyx_PyErr_Occurred()  (PyErr_Occurred() != NULL)
+#define __Pyx_PyErr_CurrentExceptionType()  PyErr_Occurred()
+#endif
+
+/* PyErrFetchRestore.proto: fetch/restore of the error indicator, either through *InState helpers taking a thread state or through the PyErr_* API */
+#if CYTHON_FAST_THREAD_STATE
+#define __Pyx_PyErr_Clear() __Pyx_ErrRestore(NULL, NULL, NULL)
+#define __Pyx_ErrRestoreWithState(type, value, tb)  __Pyx_ErrRestoreInState(PyThreadState_GET(), type, value, tb)  /* WithState forms look the thread state up themselves */
+#define __Pyx_ErrFetchWithState(type, value, tb)    __Pyx_ErrFetchInState(PyThreadState_GET(), type, value, tb)
+#define __Pyx_ErrRestore(type, value, tb)  __Pyx_ErrRestoreInState(__pyx_tstate, type, value, tb)  /* plain forms use the __pyx_tstate local */
+#define __Pyx_ErrFetch(type, value, tb)    __Pyx_ErrFetchInState(__pyx_tstate, type, value, tb)
+static CYTHON_INLINE void __Pyx_ErrRestoreInState(PyThreadState *tstate, PyObject *type, PyObject *value, PyObject *tb);
+static CYTHON_INLINE void __Pyx_ErrFetchInState(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb);
+#if CYTHON_COMPILING_IN_CPYTHON && PY_VERSION_HEX < 0x030C00A6  /* CPython before 3.12.0a6 */
+#define __Pyx_PyErr_SetNone(exc) (Py_INCREF(exc), __Pyx_ErrRestore((exc), NULL, NULL))  /* takes a new reference to exc, then restores it as the type with no value or traceback */
+#else
+#define __Pyx_PyErr_SetNone(exc) PyErr_SetNone(exc)
+#endif
+#else  /* no fast thread state: every macro maps onto the PyErr_* API and the tstate argument is ignored */
+#define __Pyx_PyErr_Clear() PyErr_Clear()
+#define __Pyx_PyErr_SetNone(exc) PyErr_SetNone(exc)
+#define __Pyx_ErrRestoreWithState(type, value, tb)  PyErr_Restore(type, value, tb)
+#define __Pyx_ErrFetchWithState(type, value, tb)  PyErr_Fetch(type, value, tb)
+#define __Pyx_ErrRestoreInState(tstate, type, value, tb)  PyErr_Restore(type, value, tb)
+#define __Pyx_ErrFetchInState(tstate, type, value, tb)  PyErr_Fetch(type, value, tb)
+#define __Pyx_ErrRestore(type, value, tb)  PyErr_Restore(type, value, tb)
+#define __Pyx_ErrFetch(type, value, tb)  PyErr_Fetch(type, value, tb)
+#endif
+
+/* PyObjectGetAttrStr.proto: a dedicated inline helper when CYTHON_USE_TYPE_SLOTS is set, otherwise an alias of PyObject_GetAttr */
+#if CYTHON_USE_TYPE_SLOTS
+static CYTHON_INLINE PyObject* __Pyx_PyObject_GetAttrStr(PyObject* obj, PyObject* attr_name);
+#else
+#define __Pyx_PyObject_GetAttrStr(o,n) PyObject_GetAttr(o,n)
+#endif
+
+/* PyObjectGetAttrStrNoError.proto (prototype only; defined elsewhere) */
+static CYTHON_INLINE PyObject* __Pyx_PyObject_GetAttrStrNoError(PyObject* obj, PyObject* attr_name);
+
+/* GetBuiltinName.proto (prototype only; defined elsewhere) */
+static PyObject *__Pyx_GetBuiltinName(PyObject *name);
+
+/* TupleAndListFromArray.proto: both helpers are declared only when compiling for CPython */
+#if CYTHON_COMPILING_IN_CPYTHON
+static CYTHON_INLINE PyObject* __Pyx_PyList_FromArray(PyObject *const *src, Py_ssize_t n);
+static CYTHON_INLINE PyObject* __Pyx_PyTuple_FromArray(PyObject *const *src, Py_ssize_t n);
+#endif
+
+/* IncludeStringH.proto: makes the <string.h> declarations available to the code that follows */
+#include <string.h>
+
+/* BytesEquals.proto (prototype only); the third argument is named "equals" -- its exact meaning is defined with the implementation */
+static CYTHON_INLINE int __Pyx_PyBytes_Equals(PyObject* s1, PyObject* s2, int equals);
+
+/* UnicodeEquals.proto (prototype only); same signature as the bytes variant */
+static CYTHON_INLINE int __Pyx_PyUnicode_Equals(PyObject* s1, PyObject* s2, int equals);
+
+/* fastcall.proto: parallel *_VARARGS and *_FASTCALL accessor macros; without CYTHON_METH_FASTCALL the FASTCALL names alias the VARARGS ones */
+#if CYTHON_AVOID_BORROWED_REFS
+    #define __Pyx_Arg_VARARGS(args, i) PySequence_GetItem(args, i)
+#elif CYTHON_ASSUME_SAFE_MACROS
+    #define __Pyx_Arg_VARARGS(args, i) PyTuple_GET_ITEM(args, i)
+#else
+    #define __Pyx_Arg_VARARGS(args, i) PyTuple_GetItem(args, i)
+#endif
+#if CYTHON_AVOID_BORROWED_REFS
+    #define __Pyx_Arg_NewRef_VARARGS(arg) __Pyx_NewRef(arg)
+    #define __Pyx_Arg_XDECREF_VARARGS(arg) Py_XDECREF(arg)
+#else
+    #define __Pyx_Arg_NewRef_VARARGS(arg) arg
+    #define __Pyx_Arg_XDECREF_VARARGS(arg)
+#endif
+#define __Pyx_NumKwargs_VARARGS(kwds) PyDict_Size(kwds)
+#define __Pyx_KwValues_VARARGS(args, nargs) NULL  /* the VARARGS form has no separate keyword-values array */
+#define __Pyx_GetKwValue_VARARGS(kw, kwvalues, s) __Pyx_PyDict_GetItemStrWithError(kw, s)  /* kwvalues is ignored */
+#define __Pyx_KwargsAsDict_VARARGS(kw, kwvalues) PyDict_Copy(kw)  /* kwvalues is ignored */
+#if CYTHON_METH_FASTCALL
+    #define __Pyx_Arg_FASTCALL(args, i) args[i]
+    #define __Pyx_NumKwargs_FASTCALL(kwds) PyTuple_GET_SIZE(kwds)
+    #define __Pyx_KwValues_FASTCALL(args, nargs) ((args) + (nargs))  /* keyword values start right after the nargs positional entries */
+    static CYTHON_INLINE PyObject * __Pyx_GetKwValue_FASTCALL(PyObject *kwnames, PyObject *const *kwvalues, PyObject *s);
+#if CYTHON_COMPILING_IN_CPYTHON && PY_VERSION_HEX >= 0x030d0000
+    CYTHON_UNUSED static PyObject *__Pyx_KwargsAsDict_FASTCALL(PyObject *kwnames, PyObject *const *kwvalues);
+  #else
+    #define __Pyx_KwargsAsDict_FASTCALL(kw, kwvalues) _PyStack_AsDict(kwvalues, kw)
+  #endif
+    #define __Pyx_Arg_NewRef_FASTCALL(arg) arg  /* no-op, __Pyx_Arg_FASTCALL is direct and this needs
+                                                   to have the same reference counting */
+    #define __Pyx_Arg_XDECREF_FASTCALL(arg)
+#else
+    #define __Pyx_Arg_FASTCALL __Pyx_Arg_VARARGS
+    #define __Pyx_NumKwargs_FASTCALL __Pyx_NumKwargs_VARARGS
+    #define __Pyx_KwValues_FASTCALL __Pyx_KwValues_VARARGS
+    #define __Pyx_GetKwValue_FASTCALL __Pyx_GetKwValue_VARARGS
+    #define __Pyx_KwargsAsDict_FASTCALL __Pyx_KwargsAsDict_VARARGS
+    #define __Pyx_Arg_NewRef_FASTCALL(arg) __Pyx_Arg_NewRef_VARARGS(arg)
+    #define __Pyx_Arg_XDECREF_FASTCALL(arg) __Pyx_Arg_XDECREF_VARARGS(arg)
+#endif
+#if CYTHON_COMPILING_IN_CPYTHON && CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS
+#define __Pyx_ArgsSlice_VARARGS(args, start, stop) __Pyx_PyTuple_FromArray(&__Pyx_Arg_VARARGS(args, start), stop - start)
+#define __Pyx_ArgsSlice_FASTCALL(args, start, stop) __Pyx_PyTuple_FromArray(&__Pyx_Arg_FASTCALL(args, start), stop - start)
+#else  /* otherwise both slice forms go through PyTuple_GetSlice */
+#define __Pyx_ArgsSlice_VARARGS(args, start, stop) PyTuple_GetSlice(args, start, stop)
+#define __Pyx_ArgsSlice_FASTCALL(args, start, stop) PyTuple_GetSlice(args, start, stop)
+#endif
+
+/* RaiseArgTupleInvalid.proto (prototype only; defined elsewhere) */
+static void __Pyx_RaiseArgtupleInvalid(const char* func_name, int exact,
+    Py_ssize_t num_min, Py_ssize_t num_max, Py_ssize_t num_found);
+
+/* RaiseDoubleKeywords.proto (prototype only; defined elsewhere) */
+static void __Pyx_RaiseDoubleKeywordsError(const char* func_name, PyObject* kw_name);
+
+/* ParseKeywords.proto (prototype only; defined elsewhere) */
+static int __Pyx_ParseOptionalKeywords(PyObject *kwds, PyObject *const *kwvalues,
+    PyObject **argnames[],
+    PyObject *kwds2, PyObject *values[], Py_ssize_t num_pos_args,
+    const char* function_name);
+
+/* ArgTypeTest.proto: yields 1 on an exact type match or an allowed None, otherwise calls __Pyx__ArgTypeTest; the two checks are joined with bitwise '|', so both are evaluated */
+#define __Pyx_ArgTypeTest(obj, type, none_allowed, name, exact)\
+    ((likely(__Pyx_IS_TYPE(obj, type) | (none_allowed && (obj == Py_None)))) ? 1 :\
+        __Pyx__ArgTypeTest(obj, type, name, exact))
+static int __Pyx__ArgTypeTest(PyObject *obj, PyTypeObject *type, const char *name, int exact);
+
+/* RaiseException.proto (prototype only; defined elsewhere) */
+static void __Pyx_Raise(PyObject *type, PyObject *value, PyObject *tb, PyObject *cause);
+
+/* PyFunctionFastCall.proto: everything here depends on CYTHON_FAST_PYCALL; the call helper and the frame-locals access additionally require !CYTHON_VECTORCALL */
+#if CYTHON_FAST_PYCALL
+#if !CYTHON_VECTORCALL
+#define __Pyx_PyFunction_FastCall(func, args, nargs)\
+    __Pyx_PyFunction_FastCallDict((func), (args), (nargs), NULL)
+static PyObject *__Pyx_PyFunction_FastCallDict(PyObject *func, PyObject **args, Py_ssize_t nargs, PyObject *kwargs);
+#endif
+#define __Pyx_BUILD_ASSERT_EXPR(cond)\
+    (sizeof(char [1 - 2*!(cond)]) - 1)
+#ifndef Py_MEMBER_SIZE
+#define Py_MEMBER_SIZE(type, member) sizeof(((type *)0)->member)  /* defined only when the name is not already defined */
+#endif
+#if !CYTHON_VECTORCALL
+#if PY_VERSION_HEX >= 0x03080000  /* 3.8 and later */
+  #include "frameobject.h"
+#if PY_VERSION_HEX >= 0x030b00a6 && !CYTHON_COMPILING_IN_LIMITED_API && !defined(PYPY_VERSION)  /* 3.11.0a6 and later, outside the limited API and PyPy */
+  #ifndef Py_BUILD_CORE
+    #define Py_BUILD_CORE 1
+  #endif
+  #include "internal/pycore_frame.h"
+#endif
+  #define __Pxy_PyFrame_Initialize_Offsets()  /* empty here; the "Pxy" spelling matches the other definition below and is left as generated */
+  #define __Pyx_PyFrame_GetLocalsplus(frame)  ((frame)->f_localsplus)  /* direct field access */
+#else
+  static size_t __pyx_pyframe_localsplus_offset = 0;  /* set by __Pxy_PyFrame_Initialize_Offsets(); GetLocalsplus asserts it is non-zero */
+  #include "frameobject.h"
+  #define __Pxy_PyFrame_Initialize_Offsets()\
+    ((void)__Pyx_BUILD_ASSERT_EXPR(sizeof(PyFrameObject) == offsetof(PyFrameObject, f_localsplus) + Py_MEMBER_SIZE(PyFrameObject, f_localsplus)),\
+     (void)(__pyx_pyframe_localsplus_offset = ((size_t)PyFrame_Type.tp_basicsize) - Py_MEMBER_SIZE(PyFrameObject, f_localsplus)))
+  #define __Pyx_PyFrame_GetLocalsplus(frame)\
+    (assert(__pyx_pyframe_localsplus_offset), (PyObject **)(((char *)(frame)) + __pyx_pyframe_localsplus_offset))
+#endif
+#endif
+#endif
+
+/* PyObjectCall.proto: a dedicated inline helper under CPython, otherwise an alias of PyObject_Call */
+#if CYTHON_COMPILING_IN_CPYTHON
+static CYTHON_INLINE PyObject* __Pyx_PyObject_Call(PyObject *func, PyObject *arg, PyObject *kw);
+#else
+#define __Pyx_PyObject_Call(func, arg, kw) PyObject_Call(func, arg, kw)
+#endif
+
+/* PyObjectCallMethO.proto: declared only under CPython; there is no fallback definition in this section */
+#if CYTHON_COMPILING_IN_CPYTHON
+static CYTHON_INLINE PyObject* __Pyx_PyObject_CallMethO(PyObject *func, PyObject *arg);
+#endif
+
+/* PyObjectFastCall.proto: the macro form passes NULL kwargs and casts nargs to size_t */
+#define __Pyx_PyObject_FastCall(func, args, nargs)  __Pyx_PyObject_FastCallDict(func, args, (size_t)(nargs), NULL)
+static CYTHON_INLINE PyObject* __Pyx_PyObject_FastCallDict(PyObject *func, PyObject **args, size_t nargs, PyObject *kwargs);
+
+/* RaiseUnexpectedTypeError.proto (prototype only; defined elsewhere) */
+static int __Pyx_RaiseUnexpectedTypeError(const char *expected, PyObject *obj);
+
+/* GCCDiagnostics.proto: feature flag set for __GNUC__ 4.6 or newer, excluding the Intel compiler */
+#if !defined(__INTEL_COMPILER) && defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6))
+#define __Pyx_HAS_GCC_DIAGNOSTIC
+#endif
+
+/* BuildPyUnicode.proto (prototype only; defined elsewhere) */
+static PyObject* __Pyx_PyUnicode_BuildFromAscii(Py_ssize_t ulength, char* chars, int clength,
+                                                int prepend_sign, char padding_char);
+
+/* CIntToPyUnicode.proto: variant taking an int value */
+static CYTHON_INLINE PyObject* __Pyx_PyUnicode_From_int(int value, Py_ssize_t width, char padding_char, char format_char);
+
+/* CIntToPyUnicode.proto: variant taking a Py_ssize_t value */
+static CYTHON_INLINE PyObject* __Pyx_PyUnicode_From_Py_ssize_t(Py_ssize_t value, Py_ssize_t width, char padding_char, char format_char);
+
+/* JoinPyUnicode.proto (prototype only; defined elsewhere) */
+static PyObject* __Pyx_PyUnicode_Join(PyObject* value_tuple, Py_ssize_t value_count, Py_ssize_t result_ulength,
+                                      Py_UCS4 max_char);
+
+/* StrEquals.proto: "string" equality maps to the unicode helper on Python 3 and to the bytes helper otherwise */
+#if PY_MAJOR_VERSION >= 3
+#define __Pyx_PyString_Equals __Pyx_PyUnicode_Equals
+#else
+#define __Pyx_PyString_Equals __Pyx_PyBytes_Equals
+#endif
+
+/* PyObjectFormatSimple.proto: four variants; each returns an exact unicode object s itself (after Py_INCREF) and ends in PyObject_Format(s, f) */
+#if CYTHON_COMPILING_IN_PYPY
+    #define __Pyx_PyObject_FormatSimple(s, f) (\
+        likely(PyUnicode_CheckExact(s)) ? (Py_INCREF(s), s) :\
+        PyObject_Format(s, f))
+#elif PY_MAJOR_VERSION < 3  /* adds a decode step for exact PyString objects */
+    #define __Pyx_PyObject_FormatSimple(s, f) (\
+        likely(PyUnicode_CheckExact(s)) ? (Py_INCREF(s), s) :\
+        likely(PyString_CheckExact(s)) ? PyUnicode_FromEncodedObject(s, NULL, "strict") :\
+        PyObject_Format(s, f))
+#elif CYTHON_USE_TYPE_SLOTS  /* exact int and float objects go straight to their type's tp_repr slot */
+    #define __Pyx_PyObject_FormatSimple(s, f) (\
+        likely(PyUnicode_CheckExact(s)) ? (Py_INCREF(s), s) :\
+        likely(PyLong_CheckExact(s)) ? PyLong_Type.tp_repr(s) :\
+        likely(PyFloat_CheckExact(s)) ? PyFloat_Type.tp_repr(s) :\
+        PyObject_Format(s, f))
+#else  /* same expansion as the PyPy variant */
+    #define __Pyx_PyObject_FormatSimple(s, f) (\
+        likely(PyUnicode_CheckExact(s)) ? (Py_INCREF(s), s) :\
+        PyObject_Format(s, f))
+#endif
+
+CYTHON_UNUSED static int __pyx_array_getbuffer(PyObject *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags); /*proto*/
+static PyObject *__pyx_array_get_memview(struct __pyx_array_obj *); /*proto*/
+/* GetAttr.proto (prototype only; defined elsewhere) */
+static CYTHON_INLINE PyObject *__Pyx_GetAttr(PyObject *, PyObject *);
+
+/* GetItemInt.proto: each macro takes the *_Fast path when the index fits in Py_ssize_t; otherwise it sets IndexError and yields NULL, except that the generic macro with is_list false goes through __Pyx_GetItemInt_Generic */
+#define __Pyx_GetItemInt(o, i, type, is_signed, to_py_func, is_list, wraparound, boundscheck)\
+    (__Pyx_fits_Py_ssize_t(i, type, is_signed) ?\
+    __Pyx_GetItemInt_Fast(o, (Py_ssize_t)i, is_list, wraparound, boundscheck) :\
+    (is_list ? (PyErr_SetString(PyExc_IndexError, "list index out of range"), (PyObject*)NULL) :\
+               __Pyx_GetItemInt_Generic(o, to_py_func(i))))
+#define __Pyx_GetItemInt_List(o, i, type, is_signed, to_py_func, is_list, wraparound, boundscheck)\
+    (__Pyx_fits_Py_ssize_t(i, type, is_signed) ?\
+    __Pyx_GetItemInt_List_Fast(o, (Py_ssize_t)i, wraparound, boundscheck) :\
+    (PyErr_SetString(PyExc_IndexError, "list index out of range"), (PyObject*)NULL))
+static CYTHON_INLINE PyObject *__Pyx_GetItemInt_List_Fast(PyObject *o, Py_ssize_t i,
+                                                              int wraparound, int boundscheck);
+#define __Pyx_GetItemInt_Tuple(o, i, type, is_signed, to_py_func, is_list, wraparound, boundscheck)\
+    (__Pyx_fits_Py_ssize_t(i, type, is_signed) ?\
+    __Pyx_GetItemInt_Tuple_Fast(o, (Py_ssize_t)i, wraparound, boundscheck) :\
+    (PyErr_SetString(PyExc_IndexError, "tuple index out of range"), (PyObject*)NULL))
+static CYTHON_INLINE PyObject *__Pyx_GetItemInt_Tuple_Fast(PyObject *o, Py_ssize_t i,
+                                                              int wraparound, int boundscheck);
+static PyObject *__Pyx_GetItemInt_Generic(PyObject *o, PyObject* j);  /* takes the index as a Python object */
+static CYTHON_INLINE PyObject *__Pyx_GetItemInt_Fast(PyObject *o, Py_ssize_t i,
+                                                     int is_list, int wraparound, int boundscheck);
+
+/* PyObjectCallOneArg.proto */
+static CYTHON_INLINE PyObject* __Pyx_PyObject_CallOneArg(PyObject *func, PyObject *arg);
+
+/* ObjectGetItem.proto */
+#if CYTHON_USE_TYPE_SLOTS
+static CYTHON_INLINE PyObject *__Pyx_PyObject_GetItem(PyObject *obj, PyObject *key);
+#else
+#define __Pyx_PyObject_GetItem(obj, key)  PyObject_GetItem(obj, key)
+#endif
+
+/* KeywordStringCheck.proto */
+static int __Pyx_CheckKeywordStrings(PyObject *kw, const char* function_name, int kw_allowed);
+
+/* DivInt[Py_ssize_t].proto */
+static CYTHON_INLINE Py_ssize_t __Pyx_div_Py_ssize_t(Py_ssize_t, Py_ssize_t);
+
+/* UnaryNegOverflows.proto: nonzero when x is negative and its unsigned long cast equals its own negation modulo 2^N. The only negative value with that property has just the top bit of an unsigned long set, which is LONG_MIN when x is a long. The argument x is evaluated three times, so it must be free of side effects. */
+#define __Pyx_UNARY_NEG_WOULD_OVERFLOW(x)\
+        (((x) < 0) & ((unsigned long)(x) == 0-(unsigned long)(x)))
+
+/* GetAttr3.proto */
+static CYTHON_INLINE PyObject *__Pyx_GetAttr3(PyObject *, PyObject *, PyObject *);
+
+/* PyDictVersioning.proto */
+#if CYTHON_USE_DICT_VERSIONS && CYTHON_USE_TYPE_SLOTS
+#define __PYX_DICT_VERSION_INIT  ((PY_UINT64_T) -1)
+#define __PYX_GET_DICT_VERSION(dict)  (((PyDictObject*)(dict))->ma_version_tag)
+#define __PYX_UPDATE_DICT_CACHE(dict, value, cache_var, version_var)\
+    (version_var) = __PYX_GET_DICT_VERSION(dict);\
+    (cache_var) = (value);
+#define __PYX_PY_DICT_LOOKUP_IF_MODIFIED(VAR, DICT, LOOKUP) {\
+    static PY_UINT64_T __pyx_dict_version = 0;\
+    static PyObject *__pyx_dict_cached_value = NULL;\
+    if (likely(__PYX_GET_DICT_VERSION(DICT) == __pyx_dict_version)) {\
+        (VAR) = __pyx_dict_cached_value;\
+    } else {\
+        (VAR) = __pyx_dict_cached_value = (LOOKUP);\
+        __pyx_dict_version = __PYX_GET_DICT_VERSION(DICT);\
+    }\
+}
+static CYTHON_INLINE PY_UINT64_T __Pyx_get_tp_dict_version(PyObject *obj);
+static CYTHON_INLINE PY_UINT64_T __Pyx_get_object_dict_version(PyObject *obj);
+static CYTHON_INLINE int __Pyx_object_dict_version_matches(PyObject* obj, PY_UINT64_T tp_dict_version, PY_UINT64_T obj_dict_version);
+#else
+#define __PYX_GET_DICT_VERSION(dict)  (0)
+#define __PYX_UPDATE_DICT_CACHE(dict, value, cache_var, version_var)
+#define __PYX_PY_DICT_LOOKUP_IF_MODIFIED(VAR, DICT, LOOKUP)  (VAR) = (LOOKUP);
+#endif
+
+/* GetModuleGlobalName.proto */
+#if CYTHON_USE_DICT_VERSIONS
+#define __Pyx_GetModuleGlobalName(var, name)  do {\
+    static PY_UINT64_T __pyx_dict_version = 0;\
+    static PyObject *__pyx_dict_cached_value = NULL;\
+    (var) = (likely(__pyx_dict_version == __PYX_GET_DICT_VERSION(__pyx_d))) ?\
+        (likely(__pyx_dict_cached_value) ? __Pyx_NewRef(__pyx_dict_cached_value) : __Pyx_GetBuiltinName(name)) :\
+        __Pyx__GetModuleGlobalName(name, &__pyx_dict_version, &__pyx_dict_cached_value);\
+} while(0)
+#define __Pyx_GetModuleGlobalNameUncached(var, name)  do {\
+    PY_UINT64_T __pyx_dict_version;\
+    PyObject *__pyx_dict_cached_value;\
+    (var) = __Pyx__GetModuleGlobalName(name, &__pyx_dict_version, &__pyx_dict_cached_value);\
+} while(0)
+static PyObject *__Pyx__GetModuleGlobalName(PyObject *name, PY_UINT64_T *dict_version, PyObject **dict_cached_value);
+#else
+#define __Pyx_GetModuleGlobalName(var, name)  (var) = __Pyx__GetModuleGlobalName(name)
+#define __Pyx_GetModuleGlobalNameUncached(var, name)  (var) = __Pyx__GetModuleGlobalName(name)
+static CYTHON_INLINE PyObject *__Pyx__GetModuleGlobalName(PyObject *name);
+#endif
+
+/* AssertionsEnabled.proto */
+#if CYTHON_COMPILING_IN_PYPY && PY_VERSION_HEX < 0x02070600 && !defined(Py_OptimizeFlag)
+  #define __Pyx_init_assertions_enabled()  (0)
+  #define __pyx_assertions_enabled()  (1)
+#elif CYTHON_COMPILING_IN_LIMITED_API  ||  (CYTHON_COMPILING_IN_CPYTHON && PY_VERSION_HEX >= 0x030C0000)
+  static int __pyx_assertions_enabled_flag;
+  #define __pyx_assertions_enabled() (__pyx_assertions_enabled_flag)
+  static int __Pyx_init_assertions_enabled(void) { /* Stores the truth value of builtins["__debug__"] in __pyx_assertions_enabled_flag; returns 0 on success and -1 on any failure. */
+    PyObject *builtins, *debug, *debug_str;
+    int flag;
+    builtins = PyEval_GetBuiltins(); /* never DECREF'd in this function (the C API documents it as a borrowed reference) */
+    if (!builtins) goto bad;
+    debug_str = PyUnicode_FromStringAndSize("__debug__", 9); /* 9 is the length of "__debug__" */
+    if (!debug_str) goto bad;
+    debug = PyObject_GetItem(builtins, debug_str);
+    Py_DECREF(debug_str); /* the key is released before the lookup result is checked, so neither path leaks it */
+    if (!debug) goto bad;
+    flag = PyObject_IsTrue(debug);
+    Py_DECREF(debug);
+    if (flag == -1) goto bad; /* -1 is the error return of PyObject_IsTrue */
+    __pyx_assertions_enabled_flag = flag;
+    return 0;
+  bad:
+    __pyx_assertions_enabled_flag = 1; /* on failure the flag is left at 1, i.e. assertions stay enabled */
+    return -1;
+  }
+#else
+  #define __Pyx_init_assertions_enabled()  (0)
+  #define __pyx_assertions_enabled()  (!Py_OptimizeFlag)
+#endif
+
+/* RaiseTooManyValuesToUnpack.proto */
+static CYTHON_INLINE void __Pyx_RaiseTooManyValuesError(Py_ssize_t expected);
+
+/* RaiseNeedMoreValuesToUnpack.proto */
+static CYTHON_INLINE void __Pyx_RaiseNeedMoreValuesError(Py_ssize_t index);
+
+/* RaiseNoneIterError.proto */
+static CYTHON_INLINE void __Pyx_RaiseNoneNotIterableError(void);
+
+/* ExtTypeTest.proto */
+static CYTHON_INLINE int __Pyx_TypeTest(PyObject *obj, PyTypeObject *type);
+
+/* GetTopmostException.proto */
+#if CYTHON_USE_EXC_INFO_STACK && CYTHON_FAST_THREAD_STATE
+static _PyErr_StackItem * __Pyx_PyErr_GetTopmostException(PyThreadState *tstate);
+#endif
+
+/* SaveResetException.proto */
+#if CYTHON_FAST_THREAD_STATE
+#define __Pyx_ExceptionSave(type, value, tb)  __Pyx__ExceptionSave(__pyx_tstate, type, value, tb)
+static CYTHON_INLINE void __Pyx__ExceptionSave(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb);
+#define __Pyx_ExceptionReset(type, value, tb)  __Pyx__ExceptionReset(__pyx_tstate, type, value, tb)
+static CYTHON_INLINE void __Pyx__ExceptionReset(PyThreadState *tstate, PyObject *type, PyObject *value, PyObject *tb);
+#else
+#define __Pyx_ExceptionSave(type, value, tb)   PyErr_GetExcInfo(type, value, tb)
+#define __Pyx_ExceptionReset(type, value, tb)  PyErr_SetExcInfo(type, value, tb)
+#endif
+
+/* GetException.proto */
+#if CYTHON_FAST_THREAD_STATE
+#define __Pyx_GetException(type, value, tb)  __Pyx__GetException(__pyx_tstate, type, value, tb)
+static int __Pyx__GetException(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb);
+#else
+static int __Pyx_GetException(PyObject **type, PyObject **value, PyObject **tb);
+#endif
+
+/* SwapException.proto */
+#if CYTHON_FAST_THREAD_STATE
+#define __Pyx_ExceptionSwap(type, value, tb)  __Pyx__ExceptionSwap(__pyx_tstate, type, value, tb)
+static CYTHON_INLINE void __Pyx__ExceptionSwap(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb);
+#else
+static CYTHON_INLINE void __Pyx_ExceptionSwap(PyObject **type, PyObject **value, PyObject **tb);
+#endif
+
+/* Import.proto */
+static PyObject *__Pyx_Import(PyObject *name, PyObject *from_list, int level);
+
+/* ImportDottedModule.proto */
+static PyObject *__Pyx_ImportDottedModule(PyObject *name, PyObject *parts_tuple);
+#if PY_MAJOR_VERSION >= 3
+static PyObject *__Pyx_ImportDottedModule_WalkParts(PyObject *module, PyObject *name, PyObject *parts_tuple);
+#endif
+
+/* FastTypeChecks.proto */
+#if CYTHON_COMPILING_IN_CPYTHON
+#define __Pyx_TypeCheck(obj, type) __Pyx_IsSubtype(Py_TYPE(obj), (PyTypeObject *)type)
+#define __Pyx_TypeCheck2(obj, type1, type2) __Pyx_IsAnySubtype2(Py_TYPE(obj), (PyTypeObject *)type1, (PyTypeObject *)type2)
+static CYTHON_INLINE int __Pyx_IsSubtype(PyTypeObject *a, PyTypeObject *b);
+static CYTHON_INLINE int __Pyx_IsAnySubtype2(PyTypeObject *cls, PyTypeObject *a, PyTypeObject *b);
+static CYTHON_INLINE int __Pyx_PyErr_GivenExceptionMatches(PyObject *err, PyObject *type);
+static CYTHON_INLINE int __Pyx_PyErr_GivenExceptionMatches2(PyObject *err, PyObject *type1, PyObject *type2);
+#else
+#define __Pyx_TypeCheck(obj, type) PyObject_TypeCheck(obj, (PyTypeObject *)type)
+#define __Pyx_TypeCheck2(obj, type1, type2) (PyObject_TypeCheck(obj, (PyTypeObject *)type1) || PyObject_TypeCheck(obj, (PyTypeObject *)type2))
+#define __Pyx_PyErr_GivenExceptionMatches(err, type) PyErr_GivenExceptionMatches(err, type)
+#define __Pyx_PyErr_GivenExceptionMatches2(err, type1, type2) (PyErr_GivenExceptionMatches(err, type1) || PyErr_GivenExceptionMatches(err, type2))
+#endif
+#define __Pyx_PyErr_ExceptionMatches2(err1, err2)  __Pyx_PyErr_GivenExceptionMatches2(__Pyx_PyErr_CurrentExceptionType(), err1, err2)
+#define __Pyx_PyException_Check(obj) __Pyx_TypeCheck(obj, PyExc_Exception)
+
+CYTHON_UNUSED static int __pyx_memoryview_getbuffer(PyObject *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags); /*proto*/
+/* ListCompAppend.proto */
+#if CYTHON_USE_PYLIST_INTERNALS && CYTHON_ASSUME_SAFE_MACROS
+static CYTHON_INLINE int __Pyx_ListComp_Append(PyObject* list, PyObject* x) { /* Appends x to list, writing straight into the item array when a spare slot exists; returns 0 from the fast path, otherwise whatever PyList_Append returns. */
+    PyListObject* L = (PyListObject*) list;
+    Py_ssize_t len = Py_SIZE(list);
+    if (likely(L->allocated > len)) { /* at least one allocated slot is still unused */
+        Py_INCREF(x); /* the reference taken here is the one stored in the slot below */
+        #if CYTHON_COMPILING_IN_CPYTHON && PY_VERSION_HEX >= 0x030d0000
+        L->ob_item[len] = x;
+        #else
+        PyList_SET_ITEM(list, len, x);
+        #endif
+        __Pyx_SET_SIZE(list, len + 1); /* size is bumped only after the slot has been filled */
+        return 0;
+    }
+    return PyList_Append(list, x); /* no spare capacity: defer to the regular API */
+}
+#else
+#define __Pyx_ListComp_Append(L,x) PyList_Append(L,x)
+#endif
+
+/* PySequenceMultiply.proto */
+#define __Pyx_PySequence_Multiply_Left(mul, seq)  __Pyx_PySequence_Multiply(seq, mul)
+static CYTHON_INLINE PyObject* __Pyx_PySequence_Multiply(PyObject *seq, Py_ssize_t mul);
+
+/* SetItemInt.proto */
+#define __Pyx_SetItemInt(o, i, v, type, is_signed, to_py_func, is_list, wraparound, boundscheck)\
+    (__Pyx_fits_Py_ssize_t(i, type, is_signed) ?\
+    __Pyx_SetItemInt_Fast(o, (Py_ssize_t)i, v, is_list, wraparound, boundscheck) :\
+    (is_list ? (PyErr_SetString(PyExc_IndexError, "list assignment index out of range"), -1) :\
+               __Pyx_SetItemInt_Generic(o, to_py_func(i), v)))
+static int __Pyx_SetItemInt_Generic(PyObject *o, PyObject *j, PyObject *v);
+static CYTHON_INLINE int __Pyx_SetItemInt_Fast(PyObject *o, Py_ssize_t i, PyObject *v,
+                                               int is_list, int wraparound, int boundscheck);
+
+/* RaiseUnboundLocalError.proto */
+static CYTHON_INLINE void __Pyx_RaiseUnboundLocalError(const char *varname);
+
+/* DivInt[long].proto */
+static CYTHON_INLINE long __Pyx_div_long(long, long);
+
+/* PySequenceContains.proto */
+static CYTHON_INLINE int __Pyx_PySequence_ContainsTF(PyObject* item, PyObject* seq, int eq) { /* Containment test with selectable polarity: eq == Py_EQ gives the PySequence_Contains result, any other eq gives its logical inverse. */
+    int result = PySequence_Contains(seq, item); /* note the argument order: sequence first, then item */
+    return unlikely(result < 0) ? result : (result == (eq == Py_EQ)); /* a negative (error) result is passed through unchanged */
+}
+
+/* ImportFrom.proto */
+static PyObject* __Pyx_ImportFrom(PyObject* module, PyObject* name);
+
+/* HasAttr.proto */
+static CYTHON_INLINE int __Pyx_HasAttr(PyObject *, PyObject *);
+
+/* IsLittleEndian.proto */
+static CYTHON_INLINE int __Pyx_Is_Little_Endian(void);
+
+/* BufferFormatCheck.proto */
+static const char* __Pyx_BufFmt_CheckString(__Pyx_BufFmt_Context* ctx, const char* ts);
+static void __Pyx_BufFmt_Init(__Pyx_BufFmt_Context* ctx,
+                              __Pyx_BufFmt_StackElem* stack,
+                              __Pyx_TypeInfo* type);
+
+/* BufferGetAndValidate.proto */
+#define __Pyx_GetBufferAndValidate(buf, obj, dtype, flags, nd, cast, stack)\
+    ((obj == Py_None || obj == NULL) ?\
+    (__Pyx_ZeroBuffer(buf), 0) :\
+    __Pyx__GetBufferAndValidate(buf, obj, dtype, flags, nd, cast, stack))
+static int  __Pyx__GetBufferAndValidate(Py_buffer* buf, PyObject* obj,
+    __Pyx_TypeInfo* dtype, int flags, int nd, int cast, __Pyx_BufFmt_StackElem* stack);
+static void __Pyx_ZeroBuffer(Py_buffer* buf);
+static CYTHON_INLINE void __Pyx_SafeReleaseBuffer(Py_buffer* info);
+static Py_ssize_t __Pyx_minusones[] = { -1, -1, -1, -1, -1, -1, -1, -1 };
+static Py_ssize_t __Pyx_zeros[] = { 0, 0, 0, 0, 0, 0, 0, 0 };
+
+/* BufferIndexError.proto */
+static void __Pyx_RaiseBufferIndexError(int axis);
+
+/* ListAppend.proto */
+#if CYTHON_USE_PYLIST_INTERNALS && CYTHON_ASSUME_SAFE_MACROS
+static CYTHON_INLINE int __Pyx_PyList_Append(PyObject* list, PyObject* x) { /* Appends x to list; the in-place fast path returns 0, every other case returns whatever PyList_Append returns. */
+    PyListObject* L = (PyListObject*) list;
+    Py_ssize_t len = Py_SIZE(list);
+    if (likely(L->allocated > len) & likely(len > (L->allocated >> 1))) { /* fast path needs a spare slot AND a list more than half full; both tests are always evaluated because & is bitwise. NOTE(review): the half-full test presumably leaves resize decisions to PyList_Append -- confirm */
+        Py_INCREF(x); /* the reference taken here is the one stored in the slot below */
+        #if CYTHON_COMPILING_IN_CPYTHON && PY_VERSION_HEX >= 0x030d0000
+        L->ob_item[len] = x;
+        #else
+        PyList_SET_ITEM(list, len, x);
+        #endif
+        __Pyx_SET_SIZE(list, len + 1); /* size is bumped only after the slot has been filled */
+        return 0;
+    }
+    return PyList_Append(list, x); /* slow path: the regular API handles everything else */
+}
+#else
+#define __Pyx_PyList_Append(L,x) PyList_Append(L,x)
+#endif
+
+/* SliceTupleAndList.proto */
+#if CYTHON_COMPILING_IN_CPYTHON
+static CYTHON_INLINE PyObject* __Pyx_PyList_GetSlice(PyObject* src, Py_ssize_t start, Py_ssize_t stop);
+static CYTHON_INLINE PyObject* __Pyx_PyTuple_GetSlice(PyObject* src, Py_ssize_t start, Py_ssize_t stop);
+#else
+#define __Pyx_PyList_GetSlice(seq, start, stop)   PySequence_GetSlice(seq, start, stop)
+#define __Pyx_PyTuple_GetSlice(seq, start, stop)  PySequence_GetSlice(seq, start, stop)
+#endif
+
+/* PyIntCompare.proto */
+static CYTHON_INLINE int __Pyx_PyInt_BoolEqObjC(PyObject *op1, PyObject *op2, long intval, long inplace);
+
+/* PyObject_GenericGetAttrNoDict.proto */
+#if CYTHON_USE_TYPE_SLOTS && CYTHON_USE_PYTYPE_LOOKUP && PY_VERSION_HEX < 0x03070000
+static CYTHON_INLINE PyObject* __Pyx_PyObject_GenericGetAttrNoDict(PyObject* obj, PyObject* attr_name);
+#else
+#define __Pyx_PyObject_GenericGetAttrNoDict PyObject_GenericGetAttr
+#endif
+
+/* PyObject_GenericGetAttr.proto */
+#if CYTHON_USE_TYPE_SLOTS && CYTHON_USE_PYTYPE_LOOKUP && PY_VERSION_HEX < 0x03070000
+static PyObject* __Pyx_PyObject_GenericGetAttr(PyObject* obj, PyObject* attr_name);
+#else
+#define __Pyx_PyObject_GenericGetAttr PyObject_GenericGetAttr
+#endif
+
+/* IncludeStructmemberH.proto */
+#include <structmember.h>
+
+/* FixUpExtensionType.proto */
+#if CYTHON_USE_TYPE_SPECS
+static int __Pyx_fix_up_extension_type_from_spec(PyType_Spec *spec, PyTypeObject *type);
+#endif
+
+/* PyObjectCallNoArg.proto */
+static CYTHON_INLINE PyObject* __Pyx_PyObject_CallNoArg(PyObject *func);
+
+/* PyObjectGetMethod.proto */
+static int __Pyx_PyObject_GetMethod(PyObject *obj, PyObject *name, PyObject **method);
+
+/* PyObjectCallMethod0.proto */
+static PyObject* __Pyx_PyObject_CallMethod0(PyObject* obj, PyObject* method_name);
+
+/* ValidateBasesTuple.proto */
+#if CYTHON_COMPILING_IN_CPYTHON || CYTHON_COMPILING_IN_LIMITED_API || CYTHON_USE_TYPE_SPECS
+static int __Pyx_validate_bases_tuple(const char *type_name, Py_ssize_t dictoffset, PyObject *bases);
+#endif
+
+/* PyType_Ready.proto */
+CYTHON_UNUSED static int __Pyx_PyType_Ready(PyTypeObject *t);
+
+/* SetVTable.proto */
+static int __Pyx_SetVtable(PyTypeObject* typeptr , void* vtable);
+
+/* GetVTable.proto */
+static void* __Pyx_GetVtable(PyTypeObject *type);
+
+/* MergeVTables.proto */
+#if !CYTHON_COMPILING_IN_LIMITED_API
+static int __Pyx_MergeVtables(PyTypeObject *type);
+#endif
+
+/* SetupReduce.proto */
+#if !CYTHON_COMPILING_IN_LIMITED_API
+static int __Pyx_setup_reduce(PyObject* type_obj);
+#endif
+
+/* TypeImport.proto */
+#ifndef __PYX_HAVE_RT_ImportType_proto_3_0_12
+#define __PYX_HAVE_RT_ImportType_proto_3_0_12
+#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 201112L
+#include <stdalign.h>
+#endif
+#if (defined (__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) || __cplusplus >= 201103L
+#define __PYX_GET_STRUCT_ALIGNMENT_3_0_12(s) alignof(s)
+#else
+#define __PYX_GET_STRUCT_ALIGNMENT_3_0_12(s) sizeof(void*)
+#endif
+enum __Pyx_ImportType_CheckSize_3_0_12 {
+   __Pyx_ImportType_CheckSize_Error_3_0_12 = 0,
+   __Pyx_ImportType_CheckSize_Warn_3_0_12 = 1,
+   __Pyx_ImportType_CheckSize_Ignore_3_0_12 = 2
+};
+static PyTypeObject *__Pyx_ImportType_3_0_12(PyObject* module, const char *module_name, const char *class_name, size_t size, size_t alignment, enum __Pyx_ImportType_CheckSize_3_0_12 check_size);
+#endif
+
+/* FetchSharedCythonModule.proto */
+static PyObject *__Pyx_FetchSharedCythonABIModule(void);
+
+/* FetchCommonType.proto */
+#if !CYTHON_USE_TYPE_SPECS
+static PyTypeObject* __Pyx_FetchCommonType(PyTypeObject* type);
+#else
+static PyTypeObject* __Pyx_FetchCommonTypeFromSpec(PyObject *module, PyType_Spec *spec, PyObject *bases);
+#endif
+
+/* PyMethodNew.proto */
+#if CYTHON_COMPILING_IN_LIMITED_API
+static PyObject *__Pyx_PyMethod_New(PyObject *func, PyObject *self, PyObject *typ) { /* Limited-API variant: creates the method object by calling types.MethodType(func, self); returns NULL if the import, attribute lookup or call fails. */
+    PyObject *typesModule=NULL, *methodType=NULL, *result=NULL;
+    CYTHON_UNUSED_VAR(typ); /* typ is accepted for signature compatibility but never read */
+    if (!self)
+        return __Pyx_NewRef(func); /* nothing to bind to: func itself is returned via __Pyx_NewRef (defined elsewhere) */
+    typesModule = PyImport_ImportModule("types");
+    if (!typesModule) return NULL;
+    methodType = PyObject_GetAttrString(typesModule, "MethodType");
+    Py_DECREF(typesModule); /* the module is released as soon as the attribute has been fetched */
+    if (!methodType) return NULL;
+    result = PyObject_CallFunctionObjArgs(methodType, func, self, NULL); /* the trailing NULL terminates the argument list */
+    Py_DECREF(methodType);
+    return result; /* may be NULL if the call failed */
+}
+#elif PY_MAJOR_VERSION >= 3
+static PyObject *__Pyx_PyMethod_New(PyObject *func, PyObject *self, PyObject *typ) { /* Python 3 variant: wraps PyMethod_New(func, self), which takes no class argument. */
+    CYTHON_UNUSED_VAR(typ); /* typ is accepted for signature compatibility but never read */
+    if (!self)
+        return __Pyx_NewRef(func); /* nothing to bind to: func itself is returned via __Pyx_NewRef (defined elsewhere) */
+    return PyMethod_New(func, self);
+}
+#else
+    #define __Pyx_PyMethod_New PyMethod_New
+#endif
+
+/* PyVectorcallFastCallDict.proto */
+#if CYTHON_METH_FASTCALL
+static CYTHON_INLINE PyObject *__Pyx_PyVectorcall_FastCallDict(PyObject *func, __pyx_vectorcallfunc vc, PyObject *const *args, size_t nargs, PyObject *kw);
+#endif
+
+/* CythonFunctionShared.proto */
+#define __Pyx_CyFunction_USED
+#define __Pyx_CYFUNCTION_STATICMETHOD  0x01
+#define __Pyx_CYFUNCTION_CLASSMETHOD   0x02
+#define __Pyx_CYFUNCTION_CCLASS        0x04
+#define __Pyx_CYFUNCTION_COROUTINE     0x08
+#define __Pyx_CyFunction_GetClosure(f)\
+    (((__pyx_CyFunctionObject *) (f))->func_closure)
+#if PY_VERSION_HEX < 0x030900B1 || CYTHON_COMPILING_IN_LIMITED_API
+  #define __Pyx_CyFunction_GetClassObj(f)\
+      (((__pyx_CyFunctionObject *) (f))->func_classobj)
+#else
+  #define __Pyx_CyFunction_GetClassObj(f)\
+      ((PyObject*) ((PyCMethodObject *) (f))->mm_class)
+#endif
+#define __Pyx_CyFunction_SetClassObj(f, classobj)\
+    __Pyx__CyFunction_SetClassObj((__pyx_CyFunctionObject *) (f), (classobj))
+#define __Pyx_CyFunction_Defaults(type, f)\
+    ((type *)(((__pyx_CyFunctionObject *) (f))->defaults))
+#define __Pyx_CyFunction_SetDefaultsGetter(f, g)\
+    ((__pyx_CyFunctionObject *) (f))->defaults_getter = (g)
+typedef struct { /* Object layout behind the __Pyx_CyFunction_* accessor macros, which cast their argument to this type; the leading member and several fields depend on the build configuration. */
+#if CYTHON_COMPILING_IN_LIMITED_API
+    PyObject_HEAD
+    PyObject *func; /* limited API: a plain object pointer instead of an embedded C function struct */
+#elif PY_VERSION_HEX < 0x030900B1
+    PyCFunctionObject func;
+#else
+    PyCMethodObject func; /* in this configuration __Pyx_CyFunction_GetClassObj reads mm_class through a PyCMethodObject cast */
+#endif
+#if CYTHON_BACKPORT_VECTORCALL
+    __pyx_vectorcallfunc func_vectorcall; /* only present when the vectorcall backport is enabled */
+#endif
+#if PY_VERSION_HEX < 0x030500A0 || CYTHON_COMPILING_IN_LIMITED_API
+    PyObject *func_weakreflist;
+#endif
+    PyObject *func_dict;
+    PyObject *func_name;
+    PyObject *func_qualname;
+    PyObject *func_doc;
+    PyObject *func_globals;
+    PyObject *func_code;
+    PyObject *func_closure; /* read by __Pyx_CyFunction_GetClosure */
+#if PY_VERSION_HEX < 0x030900B1 || CYTHON_COMPILING_IN_LIMITED_API
+    PyObject *func_classobj; /* read by __Pyx_CyFunction_GetClassObj under this same condition */
+#endif
+    void *defaults; /* cast to the caller-supplied type by __Pyx_CyFunction_Defaults */
+    int defaults_pyobjects;
+    size_t defaults_size;
+    int flags; /* presumably a combination of the __Pyx_CYFUNCTION_* bit values defined above -- confirm */
+    PyObject *defaults_tuple;
+    PyObject *defaults_kwdict;
+    PyObject *(*defaults_getter)(PyObject *); /* assigned by __Pyx_CyFunction_SetDefaultsGetter */
+    PyObject *func_annotations;
+    PyObject *func_is_coroutine;
+} __pyx_CyFunctionObject;
+#undef __Pyx_CyOrPyCFunction_Check
+#define __Pyx_CyFunction_Check(obj)  __Pyx_TypeCheck(obj, __pyx_CyFunctionType)
+#define __Pyx_CyOrPyCFunction_Check(obj)  __Pyx_TypeCheck2(obj, __pyx_CyFunctionType, &PyCFunction_Type)
+#define __Pyx_CyFunction_CheckExact(obj)  __Pyx_IS_TYPE(obj, __pyx_CyFunctionType)
+static CYTHON_INLINE int __Pyx__IsSameCyOrCFunction(PyObject *func, void *cfunc);
+#undef __Pyx_IsSameCFunction
+#define __Pyx_IsSameCFunction(func, cfunc)   __Pyx__IsSameCyOrCFunction(func, cfunc)
+static PyObject *__Pyx_CyFunction_Init(__pyx_CyFunctionObject* op, PyMethodDef *ml,
+                                      int flags, PyObject* qualname,
+                                      PyObject *closure,
+                                      PyObject *module, PyObject *globals,
+                                      PyObject* code);
+static CYTHON_INLINE void __Pyx__CyFunction_SetClassObj(__pyx_CyFunctionObject* f, PyObject* classobj);
+static CYTHON_INLINE void *__Pyx_CyFunction_InitDefaults(PyObject *m,
+                                                         size_t size,
+                                                         int pyobjects);
+static CYTHON_INLINE void __Pyx_CyFunction_SetDefaultsTuple(PyObject *m,
+                                                            PyObject *tuple);
+static CYTHON_INLINE void __Pyx_CyFunction_SetDefaultsKwDict(PyObject *m,
+                                                             PyObject *dict);
+static CYTHON_INLINE void __Pyx_CyFunction_SetAnnotationsDict(PyObject *m,
+                                                              PyObject *dict);
+static int __pyx_CyFunction_init(PyObject *module);
+#if CYTHON_METH_FASTCALL
+static PyObject * __Pyx_CyFunction_Vectorcall_NOARGS(PyObject *func, PyObject *const *args, size_t nargsf, PyObject *kwnames);
+static PyObject * __Pyx_CyFunction_Vectorcall_O(PyObject *func, PyObject *const *args, size_t nargsf, PyObject *kwnames);
+static PyObject * __Pyx_CyFunction_Vectorcall_FASTCALL_KEYWORDS(PyObject *func, PyObject *const *args, size_t nargsf, PyObject *kwnames);
+static PyObject * __Pyx_CyFunction_Vectorcall_FASTCALL_KEYWORDS_METHOD(PyObject *func, PyObject *const *args, size_t nargsf, PyObject *kwnames);
+#if CYTHON_BACKPORT_VECTORCALL
+#define __Pyx_CyFunction_func_vectorcall(f) (((__pyx_CyFunctionObject*)f)->func_vectorcall)
+#else
+#define __Pyx_CyFunction_func_vectorcall(f) (((PyCFunctionObject*)f)->vectorcall)
+#endif
+#endif
+
+/* CythonFunction.proto */
+static PyObject *__Pyx_CyFunction_New(PyMethodDef *ml,
+                                      int flags, PyObject* qualname,
+                                      PyObject *closure,
+                                      PyObject *module, PyObject *globals,
+                                      PyObject* code);
+
+/* CLineInTraceback.proto */
+#ifdef CYTHON_CLINE_IN_TRACEBACK
+#define __Pyx_CLineForTraceback(tstate, c_line)  (((CYTHON_CLINE_IN_TRACEBACK)) ? c_line : 0)
+#else
+static int __Pyx_CLineForTraceback(PyThreadState *tstate, int c_line);
+#endif
+
+/* CodeObjectCache.proto */
+#if !CYTHON_COMPILING_IN_LIMITED_API
+typedef struct { /* One cache entry: a code object paired with the integer code_line it is stored under. */
+    PyCodeObject* code_object;
+    int code_line;
+} __Pyx_CodeObjectCacheEntry;
+struct __Pyx_CodeObjectCache { /* An entries array plus two counters; the single instance __pyx_code_cache starts out as {0,0,NULL}. */
+    int count; /* presumably the number of entries in use -- confirm against __pyx_insert_code_object */
+    int max_count; /* presumably the allocated capacity of entries -- confirm */
+    __Pyx_CodeObjectCacheEntry* entries;
+};
+static struct __Pyx_CodeObjectCache __pyx_code_cache = {0,0,NULL};
+static int __pyx_bisect_code_objects(__Pyx_CodeObjectCacheEntry* entries, int count, int code_line);
+static PyCodeObject *__pyx_find_code_object(int code_line);
+static void __pyx_insert_code_object(int code_line, PyCodeObject* code_object);
+#endif
+
+/* AddTraceback.proto */
+static void __Pyx_AddTraceback(const char *funcname, int c_line,
+                               int py_line, const char *filename);
+
+#if PY_MAJOR_VERSION < 3
+    static int __Pyx_GetBuffer(PyObject *obj, Py_buffer *view, int flags);
+    static void __Pyx_ReleaseBuffer(Py_buffer *view);
+#else
+    #define __Pyx_GetBuffer PyObject_GetBuffer
+    #define __Pyx_ReleaseBuffer PyBuffer_Release
+#endif
+
+
+/* BufferStructDeclare.proto */
+typedef struct { /* Shape, stride and suboffset values for a single dimension. */
+  Py_ssize_t shape, strides, suboffsets;
+} __Pyx_Buf_DimInfo;
+typedef struct { /* A Py_buffer stored together with a refcount field. */
+  size_t refcount;
+  Py_buffer pybuffer;
+} __Pyx_Buffer;
+typedef struct { /* Local buffer record: pointer to a __Pyx_Buffer, a data pointer, and a fixed array of 8 per-dimension records. */
+  __Pyx_Buffer *rcbuffer;
+  char *data;
+  __Pyx_Buf_DimInfo diminfo[8]; /* fixed capacity of 8 dimensions */
+} __Pyx_LocalBuf_ND;
+
+/* MemviewSliceIsContig.proto */
+static int __pyx_memviewslice_is_contig(const __Pyx_memviewslice mvs, char order, int ndim);
+
+/* OverlappingSlices.proto */
+static int __pyx_slices_overlap(__Pyx_memviewslice *slice1,
+                                __Pyx_memviewslice *slice2,
+                                int ndim, size_t itemsize);
+
+/* TypeInfoCompare.proto */
+static int __pyx_typeinfo_cmp(__Pyx_TypeInfo *a, __Pyx_TypeInfo *b);
+
+/* MemviewSliceValidateAndInit.proto */
+static int __Pyx_ValidateAndInit_memviewslice(
+                int *axes_specs,
+                int c_or_f_flag,
+                int buf_flags,
+                int ndim,
+                __Pyx_TypeInfo *dtype,
+                __Pyx_BufFmt_StackElem stack[],
+                __Pyx_memviewslice *memviewslice,
+                PyObject *original_obj);
+
+/* ObjectToMemviewSlice.proto */
+static CYTHON_INLINE __Pyx_memviewslice __Pyx_PyObject_to_MemoryviewSlice_ds_nn___pyx_t_7fairseq_4data_15data_utils_fast_DTYPE_t(PyObject *, int writable_flag);
+
+/* MemviewDtypeToObject.proto */
+static CYTHON_INLINE PyObject *__pyx_memview_get_nn___pyx_t_7fairseq_4data_15data_utils_fast_DTYPE_t(const char *itemp);
+static CYTHON_INLINE int __pyx_memview_set_nn___pyx_t_7fairseq_4data_15data_utils_fast_DTYPE_t(const char *itemp, PyObject *obj);
+
+/* ObjectToMemviewSlice.proto */
+static CYTHON_INLINE __Pyx_memviewslice __Pyx_PyObject_to_MemoryviewSlice_dsds_nn___pyx_t_7fairseq_4data_15data_utils_fast_DTYPE_t(PyObject *, int writable_flag);
+
+/* RealImag.proto */
+#if CYTHON_CCOMPLEX
+  #ifdef __cplusplus
+    #define __Pyx_CREAL(z) ((z).real())
+    #define __Pyx_CIMAG(z) ((z).imag())
+  #else
+    #define __Pyx_CREAL(z) (__real__(z))
+    #define __Pyx_CIMAG(z) (__imag__(z))
+  #endif
+#else
+    #define __Pyx_CREAL(z) ((z).real)
+    #define __Pyx_CIMAG(z) ((z).imag)
+#endif
+#if defined(__cplusplus) && CYTHON_CCOMPLEX\
+        && (defined(_WIN32) || defined(__clang__) || (defined(__GNUC__) && (__GNUC__ >= 5 || __GNUC__ == 4 && __GNUC_MINOR__ >= 4 )) || __cplusplus >= 201103)
+    #define __Pyx_SET_CREAL(z,x) ((z).real(x))
+    #define __Pyx_SET_CIMAG(z,y) ((z).imag(y))
+#else
+    #define __Pyx_SET_CREAL(z,x) __Pyx_CREAL(z) = (x)
+    #define __Pyx_SET_CIMAG(z,y) __Pyx_CIMAG(z) = (y)
+#endif
+
+/* Arithmetic.proto */
+#if CYTHON_CCOMPLEX && (1) && (!0 || __cplusplus)
+    #define __Pyx_c_eq_float(a, b)   ((a)==(b))
+    #define __Pyx_c_sum_float(a, b)  ((a)+(b))
+    #define __Pyx_c_diff_float(a, b) ((a)-(b))
+    #define __Pyx_c_prod_float(a, b) ((a)*(b))
+    #define __Pyx_c_quot_float(a, b) ((a)/(b))
+    #define __Pyx_c_neg_float(a)     (-(a))
+  #ifdef __cplusplus
+    #define __Pyx_c_is_zero_float(z) ((z)==(float)0)
+    #define __Pyx_c_conj_float(z)    (::std::conj(z))
+    #if 1
+        #define __Pyx_c_abs_float(z)     (::std::abs(z))
+        #define __Pyx_c_pow_float(a, b)  (::std::pow(a, b))
+    #endif
+  #else
+    #define __Pyx_c_is_zero_float(z) ((z)==0)
+    #define __Pyx_c_conj_float(z)    (conjf(z))
+    #if 1
+        #define __Pyx_c_abs_float(z)     (cabsf(z))
+        #define __Pyx_c_pow_float(a, b)  (cpowf(a, b))
+    #endif
+ #endif
+#else
+    static CYTHON_INLINE int __Pyx_c_eq_float(__pyx_t_float_complex, __pyx_t_float_complex);
+    static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_sum_float(__pyx_t_float_complex, __pyx_t_float_complex);
+    static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_diff_float(__pyx_t_float_complex, __pyx_t_float_complex);
+    static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_prod_float(__pyx_t_float_complex, __pyx_t_float_complex);
+    static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_quot_float(__pyx_t_float_complex, __pyx_t_float_complex);
+    static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_neg_float(__pyx_t_float_complex);
+    static CYTHON_INLINE int __Pyx_c_is_zero_float(__pyx_t_float_complex);
+    static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_conj_float(__pyx_t_float_complex);
+    #if 1
+        static CYTHON_INLINE float __Pyx_c_abs_float(__pyx_t_float_complex);
+        static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_pow_float(__pyx_t_float_complex, __pyx_t_float_complex);
+    #endif
+#endif
+
+/* Arithmetic.proto */
+#if CYTHON_CCOMPLEX && (1) && (!0 || __cplusplus)
+    #define __Pyx_c_eq_double(a, b)   ((a)==(b))
+    #define __Pyx_c_sum_double(a, b)  ((a)+(b))
+    #define __Pyx_c_diff_double(a, b) ((a)-(b))
+    #define __Pyx_c_prod_double(a, b) ((a)*(b))
+    #define __Pyx_c_quot_double(a, b) ((a)/(b))
+    #define __Pyx_c_neg_double(a)     (-(a))
+  #ifdef __cplusplus
+    #define __Pyx_c_is_zero_double(z) ((z)==(double)0)
+    #define __Pyx_c_conj_double(z)    (::std::conj(z))
+    #if 1
+        #define __Pyx_c_abs_double(z)     (::std::abs(z))
+        #define __Pyx_c_pow_double(a, b)  (::std::pow(a, b))
+    #endif
+  #else
+    #define __Pyx_c_is_zero_double(z) ((z)==0)
+    #define __Pyx_c_conj_double(z)    (conj(z))
+    #if 1
+        #define __Pyx_c_abs_double(z)     (cabs(z))
+        #define __Pyx_c_pow_double(a, b)  (cpow(a, b))
+    #endif
+ #endif
+#else
+    static CYTHON_INLINE int __Pyx_c_eq_double(__pyx_t_double_complex, __pyx_t_double_complex);
+    static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_sum_double(__pyx_t_double_complex, __pyx_t_double_complex);
+    static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_diff_double(__pyx_t_double_complex, __pyx_t_double_complex);
+    static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_prod_double(__pyx_t_double_complex, __pyx_t_double_complex);
+    static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_quot_double(__pyx_t_double_complex, __pyx_t_double_complex);
+    static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_neg_double(__pyx_t_double_complex);
+    static CYTHON_INLINE int __Pyx_c_is_zero_double(__pyx_t_double_complex);
+    static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_conj_double(__pyx_t_double_complex);
+    #if 1
+        static CYTHON_INLINE double __Pyx_c_abs_double(__pyx_t_double_complex);
+        static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_pow_double(__pyx_t_double_complex, __pyx_t_double_complex);
+    #endif
+#endif
+
+/* Arithmetic.proto */
+#if CYTHON_CCOMPLEX && (1) && (!0 || __cplusplus) /* macro branch: long double complex ops expand to operators / library calls */
+    #define __Pyx_c_eq_long__double(a, b)   ((a)==(b))
+    #define __Pyx_c_sum_long__double(a, b)  ((a)+(b))
+    #define __Pyx_c_diff_long__double(a, b) ((a)-(b))
+    #define __Pyx_c_prod_long__double(a, b) ((a)*(b))
+    #define __Pyx_c_quot_long__double(a, b) ((a)/(b))
+    #define __Pyx_c_neg_long__double(a)     (-(a))
+  #ifdef __cplusplus /* C++: use the ::std:: overloads (conj/abs/pow) */
+    #define __Pyx_c_is_zero_long__double(z) ((z)==(long double)0)
+    #define __Pyx_c_conj_long__double(z)    (::std::conj(z))
+    #if 1
+        #define __Pyx_c_abs_long__double(z)     (::std::abs(z))
+        #define __Pyx_c_pow_long__double(a, b)  (::std::pow(a, b))
+    #endif
+  #else /* C: use the long double variants conjl/cabsl/cpowl */
+    #define __Pyx_c_is_zero_long__double(z) ((z)==0)
+    #define __Pyx_c_conj_long__double(z)    (conjl(z))
+    #if 1
+        #define __Pyx_c_abs_long__double(z)     (cabsl(z))
+        #define __Pyx_c_pow_long__double(a, b)  (cpowl(a, b))
+    #endif
+ #endif /* __cplusplus */
+#else /* fallback branch: the same operation names are declared as inline functions on __pyx_t_long_double_complex */
+    static CYTHON_INLINE int __Pyx_c_eq_long__double(__pyx_t_long_double_complex, __pyx_t_long_double_complex);
+    static CYTHON_INLINE __pyx_t_long_double_complex __Pyx_c_sum_long__double(__pyx_t_long_double_complex, __pyx_t_long_double_complex);
+    static CYTHON_INLINE __pyx_t_long_double_complex __Pyx_c_diff_long__double(__pyx_t_long_double_complex, __pyx_t_long_double_complex);
+    static CYTHON_INLINE __pyx_t_long_double_complex __Pyx_c_prod_long__double(__pyx_t_long_double_complex, __pyx_t_long_double_complex);
+    static CYTHON_INLINE __pyx_t_long_double_complex __Pyx_c_quot_long__double(__pyx_t_long_double_complex, __pyx_t_long_double_complex);
+    static CYTHON_INLINE __pyx_t_long_double_complex __Pyx_c_neg_long__double(__pyx_t_long_double_complex);
+    static CYTHON_INLINE int __Pyx_c_is_zero_long__double(__pyx_t_long_double_complex);
+    static CYTHON_INLINE __pyx_t_long_double_complex __Pyx_c_conj_long__double(__pyx_t_long_double_complex);
+    #if 1
+        static CYTHON_INLINE long double __Pyx_c_abs_long__double(__pyx_t_long_double_complex);
+        static CYTHON_INLINE __pyx_t_long_double_complex __Pyx_c_pow_long__double(__pyx_t_long_double_complex, __pyx_t_long_double_complex);
+    #endif
+#endif /* CYTHON_CCOMPLEX && (1) && (!0 || __cplusplus) */
+
+/* MemviewSliceCopyTemplate.proto */
+static __Pyx_memviewslice
+__pyx_memoryview_copy_new_contig(const __Pyx_memviewslice *from_mvs,
+                                 const char *mode, int ndim,
+                                 size_t sizeof_dtype, int contig_flag,
+                                 int dtype_is_object); /* prototype only: returns a __Pyx_memviewslice by value; the source slice is taken by const pointer */
+
+/* MemviewSliceInit.proto */
+#define __Pyx_BUF_MAX_NDIMS %(BUF_MAX_NDIMS)d /* NOTE(review): value is an unsubstituted "%(...)d" template placeholder as emitted; the macro is not used anywhere in this section — confirm it is unused elsewhere */
+#define __Pyx_MEMVIEW_DIRECT   1 /* the six __Pyx_MEMVIEW_* values are distinct powers of two (1..32) */
+#define __Pyx_MEMVIEW_PTR      2
+#define __Pyx_MEMVIEW_FULL     4
+#define __Pyx_MEMVIEW_CONTIG   8
+#define __Pyx_MEMVIEW_STRIDED  16
+#define __Pyx_MEMVIEW_FOLLOW   32
+#define __Pyx_IS_C_CONTIG 1
+#define __Pyx_IS_F_CONTIG 2
+static int __Pyx_init_memviewslice(
+                struct __pyx_memoryview_obj *memview,
+                int ndim,
+                __Pyx_memviewslice *memviewslice,
+                int memview_is_new_reference);
+static CYTHON_INLINE int __pyx_add_acquisition_count_locked(
+    __pyx_atomic_int_type *acquisition_count, PyThread_type_lock lock);
+static CYTHON_INLINE int __pyx_sub_acquisition_count_locked(
+    __pyx_atomic_int_type *acquisition_count, PyThread_type_lock lock);
+#define __pyx_get_slice_count_pointer(memview) (&memview->acquisition_count) /* NOTE: the argument is not parenthesized in the expansion, so pass a plain pointer expression */
+#define __PYX_INC_MEMVIEW(slice, have_gil) __Pyx_INC_MEMVIEW(slice, have_gil, __LINE__) /* forwards the call-site __LINE__ as the third argument */
+#define __PYX_XCLEAR_MEMVIEW(slice, have_gil) __Pyx_XCLEAR_MEMVIEW(slice, have_gil, __LINE__) /* same pattern: call-site __LINE__ is appended */
+static CYTHON_INLINE void __Pyx_INC_MEMVIEW(__Pyx_memviewslice *, int, int);
+static CYTHON_INLINE void __Pyx_XCLEAR_MEMVIEW(__Pyx_memviewslice *, int, int);
+
+/* CIntFromPy.proto */
+static CYTHON_INLINE long __Pyx_PyInt_As_long(PyObject *);
+
+/* CIntFromPy.proto */
+static CYTHON_INLINE int __Pyx_PyInt_As_int(PyObject *);
+
+/* CIntToPy.proto */
+static CYTHON_INLINE PyObject* __Pyx_PyInt_From_npy_int64(npy_int64 value);
+
+/* CIntFromPy.proto */
+static CYTHON_INLINE npy_int64 __Pyx_PyInt_As_npy_int64(PyObject *);
+
+/* CIntToPy.proto */
+static CYTHON_INLINE PyObject* __Pyx_PyInt_From_long(long value);
+
+/* None.proto */
+#include <new> /* C++ standard library header: this file has to be compiled as C++ */
+
+/* CIntToPy.proto */
+static CYTHON_INLINE PyObject* __Pyx_PyInt_From_int(int value);
+
+/* CIntFromPy.proto */
+static CYTHON_INLINE char __Pyx_PyInt_As_char(PyObject *);
+
+/* FormatTypeName.proto */
+#if CYTHON_COMPILING_IN_LIMITED_API /* Limited API: the type name is a PyObject*, formatted with "%U" and released with Py_XDECREF */
+typedef PyObject *__Pyx_TypeName;
+#define __Pyx_FMT_TYPENAME "%U"
+static __Pyx_TypeName __Pyx_PyType_GetName(PyTypeObject* tp);
+#define __Pyx_DECREF_TypeName(obj) Py_XDECREF(obj)
+#else /* otherwise: the type name is the tp_name C string; the DECREF macro expands to nothing */
+typedef const char *__Pyx_TypeName;
+#define __Pyx_FMT_TYPENAME "%.200s" /* precision .200 caps the printed name at 200 characters */
+#define __Pyx_PyType_GetName(tp) ((tp)->tp_name)
+#define __Pyx_DECREF_TypeName(obj)
+#endif /* CYTHON_COMPILING_IN_LIMITED_API */
+
+/* CheckBinaryVersion.proto */
+static unsigned long __Pyx_get_runtime_version(void);
+static int __Pyx_check_binary_version(unsigned long ct_version, unsigned long rt_version, int allow_newer); /* NOTE(review): ct_/rt_ presumably mean compile-time vs. runtime version — confirm at the definition */
+
+/* InitStrings.proto */
+static int __Pyx_InitStrings(__Pyx_StringTabEntry *t); /* prototype only; takes a pointer to __Pyx_StringTabEntry and returns int */
+
+/* #### Code section: module_declarations ### */
+static PyObject *__pyx_array_get_memview(struct __pyx_array_obj *__pyx_v_self); /* proto; the declarations through __pyx_memoryviewslice__get_base take an array/memoryview/memoryviewslice object struct as their first argument */
+static char *__pyx_memoryview_get_item_pointer(struct __pyx_memoryview_obj *__pyx_v_self, PyObject *__pyx_v_index); /* proto*/
+static PyObject *__pyx_memoryview_is_slice(struct __pyx_memoryview_obj *__pyx_v_self, PyObject *__pyx_v_obj); /* proto*/
+static PyObject *__pyx_memoryview_setitem_slice_assignment(struct __pyx_memoryview_obj *__pyx_v_self, PyObject *__pyx_v_dst, PyObject *__pyx_v_src); /* proto*/
+static PyObject *__pyx_memoryview_setitem_slice_assign_scalar(struct __pyx_memoryview_obj *__pyx_v_self, struct __pyx_memoryview_obj *__pyx_v_dst, PyObject *__pyx_v_value); /* proto*/
+static PyObject *__pyx_memoryview_setitem_indexed(struct __pyx_memoryview_obj *__pyx_v_self, PyObject *__pyx_v_index, PyObject *__pyx_v_value); /* proto*/
+static PyObject *__pyx_memoryview_convert_item_to_object(struct __pyx_memoryview_obj *__pyx_v_self, char *__pyx_v_itemp); /* proto*/
+static PyObject *__pyx_memoryview_assign_item_from_object(struct __pyx_memoryview_obj *__pyx_v_self, char *__pyx_v_itemp, PyObject *__pyx_v_value); /* proto*/
+static PyObject *__pyx_memoryview__get_base(struct __pyx_memoryview_obj *__pyx_v_self); /* proto*/
+static PyObject *__pyx_memoryviewslice_convert_item_to_object(struct __pyx_memoryviewslice_obj *__pyx_v_self, char *__pyx_v_itemp); /* proto*/
+static PyObject *__pyx_memoryviewslice_assign_item_from_object(struct __pyx_memoryviewslice_obj *__pyx_v_self, char *__pyx_v_itemp, PyObject *__pyx_v_value); /* proto*/
+static PyObject *__pyx_memoryviewslice__get_base(struct __pyx_memoryviewslice_obj *__pyx_v_self); /* proto*/
+static CYTHON_INLINE npy_intp __pyx_f_5numpy_5dtype_8itemsize_itemsize(PyArray_Descr *__pyx_v_self); /* proto; from here to the end of this section: inline numpy accessors taking PyArray_Descr, PyArrayMultiIterObject or PyArrayObject */
+static CYTHON_INLINE npy_intp __pyx_f_5numpy_5dtype_9alignment_alignment(PyArray_Descr *__pyx_v_self); /* proto*/
+static CYTHON_INLINE PyObject *__pyx_f_5numpy_5dtype_6fields_fields(PyArray_Descr *__pyx_v_self); /* proto*/
+static CYTHON_INLINE PyObject *__pyx_f_5numpy_5dtype_5names_names(PyArray_Descr *__pyx_v_self); /* proto*/
+static CYTHON_INLINE PyArray_ArrayDescr *__pyx_f_5numpy_5dtype_8subarray_subarray(PyArray_Descr *__pyx_v_self); /* proto*/
+static CYTHON_INLINE npy_uint64 __pyx_f_5numpy_5dtype_5flags_flags(PyArray_Descr *__pyx_v_self); /* proto*/
+static CYTHON_INLINE int __pyx_f_5numpy_9broadcast_7numiter_numiter(PyArrayMultiIterObject *__pyx_v_self); /* proto*/
+static CYTHON_INLINE npy_intp __pyx_f_5numpy_9broadcast_4size_size(PyArrayMultiIterObject *__pyx_v_self); /* proto*/
+static CYTHON_INLINE npy_intp __pyx_f_5numpy_9broadcast_5index_index(PyArrayMultiIterObject *__pyx_v_self); /* proto*/
+static CYTHON_INLINE int __pyx_f_5numpy_9broadcast_2nd_nd(PyArrayMultiIterObject *__pyx_v_self); /* proto*/
+static CYTHON_INLINE npy_intp *__pyx_f_5numpy_9broadcast_10dimensions_dimensions(PyArrayMultiIterObject *__pyx_v_self); /* proto*/
+static CYTHON_INLINE void **__pyx_f_5numpy_9broadcast_5iters_iters(PyArrayMultiIterObject *__pyx_v_self); /* proto*/
+static CYTHON_INLINE PyObject *__pyx_f_5numpy_7ndarray_4base_base(PyArrayObject *__pyx_v_self); /* proto*/
+static CYTHON_INLINE PyArray_Descr *__pyx_f_5numpy_7ndarray_5descr_descr(PyArrayObject *__pyx_v_self); /* proto*/
+static CYTHON_INLINE int __pyx_f_5numpy_7ndarray_4ndim_ndim(PyArrayObject *__pyx_v_self); /* proto*/
+static CYTHON_INLINE npy_intp *__pyx_f_5numpy_7ndarray_5shape_shape(PyArrayObject *__pyx_v_self); /* proto*/
+static CYTHON_INLINE npy_intp *__pyx_f_5numpy_7ndarray_7strides_strides(PyArrayObject *__pyx_v_self); /* proto*/
+static CYTHON_INLINE npy_intp __pyx_f_5numpy_7ndarray_4size_size(PyArrayObject *__pyx_v_self); /* proto*/
+static CYTHON_INLINE char *__pyx_f_5numpy_7ndarray_4data_data(PyArrayObject *__pyx_v_self); /* proto*/
+
+/* Module declarations from "cython.view" */
+
+/* Module declarations from "cython.dataclasses" */
+
+/* Module declarations from "cython" */
+
+/* Module declarations from "libc.string" */
+
+/* Module declarations from "libc.stdio" */
+
+/* Module declarations from "__builtin__" */
+
+/* Module declarations from "cpython.type" */
+
+/* Module declarations from "cpython" */
+
+/* Module declarations from "cpython.object" */
+
+/* Module declarations from "cpython.ref" */
+
+/* Module declarations from "numpy" */
+
+/* Module declarations from "numpy" */
+
+/* Module declarations from "fairseq.data.data_utils_fast" */
+static PyObject *__pyx_collections_abc_Sequence = 0; /* this and the five PyObject* globals below are initialized to 0 (null) here */
+static PyObject *generic = 0;
+static PyObject *strided = 0;
+static PyObject *indirect = 0;
+static PyObject *contiguous = 0;
+static PyObject *indirect_contiguous = 0;
+static int __pyx_memoryview_thread_locks_used;
+static PyThread_type_lock __pyx_memoryview_thread_locks[8]; /* fixed-size array of 8 lock handles */
+static PyObject *__pyx_f_7fairseq_4data_15data_utils_fast__is_batch_full(long, long, long, long); /*proto*/
+static PyObject *__pyx_f_7fairseq_4data_15data_utils_fast_batch_by_size_fast(PyArrayObject *, PyObject *, long, long, int, int __pyx_skip_dispatch); /*proto; the trailing int is the only named parameter: __pyx_skip_dispatch */
+static PyObject *__pyx_f_7fairseq_4data_15data_utils_fast__find_valid_shape(__Pyx_memviewslice, long, long); /*proto*/
+static PyObject *__pyx_f_7fairseq_4data_15data_utils_fast_batch_fixed_shapes_fast(PyArrayObject *, PyObject *, PyArrayObject *, int __pyx_skip_dispatch); /*proto; also ends with __pyx_skip_dispatch */
+static int __pyx_array_allocate_buffer(struct __pyx_array_obj *); /*proto; every declaration from here through __pyx_unpickle_Enum__set_state is a static (file-local) prototype */
+static struct __pyx_array_obj *__pyx_array_new(PyObject *, Py_ssize_t, char *, char *, char *); /*proto*/
+static PyObject *__pyx_memoryview_new(PyObject *, int, int, __Pyx_TypeInfo *); /*proto*/
+static CYTHON_INLINE int __pyx_memoryview_check(PyObject *); /*proto*/
+static PyObject *_unellipsify(PyObject *, int); /*proto*/
+static int assert_direct_dimensions(Py_ssize_t *, int); /*proto*/
+static struct __pyx_memoryview_obj *__pyx_memview_slice(struct __pyx_memoryview_obj *, PyObject *); /*proto*/
+static int __pyx_memoryview_slice_memviewslice(__Pyx_memviewslice *, Py_ssize_t, Py_ssize_t, Py_ssize_t, int, int, int *, Py_ssize_t, Py_ssize_t, Py_ssize_t, int, int, int, int); /*proto*/
+static char *__pyx_pybuffer_index(Py_buffer *, char *, Py_ssize_t, Py_ssize_t); /*proto*/
+static int __pyx_memslice_transpose(__Pyx_memviewslice *); /*proto*/
+static PyObject *__pyx_memoryview_fromslice(__Pyx_memviewslice, int, PyObject *(*)(char *), int (*)(char *, PyObject *), int); /*proto; third and fourth parameters are function pointers */
+static __Pyx_memviewslice *__pyx_memoryview_get_slice_from_memoryview(struct __pyx_memoryview_obj *, __Pyx_memviewslice *); /*proto*/
+static void __pyx_memoryview_slice_copy(struct __pyx_memoryview_obj *, __Pyx_memviewslice *); /*proto*/
+static PyObject *__pyx_memoryview_copy_object(struct __pyx_memoryview_obj *); /*proto*/
+static PyObject *__pyx_memoryview_copy_object_from_slice(struct __pyx_memoryview_obj *, __Pyx_memviewslice *); /*proto*/
+static Py_ssize_t abs_py_ssize_t(Py_ssize_t); /*proto*/
+static char __pyx_get_best_slice_order(__Pyx_memviewslice *, int); /*proto*/
+static void _copy_strided_to_strided(char *, Py_ssize_t *, char *, Py_ssize_t *, Py_ssize_t *, Py_ssize_t *, int, size_t); /*proto*/
+static void copy_strided_to_strided(__Pyx_memviewslice *, __Pyx_memviewslice *, int, size_t); /*proto*/
+static Py_ssize_t __pyx_memoryview_slice_get_size(__Pyx_memviewslice *, int); /*proto*/
+static Py_ssize_t __pyx_fill_contig_strides_array(Py_ssize_t *, Py_ssize_t *, Py_ssize_t, int, char); /*proto*/
+static void *__pyx_memoryview_copy_data_to_temp(__Pyx_memviewslice *, __Pyx_memviewslice *, char, int); /*proto*/
+static int __pyx_memoryview_err_extents(int, Py_ssize_t, Py_ssize_t); /*proto*/
+static int __pyx_memoryview_err_dim(PyObject *, PyObject *, int); /*proto*/
+static int __pyx_memoryview_err(PyObject *, PyObject *); /*proto*/
+static int __pyx_memoryview_err_no_memory(void); /*proto*/
+static int __pyx_memoryview_copy_contents(__Pyx_memviewslice, __Pyx_memviewslice, int, int, int); /*proto; both slices are passed by value */
+static void __pyx_memoryview_broadcast_leading(__Pyx_memviewslice *, int, int); /*proto*/
+static void __pyx_memoryview_refcount_copying(__Pyx_memviewslice *, int, int, int); /*proto*/
+static void __pyx_memoryview_refcount_objects_in_slice_with_gil(char *, Py_ssize_t *, Py_ssize_t *, int, int); /*proto*/
+static void __pyx_memoryview_refcount_objects_in_slice(char *, Py_ssize_t *, Py_ssize_t *, int, int); /*proto*/
+static void __pyx_memoryview_slice_assign_scalar(__Pyx_memviewslice *, int, size_t, void *, int); /*proto*/
+static void __pyx_memoryview__slice_assign_scalar(char *, Py_ssize_t *, Py_ssize_t *, int, size_t, void *); /*proto*/
+static PyObject *__pyx_unpickle_Enum__set_state(struct __pyx_MemviewEnum_obj *, PyObject *); /*proto*/
+/* #### Code section: typeinfo ### */
+static __Pyx_TypeInfo __Pyx_TypeInfo_nn___pyx_t_7fairseq_4data_15data_utils_fast_DTYPE_t = { "DTYPE_t", NULL, sizeof(__pyx_t_7fairseq_4data_15data_utils_fast_DTYPE_t), { 0 }, 0, __PYX_IS_UNSIGNED(__pyx_t_7fairseq_4data_15data_utils_fast_DTYPE_t) ? 'U' : 'I', __PYX_IS_UNSIGNED(__pyx_t_7fairseq_4data_15data_utils_fast_DTYPE_t), 0 }; /* type code is 'U' or 'I' depending on __PYX_IS_UNSIGNED(DTYPE_t); the size comes from sizeof(DTYPE_t) */
+/* #### Code section: before_global_var ### */
+#define __Pyx_MODULE_NAME "fairseq.data.data_utils_fast"
+extern int __pyx_module_is_main_fairseq__data__data_utils_fast; /* extern declaration immediately followed by the definition below */
+int __pyx_module_is_main_fairseq__data__data_utils_fast = 0; /* non-static (external linkage), initialized to 0 */
+
+/* Implementation of "fairseq.data.data_utils_fast" */
+/* #### Code section: global_var ### */
+static PyObject *__pyx_builtin_range; /* the twelve __pyx_builtin_* pointers have no initializer here; presumably assigned during module init — confirm */
+static PyObject *__pyx_builtin_AssertionError;
+static PyObject *__pyx_builtin_max;
+static PyObject *__pyx_builtin___import__;
+static PyObject *__pyx_builtin_ValueError;
+static PyObject *__pyx_builtin_MemoryError;
+static PyObject *__pyx_builtin_enumerate;
+static PyObject *__pyx_builtin_TypeError;
+static PyObject *__pyx_builtin_Ellipsis;
+static PyObject *__pyx_builtin_id;
+static PyObject *__pyx_builtin_IndexError;
+static PyObject *__pyx_builtin_ImportError;
+/* #### Code section: string_decls ### */
+static const char __pyx_k_[] = ": ";
+static const char __pyx_k_O[] = "O";
+static const char __pyx_k_c[] = "c";
+static const char __pyx_k__2[] = ".";
+static const char __pyx_k__3[] = "*";
+static const char __pyx_k__6[] = "'";
+static const char __pyx_k__7[] = ")";
+static const char __pyx_k_gc[] = "gc";
+static const char __pyx_k_id[] = "id";
+static const char __pyx_k_np[] = "np";
+static const char __pyx_k__26[] = "?";
+static const char __pyx_k_abc[] = "abc";
+static const char __pyx_k_and[] = " and ";
+static const char __pyx_k_got[] = " (got ";
+static const char __pyx_k_max[] = "max";
+static const char __pyx_k_new[] = "__new__";
+static const char __pyx_k_obj[] = "obj";
+static const char __pyx_k_sys[] = "sys";
+static const char __pyx_k_base[] = "base";
+static const char __pyx_k_dict[] = "__dict__";
+static const char __pyx_k_main[] = "__main__";
+static const char __pyx_k_mode[] = "mode";
+static const char __pyx_k_name[] = "name";
+static const char __pyx_k_ndim[] = "ndim";
+static const char __pyx_k_pack[] = "pack";
+static const char __pyx_k_size[] = "size";
+static const char __pyx_k_spec[] = "__spec__";
+static const char __pyx_k_step[] = "step";
+static const char __pyx_k_stop[] = "stop";
+static const char __pyx_k_test[] = "__test__";
+static const char __pyx_k_ASCII[] = "ASCII";
+static const char __pyx_k_DTYPE[] = "DTYPE";
+static const char __pyx_k_class[] = "__class__";
+static const char __pyx_k_count[] = "count";
+static const char __pyx_k_error[] = "error";
+static const char __pyx_k_flags[] = "flags";
+static const char __pyx_k_index[] = "index";
+static const char __pyx_k_int64[] = "int64";
+static const char __pyx_k_numpy[] = "numpy";
+static const char __pyx_k_range[] = "range";
+static const char __pyx_k_shape[] = "shape";
+static const char __pyx_k_start[] = "start";
+static const char __pyx_k_enable[] = "enable";
+static const char __pyx_k_encode[] = "encode";
+static const char __pyx_k_format[] = "format";
+static const char __pyx_k_import[] = "__import__";
+static const char __pyx_k_name_2[] = "__name__"; /* "_2" suffix keeps this distinct from __pyx_k_name ("name") above */
+static const char __pyx_k_pickle[] = "pickle";
+static const char __pyx_k_reduce[] = "__reduce__";
+static const char __pyx_k_struct[] = "struct";
+static const char __pyx_k_unpack[] = "unpack";
+static const char __pyx_k_update[] = "update";
+static const char __pyx_k_disable[] = "disable";
+static const char __pyx_k_fortran[] = "fortran";
+static const char __pyx_k_indices[] = "indices";
+static const char __pyx_k_memview[] = "memview";
+static const char __pyx_k_Ellipsis[] = "Ellipsis";
+static const char __pyx_k_Sequence[] = "Sequence";
+static const char __pyx_k_bsz_mult[] = "bsz_mult";
+static const char __pyx_k_getstate[] = "__getstate__";
+static const char __pyx_k_itemsize[] = "itemsize";
+static const char __pyx_k_pyx_type[] = "__pyx_type";
+static const char __pyx_k_register[] = "register";
+static const char __pyx_k_setstate[] = "__setstate__";
+static const char __pyx_k_TypeError[] = "TypeError";
+static const char __pyx_k_enumerate[] = "enumerate";
+static const char __pyx_k_isenabled[] = "isenabled";
+static const char __pyx_k_pyx_state[] = "__pyx_state";
+static const char __pyx_k_reduce_ex[] = "__reduce_ex__";
+static const char __pyx_k_IndexError[] = "IndexError";
+static const char __pyx_k_ValueError[] = "ValueError";
+static const char __pyx_k_max_tokens[] = "max_tokens";
+static const char __pyx_k_pyx_result[] = "__pyx_result";
+static const char __pyx_k_pyx_vtable[] = "__pyx_vtable__";
+static const char __pyx_k_ImportError[] = "ImportError";
+static const char __pyx_k_MemoryError[] = "MemoryError";
+static const char __pyx_k_PickleError[] = "PickleError";
+static const char __pyx_k_collections[] = "collections";
+static const char __pyx_k_initializing[] = "_initializing";
+static const char __pyx_k_is_coroutine[] = "_is_coroutine";
+static const char __pyx_k_pyx_checksum[] = "__pyx_checksum";
+static const char __pyx_k_stringsource[] = "<stringsource>";
+static const char __pyx_k_version_info[] = "version_info";
+static const char __pyx_k_class_getitem[] = "__class_getitem__";
+static const char __pyx_k_max_sentences[] = "max_sentences";
+static const char __pyx_k_num_tokens_fn[] = "num_tokens_fn";
+static const char __pyx_k_reduce_cython[] = "__reduce_cython__";
+static const char __pyx_k_AssertionError[] = "AssertionError";
+static const char __pyx_k_View_MemoryView[] = "View.MemoryView";
+static const char __pyx_k_allocate_buffer[] = "allocate_buffer";
+static const char __pyx_k_collections_abc[] = "collections.abc";
+static const char __pyx_k_dtype_is_object[] = "dtype_is_object";
+static const char __pyx_k_pyx_PickleError[] = "__pyx_PickleError";
+static const char __pyx_k_setstate_cython[] = "__setstate_cython__";
+static const char __pyx_k_pyx_unpickle_Enum[] = "__pyx_unpickle_Enum";
+static const char __pyx_k_asyncio_coroutines[] = "asyncio.coroutines";
+static const char __pyx_k_batch_by_size_fast[] = "batch_by_size_fast";
+static const char __pyx_k_cline_in_traceback[] = "cline_in_traceback";
+static const char __pyx_k_strided_and_direct[] = "<strided and direct>";
+static const char __pyx_k_fixed_shapes_sorted[] = "fixed_shapes_sorted";
+static const char __pyx_k_strided_and_indirect[] = "<strided and indirect>";
+static const char __pyx_k_Invalid_shape_in_axis[] = "Invalid shape in axis ";
+static const char __pyx_k_contiguous_and_direct[] = "<contiguous and direct>";
+static const char __pyx_k_Cannot_index_with_type[] = "Cannot index with type '";
+static const char __pyx_k_MemoryView_of_r_object[] = "<MemoryView of %r object>";
+static const char __pyx_k_MemoryView_of_r_at_0x_x[] = "<MemoryView of %r at 0x%x>";
+static const char __pyx_k_batch_fixed_shapes_fast[] = "batch_fixed_shapes_fast";
+static const char __pyx_k_contiguous_and_indirect[] = "<contiguous and indirect>";
+static const char __pyx_k_Dimension_d_is_not_direct[] = "Dimension %d is not direct";
+static const char __pyx_k_Index_out_of_bounds_axis_d[] = "Index out of bounds (axis %d)";
+static const char __pyx_k_Step_may_not_be_zero_axis_d[] = "Step may not be zero (axis %d)";
+static const char __pyx_k_itemsize_0_for_cython_array[] = "itemsize <= 0 for cython.array";
+static const char __pyx_k_fairseq_data_data_utils_fast[] = "fairseq.data.data_utils_fast";
+static const char __pyx_k_unable_to_allocate_array_data[] = "unable to allocate array data.";
+static const char __pyx_k_strided_and_direct_or_indirect[] = "<strided and direct or indirect>";
+static const char __pyx_k_All_dimensions_preceding_dimensi[] = "All dimensions preceding dimension %d must be indexed and not sliced"; /* from here on the array name is a truncated form of the literal text */
+static const char __pyx_k_Buffer_view_does_not_expose_stri[] = "Buffer view does not expose strides";
+static const char __pyx_k_Can_only_create_a_buffer_that_is[] = "Can only create a buffer that is contiguous in memory.";
+static const char __pyx_k_Cannot_assign_to_read_only_memor[] = "Cannot assign to read-only memoryview";
+static const char __pyx_k_Cannot_create_writable_memory_vi[] = "Cannot create writable memory view from read-only memoryview";
+static const char __pyx_k_Cannot_transpose_memoryview_with[] = "Cannot transpose memoryview with indirect dimensions";
+static const char __pyx_k_Empty_shape_tuple_for_cython_arr[] = "Empty shape tuple for cython.array";
+static const char __pyx_k_Incompatible_checksums_0x_x_vs_0[] = "Incompatible checksums (0x%x vs (0x82a3537, 0x6ae9995, 0xb068931) = (name))"; /* only "%x" is a conversion; the three hex values are literal text in the message */
+static const char __pyx_k_Indirect_dimensions_not_supporte[] = "Indirect dimensions not supported";
+static const char __pyx_k_Invalid_mode_expected_c_or_fortr[] = "Invalid mode, expected 'c' or 'fortran', got ";
+static const char __pyx_k_Out_of_bounds_on_buffer_access_a[] = "Out of bounds on buffer access (axis ";
+static const char __pyx_k_Unable_to_convert_item_to_object[] = "Unable to convert item to object";
+static const char __pyx_k_fairseq_data_data_utils_fast_pyx[] = "fairseq/data/data_utils_fast.pyx";
+static const char __pyx_k_got_differing_extents_in_dimensi[] = "got differing extents in dimension ";
+static const char __pyx_k_no_default___reduce___due_to_non[] = "no default __reduce__ due to non-trivial __cinit__";
+static const char __pyx_k_numpy__core_multiarray_failed_to[] = "numpy._core.multiarray failed to import";
+static const char __pyx_k_numpy__core_umath_failed_to_impo[] = "numpy._core.umath failed to import";
+static const char __pyx_k_sentence_at_index_of_size_exceed[] = "sentence at index {} of size {} exceeds max_tokens limit of {}!"; /* uses three "{}" placeholders rather than "%" conversions */
+static const char __pyx_k_unable_to_allocate_shape_and_str[] = "unable to allocate shape and strides.";
+/* #### Code section: decls ### */
+static int __pyx_array___pyx_pf_15View_dot_MemoryView_5array___cinit__(struct __pyx_array_obj *__pyx_v_self, PyObject *__pyx_v_shape, Py_ssize_t __pyx_v_itemsize, PyObject *__pyx_v_format, PyObject *__pyx_v_mode, int __pyx_v_allocate_buffer); /* proto */
+static int __pyx_array___pyx_pf_15View_dot_MemoryView_5array_2__getbuffer__(struct __pyx_array_obj *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags); /* proto */
+static void __pyx_array___pyx_pf_15View_dot_MemoryView_5array_4__dealloc__(struct __pyx_array_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_pf_15View_dot_MemoryView_5array_7memview___get__(struct __pyx_array_obj *__pyx_v_self); /* proto */
+static Py_ssize_t __pyx_array___pyx_pf_15View_dot_MemoryView_5array_6__len__(struct __pyx_array_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_array___pyx_pf_15View_dot_MemoryView_5array_8__getattr__(struct __pyx_array_obj *__pyx_v_self, PyObject *__pyx_v_attr); /* proto */
+static PyObject *__pyx_array___pyx_pf_15View_dot_MemoryView_5array_10__getitem__(struct __pyx_array_obj *__pyx_v_self, PyObject *__pyx_v_item); /* proto */
+static int __pyx_array___pyx_pf_15View_dot_MemoryView_5array_12__setitem__(struct __pyx_array_obj *__pyx_v_self, PyObject *__pyx_v_item, PyObject *__pyx_v_value); /* proto */
+static PyObject *__pyx_pf___pyx_array___reduce_cython__(CYTHON_UNUSED struct __pyx_array_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_pf___pyx_array_2__setstate_cython__(CYTHON_UNUSED struct __pyx_array_obj *__pyx_v_self, CYTHON_UNUSED PyObject *__pyx_v___pyx_state); /* proto */
+static int __pyx_MemviewEnum___pyx_pf_15View_dot_MemoryView_4Enum___init__(struct __pyx_MemviewEnum_obj *__pyx_v_self, PyObject *__pyx_v_name); /* proto */
+static PyObject *__pyx_MemviewEnum___pyx_pf_15View_dot_MemoryView_4Enum_2__repr__(struct __pyx_MemviewEnum_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_pf___pyx_MemviewEnum___reduce_cython__(struct __pyx_MemviewEnum_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_pf___pyx_MemviewEnum_2__setstate_cython__(struct __pyx_MemviewEnum_obj *__pyx_v_self, PyObject *__pyx_v___pyx_state); /* proto */
+static int __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview___cinit__(struct __pyx_memoryview_obj *__pyx_v_self, PyObject *__pyx_v_obj, int __pyx_v_flags, int __pyx_v_dtype_is_object); /* proto */
+static void __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_2__dealloc__(struct __pyx_memoryview_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_4__getitem__(struct __pyx_memoryview_obj *__pyx_v_self, PyObject *__pyx_v_index); /* proto */
+static int __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_6__setitem__(struct __pyx_memoryview_obj *__pyx_v_self, PyObject *__pyx_v_index, PyObject *__pyx_v_value); /* proto */
+static int __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_8__getbuffer__(struct __pyx_memoryview_obj *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags); /* proto */
+static PyObject *__pyx_pf_15View_dot_MemoryView_10memoryview_1T___get__(struct __pyx_memoryview_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_pf_15View_dot_MemoryView_10memoryview_4base___get__(struct __pyx_memoryview_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_pf_15View_dot_MemoryView_10memoryview_5shape___get__(struct __pyx_memoryview_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_pf_15View_dot_MemoryView_10memoryview_7strides___get__(struct __pyx_memoryview_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_pf_15View_dot_MemoryView_10memoryview_10suboffsets___get__(struct __pyx_memoryview_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_pf_15View_dot_MemoryView_10memoryview_4ndim___get__(struct __pyx_memoryview_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_pf_15View_dot_MemoryView_10memoryview_8itemsize___get__(struct __pyx_memoryview_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_pf_15View_dot_MemoryView_10memoryview_6nbytes___get__(struct __pyx_memoryview_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_pf_15View_dot_MemoryView_10memoryview_4size___get__(struct __pyx_memoryview_obj *__pyx_v_self); /* proto */
+static Py_ssize_t __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_10__len__(struct __pyx_memoryview_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_12__repr__(struct __pyx_memoryview_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_14__str__(struct __pyx_memoryview_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_16is_c_contig(struct __pyx_memoryview_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_18is_f_contig(struct __pyx_memoryview_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_20copy(struct __pyx_memoryview_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_22copy_fortran(struct __pyx_memoryview_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_pf___pyx_memoryview___reduce_cython__(CYTHON_UNUSED struct __pyx_memoryview_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_pf___pyx_memoryview_2__setstate_cython__(CYTHON_UNUSED struct __pyx_memoryview_obj *__pyx_v_self, CYTHON_UNUSED PyObject *__pyx_v___pyx_state); /* proto */
+static void __pyx_memoryviewslice___pyx_pf_15View_dot_MemoryView_16_memoryviewslice___dealloc__(struct __pyx_memoryviewslice_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_pf___pyx_memoryviewslice___reduce_cython__(CYTHON_UNUSED struct __pyx_memoryviewslice_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_pf___pyx_memoryviewslice_2__setstate_cython__(CYTHON_UNUSED struct __pyx_memoryviewslice_obj *__pyx_v_self, CYTHON_UNUSED PyObject *__pyx_v___pyx_state); /* proto */
+static PyObject *__pyx_pf_15View_dot_MemoryView___pyx_unpickle_Enum(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v___pyx_type, long __pyx_v___pyx_checksum, PyObject *__pyx_v___pyx_state); /* proto */
+static PyObject *__pyx_pf_7fairseq_4data_15data_utils_fast_batch_by_size_fast(CYTHON_UNUSED PyObject *__pyx_self, PyArrayObject *__pyx_v_indices, PyObject *__pyx_v_num_tokens_fn, long __pyx_v_max_tokens, long __pyx_v_max_sentences, int __pyx_v_bsz_mult); /* proto; arguments: indices (ndarray), num_tokens_fn (object), max_tokens and max_sentences (long), bsz_mult (int) */
+static PyObject *__pyx_pf_7fairseq_4data_15data_utils_fast_2batch_fixed_shapes_fast(CYTHON_UNUSED PyObject *__pyx_self, PyArrayObject *__pyx_v_indices, PyObject *__pyx_v_num_tokens_fn, PyArrayObject *__pyx_v_fixed_shapes_sorted); /* proto; arguments: indices (ndarray), num_tokens_fn (object), fixed_shapes_sorted (ndarray) */
+static PyObject *__pyx_tp_new_array(PyTypeObject *t, PyObject *a, PyObject *k); /*proto; this and the next three share the (PyTypeObject *, PyObject *, PyObject *) signature */
+static PyObject *__pyx_tp_new_Enum(PyTypeObject *t, PyObject *a, PyObject *k); /*proto*/
+static PyObject *__pyx_tp_new_memoryview(PyTypeObject *t, PyObject *a, PyObject *k); /*proto*/
+static PyObject *__pyx_tp_new__memoryviewslice(PyTypeObject *t, PyObject *a, PyObject *k); /*proto*/
+/* #### Code section: late_includes ### */
+/* #### Code section: module_state ### */
+typedef struct { /* Module state record: one pointer slot per object/type that the generated code reaches through __pyx_mstate_global. */
+  PyObject *__pyx_d;
+  PyObject *__pyx_b;
+  PyObject *__pyx_cython_runtime;
+  PyObject *__pyx_empty_tuple;
+  PyObject *__pyx_empty_bytes;
+  PyObject *__pyx_empty_unicode;
+  #ifdef __Pyx_CyFunction_USED /* each optional type slot below is compiled in only when its __Pyx_*_USED macro is defined */
+  PyTypeObject *__pyx_CyFunctionType;
+  #endif
+  #ifdef __Pyx_FusedFunction_USED
+  PyTypeObject *__pyx_FusedFunctionType;
+  #endif
+  #ifdef __Pyx_Generator_USED
+  PyTypeObject *__pyx_GeneratorType;
+  #endif
+  #ifdef __Pyx_IterableCoroutine_USED
+  PyTypeObject *__pyx_IterableCoroutineType;
+  #endif
+  #ifdef __Pyx_Coroutine_USED
+  PyTypeObject *__pyx_CoroutineAwaitType;
+  #endif
+  #ifdef __Pyx_Coroutine_USED
+  PyTypeObject *__pyx_CoroutineType;
+  #endif
+  #if CYTHON_USE_MODULE_STATE /* this and the following guard pairs are empty: they contribute no fields */
+  #endif
+  #if CYTHON_USE_MODULE_STATE
+  #endif
+  #if CYTHON_USE_MODULE_STATE
+  #endif
+  #if CYTHON_USE_MODULE_STATE
+  #endif
+  #if CYTHON_USE_MODULE_STATE
+  #endif
+  #if CYTHON_USE_MODULE_STATE
+  #endif
+  #if CYTHON_USE_MODULE_STATE
+  #endif
+  PyTypeObject *__pyx_ptype_7cpython_4type_type;
+  #if CYTHON_USE_MODULE_STATE
+  #endif
+  #if CYTHON_USE_MODULE_STATE
+  #endif
+  #if CYTHON_USE_MODULE_STATE
+  #endif
+  #if CYTHON_USE_MODULE_STATE
+  #endif
+  #if CYTHON_USE_MODULE_STATE
+  #endif
+  PyTypeObject *__pyx_ptype_5numpy_dtype;
+  PyTypeObject *__pyx_ptype_5numpy_flatiter;
+  PyTypeObject *__pyx_ptype_5numpy_broadcast;
+  PyTypeObject *__pyx_ptype_5numpy_ndarray;
+  PyTypeObject *__pyx_ptype_5numpy_generic;
+  PyTypeObject *__pyx_ptype_5numpy_number;
+  PyTypeObject *__pyx_ptype_5numpy_integer;
+  PyTypeObject *__pyx_ptype_5numpy_signedinteger;
+  PyTypeObject *__pyx_ptype_5numpy_unsignedinteger;
+  PyTypeObject *__pyx_ptype_5numpy_inexact;
+  PyTypeObject *__pyx_ptype_5numpy_floating;
+  PyTypeObject *__pyx_ptype_5numpy_complexfloating;
+  PyTypeObject *__pyx_ptype_5numpy_flexible;
+  PyTypeObject *__pyx_ptype_5numpy_character;
+  PyTypeObject *__pyx_ptype_5numpy_ufunc;
+  #if CYTHON_USE_MODULE_STATE /* the four __pyx_type___pyx_ slots exist only in module-state builds */
+  PyObject *__pyx_type___pyx_array;
+  PyObject *__pyx_type___pyx_MemviewEnum;
+  PyObject *__pyx_type___pyx_memoryview;
+  PyObject *__pyx_type___pyx_memoryviewslice;
+  #endif
+  PyTypeObject *__pyx_array_type;
+  PyTypeObject *__pyx_MemviewEnum_type;
+  PyTypeObject *__pyx_memoryview_type;
+  PyTypeObject *__pyx_memoryviewslice_type;
+  PyObject *__pyx_kp_u_; /* NOTE(review): the __pyx_kp_ and __pyx_n_ slots from here on look like string constants named after their text - confirm against the string table */
+  PyObject *__pyx_n_s_ASCII;
+  PyObject *__pyx_kp_s_All_dimensions_preceding_dimensi;
+  PyObject *__pyx_n_s_AssertionError;
+  PyObject *__pyx_kp_s_Buffer_view_does_not_expose_stri;
+  PyObject *__pyx_kp_s_Can_only_create_a_buffer_that_is;
+  PyObject *__pyx_kp_s_Cannot_assign_to_read_only_memor;
+  PyObject *__pyx_kp_s_Cannot_create_writable_memory_vi;
+  PyObject *__pyx_kp_u_Cannot_index_with_type;
+  PyObject *__pyx_kp_s_Cannot_transpose_memoryview_with;
+  PyObject *__pyx_n_s_DTYPE;
+  PyObject *__pyx_kp_s_Dimension_d_is_not_direct;
+  PyObject *__pyx_n_s_Ellipsis;
+  PyObject *__pyx_kp_s_Empty_shape_tuple_for_cython_arr;
+  PyObject *__pyx_n_s_ImportError;
+  PyObject *__pyx_kp_s_Incompatible_checksums_0x_x_vs_0;
+  PyObject *__pyx_n_s_IndexError;
+  PyObject *__pyx_kp_s_Index_out_of_bounds_axis_d;
+  PyObject *__pyx_kp_s_Indirect_dimensions_not_supporte;
+  PyObject *__pyx_kp_u_Invalid_mode_expected_c_or_fortr;
+  PyObject *__pyx_kp_u_Invalid_shape_in_axis;
+  PyObject *__pyx_n_s_MemoryError;
+  PyObject *__pyx_kp_s_MemoryView_of_r_at_0x_x;
+  PyObject *__pyx_kp_s_MemoryView_of_r_object;
+  PyObject *__pyx_n_b_O;
+  PyObject *__pyx_kp_u_Out_of_bounds_on_buffer_access_a;
+  PyObject *__pyx_n_s_PickleError;
+  PyObject *__pyx_n_s_Sequence;
+  PyObject *__pyx_kp_s_Step_may_not_be_zero_axis_d;
+  PyObject *__pyx_n_s_TypeError;
+  PyObject *__pyx_kp_s_Unable_to_convert_item_to_object;
+  PyObject *__pyx_n_s_ValueError;
+  PyObject *__pyx_n_s_View_MemoryView;
+  PyObject *__pyx_kp_u__2;
+  PyObject *__pyx_n_s__26;
+  PyObject *__pyx_n_s__3;
+  PyObject *__pyx_kp_u__6;
+  PyObject *__pyx_kp_u__7;
+  PyObject *__pyx_n_s_abc;
+  PyObject *__pyx_n_s_allocate_buffer;
+  PyObject *__pyx_kp_u_and;
+  PyObject *__pyx_n_s_asyncio_coroutines;
+  PyObject *__pyx_n_s_base;
+  PyObject *__pyx_n_s_batch_by_size_fast;
+  PyObject *__pyx_n_s_batch_fixed_shapes_fast;
+  PyObject *__pyx_n_s_bsz_mult;
+  PyObject *__pyx_n_s_c;
+  PyObject *__pyx_n_u_c;
+  PyObject *__pyx_n_s_class;
+  PyObject *__pyx_n_s_class_getitem;
+  PyObject *__pyx_n_s_cline_in_traceback;
+  PyObject *__pyx_n_s_collections;
+  PyObject *__pyx_kp_s_collections_abc;
+  PyObject *__pyx_kp_s_contiguous_and_direct;
+  PyObject *__pyx_kp_s_contiguous_and_indirect;
+  PyObject *__pyx_n_s_count;
+  PyObject *__pyx_n_s_dict;
+  PyObject *__pyx_kp_u_disable;
+  PyObject *__pyx_n_s_dtype_is_object;
+  PyObject *__pyx_kp_u_enable;
+  PyObject *__pyx_n_s_encode;
+  PyObject *__pyx_n_s_enumerate;
+  PyObject *__pyx_n_s_error;
+  PyObject *__pyx_n_s_fairseq_data_data_utils_fast;
+  PyObject *__pyx_kp_s_fairseq_data_data_utils_fast_pyx;
+  PyObject *__pyx_n_s_fixed_shapes_sorted;
+  PyObject *__pyx_n_s_flags;
+  PyObject *__pyx_n_s_format;
+  PyObject *__pyx_n_s_fortran;
+  PyObject *__pyx_n_u_fortran;
+  PyObject *__pyx_kp_u_gc;
+  PyObject *__pyx_n_s_getstate;
+  PyObject *__pyx_kp_u_got;
+  PyObject *__pyx_kp_u_got_differing_extents_in_dimensi;
+  PyObject *__pyx_n_s_id;
+  PyObject *__pyx_n_s_import;
+  PyObject *__pyx_n_s_index;
+  PyObject *__pyx_n_s_indices;
+  PyObject *__pyx_n_s_initializing;
+  PyObject *__pyx_n_s_int64;
+  PyObject *__pyx_n_s_is_coroutine;
+  PyObject *__pyx_kp_u_isenabled;
+  PyObject *__pyx_n_s_itemsize;
+  PyObject *__pyx_kp_s_itemsize_0_for_cython_array;
+  PyObject *__pyx_n_s_main;
+  PyObject *__pyx_n_s_max;
+  PyObject *__pyx_n_s_max_sentences;
+  PyObject *__pyx_n_s_max_tokens;
+  PyObject *__pyx_n_s_memview;
+  PyObject *__pyx_n_s_mode;
+  PyObject *__pyx_n_s_name;
+  PyObject *__pyx_n_s_name_2;
+  PyObject *__pyx_n_s_ndim;
+  PyObject *__pyx_n_s_new;
+  PyObject *__pyx_kp_s_no_default___reduce___due_to_non;
+  PyObject *__pyx_n_s_np;
+  PyObject *__pyx_n_s_num_tokens_fn;
+  PyObject *__pyx_n_s_numpy;
+  PyObject *__pyx_kp_u_numpy__core_multiarray_failed_to;
+  PyObject *__pyx_kp_u_numpy__core_umath_failed_to_impo;
+  PyObject *__pyx_n_s_obj;
+  PyObject *__pyx_n_s_pack;
+  PyObject *__pyx_n_s_pickle;
+  PyObject *__pyx_n_s_pyx_PickleError;
+  PyObject *__pyx_n_s_pyx_checksum;
+  PyObject *__pyx_n_s_pyx_result;
+  PyObject *__pyx_n_s_pyx_state;
+  PyObject *__pyx_n_s_pyx_type;
+  PyObject *__pyx_n_s_pyx_unpickle_Enum;
+  PyObject *__pyx_n_s_pyx_vtable;
+  PyObject *__pyx_n_s_range;
+  PyObject *__pyx_n_s_reduce;
+  PyObject *__pyx_n_s_reduce_cython;
+  PyObject *__pyx_n_s_reduce_ex;
+  PyObject *__pyx_n_s_register;
+  PyObject *__pyx_kp_u_sentence_at_index_of_size_exceed;
+  PyObject *__pyx_n_s_setstate;
+  PyObject *__pyx_n_s_setstate_cython;
+  PyObject *__pyx_n_s_shape;
+  PyObject *__pyx_n_s_size;
+  PyObject *__pyx_n_s_spec;
+  PyObject *__pyx_n_s_start;
+  PyObject *__pyx_n_s_step;
+  PyObject *__pyx_n_s_stop;
+  PyObject *__pyx_kp_s_strided_and_direct;
+  PyObject *__pyx_kp_s_strided_and_direct_or_indirect;
+  PyObject *__pyx_kp_s_strided_and_indirect;
+  PyObject *__pyx_kp_s_stringsource;
+  PyObject *__pyx_n_s_struct;
+  PyObject *__pyx_n_s_sys;
+  PyObject *__pyx_n_s_test;
+  PyObject *__pyx_kp_s_unable_to_allocate_array_data;
+  PyObject *__pyx_kp_s_unable_to_allocate_shape_and_str;
+  PyObject *__pyx_n_s_unpack;
+  PyObject *__pyx_n_s_update;
+  PyObject *__pyx_n_s_version_info;
+  PyObject *__pyx_int_0; /* NOTE(review): the __pyx_int_ slots are presumably integer constant objects whose name encodes the value - confirm */
+  PyObject *__pyx_int_1;
+  PyObject *__pyx_int_3;
+  PyObject *__pyx_int_112105877;
+  PyObject *__pyx_int_136983863;
+  PyObject *__pyx_int_184977713;
+  PyObject *__pyx_int_neg_1;
+  PyObject *__pyx_slice__5;
+  PyObject *__pyx_tuple__4;
+  PyObject *__pyx_tuple__8;
+  PyObject *__pyx_tuple__9;
+  PyObject *__pyx_tuple__10;
+  PyObject *__pyx_tuple__11;
+  PyObject *__pyx_tuple__12;
+  PyObject *__pyx_tuple__13;
+  PyObject *__pyx_tuple__14;
+  PyObject *__pyx_tuple__15;
+  PyObject *__pyx_tuple__16;
+  PyObject *__pyx_tuple__17;
+  PyObject *__pyx_tuple__18;
+  PyObject *__pyx_tuple__19;
+  PyObject *__pyx_tuple__20;
+  PyObject *__pyx_tuple__22;
+  PyObject *__pyx_tuple__24;
+  PyObject *__pyx_codeobj__21;
+  PyObject *__pyx_codeobj__23;
+  PyObject *__pyx_codeobj__25;
+} __pyx_mstate;
+
+#if CYTHON_USE_MODULE_STATE /* module-state build: the state is fetched from the module object found via __pyx_moduledef */
+#ifdef __cplusplus
+namespace {
+  extern struct PyModuleDef __pyx_moduledef;
+} /* anonymous namespace */
+#else
+static struct PyModuleDef __pyx_moduledef; /* declared ahead of use so the macros below can take its address */
+#endif
+
+#define __pyx_mstate(o) ((__pyx_mstate *)__Pyx_PyModule_GetState(o)) /* state pointer of module object o, cast to the state struct */
+
+#define __pyx_mstate_global (__pyx_mstate(PyState_FindModule(&__pyx_moduledef))) /* state of the module instance located with PyState_FindModule */
+
+#define __pyx_m (PyState_FindModule(&__pyx_moduledef)) /* the module object itself, looked up on every use */
+#else /* otherwise: a single static instance of the state struct */
+static __pyx_mstate __pyx_mstate_global_static =
+#ifdef __cplusplus
+    {}; /* C++ spelling of the empty initializer */
+#else
+    {0}; /* C spelling: zero-initializes every slot */
+#endif
+static __pyx_mstate *__pyx_mstate_global = &__pyx_mstate_global_static; /* pointer form so both build modes use the same -> access syntax */
+#endif
+/* #### Code section: module_state_clear ### */
+#if CYTHON_USE_MODULE_STATE
+static int __pyx_m_clear(PyObject *m) { /* Py_CLEAR every listed slot of module m's state; always returns 0. NOTE(review): signature matches a module m_clear hook - confirm it is registered in __pyx_moduledef. */
+  __pyx_mstate *clear_module_state = __pyx_mstate(m);
+  if (!clear_module_state) return 0; /* no state attached to m: nothing to release */
+  Py_CLEAR(clear_module_state->__pyx_d);
+  Py_CLEAR(clear_module_state->__pyx_b);
+  Py_CLEAR(clear_module_state->__pyx_cython_runtime);
+  Py_CLEAR(clear_module_state->__pyx_empty_tuple);
+  Py_CLEAR(clear_module_state->__pyx_empty_bytes);
+  Py_CLEAR(clear_module_state->__pyx_empty_unicode);
+  #ifdef __Pyx_CyFunction_USED
+  Py_CLEAR(clear_module_state->__pyx_CyFunctionType);
+  #endif
+  #ifdef __Pyx_FusedFunction_USED
+  Py_CLEAR(clear_module_state->__pyx_FusedFunctionType);
+  #endif
+  Py_CLEAR(clear_module_state->__pyx_ptype_7cpython_4type_type); /* NOTE(review): the Generator/IterableCoroutine/Coroutine type slots declared in __pyx_mstate are not cleared in this function */
+  Py_CLEAR(clear_module_state->__pyx_ptype_5numpy_dtype);
+  Py_CLEAR(clear_module_state->__pyx_ptype_5numpy_flatiter);
+  Py_CLEAR(clear_module_state->__pyx_ptype_5numpy_broadcast);
+  Py_CLEAR(clear_module_state->__pyx_ptype_5numpy_ndarray);
+  Py_CLEAR(clear_module_state->__pyx_ptype_5numpy_generic);
+  Py_CLEAR(clear_module_state->__pyx_ptype_5numpy_number);
+  Py_CLEAR(clear_module_state->__pyx_ptype_5numpy_integer);
+  Py_CLEAR(clear_module_state->__pyx_ptype_5numpy_signedinteger);
+  Py_CLEAR(clear_module_state->__pyx_ptype_5numpy_unsignedinteger);
+  Py_CLEAR(clear_module_state->__pyx_ptype_5numpy_inexact);
+  Py_CLEAR(clear_module_state->__pyx_ptype_5numpy_floating);
+  Py_CLEAR(clear_module_state->__pyx_ptype_5numpy_complexfloating);
+  Py_CLEAR(clear_module_state->__pyx_ptype_5numpy_flexible);
+  Py_CLEAR(clear_module_state->__pyx_ptype_5numpy_character);
+  Py_CLEAR(clear_module_state->__pyx_ptype_5numpy_ufunc);
+  Py_CLEAR(clear_module_state->__pyx_array_type);
+  Py_CLEAR(clear_module_state->__pyx_type___pyx_array); /* this field exists only under CYTHON_USE_MODULE_STATE, the same guard that wraps this function */
+  Py_CLEAR(clear_module_state->__pyx_MemviewEnum_type);
+  Py_CLEAR(clear_module_state->__pyx_type___pyx_MemviewEnum);
+  Py_CLEAR(clear_module_state->__pyx_memoryview_type);
+  Py_CLEAR(clear_module_state->__pyx_type___pyx_memoryview);
+  Py_CLEAR(clear_module_state->__pyx_memoryviewslice_type);
+  Py_CLEAR(clear_module_state->__pyx_type___pyx_memoryviewslice);
+  Py_CLEAR(clear_module_state->__pyx_kp_u_);
+  Py_CLEAR(clear_module_state->__pyx_n_s_ASCII);
+  Py_CLEAR(clear_module_state->__pyx_kp_s_All_dimensions_preceding_dimensi);
+  Py_CLEAR(clear_module_state->__pyx_n_s_AssertionError);
+  Py_CLEAR(clear_module_state->__pyx_kp_s_Buffer_view_does_not_expose_stri);
+  Py_CLEAR(clear_module_state->__pyx_kp_s_Can_only_create_a_buffer_that_is);
+  Py_CLEAR(clear_module_state->__pyx_kp_s_Cannot_assign_to_read_only_memor);
+  Py_CLEAR(clear_module_state->__pyx_kp_s_Cannot_create_writable_memory_vi);
+  Py_CLEAR(clear_module_state->__pyx_kp_u_Cannot_index_with_type);
+  Py_CLEAR(clear_module_state->__pyx_kp_s_Cannot_transpose_memoryview_with);
+  Py_CLEAR(clear_module_state->__pyx_n_s_DTYPE);
+  Py_CLEAR(clear_module_state->__pyx_kp_s_Dimension_d_is_not_direct);
+  Py_CLEAR(clear_module_state->__pyx_n_s_Ellipsis);
+  Py_CLEAR(clear_module_state->__pyx_kp_s_Empty_shape_tuple_for_cython_arr);
+  Py_CLEAR(clear_module_state->__pyx_n_s_ImportError);
+  Py_CLEAR(clear_module_state->__pyx_kp_s_Incompatible_checksums_0x_x_vs_0);
+  Py_CLEAR(clear_module_state->__pyx_n_s_IndexError);
+  Py_CLEAR(clear_module_state->__pyx_kp_s_Index_out_of_bounds_axis_d);
+  Py_CLEAR(clear_module_state->__pyx_kp_s_Indirect_dimensions_not_supporte);
+  Py_CLEAR(clear_module_state->__pyx_kp_u_Invalid_mode_expected_c_or_fortr);
+  Py_CLEAR(clear_module_state->__pyx_kp_u_Invalid_shape_in_axis);
+  Py_CLEAR(clear_module_state->__pyx_n_s_MemoryError);
+  Py_CLEAR(clear_module_state->__pyx_kp_s_MemoryView_of_r_at_0x_x);
+  Py_CLEAR(clear_module_state->__pyx_kp_s_MemoryView_of_r_object);
+  Py_CLEAR(clear_module_state->__pyx_n_b_O);
+  Py_CLEAR(clear_module_state->__pyx_kp_u_Out_of_bounds_on_buffer_access_a);
+  Py_CLEAR(clear_module_state->__pyx_n_s_PickleError);
+  Py_CLEAR(clear_module_state->__pyx_n_s_Sequence);
+  Py_CLEAR(clear_module_state->__pyx_kp_s_Step_may_not_be_zero_axis_d);
+  Py_CLEAR(clear_module_state->__pyx_n_s_TypeError);
+  Py_CLEAR(clear_module_state->__pyx_kp_s_Unable_to_convert_item_to_object);
+  Py_CLEAR(clear_module_state->__pyx_n_s_ValueError);
+  Py_CLEAR(clear_module_state->__pyx_n_s_View_MemoryView);
+  Py_CLEAR(clear_module_state->__pyx_kp_u__2);
+  Py_CLEAR(clear_module_state->__pyx_n_s__26);
+  Py_CLEAR(clear_module_state->__pyx_n_s__3);
+  Py_CLEAR(clear_module_state->__pyx_kp_u__6);
+  Py_CLEAR(clear_module_state->__pyx_kp_u__7);
+  Py_CLEAR(clear_module_state->__pyx_n_s_abc);
+  Py_CLEAR(clear_module_state->__pyx_n_s_allocate_buffer);
+  Py_CLEAR(clear_module_state->__pyx_kp_u_and);
+  Py_CLEAR(clear_module_state->__pyx_n_s_asyncio_coroutines);
+  Py_CLEAR(clear_module_state->__pyx_n_s_base);
+  Py_CLEAR(clear_module_state->__pyx_n_s_batch_by_size_fast);
+  Py_CLEAR(clear_module_state->__pyx_n_s_batch_fixed_shapes_fast);
+  Py_CLEAR(clear_module_state->__pyx_n_s_bsz_mult);
+  Py_CLEAR(clear_module_state->__pyx_n_s_c);
+  Py_CLEAR(clear_module_state->__pyx_n_u_c);
+  Py_CLEAR(clear_module_state->__pyx_n_s_class);
+  Py_CLEAR(clear_module_state->__pyx_n_s_class_getitem);
+  Py_CLEAR(clear_module_state->__pyx_n_s_cline_in_traceback);
+  Py_CLEAR(clear_module_state->__pyx_n_s_collections);
+  Py_CLEAR(clear_module_state->__pyx_kp_s_collections_abc);
+  Py_CLEAR(clear_module_state->__pyx_kp_s_contiguous_and_direct);
+  Py_CLEAR(clear_module_state->__pyx_kp_s_contiguous_and_indirect);
+  Py_CLEAR(clear_module_state->__pyx_n_s_count);
+  Py_CLEAR(clear_module_state->__pyx_n_s_dict);
+  Py_CLEAR(clear_module_state->__pyx_kp_u_disable);
+  Py_CLEAR(clear_module_state->__pyx_n_s_dtype_is_object);
+  Py_CLEAR(clear_module_state->__pyx_kp_u_enable);
+  Py_CLEAR(clear_module_state->__pyx_n_s_encode);
+  Py_CLEAR(clear_module_state->__pyx_n_s_enumerate);
+  Py_CLEAR(clear_module_state->__pyx_n_s_error);
+  Py_CLEAR(clear_module_state->__pyx_n_s_fairseq_data_data_utils_fast);
+  Py_CLEAR(clear_module_state->__pyx_kp_s_fairseq_data_data_utils_fast_pyx);
+  Py_CLEAR(clear_module_state->__pyx_n_s_fixed_shapes_sorted);
+  Py_CLEAR(clear_module_state->__pyx_n_s_flags);
+  Py_CLEAR(clear_module_state->__pyx_n_s_format);
+  Py_CLEAR(clear_module_state->__pyx_n_s_fortran);
+  Py_CLEAR(clear_module_state->__pyx_n_u_fortran);
+  Py_CLEAR(clear_module_state->__pyx_kp_u_gc);
+  Py_CLEAR(clear_module_state->__pyx_n_s_getstate);
+  Py_CLEAR(clear_module_state->__pyx_kp_u_got);
+  Py_CLEAR(clear_module_state->__pyx_kp_u_got_differing_extents_in_dimensi);
+  Py_CLEAR(clear_module_state->__pyx_n_s_id);
+  Py_CLEAR(clear_module_state->__pyx_n_s_import);
+  Py_CLEAR(clear_module_state->__pyx_n_s_index);
+  Py_CLEAR(clear_module_state->__pyx_n_s_indices);
+  Py_CLEAR(clear_module_state->__pyx_n_s_initializing);
+  Py_CLEAR(clear_module_state->__pyx_n_s_int64);
+  Py_CLEAR(clear_module_state->__pyx_n_s_is_coroutine);
+  Py_CLEAR(clear_module_state->__pyx_kp_u_isenabled);
+  Py_CLEAR(clear_module_state->__pyx_n_s_itemsize);
+  Py_CLEAR(clear_module_state->__pyx_kp_s_itemsize_0_for_cython_array);
+  Py_CLEAR(clear_module_state->__pyx_n_s_main);
+  Py_CLEAR(clear_module_state->__pyx_n_s_max);
+  Py_CLEAR(clear_module_state->__pyx_n_s_max_sentences);
+  Py_CLEAR(clear_module_state->__pyx_n_s_max_tokens);
+  Py_CLEAR(clear_module_state->__pyx_n_s_memview);
+  Py_CLEAR(clear_module_state->__pyx_n_s_mode);
+  Py_CLEAR(clear_module_state->__pyx_n_s_name);
+  Py_CLEAR(clear_module_state->__pyx_n_s_name_2);
+  Py_CLEAR(clear_module_state->__pyx_n_s_ndim);
+  Py_CLEAR(clear_module_state->__pyx_n_s_new);
+  Py_CLEAR(clear_module_state->__pyx_kp_s_no_default___reduce___due_to_non);
+  Py_CLEAR(clear_module_state->__pyx_n_s_np);
+  Py_CLEAR(clear_module_state->__pyx_n_s_num_tokens_fn);
+  Py_CLEAR(clear_module_state->__pyx_n_s_numpy);
+  Py_CLEAR(clear_module_state->__pyx_kp_u_numpy__core_multiarray_failed_to);
+  Py_CLEAR(clear_module_state->__pyx_kp_u_numpy__core_umath_failed_to_impo);
+  Py_CLEAR(clear_module_state->__pyx_n_s_obj);
+  Py_CLEAR(clear_module_state->__pyx_n_s_pack);
+  Py_CLEAR(clear_module_state->__pyx_n_s_pickle);
+  Py_CLEAR(clear_module_state->__pyx_n_s_pyx_PickleError);
+  Py_CLEAR(clear_module_state->__pyx_n_s_pyx_checksum);
+  Py_CLEAR(clear_module_state->__pyx_n_s_pyx_result);
+  Py_CLEAR(clear_module_state->__pyx_n_s_pyx_state);
+  Py_CLEAR(clear_module_state->__pyx_n_s_pyx_type);
+  Py_CLEAR(clear_module_state->__pyx_n_s_pyx_unpickle_Enum);
+  Py_CLEAR(clear_module_state->__pyx_n_s_pyx_vtable);
+  Py_CLEAR(clear_module_state->__pyx_n_s_range);
+  Py_CLEAR(clear_module_state->__pyx_n_s_reduce);
+  Py_CLEAR(clear_module_state->__pyx_n_s_reduce_cython);
+  Py_CLEAR(clear_module_state->__pyx_n_s_reduce_ex);
+  Py_CLEAR(clear_module_state->__pyx_n_s_register);
+  Py_CLEAR(clear_module_state->__pyx_kp_u_sentence_at_index_of_size_exceed);
+  Py_CLEAR(clear_module_state->__pyx_n_s_setstate);
+  Py_CLEAR(clear_module_state->__pyx_n_s_setstate_cython);
+  Py_CLEAR(clear_module_state->__pyx_n_s_shape);
+  Py_CLEAR(clear_module_state->__pyx_n_s_size);
+  Py_CLEAR(clear_module_state->__pyx_n_s_spec);
+  Py_CLEAR(clear_module_state->__pyx_n_s_start);
+  Py_CLEAR(clear_module_state->__pyx_n_s_step);
+  Py_CLEAR(clear_module_state->__pyx_n_s_stop);
+  Py_CLEAR(clear_module_state->__pyx_kp_s_strided_and_direct);
+  Py_CLEAR(clear_module_state->__pyx_kp_s_strided_and_direct_or_indirect);
+  Py_CLEAR(clear_module_state->__pyx_kp_s_strided_and_indirect);
+  Py_CLEAR(clear_module_state->__pyx_kp_s_stringsource);
+  Py_CLEAR(clear_module_state->__pyx_n_s_struct);
+  Py_CLEAR(clear_module_state->__pyx_n_s_sys);
+  Py_CLEAR(clear_module_state->__pyx_n_s_test);
+  Py_CLEAR(clear_module_state->__pyx_kp_s_unable_to_allocate_array_data);
+  Py_CLEAR(clear_module_state->__pyx_kp_s_unable_to_allocate_shape_and_str);
+  Py_CLEAR(clear_module_state->__pyx_n_s_unpack);
+  Py_CLEAR(clear_module_state->__pyx_n_s_update);
+  Py_CLEAR(clear_module_state->__pyx_n_s_version_info);
+  Py_CLEAR(clear_module_state->__pyx_int_0);
+  Py_CLEAR(clear_module_state->__pyx_int_1);
+  Py_CLEAR(clear_module_state->__pyx_int_3);
+  Py_CLEAR(clear_module_state->__pyx_int_112105877);
+  Py_CLEAR(clear_module_state->__pyx_int_136983863);
+  Py_CLEAR(clear_module_state->__pyx_int_184977713);
+  Py_CLEAR(clear_module_state->__pyx_int_neg_1);
+  Py_CLEAR(clear_module_state->__pyx_slice__5);
+  Py_CLEAR(clear_module_state->__pyx_tuple__4);
+  Py_CLEAR(clear_module_state->__pyx_tuple__8);
+  Py_CLEAR(clear_module_state->__pyx_tuple__9);
+  Py_CLEAR(clear_module_state->__pyx_tuple__10);
+  Py_CLEAR(clear_module_state->__pyx_tuple__11);
+  Py_CLEAR(clear_module_state->__pyx_tuple__12);
+  Py_CLEAR(clear_module_state->__pyx_tuple__13);
+  Py_CLEAR(clear_module_state->__pyx_tuple__14);
+  Py_CLEAR(clear_module_state->__pyx_tuple__15);
+  Py_CLEAR(clear_module_state->__pyx_tuple__16);
+  Py_CLEAR(clear_module_state->__pyx_tuple__17);
+  Py_CLEAR(clear_module_state->__pyx_tuple__18);
+  Py_CLEAR(clear_module_state->__pyx_tuple__19);
+  Py_CLEAR(clear_module_state->__pyx_tuple__20);
+  Py_CLEAR(clear_module_state->__pyx_tuple__22);
+  Py_CLEAR(clear_module_state->__pyx_tuple__24);
+  Py_CLEAR(clear_module_state->__pyx_codeobj__21);
+  Py_CLEAR(clear_module_state->__pyx_codeobj__23);
+  Py_CLEAR(clear_module_state->__pyx_codeobj__25);
+  return 0; /* unconditional success */
+}
+#endif
+/* #### Code section: module_state_traverse ### */
+#if CYTHON_USE_MODULE_STATE
+static int __pyx_m_traverse(PyObject *m, visitproc visit, void *arg) { /* Py_VISIT every listed slot of module m's state; returns 0 once all were visited. The parameters must be named visit and arg because the Py_VISIT macro refers to them by name and returns early on a nonzero visit result. NOTE(review): signature matches a module m_traverse hook - confirm it is registered in __pyx_moduledef. */
+  __pyx_mstate *traverse_module_state = __pyx_mstate(m);
+  if (!traverse_module_state) return 0; /* no state attached to m: nothing to report */
+  Py_VISIT(traverse_module_state->__pyx_d);
+  Py_VISIT(traverse_module_state->__pyx_b);
+  Py_VISIT(traverse_module_state->__pyx_cython_runtime);
+  Py_VISIT(traverse_module_state->__pyx_empty_tuple);
+  Py_VISIT(traverse_module_state->__pyx_empty_bytes);
+  Py_VISIT(traverse_module_state->__pyx_empty_unicode);
+  #ifdef __Pyx_CyFunction_USED
+  Py_VISIT(traverse_module_state->__pyx_CyFunctionType);
+  #endif
+  #ifdef __Pyx_FusedFunction_USED
+  Py_VISIT(traverse_module_state->__pyx_FusedFunctionType);
+  #endif
+  Py_VISIT(traverse_module_state->__pyx_ptype_7cpython_4type_type); /* NOTE(review): the Generator/IterableCoroutine/Coroutine type slots declared in __pyx_mstate are not visited in this function */
+  Py_VISIT(traverse_module_state->__pyx_ptype_5numpy_dtype);
+  Py_VISIT(traverse_module_state->__pyx_ptype_5numpy_flatiter);
+  Py_VISIT(traverse_module_state->__pyx_ptype_5numpy_broadcast);
+  Py_VISIT(traverse_module_state->__pyx_ptype_5numpy_ndarray);
+  Py_VISIT(traverse_module_state->__pyx_ptype_5numpy_generic);
+  Py_VISIT(traverse_module_state->__pyx_ptype_5numpy_number);
+  Py_VISIT(traverse_module_state->__pyx_ptype_5numpy_integer);
+  Py_VISIT(traverse_module_state->__pyx_ptype_5numpy_signedinteger);
+  Py_VISIT(traverse_module_state->__pyx_ptype_5numpy_unsignedinteger);
+  Py_VISIT(traverse_module_state->__pyx_ptype_5numpy_inexact);
+  Py_VISIT(traverse_module_state->__pyx_ptype_5numpy_floating);
+  Py_VISIT(traverse_module_state->__pyx_ptype_5numpy_complexfloating);
+  Py_VISIT(traverse_module_state->__pyx_ptype_5numpy_flexible);
+  Py_VISIT(traverse_module_state->__pyx_ptype_5numpy_character);
+  Py_VISIT(traverse_module_state->__pyx_ptype_5numpy_ufunc);
+  Py_VISIT(traverse_module_state->__pyx_array_type);
+  Py_VISIT(traverse_module_state->__pyx_type___pyx_array); /* this field exists only under CYTHON_USE_MODULE_STATE, the same guard that wraps this function */
+  Py_VISIT(traverse_module_state->__pyx_MemviewEnum_type);
+  Py_VISIT(traverse_module_state->__pyx_type___pyx_MemviewEnum);
+  Py_VISIT(traverse_module_state->__pyx_memoryview_type);
+  Py_VISIT(traverse_module_state->__pyx_type___pyx_memoryview);
+  Py_VISIT(traverse_module_state->__pyx_memoryviewslice_type);
+  Py_VISIT(traverse_module_state->__pyx_type___pyx_memoryviewslice);
+  Py_VISIT(traverse_module_state->__pyx_kp_u_);
+  Py_VISIT(traverse_module_state->__pyx_n_s_ASCII);
+  Py_VISIT(traverse_module_state->__pyx_kp_s_All_dimensions_preceding_dimensi);
+  Py_VISIT(traverse_module_state->__pyx_n_s_AssertionError);
+  Py_VISIT(traverse_module_state->__pyx_kp_s_Buffer_view_does_not_expose_stri);
+  Py_VISIT(traverse_module_state->__pyx_kp_s_Can_only_create_a_buffer_that_is);
+  Py_VISIT(traverse_module_state->__pyx_kp_s_Cannot_assign_to_read_only_memor);
+  Py_VISIT(traverse_module_state->__pyx_kp_s_Cannot_create_writable_memory_vi);
+  Py_VISIT(traverse_module_state->__pyx_kp_u_Cannot_index_with_type);
+  Py_VISIT(traverse_module_state->__pyx_kp_s_Cannot_transpose_memoryview_with);
+  Py_VISIT(traverse_module_state->__pyx_n_s_DTYPE);
+  Py_VISIT(traverse_module_state->__pyx_kp_s_Dimension_d_is_not_direct);
+  Py_VISIT(traverse_module_state->__pyx_n_s_Ellipsis);
+  Py_VISIT(traverse_module_state->__pyx_kp_s_Empty_shape_tuple_for_cython_arr);
+  Py_VISIT(traverse_module_state->__pyx_n_s_ImportError);
+  Py_VISIT(traverse_module_state->__pyx_kp_s_Incompatible_checksums_0x_x_vs_0);
+  Py_VISIT(traverse_module_state->__pyx_n_s_IndexError);
+  Py_VISIT(traverse_module_state->__pyx_kp_s_Index_out_of_bounds_axis_d);
+  Py_VISIT(traverse_module_state->__pyx_kp_s_Indirect_dimensions_not_supporte);
+  Py_VISIT(traverse_module_state->__pyx_kp_u_Invalid_mode_expected_c_or_fortr);
+  Py_VISIT(traverse_module_state->__pyx_kp_u_Invalid_shape_in_axis);
+  Py_VISIT(traverse_module_state->__pyx_n_s_MemoryError);
+  Py_VISIT(traverse_module_state->__pyx_kp_s_MemoryView_of_r_at_0x_x);
+  Py_VISIT(traverse_module_state->__pyx_kp_s_MemoryView_of_r_object);
+  Py_VISIT(traverse_module_state->__pyx_n_b_O);
+  Py_VISIT(traverse_module_state->__pyx_kp_u_Out_of_bounds_on_buffer_access_a);
+  Py_VISIT(traverse_module_state->__pyx_n_s_PickleError);
+  Py_VISIT(traverse_module_state->__pyx_n_s_Sequence);
+  Py_VISIT(traverse_module_state->__pyx_kp_s_Step_may_not_be_zero_axis_d);
+  Py_VISIT(traverse_module_state->__pyx_n_s_TypeError);
+  Py_VISIT(traverse_module_state->__pyx_kp_s_Unable_to_convert_item_to_object);
+  Py_VISIT(traverse_module_state->__pyx_n_s_ValueError);
+  Py_VISIT(traverse_module_state->__pyx_n_s_View_MemoryView);
+  Py_VISIT(traverse_module_state->__pyx_kp_u__2);
+  Py_VISIT(traverse_module_state->__pyx_n_s__26);
+  Py_VISIT(traverse_module_state->__pyx_n_s__3);
+  Py_VISIT(traverse_module_state->__pyx_kp_u__6);
+  Py_VISIT(traverse_module_state->__pyx_kp_u__7);
+  Py_VISIT(traverse_module_state->__pyx_n_s_abc);
+  Py_VISIT(traverse_module_state->__pyx_n_s_allocate_buffer);
+  Py_VISIT(traverse_module_state->__pyx_kp_u_and);
+  Py_VISIT(traverse_module_state->__pyx_n_s_asyncio_coroutines);
+  Py_VISIT(traverse_module_state->__pyx_n_s_base);
+  Py_VISIT(traverse_module_state->__pyx_n_s_batch_by_size_fast);
+  Py_VISIT(traverse_module_state->__pyx_n_s_batch_fixed_shapes_fast);
+  Py_VISIT(traverse_module_state->__pyx_n_s_bsz_mult);
+  Py_VISIT(traverse_module_state->__pyx_n_s_c);
+  Py_VISIT(traverse_module_state->__pyx_n_u_c);
+  Py_VISIT(traverse_module_state->__pyx_n_s_class);
+  Py_VISIT(traverse_module_state->__pyx_n_s_class_getitem);
+  Py_VISIT(traverse_module_state->__pyx_n_s_cline_in_traceback);
+  Py_VISIT(traverse_module_state->__pyx_n_s_collections);
+  Py_VISIT(traverse_module_state->__pyx_kp_s_collections_abc);
+  Py_VISIT(traverse_module_state->__pyx_kp_s_contiguous_and_direct);
+  Py_VISIT(traverse_module_state->__pyx_kp_s_contiguous_and_indirect);
+  Py_VISIT(traverse_module_state->__pyx_n_s_count);
+  Py_VISIT(traverse_module_state->__pyx_n_s_dict);
+  Py_VISIT(traverse_module_state->__pyx_kp_u_disable);
+  Py_VISIT(traverse_module_state->__pyx_n_s_dtype_is_object);
+  Py_VISIT(traverse_module_state->__pyx_kp_u_enable);
+  Py_VISIT(traverse_module_state->__pyx_n_s_encode);
+  Py_VISIT(traverse_module_state->__pyx_n_s_enumerate);
+  Py_VISIT(traverse_module_state->__pyx_n_s_error);
+  Py_VISIT(traverse_module_state->__pyx_n_s_fairseq_data_data_utils_fast);
+  Py_VISIT(traverse_module_state->__pyx_kp_s_fairseq_data_data_utils_fast_pyx);
+  Py_VISIT(traverse_module_state->__pyx_n_s_fixed_shapes_sorted);
+  Py_VISIT(traverse_module_state->__pyx_n_s_flags);
+  Py_VISIT(traverse_module_state->__pyx_n_s_format);
+  Py_VISIT(traverse_module_state->__pyx_n_s_fortran);
+  Py_VISIT(traverse_module_state->__pyx_n_u_fortran);
+  Py_VISIT(traverse_module_state->__pyx_kp_u_gc);
+  Py_VISIT(traverse_module_state->__pyx_n_s_getstate);
+  Py_VISIT(traverse_module_state->__pyx_kp_u_got);
+  Py_VISIT(traverse_module_state->__pyx_kp_u_got_differing_extents_in_dimensi);
+  Py_VISIT(traverse_module_state->__pyx_n_s_id);
+  Py_VISIT(traverse_module_state->__pyx_n_s_import);
+  Py_VISIT(traverse_module_state->__pyx_n_s_index);
+  Py_VISIT(traverse_module_state->__pyx_n_s_indices);
+  Py_VISIT(traverse_module_state->__pyx_n_s_initializing);
+  Py_VISIT(traverse_module_state->__pyx_n_s_int64);
+  Py_VISIT(traverse_module_state->__pyx_n_s_is_coroutine);
+  Py_VISIT(traverse_module_state->__pyx_kp_u_isenabled);
+  Py_VISIT(traverse_module_state->__pyx_n_s_itemsize);
+  Py_VISIT(traverse_module_state->__pyx_kp_s_itemsize_0_for_cython_array);
+  Py_VISIT(traverse_module_state->__pyx_n_s_main);
+  Py_VISIT(traverse_module_state->__pyx_n_s_max);
+  Py_VISIT(traverse_module_state->__pyx_n_s_max_sentences);
+  Py_VISIT(traverse_module_state->__pyx_n_s_max_tokens);
+  Py_VISIT(traverse_module_state->__pyx_n_s_memview);
+  Py_VISIT(traverse_module_state->__pyx_n_s_mode);
+  Py_VISIT(traverse_module_state->__pyx_n_s_name);
+  Py_VISIT(traverse_module_state->__pyx_n_s_name_2);
+  Py_VISIT(traverse_module_state->__pyx_n_s_ndim);
+  Py_VISIT(traverse_module_state->__pyx_n_s_new);
+  Py_VISIT(traverse_module_state->__pyx_kp_s_no_default___reduce___due_to_non);
+  Py_VISIT(traverse_module_state->__pyx_n_s_np);
+  Py_VISIT(traverse_module_state->__pyx_n_s_num_tokens_fn);
+  Py_VISIT(traverse_module_state->__pyx_n_s_numpy);
+  Py_VISIT(traverse_module_state->__pyx_kp_u_numpy__core_multiarray_failed_to);
+  Py_VISIT(traverse_module_state->__pyx_kp_u_numpy__core_umath_failed_to_impo);
+  Py_VISIT(traverse_module_state->__pyx_n_s_obj);
+  Py_VISIT(traverse_module_state->__pyx_n_s_pack);
+  Py_VISIT(traverse_module_state->__pyx_n_s_pickle);
+  Py_VISIT(traverse_module_state->__pyx_n_s_pyx_PickleError);
+  Py_VISIT(traverse_module_state->__pyx_n_s_pyx_checksum);
+  Py_VISIT(traverse_module_state->__pyx_n_s_pyx_result);
+  Py_VISIT(traverse_module_state->__pyx_n_s_pyx_state);
+  Py_VISIT(traverse_module_state->__pyx_n_s_pyx_type);
+  Py_VISIT(traverse_module_state->__pyx_n_s_pyx_unpickle_Enum);
+  Py_VISIT(traverse_module_state->__pyx_n_s_pyx_vtable);
+  Py_VISIT(traverse_module_state->__pyx_n_s_range);
+  Py_VISIT(traverse_module_state->__pyx_n_s_reduce);
+  Py_VISIT(traverse_module_state->__pyx_n_s_reduce_cython);
+  Py_VISIT(traverse_module_state->__pyx_n_s_reduce_ex);
+  Py_VISIT(traverse_module_state->__pyx_n_s_register);
+  Py_VISIT(traverse_module_state->__pyx_kp_u_sentence_at_index_of_size_exceed);
+  Py_VISIT(traverse_module_state->__pyx_n_s_setstate);
+  Py_VISIT(traverse_module_state->__pyx_n_s_setstate_cython);
+  Py_VISIT(traverse_module_state->__pyx_n_s_shape);
+  Py_VISIT(traverse_module_state->__pyx_n_s_size);
+  Py_VISIT(traverse_module_state->__pyx_n_s_spec);
+  Py_VISIT(traverse_module_state->__pyx_n_s_start);
+  Py_VISIT(traverse_module_state->__pyx_n_s_step);
+  Py_VISIT(traverse_module_state->__pyx_n_s_stop);
+  Py_VISIT(traverse_module_state->__pyx_kp_s_strided_and_direct);
+  Py_VISIT(traverse_module_state->__pyx_kp_s_strided_and_direct_or_indirect);
+  Py_VISIT(traverse_module_state->__pyx_kp_s_strided_and_indirect);
+  Py_VISIT(traverse_module_state->__pyx_kp_s_stringsource);
+  Py_VISIT(traverse_module_state->__pyx_n_s_struct);
+  Py_VISIT(traverse_module_state->__pyx_n_s_sys);
+  Py_VISIT(traverse_module_state->__pyx_n_s_test);
+  Py_VISIT(traverse_module_state->__pyx_kp_s_unable_to_allocate_array_data);
+  Py_VISIT(traverse_module_state->__pyx_kp_s_unable_to_allocate_shape_and_str);
+  Py_VISIT(traverse_module_state->__pyx_n_s_unpack);
+  Py_VISIT(traverse_module_state->__pyx_n_s_update);
+  Py_VISIT(traverse_module_state->__pyx_n_s_version_info);
+  Py_VISIT(traverse_module_state->__pyx_int_0);
+  Py_VISIT(traverse_module_state->__pyx_int_1);
+  Py_VISIT(traverse_module_state->__pyx_int_3);
+  Py_VISIT(traverse_module_state->__pyx_int_112105877);
+  Py_VISIT(traverse_module_state->__pyx_int_136983863);
+  Py_VISIT(traverse_module_state->__pyx_int_184977713);
+  Py_VISIT(traverse_module_state->__pyx_int_neg_1);
+  Py_VISIT(traverse_module_state->__pyx_slice__5);
+  Py_VISIT(traverse_module_state->__pyx_tuple__4);
+  Py_VISIT(traverse_module_state->__pyx_tuple__8);
+  Py_VISIT(traverse_module_state->__pyx_tuple__9);
+  Py_VISIT(traverse_module_state->__pyx_tuple__10);
+  Py_VISIT(traverse_module_state->__pyx_tuple__11);
+  Py_VISIT(traverse_module_state->__pyx_tuple__12);
+  Py_VISIT(traverse_module_state->__pyx_tuple__13);
+  Py_VISIT(traverse_module_state->__pyx_tuple__14);
+  Py_VISIT(traverse_module_state->__pyx_tuple__15);
+  Py_VISIT(traverse_module_state->__pyx_tuple__16);
+  Py_VISIT(traverse_module_state->__pyx_tuple__17);
+  Py_VISIT(traverse_module_state->__pyx_tuple__18);
+  Py_VISIT(traverse_module_state->__pyx_tuple__19);
+  Py_VISIT(traverse_module_state->__pyx_tuple__20);
+  Py_VISIT(traverse_module_state->__pyx_tuple__22);
+  Py_VISIT(traverse_module_state->__pyx_tuple__24);
+  Py_VISIT(traverse_module_state->__pyx_codeobj__21);
+  Py_VISIT(traverse_module_state->__pyx_codeobj__23);
+  Py_VISIT(traverse_module_state->__pyx_codeobj__25);
+  return 0; /* every visit call returned 0 */
+}
+#endif
+/* #### Code section: module_state_defines ### */ /* Every macro in this section maps a short __pyx_* name onto the member of the same name reached through the __pyx_mstate_global pointer, so the rest of the file can use the short names. This looks like Cython-generated output: regenerate from the .pyx instead of editing by hand. */
+#define __pyx_d __pyx_mstate_global->__pyx_d
+#define __pyx_b __pyx_mstate_global->__pyx_b
+#define __pyx_cython_runtime __pyx_mstate_global->__pyx_cython_runtime
+#define __pyx_empty_tuple __pyx_mstate_global->__pyx_empty_tuple
+#define __pyx_empty_bytes __pyx_mstate_global->__pyx_empty_bytes
+#define __pyx_empty_unicode __pyx_mstate_global->__pyx_empty_unicode
+#ifdef __Pyx_CyFunction_USED /* the optional type aliases below exist only when the matching *_USED feature macro is defined */
+#define __pyx_CyFunctionType __pyx_mstate_global->__pyx_CyFunctionType
+#endif
+#ifdef __Pyx_FusedFunction_USED
+#define __pyx_FusedFunctionType __pyx_mstate_global->__pyx_FusedFunctionType
+#endif
+#ifdef __Pyx_Generator_USED
+#define __pyx_GeneratorType __pyx_mstate_global->__pyx_GeneratorType
+#endif
+#ifdef __Pyx_IterableCoroutine_USED
+#define __pyx_IterableCoroutineType __pyx_mstate_global->__pyx_IterableCoroutineType
+#endif
+#ifdef __Pyx_Coroutine_USED
+#define __pyx_CoroutineAwaitType __pyx_mstate_global->__pyx_CoroutineAwaitType
+#endif
+#ifdef __Pyx_Coroutine_USED
+#define __pyx_CoroutineType __pyx_mstate_global->__pyx_CoroutineType
+#endif
+#if CYTHON_USE_MODULE_STATE
+#endif /* this and the following CYTHON_USE_MODULE_STATE guards are empty: they define nothing in either configuration */
+#if CYTHON_USE_MODULE_STATE
+#endif
+#if CYTHON_USE_MODULE_STATE
+#endif
+#if CYTHON_USE_MODULE_STATE
+#endif
+#if CYTHON_USE_MODULE_STATE
+#endif
+#if CYTHON_USE_MODULE_STATE
+#endif
+#if CYTHON_USE_MODULE_STATE
+#endif
+#define __pyx_ptype_7cpython_4type_type __pyx_mstate_global->__pyx_ptype_7cpython_4type_type
+#if CYTHON_USE_MODULE_STATE
+#endif
+#if CYTHON_USE_MODULE_STATE
+#endif
+#if CYTHON_USE_MODULE_STATE
+#endif
+#if CYTHON_USE_MODULE_STATE
+#endif
+#if CYTHON_USE_MODULE_STATE
+#endif
+#define __pyx_ptype_5numpy_dtype __pyx_mstate_global->__pyx_ptype_5numpy_dtype /* __pyx_ptype_5numpy_*: presumably pointers to the imported numpy type objects -- inferred from the names, confirm against the module-state struct */
+#define __pyx_ptype_5numpy_flatiter __pyx_mstate_global->__pyx_ptype_5numpy_flatiter
+#define __pyx_ptype_5numpy_broadcast __pyx_mstate_global->__pyx_ptype_5numpy_broadcast
+#define __pyx_ptype_5numpy_ndarray __pyx_mstate_global->__pyx_ptype_5numpy_ndarray
+#define __pyx_ptype_5numpy_generic __pyx_mstate_global->__pyx_ptype_5numpy_generic
+#define __pyx_ptype_5numpy_number __pyx_mstate_global->__pyx_ptype_5numpy_number
+#define __pyx_ptype_5numpy_integer __pyx_mstate_global->__pyx_ptype_5numpy_integer
+#define __pyx_ptype_5numpy_signedinteger __pyx_mstate_global->__pyx_ptype_5numpy_signedinteger
+#define __pyx_ptype_5numpy_unsignedinteger __pyx_mstate_global->__pyx_ptype_5numpy_unsignedinteger
+#define __pyx_ptype_5numpy_inexact __pyx_mstate_global->__pyx_ptype_5numpy_inexact
+#define __pyx_ptype_5numpy_floating __pyx_mstate_global->__pyx_ptype_5numpy_floating
+#define __pyx_ptype_5numpy_complexfloating __pyx_mstate_global->__pyx_ptype_5numpy_complexfloating
+#define __pyx_ptype_5numpy_flexible __pyx_mstate_global->__pyx_ptype_5numpy_flexible
+#define __pyx_ptype_5numpy_character __pyx_mstate_global->__pyx_ptype_5numpy_character
+#define __pyx_ptype_5numpy_ufunc __pyx_mstate_global->__pyx_ptype_5numpy_ufunc
+#if CYTHON_USE_MODULE_STATE /* the four __pyx_type___pyx_* aliases are defined only when CYTHON_USE_MODULE_STATE is non-zero */
+#define __pyx_type___pyx_array __pyx_mstate_global->__pyx_type___pyx_array
+#define __pyx_type___pyx_MemviewEnum __pyx_mstate_global->__pyx_type___pyx_MemviewEnum
+#define __pyx_type___pyx_memoryview __pyx_mstate_global->__pyx_type___pyx_memoryview
+#define __pyx_type___pyx_memoryviewslice __pyx_mstate_global->__pyx_type___pyx_memoryviewslice
+#endif
+#define __pyx_array_type __pyx_mstate_global->__pyx_array_type
+#define __pyx_MemviewEnum_type __pyx_mstate_global->__pyx_MemviewEnum_type
+#define __pyx_memoryview_type __pyx_mstate_global->__pyx_memoryview_type
+#define __pyx_memoryviewslice_type __pyx_mstate_global->__pyx_memoryviewslice_type
+#define __pyx_kp_u_ __pyx_mstate_global->__pyx_kp_u_ /* from here to __pyx_n_s_version_info: string constants; the n_s / n_u / n_b / kp_s / kp_u prefixes presumably encode identifier-vs-literal and str/unicode/bytes kind -- naming convention, confirm against the string table */
+#define __pyx_n_s_ASCII __pyx_mstate_global->__pyx_n_s_ASCII
+#define __pyx_kp_s_All_dimensions_preceding_dimensi __pyx_mstate_global->__pyx_kp_s_All_dimensions_preceding_dimensi
+#define __pyx_n_s_AssertionError __pyx_mstate_global->__pyx_n_s_AssertionError
+#define __pyx_kp_s_Buffer_view_does_not_expose_stri __pyx_mstate_global->__pyx_kp_s_Buffer_view_does_not_expose_stri
+#define __pyx_kp_s_Can_only_create_a_buffer_that_is __pyx_mstate_global->__pyx_kp_s_Can_only_create_a_buffer_that_is
+#define __pyx_kp_s_Cannot_assign_to_read_only_memor __pyx_mstate_global->__pyx_kp_s_Cannot_assign_to_read_only_memor
+#define __pyx_kp_s_Cannot_create_writable_memory_vi __pyx_mstate_global->__pyx_kp_s_Cannot_create_writable_memory_vi
+#define __pyx_kp_u_Cannot_index_with_type __pyx_mstate_global->__pyx_kp_u_Cannot_index_with_type
+#define __pyx_kp_s_Cannot_transpose_memoryview_with __pyx_mstate_global->__pyx_kp_s_Cannot_transpose_memoryview_with
+#define __pyx_n_s_DTYPE __pyx_mstate_global->__pyx_n_s_DTYPE
+#define __pyx_kp_s_Dimension_d_is_not_direct __pyx_mstate_global->__pyx_kp_s_Dimension_d_is_not_direct
+#define __pyx_n_s_Ellipsis __pyx_mstate_global->__pyx_n_s_Ellipsis
+#define __pyx_kp_s_Empty_shape_tuple_for_cython_arr __pyx_mstate_global->__pyx_kp_s_Empty_shape_tuple_for_cython_arr
+#define __pyx_n_s_ImportError __pyx_mstate_global->__pyx_n_s_ImportError
+#define __pyx_kp_s_Incompatible_checksums_0x_x_vs_0 __pyx_mstate_global->__pyx_kp_s_Incompatible_checksums_0x_x_vs_0
+#define __pyx_n_s_IndexError __pyx_mstate_global->__pyx_n_s_IndexError
+#define __pyx_kp_s_Index_out_of_bounds_axis_d __pyx_mstate_global->__pyx_kp_s_Index_out_of_bounds_axis_d
+#define __pyx_kp_s_Indirect_dimensions_not_supporte __pyx_mstate_global->__pyx_kp_s_Indirect_dimensions_not_supporte
+#define __pyx_kp_u_Invalid_mode_expected_c_or_fortr __pyx_mstate_global->__pyx_kp_u_Invalid_mode_expected_c_or_fortr
+#define __pyx_kp_u_Invalid_shape_in_axis __pyx_mstate_global->__pyx_kp_u_Invalid_shape_in_axis
+#define __pyx_n_s_MemoryError __pyx_mstate_global->__pyx_n_s_MemoryError
+#define __pyx_kp_s_MemoryView_of_r_at_0x_x __pyx_mstate_global->__pyx_kp_s_MemoryView_of_r_at_0x_x
+#define __pyx_kp_s_MemoryView_of_r_object __pyx_mstate_global->__pyx_kp_s_MemoryView_of_r_object
+#define __pyx_n_b_O __pyx_mstate_global->__pyx_n_b_O
+#define __pyx_kp_u_Out_of_bounds_on_buffer_access_a __pyx_mstate_global->__pyx_kp_u_Out_of_bounds_on_buffer_access_a
+#define __pyx_n_s_PickleError __pyx_mstate_global->__pyx_n_s_PickleError
+#define __pyx_n_s_Sequence __pyx_mstate_global->__pyx_n_s_Sequence
+#define __pyx_kp_s_Step_may_not_be_zero_axis_d __pyx_mstate_global->__pyx_kp_s_Step_may_not_be_zero_axis_d
+#define __pyx_n_s_TypeError __pyx_mstate_global->__pyx_n_s_TypeError
+#define __pyx_kp_s_Unable_to_convert_item_to_object __pyx_mstate_global->__pyx_kp_s_Unable_to_convert_item_to_object
+#define __pyx_n_s_ValueError __pyx_mstate_global->__pyx_n_s_ValueError
+#define __pyx_n_s_View_MemoryView __pyx_mstate_global->__pyx_n_s_View_MemoryView
+#define __pyx_kp_u__2 __pyx_mstate_global->__pyx_kp_u__2
+#define __pyx_n_s__26 __pyx_mstate_global->__pyx_n_s__26
+#define __pyx_n_s__3 __pyx_mstate_global->__pyx_n_s__3
+#define __pyx_kp_u__6 __pyx_mstate_global->__pyx_kp_u__6
+#define __pyx_kp_u__7 __pyx_mstate_global->__pyx_kp_u__7
+#define __pyx_n_s_abc __pyx_mstate_global->__pyx_n_s_abc
+#define __pyx_n_s_allocate_buffer __pyx_mstate_global->__pyx_n_s_allocate_buffer
+#define __pyx_kp_u_and __pyx_mstate_global->__pyx_kp_u_and
+#define __pyx_n_s_asyncio_coroutines __pyx_mstate_global->__pyx_n_s_asyncio_coroutines
+#define __pyx_n_s_base __pyx_mstate_global->__pyx_n_s_base
+#define __pyx_n_s_batch_by_size_fast __pyx_mstate_global->__pyx_n_s_batch_by_size_fast
+#define __pyx_n_s_batch_fixed_shapes_fast __pyx_mstate_global->__pyx_n_s_batch_fixed_shapes_fast
+#define __pyx_n_s_bsz_mult __pyx_mstate_global->__pyx_n_s_bsz_mult
+#define __pyx_n_s_c __pyx_mstate_global->__pyx_n_s_c
+#define __pyx_n_u_c __pyx_mstate_global->__pyx_n_u_c
+#define __pyx_n_s_class __pyx_mstate_global->__pyx_n_s_class
+#define __pyx_n_s_class_getitem __pyx_mstate_global->__pyx_n_s_class_getitem
+#define __pyx_n_s_cline_in_traceback __pyx_mstate_global->__pyx_n_s_cline_in_traceback
+#define __pyx_n_s_collections __pyx_mstate_global->__pyx_n_s_collections
+#define __pyx_kp_s_collections_abc __pyx_mstate_global->__pyx_kp_s_collections_abc
+#define __pyx_kp_s_contiguous_and_direct __pyx_mstate_global->__pyx_kp_s_contiguous_and_direct
+#define __pyx_kp_s_contiguous_and_indirect __pyx_mstate_global->__pyx_kp_s_contiguous_and_indirect
+#define __pyx_n_s_count __pyx_mstate_global->__pyx_n_s_count
+#define __pyx_n_s_dict __pyx_mstate_global->__pyx_n_s_dict
+#define __pyx_kp_u_disable __pyx_mstate_global->__pyx_kp_u_disable
+#define __pyx_n_s_dtype_is_object __pyx_mstate_global->__pyx_n_s_dtype_is_object
+#define __pyx_kp_u_enable __pyx_mstate_global->__pyx_kp_u_enable
+#define __pyx_n_s_encode __pyx_mstate_global->__pyx_n_s_encode
+#define __pyx_n_s_enumerate __pyx_mstate_global->__pyx_n_s_enumerate
+#define __pyx_n_s_error __pyx_mstate_global->__pyx_n_s_error
+#define __pyx_n_s_fairseq_data_data_utils_fast __pyx_mstate_global->__pyx_n_s_fairseq_data_data_utils_fast
+#define __pyx_kp_s_fairseq_data_data_utils_fast_pyx __pyx_mstate_global->__pyx_kp_s_fairseq_data_data_utils_fast_pyx
+#define __pyx_n_s_fixed_shapes_sorted __pyx_mstate_global->__pyx_n_s_fixed_shapes_sorted
+#define __pyx_n_s_flags __pyx_mstate_global->__pyx_n_s_flags
+#define __pyx_n_s_format __pyx_mstate_global->__pyx_n_s_format
+#define __pyx_n_s_fortran __pyx_mstate_global->__pyx_n_s_fortran
+#define __pyx_n_u_fortran __pyx_mstate_global->__pyx_n_u_fortran
+#define __pyx_kp_u_gc __pyx_mstate_global->__pyx_kp_u_gc
+#define __pyx_n_s_getstate __pyx_mstate_global->__pyx_n_s_getstate
+#define __pyx_kp_u_got __pyx_mstate_global->__pyx_kp_u_got
+#define __pyx_kp_u_got_differing_extents_in_dimensi __pyx_mstate_global->__pyx_kp_u_got_differing_extents_in_dimensi
+#define __pyx_n_s_id __pyx_mstate_global->__pyx_n_s_id
+#define __pyx_n_s_import __pyx_mstate_global->__pyx_n_s_import
+#define __pyx_n_s_index __pyx_mstate_global->__pyx_n_s_index
+#define __pyx_n_s_indices __pyx_mstate_global->__pyx_n_s_indices
+#define __pyx_n_s_initializing __pyx_mstate_global->__pyx_n_s_initializing
+#define __pyx_n_s_int64 __pyx_mstate_global->__pyx_n_s_int64
+#define __pyx_n_s_is_coroutine __pyx_mstate_global->__pyx_n_s_is_coroutine
+#define __pyx_kp_u_isenabled __pyx_mstate_global->__pyx_kp_u_isenabled
+#define __pyx_n_s_itemsize __pyx_mstate_global->__pyx_n_s_itemsize
+#define __pyx_kp_s_itemsize_0_for_cython_array __pyx_mstate_global->__pyx_kp_s_itemsize_0_for_cython_array
+#define __pyx_n_s_main __pyx_mstate_global->__pyx_n_s_main
+#define __pyx_n_s_max __pyx_mstate_global->__pyx_n_s_max
+#define __pyx_n_s_max_sentences __pyx_mstate_global->__pyx_n_s_max_sentences
+#define __pyx_n_s_max_tokens __pyx_mstate_global->__pyx_n_s_max_tokens
+#define __pyx_n_s_memview __pyx_mstate_global->__pyx_n_s_memview
+#define __pyx_n_s_mode __pyx_mstate_global->__pyx_n_s_mode
+#define __pyx_n_s_name __pyx_mstate_global->__pyx_n_s_name
+#define __pyx_n_s_name_2 __pyx_mstate_global->__pyx_n_s_name_2
+#define __pyx_n_s_ndim __pyx_mstate_global->__pyx_n_s_ndim
+#define __pyx_n_s_new __pyx_mstate_global->__pyx_n_s_new
+#define __pyx_kp_s_no_default___reduce___due_to_non __pyx_mstate_global->__pyx_kp_s_no_default___reduce___due_to_non
+#define __pyx_n_s_np __pyx_mstate_global->__pyx_n_s_np
+#define __pyx_n_s_num_tokens_fn __pyx_mstate_global->__pyx_n_s_num_tokens_fn
+#define __pyx_n_s_numpy __pyx_mstate_global->__pyx_n_s_numpy
+#define __pyx_kp_u_numpy__core_multiarray_failed_to __pyx_mstate_global->__pyx_kp_u_numpy__core_multiarray_failed_to
+#define __pyx_kp_u_numpy__core_umath_failed_to_impo __pyx_mstate_global->__pyx_kp_u_numpy__core_umath_failed_to_impo
+#define __pyx_n_s_obj __pyx_mstate_global->__pyx_n_s_obj
+#define __pyx_n_s_pack __pyx_mstate_global->__pyx_n_s_pack
+#define __pyx_n_s_pickle __pyx_mstate_global->__pyx_n_s_pickle
+#define __pyx_n_s_pyx_PickleError __pyx_mstate_global->__pyx_n_s_pyx_PickleError
+#define __pyx_n_s_pyx_checksum __pyx_mstate_global->__pyx_n_s_pyx_checksum
+#define __pyx_n_s_pyx_result __pyx_mstate_global->__pyx_n_s_pyx_result
+#define __pyx_n_s_pyx_state __pyx_mstate_global->__pyx_n_s_pyx_state
+#define __pyx_n_s_pyx_type __pyx_mstate_global->__pyx_n_s_pyx_type
+#define __pyx_n_s_pyx_unpickle_Enum __pyx_mstate_global->__pyx_n_s_pyx_unpickle_Enum
+#define __pyx_n_s_pyx_vtable __pyx_mstate_global->__pyx_n_s_pyx_vtable
+#define __pyx_n_s_range __pyx_mstate_global->__pyx_n_s_range
+#define __pyx_n_s_reduce __pyx_mstate_global->__pyx_n_s_reduce
+#define __pyx_n_s_reduce_cython __pyx_mstate_global->__pyx_n_s_reduce_cython
+#define __pyx_n_s_reduce_ex __pyx_mstate_global->__pyx_n_s_reduce_ex
+#define __pyx_n_s_register __pyx_mstate_global->__pyx_n_s_register
+#define __pyx_kp_u_sentence_at_index_of_size_exceed __pyx_mstate_global->__pyx_kp_u_sentence_at_index_of_size_exceed
+#define __pyx_n_s_setstate __pyx_mstate_global->__pyx_n_s_setstate
+#define __pyx_n_s_setstate_cython __pyx_mstate_global->__pyx_n_s_setstate_cython
+#define __pyx_n_s_shape __pyx_mstate_global->__pyx_n_s_shape
+#define __pyx_n_s_size __pyx_mstate_global->__pyx_n_s_size
+#define __pyx_n_s_spec __pyx_mstate_global->__pyx_n_s_spec
+#define __pyx_n_s_start __pyx_mstate_global->__pyx_n_s_start
+#define __pyx_n_s_step __pyx_mstate_global->__pyx_n_s_step
+#define __pyx_n_s_stop __pyx_mstate_global->__pyx_n_s_stop
+#define __pyx_kp_s_strided_and_direct __pyx_mstate_global->__pyx_kp_s_strided_and_direct
+#define __pyx_kp_s_strided_and_direct_or_indirect __pyx_mstate_global->__pyx_kp_s_strided_and_direct_or_indirect
+#define __pyx_kp_s_strided_and_indirect __pyx_mstate_global->__pyx_kp_s_strided_and_indirect
+#define __pyx_kp_s_stringsource __pyx_mstate_global->__pyx_kp_s_stringsource
+#define __pyx_n_s_struct __pyx_mstate_global->__pyx_n_s_struct
+#define __pyx_n_s_sys __pyx_mstate_global->__pyx_n_s_sys
+#define __pyx_n_s_test __pyx_mstate_global->__pyx_n_s_test
+#define __pyx_kp_s_unable_to_allocate_array_data __pyx_mstate_global->__pyx_kp_s_unable_to_allocate_array_data
+#define __pyx_kp_s_unable_to_allocate_shape_and_str __pyx_mstate_global->__pyx_kp_s_unable_to_allocate_shape_and_str
+#define __pyx_n_s_unpack __pyx_mstate_global->__pyx_n_s_unpack
+#define __pyx_n_s_update __pyx_mstate_global->__pyx_n_s_update
+#define __pyx_n_s_version_info __pyx_mstate_global->__pyx_n_s_version_info
+#define __pyx_int_0 __pyx_mstate_global->__pyx_int_0 /* __pyx_int_*: presumably cached integer constant objects (value encoded in the name) -- confirm where they are created */
+#define __pyx_int_1 __pyx_mstate_global->__pyx_int_1
+#define __pyx_int_3 __pyx_mstate_global->__pyx_int_3
+#define __pyx_int_112105877 __pyx_mstate_global->__pyx_int_112105877
+#define __pyx_int_136983863 __pyx_mstate_global->__pyx_int_136983863
+#define __pyx_int_184977713 __pyx_mstate_global->__pyx_int_184977713
+#define __pyx_int_neg_1 __pyx_mstate_global->__pyx_int_neg_1
+#define __pyx_slice__5 __pyx_mstate_global->__pyx_slice__5
+#define __pyx_tuple__4 __pyx_mstate_global->__pyx_tuple__4 /* the __pyx_tuple__* and __pyx_codeobj__* members below are the same ones visited by the module-state traverse function just above this section */
+#define __pyx_tuple__8 __pyx_mstate_global->__pyx_tuple__8
+#define __pyx_tuple__9 __pyx_mstate_global->__pyx_tuple__9
+#define __pyx_tuple__10 __pyx_mstate_global->__pyx_tuple__10
+#define __pyx_tuple__11 __pyx_mstate_global->__pyx_tuple__11
+#define __pyx_tuple__12 __pyx_mstate_global->__pyx_tuple__12
+#define __pyx_tuple__13 __pyx_mstate_global->__pyx_tuple__13
+#define __pyx_tuple__14 __pyx_mstate_global->__pyx_tuple__14
+#define __pyx_tuple__15 __pyx_mstate_global->__pyx_tuple__15
+#define __pyx_tuple__16 __pyx_mstate_global->__pyx_tuple__16
+#define __pyx_tuple__17 __pyx_mstate_global->__pyx_tuple__17
+#define __pyx_tuple__18 __pyx_mstate_global->__pyx_tuple__18
+#define __pyx_tuple__19 __pyx_mstate_global->__pyx_tuple__19
+#define __pyx_tuple__20 __pyx_mstate_global->__pyx_tuple__20
+#define __pyx_tuple__22 __pyx_mstate_global->__pyx_tuple__22
+#define __pyx_tuple__24 __pyx_mstate_global->__pyx_tuple__24
+#define __pyx_codeobj__21 __pyx_mstate_global->__pyx_codeobj__21
+#define __pyx_codeobj__23 __pyx_mstate_global->__pyx_codeobj__23
+#define __pyx_codeobj__25 __pyx_mstate_global->__pyx_codeobj__25
+/* #### Code section: module_code ### */
+
+/* "View.MemoryView":131
+ *         cdef bint dtype_is_object
+ * 
+ *     def __cinit__(array self, tuple shape, Py_ssize_t itemsize, format not None,             # <<<<<<<<<<<<<<
+ *                   mode="c", bint allocate_buffer=True):
+ * 
+ */
+
+/* Python wrapper for View.MemoryView.array.__cinit__(shape, itemsize, format, mode="c", allocate_buffer=True): collects the arguments from __pyx_args/__pyx_kwds, converts them to C values, validates shape/format and forwards to the implementation function. Returns the implementation's result, or -1 on any argument error. This looks like Cython-generated output: regenerate from the .pyx instead of editing by hand. */
+static int __pyx_array___cinit__(PyObject *__pyx_v_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/
+static int __pyx_array___cinit__(PyObject *__pyx_v_self, PyObject *__pyx_args, PyObject *__pyx_kwds) {
+  PyObject *__pyx_v_shape = 0;
+  Py_ssize_t __pyx_v_itemsize;
+  PyObject *__pyx_v_format = 0;
+  PyObject *__pyx_v_mode = 0;
+  int __pyx_v_allocate_buffer;
+  CYTHON_UNUSED Py_ssize_t __pyx_nargs;
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject* values[5] = {0,0,0,0,0}; /* one slot per parameter, in __pyx_pyargnames order: shape, itemsize, format, mode, allocate_buffer */
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  int __pyx_r;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__cinit__ (wrapper)", 0);
+  #if CYTHON_ASSUME_SAFE_MACROS
+  __pyx_nargs = PyTuple_GET_SIZE(__pyx_args);
+  #else
+  __pyx_nargs = PyTuple_Size(__pyx_args); if (unlikely(__pyx_nargs < 0)) return -1;
+  #endif
+  __pyx_kwvalues = __Pyx_KwValues_VARARGS(__pyx_args, __pyx_nargs);
+  {
+    PyObject **__pyx_pyargnames[] = {&__pyx_n_s_shape,&__pyx_n_s_itemsize,&__pyx_n_s_format,&__pyx_n_s_mode,&__pyx_n_s_allocate_buffer,0};
+    values[3] = __Pyx_Arg_NewRef_VARARGS(((PyObject *)__pyx_n_s_c)); /* pre-load the default for mode (the source default is "c"; __pyx_n_s_c is presumably that string constant) */
+    if (__pyx_kwds) { /* a keyword dict was passed: mixed positional + keyword path */
+      Py_ssize_t kw_args;
+      switch (__pyx_nargs) { /* copy the positional args into values[], falling through from the highest index down */
+        case  5: values[4] = __Pyx_Arg_VARARGS(__pyx_args, 4);
+        CYTHON_FALLTHROUGH;
+        case  4: values[3] = __Pyx_Arg_VARARGS(__pyx_args, 3);
+        CYTHON_FALLTHROUGH;
+        case  3: values[2] = __Pyx_Arg_VARARGS(__pyx_args, 2);
+        CYTHON_FALLTHROUGH;
+        case  2: values[1] = __Pyx_Arg_VARARGS(__pyx_args, 1);
+        CYTHON_FALLTHROUGH;
+        case  1: values[0] = __Pyx_Arg_VARARGS(__pyx_args, 0);
+        CYTHON_FALLTHROUGH;
+        case  0: break;
+        default: goto __pyx_L5_argtuple_error; /* more than 5 positional arguments */
+      }
+      kw_args = __Pyx_NumKwargs_VARARGS(__pyx_kwds);
+      switch (__pyx_nargs) { /* enter at the first parameter not given positionally and fetch the remaining ones by keyword */
+        case  0:
+        if (likely((values[0] = __Pyx_GetKwValue_VARARGS(__pyx_kwds, __pyx_kwvalues, __pyx_n_s_shape)) != 0)) {
+          (void)__Pyx_Arg_NewRef_VARARGS(values[0]);
+          kw_args--;
+        }
+        else if (unlikely(PyErr_Occurred())) __PYX_ERR(1, 131, __pyx_L3_error)
+        else goto __pyx_L5_argtuple_error; /* shape is required and was not supplied */
+        CYTHON_FALLTHROUGH;
+        case  1:
+        if (likely((values[1] = __Pyx_GetKwValue_VARARGS(__pyx_kwds, __pyx_kwvalues, __pyx_n_s_itemsize)) != 0)) {
+          (void)__Pyx_Arg_NewRef_VARARGS(values[1]);
+          kw_args--;
+        }
+        else if (unlikely(PyErr_Occurred())) __PYX_ERR(1, 131, __pyx_L3_error)
+        else {
+          __Pyx_RaiseArgtupleInvalid("__cinit__", 0, 3, 5, 1); __PYX_ERR(1, 131, __pyx_L3_error) /* itemsize is required and was not supplied */
+        }
+        CYTHON_FALLTHROUGH;
+        case  2:
+        if (likely((values[2] = __Pyx_GetKwValue_VARARGS(__pyx_kwds, __pyx_kwvalues, __pyx_n_s_format)) != 0)) {
+          (void)__Pyx_Arg_NewRef_VARARGS(values[2]);
+          kw_args--;
+        }
+        else if (unlikely(PyErr_Occurred())) __PYX_ERR(1, 131, __pyx_L3_error)
+        else {
+          __Pyx_RaiseArgtupleInvalid("__cinit__", 0, 3, 5, 2); __PYX_ERR(1, 131, __pyx_L3_error) /* format is required and was not supplied */
+        }
+        CYTHON_FALLTHROUGH;
+        case  3:
+        if (kw_args > 0) { /* mode is optional: only looked up while unconsumed keywords remain */
+          PyObject* value = __Pyx_GetKwValue_VARARGS(__pyx_kwds, __pyx_kwvalues, __pyx_n_s_mode);
+          if (value) { values[3] = __Pyx_Arg_NewRef_VARARGS(value); kw_args--; }
+          else if (unlikely(PyErr_Occurred())) __PYX_ERR(1, 131, __pyx_L3_error)
+        }
+        CYTHON_FALLTHROUGH;
+        case  4:
+        if (kw_args > 0) { /* allocate_buffer is optional: same treatment as mode */
+          PyObject* value = __Pyx_GetKwValue_VARARGS(__pyx_kwds, __pyx_kwvalues, __pyx_n_s_allocate_buffer);
+          if (value) { values[4] = __Pyx_Arg_NewRef_VARARGS(value); kw_args--; }
+          else if (unlikely(PyErr_Occurred())) __PYX_ERR(1, 131, __pyx_L3_error)
+        }
+      }
+      if (unlikely(kw_args > 0)) { /* keywords still unaccounted for are handed to the generic keyword parser; a negative result is an error */
+        const Py_ssize_t kwd_pos_args = __pyx_nargs;
+        if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_kwvalues, __pyx_pyargnames, 0, values + 0, kwd_pos_args, "__cinit__") < 0)) __PYX_ERR(1, 131, __pyx_L3_error)
+      }
+    } else { /* no keyword dict: purely positional call */
+      switch (__pyx_nargs) { /* only 3, 4 or 5 positional arguments are accepted on this path */
+        case  5: values[4] = __Pyx_Arg_VARARGS(__pyx_args, 4);
+        CYTHON_FALLTHROUGH;
+        case  4: values[3] = __Pyx_Arg_VARARGS(__pyx_args, 3);
+        CYTHON_FALLTHROUGH;
+        case  3: values[2] = __Pyx_Arg_VARARGS(__pyx_args, 2);
+        values[1] = __Pyx_Arg_VARARGS(__pyx_args, 1);
+        values[0] = __Pyx_Arg_VARARGS(__pyx_args, 0);
+        break;
+        default: goto __pyx_L5_argtuple_error;
+      }
+    }
+    __pyx_v_shape = ((PyObject*)values[0]); /* from here on: move the collected objects into the typed C locals */
+    __pyx_v_itemsize = __Pyx_PyIndex_AsSsize_t(values[1]); if (unlikely((__pyx_v_itemsize == (Py_ssize_t)-1) && PyErr_Occurred())) __PYX_ERR(1, 131, __pyx_L3_error)
+    __pyx_v_format = values[2];
+    __pyx_v_mode = values[3];
+    if (values[4]) { /* allocate_buffer was supplied: use its truth value */
+      __pyx_v_allocate_buffer = __Pyx_PyObject_IsTrue(values[4]); if (unlikely((__pyx_v_allocate_buffer == (int)-1) && PyErr_Occurred())) __PYX_ERR(1, 132, __pyx_L3_error)
+    } else {
+
+      /* "View.MemoryView":132
+ * 
+ *     def __cinit__(array self, tuple shape, Py_ssize_t itemsize, format not None,
+ *                   mode="c", bint allocate_buffer=True):             # <<<<<<<<<<<<<<
+ * 
+ *         cdef int idx
+ */
+      __pyx_v_allocate_buffer = ((int)1); /* not supplied: default is True */
+    }
+  }
+  goto __pyx_L6_skip;
+  __pyx_L5_argtuple_error:; /* wrong argument count: raise, then continue into the common error path below */
+  __Pyx_RaiseArgtupleInvalid("__cinit__", 0, 3, 5, __pyx_nargs); __PYX_ERR(1, 131, __pyx_L3_error)
+  __pyx_L6_skip:;
+  goto __pyx_L4_argument_unpacking_done;
+  __pyx_L3_error:; /* argument-unpacking failure: release every values[] slot, add a traceback entry, return -1 */
+  {
+    Py_ssize_t __pyx_temp;
+    for (__pyx_temp=0; __pyx_temp < (Py_ssize_t)(sizeof(values)/sizeof(values[0])); ++__pyx_temp) {
+      __Pyx_Arg_XDECREF_VARARGS(values[__pyx_temp]);
+    }
+  }
+  __Pyx_AddTraceback("View.MemoryView.array.__cinit__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __Pyx_RefNannyFinishContext();
+  return -1;
+  __pyx_L4_argument_unpacking_done:; /* arguments unpacked: validate them, then call the implementation */
+  if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_shape), (&PyTuple_Type), 1, "shape", 1))) __PYX_ERR(1, 131, __pyx_L1_error) /* type-check shape against PyTuple_Type */
+  if (unlikely(((PyObject *)__pyx_v_format) == Py_None)) { /* enforces the "format not None" declaration from the source signature */
+    PyErr_Format(PyExc_TypeError, "Argument '%.200s' must not be None", "format"); __PYX_ERR(1, 131, __pyx_L1_error)
+  }
+  __pyx_r = __pyx_array___pyx_pf_15View_dot_MemoryView_5array___cinit__(((struct __pyx_array_obj *)__pyx_v_self), __pyx_v_shape, __pyx_v_itemsize, __pyx_v_format, __pyx_v_mode, __pyx_v_allocate_buffer);
+
+  /* "View.MemoryView":131
+ *         cdef bint dtype_is_object
+ * 
+ *     def __cinit__(array self, tuple shape, Py_ssize_t itemsize, format not None,             # <<<<<<<<<<<<<<
+ *                   mode="c", bint allocate_buffer=True):
+ * 
+ */
+
+  /* function exit code */
+  goto __pyx_L0;
+  __pyx_L1_error:; /* validation failed after unpacking: report -1 through the common exit */
+  __pyx_r = -1;
+  __pyx_L0:; /* common exit: release every values[] slot and return __pyx_r */
+  {
+    Py_ssize_t __pyx_temp;
+    for (__pyx_temp=0; __pyx_temp < (Py_ssize_t)(sizeof(values)/sizeof(values[0])); ++__pyx_temp) {
+      __Pyx_Arg_XDECREF_VARARGS(values[__pyx_temp]);
+    }
+  }
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static int __pyx_array___pyx_pf_15View_dot_MemoryView_5array___cinit__(struct __pyx_array_obj *__pyx_v_self, PyObject *__pyx_v_shape, Py_ssize_t __pyx_v_itemsize, PyObject *__pyx_v_format, PyObject *__pyx_v_mode, int __pyx_v_allocate_buffer) {
+  int __pyx_v_idx;
+  Py_ssize_t __pyx_v_dim;
+  char __pyx_v_order;
+  int __pyx_r;
+  __Pyx_RefNannyDeclarations
+  Py_ssize_t __pyx_t_1;
+  int __pyx_t_2;
+  int __pyx_t_3;
+  PyObject *__pyx_t_4 = NULL;
+  PyObject *__pyx_t_5 = NULL;
+  PyObject *__pyx_t_6 = NULL;
+  unsigned int __pyx_t_7;
+  char *__pyx_t_8;
+  int __pyx_t_9;
+  Py_ssize_t __pyx_t_10;
+  Py_UCS4 __pyx_t_11;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("__cinit__", 0);
+  __Pyx_INCREF(__pyx_v_format);
+
+  /* "View.MemoryView":137
+ *         cdef Py_ssize_t dim
+ * 
+ *         self.ndim = <int> len(shape)             # <<<<<<<<<<<<<<
+ *         self.itemsize = itemsize
+ * 
+ */
+  if (unlikely(__pyx_v_shape == Py_None)) {
+    PyErr_SetString(PyExc_TypeError, "object of type 'NoneType' has no len()");
+    __PYX_ERR(1, 137, __pyx_L1_error)
+  }
+  __pyx_t_1 = __Pyx_PyTuple_GET_SIZE(__pyx_v_shape); if (unlikely(__pyx_t_1 == ((Py_ssize_t)-1))) __PYX_ERR(1, 137, __pyx_L1_error)
+  __pyx_v_self->ndim = ((int)__pyx_t_1);
+
+  /* "View.MemoryView":138
+ * 
+ *         self.ndim = <int> len(shape)
+ *         self.itemsize = itemsize             # <<<<<<<<<<<<<<
+ * 
+ *         if not self.ndim:
+ */
+  __pyx_v_self->itemsize = __pyx_v_itemsize;
+
+  /* "View.MemoryView":140
+ *         self.itemsize = itemsize
+ * 
+ *         if not self.ndim:             # <<<<<<<<<<<<<<
+ *             raise ValueError, "Empty shape tuple for cython.array"
+ * 
+ */
+  __pyx_t_2 = (!(__pyx_v_self->ndim != 0));
+  if (unlikely(__pyx_t_2)) {
+
+    /* "View.MemoryView":141
+ * 
+ *         if not self.ndim:
+ *             raise ValueError, "Empty shape tuple for cython.array"             # <<<<<<<<<<<<<<
+ * 
+ *         if itemsize <= 0:
+ */
+    __Pyx_Raise(__pyx_builtin_ValueError, __pyx_kp_s_Empty_shape_tuple_for_cython_arr, 0, 0);
+    __PYX_ERR(1, 141, __pyx_L1_error)
+
+    /* "View.MemoryView":140
+ *         self.itemsize = itemsize
+ * 
+ *         if not self.ndim:             # <<<<<<<<<<<<<<
+ *             raise ValueError, "Empty shape tuple for cython.array"
+ * 
+ */
+  }
+
+  /* "View.MemoryView":143
+ *             raise ValueError, "Empty shape tuple for cython.array"
+ * 
+ *         if itemsize <= 0:             # <<<<<<<<<<<<<<
+ *             raise ValueError, "itemsize <= 0 for cython.array"
+ * 
+ */
+  __pyx_t_2 = (__pyx_v_itemsize <= 0);
+  if (unlikely(__pyx_t_2)) {
+
+    /* "View.MemoryView":144
+ * 
+ *         if itemsize <= 0:
+ *             raise ValueError, "itemsize <= 0 for cython.array"             # <<<<<<<<<<<<<<
+ * 
+ *         if not isinstance(format, bytes):
+ */
+    __Pyx_Raise(__pyx_builtin_ValueError, __pyx_kp_s_itemsize_0_for_cython_array, 0, 0);
+    __PYX_ERR(1, 144, __pyx_L1_error)
+
+    /* "View.MemoryView":143
+ *             raise ValueError, "Empty shape tuple for cython.array"
+ * 
+ *         if itemsize <= 0:             # <<<<<<<<<<<<<<
+ *             raise ValueError, "itemsize <= 0 for cython.array"
+ * 
+ */
+  }
+
+  /* "View.MemoryView":146
+ *             raise ValueError, "itemsize <= 0 for cython.array"
+ * 
+ *         if not isinstance(format, bytes):             # <<<<<<<<<<<<<<
+ *             format = format.encode('ASCII')
+ *         self._format = format  # keep a reference to the byte string
+ */
+  __pyx_t_2 = PyBytes_Check(__pyx_v_format); 
+  __pyx_t_3 = (!__pyx_t_2);
+  if (__pyx_t_3) {
+
+    /* "View.MemoryView":147
+ * 
+ *         if not isinstance(format, bytes):
+ *             format = format.encode('ASCII')             # <<<<<<<<<<<<<<
+ *         self._format = format  # keep a reference to the byte string
+ *         self.format = self._format
+ */
+    __pyx_t_5 = __Pyx_PyObject_GetAttrStr(__pyx_v_format, __pyx_n_s_encode); if (unlikely(!__pyx_t_5)) __PYX_ERR(1, 147, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_5);
+    __pyx_t_6 = NULL;
+    __pyx_t_7 = 0;
+    #if CYTHON_UNPACK_METHODS
+    if (likely(PyMethod_Check(__pyx_t_5))) {
+      __pyx_t_6 = PyMethod_GET_SELF(__pyx_t_5);
+      if (likely(__pyx_t_6)) {
+        PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_5);
+        __Pyx_INCREF(__pyx_t_6);
+        __Pyx_INCREF(function);
+        __Pyx_DECREF_SET(__pyx_t_5, function);
+        __pyx_t_7 = 1;
+      }
+    }
+    #endif
+    {
+      PyObject *__pyx_callargs[2] = {__pyx_t_6, __pyx_n_s_ASCII};
+      __pyx_t_4 = __Pyx_PyObject_FastCall(__pyx_t_5, __pyx_callargs+1-__pyx_t_7, 1+__pyx_t_7);
+      __Pyx_XDECREF(__pyx_t_6); __pyx_t_6 = 0;
+      if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 147, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_4);
+      __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
+    }
+    __Pyx_DECREF_SET(__pyx_v_format, __pyx_t_4);
+    __pyx_t_4 = 0;
+
+    /* "View.MemoryView":146
+ *             raise ValueError, "itemsize <= 0 for cython.array"
+ * 
+ *         if not isinstance(format, bytes):             # <<<<<<<<<<<<<<
+ *             format = format.encode('ASCII')
+ *         self._format = format  # keep a reference to the byte string
+ */
+  }
+
+  /* "View.MemoryView":148
+ *         if not isinstance(format, bytes):
+ *             format = format.encode('ASCII')
+ *         self._format = format  # keep a reference to the byte string             # <<<<<<<<<<<<<<
+ *         self.format = self._format
+ * 
+ */
+  if (!(likely(PyBytes_CheckExact(__pyx_v_format))||((__pyx_v_format) == Py_None) || __Pyx_RaiseUnexpectedTypeError("bytes", __pyx_v_format))) __PYX_ERR(1, 148, __pyx_L1_error)
+  __pyx_t_4 = __pyx_v_format;
+  __Pyx_INCREF(__pyx_t_4);
+  __Pyx_GIVEREF(__pyx_t_4);
+  __Pyx_GOTREF(__pyx_v_self->_format);
+  __Pyx_DECREF(__pyx_v_self->_format);
+  __pyx_v_self->_format = ((PyObject*)__pyx_t_4);
+  __pyx_t_4 = 0;
+
+  /* "View.MemoryView":149
+ *             format = format.encode('ASCII')
+ *         self._format = format  # keep a reference to the byte string
+ *         self.format = self._format             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  if (unlikely(__pyx_v_self->_format == Py_None)) {
+    PyErr_SetString(PyExc_TypeError, "expected bytes, NoneType found");
+    __PYX_ERR(1, 149, __pyx_L1_error)
+  }
+  __pyx_t_8 = __Pyx_PyBytes_AsWritableString(__pyx_v_self->_format); if (unlikely((!__pyx_t_8) && PyErr_Occurred())) __PYX_ERR(1, 149, __pyx_L1_error)
+  __pyx_v_self->format = __pyx_t_8;
+
+  /* "View.MemoryView":152
+ * 
+ * 
+ *         self._shape = <Py_ssize_t *> PyObject_Malloc(sizeof(Py_ssize_t)*self.ndim*2)             # <<<<<<<<<<<<<<
+ *         self._strides = self._shape + self.ndim
+ * 
+ */
+  __pyx_v_self->_shape = ((Py_ssize_t *)PyObject_Malloc((((sizeof(Py_ssize_t)) * __pyx_v_self->ndim) * 2)));
+
+  /* "View.MemoryView":153
+ * 
+ *         self._shape = <Py_ssize_t *> PyObject_Malloc(sizeof(Py_ssize_t)*self.ndim*2)
+ *         self._strides = self._shape + self.ndim             # <<<<<<<<<<<<<<
+ * 
+ *         if not self._shape:
+ */
+  __pyx_v_self->_strides = (__pyx_v_self->_shape + __pyx_v_self->ndim);
+
+  /* "View.MemoryView":155
+ *         self._strides = self._shape + self.ndim
+ * 
+ *         if not self._shape:             # <<<<<<<<<<<<<<
+ *             raise MemoryError, "unable to allocate shape and strides."
+ * 
+ */
+  __pyx_t_3 = (!(__pyx_v_self->_shape != 0));
+  if (unlikely(__pyx_t_3)) {
+
+    /* "View.MemoryView":156
+ * 
+ *         if not self._shape:
+ *             raise MemoryError, "unable to allocate shape and strides."             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+    __Pyx_Raise(__pyx_builtin_MemoryError, __pyx_kp_s_unable_to_allocate_shape_and_str, 0, 0);
+    __PYX_ERR(1, 156, __pyx_L1_error)
+
+    /* "View.MemoryView":155
+ *         self._strides = self._shape + self.ndim
+ * 
+ *         if not self._shape:             # <<<<<<<<<<<<<<
+ *             raise MemoryError, "unable to allocate shape and strides."
+ * 
+ */
+  }
+
+  /* "View.MemoryView":159
+ * 
+ * 
+ *         for idx, dim in enumerate(shape):             # <<<<<<<<<<<<<<
+ *             if dim <= 0:
+ *                 raise ValueError, f"Invalid shape in axis {idx}: {dim}."
+ */
+  __pyx_t_9 = 0;
+  __pyx_t_4 = __pyx_v_shape; __Pyx_INCREF(__pyx_t_4);
+  __pyx_t_1 = 0;
+  for (;;) {
+    {
+      Py_ssize_t __pyx_temp = __Pyx_PyTuple_GET_SIZE(__pyx_t_4);
+      #if !CYTHON_ASSUME_SAFE_MACROS
+      if (unlikely((__pyx_temp < 0))) __PYX_ERR(1, 159, __pyx_L1_error)
+      #endif
+      if (__pyx_t_1 >= __pyx_temp) break;
+    }
+    #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS
+    __pyx_t_5 = PyTuple_GET_ITEM(__pyx_t_4, __pyx_t_1); __Pyx_INCREF(__pyx_t_5); __pyx_t_1++; if (unlikely((0 < 0))) __PYX_ERR(1, 159, __pyx_L1_error)
+    #else
+    __pyx_t_5 = __Pyx_PySequence_ITEM(__pyx_t_4, __pyx_t_1); __pyx_t_1++; if (unlikely(!__pyx_t_5)) __PYX_ERR(1, 159, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_5);
+    #endif
+    __pyx_t_10 = __Pyx_PyIndex_AsSsize_t(__pyx_t_5); if (unlikely((__pyx_t_10 == (Py_ssize_t)-1) && PyErr_Occurred())) __PYX_ERR(1, 159, __pyx_L1_error)
+    __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
+    __pyx_v_dim = __pyx_t_10;
+    __pyx_v_idx = __pyx_t_9;
+    __pyx_t_9 = (__pyx_t_9 + 1);
+
+    /* "View.MemoryView":160
+ * 
+ *         for idx, dim in enumerate(shape):
+ *             if dim <= 0:             # <<<<<<<<<<<<<<
+ *                 raise ValueError, f"Invalid shape in axis {idx}: {dim}."
+ *             self._shape[idx] = dim
+ */
+    __pyx_t_3 = (__pyx_v_dim <= 0);
+    if (unlikely(__pyx_t_3)) {
+
+      /* "View.MemoryView":161
+ *         for idx, dim in enumerate(shape):
+ *             if dim <= 0:
+ *                 raise ValueError, f"Invalid shape in axis {idx}: {dim}."             # <<<<<<<<<<<<<<
+ *             self._shape[idx] = dim
+ * 
+ */
+      __pyx_t_5 = PyTuple_New(5); if (unlikely(!__pyx_t_5)) __PYX_ERR(1, 161, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_5);
+      __pyx_t_10 = 0;
+      __pyx_t_11 = 127;
+      __Pyx_INCREF(__pyx_kp_u_Invalid_shape_in_axis);
+      __pyx_t_10 += 22;
+      __Pyx_GIVEREF(__pyx_kp_u_Invalid_shape_in_axis);
+      PyTuple_SET_ITEM(__pyx_t_5, 0, __pyx_kp_u_Invalid_shape_in_axis);
+      __pyx_t_6 = __Pyx_PyUnicode_From_int(__pyx_v_idx, 0, ' ', 'd'); if (unlikely(!__pyx_t_6)) __PYX_ERR(1, 161, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_6);
+      __pyx_t_10 += __Pyx_PyUnicode_GET_LENGTH(__pyx_t_6);
+      __Pyx_GIVEREF(__pyx_t_6);
+      PyTuple_SET_ITEM(__pyx_t_5, 1, __pyx_t_6);
+      __pyx_t_6 = 0;
+      __Pyx_INCREF(__pyx_kp_u_);
+      __pyx_t_10 += 2;
+      __Pyx_GIVEREF(__pyx_kp_u_);
+      PyTuple_SET_ITEM(__pyx_t_5, 2, __pyx_kp_u_);
+      __pyx_t_6 = __Pyx_PyUnicode_From_Py_ssize_t(__pyx_v_dim, 0, ' ', 'd'); if (unlikely(!__pyx_t_6)) __PYX_ERR(1, 161, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_6);
+      __pyx_t_10 += __Pyx_PyUnicode_GET_LENGTH(__pyx_t_6);
+      __Pyx_GIVEREF(__pyx_t_6);
+      PyTuple_SET_ITEM(__pyx_t_5, 3, __pyx_t_6);
+      __pyx_t_6 = 0;
+      __Pyx_INCREF(__pyx_kp_u__2);
+      __pyx_t_10 += 1;
+      __Pyx_GIVEREF(__pyx_kp_u__2);
+      PyTuple_SET_ITEM(__pyx_t_5, 4, __pyx_kp_u__2);
+      __pyx_t_6 = __Pyx_PyUnicode_Join(__pyx_t_5, 5, __pyx_t_10, __pyx_t_11); if (unlikely(!__pyx_t_6)) __PYX_ERR(1, 161, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_6);
+      __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
+      __Pyx_Raise(__pyx_builtin_ValueError, __pyx_t_6, 0, 0);
+      __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0;
+      __PYX_ERR(1, 161, __pyx_L1_error)
+
+      /* "View.MemoryView":160
+ * 
+ *         for idx, dim in enumerate(shape):
+ *             if dim <= 0:             # <<<<<<<<<<<<<<
+ *                 raise ValueError, f"Invalid shape in axis {idx}: {dim}."
+ *             self._shape[idx] = dim
+ */
+    }
+
+    /* "View.MemoryView":162
+ *             if dim <= 0:
+ *                 raise ValueError, f"Invalid shape in axis {idx}: {dim}."
+ *             self._shape[idx] = dim             # <<<<<<<<<<<<<<
+ * 
+ *         cdef char order
+ */
+    (__pyx_v_self->_shape[__pyx_v_idx]) = __pyx_v_dim;
+
+    /* "View.MemoryView":159
+ * 
+ * 
+ *         for idx, dim in enumerate(shape):             # <<<<<<<<<<<<<<
+ *             if dim <= 0:
+ *                 raise ValueError, f"Invalid shape in axis {idx}: {dim}."
+ */
+  }
+  __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+
+  /* "View.MemoryView":165
+ * 
+ *         cdef char order
+ *         if mode == 'c':             # <<<<<<<<<<<<<<
+ *             order = b'C'
+ *             self.mode = u'c'
+ */
+  __pyx_t_3 = (__Pyx_PyString_Equals(__pyx_v_mode, __pyx_n_s_c, Py_EQ)); if (unlikely((__pyx_t_3 < 0))) __PYX_ERR(1, 165, __pyx_L1_error)
+  if (__pyx_t_3) {
+
+    /* "View.MemoryView":166
+ *         cdef char order
+ *         if mode == 'c':
+ *             order = b'C'             # <<<<<<<<<<<<<<
+ *             self.mode = u'c'
+ *         elif mode == 'fortran':
+ */
+    __pyx_v_order = 'C';
+
+    /* "View.MemoryView":167
+ *         if mode == 'c':
+ *             order = b'C'
+ *             self.mode = u'c'             # <<<<<<<<<<<<<<
+ *         elif mode == 'fortran':
+ *             order = b'F'
+ */
+    __Pyx_INCREF(__pyx_n_u_c);
+    __Pyx_GIVEREF(__pyx_n_u_c);
+    __Pyx_GOTREF(__pyx_v_self->mode);
+    __Pyx_DECREF(__pyx_v_self->mode);
+    __pyx_v_self->mode = __pyx_n_u_c;
+
+    /* "View.MemoryView":165
+ * 
+ *         cdef char order
+ *         if mode == 'c':             # <<<<<<<<<<<<<<
+ *             order = b'C'
+ *             self.mode = u'c'
+ */
+    goto __pyx_L11;
+  }
+
+  /* "View.MemoryView":168
+ *             order = b'C'
+ *             self.mode = u'c'
+ *         elif mode == 'fortran':             # <<<<<<<<<<<<<<
+ *             order = b'F'
+ *             self.mode = u'fortran'
+ */
+  __pyx_t_3 = (__Pyx_PyString_Equals(__pyx_v_mode, __pyx_n_s_fortran, Py_EQ)); if (unlikely((__pyx_t_3 < 0))) __PYX_ERR(1, 168, __pyx_L1_error)
+  if (likely(__pyx_t_3)) {
+
+    /* "View.MemoryView":169
+ *             self.mode = u'c'
+ *         elif mode == 'fortran':
+ *             order = b'F'             # <<<<<<<<<<<<<<
+ *             self.mode = u'fortran'
+ *         else:
+ */
+    __pyx_v_order = 'F';
+
+    /* "View.MemoryView":170
+ *         elif mode == 'fortran':
+ *             order = b'F'
+ *             self.mode = u'fortran'             # <<<<<<<<<<<<<<
+ *         else:
+ *             raise ValueError, f"Invalid mode, expected 'c' or 'fortran', got {mode}"
+ */
+    __Pyx_INCREF(__pyx_n_u_fortran);
+    __Pyx_GIVEREF(__pyx_n_u_fortran);
+    __Pyx_GOTREF(__pyx_v_self->mode);
+    __Pyx_DECREF(__pyx_v_self->mode);
+    __pyx_v_self->mode = __pyx_n_u_fortran;
+
+    /* "View.MemoryView":168
+ *             order = b'C'
+ *             self.mode = u'c'
+ *         elif mode == 'fortran':             # <<<<<<<<<<<<<<
+ *             order = b'F'
+ *             self.mode = u'fortran'
+ */
+    goto __pyx_L11;
+  }
+
+  /* "View.MemoryView":172
+ *             self.mode = u'fortran'
+ *         else:
+ *             raise ValueError, f"Invalid mode, expected 'c' or 'fortran', got {mode}"             # <<<<<<<<<<<<<<
+ * 
+ *         self.len = fill_contig_strides_array(self._shape, self._strides, itemsize, self.ndim, order)
+ */
+  /*else*/ {
+    __pyx_t_4 = __Pyx_PyObject_FormatSimple(__pyx_v_mode, __pyx_empty_unicode); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 172, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_4);
+    __pyx_t_6 = __Pyx_PyUnicode_Concat(__pyx_kp_u_Invalid_mode_expected_c_or_fortr, __pyx_t_4); if (unlikely(!__pyx_t_6)) __PYX_ERR(1, 172, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_6);
+    __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+    __Pyx_Raise(__pyx_builtin_ValueError, __pyx_t_6, 0, 0);
+    __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0;
+    __PYX_ERR(1, 172, __pyx_L1_error)
+  }
+  __pyx_L11:;
+
+  /* "View.MemoryView":174
+ *             raise ValueError, f"Invalid mode, expected 'c' or 'fortran', got {mode}"
+ * 
+ *         self.len = fill_contig_strides_array(self._shape, self._strides, itemsize, self.ndim, order)             # <<<<<<<<<<<<<<
+ * 
+ *         self.free_data = allocate_buffer
+ */
+  __pyx_v_self->len = __pyx_fill_contig_strides_array(__pyx_v_self->_shape, __pyx_v_self->_strides, __pyx_v_itemsize, __pyx_v_self->ndim, __pyx_v_order);
+
+  /* "View.MemoryView":176
+ *         self.len = fill_contig_strides_array(self._shape, self._strides, itemsize, self.ndim, order)
+ * 
+ *         self.free_data = allocate_buffer             # <<<<<<<<<<<<<<
+ *         self.dtype_is_object = format == b'O'
+ * 
+ */
+  __pyx_v_self->free_data = __pyx_v_allocate_buffer;
+
+  /* "View.MemoryView":177
+ * 
+ *         self.free_data = allocate_buffer
+ *         self.dtype_is_object = format == b'O'             # <<<<<<<<<<<<<<
+ * 
+ *         if allocate_buffer:
+ */
+  __pyx_t_6 = PyObject_RichCompare(__pyx_v_format, __pyx_n_b_O, Py_EQ); __Pyx_XGOTREF(__pyx_t_6); if (unlikely(!__pyx_t_6)) __PYX_ERR(1, 177, __pyx_L1_error)
+  __pyx_t_3 = __Pyx_PyObject_IsTrue(__pyx_t_6); if (unlikely((__pyx_t_3 == (int)-1) && PyErr_Occurred())) __PYX_ERR(1, 177, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0;
+  __pyx_v_self->dtype_is_object = __pyx_t_3;
+
+  /* "View.MemoryView":179
+ *         self.dtype_is_object = format == b'O'
+ * 
+ *         if allocate_buffer:             # <<<<<<<<<<<<<<
+ *             _allocate_buffer(self)
+ * 
+ */
+  if (__pyx_v_allocate_buffer) {
+
+    /* "View.MemoryView":180
+ * 
+ *         if allocate_buffer:
+ *             _allocate_buffer(self)             # <<<<<<<<<<<<<<
+ * 
+ *     @cname('getbuffer')
+ */
+    __pyx_t_9 = __pyx_array_allocate_buffer(__pyx_v_self); if (unlikely(__pyx_t_9 == ((int)-1))) __PYX_ERR(1, 180, __pyx_L1_error)
+
+    /* "View.MemoryView":179
+ *         self.dtype_is_object = format == b'O'
+ * 
+ *         if allocate_buffer:             # <<<<<<<<<<<<<<
+ *             _allocate_buffer(self)
+ * 
+ */
+  }
+
+  /* "View.MemoryView":131
+ *         cdef bint dtype_is_object
+ * 
+ *     def __cinit__(array self, tuple shape, Py_ssize_t itemsize, format not None,             # <<<<<<<<<<<<<<
+ *                   mode="c", bint allocate_buffer=True):
+ * 
+ */
+
+  /* function exit code */
+  __pyx_r = 0;
+  goto __pyx_L0;
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_4);
+  __Pyx_XDECREF(__pyx_t_5);
+  __Pyx_XDECREF(__pyx_t_6);
+  __Pyx_AddTraceback("View.MemoryView.array.__cinit__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = -1;
+  __pyx_L0:;
+  __Pyx_XDECREF(__pyx_v_format);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":182
+ *             _allocate_buffer(self)
+ * 
+ *     @cname('getbuffer')             # <<<<<<<<<<<<<<
+ *     def __getbuffer__(self, Py_buffer *info, int flags):
+ *         cdef int bufmode = -1
+ */
+
+/* Python wrapper for array.__getbuffer__.
+ * Casts the generic self pointer to struct __pyx_array_obj and forwards
+ * info and flags unchanged to the implementation function, returning that
+ * function's int result as-is. */
+CYTHON_UNUSED static int __pyx_array_getbuffer(PyObject *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags); /*proto*/
+CYTHON_UNUSED static int __pyx_array_getbuffer(PyObject *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags) {
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  int __pyx_r;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__getbuffer__ (wrapper)", 0);
+  __pyx_kwvalues = __Pyx_KwValues_VARARGS(__pyx_args, __pyx_nargs); /* value is never read in this wrapper; __pyx_args/__pyx_nargs are not declared here, so the macro presumably discards its arguments -- confirm against its definition */
+  __pyx_r = __pyx_array___pyx_pf_15View_dot_MemoryView_5array_2__getbuffer__(((struct __pyx_array_obj *)__pyx_v_self), ((Py_buffer *)__pyx_v_info), ((int)__pyx_v_flags));
+
+  /* function exit code */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static int __pyx_array___pyx_pf_15View_dot_MemoryView_5array_2__getbuffer__(struct __pyx_array_obj *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags) { /* Implementation of array.__getbuffer__.
+ * Fills *info from the fields of self according to the request bits in flags.
+ * Returns 0 on success, with info->obj holding a new reference to self.
+ * Returns -1 when info is NULL (BufferError is set and info is not touched),
+ * when the requested contiguity does not match self->mode (ValueError), or
+ * when a mode comparison reports an error; on the __pyx_L1_error path
+ * info->obj is released and cleared.
+ */
+  int __pyx_v_bufmode;
+  int __pyx_r;
+  __Pyx_RefNannyDeclarations
+  int __pyx_t_1;
+  char *__pyx_t_2;
+  Py_ssize_t __pyx_t_3;
+  int __pyx_t_4;
+  Py_ssize_t *__pyx_t_5;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  if (unlikely(__pyx_v_info == NULL)) {
+    PyErr_SetString(PyExc_BufferError, "PyObject_GetBuffer: view==NULL argument is obsolete");
+    return -1;
+  }
+  __Pyx_RefNannySetupContext("__getbuffer__", 0);
+  __pyx_v_info->obj = Py_None; __Pyx_INCREF(Py_None); /* placeholder owner until info->obj is set to self further down */
+  __Pyx_GIVEREF(__pyx_v_info->obj);
+
+  /* "View.MemoryView":184
+ *     @cname('getbuffer')
+ *     def __getbuffer__(self, Py_buffer *info, int flags):
+ *         cdef int bufmode = -1             # <<<<<<<<<<<<<<
+ *         if flags & (PyBUF_C_CONTIGUOUS | PyBUF_F_CONTIGUOUS | PyBUF_ANY_CONTIGUOUS):
+ *             if self.mode == u"c":
+ *
+ * bufmode starts as -1 (all bits set). If self->mode matches neither u"c"
+ * nor u"fortran" it keeps that value, so the (flags & bufmode) test below
+ * cannot fail for such an array.
+ */
+  __pyx_v_bufmode = -1;
+
+  /* "View.MemoryView":185
+ *     def __getbuffer__(self, Py_buffer *info, int flags):
+ *         cdef int bufmode = -1
+ *         if flags & (PyBUF_C_CONTIGUOUS | PyBUF_F_CONTIGUOUS | PyBUF_ANY_CONTIGUOUS):             # <<<<<<<<<<<<<<
+ *             if self.mode == u"c":
+ *                 bufmode = PyBUF_C_CONTIGUOUS | PyBUF_ANY_CONTIGUOUS
+ *
+ * The contiguity check is skipped entirely when the caller requested none
+ * of the three contiguity flags.
+ */
+  __pyx_t_1 = ((__pyx_v_flags & ((PyBUF_C_CONTIGUOUS | PyBUF_F_CONTIGUOUS) | PyBUF_ANY_CONTIGUOUS)) != 0);
+  if (__pyx_t_1) {
+
+    /* "View.MemoryView":186
+ *         cdef int bufmode = -1
+ *         if flags & (PyBUF_C_CONTIGUOUS | PyBUF_F_CONTIGUOUS | PyBUF_ANY_CONTIGUOUS):
+ *             if self.mode == u"c":             # <<<<<<<<<<<<<<
+ *                 bufmode = PyBUF_C_CONTIGUOUS | PyBUF_ANY_CONTIGUOUS
+ *             elif self.mode == u"fortran":
+ */
+    __pyx_t_1 = (__Pyx_PyUnicode_Equals(__pyx_v_self->mode, __pyx_n_u_c, Py_EQ)); if (unlikely((__pyx_t_1 < 0))) __PYX_ERR(1, 186, __pyx_L1_error)
+    if (__pyx_t_1) {
+
+      /* "View.MemoryView":187
+ *         if flags & (PyBUF_C_CONTIGUOUS | PyBUF_F_CONTIGUOUS | PyBUF_ANY_CONTIGUOUS):
+ *             if self.mode == u"c":
+ *                 bufmode = PyBUF_C_CONTIGUOUS | PyBUF_ANY_CONTIGUOUS             # <<<<<<<<<<<<<<
+ *             elif self.mode == u"fortran":
+ *                 bufmode = PyBUF_F_CONTIGUOUS | PyBUF_ANY_CONTIGUOUS
+ */
+      __pyx_v_bufmode = (PyBUF_C_CONTIGUOUS | PyBUF_ANY_CONTIGUOUS);
+
+      /* "View.MemoryView":186
+ *         cdef int bufmode = -1
+ *         if flags & (PyBUF_C_CONTIGUOUS | PyBUF_F_CONTIGUOUS | PyBUF_ANY_CONTIGUOUS):
+ *             if self.mode == u"c":             # <<<<<<<<<<<<<<
+ *                 bufmode = PyBUF_C_CONTIGUOUS | PyBUF_ANY_CONTIGUOUS
+ *             elif self.mode == u"fortran":
+ */
+      goto __pyx_L4;
+    }
+
+    /* "View.MemoryView":188
+ *             if self.mode == u"c":
+ *                 bufmode = PyBUF_C_CONTIGUOUS | PyBUF_ANY_CONTIGUOUS
+ *             elif self.mode == u"fortran":             # <<<<<<<<<<<<<<
+ *                 bufmode = PyBUF_F_CONTIGUOUS | PyBUF_ANY_CONTIGUOUS
+ *             if not (flags & bufmode):
+ */
+    __pyx_t_1 = (__Pyx_PyUnicode_Equals(__pyx_v_self->mode, __pyx_n_u_fortran, Py_EQ)); if (unlikely((__pyx_t_1 < 0))) __PYX_ERR(1, 188, __pyx_L1_error)
+    if (__pyx_t_1) {
+
+      /* "View.MemoryView":189
+ *                 bufmode = PyBUF_C_CONTIGUOUS | PyBUF_ANY_CONTIGUOUS
+ *             elif self.mode == u"fortran":
+ *                 bufmode = PyBUF_F_CONTIGUOUS | PyBUF_ANY_CONTIGUOUS             # <<<<<<<<<<<<<<
+ *             if not (flags & bufmode):
+ *                 raise ValueError, "Can only create a buffer that is contiguous in memory."
+ */
+      __pyx_v_bufmode = (PyBUF_F_CONTIGUOUS | PyBUF_ANY_CONTIGUOUS);
+
+      /* "View.MemoryView":188
+ *             if self.mode == u"c":
+ *                 bufmode = PyBUF_C_CONTIGUOUS | PyBUF_ANY_CONTIGUOUS
+ *             elif self.mode == u"fortran":             # <<<<<<<<<<<<<<
+ *                 bufmode = PyBUF_F_CONTIGUOUS | PyBUF_ANY_CONTIGUOUS
+ *             if not (flags & bufmode):
+ */
+    }
+    __pyx_L4:;
+
+    /* "View.MemoryView":190
+ *             elif self.mode == u"fortran":
+ *                 bufmode = PyBUF_F_CONTIGUOUS | PyBUF_ANY_CONTIGUOUS
+ *             if not (flags & bufmode):             # <<<<<<<<<<<<<<
+ *                 raise ValueError, "Can only create a buffer that is contiguous in memory."
+ *         info.buf = self.data
+ *
+ * Rejects the request when none of the bits the array's layout can satisfy
+ * were asked for.
+ */
+    __pyx_t_1 = (!((__pyx_v_flags & __pyx_v_bufmode) != 0));
+    if (unlikely(__pyx_t_1)) {
+
+      /* "View.MemoryView":191
+ *                 bufmode = PyBUF_F_CONTIGUOUS | PyBUF_ANY_CONTIGUOUS
+ *             if not (flags & bufmode):
+ *                 raise ValueError, "Can only create a buffer that is contiguous in memory."             # <<<<<<<<<<<<<<
+ *         info.buf = self.data
+ *         info.len = self.len
+ */
+      __Pyx_Raise(__pyx_builtin_ValueError, __pyx_kp_s_Can_only_create_a_buffer_that_is, 0, 0);
+      __PYX_ERR(1, 191, __pyx_L1_error)
+
+      /* "View.MemoryView":190
+ *             elif self.mode == u"fortran":
+ *                 bufmode = PyBUF_F_CONTIGUOUS | PyBUF_ANY_CONTIGUOUS
+ *             if not (flags & bufmode):             # <<<<<<<<<<<<<<
+ *                 raise ValueError, "Can only create a buffer that is contiguous in memory."
+ *         info.buf = self.data
+ */
+    }
+
+    /* "View.MemoryView":185
+ *     def __getbuffer__(self, Py_buffer *info, int flags):
+ *         cdef int bufmode = -1
+ *         if flags & (PyBUF_C_CONTIGUOUS | PyBUF_F_CONTIGUOUS | PyBUF_ANY_CONTIGUOUS):             # <<<<<<<<<<<<<<
+ *             if self.mode == u"c":
+ *                 bufmode = PyBUF_C_CONTIGUOUS | PyBUF_ANY_CONTIGUOUS
+ */
+  }
+
+  /* "View.MemoryView":192
+ *             if not (flags & bufmode):
+ *                 raise ValueError, "Can only create a buffer that is contiguous in memory."
+ *         info.buf = self.data             # <<<<<<<<<<<<<<
+ *         info.len = self.len
+ * 
+ */
+  __pyx_t_2 = __pyx_v_self->data;
+  __pyx_v_info->buf = __pyx_t_2;
+
+  /* "View.MemoryView":193
+ *                 raise ValueError, "Can only create a buffer that is contiguous in memory."
+ *         info.buf = self.data
+ *         info.len = self.len             # <<<<<<<<<<<<<<
+ * 
+ *         if flags & PyBUF_STRIDES:
+ */
+  __pyx_t_3 = __pyx_v_self->len;
+  __pyx_v_info->len = __pyx_t_3;
+
+  /* "View.MemoryView":195
+ *         info.len = self.len
+ * 
+ *         if flags & PyBUF_STRIDES:             # <<<<<<<<<<<<<<
+ *             info.ndim = self.ndim
+ *             info.shape = self._shape
+ *
+ * With PyBUF_STRIDES the view exposes the array's own ndim, _shape and
+ * _strides pointers (no copies are made).
+ */
+  __pyx_t_1 = ((__pyx_v_flags & PyBUF_STRIDES) != 0);
+  if (__pyx_t_1) {
+
+    /* "View.MemoryView":196
+ * 
+ *         if flags & PyBUF_STRIDES:
+ *             info.ndim = self.ndim             # <<<<<<<<<<<<<<
+ *             info.shape = self._shape
+ *             info.strides = self._strides
+ */
+    __pyx_t_4 = __pyx_v_self->ndim;
+    __pyx_v_info->ndim = __pyx_t_4;
+
+    /* "View.MemoryView":197
+ *         if flags & PyBUF_STRIDES:
+ *             info.ndim = self.ndim
+ *             info.shape = self._shape             # <<<<<<<<<<<<<<
+ *             info.strides = self._strides
+ *         else:
+ */
+    __pyx_t_5 = __pyx_v_self->_shape;
+    __pyx_v_info->shape = __pyx_t_5;
+
+    /* "View.MemoryView":198
+ *             info.ndim = self.ndim
+ *             info.shape = self._shape
+ *             info.strides = self._strides             # <<<<<<<<<<<<<<
+ *         else:
+ *             info.ndim = 1
+ */
+    __pyx_t_5 = __pyx_v_self->_strides;
+    __pyx_v_info->strides = __pyx_t_5;
+
+    /* "View.MemoryView":195
+ *         info.len = self.len
+ * 
+ *         if flags & PyBUF_STRIDES:             # <<<<<<<<<<<<<<
+ *             info.ndim = self.ndim
+ *             info.shape = self._shape
+ */
+    goto __pyx_L6;
+  }
+
+  /* "View.MemoryView":200
+ *             info.strides = self._strides
+ *         else:
+ *             info.ndim = 1             # <<<<<<<<<<<<<<
+ *             info.shape = &self.len if flags & PyBUF_ND else NULL
+ *             info.strides = NULL
+ *
+ * Without PyBUF_STRIDES the buffer is described as one-dimensional; its
+ * single shape entry, when PyBUF_ND is requested, is the address of
+ * self->len.
+ */
+  /*else*/ {
+    __pyx_v_info->ndim = 1;
+
+    /* "View.MemoryView":201
+ *         else:
+ *             info.ndim = 1
+ *             info.shape = &self.len if flags & PyBUF_ND else NULL             # <<<<<<<<<<<<<<
+ *             info.strides = NULL
+ * 
+ */
+    __pyx_t_1 = ((__pyx_v_flags & PyBUF_ND) != 0);
+    if (__pyx_t_1) {
+      __pyx_t_5 = (&__pyx_v_self->len);
+    } else {
+      __pyx_t_5 = NULL;
+    }
+    __pyx_v_info->shape = __pyx_t_5;
+
+    /* "View.MemoryView":202
+ *             info.ndim = 1
+ *             info.shape = &self.len if flags & PyBUF_ND else NULL
+ *             info.strides = NULL             # <<<<<<<<<<<<<<
+ * 
+ *         info.suboffsets = NULL
+ */
+    __pyx_v_info->strides = NULL;
+  }
+  __pyx_L6:;
+
+  /* "View.MemoryView":204
+ *             info.strides = NULL
+ * 
+ *         info.suboffsets = NULL             # <<<<<<<<<<<<<<
+ *         info.itemsize = self.itemsize
+ *         info.readonly = 0
+ */
+  __pyx_v_info->suboffsets = NULL;
+
+  /* "View.MemoryView":205
+ * 
+ *         info.suboffsets = NULL
+ *         info.itemsize = self.itemsize             # <<<<<<<<<<<<<<
+ *         info.readonly = 0
+ *         info.format = self.format if flags & PyBUF_FORMAT else NULL
+ */
+  __pyx_t_3 = __pyx_v_self->itemsize;
+  __pyx_v_info->itemsize = __pyx_t_3;
+
+  /* "View.MemoryView":206
+ *         info.suboffsets = NULL
+ *         info.itemsize = self.itemsize
+ *         info.readonly = 0             # <<<<<<<<<<<<<<
+ *         info.format = self.format if flags & PyBUF_FORMAT else NULL
+ *         info.obj = self
+ */
+  __pyx_v_info->readonly = 0;
+
+  /* "View.MemoryView":207
+ *         info.itemsize = self.itemsize
+ *         info.readonly = 0
+ *         info.format = self.format if flags & PyBUF_FORMAT else NULL             # <<<<<<<<<<<<<<
+ *         info.obj = self
+ * 
+ */
+  __pyx_t_1 = ((__pyx_v_flags & PyBUF_FORMAT) != 0);
+  if (__pyx_t_1) {
+    __pyx_t_2 = __pyx_v_self->format;
+  } else {
+    __pyx_t_2 = NULL;
+  }
+  __pyx_v_info->format = __pyx_t_2;
+
+  /* "View.MemoryView":208
+ *         info.readonly = 0
+ *         info.format = self.format if flags & PyBUF_FORMAT else NULL
+ *         info.obj = self             # <<<<<<<<<<<<<<
+ * 
+ *     def __dealloc__(array self):
+ *
+ * Takes a new reference to self for info->obj and drops the reference to
+ * the value stored there before (the Py_None placeholder set on entry).
+ */
+  __Pyx_INCREF((PyObject *)__pyx_v_self);
+  __Pyx_GIVEREF((PyObject *)__pyx_v_self);
+  __Pyx_GOTREF(__pyx_v_info->obj);
+  __Pyx_DECREF(__pyx_v_info->obj);
+  __pyx_v_info->obj = ((PyObject *)__pyx_v_self);
+
+  /* "View.MemoryView":182
+ *             _allocate_buffer(self)
+ * 
+ *     @cname('getbuffer')             # <<<<<<<<<<<<<<
+ *     def __getbuffer__(self, Py_buffer *info, int flags):
+ *         cdef int bufmode = -1
+ */
+
+  /* function exit code */
+  __pyx_r = 0;
+  goto __pyx_L0;
+  __pyx_L1_error:; /* error exit: add a traceback entry, release whatever info->obj holds, return -1 */
+  __Pyx_AddTraceback("View.MemoryView.array.__getbuffer__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = -1;
+  if (__pyx_v_info->obj != NULL) {
+    __Pyx_GOTREF(__pyx_v_info->obj);
+    __Pyx_DECREF(__pyx_v_info->obj); __pyx_v_info->obj = 0;
+  }
+  goto __pyx_L2;
+  __pyx_L0:; /* normal exit: only a leftover Py_None placeholder would be released here; on this path info->obj was already replaced by self above */
+  if (__pyx_v_info->obj == Py_None) {
+    __Pyx_GOTREF(__pyx_v_info->obj);
+    __Pyx_DECREF(__pyx_v_info->obj); __pyx_v_info->obj = 0;
+  }
+  __pyx_L2:;
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":210
+ *         info.obj = self
+ * 
+ *     def __dealloc__(array self):             # <<<<<<<<<<<<<<
+ *         if self.callback_free_data != NULL:
+ *             self.callback_free_data(self.data)
+ */
+
+/* Python wrapper for array.__dealloc__.
+ * Casts the generic self pointer to struct __pyx_array_obj and hands it to
+ * the implementation function; nothing is returned. */
+static void __pyx_array___dealloc__(PyObject *__pyx_v_self); /*proto*/
+static void __pyx_array___dealloc__(PyObject *__pyx_v_self) {
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__dealloc__ (wrapper)", 0);
+  __pyx_kwvalues = __Pyx_KwValues_VARARGS(__pyx_args, __pyx_nargs); /* value is never read in this wrapper; __pyx_args/__pyx_nargs are not declared here, so the macro presumably discards its arguments -- confirm against its definition */
+  __pyx_array___pyx_pf_15View_dot_MemoryView_5array_4__dealloc__(((struct __pyx_array_obj *)__pyx_v_self));
+
+  /* function exit code */
+  __Pyx_RefNannyFinishContext();
+}
+
+static void __pyx_array___pyx_pf_15View_dot_MemoryView_5array_4__dealloc__(struct __pyx_array_obj *__pyx_v_self) {
+  /* Implementation of array.__dealloc__ ("View.MemoryView":210-217).
+   *
+   * Releases the data buffer and then the shape block:
+   *   - a non-NULL callback_free_data takes precedence and is called with
+   *     self->data;
+   *   - otherwise the buffer is passed to free() only when free_data is set
+   *     and data is non-NULL; when dtype_is_object is set,
+   *     refcount_objects_in_slice is first called with inc=False over the
+   *     buffer.
+   */
+  if (__pyx_v_self->callback_free_data != NULL) {
+    __pyx_v_self->callback_free_data(__pyx_v_self->data);
+  } else if (__pyx_v_self->free_data && (__pyx_v_self->data != NULL)) {
+    if (__pyx_v_self->dtype_is_object) {
+      __pyx_memoryview_refcount_objects_in_slice(__pyx_v_self->data, __pyx_v_self->_shape, __pyx_v_self->_strides, __pyx_v_self->ndim, 0);
+    }
+    free(__pyx_v_self->data);
+  }
+
+  /* _shape is released unconditionally. The __cinit__ source sets
+   * _strides = _shape + ndim, so the strides appear to share this
+   * allocation and need no separate release. */
+  PyObject_Free(__pyx_v_self->_shape);
+}
+
+/* "View.MemoryView":219
+ *         PyObject_Free(self._shape)
+ * 
+ *     @property             # <<<<<<<<<<<<<<
+ *     def memview(self):
+ *         return self.get_memview()
+ */
+
+/* Python wrapper for the getter of the array.memview property.
+ * Casts the generic self pointer to struct __pyx_array_obj and returns
+ * whatever the implementation function returns. */
+static PyObject *__pyx_pw_15View_dot_MemoryView_5array_7memview_1__get__(PyObject *__pyx_v_self); /*proto*/
+static PyObject *__pyx_pw_15View_dot_MemoryView_5array_7memview_1__get__(PyObject *__pyx_v_self) {
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__get__ (wrapper)", 0);
+  __pyx_kwvalues = __Pyx_KwValues_VARARGS(__pyx_args, __pyx_nargs); /* value is never read in this wrapper; __pyx_args/__pyx_nargs are not declared here, so the macro presumably discards its arguments -- confirm against its definition */
+  __pyx_r = __pyx_pf_15View_dot_MemoryView_5array_7memview___get__(((struct __pyx_array_obj *)__pyx_v_self));
+
+  /* function exit code */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static PyObject *__pyx_pf_15View_dot_MemoryView_5array_7memview___get__(struct __pyx_array_obj *__pyx_v_self) { /* Getter body for the array.memview property.
+ * Calls get_memview through the object's vtable and returns the resulting
+ * reference. If that call yields NULL, a traceback entry is added and NULL
+ * is returned.
+ */
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("__get__", 1);
+
+  /* "View.MemoryView":221
+ *     @property
+ *     def memview(self):
+ *         return self.get_memview()             # <<<<<<<<<<<<<<
+ * 
+ *     @cname('get_memview')
+ */
+  __Pyx_XDECREF(__pyx_r);
+  __pyx_t_1 = ((struct __pyx_vtabstruct_array *)__pyx_v_self->__pyx_vtab)->get_memview(__pyx_v_self); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 221, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_r = __pyx_t_1;
+  __pyx_t_1 = 0; /* reference moved into __pyx_r; clearing the temp keeps it out of the error-path decref */
+  goto __pyx_L0;
+
+  /* "View.MemoryView":219
+ *         PyObject_Free(self._shape)
+ * 
+ *     @property             # <<<<<<<<<<<<<<
+ *     def memview(self):
+ *         return self.get_memview()
+ */
+
+  /* function exit code */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_AddTraceback("View.MemoryView.array.memview.__get__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":224
+ * 
+ *     @cname('get_memview')
+ *     cdef get_memview(self):             # <<<<<<<<<<<<<<
+ *         flags =  PyBUF_ANY_CONTIGUOUS|PyBUF_FORMAT|PyBUF_WRITABLE
+ *         return  memoryview(self, flags, self.dtype_is_object)
+ */
+
+static PyObject *__pyx_array_get_memview(struct __pyx_array_obj *__pyx_v_self) { /* Implementation of the cdef method array.get_memview.
+ * Builds the argument tuple (self, flags, dtype_is_object) with
+ * flags = PyBUF_ANY_CONTIGUOUS | PyBUF_FORMAT | PyBUF_WRITABLE, calls the
+ * memoryview type object with it and returns the result.
+ * On any failure the live temporaries are released, a traceback entry is
+ * added and 0 is returned.
+ */
+  int __pyx_v_flags;
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  PyObject *__pyx_t_2 = NULL;
+  PyObject *__pyx_t_3 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("get_memview", 1);
+
+  /* "View.MemoryView":225
+ *     @cname('get_memview')
+ *     cdef get_memview(self):
+ *         flags =  PyBUF_ANY_CONTIGUOUS|PyBUF_FORMAT|PyBUF_WRITABLE             # <<<<<<<<<<<<<<
+ *         return  memoryview(self, flags, self.dtype_is_object)
+ * 
+ */
+  __pyx_v_flags = ((PyBUF_ANY_CONTIGUOUS | PyBUF_FORMAT) | PyBUF_WRITABLE);
+
+  /* "View.MemoryView":226
+ *     cdef get_memview(self):
+ *         flags =  PyBUF_ANY_CONTIGUOUS|PyBUF_FORMAT|PyBUF_WRITABLE
+ *         return  memoryview(self, flags, self.dtype_is_object)             # <<<<<<<<<<<<<<
+ * 
+ *     def __len__(self):
+ *
+ * __pyx_t_1 holds flags boxed as a Python int, __pyx_t_2 holds
+ * dtype_is_object boxed as a Python bool, and __pyx_t_3 is the 3-tuple of
+ * call arguments.
+ */
+  __Pyx_XDECREF(__pyx_r);
+  __pyx_t_1 = __Pyx_PyInt_From_int(__pyx_v_flags); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 226, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_t_2 = __Pyx_PyBool_FromLong(__pyx_v_self->dtype_is_object); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 226, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_2);
+  __pyx_t_3 = PyTuple_New(3); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 226, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_3);
+  __Pyx_INCREF((PyObject *)__pyx_v_self);
+  __Pyx_GIVEREF((PyObject *)__pyx_v_self);
+  if (__Pyx_PyTuple_SET_ITEM(__pyx_t_3, 0, ((PyObject *)__pyx_v_self))) __PYX_ERR(1, 226, __pyx_L1_error);
+  __Pyx_GIVEREF(__pyx_t_1);
+  if (__Pyx_PyTuple_SET_ITEM(__pyx_t_3, 1, __pyx_t_1)) __PYX_ERR(1, 226, __pyx_L1_error);
+  __Pyx_GIVEREF(__pyx_t_2);
+  if (__Pyx_PyTuple_SET_ITEM(__pyx_t_3, 2, __pyx_t_2)) __PYX_ERR(1, 226, __pyx_L1_error);
+  __pyx_t_1 = 0; /* temps are cleared without a decref: from here on the tuple is treated as owning both objects */
+  __pyx_t_2 = 0;
+  __pyx_t_2 = __Pyx_PyObject_Call(((PyObject *)__pyx_memoryview_type), __pyx_t_3, NULL); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 226, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_2);
+  __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; /* argument tuple is no longer needed once the call has returned */
+  __pyx_r = __pyx_t_2;
+  __pyx_t_2 = 0;
+  goto __pyx_L0;
+
+  /* "View.MemoryView":224
+ * 
+ *     @cname('get_memview')
+ *     cdef get_memview(self):             # <<<<<<<<<<<<<<
+ *         flags =  PyBUF_ANY_CONTIGUOUS|PyBUF_FORMAT|PyBUF_WRITABLE
+ *         return  memoryview(self, flags, self.dtype_is_object)
+ */
+
+  /* function exit code */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_XDECREF(__pyx_t_2);
+  __Pyx_XDECREF(__pyx_t_3);
+  __Pyx_AddTraceback("View.MemoryView.array.get_memview", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = 0;
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":228
+ *         return  memoryview(self, flags, self.dtype_is_object)
+ * 
+ *     def __len__(self):             # <<<<<<<<<<<<<<
+ *         return self._shape[0]
+ * 
+ */
+
+/* Python wrapper for array.__len__: casts self to struct __pyx_array_obj * and returns the implementation's Py_ssize_t result unchanged. */
+static Py_ssize_t __pyx_array___len__(PyObject *__pyx_v_self); /*proto*/
+static Py_ssize_t __pyx_array___len__(PyObject *__pyx_v_self) {
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  Py_ssize_t __pyx_r;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__len__ (wrapper)", 0);
+  __pyx_kwvalues = __Pyx_KwValues_VARARGS(__pyx_args, __pyx_nargs); /* NOTE(review): __pyx_args and __pyx_nargs are not declared in this function; presumably the macro ignores its arguments - confirm against its definition */
+  __pyx_r = __pyx_array___pyx_pf_15View_dot_MemoryView_5array_6__len__(((struct __pyx_array_obj *)__pyx_v_self));
+
+  /* function exit code */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static Py_ssize_t __pyx_array___pyx_pf_15View_dot_MemoryView_5array_6__len__(struct __pyx_array_obj *__pyx_v_self) { /* implementation of array.__len__: returns self->_shape[0] */
+  Py_ssize_t __pyx_r;
+
+  /* "View.MemoryView":229
+ * 
+ *     def __len__(self):
+ *         return self._shape[0]             # <<<<<<<<<<<<<<
+ * 
+ *     def __getattr__(self, attr):
+ */
+  __pyx_r = (__pyx_v_self->_shape[0]); /* plain struct field read; this function has no error path */
+  goto __pyx_L0;
+
+  /* "View.MemoryView":228
+ *         return  memoryview(self, flags, self.dtype_is_object)
+ * 
+ *     def __len__(self):             # <<<<<<<<<<<<<<
+ *         return self._shape[0]
+ * 
+ */
+
+  /* function exit code */
+  __pyx_L0:;
+  return __pyx_r;
+}
+
+/* "View.MemoryView":231
+ *         return self._shape[0]
+ * 
+ *     def __getattr__(self, attr):             # <<<<<<<<<<<<<<
+ *         return getattr(self.memview, attr)
+ * 
+ */
+
+/* Python wrapper for array.__getattr__(attr): casts self to struct __pyx_array_obj * and forwards attr to the implementation function. */
+static PyObject *__pyx_array___getattr__(PyObject *__pyx_v_self, PyObject *__pyx_v_attr); /*proto*/
+static PyObject *__pyx_array___getattr__(PyObject *__pyx_v_self, PyObject *__pyx_v_attr) {
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__getattr__ (wrapper)", 0);
+  __pyx_kwvalues = __Pyx_KwValues_VARARGS(__pyx_args, __pyx_nargs); /* NOTE(review): __pyx_args and __pyx_nargs are not declared in this function; presumably the macro ignores its arguments - confirm against its definition */
+  __pyx_r = __pyx_array___pyx_pf_15View_dot_MemoryView_5array_8__getattr__(((struct __pyx_array_obj *)__pyx_v_self), ((PyObject *)__pyx_v_attr));
+
+  /* function exit code */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r; /* the implementation's result (NULL on error) is passed through unchanged */
+}
+
+static PyObject *__pyx_array___pyx_pf_15View_dot_MemoryView_5array_8__getattr__(struct __pyx_array_obj *__pyx_v_self, PyObject *__pyx_v_attr) { /* implementation of array.__getattr__: delegates the lookup of attr to self.memview; returns NULL on failure */
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL; /* temporary: the self.memview object */
+  PyObject *__pyx_t_2 = NULL; /* temporary: the attribute fetched from it */
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("__getattr__", 1);
+
+  /* "View.MemoryView":232
+ * 
+ *     def __getattr__(self, attr):
+ *         return getattr(self.memview, attr)             # <<<<<<<<<<<<<<
+ * 
+ *     def __getitem__(self, item):
+ */
+  __Pyx_XDECREF(__pyx_r);
+  __pyx_t_1 = __Pyx_PyObject_GetAttrStr(((PyObject *)__pyx_v_self), __pyx_n_s_memview); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 232, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_t_2 = __Pyx_GetAttr(__pyx_t_1, __pyx_v_attr); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 232, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_2);
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; /* the memview temporary is no longer needed once the attribute is fetched */
+  __pyx_r = __pyx_t_2; /* hand the fetched attribute to the return slot ... */
+  __pyx_t_2 = 0; /* ... and clear the temp so the error-path XDECREF cannot touch it */
+  goto __pyx_L0;
+
+  /* "View.MemoryView":231
+ *         return self._shape[0]
+ * 
+ *     def __getattr__(self, attr):             # <<<<<<<<<<<<<<
+ *         return getattr(self.memview, attr)
+ * 
+ */
+
+  /* function exit code */
+  __pyx_L1_error:; /* error path: drop any live temporaries, add a traceback entry, return NULL */
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_XDECREF(__pyx_t_2);
+  __Pyx_AddTraceback("View.MemoryView.array.__getattr__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":234
+ *         return getattr(self.memview, attr)
+ * 
+ *     def __getitem__(self, item):             # <<<<<<<<<<<<<<
+ *         return self.memview[item]
+ * 
+ */
+
+/* Python wrapper for array.__getitem__(item): casts self to struct __pyx_array_obj * and forwards item to the implementation function. */
+static PyObject *__pyx_array___getitem__(PyObject *__pyx_v_self, PyObject *__pyx_v_item); /*proto*/
+static PyObject *__pyx_array___getitem__(PyObject *__pyx_v_self, PyObject *__pyx_v_item) {
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__getitem__ (wrapper)", 0);
+  __pyx_kwvalues = __Pyx_KwValues_VARARGS(__pyx_args, __pyx_nargs); /* NOTE(review): __pyx_args and __pyx_nargs are not declared in this function; presumably the macro ignores its arguments - confirm against its definition */
+  __pyx_r = __pyx_array___pyx_pf_15View_dot_MemoryView_5array_10__getitem__(((struct __pyx_array_obj *)__pyx_v_self), ((PyObject *)__pyx_v_item));
+
+  /* function exit code */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r; /* the implementation's result (NULL on error) is passed through unchanged */
+}
+
+static PyObject *__pyx_array___pyx_pf_15View_dot_MemoryView_5array_10__getitem__(struct __pyx_array_obj *__pyx_v_self, PyObject *__pyx_v_item) { /* implementation of array.__getitem__: indexes self.memview with item; returns NULL on failure */
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL; /* temporary: the self.memview object */
+  PyObject *__pyx_t_2 = NULL; /* temporary: the result of memview[item] */
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("__getitem__", 1);
+
+  /* "View.MemoryView":235
+ * 
+ *     def __getitem__(self, item):
+ *         return self.memview[item]             # <<<<<<<<<<<<<<
+ * 
+ *     def __setitem__(self, item, value):
+ */
+  __Pyx_XDECREF(__pyx_r);
+  __pyx_t_1 = __Pyx_PyObject_GetAttrStr(((PyObject *)__pyx_v_self), __pyx_n_s_memview); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 235, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_t_2 = __Pyx_PyObject_GetItem(__pyx_t_1, __pyx_v_item); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 235, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_2);
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; /* the memview temporary is no longer needed once the item is fetched */
+  __pyx_r = __pyx_t_2; /* hand the fetched item to the return slot ... */
+  __pyx_t_2 = 0; /* ... and clear the temp so the error-path XDECREF cannot touch it */
+  goto __pyx_L0;
+
+  /* "View.MemoryView":234
+ *         return getattr(self.memview, attr)
+ * 
+ *     def __getitem__(self, item):             # <<<<<<<<<<<<<<
+ *         return self.memview[item]
+ * 
+ */
+
+  /* function exit code */
+  __pyx_L1_error:; /* error path: drop any live temporaries, add a traceback entry, return NULL */
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_XDECREF(__pyx_t_2);
+  __Pyx_AddTraceback("View.MemoryView.array.__getitem__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":237
+ *         return self.memview[item]
+ * 
+ *     def __setitem__(self, item, value):             # <<<<<<<<<<<<<<
+ *         self.memview[item] = value
+ * 
+ */
+
+/* Python wrapper for array.__setitem__(item, value): casts self to struct __pyx_array_obj * and forwards both arguments; returns the implementation's int status. */
+static int __pyx_array___setitem__(PyObject *__pyx_v_self, PyObject *__pyx_v_item, PyObject *__pyx_v_value); /*proto*/
+static int __pyx_array___setitem__(PyObject *__pyx_v_self, PyObject *__pyx_v_item, PyObject *__pyx_v_value) {
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  int __pyx_r;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__setitem__ (wrapper)", 0);
+  __pyx_kwvalues = __Pyx_KwValues_VARARGS(__pyx_args, __pyx_nargs); /* NOTE(review): __pyx_args and __pyx_nargs are not declared in this function; presumably the macro ignores its arguments - confirm against its definition */
+  __pyx_r = __pyx_array___pyx_pf_15View_dot_MemoryView_5array_12__setitem__(((struct __pyx_array_obj *)__pyx_v_self), ((PyObject *)__pyx_v_item), ((PyObject *)__pyx_v_value));
+
+  /* function exit code */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static int __pyx_array___pyx_pf_15View_dot_MemoryView_5array_12__setitem__(struct __pyx_array_obj *__pyx_v_self, PyObject *__pyx_v_item, PyObject *__pyx_v_value) { /* implementation of array.__setitem__: stores value at self.memview[item]; returns 0 on success, -1 on failure */
+  int __pyx_r;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL; /* temporary: the self.memview object */
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("__setitem__", 1);
+
+  /* "View.MemoryView":238
+ * 
+ *     def __setitem__(self, item, value):
+ *         self.memview[item] = value             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  __pyx_t_1 = __Pyx_PyObject_GetAttrStr(((PyObject *)__pyx_v_self), __pyx_n_s_memview); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 238, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  if (unlikely((PyObject_SetItem(__pyx_t_1, __pyx_v_item, __pyx_v_value) < 0))) __PYX_ERR(1, 238, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; /* release the memview temporary after the store */
+
+  /* "View.MemoryView":237
+ *         return self.memview[item]
+ * 
+ *     def __setitem__(self, item, value):             # <<<<<<<<<<<<<<
+ *         self.memview[item] = value
+ * 
+ */
+
+  /* function exit code */
+  __pyx_r = 0; /* success status */
+  goto __pyx_L0;
+  __pyx_L1_error:; /* error path: drop the temporary if still held, add a traceback entry, report -1 */
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_AddTraceback("View.MemoryView.array.__setitem__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = -1;
+  __pyx_L0:;
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "(tree fragment)":1
+ * def __reduce_cython__(self):             # <<<<<<<<<<<<<<
+ *     raise TypeError, "no default __reduce__ due to non-trivial __cinit__"
+ * def __setstate_cython__(self, __pyx_state):
+ */
+
+/* Python wrapper for array.__reduce_cython__(): rejects any positional or keyword arguments, then calls the implementation. The parameter list depends on CYTHON_METH_FASTCALL. */
+static PyObject *__pyx_pw___pyx_array_1__reduce_cython__(PyObject *__pyx_v_self, 
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+); /*proto*/
+static PyObject *__pyx_pw___pyx_array_1__reduce_cython__(PyObject *__pyx_v_self, 
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+) {
+  #if !CYTHON_METH_FASTCALL
+  CYTHON_UNUSED Py_ssize_t __pyx_nargs; /* not a parameter in this configuration, so it is a local filled in from the args tuple below */
+  #endif
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__reduce_cython__ (wrapper)", 0);
+  #if !CYTHON_METH_FASTCALL
+  #if CYTHON_ASSUME_SAFE_MACROS
+  __pyx_nargs = PyTuple_GET_SIZE(__pyx_args);
+  #else
+  __pyx_nargs = PyTuple_Size(__pyx_args); if (unlikely(__pyx_nargs < 0)) return NULL;
+  #endif
+  #endif
+  __pyx_kwvalues = __Pyx_KwValues_FASTCALL(__pyx_args, __pyx_nargs);
+  if (unlikely(__pyx_nargs > 0)) { /* the method takes no positional arguments */
+    __Pyx_RaiseArgtupleInvalid("__reduce_cython__", 1, 0, 0, __pyx_nargs); return NULL;}
+  if (unlikely(__pyx_kwds) && __Pyx_NumKwargs_FASTCALL(__pyx_kwds) && unlikely(!__Pyx_CheckKeywordStrings(__pyx_kwds, "__reduce_cython__", 0))) return NULL; /* non-empty keyword arguments are validated by the helper; a failed check returns NULL */
+  __pyx_r = __pyx_pf___pyx_array___reduce_cython__(((struct __pyx_array_obj *)__pyx_v_self));
+
+  /* function exit code */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static PyObject *__pyx_pf___pyx_array___reduce_cython__(CYTHON_UNUSED struct __pyx_array_obj *__pyx_v_self) { /* implementation of array.__reduce_cython__: unconditionally raises TypeError and returns NULL; self is unused */
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("__reduce_cython__", 1);
+
+  /* "(tree fragment)":2
+ * def __reduce_cython__(self):
+ *     raise TypeError, "no default __reduce__ due to non-trivial __cinit__"             # <<<<<<<<<<<<<<
+ * def __setstate_cython__(self, __pyx_state):
+ *     raise TypeError, "no default __reduce__ due to non-trivial __cinit__"
+ */
+  __Pyx_Raise(__pyx_builtin_TypeError, __pyx_kp_s_no_default___reduce___due_to_non, 0, 0);
+  __PYX_ERR(1, 2, __pyx_L1_error)
+
+  /* "(tree fragment)":1
+ * def __reduce_cython__(self):             # <<<<<<<<<<<<<<
+ *     raise TypeError, "no default __reduce__ due to non-trivial __cinit__"
+ * def __setstate_cython__(self, __pyx_state):
+ */
+
+  /* function exit code */
+  __pyx_L1_error:; /* the only exit: this function has no success label */
+  __Pyx_AddTraceback("View.MemoryView.array.__reduce_cython__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "(tree fragment)":3
+ * def __reduce_cython__(self):
+ *     raise TypeError, "no default __reduce__ due to non-trivial __cinit__"
+ * def __setstate_cython__(self, __pyx_state):             # <<<<<<<<<<<<<<
+ *     raise TypeError, "no default __reduce__ due to non-trivial __cinit__"
+ */
+
+/* Python wrapper for array.__setstate_cython__(__pyx_state): unpacks exactly one argument (positional or keyword) into values[0], then calls the implementation. The parameter list depends on CYTHON_METH_FASTCALL. */
+static PyObject *__pyx_pw___pyx_array_3__setstate_cython__(PyObject *__pyx_v_self, 
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+); /*proto*/
+static PyObject *__pyx_pw___pyx_array_3__setstate_cython__(PyObject *__pyx_v_self, 
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+) {
+  CYTHON_UNUSED PyObject *__pyx_v___pyx_state = 0;
+  #if !CYTHON_METH_FASTCALL
+  CYTHON_UNUSED Py_ssize_t __pyx_nargs; /* not a parameter in this configuration, so it is a local filled in from the args tuple below */
+  #endif
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject* values[1] = {0}; /* one slot per declared argument; released via __Pyx_Arg_XDECREF_FASTCALL on both exit paths */
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__setstate_cython__ (wrapper)", 0);
+  #if !CYTHON_METH_FASTCALL
+  #if CYTHON_ASSUME_SAFE_MACROS
+  __pyx_nargs = PyTuple_GET_SIZE(__pyx_args);
+  #else
+  __pyx_nargs = PyTuple_Size(__pyx_args); if (unlikely(__pyx_nargs < 0)) return NULL;
+  #endif
+  #endif
+  __pyx_kwvalues = __Pyx_KwValues_FASTCALL(__pyx_args, __pyx_nargs);
+  {
+    PyObject **__pyx_pyargnames[] = {&__pyx_n_s_pyx_state,0}; /* 0-terminated list of accepted keyword names */
+    if (__pyx_kwds) {
+      Py_ssize_t kw_args;
+      switch (__pyx_nargs) { /* first pass: take whatever was passed positionally (0 or 1 argument) */
+        case  1: values[0] = __Pyx_Arg_FASTCALL(__pyx_args, 0);
+        CYTHON_FALLTHROUGH;
+        case  0: break;
+        default: goto __pyx_L5_argtuple_error;
+      }
+      kw_args = __Pyx_NumKwargs_FASTCALL(__pyx_kwds);
+      switch (__pyx_nargs) { /* second pass: if nothing was positional, the argument must come from the keywords */
+        case  0:
+        if (likely((values[0] = __Pyx_GetKwValue_FASTCALL(__pyx_kwds, __pyx_kwvalues, __pyx_n_s_pyx_state)) != 0)) {
+          (void)__Pyx_Arg_NewRef_FASTCALL(values[0]);
+          kw_args--;
+        }
+        else if (unlikely(PyErr_Occurred())) __PYX_ERR(1, 3, __pyx_L3_error)
+        else goto __pyx_L5_argtuple_error;
+      }
+      if (unlikely(kw_args > 0)) { /* keywords remain that were not consumed above; hand them to the generic keyword parser */
+        const Py_ssize_t kwd_pos_args = __pyx_nargs;
+        if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_kwvalues, __pyx_pyargnames, 0, values + 0, kwd_pos_args, "__setstate_cython__") < 0)) __PYX_ERR(1, 3, __pyx_L3_error)
+      }
+    } else if (unlikely(__pyx_nargs != 1)) { /* no keywords: exactly one positional argument is required */
+      goto __pyx_L5_argtuple_error;
+    } else {
+      values[0] = __Pyx_Arg_FASTCALL(__pyx_args, 0);
+    }
+    __pyx_v___pyx_state = values[0];
+  }
+  goto __pyx_L6_skip;
+  __pyx_L5_argtuple_error:; /* wrong argument count: raise, then fall into the shared error cleanup */
+  __Pyx_RaiseArgtupleInvalid("__setstate_cython__", 1, 1, 1, __pyx_nargs); __PYX_ERR(1, 3, __pyx_L3_error)
+  __pyx_L6_skip:;
+  goto __pyx_L4_argument_unpacking_done;
+  __pyx_L3_error:; /* unpacking failed: release collected values, add a traceback entry, return NULL */
+  {
+    Py_ssize_t __pyx_temp;
+    for (__pyx_temp=0; __pyx_temp < (Py_ssize_t)(sizeof(values)/sizeof(values[0])); ++__pyx_temp) {
+      __Pyx_Arg_XDECREF_FASTCALL(values[__pyx_temp]);
+    }
+  }
+  __Pyx_AddTraceback("View.MemoryView.array.__setstate_cython__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __Pyx_RefNannyFinishContext();
+  return NULL;
+  __pyx_L4_argument_unpacking_done:;
+  __pyx_r = __pyx_pf___pyx_array_2__setstate_cython__(((struct __pyx_array_obj *)__pyx_v_self), __pyx_v___pyx_state);
+
+  /* function exit code */
+  {
+    Py_ssize_t __pyx_temp;
+    for (__pyx_temp=0; __pyx_temp < (Py_ssize_t)(sizeof(values)/sizeof(values[0])); ++__pyx_temp) {
+      __Pyx_Arg_XDECREF_FASTCALL(values[__pyx_temp]);
+    }
+  }
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static PyObject *__pyx_pf___pyx_array_2__setstate_cython__(CYTHON_UNUSED struct __pyx_array_obj *__pyx_v_self, CYTHON_UNUSED PyObject *__pyx_v___pyx_state) { /* implementation of array.__setstate_cython__: unconditionally raises TypeError and returns NULL; both parameters are unused */
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("__setstate_cython__", 1);
+
+  /* "(tree fragment)":4
+ *     raise TypeError, "no default __reduce__ due to non-trivial __cinit__"
+ * def __setstate_cython__(self, __pyx_state):
+ *     raise TypeError, "no default __reduce__ due to non-trivial __cinit__"             # <<<<<<<<<<<<<<
+ */
+  __Pyx_Raise(__pyx_builtin_TypeError, __pyx_kp_s_no_default___reduce___due_to_non, 0, 0);
+  __PYX_ERR(1, 4, __pyx_L1_error)
+
+  /* "(tree fragment)":3
+ * def __reduce_cython__(self):
+ *     raise TypeError, "no default __reduce__ due to non-trivial __cinit__"
+ * def __setstate_cython__(self, __pyx_state):             # <<<<<<<<<<<<<<
+ *     raise TypeError, "no default __reduce__ due to non-trivial __cinit__"
+ */
+
+  /* function exit code */
+  __pyx_L1_error:; /* the only exit: this function has no success label */
+  __Pyx_AddTraceback("View.MemoryView.array.__setstate_cython__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":248
+ * 
+ * @cname("__pyx_array_allocate_buffer")
+ * cdef int _allocate_buffer(array self) except -1:             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+
+static int __pyx_array_allocate_buffer(struct __pyx_array_obj *__pyx_v_self) { /* mallocs self->len bytes into self->data and, for object dtypes, fills every slot with an owned Py_None; returns 0 on success, -1 with an exception set on failure */
+  Py_ssize_t __pyx_v_i;
+  PyObject **__pyx_v_p;
+  int __pyx_r;
+  int __pyx_t_1;
+  Py_ssize_t __pyx_t_2;
+  Py_ssize_t __pyx_t_3;
+  Py_ssize_t __pyx_t_4;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+
+  /* "View.MemoryView":254
+ *     cdef PyObject **p
+ * 
+ *     self.free_data = True             # <<<<<<<<<<<<<<
+ *     self.data = <char *>malloc(self.len)
+ *     if not self.data:
+ */
+  __pyx_v_self->free_data = 1; /* set before the allocation is attempted, so it stays 1 even if malloc fails below */
+
+  /* "View.MemoryView":255
+ * 
+ *     self.free_data = True
+ *     self.data = <char *>malloc(self.len)             # <<<<<<<<<<<<<<
+ *     if not self.data:
+ *         raise MemoryError, "unable to allocate array data."
+ */
+  __pyx_v_self->data = ((char *)malloc(__pyx_v_self->len)); /* the buffer is not zero-initialised; only the object-dtype branch below writes to it */
+
+  /* "View.MemoryView":256
+ *     self.free_data = True
+ *     self.data = <char *>malloc(self.len)
+ *     if not self.data:             # <<<<<<<<<<<<<<
+ *         raise MemoryError, "unable to allocate array data."
+ * 
+ */
+  __pyx_t_1 = (!(__pyx_v_self->data != 0));
+  if (unlikely(__pyx_t_1)) {
+
+    /* "View.MemoryView":257
+ *     self.data = <char *>malloc(self.len)
+ *     if not self.data:
+ *         raise MemoryError, "unable to allocate array data."             # <<<<<<<<<<<<<<
+ * 
+ *     if self.dtype_is_object:
+ */
+    __Pyx_Raise(__pyx_builtin_MemoryError, __pyx_kp_s_unable_to_allocate_array_data, 0, 0);
+    __PYX_ERR(1, 257, __pyx_L1_error)
+
+    /* "View.MemoryView":256
+ *     self.free_data = True
+ *     self.data = <char *>malloc(self.len)
+ *     if not self.data:             # <<<<<<<<<<<<<<
+ *         raise MemoryError, "unable to allocate array data."
+ * 
+ */
+  }
+
+  /* "View.MemoryView":259
+ *         raise MemoryError, "unable to allocate array data."
+ * 
+ *     if self.dtype_is_object:             # <<<<<<<<<<<<<<
+ *         p = <PyObject **> self.data
+ *         for i in range(self.len // self.itemsize):
+ */
+  if (__pyx_v_self->dtype_is_object) {
+
+    /* "View.MemoryView":260
+ * 
+ *     if self.dtype_is_object:
+ *         p = <PyObject **> self.data             # <<<<<<<<<<<<<<
+ *         for i in range(self.len // self.itemsize):
+ *             p[i] = Py_None
+ */
+    __pyx_v_p = ((PyObject **)__pyx_v_self->data); /* view the raw buffer as an array of object pointers */
+
+    /* "View.MemoryView":261
+ *     if self.dtype_is_object:
+ *         p = <PyObject **> self.data
+ *         for i in range(self.len // self.itemsize):             # <<<<<<<<<<<<<<
+ *             p[i] = Py_None
+ *             Py_INCREF(Py_None)
+ */
+    if (unlikely(__pyx_v_self->itemsize == 0)) { /* guard the division below against a zero itemsize */
+      PyErr_SetString(PyExc_ZeroDivisionError, "integer division or modulo by zero");
+      __PYX_ERR(1, 261, __pyx_L1_error)
+    }
+    else if (sizeof(Py_ssize_t) == sizeof(long) && (!(((Py_ssize_t)-1) > 0)) && unlikely(__pyx_v_self->itemsize == (Py_ssize_t)-1)  && unlikely(__Pyx_UNARY_NEG_WOULD_OVERFLOW(__pyx_v_self->len))) { /* guard the division by -1 case where negating len would overflow */
+      PyErr_SetString(PyExc_OverflowError, "value too large to perform division");
+      __PYX_ERR(1, 261, __pyx_L1_error)
+    }
+    __pyx_t_2 = __Pyx_div_Py_ssize_t(__pyx_v_self->len, __pyx_v_self->itemsize); /* slot count: len // itemsize, evaluated once before the loop */
+    __pyx_t_3 = __pyx_t_2;
+    for (__pyx_t_4 = 0; __pyx_t_4 < __pyx_t_3; __pyx_t_4+=1) {
+      __pyx_v_i = __pyx_t_4;
+
+      /* "View.MemoryView":262
+ *         p = <PyObject **> self.data
+ *         for i in range(self.len // self.itemsize):
+ *             p[i] = Py_None             # <<<<<<<<<<<<<<
+ *             Py_INCREF(Py_None)
+ *     return 0
+ */
+      (__pyx_v_p[__pyx_v_i]) = Py_None;
+
+      /* "View.MemoryView":263
+ *         for i in range(self.len // self.itemsize):
+ *             p[i] = Py_None
+ *             Py_INCREF(Py_None)             # <<<<<<<<<<<<<<
+ *     return 0
+ * 
+ */
+      Py_INCREF(Py_None); /* one reference per slot, taken manually because p[i] is a raw pointer store */
+    }
+
+    /* "View.MemoryView":259
+ *         raise MemoryError, "unable to allocate array data."
+ * 
+ *     if self.dtype_is_object:             # <<<<<<<<<<<<<<
+ *         p = <PyObject **> self.data
+ *         for i in range(self.len // self.itemsize):
+ */
+  }
+
+  /* "View.MemoryView":264
+ *             p[i] = Py_None
+ *             Py_INCREF(Py_None)
+ *     return 0             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  __pyx_r = 0;
+  goto __pyx_L0;
+
+  /* "View.MemoryView":248
+ * 
+ * @cname("__pyx_array_allocate_buffer")
+ * cdef int _allocate_buffer(array self) except -1:             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+
+  /* function exit code */
+  __pyx_L1_error:; /* error path: add a traceback entry and report -1; self->data is not freed or reset here */
+  __Pyx_AddTraceback("View.MemoryView._allocate_buffer", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = -1;
+  __pyx_L0:;
+  return __pyx_r;
+}
+
+/* "View.MemoryView":268
+ * 
+ * @cname("__pyx_array_new")
+ * cdef array array_cwrapper(tuple shape, Py_ssize_t itemsize, char *format, char *c_mode, char *buf):             # <<<<<<<<<<<<<<
+ *     cdef array result
+ *     cdef str mode = "fortran" if c_mode[0] == b'f' else "c"  # this often comes from a constant C string.
+ */
+
+static struct __pyx_array_obj *__pyx_array_new(PyObject *__pyx_v_shape, Py_ssize_t __pyx_v_itemsize, char *__pyx_v_format, char *__pyx_v_c_mode, char *__pyx_v_buf) { /* C-level constructor for the array type: builds it via __pyx_tp_new_array and, when buf is non-NULL, passes allocate_buffer=False and points result->data at buf; returns a new reference, or 0 with a traceback added on failure */
+  struct __pyx_array_obj *__pyx_v_result = 0;
+  PyObject *__pyx_v_mode = 0;
+  struct __pyx_array_obj *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  int __pyx_t_2;
+  PyObject *__pyx_t_3 = NULL;
+  PyObject *__pyx_t_4 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("array_cwrapper", 1);
+
+  /* "View.MemoryView":270
+ * cdef array array_cwrapper(tuple shape, Py_ssize_t itemsize, char *format, char *c_mode, char *buf):
+ *     cdef array result
+ *     cdef str mode = "fortran" if c_mode[0] == b'f' else "c"  # this often comes from a constant C string.             # <<<<<<<<<<<<<<
+ * 
+ *     if buf is NULL:
+ */
+  __pyx_t_2 = ((__pyx_v_c_mode[0]) == 'f'); /* only the first character of c_mode is inspected; c_mode is dereferenced without a NULL check */
+  if (__pyx_t_2) {
+    __Pyx_INCREF(__pyx_n_s_fortran);
+    __pyx_t_1 = __pyx_n_s_fortran;
+  } else {
+    __Pyx_INCREF(__pyx_n_s_c);
+    __pyx_t_1 = __pyx_n_s_c;
+  }
+  __pyx_v_mode = ((PyObject*)__pyx_t_1); /* mode now holds the reference taken above; it is released at __pyx_L0 */
+  __pyx_t_1 = 0;
+
+  /* "View.MemoryView":272
+ *     cdef str mode = "fortran" if c_mode[0] == b'f' else "c"  # this often comes from a constant C string.
+ * 
+ *     if buf is NULL:             # <<<<<<<<<<<<<<
+ *         result = array.__new__(array, shape, itemsize, format, mode)
+ *     else:
+ */
+  __pyx_t_2 = (__pyx_v_buf == NULL);
+  if (__pyx_t_2) {
+
+    /* "View.MemoryView":273
+ * 
+ *     if buf is NULL:
+ *         result = array.__new__(array, shape, itemsize, format, mode)             # <<<<<<<<<<<<<<
+ *     else:
+ *         result = array.__new__(array, shape, itemsize, format, mode, allocate_buffer=False)
+ */
+    __pyx_t_1 = PyInt_FromSsize_t(__pyx_v_itemsize); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 273, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_1);
+    __pyx_t_3 = __Pyx_PyBytes_FromString(__pyx_v_format); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 273, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_3);
+    __pyx_t_4 = PyTuple_New(4); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 273, __pyx_L1_error) /* positional args tuple: (shape, itemsize, format, mode) */
+    __Pyx_GOTREF(__pyx_t_4);
+    __Pyx_INCREF(__pyx_v_shape);
+    __Pyx_GIVEREF(__pyx_v_shape);
+    if (__Pyx_PyTuple_SET_ITEM(__pyx_t_4, 0, __pyx_v_shape)) __PYX_ERR(1, 273, __pyx_L1_error);
+    __Pyx_GIVEREF(__pyx_t_1);
+    if (__Pyx_PyTuple_SET_ITEM(__pyx_t_4, 1, __pyx_t_1)) __PYX_ERR(1, 273, __pyx_L1_error);
+    __Pyx_GIVEREF(__pyx_t_3);
+    if (__Pyx_PyTuple_SET_ITEM(__pyx_t_4, 2, __pyx_t_3)) __PYX_ERR(1, 273, __pyx_L1_error);
+    __Pyx_INCREF(__pyx_v_mode);
+    __Pyx_GIVEREF(__pyx_v_mode);
+    if (__Pyx_PyTuple_SET_ITEM(__pyx_t_4, 3, __pyx_v_mode)) __PYX_ERR(1, 273, __pyx_L1_error);
+    __pyx_t_1 = 0; /* temps stored in the tuple are cleared so the XDECREFs at __pyx_L1_error skip them */
+    __pyx_t_3 = 0;
+    __pyx_t_3 = ((PyObject *)__pyx_tp_new_array(((PyTypeObject *)__pyx_array_type), __pyx_t_4, NULL)); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 273, __pyx_L1_error) /* NULL kwargs: allocate_buffer is left at whatever the constructor defaults to */
+    __Pyx_GOTREF((PyObject *)__pyx_t_3);
+    __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+    __pyx_v_result = ((struct __pyx_array_obj *)__pyx_t_3);
+    __pyx_t_3 = 0;
+
+    /* "View.MemoryView":272
+ *     cdef str mode = "fortran" if c_mode[0] == b'f' else "c"  # this often comes from a constant C string.
+ * 
+ *     if buf is NULL:             # <<<<<<<<<<<<<<
+ *         result = array.__new__(array, shape, itemsize, format, mode)
+ *     else:
+ */
+    goto __pyx_L3;
+  }
+
+  /* "View.MemoryView":275
+ *         result = array.__new__(array, shape, itemsize, format, mode)
+ *     else:
+ *         result = array.__new__(array, shape, itemsize, format, mode, allocate_buffer=False)             # <<<<<<<<<<<<<<
+ *         result.data = buf
+ * 
+ */
+  /*else*/ {
+    __pyx_t_3 = PyInt_FromSsize_t(__pyx_v_itemsize); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 275, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_3);
+    __pyx_t_4 = __Pyx_PyBytes_FromString(__pyx_v_format); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 275, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_4);
+    __pyx_t_1 = PyTuple_New(4); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 275, __pyx_L1_error) /* same positional args as the other branch, built with different temporaries */
+    __Pyx_GOTREF(__pyx_t_1);
+    __Pyx_INCREF(__pyx_v_shape);
+    __Pyx_GIVEREF(__pyx_v_shape);
+    if (__Pyx_PyTuple_SET_ITEM(__pyx_t_1, 0, __pyx_v_shape)) __PYX_ERR(1, 275, __pyx_L1_error);
+    __Pyx_GIVEREF(__pyx_t_3);
+    if (__Pyx_PyTuple_SET_ITEM(__pyx_t_1, 1, __pyx_t_3)) __PYX_ERR(1, 275, __pyx_L1_error);
+    __Pyx_GIVEREF(__pyx_t_4);
+    if (__Pyx_PyTuple_SET_ITEM(__pyx_t_1, 2, __pyx_t_4)) __PYX_ERR(1, 275, __pyx_L1_error);
+    __Pyx_INCREF(__pyx_v_mode);
+    __Pyx_GIVEREF(__pyx_v_mode);
+    if (__Pyx_PyTuple_SET_ITEM(__pyx_t_1, 3, __pyx_v_mode)) __PYX_ERR(1, 275, __pyx_L1_error);
+    __pyx_t_3 = 0; /* temps stored in the tuple are cleared so the XDECREFs at __pyx_L1_error skip them */
+    __pyx_t_4 = 0;
+    __pyx_t_4 = __Pyx_PyDict_NewPresized(1); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 275, __pyx_L1_error) /* keyword dict carrying the single allocate_buffer entry */
+    __Pyx_GOTREF(__pyx_t_4);
+    if (PyDict_SetItem(__pyx_t_4, __pyx_n_s_allocate_buffer, Py_False) < 0) __PYX_ERR(1, 275, __pyx_L1_error)
+    __pyx_t_3 = ((PyObject *)__pyx_tp_new_array(((PyTypeObject *)__pyx_array_type), __pyx_t_1, __pyx_t_4)); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 275, __pyx_L1_error)
+    __Pyx_GOTREF((PyObject *)__pyx_t_3);
+    __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+    __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+    __pyx_v_result = ((struct __pyx_array_obj *)__pyx_t_3);
+    __pyx_t_3 = 0;
+
+    /* "View.MemoryView":276
+ *     else:
+ *         result = array.__new__(array, shape, itemsize, format, mode, allocate_buffer=False)
+ *         result.data = buf             # <<<<<<<<<<<<<<
+ * 
+ *     return result
+ */
+    __pyx_v_result->data = __pyx_v_buf; /* the caller's pointer is stored as-is; nothing is copied here */
+  }
+  __pyx_L3:;
+
+  /* "View.MemoryView":278
+ *         result.data = buf
+ * 
+ *     return result             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  __Pyx_XDECREF((PyObject *)__pyx_r);
+  __Pyx_INCREF((PyObject *)__pyx_v_result); /* the return value gets its own reference; the local's reference is dropped at __pyx_L0 */
+  __pyx_r = __pyx_v_result;
+  goto __pyx_L0;
+
+  /* "View.MemoryView":268
+ * 
+ * @cname("__pyx_array_new")
+ * cdef array array_cwrapper(tuple shape, Py_ssize_t itemsize, char *format, char *c_mode, char *buf):             # <<<<<<<<<<<<<<
+ *     cdef array result
+ *     cdef str mode = "fortran" if c_mode[0] == b'f' else "c"  # this often comes from a constant C string.
+ */
+
+  /* function exit code */
+  __pyx_L1_error:; /* error path: drop any live temporaries, add a traceback entry, return 0 */
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_XDECREF(__pyx_t_3);
+  __Pyx_XDECREF(__pyx_t_4);
+  __Pyx_AddTraceback("View.MemoryView.array_cwrapper", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = 0;
+  __pyx_L0:; /* shared cleanup for both paths: release the locals result and mode */
+  __Pyx_XDECREF((PyObject *)__pyx_v_result);
+  __Pyx_XDECREF(__pyx_v_mode);
+  __Pyx_XGIVEREF((PyObject *)__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":304
+ * cdef class Enum(object):
+ *     cdef object name
+ *     def __init__(self, name):             # <<<<<<<<<<<<<<
+ *         self.name = name
+ *     def __repr__(self):
+ */
+
+/* Python wrapper for Enum.__init__(name): unpacks exactly one argument (positional or keyword) into values[0], then calls the implementation. Returns the implementation's int status, or -1 if unpacking fails. */
+static int __pyx_MemviewEnum___init__(PyObject *__pyx_v_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/
+static int __pyx_MemviewEnum___init__(PyObject *__pyx_v_self, PyObject *__pyx_args, PyObject *__pyx_kwds) {
+  PyObject *__pyx_v_name = 0;
+  CYTHON_UNUSED Py_ssize_t __pyx_nargs;
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject* values[1] = {0}; /* one slot per declared argument; released via __Pyx_Arg_XDECREF_VARARGS on both exit paths */
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  int __pyx_r;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__init__ (wrapper)", 0);
+  #if CYTHON_ASSUME_SAFE_MACROS
+  __pyx_nargs = PyTuple_GET_SIZE(__pyx_args);
+  #else
+  __pyx_nargs = PyTuple_Size(__pyx_args); if (unlikely(__pyx_nargs < 0)) return -1;
+  #endif
+  __pyx_kwvalues = __Pyx_KwValues_VARARGS(__pyx_args, __pyx_nargs);
+  {
+    PyObject **__pyx_pyargnames[] = {&__pyx_n_s_name,0}; /* 0-terminated list of accepted keyword names */
+    if (__pyx_kwds) {
+      Py_ssize_t kw_args;
+      switch (__pyx_nargs) { /* first pass: take whatever was passed positionally (0 or 1 argument) */
+        case  1: values[0] = __Pyx_Arg_VARARGS(__pyx_args, 0);
+        CYTHON_FALLTHROUGH;
+        case  0: break;
+        default: goto __pyx_L5_argtuple_error;
+      }
+      kw_args = __Pyx_NumKwargs_VARARGS(__pyx_kwds);
+      switch (__pyx_nargs) { /* second pass: if nothing was positional, the argument must come from the keywords */
+        case  0:
+        if (likely((values[0] = __Pyx_GetKwValue_VARARGS(__pyx_kwds, __pyx_kwvalues, __pyx_n_s_name)) != 0)) {
+          (void)__Pyx_Arg_NewRef_VARARGS(values[0]);
+          kw_args--;
+        }
+        else if (unlikely(PyErr_Occurred())) __PYX_ERR(1, 304, __pyx_L3_error)
+        else goto __pyx_L5_argtuple_error;
+      }
+      if (unlikely(kw_args > 0)) { /* keywords remain that were not consumed above; hand them to the generic keyword parser */
+        const Py_ssize_t kwd_pos_args = __pyx_nargs;
+        if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_kwvalues, __pyx_pyargnames, 0, values + 0, kwd_pos_args, "__init__") < 0)) __PYX_ERR(1, 304, __pyx_L3_error)
+      }
+    } else if (unlikely(__pyx_nargs != 1)) { /* no keywords: exactly one positional argument is required */
+      goto __pyx_L5_argtuple_error;
+    } else {
+      values[0] = __Pyx_Arg_VARARGS(__pyx_args, 0);
+    }
+    __pyx_v_name = values[0];
+  }
+  goto __pyx_L6_skip;
+  __pyx_L5_argtuple_error:; /* wrong argument count: raise, then fall into the shared error cleanup */
+  __Pyx_RaiseArgtupleInvalid("__init__", 1, 1, 1, __pyx_nargs); __PYX_ERR(1, 304, __pyx_L3_error)
+  __pyx_L6_skip:;
+  goto __pyx_L4_argument_unpacking_done;
+  __pyx_L3_error:; /* unpacking failed: release collected values, add a traceback entry, return -1 */
+  {
+    Py_ssize_t __pyx_temp;
+    for (__pyx_temp=0; __pyx_temp < (Py_ssize_t)(sizeof(values)/sizeof(values[0])); ++__pyx_temp) {
+      __Pyx_Arg_XDECREF_VARARGS(values[__pyx_temp]);
+    }
+  }
+  __Pyx_AddTraceback("View.MemoryView.Enum.__init__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __Pyx_RefNannyFinishContext();
+  return -1;
+  __pyx_L4_argument_unpacking_done:;
+  __pyx_r = __pyx_MemviewEnum___pyx_pf_15View_dot_MemoryView_4Enum___init__(((struct __pyx_MemviewEnum_obj *)__pyx_v_self), __pyx_v_name);
+
+  /* function exit code */
+  {
+    Py_ssize_t __pyx_temp;
+    for (__pyx_temp=0; __pyx_temp < (Py_ssize_t)(sizeof(values)/sizeof(values[0])); ++__pyx_temp) {
+      __Pyx_Arg_XDECREF_VARARGS(values[__pyx_temp]);
+    }
+  }
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static int __pyx_MemviewEnum___pyx_pf_15View_dot_MemoryView_4Enum___init__(struct __pyx_MemviewEnum_obj *__pyx_v_self, PyObject *__pyx_v_name) { /* Body of Enum.__init__(self, name): stores name in self->name; always returns 0. */
+  int __pyx_r;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__init__", 1);
+
+  /* "View.MemoryView":305
+ *     cdef object name
+ *     def __init__(self, name):
+ *         self.name = name             # <<<<<<<<<<<<<<
+ *     def __repr__(self):
+ *         return self.name
+ */
+  __Pyx_INCREF(__pyx_v_name); /* the attribute slot keeps its own reference to name */
+  __Pyx_GIVEREF(__pyx_v_name);
+  __Pyx_GOTREF(__pyx_v_self->name);
+  __Pyx_DECREF(__pyx_v_self->name); /* release the previous value; non-X variant, so the slot is presumably never NULL here (initialised outside this function - TODO confirm) */
+  __pyx_v_self->name = __pyx_v_name;
+
+  /* "View.MemoryView":304
+ * cdef class Enum(object):
+ *     cdef object name
+ *     def __init__(self, name):             # <<<<<<<<<<<<<<
+ *         self.name = name
+ *     def __repr__(self):
+ */
+
+  /* function exit code */
+  __pyx_r = 0; /* no failure path in this function */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":306
+ *     def __init__(self, name):
+ *         self.name = name
+ *     def __repr__(self):             # <<<<<<<<<<<<<<
+ *         return self.name
+ * 
+ */
+
+/* Python wrapper */
+static PyObject *__pyx_MemviewEnum___repr__(PyObject *__pyx_v_self); /*proto*/
+static PyObject *__pyx_MemviewEnum___repr__(PyObject *__pyx_v_self) { /* Python-level wrapper for Enum.__repr__: casts self and forwards to the implementation function. */
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__repr__ (wrapper)", 0);
+  __pyx_kwvalues = __Pyx_KwValues_VARARGS(__pyx_args, __pyx_nargs); /* NOTE(review): __pyx_args/__pyx_nargs are not declared in this function; this compiles only if __Pyx_KwValues_VARARGS is a macro that discards its arguments - confirm against its definition */
+  __pyx_r = __pyx_MemviewEnum___pyx_pf_15View_dot_MemoryView_4Enum_2__repr__(((struct __pyx_MemviewEnum_obj *)__pyx_v_self));
+
+  /* function exit code */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r; /* whatever the implementation returned, passed through unchanged */
+}
+
+static PyObject *__pyx_MemviewEnum___pyx_pf_15View_dot_MemoryView_4Enum_2__repr__(struct __pyx_MemviewEnum_obj *__pyx_v_self) { /* Body of Enum.__repr__(self): returns a new reference to self->name. */
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__repr__", 1);
+
+  /* "View.MemoryView":307
+ *         self.name = name
+ *     def __repr__(self):
+ *         return self.name             # <<<<<<<<<<<<<<
+ * 
+ * cdef generic = Enum("<strided and direct or indirect>")
+ */
+  __Pyx_XDECREF(__pyx_r); /* __pyx_r is still NULL at this point, hence the X (NULL-tolerant) variant */
+  __Pyx_INCREF(__pyx_v_self->name); /* the caller receives its own reference */
+  __pyx_r = __pyx_v_self->name;
+  goto __pyx_L0;
+
+  /* "View.MemoryView":306
+ *     def __init__(self, name):
+ *         self.name = name
+ *     def __repr__(self):             # <<<<<<<<<<<<<<
+ *         return self.name
+ * 
+ */
+
+  /* function exit code */
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "(tree fragment)":1
+ * def __reduce_cython__(self):             # <<<<<<<<<<<<<<
+ *     cdef tuple state
+ *     cdef object _dict
+ */
+
+/* Python wrapper */
+static PyObject *__pyx_pw___pyx_MemviewEnum_1__reduce_cython__(PyObject *__pyx_v_self, 
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+); /*proto*/
+static PyObject *__pyx_pw___pyx_MemviewEnum_1__reduce_cython__(PyObject *__pyx_v_self, 
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+) { /* Python-level wrapper for Enum.__reduce_cython__(self): rejects any positional argument, validates keywords, then calls the implementation. Returns NULL on argument errors. */
+  #if !CYTHON_METH_FASTCALL
+  CYTHON_UNUSED Py_ssize_t __pyx_nargs;
+  #endif
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__reduce_cython__ (wrapper)", 0);
+  #if !CYTHON_METH_FASTCALL
+  #if CYTHON_ASSUME_SAFE_MACROS
+  __pyx_nargs = PyTuple_GET_SIZE(__pyx_args);
+  #else
+  __pyx_nargs = PyTuple_Size(__pyx_args); if (unlikely(__pyx_nargs < 0)) return NULL;
+  #endif
+  #endif
+  __pyx_kwvalues = __Pyx_KwValues_FASTCALL(__pyx_args, __pyx_nargs);
+  if (unlikely(__pyx_nargs > 0)) { /* the method takes no arguments besides self */
+    __Pyx_RaiseArgtupleInvalid("__reduce_cython__", 1, 0, 0, __pyx_nargs); return NULL;} /* NOTE(review): this early return skips __Pyx_RefNannyFinishContext() */
+  if (unlikely(__pyx_kwds) && __Pyx_NumKwargs_FASTCALL(__pyx_kwds) && unlikely(!__Pyx_CheckKeywordStrings(__pyx_kwds, "__reduce_cython__", 0))) return NULL; /* non-empty keywords are handed to __Pyx_CheckKeywordStrings; a false result aborts the call */
+  __pyx_r = __pyx_pf___pyx_MemviewEnum___reduce_cython__(((struct __pyx_MemviewEnum_obj *)__pyx_v_self));
+
+  /* function exit code */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static PyObject *__pyx_pf___pyx_MemviewEnum___reduce_cython__(struct __pyx_MemviewEnum_obj *__pyx_v_self) { /* Body of Enum.__reduce_cython__(self): builds the reduce tuple for pickling. Returns a new tuple reference, or NULL after adding a traceback entry on failure. */
+  PyObject *__pyx_v_state = 0;
+  PyObject *__pyx_v__dict = 0;
+  int __pyx_v_use_setstate;
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  int __pyx_t_2;
+  PyObject *__pyx_t_3 = NULL;
+  PyObject *__pyx_t_4 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("__reduce_cython__", 1);
+
+  /* "(tree fragment)":5
+ *     cdef object _dict
+ *     cdef bint use_setstate
+ *     state = (self.name,)             # <<<<<<<<<<<<<<
+ *     _dict = getattr(self, '__dict__', None)
+ *     if _dict is not None:
+ */
+  __pyx_t_1 = PyTuple_New(1); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 5, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __Pyx_INCREF(__pyx_v_self->name); /* extra reference for the tuple slot filled below (SET_ITEM presumably steals it, like PyTuple_SET_ITEM) */
+  __Pyx_GIVEREF(__pyx_v_self->name);
+  if (__Pyx_PyTuple_SET_ITEM(__pyx_t_1, 0, __pyx_v_self->name)) __PYX_ERR(1, 5, __pyx_L1_error);
+  __pyx_v_state = ((PyObject*)__pyx_t_1);
+  __pyx_t_1 = 0; /* ownership moved to __pyx_v_state; clear the temp so the error path does not release it twice */
+
+  /* "(tree fragment)":6
+ *     cdef bint use_setstate
+ *     state = (self.name,)
+ *     _dict = getattr(self, '__dict__', None)             # <<<<<<<<<<<<<<
+ *     if _dict is not None:
+ *         state += (_dict,)
+ */
+  __pyx_t_1 = __Pyx_GetAttr3(((PyObject *)__pyx_v_self), __pyx_n_s_dict, Py_None); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 6, __pyx_L1_error) /* three-argument getattr with Py_None as the default */
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_v__dict = __pyx_t_1;
+  __pyx_t_1 = 0;
+
+  /* "(tree fragment)":7
+ *     state = (self.name,)
+ *     _dict = getattr(self, '__dict__', None)
+ *     if _dict is not None:             # <<<<<<<<<<<<<<
+ *         state += (_dict,)
+ *         use_setstate = True
+ */
+  __pyx_t_2 = (__pyx_v__dict != Py_None); /* identity comparison against the None singleton */
+  if (__pyx_t_2) {
+
+    /* "(tree fragment)":8
+ *     _dict = getattr(self, '__dict__', None)
+ *     if _dict is not None:
+ *         state += (_dict,)             # <<<<<<<<<<<<<<
+ *         use_setstate = True
+ *     else:
+ */
+    __pyx_t_1 = PyTuple_New(1); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 8, __pyx_L1_error) /* temporary one-element tuple (_dict,) */
+    __Pyx_GOTREF(__pyx_t_1);
+    __Pyx_INCREF(__pyx_v__dict);
+    __Pyx_GIVEREF(__pyx_v__dict);
+    if (__Pyx_PyTuple_SET_ITEM(__pyx_t_1, 0, __pyx_v__dict)) __PYX_ERR(1, 8, __pyx_L1_error);
+    __pyx_t_3 = PyNumber_InPlaceAdd(__pyx_v_state, __pyx_t_1); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 8, __pyx_L1_error) /* result of state += (_dict,) lands in __pyx_t_3 */
+    __Pyx_GOTREF(__pyx_t_3);
+    __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; /* the temporary (_dict,) tuple is no longer needed */
+    __Pyx_DECREF_SET(__pyx_v_state, ((PyObject*)__pyx_t_3)); /* rebind state to the concatenated tuple (the macro presumably releases the old value) */
+    __pyx_t_3 = 0;
+
+    /* "(tree fragment)":9
+ *     if _dict is not None:
+ *         state += (_dict,)
+ *         use_setstate = True             # <<<<<<<<<<<<<<
+ *     else:
+ *         use_setstate = self.name is not None
+ */
+    __pyx_v_use_setstate = 1;
+
+    /* "(tree fragment)":7
+ *     state = (self.name,)
+ *     _dict = getattr(self, '__dict__', None)
+ *     if _dict is not None:             # <<<<<<<<<<<<<<
+ *         state += (_dict,)
+ *         use_setstate = True
+ */
+    goto __pyx_L3;
+  }
+
+  /* "(tree fragment)":11
+ *         use_setstate = True
+ *     else:
+ *         use_setstate = self.name is not None             # <<<<<<<<<<<<<<
+ *     if use_setstate:
+ *         return __pyx_unpickle_Enum, (type(self), 0x82a3537, None), state
+ */
+  /*else*/ {
+    __pyx_t_2 = (__pyx_v_self->name != Py_None); /* without a __dict__, use_setstate depends only on whether name is None */
+    __pyx_v_use_setstate = __pyx_t_2;
+  }
+  __pyx_L3:;
+
+  /* "(tree fragment)":12
+ *     else:
+ *         use_setstate = self.name is not None
+ *     if use_setstate:             # <<<<<<<<<<<<<<
+ *         return __pyx_unpickle_Enum, (type(self), 0x82a3537, None), state
+ *     else:
+ */
+  if (__pyx_v_use_setstate) {
+
+    /* "(tree fragment)":13
+ *         use_setstate = self.name is not None
+ *     if use_setstate:
+ *         return __pyx_unpickle_Enum, (type(self), 0x82a3537, None), state             # <<<<<<<<<<<<<<
+ *     else:
+ *         return __pyx_unpickle_Enum, (type(self), 0x82a3537, state)
+ */
+    __Pyx_XDECREF(__pyx_r);
+    __Pyx_GetModuleGlobalName(__pyx_t_3, __pyx_n_s_pyx_unpickle_Enum); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 13, __pyx_L1_error) /* fetch the module-level __pyx_unpickle_Enum callable */
+    __Pyx_GOTREF(__pyx_t_3);
+    __pyx_t_1 = PyTuple_New(3); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 13, __pyx_L1_error) /* inner tuple: (type(self), 0x82a3537, None) */
+    __Pyx_GOTREF(__pyx_t_1);
+    __Pyx_INCREF(((PyObject *)Py_TYPE(((PyObject *)__pyx_v_self))));
+    __Pyx_GIVEREF(((PyObject *)Py_TYPE(((PyObject *)__pyx_v_self))));
+    if (__Pyx_PyTuple_SET_ITEM(__pyx_t_1, 0, ((PyObject *)Py_TYPE(((PyObject *)__pyx_v_self))))) __PYX_ERR(1, 13, __pyx_L1_error);
+    __Pyx_INCREF(__pyx_int_136983863); /* 136983863 == 0x82a3537, the constant shown in the Cython source line above */
+    __Pyx_GIVEREF(__pyx_int_136983863);
+    if (__Pyx_PyTuple_SET_ITEM(__pyx_t_1, 1, __pyx_int_136983863)) __PYX_ERR(1, 13, __pyx_L1_error);
+    __Pyx_INCREF(Py_None);
+    __Pyx_GIVEREF(Py_None);
+    if (__Pyx_PyTuple_SET_ITEM(__pyx_t_1, 2, Py_None)) __PYX_ERR(1, 13, __pyx_L1_error);
+    __pyx_t_4 = PyTuple_New(3); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 13, __pyx_L1_error) /* outer 3-tuple: (callable, args, state) */
+    __Pyx_GOTREF(__pyx_t_4);
+    __Pyx_GIVEREF(__pyx_t_3);
+    if (__Pyx_PyTuple_SET_ITEM(__pyx_t_4, 0, __pyx_t_3)) __PYX_ERR(1, 13, __pyx_L1_error);
+    __Pyx_GIVEREF(__pyx_t_1);
+    if (__Pyx_PyTuple_SET_ITEM(__pyx_t_4, 1, __pyx_t_1)) __PYX_ERR(1, 13, __pyx_L1_error);
+    __Pyx_INCREF(__pyx_v_state); /* state is still released at __pyx_L0, so the tuple gets its own reference */
+    __Pyx_GIVEREF(__pyx_v_state);
+    if (__Pyx_PyTuple_SET_ITEM(__pyx_t_4, 2, __pyx_v_state)) __PYX_ERR(1, 13, __pyx_L1_error);
+    __pyx_t_3 = 0; /* these temps were stored in the outer tuple without an INCREF; clear them so nothing else releases them */
+    __pyx_t_1 = 0;
+    __pyx_r = __pyx_t_4;
+    __pyx_t_4 = 0;
+    goto __pyx_L0;
+
+    /* "(tree fragment)":12
+ *     else:
+ *         use_setstate = self.name is not None
+ *     if use_setstate:             # <<<<<<<<<<<<<<
+ *         return __pyx_unpickle_Enum, (type(self), 0x82a3537, None), state
+ *     else:
+ */
+  }
+
+  /* "(tree fragment)":15
+ *         return __pyx_unpickle_Enum, (type(self), 0x82a3537, None), state
+ *     else:
+ *         return __pyx_unpickle_Enum, (type(self), 0x82a3537, state)             # <<<<<<<<<<<<<<
+ * def __setstate_cython__(self, __pyx_state):
+ *     __pyx_unpickle_Enum__set_state(self, __pyx_state)
+ */
+  /*else*/ {
+    __Pyx_XDECREF(__pyx_r);
+    __Pyx_GetModuleGlobalName(__pyx_t_4, __pyx_n_s_pyx_unpickle_Enum); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 15, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_4);
+    __pyx_t_1 = PyTuple_New(3); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 15, __pyx_L1_error) /* inner tuple: (type(self), 0x82a3537, state) - state travels inside the args here */
+    __Pyx_GOTREF(__pyx_t_1);
+    __Pyx_INCREF(((PyObject *)Py_TYPE(((PyObject *)__pyx_v_self))));
+    __Pyx_GIVEREF(((PyObject *)Py_TYPE(((PyObject *)__pyx_v_self))));
+    if (__Pyx_PyTuple_SET_ITEM(__pyx_t_1, 0, ((PyObject *)Py_TYPE(((PyObject *)__pyx_v_self))))) __PYX_ERR(1, 15, __pyx_L1_error);
+    __Pyx_INCREF(__pyx_int_136983863);
+    __Pyx_GIVEREF(__pyx_int_136983863);
+    if (__Pyx_PyTuple_SET_ITEM(__pyx_t_1, 1, __pyx_int_136983863)) __PYX_ERR(1, 15, __pyx_L1_error);
+    __Pyx_INCREF(__pyx_v_state);
+    __Pyx_GIVEREF(__pyx_v_state);
+    if (__Pyx_PyTuple_SET_ITEM(__pyx_t_1, 2, __pyx_v_state)) __PYX_ERR(1, 15, __pyx_L1_error);
+    __pyx_t_3 = PyTuple_New(2); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 15, __pyx_L1_error) /* outer 2-tuple: (callable, args) */
+    __Pyx_GOTREF(__pyx_t_3);
+    __Pyx_GIVEREF(__pyx_t_4);
+    if (__Pyx_PyTuple_SET_ITEM(__pyx_t_3, 0, __pyx_t_4)) __PYX_ERR(1, 15, __pyx_L1_error);
+    __Pyx_GIVEREF(__pyx_t_1);
+    if (__Pyx_PyTuple_SET_ITEM(__pyx_t_3, 1, __pyx_t_1)) __PYX_ERR(1, 15, __pyx_L1_error);
+    __pyx_t_4 = 0;
+    __pyx_t_1 = 0;
+    __pyx_r = __pyx_t_3;
+    __pyx_t_3 = 0;
+    goto __pyx_L0;
+  }
+
+  /* "(tree fragment)":1
+ * def __reduce_cython__(self):             # <<<<<<<<<<<<<<
+ *     cdef tuple state
+ *     cdef object _dict
+ */
+
+  /* function exit code */
+  __pyx_L1_error:; /* error path: release any temporaries that are still set, then fall through to the common exit */
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_XDECREF(__pyx_t_3);
+  __Pyx_XDECREF(__pyx_t_4);
+  __Pyx_AddTraceback("View.MemoryView.Enum.__reduce_cython__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __pyx_L0:; /* common exit for success and failure: drop the local references */
+  __Pyx_XDECREF(__pyx_v_state);
+  __Pyx_XDECREF(__pyx_v__dict);
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "(tree fragment)":16
+ *     else:
+ *         return __pyx_unpickle_Enum, (type(self), 0x82a3537, state)
+ * def __setstate_cython__(self, __pyx_state):             # <<<<<<<<<<<<<<
+ *     __pyx_unpickle_Enum__set_state(self, __pyx_state)
+ */
+
+/* Python wrapper */
+static PyObject *__pyx_pw___pyx_MemviewEnum_3__setstate_cython__(PyObject *__pyx_v_self, 
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+); /*proto*/
+static PyObject *__pyx_pw___pyx_MemviewEnum_3__setstate_cython__(PyObject *__pyx_v_self, 
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+) { /* Python-level wrapper for Enum.__setstate_cython__(self, __pyx_state): unpacks exactly one argument (positional or keyword) and calls the implementation. Returns NULL on argument errors. */
+  PyObject *__pyx_v___pyx_state = 0;
+  #if !CYTHON_METH_FASTCALL
+  CYTHON_UNUSED Py_ssize_t __pyx_nargs;
+  #endif
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject* values[1] = {0}; /* one slot per declared parameter */
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__setstate_cython__ (wrapper)", 0);
+  #if !CYTHON_METH_FASTCALL
+  #if CYTHON_ASSUME_SAFE_MACROS
+  __pyx_nargs = PyTuple_GET_SIZE(__pyx_args);
+  #else
+  __pyx_nargs = PyTuple_Size(__pyx_args); if (unlikely(__pyx_nargs < 0)) return NULL;
+  #endif
+  #endif
+  __pyx_kwvalues = __Pyx_KwValues_FASTCALL(__pyx_args, __pyx_nargs);
+  {
+    PyObject **__pyx_pyargnames[] = {&__pyx_n_s_pyx_state,0}; /* 0-terminated list of accepted keyword names */
+    if (__pyx_kwds) {
+      Py_ssize_t kw_args;
+      switch (__pyx_nargs) { /* first pass: collect positional arguments; more than one is an error */
+        case  1: values[0] = __Pyx_Arg_FASTCALL(__pyx_args, 0);
+        CYTHON_FALLTHROUGH;
+        case  0: break;
+        default: goto __pyx_L5_argtuple_error;
+      }
+      kw_args = __Pyx_NumKwargs_FASTCALL(__pyx_kwds);
+      switch (__pyx_nargs) { /* second pass: fill parameters not given positionally from keywords */
+        case  0:
+        if (likely((values[0] = __Pyx_GetKwValue_FASTCALL(__pyx_kwds, __pyx_kwvalues, __pyx_n_s_pyx_state)) != 0)) {
+          (void)__Pyx_Arg_NewRef_FASTCALL(values[0]);
+          kw_args--;
+        }
+        else if (unlikely(PyErr_Occurred())) __PYX_ERR(1, 16, __pyx_L3_error)
+        else goto __pyx_L5_argtuple_error; /* required argument missing and no lookup error pending */
+      }
+      if (unlikely(kw_args > 0)) { /* keywords remain that were not consumed above */
+        const Py_ssize_t kwd_pos_args = __pyx_nargs;
+        if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_kwvalues, __pyx_pyargnames, 0, values + 0, kwd_pos_args, "__setstate_cython__") < 0)) __PYX_ERR(1, 16, __pyx_L3_error)
+      }
+    } else if (unlikely(__pyx_nargs != 1)) { /* no keywords: exactly one positional argument is required */
+      goto __pyx_L5_argtuple_error;
+    } else {
+      values[0] = __Pyx_Arg_FASTCALL(__pyx_args, 0);
+    }
+    __pyx_v___pyx_state = values[0];
+  }
+  goto __pyx_L6_skip;
+  __pyx_L5_argtuple_error:;
+  __Pyx_RaiseArgtupleInvalid("__setstate_cython__", 1, 1, 1, __pyx_nargs); __PYX_ERR(1, 16, __pyx_L3_error)
+  __pyx_L6_skip:;
+  goto __pyx_L4_argument_unpacking_done;
+  __pyx_L3_error:; /* argument-parsing failure: release collected values, record a traceback entry, return NULL */
+  {
+    Py_ssize_t __pyx_temp;
+    for (__pyx_temp=0; __pyx_temp < (Py_ssize_t)(sizeof(values)/sizeof(values[0])); ++__pyx_temp) {
+      __Pyx_Arg_XDECREF_FASTCALL(values[__pyx_temp]);
+    }
+  }
+  __Pyx_AddTraceback("View.MemoryView.Enum.__setstate_cython__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __Pyx_RefNannyFinishContext();
+  return NULL;
+  __pyx_L4_argument_unpacking_done:;
+  __pyx_r = __pyx_pf___pyx_MemviewEnum_2__setstate_cython__(((struct __pyx_MemviewEnum_obj *)__pyx_v_self), __pyx_v___pyx_state);
+
+  /* function exit code */
+  {
+    Py_ssize_t __pyx_temp;
+    for (__pyx_temp=0; __pyx_temp < (Py_ssize_t)(sizeof(values)/sizeof(values[0])); ++__pyx_temp) {
+      __Pyx_Arg_XDECREF_FASTCALL(values[__pyx_temp]); /* same release loop as the error path, run after the call returns */
+    }
+  }
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static PyObject *__pyx_pf___pyx_MemviewEnum_2__setstate_cython__(struct __pyx_MemviewEnum_obj *__pyx_v_self, PyObject *__pyx_v___pyx_state) { /* Body of Enum.__setstate_cython__(self, __pyx_state): type-checks the state and forwards it to __pyx_unpickle_Enum__set_state. Returns a new reference to None, or NULL on error. */
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("__setstate_cython__", 1);
+
+  /* "(tree fragment)":17
+ *         return __pyx_unpickle_Enum, (type(self), 0x82a3537, state)
+ * def __setstate_cython__(self, __pyx_state):
+ *     __pyx_unpickle_Enum__set_state(self, __pyx_state)             # <<<<<<<<<<<<<<
+ */
+  if (!(likely(PyTuple_CheckExact(__pyx_v___pyx_state))||((__pyx_v___pyx_state) == Py_None) || __Pyx_RaiseUnexpectedTypeError("tuple", __pyx_v___pyx_state))) __PYX_ERR(1, 17, __pyx_L1_error) /* accept only an exact tuple or None; anything else goes through __Pyx_RaiseUnexpectedTypeError */
+  __pyx_t_1 = __pyx_unpickle_Enum__set_state(__pyx_v_self, ((PyObject*)__pyx_v___pyx_state)); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 17, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; /* the helper's return value is not used */
+
+  /* "(tree fragment)":16
+ *     else:
+ *         return __pyx_unpickle_Enum, (type(self), 0x82a3537, state)
+ * def __setstate_cython__(self, __pyx_state):             # <<<<<<<<<<<<<<
+ *     __pyx_unpickle_Enum__set_state(self, __pyx_state)
+ */
+
+  /* function exit code */
+  __pyx_r = Py_None; __Pyx_INCREF(Py_None); /* implicit "return None" */
+  goto __pyx_L0;
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_AddTraceback("View.MemoryView.Enum.__setstate_cython__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":349
+ *     cdef __Pyx_TypeInfo *typeinfo
+ * 
+ *     def __cinit__(memoryview self, object obj, int flags, bint dtype_is_object=False):             # <<<<<<<<<<<<<<
+ *         self.obj = obj
+ *         self.flags = flags
+ */
+
+/* Python wrapper */
+static int __pyx_memoryview___cinit__(PyObject *__pyx_v_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/
+static int __pyx_memoryview___cinit__(PyObject *__pyx_v_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { /* Python-level wrapper for memoryview.__cinit__(self, obj, flags, dtype_is_object=False): unpacks 2-3 arguments, converts flags to int and dtype_is_object to a truth value, then calls the implementation. Returns -1 on argument errors. */
+  PyObject *__pyx_v_obj = 0;
+  int __pyx_v_flags;
+  int __pyx_v_dtype_is_object;
+  CYTHON_UNUSED Py_ssize_t __pyx_nargs;
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject* values[3] = {0,0,0}; /* slots for obj, flags, dtype_is_object (same order as __pyx_pyargnames below) */
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  int __pyx_r;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__cinit__ (wrapper)", 0);
+  #if CYTHON_ASSUME_SAFE_MACROS
+  __pyx_nargs = PyTuple_GET_SIZE(__pyx_args);
+  #else
+  __pyx_nargs = PyTuple_Size(__pyx_args); if (unlikely(__pyx_nargs < 0)) return -1;
+  #endif
+  __pyx_kwvalues = __Pyx_KwValues_VARARGS(__pyx_args, __pyx_nargs);
+  {
+    PyObject **__pyx_pyargnames[] = {&__pyx_n_s_obj,&__pyx_n_s_flags,&__pyx_n_s_dtype_is_object,0}; /* 0-terminated list of accepted keyword names */
+    if (__pyx_kwds) {
+      Py_ssize_t kw_args;
+      switch (__pyx_nargs) { /* first pass: collect positional arguments, falling through from the highest index down; more than three is an error */
+        case  3: values[2] = __Pyx_Arg_VARARGS(__pyx_args, 2);
+        CYTHON_FALLTHROUGH;
+        case  2: values[1] = __Pyx_Arg_VARARGS(__pyx_args, 1);
+        CYTHON_FALLTHROUGH;
+        case  1: values[0] = __Pyx_Arg_VARARGS(__pyx_args, 0);
+        CYTHON_FALLTHROUGH;
+        case  0: break;
+        default: goto __pyx_L5_argtuple_error;
+      }
+      kw_args = __Pyx_NumKwargs_VARARGS(__pyx_kwds);
+      switch (__pyx_nargs) { /* second pass: enter at the first parameter not given positionally and fill the rest from keywords */
+        case  0:
+        if (likely((values[0] = __Pyx_GetKwValue_VARARGS(__pyx_kwds, __pyx_kwvalues, __pyx_n_s_obj)) != 0)) {
+          (void)__Pyx_Arg_NewRef_VARARGS(values[0]);
+          kw_args--;
+        }
+        else if (unlikely(PyErr_Occurred())) __PYX_ERR(1, 349, __pyx_L3_error)
+        else goto __pyx_L5_argtuple_error; /* required obj missing */
+        CYTHON_FALLTHROUGH;
+        case  1:
+        if (likely((values[1] = __Pyx_GetKwValue_VARARGS(__pyx_kwds, __pyx_kwvalues, __pyx_n_s_flags)) != 0)) {
+          (void)__Pyx_Arg_NewRef_VARARGS(values[1]);
+          kw_args--;
+        }
+        else if (unlikely(PyErr_Occurred())) __PYX_ERR(1, 349, __pyx_L3_error)
+        else {
+          __Pyx_RaiseArgtupleInvalid("__cinit__", 0, 2, 3, 1); __PYX_ERR(1, 349, __pyx_L3_error) /* required flags missing: reported with a hard-coded count of 1 argument found */
+        }
+        CYTHON_FALLTHROUGH;
+        case  2:
+        if (kw_args > 0) { /* dtype_is_object is optional, so a missing keyword is not an error here */
+          PyObject* value = __Pyx_GetKwValue_VARARGS(__pyx_kwds, __pyx_kwvalues, __pyx_n_s_dtype_is_object);
+          if (value) { values[2] = __Pyx_Arg_NewRef_VARARGS(value); kw_args--; }
+          else if (unlikely(PyErr_Occurred())) __PYX_ERR(1, 349, __pyx_L3_error)
+        }
+      }
+      if (unlikely(kw_args > 0)) { /* keywords remain that were not consumed above */
+        const Py_ssize_t kwd_pos_args = __pyx_nargs;
+        if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_kwvalues, __pyx_pyargnames, 0, values + 0, kwd_pos_args, "__cinit__") < 0)) __PYX_ERR(1, 349, __pyx_L3_error)
+      }
+    } else {
+      switch (__pyx_nargs) { /* no keywords: only 2 or 3 positional arguments are accepted */
+        case  3: values[2] = __Pyx_Arg_VARARGS(__pyx_args, 2);
+        CYTHON_FALLTHROUGH;
+        case  2: values[1] = __Pyx_Arg_VARARGS(__pyx_args, 1);
+        values[0] = __Pyx_Arg_VARARGS(__pyx_args, 0);
+        break;
+        default: goto __pyx_L5_argtuple_error;
+      }
+    }
+    __pyx_v_obj = values[0];
+    __pyx_v_flags = __Pyx_PyInt_As_int(values[1]); if (unlikely((__pyx_v_flags == (int)-1) && PyErr_Occurred())) __PYX_ERR(1, 349, __pyx_L3_error) /* -1 counts as a failure only when an exception is pending */
+    if (values[2]) {
+      __pyx_v_dtype_is_object = __Pyx_PyObject_IsTrue(values[2]); if (unlikely((__pyx_v_dtype_is_object == (int)-1) && PyErr_Occurred())) __PYX_ERR(1, 349, __pyx_L3_error)
+    } else {
+      __pyx_v_dtype_is_object = ((int)0); /* default: dtype_is_object=False */
+    }
+  }
+  goto __pyx_L6_skip;
+  __pyx_L5_argtuple_error:;
+  __Pyx_RaiseArgtupleInvalid("__cinit__", 0, 2, 3, __pyx_nargs); __PYX_ERR(1, 349, __pyx_L3_error)
+  __pyx_L6_skip:;
+  goto __pyx_L4_argument_unpacking_done;
+  __pyx_L3_error:; /* argument-parsing failure: release collected values, record a traceback entry, return -1 */
+  {
+    Py_ssize_t __pyx_temp;
+    for (__pyx_temp=0; __pyx_temp < (Py_ssize_t)(sizeof(values)/sizeof(values[0])); ++__pyx_temp) {
+      __Pyx_Arg_XDECREF_VARARGS(values[__pyx_temp]);
+    }
+  }
+  __Pyx_AddTraceback("View.MemoryView.memoryview.__cinit__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __Pyx_RefNannyFinishContext();
+  return -1;
+  __pyx_L4_argument_unpacking_done:;
+  __pyx_r = __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview___cinit__(((struct __pyx_memoryview_obj *)__pyx_v_self), __pyx_v_obj, __pyx_v_flags, __pyx_v_dtype_is_object);
+
+  /* function exit code */
+  {
+    Py_ssize_t __pyx_temp;
+    for (__pyx_temp=0; __pyx_temp < (Py_ssize_t)(sizeof(values)/sizeof(values[0])); ++__pyx_temp) {
+      __Pyx_Arg_XDECREF_VARARGS(values[__pyx_temp]); /* same release loop as the error path, run after the call returns */
+    }
+  }
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static int __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview___cinit__(struct __pyx_memoryview_obj *__pyx_v_self, PyObject *__pyx_v_obj, int __pyx_v_flags, int __pyx_v_dtype_is_object) {
+  int __pyx_r;
+  __Pyx_RefNannyDeclarations
+  int __pyx_t_1;
+  int __pyx_t_2;
+  int __pyx_t_3;
+  Py_intptr_t __pyx_t_4;
+  size_t __pyx_t_5;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("__cinit__", 1);
+
+  /* "View.MemoryView":350
+ * 
+ *     def __cinit__(memoryview self, object obj, int flags, bint dtype_is_object=False):
+ *         self.obj = obj             # <<<<<<<<<<<<<<
+ *         self.flags = flags
+ *         if type(self) is memoryview or obj is not None:
+ */
+  __Pyx_INCREF(__pyx_v_obj);
+  __Pyx_GIVEREF(__pyx_v_obj);
+  __Pyx_GOTREF(__pyx_v_self->obj);
+  __Pyx_DECREF(__pyx_v_self->obj);
+  __pyx_v_self->obj = __pyx_v_obj;
+
+  /* "View.MemoryView":351
+ *     def __cinit__(memoryview self, object obj, int flags, bint dtype_is_object=False):
+ *         self.obj = obj
+ *         self.flags = flags             # <<<<<<<<<<<<<<
+ *         if type(self) is memoryview or obj is not None:
+ *             __Pyx_GetBuffer(obj, &self.view, flags)
+ */
+  __pyx_v_self->flags = __pyx_v_flags;
+
+  /* "View.MemoryView":352
+ *         self.obj = obj
+ *         self.flags = flags
+ *         if type(self) is memoryview or obj is not None:             # <<<<<<<<<<<<<<
+ *             __Pyx_GetBuffer(obj, &self.view, flags)
+ *             if <PyObject *> self.view.obj == NULL:
+ */
+  __pyx_t_2 = (((PyObject *)Py_TYPE(((PyObject *)__pyx_v_self))) == ((PyObject *)__pyx_memoryview_type));
+  if (!__pyx_t_2) {
+  } else {
+    __pyx_t_1 = __pyx_t_2;
+    goto __pyx_L4_bool_binop_done;
+  }
+  __pyx_t_2 = (__pyx_v_obj != Py_None);
+  __pyx_t_1 = __pyx_t_2;
+  __pyx_L4_bool_binop_done:;
+  if (__pyx_t_1) {
+
+    /* "View.MemoryView":353
+ *         self.flags = flags
+ *         if type(self) is memoryview or obj is not None:
+ *             __Pyx_GetBuffer(obj, &self.view, flags)             # <<<<<<<<<<<<<<
+ *             if <PyObject *> self.view.obj == NULL:
+ *                 (<__pyx_buffer *> &self.view).obj = Py_None
+ */
+    __pyx_t_3 = __Pyx_GetBuffer(__pyx_v_obj, (&__pyx_v_self->view), __pyx_v_flags); if (unlikely(__pyx_t_3 == ((int)-1))) __PYX_ERR(1, 353, __pyx_L1_error)
+
+    /* "View.MemoryView":354
+ *         if type(self) is memoryview or obj is not None:
+ *             __Pyx_GetBuffer(obj, &self.view, flags)
+ *             if <PyObject *> self.view.obj == NULL:             # <<<<<<<<<<<<<<
+ *                 (<__pyx_buffer *> &self.view).obj = Py_None
+ *                 Py_INCREF(Py_None)
+ */
+    __pyx_t_1 = (((PyObject *)__pyx_v_self->view.obj) == NULL);
+    if (__pyx_t_1) {
+
+      /* "View.MemoryView":355
+ *             __Pyx_GetBuffer(obj, &self.view, flags)
+ *             if <PyObject *> self.view.obj == NULL:
+ *                 (<__pyx_buffer *> &self.view).obj = Py_None             # <<<<<<<<<<<<<<
+ *                 Py_INCREF(Py_None)
+ * 
+ */
+      ((Py_buffer *)(&__pyx_v_self->view))->obj = Py_None;
+
+      /* "View.MemoryView":356
+ *             if <PyObject *> self.view.obj == NULL:
+ *                 (<__pyx_buffer *> &self.view).obj = Py_None
+ *                 Py_INCREF(Py_None)             # <<<<<<<<<<<<<<
+ * 
+ *         if not __PYX_CYTHON_ATOMICS_ENABLED():
+ */
+      Py_INCREF(Py_None);
+
+      /* "View.MemoryView":354
+ *         if type(self) is memoryview or obj is not None:
+ *             __Pyx_GetBuffer(obj, &self.view, flags)
+ *             if <PyObject *> self.view.obj == NULL:             # <<<<<<<<<<<<<<
+ *                 (<__pyx_buffer *> &self.view).obj = Py_None
+ *                 Py_INCREF(Py_None)
+ */
+    }
+
+    /* "View.MemoryView":352
+ *         self.obj = obj
+ *         self.flags = flags
+ *         if type(self) is memoryview or obj is not None:             # <<<<<<<<<<<<<<
+ *             __Pyx_GetBuffer(obj, &self.view, flags)
+ *             if <PyObject *> self.view.obj == NULL:
+ */
+  }
+
+  /* "View.MemoryView":358
+ *                 Py_INCREF(Py_None)
+ * 
+ *         if not __PYX_CYTHON_ATOMICS_ENABLED():             # <<<<<<<<<<<<<<
+ *             global __pyx_memoryview_thread_locks_used
+ *             if __pyx_memoryview_thread_locks_used < 8:
+ */
+  __pyx_t_1 = (!__PYX_CYTHON_ATOMICS_ENABLED());
+  if (__pyx_t_1) {
+
+    /* "View.MemoryView":360
+ *         if not __PYX_CYTHON_ATOMICS_ENABLED():
+ *             global __pyx_memoryview_thread_locks_used
+ *             if __pyx_memoryview_thread_locks_used < 8:             # <<<<<<<<<<<<<<
+ *                 self.lock = __pyx_memoryview_thread_locks[__pyx_memoryview_thread_locks_used]
+ *                 __pyx_memoryview_thread_locks_used += 1
+ */
+    __pyx_t_1 = (__pyx_memoryview_thread_locks_used < 8);
+    if (__pyx_t_1) {
+
+      /* "View.MemoryView":361
+ *             global __pyx_memoryview_thread_locks_used
+ *             if __pyx_memoryview_thread_locks_used < 8:
+ *                 self.lock = __pyx_memoryview_thread_locks[__pyx_memoryview_thread_locks_used]             # <<<<<<<<<<<<<<
+ *                 __pyx_memoryview_thread_locks_used += 1
+ *             if self.lock is NULL:
+ */
+      __pyx_v_self->lock = (__pyx_memoryview_thread_locks[__pyx_memoryview_thread_locks_used]);
+
+      /* "View.MemoryView":362
+ *             if __pyx_memoryview_thread_locks_used < 8:
+ *                 self.lock = __pyx_memoryview_thread_locks[__pyx_memoryview_thread_locks_used]
+ *                 __pyx_memoryview_thread_locks_used += 1             # <<<<<<<<<<<<<<
+ *             if self.lock is NULL:
+ *                 self.lock = PyThread_allocate_lock()
+ */
+      __pyx_memoryview_thread_locks_used = (__pyx_memoryview_thread_locks_used + 1);
+
+      /* "View.MemoryView":360
+ *         if not __PYX_CYTHON_ATOMICS_ENABLED():
+ *             global __pyx_memoryview_thread_locks_used
+ *             if __pyx_memoryview_thread_locks_used < 8:             # <<<<<<<<<<<<<<
+ *                 self.lock = __pyx_memoryview_thread_locks[__pyx_memoryview_thread_locks_used]
+ *                 __pyx_memoryview_thread_locks_used += 1
+ */
+    }
+
+    /* "View.MemoryView":363
+ *                 self.lock = __pyx_memoryview_thread_locks[__pyx_memoryview_thread_locks_used]
+ *                 __pyx_memoryview_thread_locks_used += 1
+ *             if self.lock is NULL:             # <<<<<<<<<<<<<<
+ *                 self.lock = PyThread_allocate_lock()
+ *                 if self.lock is NULL:
+ */
+    __pyx_t_1 = (__pyx_v_self->lock == NULL);
+    if (__pyx_t_1) {
+
+      /* "View.MemoryView":364
+ *                 __pyx_memoryview_thread_locks_used += 1
+ *             if self.lock is NULL:
+ *                 self.lock = PyThread_allocate_lock()             # <<<<<<<<<<<<<<
+ *                 if self.lock is NULL:
+ *                     raise MemoryError
+ */
+      __pyx_v_self->lock = PyThread_allocate_lock();
+
+      /* "View.MemoryView":365
+ *             if self.lock is NULL:
+ *                 self.lock = PyThread_allocate_lock()
+ *                 if self.lock is NULL:             # <<<<<<<<<<<<<<
+ *                     raise MemoryError
+ * 
+ */
+      __pyx_t_1 = (__pyx_v_self->lock == NULL);
+      if (unlikely(__pyx_t_1)) {
+
+        /* "View.MemoryView":366
+ *                 self.lock = PyThread_allocate_lock()
+ *                 if self.lock is NULL:
+ *                     raise MemoryError             # <<<<<<<<<<<<<<
+ * 
+ *         if flags & PyBUF_FORMAT:
+ */
+        PyErr_NoMemory(); __PYX_ERR(1, 366, __pyx_L1_error)
+
+        /* "View.MemoryView":365
+ *             if self.lock is NULL:
+ *                 self.lock = PyThread_allocate_lock()
+ *                 if self.lock is NULL:             # <<<<<<<<<<<<<<
+ *                     raise MemoryError
+ * 
+ */
+      }
+
+      /* "View.MemoryView":363
+ *                 self.lock = __pyx_memoryview_thread_locks[__pyx_memoryview_thread_locks_used]
+ *                 __pyx_memoryview_thread_locks_used += 1
+ *             if self.lock is NULL:             # <<<<<<<<<<<<<<
+ *                 self.lock = PyThread_allocate_lock()
+ *                 if self.lock is NULL:
+ */
+    }
+
+    /* "View.MemoryView":358
+ *                 Py_INCREF(Py_None)
+ * 
+ *         if not __PYX_CYTHON_ATOMICS_ENABLED():             # <<<<<<<<<<<<<<
+ *             global __pyx_memoryview_thread_locks_used
+ *             if __pyx_memoryview_thread_locks_used < 8:
+ */
+  }
+
+  /* "View.MemoryView":368
+ *                     raise MemoryError
+ * 
+ *         if flags & PyBUF_FORMAT:             # <<<<<<<<<<<<<<
+ *             self.dtype_is_object = (self.view.format[0] == b'O' and self.view.format[1] == b'\0')
+ *         else:
+ */
+  __pyx_t_1 = ((__pyx_v_flags & PyBUF_FORMAT) != 0);
+  if (__pyx_t_1) {
+
+    /* "View.MemoryView":369
+ * 
+ *         if flags & PyBUF_FORMAT:
+ *             self.dtype_is_object = (self.view.format[0] == b'O' and self.view.format[1] == b'\0')             # <<<<<<<<<<<<<<
+ *         else:
+ *             self.dtype_is_object = dtype_is_object
+ */
+    __pyx_t_2 = ((__pyx_v_self->view.format[0]) == 'O');
+    if (__pyx_t_2) {
+    } else {
+      __pyx_t_1 = __pyx_t_2;
+      goto __pyx_L12_bool_binop_done;
+    }
+    __pyx_t_2 = ((__pyx_v_self->view.format[1]) == '\x00');
+    __pyx_t_1 = __pyx_t_2;
+    __pyx_L12_bool_binop_done:;
+    __pyx_v_self->dtype_is_object = __pyx_t_1;
+
+    /* "View.MemoryView":368
+ *                     raise MemoryError
+ * 
+ *         if flags & PyBUF_FORMAT:             # <<<<<<<<<<<<<<
+ *             self.dtype_is_object = (self.view.format[0] == b'O' and self.view.format[1] == b'\0')
+ *         else:
+ */
+    goto __pyx_L11;
+  }
+
+  /* "View.MemoryView":371
+ *             self.dtype_is_object = (self.view.format[0] == b'O' and self.view.format[1] == b'\0')
+ *         else:
+ *             self.dtype_is_object = dtype_is_object             # <<<<<<<<<<<<<<
+ * 
+ *         assert <Py_intptr_t><void*>(&self.acquisition_count) % sizeof(__pyx_atomic_int_type) == 0
+ */
+  /*else*/ {
+    __pyx_v_self->dtype_is_object = __pyx_v_dtype_is_object;
+  }
+  __pyx_L11:;
+
+  /* "View.MemoryView":373
+ *             self.dtype_is_object = dtype_is_object
+ * 
+ *         assert <Py_intptr_t><void*>(&self.acquisition_count) % sizeof(__pyx_atomic_int_type) == 0             # <<<<<<<<<<<<<<
+ *         self.typeinfo = NULL
+ * 
+ */
+  #ifndef CYTHON_WITHOUT_ASSERTIONS
+  if (unlikely(__pyx_assertions_enabled())) {
+    __pyx_t_4 = ((Py_intptr_t)((void *)(&__pyx_v_self->acquisition_count)));
+    __pyx_t_5 = (sizeof(__pyx_atomic_int_type));
+    if (unlikely(__pyx_t_5 == 0)) {
+      PyErr_SetString(PyExc_ZeroDivisionError, "integer division or modulo by zero");
+      __PYX_ERR(1, 373, __pyx_L1_error)
+    }
+    __pyx_t_1 = ((__pyx_t_4 % __pyx_t_5) == 0);
+    if (unlikely(!__pyx_t_1)) {
+      __Pyx_Raise(__pyx_builtin_AssertionError, 0, 0, 0);
+      __PYX_ERR(1, 373, __pyx_L1_error)
+    }
+  }
+  #else
+  if ((1)); else __PYX_ERR(1, 373, __pyx_L1_error)
+  #endif
+
+  /* "View.MemoryView":374
+ * 
+ *         assert <Py_intptr_t><void*>(&self.acquisition_count) % sizeof(__pyx_atomic_int_type) == 0
+ *         self.typeinfo = NULL             # <<<<<<<<<<<<<<
+ * 
+ *     def __dealloc__(memoryview self):
+ */
+  __pyx_v_self->typeinfo = NULL;
+
+  /* "View.MemoryView":349
+ *     cdef __Pyx_TypeInfo *typeinfo
+ * 
+ *     def __cinit__(memoryview self, object obj, int flags, bint dtype_is_object=False):             # <<<<<<<<<<<<<<
+ *         self.obj = obj
+ *         self.flags = flags
+ */
+
+  /* function exit code */
+  __pyx_r = 0;
+  goto __pyx_L0;
+  __pyx_L1_error:;
+  __Pyx_AddTraceback("View.MemoryView.memoryview.__cinit__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = -1;
+  __pyx_L0:;
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":376
+ *         self.typeinfo = NULL
+ * 
+ *     def __dealloc__(memoryview self):             # <<<<<<<<<<<<<<
+ *         if self.obj is not None:
+ *             __Pyx_ReleaseBuffer(&self.view)
+ */
+
+/* Python wrapper: casts the generic PyObject self to the memoryview struct and forwards to the __dealloc__ implementation that follows. */
+static void __pyx_memoryview___dealloc__(PyObject *__pyx_v_self); /*proto*/
+static void __pyx_memoryview___dealloc__(PyObject *__pyx_v_self) {
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__dealloc__ (wrapper)", 0);
+  __pyx_kwvalues = __Pyx_KwValues_VARARGS(__pyx_args, __pyx_nargs); /* NOTE(review): __pyx_args/__pyx_nargs are not declared in this function; presumably the macro discards its arguments - confirm against its definition */
+  __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_2__dealloc__(((struct __pyx_memoryview_obj *)__pyx_v_self));
+
+  /* function exit code */
+  __Pyx_RefNannyFinishContext();
+}
+
+static void __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_2__dealloc__(struct __pyx_memoryview_obj *__pyx_v_self) { /* Releases the buffer view held by self, then either returns self->lock to the global __pyx_memoryview_thread_locks array or frees it. */
+  int __pyx_v_i;
+  int __pyx_t_1;
+  int __pyx_t_2;
+  int __pyx_t_3;
+  int __pyx_t_4;
+  PyThread_type_lock __pyx_t_5;
+  PyThread_type_lock __pyx_t_6;
+
+  /* "View.MemoryView":377
+ * 
+ *     def __dealloc__(memoryview self):
+ *         if self.obj is not None:             # <<<<<<<<<<<<<<
+ *             __Pyx_ReleaseBuffer(&self.view)
+ *         elif (<__pyx_buffer *> &self.view).obj == Py_None:
+ */
+  __pyx_t_1 = (__pyx_v_self->obj != Py_None);
+  if (__pyx_t_1) {
+
+    /* "View.MemoryView":378
+ *     def __dealloc__(memoryview self):
+ *         if self.obj is not None:
+ *             __Pyx_ReleaseBuffer(&self.view)             # <<<<<<<<<<<<<<
+ *         elif (<__pyx_buffer *> &self.view).obj == Py_None:
+ * 
+ */
+    __Pyx_ReleaseBuffer((&__pyx_v_self->view));
+
+    /* "View.MemoryView":377
+ * 
+ *     def __dealloc__(memoryview self):
+ *         if self.obj is not None:             # <<<<<<<<<<<<<<
+ *             __Pyx_ReleaseBuffer(&self.view)
+ *         elif (<__pyx_buffer *> &self.view).obj == Py_None:
+ */
+    goto __pyx_L3;
+  }
+
+  /* "View.MemoryView":379
+ *         if self.obj is not None:
+ *             __Pyx_ReleaseBuffer(&self.view)
+ *         elif (<__pyx_buffer *> &self.view).obj == Py_None:             # <<<<<<<<<<<<<<
+ * 
+ *             (<__pyx_buffer *> &self.view).obj = NULL
+ */
+  __pyx_t_1 = (((Py_buffer *)(&__pyx_v_self->view))->obj == Py_None);
+  if (__pyx_t_1) {
+
+    /* "View.MemoryView":381
+ *         elif (<__pyx_buffer *> &self.view).obj == Py_None:
+ * 
+ *             (<__pyx_buffer *> &self.view).obj = NULL             # <<<<<<<<<<<<<<
+ *             Py_DECREF(Py_None)
+ * 
+ */
+    ((Py_buffer *)(&__pyx_v_self->view))->obj = NULL; /* clear the field first, then drop the None reference it was holding */
+
+    /* "View.MemoryView":382
+ * 
+ *             (<__pyx_buffer *> &self.view).obj = NULL
+ *             Py_DECREF(Py_None)             # <<<<<<<<<<<<<<
+ * 
+ *         cdef int i
+ */
+    Py_DECREF(Py_None);
+
+    /* "View.MemoryView":379
+ *         if self.obj is not None:
+ *             __Pyx_ReleaseBuffer(&self.view)
+ *         elif (<__pyx_buffer *> &self.view).obj == Py_None:             # <<<<<<<<<<<<<<
+ * 
+ *             (<__pyx_buffer *> &self.view).obj = NULL
+ */
+  }
+  __pyx_L3:;
+
+  /* "View.MemoryView":386
+ *         cdef int i
+ *         global __pyx_memoryview_thread_locks_used
+ *         if self.lock != NULL:             # <<<<<<<<<<<<<<
+ *             for i in range(__pyx_memoryview_thread_locks_used):
+ *                 if __pyx_memoryview_thread_locks[i] is self.lock:
+ */
+  __pyx_t_1 = (__pyx_v_self->lock != NULL);
+  if (__pyx_t_1) {
+
+    /* "View.MemoryView":387
+ *         global __pyx_memoryview_thread_locks_used
+ *         if self.lock != NULL:
+ *             for i in range(__pyx_memoryview_thread_locks_used):             # <<<<<<<<<<<<<<
+ *                 if __pyx_memoryview_thread_locks[i] is self.lock:
+ *                     __pyx_memoryview_thread_locks_used -= 1
+ */
+    __pyx_t_2 = __pyx_memoryview_thread_locks_used; /* loop bound is captured here, before the body decrements the counter */
+    __pyx_t_3 = __pyx_t_2;
+    for (__pyx_t_4 = 0; __pyx_t_4 < __pyx_t_3; __pyx_t_4+=1) {
+      __pyx_v_i = __pyx_t_4;
+
+      /* "View.MemoryView":388
+ *         if self.lock != NULL:
+ *             for i in range(__pyx_memoryview_thread_locks_used):
+ *                 if __pyx_memoryview_thread_locks[i] is self.lock:             # <<<<<<<<<<<<<<
+ *                     __pyx_memoryview_thread_locks_used -= 1
+ *                     if i != __pyx_memoryview_thread_locks_used:
+ */
+      __pyx_t_1 = ((__pyx_memoryview_thread_locks[__pyx_v_i]) == __pyx_v_self->lock);
+      if (__pyx_t_1) {
+
+        /* "View.MemoryView":389
+ *             for i in range(__pyx_memoryview_thread_locks_used):
+ *                 if __pyx_memoryview_thread_locks[i] is self.lock:
+ *                     __pyx_memoryview_thread_locks_used -= 1             # <<<<<<<<<<<<<<
+ *                     if i != __pyx_memoryview_thread_locks_used:
+ *                         __pyx_memoryview_thread_locks[i], __pyx_memoryview_thread_locks[__pyx_memoryview_thread_locks_used] = (
+ */
+        __pyx_memoryview_thread_locks_used = (__pyx_memoryview_thread_locks_used - 1); /* the lock came from the global array: shrink the in-use count */
+
+        /* "View.MemoryView":390
+ *                 if __pyx_memoryview_thread_locks[i] is self.lock:
+ *                     __pyx_memoryview_thread_locks_used -= 1
+ *                     if i != __pyx_memoryview_thread_locks_used:             # <<<<<<<<<<<<<<
+ *                         __pyx_memoryview_thread_locks[i], __pyx_memoryview_thread_locks[__pyx_memoryview_thread_locks_used] = (
+ *                             __pyx_memoryview_thread_locks[__pyx_memoryview_thread_locks_used], __pyx_memoryview_thread_locks[i])
+ */
+        __pyx_t_1 = (__pyx_v_i != __pyx_memoryview_thread_locks_used);
+        if (__pyx_t_1) {
+
+          /* "View.MemoryView":392
+ *                     if i != __pyx_memoryview_thread_locks_used:
+ *                         __pyx_memoryview_thread_locks[i], __pyx_memoryview_thread_locks[__pyx_memoryview_thread_locks_used] = (
+ *                             __pyx_memoryview_thread_locks[__pyx_memoryview_thread_locks_used], __pyx_memoryview_thread_locks[i])             # <<<<<<<<<<<<<<
+ *                     break
+ *             else:
+ */
+          __pyx_t_5 = (__pyx_memoryview_thread_locks[__pyx_memoryview_thread_locks_used]); /* both values are read into temps before either slot is written, giving tuple-swap semantics */
+          __pyx_t_6 = (__pyx_memoryview_thread_locks[__pyx_v_i]);
+
+          /* "View.MemoryView":391
+ *                     __pyx_memoryview_thread_locks_used -= 1
+ *                     if i != __pyx_memoryview_thread_locks_used:
+ *                         __pyx_memoryview_thread_locks[i], __pyx_memoryview_thread_locks[__pyx_memoryview_thread_locks_used] = (             # <<<<<<<<<<<<<<
+ *                             __pyx_memoryview_thread_locks[__pyx_memoryview_thread_locks_used], __pyx_memoryview_thread_locks[i])
+ *                     break
+ */
+          (__pyx_memoryview_thread_locks[__pyx_v_i]) = __pyx_t_5;
+          (__pyx_memoryview_thread_locks[__pyx_memoryview_thread_locks_used]) = __pyx_t_6; /* the released lock now sits at index 'used', i.e. just past the in-use entries */
+
+          /* "View.MemoryView":390
+ *                 if __pyx_memoryview_thread_locks[i] is self.lock:
+ *                     __pyx_memoryview_thread_locks_used -= 1
+ *                     if i != __pyx_memoryview_thread_locks_used:             # <<<<<<<<<<<<<<
+ *                         __pyx_memoryview_thread_locks[i], __pyx_memoryview_thread_locks[__pyx_memoryview_thread_locks_used] = (
+ *                             __pyx_memoryview_thread_locks[__pyx_memoryview_thread_locks_used], __pyx_memoryview_thread_locks[i])
+ */
+        }
+
+        /* "View.MemoryView":393
+ *                         __pyx_memoryview_thread_locks[i], __pyx_memoryview_thread_locks[__pyx_memoryview_thread_locks_used] = (
+ *                             __pyx_memoryview_thread_locks[__pyx_memoryview_thread_locks_used], __pyx_memoryview_thread_locks[i])
+ *                     break             # <<<<<<<<<<<<<<
+ *             else:
+ *                 PyThread_free_lock(self.lock)
+ */
+        goto __pyx_L6_break; /* skips the for-else block so an array-owned lock is not freed */
+
+        /* "View.MemoryView":388
+ *         if self.lock != NULL:
+ *             for i in range(__pyx_memoryview_thread_locks_used):
+ *                 if __pyx_memoryview_thread_locks[i] is self.lock:             # <<<<<<<<<<<<<<
+ *                     __pyx_memoryview_thread_locks_used -= 1
+ *                     if i != __pyx_memoryview_thread_locks_used:
+ */
+      }
+    }
+    /*else: for-else branch, reached only when the loop ran to completion without finding the lock in the array*/ {
+
+      /* "View.MemoryView":395
+ *                     break
+ *             else:
+ *                 PyThread_free_lock(self.lock)             # <<<<<<<<<<<<<<
+ * 
+ *     cdef char *get_item_pointer(memoryview self, object index) except NULL:
+ */
+      PyThread_free_lock(__pyx_v_self->lock);
+    }
+    __pyx_L6_break:;
+
+    /* "View.MemoryView":386
+ *         cdef int i
+ *         global __pyx_memoryview_thread_locks_used
+ *         if self.lock != NULL:             # <<<<<<<<<<<<<<
+ *             for i in range(__pyx_memoryview_thread_locks_used):
+ *                 if __pyx_memoryview_thread_locks[i] is self.lock:
+ */
+  }
+
+  /* "View.MemoryView":376
+ *         self.typeinfo = NULL
+ * 
+ *     def __dealloc__(memoryview self):             # <<<<<<<<<<<<<<
+ *         if self.obj is not None:
+ *             __Pyx_ReleaseBuffer(&self.view)
+ */
+
+  /* function exit code */
+}
+
+/* "View.MemoryView":397
+ *                 PyThread_free_lock(self.lock)
+ * 
+ *     cdef char *get_item_pointer(memoryview self, object index) except NULL:             # <<<<<<<<<<<<<<
+ *         cdef Py_ssize_t dim
+ *         cdef char *itemp = <char *> self.view.buf
+ */
+
+static char *__pyx_memoryview_get_item_pointer(struct __pyx_memoryview_obj *__pyx_v_self, PyObject *__pyx_v_index) { /* Iterates over index, calling __pyx_pybuffer_index once per element (with its position as the dimension) starting from view.buf; returns the final pointer, or NULL with a traceback entry added on error. */
+  Py_ssize_t __pyx_v_dim;
+  char *__pyx_v_itemp;
+  PyObject *__pyx_v_idx = NULL;
+  char *__pyx_r;
+  __Pyx_RefNannyDeclarations
+  Py_ssize_t __pyx_t_1;
+  PyObject *__pyx_t_2 = NULL;
+  Py_ssize_t __pyx_t_3;
+  PyObject *(*__pyx_t_4)(PyObject *);
+  PyObject *__pyx_t_5 = NULL;
+  Py_ssize_t __pyx_t_6;
+  char *__pyx_t_7;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("get_item_pointer", 1);
+
+  /* "View.MemoryView":399
+ *     cdef char *get_item_pointer(memoryview self, object index) except NULL:
+ *         cdef Py_ssize_t dim
+ *         cdef char *itemp = <char *> self.view.buf             # <<<<<<<<<<<<<<
+ * 
+ *         for dim, idx in enumerate(index):
+ */
+  __pyx_v_itemp = ((char *)__pyx_v_self->view.buf);
+
+  /* "View.MemoryView":401
+ *         cdef char *itemp = <char *> self.view.buf
+ * 
+ *         for dim, idx in enumerate(index):             # <<<<<<<<<<<<<<
+ *             itemp = pybuffer_index(&self.view, itemp, idx, dim)
+ * 
+ */
+  __pyx_t_1 = 0; /* enumerate() counter; copied into dim and incremented each iteration */
+  if (likely(PyList_CheckExact(__pyx_v_index)) || PyTuple_CheckExact(__pyx_v_index)) { /* exact list/tuple: walk it by position (t_3) with no iterator (t_4 stays NULL) */
+    __pyx_t_2 = __pyx_v_index; __Pyx_INCREF(__pyx_t_2);
+    __pyx_t_3 = 0;
+    __pyx_t_4 = NULL;
+  } else { /* anything else: obtain an iterator and its next-function */
+    __pyx_t_3 = -1; __pyx_t_2 = PyObject_GetIter(__pyx_v_index); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 401, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_2);
+    __pyx_t_4 = __Pyx_PyObject_GetIterNextFunc(__pyx_t_2); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 401, __pyx_L1_error)
+  }
+  for (;;) {
+    if (likely(!__pyx_t_4)) { /* no next-function was set, so t_2 is the list or tuple itself */
+      if (likely(PyList_CheckExact(__pyx_t_2))) {
+        {
+          Py_ssize_t __pyx_temp = __Pyx_PyList_GET_SIZE(__pyx_t_2); /* size is re-read on every iteration */
+          #if !CYTHON_ASSUME_SAFE_MACROS
+          if (unlikely((__pyx_temp < 0))) __PYX_ERR(1, 401, __pyx_L1_error)
+          #endif
+          if (__pyx_t_3 >= __pyx_temp) break;
+        }
+        #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS
+        __pyx_t_5 = PyList_GET_ITEM(__pyx_t_2, __pyx_t_3); __Pyx_INCREF(__pyx_t_5); __pyx_t_3++; if (unlikely((0 < 0))) __PYX_ERR(1, 401, __pyx_L1_error)
+        #else
+        __pyx_t_5 = __Pyx_PySequence_ITEM(__pyx_t_2, __pyx_t_3); __pyx_t_3++; if (unlikely(!__pyx_t_5)) __PYX_ERR(1, 401, __pyx_L1_error)
+        __Pyx_GOTREF(__pyx_t_5);
+        #endif
+      } else {
+        {
+          Py_ssize_t __pyx_temp = __Pyx_PyTuple_GET_SIZE(__pyx_t_2);
+          #if !CYTHON_ASSUME_SAFE_MACROS
+          if (unlikely((__pyx_temp < 0))) __PYX_ERR(1, 401, __pyx_L1_error)
+          #endif
+          if (__pyx_t_3 >= __pyx_temp) break;
+        }
+        #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS
+        __pyx_t_5 = PyTuple_GET_ITEM(__pyx_t_2, __pyx_t_3); __Pyx_INCREF(__pyx_t_5); __pyx_t_3++; if (unlikely((0 < 0))) __PYX_ERR(1, 401, __pyx_L1_error)
+        #else
+        __pyx_t_5 = __Pyx_PySequence_ITEM(__pyx_t_2, __pyx_t_3); __pyx_t_3++; if (unlikely(!__pyx_t_5)) __PYX_ERR(1, 401, __pyx_L1_error)
+        __Pyx_GOTREF(__pyx_t_5);
+        #endif
+      }
+    } else {
+      __pyx_t_5 = __pyx_t_4(__pyx_t_2);
+      if (unlikely(!__pyx_t_5)) { /* NULL from the next-function: exhausted, or an error is pending */
+        PyObject* exc_type = PyErr_Occurred();
+        if (exc_type) {
+          if (likely(__Pyx_PyErr_GivenExceptionMatches(exc_type, PyExc_StopIteration))) PyErr_Clear(); /* StopIteration just ends the loop */
+          else __PYX_ERR(1, 401, __pyx_L1_error)
+        }
+        break;
+      }
+      __Pyx_GOTREF(__pyx_t_5);
+    }
+    __Pyx_XDECREF_SET(__pyx_v_idx, __pyx_t_5); /* idx takes over the new reference; the previous idx value is released */
+    __pyx_t_5 = 0;
+    __pyx_v_dim = __pyx_t_1;
+    __pyx_t_1 = (__pyx_t_1 + 1);
+
+    /* "View.MemoryView":402
+ * 
+ *         for dim, idx in enumerate(index):
+ *             itemp = pybuffer_index(&self.view, itemp, idx, dim)             # <<<<<<<<<<<<<<
+ * 
+ *         return itemp
+ */
+    __pyx_t_6 = __Pyx_PyIndex_AsSsize_t(__pyx_v_idx); if (unlikely((__pyx_t_6 == (Py_ssize_t)-1) && PyErr_Occurred())) __PYX_ERR(1, 402, __pyx_L1_error) /* -1 is an error only when an exception is pending */
+    __pyx_t_7 = __pyx_pybuffer_index((&__pyx_v_self->view), __pyx_v_itemp, __pyx_t_6, __pyx_v_dim); if (unlikely(__pyx_t_7 == ((char *)NULL))) __PYX_ERR(1, 402, __pyx_L1_error)
+    __pyx_v_itemp = __pyx_t_7;
+
+    /* "View.MemoryView":401
+ *         cdef char *itemp = <char *> self.view.buf
+ * 
+ *         for dim, idx in enumerate(index):             # <<<<<<<<<<<<<<
+ *             itemp = pybuffer_index(&self.view, itemp, idx, dim)
+ * 
+ */
+  }
+  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; /* drop the list/tuple/iterator reference taken before the loop */
+
+  /* "View.MemoryView":404
+ *             itemp = pybuffer_index(&self.view, itemp, idx, dim)
+ * 
+ *         return itemp             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  __pyx_r = __pyx_v_itemp;
+  goto __pyx_L0;
+
+  /* "View.MemoryView":397
+ *                 PyThread_free_lock(self.lock)
+ * 
+ *     cdef char *get_item_pointer(memoryview self, object index) except NULL:             # <<<<<<<<<<<<<<
+ *         cdef Py_ssize_t dim
+ *         cdef char *itemp = <char *> self.view.buf
+ */
+
+  /* function exit code */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_2);
+  __Pyx_XDECREF(__pyx_t_5);
+  __Pyx_AddTraceback("View.MemoryView.memoryview.get_item_pointer", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL; /* NULL is the error return declared by "except NULL" */
+  __pyx_L0:;
+  __Pyx_XDECREF(__pyx_v_idx); /* runs on both the success and the error path */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":407
+ * 
+ * 
+ *     def __getitem__(memoryview self, object index):             # <<<<<<<<<<<<<<
+ *         if index is Ellipsis:
+ *             return self
+ */
+
+/* Python wrapper: casts self to the memoryview struct, forwards (self, index) to the __getitem__ implementation that follows, and returns its result unchanged. */
+static PyObject *__pyx_memoryview___getitem__(PyObject *__pyx_v_self, PyObject *__pyx_v_index); /*proto*/
+static PyObject *__pyx_memoryview___getitem__(PyObject *__pyx_v_self, PyObject *__pyx_v_index) {
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__getitem__ (wrapper)", 0);
+  __pyx_kwvalues = __Pyx_KwValues_VARARGS(__pyx_args, __pyx_nargs); /* NOTE(review): __pyx_args/__pyx_nargs are not declared in this function; presumably the macro discards its arguments - confirm against its definition */
+  __pyx_r = __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_4__getitem__(((struct __pyx_memoryview_obj *)__pyx_v_self), ((PyObject *)__pyx_v_index));
+
+  /* function exit code */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static PyObject *__pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_4__getitem__(struct __pyx_memoryview_obj *__pyx_v_self, PyObject *__pyx_v_index) { /* Implements memoryview[index]: returns self for Ellipsis; otherwise normalises index via _unellipsify and returns either the result of __pyx_memview_slice or the single item converted by convert_item_to_object. Returns NULL with a traceback entry added on error. */
+  PyObject *__pyx_v_have_slices = NULL;
+  PyObject *__pyx_v_indices = NULL;
+  char *__pyx_v_itemp;
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  int __pyx_t_1;
+  PyObject *__pyx_t_2 = NULL;
+  PyObject *__pyx_t_3 = NULL;
+  PyObject *__pyx_t_4 = NULL;
+  char *__pyx_t_5;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("__getitem__", 1);
+
+  /* "View.MemoryView":408
+ * 
+ *     def __getitem__(memoryview self, object index):
+ *         if index is Ellipsis:             # <<<<<<<<<<<<<<
+ *             return self
+ * 
+ */
+  __pyx_t_1 = (__pyx_v_index == __pyx_builtin_Ellipsis); /* identity comparison, matching "is" in the Cython source */
+  if (__pyx_t_1) {
+
+    /* "View.MemoryView":409
+ *     def __getitem__(memoryview self, object index):
+ *         if index is Ellipsis:
+ *             return self             # <<<<<<<<<<<<<<
+ * 
+ *         have_slices, indices = _unellipsify(index, self.view.ndim)
+ */
+    __Pyx_XDECREF(__pyx_r);
+    __Pyx_INCREF((PyObject *)__pyx_v_self); /* the caller receives its own reference to self */
+    __pyx_r = ((PyObject *)__pyx_v_self);
+    goto __pyx_L0;
+
+    /* "View.MemoryView":408
+ * 
+ *     def __getitem__(memoryview self, object index):
+ *         if index is Ellipsis:             # <<<<<<<<<<<<<<
+ *             return self
+ * 
+ */
+  }
+
+  /* "View.MemoryView":411
+ *             return self
+ * 
+ *         have_slices, indices = _unellipsify(index, self.view.ndim)             # <<<<<<<<<<<<<<
+ * 
+ *         cdef char *itemp
+ */
+  __pyx_t_2 = _unellipsify(__pyx_v_index, __pyx_v_self->view.ndim); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 411, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_2);
+  if (likely(__pyx_t_2 != Py_None)) { /* unpack the two-element result into t_3 (have_slices) and t_4 (indices) */
+    PyObject* sequence = __pyx_t_2;
+    Py_ssize_t size = __Pyx_PySequence_SIZE(sequence);
+    if (unlikely(size != 2)) {
+      if (size > 2) __Pyx_RaiseTooManyValuesError(2);
+      else if (size >= 0) __Pyx_RaiseNeedMoreValuesError(size);
+      __PYX_ERR(1, 411, __pyx_L1_error)
+    }
+    #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS
+    __pyx_t_3 = PyTuple_GET_ITEM(sequence, 0); 
+    __pyx_t_4 = PyTuple_GET_ITEM(sequence, 1); 
+    __Pyx_INCREF(__pyx_t_3);
+    __Pyx_INCREF(__pyx_t_4);
+    #else
+    __pyx_t_3 = PySequence_ITEM(sequence, 0); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 411, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_3);
+    __pyx_t_4 = PySequence_ITEM(sequence, 1); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 411, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_4);
+    #endif
+    __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; /* the container is no longer needed once both items are held */
+  } else {
+    __Pyx_RaiseNoneNotIterableError(); __PYX_ERR(1, 411, __pyx_L1_error)
+  }
+  __pyx_v_have_slices = __pyx_t_3; /* the reference moves from the temp to the local; the temp is zeroed so the error path does not release it twice */
+  __pyx_t_3 = 0;
+  __pyx_v_indices = __pyx_t_4;
+  __pyx_t_4 = 0;
+
+  /* "View.MemoryView":414
+ * 
+ *         cdef char *itemp
+ *         if have_slices:             # <<<<<<<<<<<<<<
+ *             return memview_slice(self, indices)
+ *         else:
+ */
+  __pyx_t_1 = __Pyx_PyObject_IsTrue(__pyx_v_have_slices); if (unlikely((__pyx_t_1 < 0))) __PYX_ERR(1, 414, __pyx_L1_error) /* a negative result signals an error from the truth test */
+  if (__pyx_t_1) {
+
+    /* "View.MemoryView":415
+ *         cdef char *itemp
+ *         if have_slices:
+ *             return memview_slice(self, indices)             # <<<<<<<<<<<<<<
+ *         else:
+ *             itemp = self.get_item_pointer(indices)
+ */
+    __Pyx_XDECREF(__pyx_r);
+    __pyx_t_2 = ((PyObject *)__pyx_memview_slice(__pyx_v_self, __pyx_v_indices)); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 415, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_2);
+    __pyx_r = __pyx_t_2;
+    __pyx_t_2 = 0;
+    goto __pyx_L0;
+
+    /* "View.MemoryView":414
+ * 
+ *         cdef char *itemp
+ *         if have_slices:             # <<<<<<<<<<<<<<
+ *             return memview_slice(self, indices)
+ *         else:
+ */
+  }
+
+  /* "View.MemoryView":417
+ *             return memview_slice(self, indices)
+ *         else:
+ *             itemp = self.get_item_pointer(indices)             # <<<<<<<<<<<<<<
+ *             return self.convert_item_to_object(itemp)
+ * 
+ */
+  /*else*/ {
+    __pyx_t_5 = ((struct __pyx_vtabstruct_memoryview *)__pyx_v_self->__pyx_vtab)->get_item_pointer(__pyx_v_self, __pyx_v_indices); if (unlikely(__pyx_t_5 == ((char *)NULL))) __PYX_ERR(1, 417, __pyx_L1_error) /* called through the vtable; NULL is the error return */
+    __pyx_v_itemp = __pyx_t_5;
+
+    /* "View.MemoryView":418
+ *         else:
+ *             itemp = self.get_item_pointer(indices)
+ *             return self.convert_item_to_object(itemp)             # <<<<<<<<<<<<<<
+ * 
+ *     def __setitem__(memoryview self, object index, object value):
+ */
+    __Pyx_XDECREF(__pyx_r);
+    __pyx_t_2 = ((struct __pyx_vtabstruct_memoryview *)__pyx_v_self->__pyx_vtab)->convert_item_to_object(__pyx_v_self, __pyx_v_itemp); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 418, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_2);
+    __pyx_r = __pyx_t_2;
+    __pyx_t_2 = 0;
+    goto __pyx_L0;
+  }
+
+  /* "View.MemoryView":407
+ * 
+ * 
+ *     def __getitem__(memoryview self, object index):             # <<<<<<<<<<<<<<
+ *         if index is Ellipsis:
+ *             return self
+ */
+
+  /* function exit code */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_2);
+  __Pyx_XDECREF(__pyx_t_3);
+  __Pyx_XDECREF(__pyx_t_4);
+  __Pyx_AddTraceback("View.MemoryView.memoryview.__getitem__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __pyx_L0:;
+  __Pyx_XDECREF(__pyx_v_have_slices); /* locals are released on both the success and the error path */
+  __Pyx_XDECREF(__pyx_v_indices);
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":420
+ *             return self.convert_item_to_object(itemp)
+ * 
+ *     def __setitem__(memoryview self, object index, object value):             # <<<<<<<<<<<<<<
+ *         if self.view.readonly:
+ *             raise TypeError, "Cannot assign to read-only memoryview"
+ */
+
+/* Python wrapper: casts self to the memoryview struct, forwards (self, index, value) to the __setitem__ implementation that follows, and returns its int status unchanged. */
+static int __pyx_memoryview___setitem__(PyObject *__pyx_v_self, PyObject *__pyx_v_index, PyObject *__pyx_v_value); /*proto*/
+static int __pyx_memoryview___setitem__(PyObject *__pyx_v_self, PyObject *__pyx_v_index, PyObject *__pyx_v_value) {
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  int __pyx_r;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__setitem__ (wrapper)", 0);
+  __pyx_kwvalues = __Pyx_KwValues_VARARGS(__pyx_args, __pyx_nargs); /* NOTE(review): __pyx_args/__pyx_nargs are not declared in this function; presumably the macro discards its arguments - confirm against its definition */
+  __pyx_r = __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_6__setitem__(((struct __pyx_memoryview_obj *)__pyx_v_self), ((PyObject *)__pyx_v_index), ((PyObject *)__pyx_v_value));
+
+  /* function exit code */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static int __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_6__setitem__(struct __pyx_memoryview_obj *__pyx_v_self, PyObject *__pyx_v_index, PyObject *__pyx_v_value) {
+  PyObject *__pyx_v_have_slices = NULL;
+  PyObject *__pyx_v_obj = NULL;
+  int __pyx_r;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  PyObject *__pyx_t_2 = NULL;
+  PyObject *__pyx_t_3 = NULL;
+  int __pyx_t_4;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("__setitem__", 0);
+  __Pyx_INCREF(__pyx_v_index);
+
+  /* "View.MemoryView":421
+ * 
+ *     def __setitem__(memoryview self, object index, object value):
+ *         if self.view.readonly:             # <<<<<<<<<<<<<<
+ *             raise TypeError, "Cannot assign to read-only memoryview"
+ * 
+ */
+  if (unlikely(__pyx_v_self->view.readonly)) {
+
+    /* "View.MemoryView":422
+ *     def __setitem__(memoryview self, object index, object value):
+ *         if self.view.readonly:
+ *             raise TypeError, "Cannot assign to read-only memoryview"             # <<<<<<<<<<<<<<
+ * 
+ *         have_slices, index = _unellipsify(index, self.view.ndim)
+ */
+    __Pyx_Raise(__pyx_builtin_TypeError, __pyx_kp_s_Cannot_assign_to_read_only_memor, 0, 0);
+    __PYX_ERR(1, 422, __pyx_L1_error)
+
+    /* "View.MemoryView":421
+ * 
+ *     def __setitem__(memoryview self, object index, object value):
+ *         if self.view.readonly:             # <<<<<<<<<<<<<<
+ *             raise TypeError, "Cannot assign to read-only memoryview"
+ * 
+ */
+  }
+
+  /* "View.MemoryView":424
+ *             raise TypeError, "Cannot assign to read-only memoryview"
+ * 
+ *         have_slices, index = _unellipsify(index, self.view.ndim)             # <<<<<<<<<<<<<<
+ * 
+ *         if have_slices:
+ */
+  __pyx_t_1 = _unellipsify(__pyx_v_index, __pyx_v_self->view.ndim); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 424, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  if (likely(__pyx_t_1 != Py_None)) {
+    PyObject* sequence = __pyx_t_1;
+    Py_ssize_t size = __Pyx_PySequence_SIZE(sequence);
+    if (unlikely(size != 2)) {
+      if (size > 2) __Pyx_RaiseTooManyValuesError(2);
+      else if (size >= 0) __Pyx_RaiseNeedMoreValuesError(size);
+      __PYX_ERR(1, 424, __pyx_L1_error)
+    }
+    #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS
+    __pyx_t_2 = PyTuple_GET_ITEM(sequence, 0); 
+    __pyx_t_3 = PyTuple_GET_ITEM(sequence, 1); 
+    __Pyx_INCREF(__pyx_t_2);
+    __Pyx_INCREF(__pyx_t_3);
+    #else
+    __pyx_t_2 = PySequence_ITEM(sequence, 0); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 424, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_2);
+    __pyx_t_3 = PySequence_ITEM(sequence, 1); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 424, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_3);
+    #endif
+    __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+  } else {
+    __Pyx_RaiseNoneNotIterableError(); __PYX_ERR(1, 424, __pyx_L1_error)
+  }
+  __pyx_v_have_slices = __pyx_t_2;
+  __pyx_t_2 = 0;
+  __Pyx_DECREF_SET(__pyx_v_index, __pyx_t_3);
+  __pyx_t_3 = 0;
+
+  /* "View.MemoryView":426
+ *         have_slices, index = _unellipsify(index, self.view.ndim)
+ * 
+ *         if have_slices:             # <<<<<<<<<<<<<<
+ *             obj = self.is_slice(value)
+ *             if obj is not None:
+ */
+  __pyx_t_4 = __Pyx_PyObject_IsTrue(__pyx_v_have_slices); if (unlikely((__pyx_t_4 < 0))) __PYX_ERR(1, 426, __pyx_L1_error)
+  if (__pyx_t_4) {
+
+    /* "View.MemoryView":427
+ * 
+ *         if have_slices:
+ *             obj = self.is_slice(value)             # <<<<<<<<<<<<<<
+ *             if obj is not None:
+ *                 self.setitem_slice_assignment(self[index], obj)
+ */
+    __pyx_t_1 = ((struct __pyx_vtabstruct_memoryview *)__pyx_v_self->__pyx_vtab)->is_slice(__pyx_v_self, __pyx_v_value); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 427, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_1);
+    __pyx_v_obj = __pyx_t_1;
+    __pyx_t_1 = 0;
+
+    /* "View.MemoryView":428
+ *         if have_slices:
+ *             obj = self.is_slice(value)
+ *             if obj is not None:             # <<<<<<<<<<<<<<
+ *                 self.setitem_slice_assignment(self[index], obj)
+ *             else:
+ */
+    __pyx_t_4 = (__pyx_v_obj != Py_None);
+    if (__pyx_t_4) {
+
+      /* "View.MemoryView":429
+ *             obj = self.is_slice(value)
+ *             if obj is not None:
+ *                 self.setitem_slice_assignment(self[index], obj)             # <<<<<<<<<<<<<<
+ *             else:
+ *                 self.setitem_slice_assign_scalar(self[index], value)
+ */
+      __pyx_t_1 = __Pyx_PyObject_GetItem(((PyObject *)__pyx_v_self), __pyx_v_index); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 429, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_1);
+      __pyx_t_3 = ((struct __pyx_vtabstruct_memoryview *)__pyx_v_self->__pyx_vtab)->setitem_slice_assignment(__pyx_v_self, __pyx_t_1, __pyx_v_obj); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 429, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_3);
+      __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+      __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+
+      /* "View.MemoryView":428
+ *         if have_slices:
+ *             obj = self.is_slice(value)
+ *             if obj is not None:             # <<<<<<<<<<<<<<
+ *                 self.setitem_slice_assignment(self[index], obj)
+ *             else:
+ */
+      goto __pyx_L5;
+    }
+
+    /* "View.MemoryView":431
+ *                 self.setitem_slice_assignment(self[index], obj)
+ *             else:
+ *                 self.setitem_slice_assign_scalar(self[index], value)             # <<<<<<<<<<<<<<
+ *         else:
+ *             self.setitem_indexed(index, value)
+ */
+    /*else*/ {
+      __pyx_t_3 = __Pyx_PyObject_GetItem(((PyObject *)__pyx_v_self), __pyx_v_index); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 431, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_3);
+      if (!(likely(((__pyx_t_3) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_3, __pyx_memoryview_type))))) __PYX_ERR(1, 431, __pyx_L1_error)
+      __pyx_t_1 = ((struct __pyx_vtabstruct_memoryview *)__pyx_v_self->__pyx_vtab)->setitem_slice_assign_scalar(__pyx_v_self, ((struct __pyx_memoryview_obj *)__pyx_t_3), __pyx_v_value); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 431, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_1);
+      __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+      __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+    }
+    __pyx_L5:;
+
+    /* "View.MemoryView":426
+ *         have_slices, index = _unellipsify(index, self.view.ndim)
+ * 
+ *         if have_slices:             # <<<<<<<<<<<<<<
+ *             obj = self.is_slice(value)
+ *             if obj is not None:
+ */
+    goto __pyx_L4;
+  }
+
+  /* "View.MemoryView":433
+ *                 self.setitem_slice_assign_scalar(self[index], value)
+ *         else:
+ *             self.setitem_indexed(index, value)             # <<<<<<<<<<<<<<
+ * 
+ *     cdef is_slice(self, obj):
+ */
+  /*else*/ {
+    __pyx_t_1 = ((struct __pyx_vtabstruct_memoryview *)__pyx_v_self->__pyx_vtab)->setitem_indexed(__pyx_v_self, __pyx_v_index, __pyx_v_value); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 433, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_1);
+    __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+  }
+  __pyx_L4:;
+
+  /* "View.MemoryView":420
+ *             return self.convert_item_to_object(itemp)
+ * 
+ *     def __setitem__(memoryview self, object index, object value):             # <<<<<<<<<<<<<<
+ *         if self.view.readonly:
+ *             raise TypeError, "Cannot assign to read-only memoryview"
+ */
+
+  /* function exit code */
+  __pyx_r = 0;
+  goto __pyx_L0;
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_XDECREF(__pyx_t_2);
+  __Pyx_XDECREF(__pyx_t_3);
+  __Pyx_AddTraceback("View.MemoryView.memoryview.__setitem__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = -1;
+  __pyx_L0:;
+  __Pyx_XDECREF(__pyx_v_have_slices);
+  __Pyx_XDECREF(__pyx_v_obj);
+  __Pyx_XDECREF(__pyx_v_index);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":435
+ *             self.setitem_indexed(index, value)
+ * 
+ *     cdef is_slice(self, obj):             # <<<<<<<<<<<<<<
+ *         if not isinstance(obj, memoryview):
+ *             try:
+ */
+/* is_slice: returns obj unchanged when it already passes the memoryview type check; otherwise tries to wrap it in a new memoryview. Result is a new reference: the (possibly wrapped) obj, Py_None when the wrap raised TypeError, or NULL after any other error. */
+static PyObject *__pyx_memoryview_is_slice(struct __pyx_memoryview_obj *__pyx_v_self, PyObject *__pyx_v_obj) {
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  int __pyx_t_1;
+  int __pyx_t_2;
+  PyObject *__pyx_t_3 = NULL;
+  PyObject *__pyx_t_4 = NULL;
+  PyObject *__pyx_t_5 = NULL;
+  PyObject *__pyx_t_6 = NULL;
+  PyObject *__pyx_t_7 = NULL;
+  PyObject *__pyx_t_8 = NULL;
+  int __pyx_t_9;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("is_slice", 0);
+  __Pyx_INCREF(__pyx_v_obj); /* hold our own reference: obj may be rebound below and is released at __pyx_L0 */
+
+  /* "View.MemoryView":436
+ * 
+ *     cdef is_slice(self, obj):
+ *         if not isinstance(obj, memoryview):             # <<<<<<<<<<<<<<
+ *             try:
+ *                 obj = memoryview(obj, self.flags & ~PyBUF_WRITABLE | PyBUF_ANY_CONTIGUOUS,
+ */
+  __pyx_t_1 = __Pyx_TypeCheck(__pyx_v_obj, __pyx_memoryview_type); 
+  __pyx_t_2 = (!__pyx_t_1);
+  if (__pyx_t_2) {
+
+    /* "View.MemoryView":437
+ *     cdef is_slice(self, obj):
+ *         if not isinstance(obj, memoryview):
+ *             try:             # <<<<<<<<<<<<<<
+ *                 obj = memoryview(obj, self.flags & ~PyBUF_WRITABLE | PyBUF_ANY_CONTIGUOUS,
+ *                                  self.dtype_is_object)
+ */
+    {
+      __Pyx_PyThreadState_declare
+      __Pyx_PyThreadState_assign
+      __Pyx_ExceptionSave(&__pyx_t_3, &__pyx_t_4, &__pyx_t_5); /* presumably snapshots the exception currently being handled; the triple is dropped on success or passed to __Pyx_ExceptionReset on the except paths */
+      __Pyx_XGOTREF(__pyx_t_3);
+      __Pyx_XGOTREF(__pyx_t_4);
+      __Pyx_XGOTREF(__pyx_t_5);
+      /*try:*/ {
+
+        /* "View.MemoryView":438
+ *         if not isinstance(obj, memoryview):
+ *             try:
+ *                 obj = memoryview(obj, self.flags & ~PyBUF_WRITABLE | PyBUF_ANY_CONTIGUOUS,             # <<<<<<<<<<<<<<
+ *                                  self.dtype_is_object)
+ *             except TypeError:
+ */
+        __pyx_t_6 = __Pyx_PyInt_From_int(((__pyx_v_self->flags & (~PyBUF_WRITABLE)) | PyBUF_ANY_CONTIGUOUS)); if (unlikely(!__pyx_t_6)) __PYX_ERR(1, 438, __pyx_L4_error) /* flags argument: self's flags with PyBUF_WRITABLE cleared and PyBUF_ANY_CONTIGUOUS set */
+        __Pyx_GOTREF(__pyx_t_6);
+
+        /* "View.MemoryView":439
+ *             try:
+ *                 obj = memoryview(obj, self.flags & ~PyBUF_WRITABLE | PyBUF_ANY_CONTIGUOUS,
+ *                                  self.dtype_is_object)             # <<<<<<<<<<<<<<
+ *             except TypeError:
+ *                 return None
+ */
+        __pyx_t_7 = __Pyx_PyBool_FromLong(__pyx_v_self->dtype_is_object); if (unlikely(!__pyx_t_7)) __PYX_ERR(1, 439, __pyx_L4_error)
+        __Pyx_GOTREF(__pyx_t_7);
+
+        /* "View.MemoryView":438
+ *         if not isinstance(obj, memoryview):
+ *             try:
+ *                 obj = memoryview(obj, self.flags & ~PyBUF_WRITABLE | PyBUF_ANY_CONTIGUOUS,             # <<<<<<<<<<<<<<
+ *                                  self.dtype_is_object)
+ *             except TypeError:
+ */
+        __pyx_t_8 = PyTuple_New(3); if (unlikely(!__pyx_t_8)) __PYX_ERR(1, 438, __pyx_L4_error) /* positional args tuple: (obj, flags, dtype_is_object) */
+        __Pyx_GOTREF(__pyx_t_8);
+        __Pyx_INCREF(__pyx_v_obj);
+        __Pyx_GIVEREF(__pyx_v_obj);
+        if (__Pyx_PyTuple_SET_ITEM(__pyx_t_8, 0, __pyx_v_obj)) __PYX_ERR(1, 438, __pyx_L4_error);
+        __Pyx_GIVEREF(__pyx_t_6);
+        if (__Pyx_PyTuple_SET_ITEM(__pyx_t_8, 1, __pyx_t_6)) __PYX_ERR(1, 438, __pyx_L4_error);
+        __Pyx_GIVEREF(__pyx_t_7);
+        if (__Pyx_PyTuple_SET_ITEM(__pyx_t_8, 2, __pyx_t_7)) __PYX_ERR(1, 438, __pyx_L4_error);
+        __pyx_t_6 = 0; /* ownership was handed to the args tuple (GIVEREF above), so the temps are cleared rather than DECREF'd */
+        __pyx_t_7 = 0;
+        __pyx_t_7 = __Pyx_PyObject_Call(((PyObject *)__pyx_memoryview_type), __pyx_t_8, NULL); if (unlikely(!__pyx_t_7)) __PYX_ERR(1, 438, __pyx_L4_error) /* memoryview(obj, flags, dtype_is_object) */
+        __Pyx_GOTREF(__pyx_t_7);
+        __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0;
+        __Pyx_DECREF_SET(__pyx_v_obj, __pyx_t_7); /* rebind the local obj to the newly created memoryview */
+        __pyx_t_7 = 0;
+
+        /* "View.MemoryView":437
+ *     cdef is_slice(self, obj):
+ *         if not isinstance(obj, memoryview):
+ *             try:             # <<<<<<<<<<<<<<
+ *                 obj = memoryview(obj, self.flags & ~PyBUF_WRITABLE | PyBUF_ANY_CONTIGUOUS,
+ *                                  self.dtype_is_object)
+ */
+      }
+      __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0; /* success path: release the saved exception triple and skip the handlers */
+      __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0;
+      __Pyx_XDECREF(__pyx_t_5); __pyx_t_5 = 0;
+      goto __pyx_L9_try_end;
+      __pyx_L4_error:; /* the try body failed: drop temporaries, then test whether the error is a TypeError */
+      __Pyx_XDECREF(__pyx_t_6); __pyx_t_6 = 0;
+      __Pyx_XDECREF(__pyx_t_7); __pyx_t_7 = 0;
+      __Pyx_XDECREF(__pyx_t_8); __pyx_t_8 = 0;
+
+      /* "View.MemoryView":440
+ *                 obj = memoryview(obj, self.flags & ~PyBUF_WRITABLE | PyBUF_ANY_CONTIGUOUS,
+ *                                  self.dtype_is_object)
+ *             except TypeError:             # <<<<<<<<<<<<<<
+ *                 return None
+ * 
+ */
+      __pyx_t_9 = __Pyx_PyErr_ExceptionMatches(__pyx_builtin_TypeError);
+      if (__pyx_t_9) {
+        __Pyx_AddTraceback("View.MemoryView.memoryview.is_slice", __pyx_clineno, __pyx_lineno, __pyx_filename);
+        if (__Pyx_GetException(&__pyx_t_7, &__pyx_t_8, &__pyx_t_6) < 0) __PYX_ERR(1, 440, __pyx_L6_except_error)
+        __Pyx_XGOTREF(__pyx_t_7);
+        __Pyx_XGOTREF(__pyx_t_8);
+        __Pyx_XGOTREF(__pyx_t_6);
+
+        /* "View.MemoryView":441
+ *                                  self.dtype_is_object)
+ *             except TypeError:
+ *                 return None             # <<<<<<<<<<<<<<
+ * 
+ *         return obj
+ */
+        __Pyx_XDECREF(__pyx_r);
+        __pyx_r = Py_None; __Pyx_INCREF(Py_None); /* the TypeError is consumed: the result is a new reference to None */
+        __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0;
+        __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0;
+        __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0;
+        goto __pyx_L7_except_return;
+      }
+      goto __pyx_L6_except_error; /* any exception other than TypeError propagates to the caller */
+
+      /* "View.MemoryView":437
+ *     cdef is_slice(self, obj):
+ *         if not isinstance(obj, memoryview):
+ *             try:             # <<<<<<<<<<<<<<
+ *                 obj = memoryview(obj, self.flags & ~PyBUF_WRITABLE | PyBUF_ANY_CONTIGUOUS,
+ *                                  self.dtype_is_object)
+ */
+      __pyx_L6_except_error:;
+      __Pyx_XGIVEREF(__pyx_t_3);
+      __Pyx_XGIVEREF(__pyx_t_4);
+      __Pyx_XGIVEREF(__pyx_t_5);
+      __Pyx_ExceptionReset(__pyx_t_3, __pyx_t_4, __pyx_t_5); /* hand the saved triple back before leaving through the error exit */
+      goto __pyx_L1_error;
+      __pyx_L7_except_return:;
+      __Pyx_XGIVEREF(__pyx_t_3);
+      __Pyx_XGIVEREF(__pyx_t_4);
+      __Pyx_XGIVEREF(__pyx_t_5);
+      __Pyx_ExceptionReset(__pyx_t_3, __pyx_t_4, __pyx_t_5); /* hand the saved triple back before returning None */
+      goto __pyx_L0;
+      __pyx_L9_try_end:;
+    }
+
+    /* "View.MemoryView":436
+ * 
+ *     cdef is_slice(self, obj):
+ *         if not isinstance(obj, memoryview):             # <<<<<<<<<<<<<<
+ *             try:
+ *                 obj = memoryview(obj, self.flags & ~PyBUF_WRITABLE | PyBUF_ANY_CONTIGUOUS,
+ */
+  }
+
+  /* "View.MemoryView":443
+ *                 return None
+ * 
+ *         return obj             # <<<<<<<<<<<<<<
+ * 
+ *     cdef setitem_slice_assignment(self, dst, src):
+ */
+  __Pyx_XDECREF(__pyx_r);
+  __Pyx_INCREF(__pyx_v_obj); /* the result gets its own reference; the local one is released at __pyx_L0 */
+  __pyx_r = __pyx_v_obj;
+  goto __pyx_L0;
+
+  /* "View.MemoryView":435
+ *             self.setitem_indexed(index, value)
+ * 
+ *     cdef is_slice(self, obj):             # <<<<<<<<<<<<<<
+ *         if not isinstance(obj, memoryview):
+ *             try:
+ */
+
+  /* function exit code: the error path releases temporaries, records a traceback entry and yields NULL; every path releases the local obj reference at __pyx_L0 */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_6);
+  __Pyx_XDECREF(__pyx_t_7);
+  __Pyx_XDECREF(__pyx_t_8);
+  __Pyx_AddTraceback("View.MemoryView.memoryview.is_slice", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = 0;
+  __pyx_L0:;
+  __Pyx_XDECREF(__pyx_v_obj);
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":445
+ *         return obj
+ * 
+ *     cdef setitem_slice_assignment(self, dst, src):             # <<<<<<<<<<<<<<
+ *         cdef __Pyx_memviewslice dst_slice
+ *         cdef __Pyx_memviewslice src_slice
+ */
+/* setitem_slice_assignment: obtains slice structs for src and dst and copies src's contents into dst via __pyx_memoryview_copy_contents. Returns a new reference to Py_None, or NULL after an error. */
+static PyObject *__pyx_memoryview_setitem_slice_assignment(struct __pyx_memoryview_obj *__pyx_v_self, PyObject *__pyx_v_dst, PyObject *__pyx_v_src) {
+  __Pyx_memviewslice __pyx_v_dst_slice;
+  __Pyx_memviewslice __pyx_v_src_slice;
+  __Pyx_memviewslice __pyx_v_msrc;
+  __Pyx_memviewslice __pyx_v_mdst;
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  __Pyx_memviewslice *__pyx_t_1;
+  PyObject *__pyx_t_2 = NULL;
+  int __pyx_t_3;
+  int __pyx_t_4;
+  int __pyx_t_5;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("setitem_slice_assignment", 1);
+
+  /* "View.MemoryView":448
+ *         cdef __Pyx_memviewslice dst_slice
+ *         cdef __Pyx_memviewslice src_slice
+ *         cdef __Pyx_memviewslice msrc = get_slice_from_memview(src, &src_slice)[0]             # <<<<<<<<<<<<<<
+ *         cdef __Pyx_memviewslice mdst = get_slice_from_memview(dst, &dst_slice)[0]
+ * 
+ */
+  if (!(likely(((__pyx_v_src) == Py_None) || likely(__Pyx_TypeTest(__pyx_v_src, __pyx_memoryview_type))))) __PYX_ERR(1, 448, __pyx_L1_error) /* src must be None or pass the memoryview type test before it is cast below */
+  __pyx_t_1 = __pyx_memoryview_get_slice_from_memoryview(((struct __pyx_memoryview_obj *)__pyx_v_src), (&__pyx_v_src_slice)); if (unlikely(__pyx_t_1 == ((__Pyx_memviewslice *)NULL))) __PYX_ERR(1, 448, __pyx_L1_error) /* a NULL result is treated as an error */
+  __pyx_v_msrc = (__pyx_t_1[0]); /* the slice struct is copied by value */
+
+  /* "View.MemoryView":449
+ *         cdef __Pyx_memviewslice src_slice
+ *         cdef __Pyx_memviewslice msrc = get_slice_from_memview(src, &src_slice)[0]
+ *         cdef __Pyx_memviewslice mdst = get_slice_from_memview(dst, &dst_slice)[0]             # <<<<<<<<<<<<<<
+ * 
+ *         memoryview_copy_contents(msrc, mdst, src.ndim, dst.ndim, self.dtype_is_object)
+ */
+  if (!(likely(((__pyx_v_dst) == Py_None) || likely(__Pyx_TypeTest(__pyx_v_dst, __pyx_memoryview_type))))) __PYX_ERR(1, 449, __pyx_L1_error) /* same check for dst */
+  __pyx_t_1 = __pyx_memoryview_get_slice_from_memoryview(((struct __pyx_memoryview_obj *)__pyx_v_dst), (&__pyx_v_dst_slice)); if (unlikely(__pyx_t_1 == ((__Pyx_memviewslice *)NULL))) __PYX_ERR(1, 449, __pyx_L1_error)
+  __pyx_v_mdst = (__pyx_t_1[0]); /* the slice struct is copied by value */
+
+  /* "View.MemoryView":451
+ *         cdef __Pyx_memviewslice mdst = get_slice_from_memview(dst, &dst_slice)[0]
+ * 
+ *         memoryview_copy_contents(msrc, mdst, src.ndim, dst.ndim, self.dtype_is_object)             # <<<<<<<<<<<<<<
+ * 
+ *     cdef setitem_slice_assign_scalar(self, memoryview dst, value):
+ */
+  __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_v_src, __pyx_n_s_ndim); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 451, __pyx_L1_error) /* src is an untyped object here, so ndim is read through an attribute lookup */
+  __Pyx_GOTREF(__pyx_t_2);
+  __pyx_t_3 = __Pyx_PyInt_As_int(__pyx_t_2); if (unlikely((__pyx_t_3 == (int)-1) && PyErr_Occurred())) __PYX_ERR(1, 451, __pyx_L1_error) /* -1 counts as an error only when an exception is also pending */
+  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+  __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_v_dst, __pyx_n_s_ndim); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 451, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_2);
+  __pyx_t_4 = __Pyx_PyInt_As_int(__pyx_t_2); if (unlikely((__pyx_t_4 == (int)-1) && PyErr_Occurred())) __PYX_ERR(1, 451, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+  __pyx_t_5 = __pyx_memoryview_copy_contents(__pyx_v_msrc, __pyx_v_mdst, __pyx_t_3, __pyx_t_4, __pyx_v_self->dtype_is_object); if (unlikely(__pyx_t_5 == ((int)-1))) __PYX_ERR(1, 451, __pyx_L1_error) /* a return value of -1 is treated as failure */
+
+  /* "View.MemoryView":445
+ *         return obj
+ * 
+ *     cdef setitem_slice_assignment(self, dst, src):             # <<<<<<<<<<<<<<
+ *         cdef __Pyx_memviewslice dst_slice
+ *         cdef __Pyx_memviewslice src_slice
+ */
+
+  /* function exit code: success yields a new reference to None; the error path releases the temporary, records a traceback entry and yields NULL */
+  __pyx_r = Py_None; __Pyx_INCREF(Py_None);
+  goto __pyx_L0;
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_2);
+  __Pyx_AddTraceback("View.MemoryView.memoryview.setitem_slice_assignment", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = 0;
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":453
+ *         memoryview_copy_contents(msrc, mdst, src.ndim, dst.ndim, self.dtype_is_object)
+ * 
+ *     cdef setitem_slice_assign_scalar(self, memoryview dst, value):             # <<<<<<<<<<<<<<
+ *         cdef int array[128]
+ *         cdef void *tmp = NULL
+ */
+/* setitem_slice_assign_scalar: writes `value` once into a single-item scratch buffer (the stack array, or a heap block when view.itemsize exceeds sizeof(array)) and passes that buffer with dst's slice to __pyx_memoryview_slice_assign_scalar. PyMem_Free(tmp) runs on both the normal and the error exit. Returns a new reference to Py_None, or NULL after an error. */
+static PyObject *__pyx_memoryview_setitem_slice_assign_scalar(struct __pyx_memoryview_obj *__pyx_v_self, struct __pyx_memoryview_obj *__pyx_v_dst, PyObject *__pyx_v_value) {
+  int __pyx_v_array[0x80]; /* stack scratch buffer of 128 ints, used when one item fits in it */
+  void *__pyx_v_tmp;
+  void *__pyx_v_item;
+  __Pyx_memviewslice *__pyx_v_dst_slice;
+  __Pyx_memviewslice __pyx_v_tmp_slice;
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  __Pyx_memviewslice *__pyx_t_1;
+  int __pyx_t_2;
+  PyObject *__pyx_t_3 = NULL;
+  int __pyx_t_4;
+  int __pyx_t_5;
+  char const *__pyx_t_6;
+  PyObject *__pyx_t_7 = NULL;
+  PyObject *__pyx_t_8 = NULL;
+  PyObject *__pyx_t_9 = NULL;
+  PyObject *__pyx_t_10 = NULL;
+  PyObject *__pyx_t_11 = NULL;
+  PyObject *__pyx_t_12 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("setitem_slice_assign_scalar", 1);
+
+  /* "View.MemoryView":455
+ *     cdef setitem_slice_assign_scalar(self, memoryview dst, value):
+ *         cdef int array[128]
+ *         cdef void *tmp = NULL             # <<<<<<<<<<<<<<
+ *         cdef void *item
+ * 
+ */
+  __pyx_v_tmp = NULL; /* stays NULL unless the heap path below is taken; PyMem_Free(NULL) is documented as a no-op, so the cleanup calls are safe either way */
+
+  /* "View.MemoryView":460
+ *         cdef __Pyx_memviewslice *dst_slice
+ *         cdef __Pyx_memviewslice tmp_slice
+ *         dst_slice = get_slice_from_memview(dst, &tmp_slice)             # <<<<<<<<<<<<<<
+ * 
+ *         if <size_t>self.view.itemsize > sizeof(array):
+ */
+  __pyx_t_1 = __pyx_memoryview_get_slice_from_memoryview(__pyx_v_dst, (&__pyx_v_tmp_slice)); if (unlikely(__pyx_t_1 == ((__Pyx_memviewslice *)NULL))) __PYX_ERR(1, 460, __pyx_L1_error) /* a NULL result is treated as an error */
+  __pyx_v_dst_slice = __pyx_t_1;
+
+  /* "View.MemoryView":462
+ *         dst_slice = get_slice_from_memview(dst, &tmp_slice)
+ * 
+ *         if <size_t>self.view.itemsize > sizeof(array):             # <<<<<<<<<<<<<<
+ *             tmp = PyMem_Malloc(self.view.itemsize)
+ *             if tmp == NULL:
+ */
+  __pyx_t_2 = (((size_t)__pyx_v_self->view.itemsize) > (sizeof(__pyx_v_array))); /* compared against the array's size in bytes, not its element count */
+  if (__pyx_t_2) {
+
+    /* "View.MemoryView":463
+ * 
+ *         if <size_t>self.view.itemsize > sizeof(array):
+ *             tmp = PyMem_Malloc(self.view.itemsize)             # <<<<<<<<<<<<<<
+ *             if tmp == NULL:
+ *                 raise MemoryError
+ */
+    __pyx_v_tmp = PyMem_Malloc(__pyx_v_self->view.itemsize); /* heap fallback for an item too large for the stack buffer */
+
+    /* "View.MemoryView":464
+ *         if <size_t>self.view.itemsize > sizeof(array):
+ *             tmp = PyMem_Malloc(self.view.itemsize)
+ *             if tmp == NULL:             # <<<<<<<<<<<<<<
+ *                 raise MemoryError
+ *             item = tmp
+ */
+    __pyx_t_2 = (__pyx_v_tmp == NULL);
+    if (unlikely(__pyx_t_2)) {
+
+      /* "View.MemoryView":465
+ *             tmp = PyMem_Malloc(self.view.itemsize)
+ *             if tmp == NULL:
+ *                 raise MemoryError             # <<<<<<<<<<<<<<
+ *             item = tmp
+ *         else:
+ */
+      PyErr_NoMemory(); __PYX_ERR(1, 465, __pyx_L1_error) /* nothing was allocated, so this jumps straight to the function error exit rather than the finally handler */
+
+      /* "View.MemoryView":464
+ *         if <size_t>self.view.itemsize > sizeof(array):
+ *             tmp = PyMem_Malloc(self.view.itemsize)
+ *             if tmp == NULL:             # <<<<<<<<<<<<<<
+ *                 raise MemoryError
+ *             item = tmp
+ */
+    }
+
+    /* "View.MemoryView":466
+ *             if tmp == NULL:
+ *                 raise MemoryError
+ *             item = tmp             # <<<<<<<<<<<<<<
+ *         else:
+ *             item = <void *> array
+ */
+    __pyx_v_item = __pyx_v_tmp;
+
+    /* "View.MemoryView":462
+ *         dst_slice = get_slice_from_memview(dst, &tmp_slice)
+ * 
+ *         if <size_t>self.view.itemsize > sizeof(array):             # <<<<<<<<<<<<<<
+ *             tmp = PyMem_Malloc(self.view.itemsize)
+ *             if tmp == NULL:
+ */
+    goto __pyx_L3;
+  }
+
+  /* "View.MemoryView":468
+ *             item = tmp
+ *         else:
+ *             item = <void *> array             # <<<<<<<<<<<<<<
+ * 
+ *         try:
+ */
+  /*else*/ {
+    __pyx_v_item = ((void *)__pyx_v_array);
+  }
+  __pyx_L3:;
+
+  /* "View.MemoryView":470
+ *             item = <void *> array
+ * 
+ *         try:             # <<<<<<<<<<<<<<
+ *             if self.dtype_is_object:
+ *                 (<PyObject **> item)[0] = <PyObject *> value
+ */
+  /*try:*/ {
+
+    /* "View.MemoryView":471
+ * 
+ *         try:
+ *             if self.dtype_is_object:             # <<<<<<<<<<<<<<
+ *                 (<PyObject **> item)[0] = <PyObject *> value
+ *             else:
+ */
+    if (__pyx_v_self->dtype_is_object) {
+
+      /* "View.MemoryView":472
+ *         try:
+ *             if self.dtype_is_object:
+ *                 (<PyObject **> item)[0] = <PyObject *> value             # <<<<<<<<<<<<<<
+ *             else:
+ *                 self.assign_item_from_object(<char *> item, value)
+ */
+      (((PyObject **)__pyx_v_item)[0]) = ((PyObject *)__pyx_v_value); /* stores the raw pointer only; no INCREF happens on this line */
+
+      /* "View.MemoryView":471
+ * 
+ *         try:
+ *             if self.dtype_is_object:             # <<<<<<<<<<<<<<
+ *                 (<PyObject **> item)[0] = <PyObject *> value
+ *             else:
+ */
+      goto __pyx_L8;
+    }
+
+    /* "View.MemoryView":474
+ *                 (<PyObject **> item)[0] = <PyObject *> value
+ *             else:
+ *                 self.assign_item_from_object(<char *> item, value)             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+    /*else*/ {
+      __pyx_t_3 = ((struct __pyx_vtabstruct_memoryview *)__pyx_v_self->__pyx_vtab)->assign_item_from_object(__pyx_v_self, ((char *)__pyx_v_item), __pyx_v_value); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 474, __pyx_L6_error) /* errors inside the try body go to __pyx_L6_error so the finally cleanup runs */
+      __Pyx_GOTREF(__pyx_t_3);
+      __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+    }
+    __pyx_L8:;
+
+    /* "View.MemoryView":478
+ * 
+ * 
+ *             if self.view.suboffsets != NULL:             # <<<<<<<<<<<<<<
+ *                 assert_direct_dimensions(self.view.suboffsets, self.view.ndim)
+ *             slice_assign_scalar(dst_slice, dst.view.ndim, self.view.itemsize,
+ */
+    __pyx_t_2 = (__pyx_v_self->view.suboffsets != NULL);
+    if (__pyx_t_2) {
+
+      /* "View.MemoryView":479
+ * 
+ *             if self.view.suboffsets != NULL:
+ *                 assert_direct_dimensions(self.view.suboffsets, self.view.ndim)             # <<<<<<<<<<<<<<
+ *             slice_assign_scalar(dst_slice, dst.view.ndim, self.view.itemsize,
+ *                                 item, self.dtype_is_object)
+ */
+      __pyx_t_4 = assert_direct_dimensions(__pyx_v_self->view.suboffsets, __pyx_v_self->view.ndim); if (unlikely(__pyx_t_4 == ((int)-1))) __PYX_ERR(1, 479, __pyx_L6_error) /* a return value of -1 is treated as failure */
+
+      /* "View.MemoryView":478
+ * 
+ * 
+ *             if self.view.suboffsets != NULL:             # <<<<<<<<<<<<<<
+ *                 assert_direct_dimensions(self.view.suboffsets, self.view.ndim)
+ *             slice_assign_scalar(dst_slice, dst.view.ndim, self.view.itemsize,
+ */
+    }
+
+    /* "View.MemoryView":480
+ *             if self.view.suboffsets != NULL:
+ *                 assert_direct_dimensions(self.view.suboffsets, self.view.ndim)
+ *             slice_assign_scalar(dst_slice, dst.view.ndim, self.view.itemsize,             # <<<<<<<<<<<<<<
+ *                                 item, self.dtype_is_object)
+ *         finally:
+ */
+    __pyx_memoryview_slice_assign_scalar(__pyx_v_dst_slice, __pyx_v_dst->view.ndim, __pyx_v_self->view.itemsize, __pyx_v_item, __pyx_v_self->dtype_is_object); /* no result is captured or checked for this call */
+  }
+
+  /* "View.MemoryView":483
+ *                                 item, self.dtype_is_object)
+ *         finally:
+ *             PyMem_Free(tmp)             # <<<<<<<<<<<<<<
+ * 
+ *     cdef setitem_indexed(self, index, value):
+ */
+  /*finally:*/ {
+    /*normal exit:*/{
+      PyMem_Free(__pyx_v_tmp);
+      goto __pyx_L7;
+    }
+    __pyx_L6_error:;
+    /*exception exit:*/{ /* set the pending exception aside, run the same cleanup, then restore it and leave through the error exit */
+      __Pyx_PyThreadState_declare
+      __Pyx_PyThreadState_assign
+      __pyx_t_7 = 0; __pyx_t_8 = 0; __pyx_t_9 = 0; __pyx_t_10 = 0; __pyx_t_11 = 0; __pyx_t_12 = 0;
+      __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0;
+      if (PY_MAJOR_VERSION >= 3) __Pyx_ExceptionSwap(&__pyx_t_10, &__pyx_t_11, &__pyx_t_12);
+      if ((PY_MAJOR_VERSION < 3) || unlikely(__Pyx_GetException(&__pyx_t_7, &__pyx_t_8, &__pyx_t_9) < 0)) __Pyx_ErrFetch(&__pyx_t_7, &__pyx_t_8, &__pyx_t_9);
+      __Pyx_XGOTREF(__pyx_t_7);
+      __Pyx_XGOTREF(__pyx_t_8);
+      __Pyx_XGOTREF(__pyx_t_9);
+      __Pyx_XGOTREF(__pyx_t_10);
+      __Pyx_XGOTREF(__pyx_t_11);
+      __Pyx_XGOTREF(__pyx_t_12);
+      __pyx_t_4 = __pyx_lineno; __pyx_t_5 = __pyx_clineno; __pyx_t_6 = __pyx_filename; /* remember the error location across the cleanup */
+      {
+        PyMem_Free(__pyx_v_tmp);
+      }
+      if (PY_MAJOR_VERSION >= 3) {
+        __Pyx_XGIVEREF(__pyx_t_10);
+        __Pyx_XGIVEREF(__pyx_t_11);
+        __Pyx_XGIVEREF(__pyx_t_12);
+        __Pyx_ExceptionReset(__pyx_t_10, __pyx_t_11, __pyx_t_12);
+      }
+      __Pyx_XGIVEREF(__pyx_t_7);
+      __Pyx_XGIVEREF(__pyx_t_8);
+      __Pyx_XGIVEREF(__pyx_t_9);
+      __Pyx_ErrRestore(__pyx_t_7, __pyx_t_8, __pyx_t_9); /* re-raise the exception that was set aside above */
+      __pyx_t_7 = 0; __pyx_t_8 = 0; __pyx_t_9 = 0; __pyx_t_10 = 0; __pyx_t_11 = 0; __pyx_t_12 = 0;
+      __pyx_lineno = __pyx_t_4; __pyx_clineno = __pyx_t_5; __pyx_filename = __pyx_t_6; /* restore the error location for the traceback */
+      goto __pyx_L1_error;
+    }
+    __pyx_L7:;
+  }
+
+  /* "View.MemoryView":453
+ *         memoryview_copy_contents(msrc, mdst, src.ndim, dst.ndim, self.dtype_is_object)
+ * 
+ *     cdef setitem_slice_assign_scalar(self, memoryview dst, value):             # <<<<<<<<<<<<<<
+ *         cdef int array[128]
+ *         cdef void *tmp = NULL
+ */
+
+  /* function exit code: success yields a new reference to None; the error path releases the temporary, records a traceback entry and yields NULL */
+  __pyx_r = Py_None; __Pyx_INCREF(Py_None);
+  goto __pyx_L0;
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_3);
+  __Pyx_AddTraceback("View.MemoryView.memoryview.setitem_slice_assign_scalar", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = 0;
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":485
+ *             PyMem_Free(tmp)
+ * 
+ *     cdef setitem_indexed(self, index, value):             # <<<<<<<<<<<<<<
+ *         cdef char *itemp = self.get_item_pointer(index)
+ *         self.assign_item_from_object(itemp, value)
+ */
+/* setitem_indexed: resolves `index` to an item pointer through the vtable's get_item_pointer, then stores `value` there through assign_item_from_object. Returns a new reference to Py_None, or NULL after an error. */
+static PyObject *__pyx_memoryview_setitem_indexed(struct __pyx_memoryview_obj *__pyx_v_self, PyObject *__pyx_v_index, PyObject *__pyx_v_value) {
+  char *__pyx_v_itemp;
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  char *__pyx_t_1;
+  PyObject *__pyx_t_2 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("setitem_indexed", 1);
+
+  /* "View.MemoryView":486
+ * 
+ *     cdef setitem_indexed(self, index, value):
+ *         cdef char *itemp = self.get_item_pointer(index)             # <<<<<<<<<<<<<<
+ *         self.assign_item_from_object(itemp, value)
+ * 
+ */
+  __pyx_t_1 = ((struct __pyx_vtabstruct_memoryview *)__pyx_v_self->__pyx_vtab)->get_item_pointer(__pyx_v_self, __pyx_v_index); if (unlikely(__pyx_t_1 == ((char *)NULL))) __PYX_ERR(1, 486, __pyx_L1_error) /* a NULL pointer is treated as an error */
+  __pyx_v_itemp = __pyx_t_1;
+
+  /* "View.MemoryView":487
+ *     cdef setitem_indexed(self, index, value):
+ *         cdef char *itemp = self.get_item_pointer(index)
+ *         self.assign_item_from_object(itemp, value)             # <<<<<<<<<<<<<<
+ * 
+ *     cdef convert_item_to_object(self, char *itemp):
+ */
+  __pyx_t_2 = ((struct __pyx_vtabstruct_memoryview *)__pyx_v_self->__pyx_vtab)->assign_item_from_object(__pyx_v_self, __pyx_v_itemp, __pyx_v_value); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 487, __pyx_L1_error) /* dispatched through the vtable; a NULL result is treated as an error */
+  __Pyx_GOTREF(__pyx_t_2);
+  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; /* the call's result object is discarded */
+
+  /* "View.MemoryView":485
+ *             PyMem_Free(tmp)
+ * 
+ *     cdef setitem_indexed(self, index, value):             # <<<<<<<<<<<<<<
+ *         cdef char *itemp = self.get_item_pointer(index)
+ *         self.assign_item_from_object(itemp, value)
+ */
+
+  /* function exit code: success yields a new reference to None; the error path releases the temporary, records a traceback entry and yields NULL */
+  __pyx_r = Py_None; __Pyx_INCREF(Py_None);
+  goto __pyx_L0;
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_2);
+  __Pyx_AddTraceback("View.MemoryView.memoryview.setitem_indexed", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = 0;
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":489
+ *         self.assign_item_from_object(itemp, value)
+ * 
+ *     cdef convert_item_to_object(self, char *itemp):             # <<<<<<<<<<<<<<
+ *         """Only used if instantiated manually by the user, or if Cython doesn't
+ *         know how to convert the type"""
+ */
+
+static PyObject *__pyx_memoryview_convert_item_to_object(struct __pyx_memoryview_obj *__pyx_v_self, char *__pyx_v_itemp) {
+  PyObject *__pyx_v_struct = NULL;
+  PyObject *__pyx_v_bytesitem = 0;
+  PyObject *__pyx_v_result = NULL;
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  PyObject *__pyx_t_2 = NULL;
+  PyObject *__pyx_t_3 = NULL;
+  PyObject *__pyx_t_4 = NULL;
+  PyObject *__pyx_t_5 = NULL;
+  PyObject *__pyx_t_6 = NULL;
+  PyObject *__pyx_t_7 = NULL;
+  unsigned int __pyx_t_8;
+  Py_ssize_t __pyx_t_9;
+  int __pyx_t_10;
+  int __pyx_t_11;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("convert_item_to_object", 1);
+
+  /* "View.MemoryView":492
+ *         """Only used if instantiated manually by the user, or if Cython doesn't
+ *         know how to convert the type"""
+ *         import struct             # <<<<<<<<<<<<<<
+ *         cdef bytes bytesitem
+ * 
+ */
+  __pyx_t_1 = __Pyx_ImportDottedModule(__pyx_n_s_struct, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 492, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_v_struct = __pyx_t_1;
+  __pyx_t_1 = 0;
+
+  /* "View.MemoryView":495
+ *         cdef bytes bytesitem
+ * 
+ *         bytesitem = itemp[:self.view.itemsize]             # <<<<<<<<<<<<<<
+ *         try:
+ *             result = struct.unpack(self.view.format, bytesitem)
+ */
+  __pyx_t_1 = __Pyx_PyBytes_FromStringAndSize(__pyx_v_itemp + 0, __pyx_v_self->view.itemsize - 0); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 495, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_v_bytesitem = ((PyObject*)__pyx_t_1);
+  __pyx_t_1 = 0;
+
+  /* "View.MemoryView":496
+ * 
+ *         bytesitem = itemp[:self.view.itemsize]
+ *         try:             # <<<<<<<<<<<<<<
+ *             result = struct.unpack(self.view.format, bytesitem)
+ *         except struct.error:
+ */
+  {
+    __Pyx_PyThreadState_declare
+    __Pyx_PyThreadState_assign
+    __Pyx_ExceptionSave(&__pyx_t_2, &__pyx_t_3, &__pyx_t_4);
+    __Pyx_XGOTREF(__pyx_t_2);
+    __Pyx_XGOTREF(__pyx_t_3);
+    __Pyx_XGOTREF(__pyx_t_4);
+    /*try:*/ {
+
+      /* "View.MemoryView":497
+ *         bytesitem = itemp[:self.view.itemsize]
+ *         try:
+ *             result = struct.unpack(self.view.format, bytesitem)             # <<<<<<<<<<<<<<
+ *         except struct.error:
+ *             raise ValueError, "Unable to convert item to object"
+ */
+      __pyx_t_5 = __Pyx_PyObject_GetAttrStr(__pyx_v_struct, __pyx_n_s_unpack); if (unlikely(!__pyx_t_5)) __PYX_ERR(1, 497, __pyx_L3_error)
+      __Pyx_GOTREF(__pyx_t_5);
+      __pyx_t_6 = __Pyx_PyBytes_FromString(__pyx_v_self->view.format); if (unlikely(!__pyx_t_6)) __PYX_ERR(1, 497, __pyx_L3_error)
+      __Pyx_GOTREF(__pyx_t_6);
+      __pyx_t_7 = NULL;
+      __pyx_t_8 = 0;
+      #if CYTHON_UNPACK_METHODS
+      if (likely(PyMethod_Check(__pyx_t_5))) {
+        __pyx_t_7 = PyMethod_GET_SELF(__pyx_t_5);
+        if (likely(__pyx_t_7)) {
+          PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_5);
+          __Pyx_INCREF(__pyx_t_7);
+          __Pyx_INCREF(function);
+          __Pyx_DECREF_SET(__pyx_t_5, function);
+          __pyx_t_8 = 1;
+        }
+      }
+      #endif
+      {
+        PyObject *__pyx_callargs[3] = {__pyx_t_7, __pyx_t_6, __pyx_v_bytesitem};
+        __pyx_t_1 = __Pyx_PyObject_FastCall(__pyx_t_5, __pyx_callargs+1-__pyx_t_8, 2+__pyx_t_8);
+        __Pyx_XDECREF(__pyx_t_7); __pyx_t_7 = 0;
+        __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0;
+        if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 497, __pyx_L3_error)
+        __Pyx_GOTREF(__pyx_t_1);
+        __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
+      }
+      __pyx_v_result = __pyx_t_1;
+      __pyx_t_1 = 0;
+
+      /* "View.MemoryView":496
+ * 
+ *         bytesitem = itemp[:self.view.itemsize]
+ *         try:             # <<<<<<<<<<<<<<
+ *             result = struct.unpack(self.view.format, bytesitem)
+ *         except struct.error:
+ */
+    }
+
+    /* "View.MemoryView":501
+ *             raise ValueError, "Unable to convert item to object"
+ *         else:
+ *             if len(self.view.format) == 1:             # <<<<<<<<<<<<<<
+ *                 return result[0]
+ *             return result
+ */
+    /*else:*/ {
+      __pyx_t_9 = __Pyx_ssize_strlen(__pyx_v_self->view.format); if (unlikely(__pyx_t_9 == ((Py_ssize_t)-1))) __PYX_ERR(1, 501, __pyx_L5_except_error)
+      __pyx_t_10 = (__pyx_t_9 == 1);
+      if (__pyx_t_10) {
+
+        /* "View.MemoryView":502
+ *         else:
+ *             if len(self.view.format) == 1:
+ *                 return result[0]             # <<<<<<<<<<<<<<
+ *             return result
+ * 
+ */
+        __Pyx_XDECREF(__pyx_r);
+        __pyx_t_1 = __Pyx_GetItemInt(__pyx_v_result, 0, long, 1, __Pyx_PyInt_From_long, 0, 0, 1); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 502, __pyx_L5_except_error)
+        __Pyx_GOTREF(__pyx_t_1);
+        __pyx_r = __pyx_t_1;
+        __pyx_t_1 = 0;
+        goto __pyx_L6_except_return;
+
+        /* "View.MemoryView":501
+ *             raise ValueError, "Unable to convert item to object"
+ *         else:
+ *             if len(self.view.format) == 1:             # <<<<<<<<<<<<<<
+ *                 return result[0]
+ *             return result
+ */
+      }
+
+      /* "View.MemoryView":503
+ *             if len(self.view.format) == 1:
+ *                 return result[0]
+ *             return result             # <<<<<<<<<<<<<<
+ * 
+ *     cdef assign_item_from_object(self, char *itemp, object value):
+ */
+      __Pyx_XDECREF(__pyx_r);
+      __Pyx_INCREF(__pyx_v_result);
+      __pyx_r = __pyx_v_result;
+      goto __pyx_L6_except_return;
+    }
+    __pyx_L3_error:;
+    __Pyx_XDECREF(__pyx_t_1); __pyx_t_1 = 0;
+    __Pyx_XDECREF(__pyx_t_5); __pyx_t_5 = 0;
+    __Pyx_XDECREF(__pyx_t_6); __pyx_t_6 = 0;
+    __Pyx_XDECREF(__pyx_t_7); __pyx_t_7 = 0;
+
+    /* "View.MemoryView":498
+ *         try:
+ *             result = struct.unpack(self.view.format, bytesitem)
+ *         except struct.error:             # <<<<<<<<<<<<<<
+ *             raise ValueError, "Unable to convert item to object"
+ *         else:
+ */
+    __Pyx_ErrFetch(&__pyx_t_1, &__pyx_t_5, &__pyx_t_6);
+    __pyx_t_7 = __Pyx_PyObject_GetAttrStr(__pyx_v_struct, __pyx_n_s_error); if (unlikely(!__pyx_t_7)) __PYX_ERR(1, 498, __pyx_L5_except_error)
+    __Pyx_GOTREF(__pyx_t_7);
+    __pyx_t_11 = __Pyx_PyErr_GivenExceptionMatches(__pyx_t_1, __pyx_t_7);
+    __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0;
+    __Pyx_ErrRestore(__pyx_t_1, __pyx_t_5, __pyx_t_6);
+    __pyx_t_1 = 0; __pyx_t_5 = 0; __pyx_t_6 = 0;
+    if (__pyx_t_11) {
+      __Pyx_AddTraceback("View.MemoryView.memoryview.convert_item_to_object", __pyx_clineno, __pyx_lineno, __pyx_filename);
+      if (__Pyx_GetException(&__pyx_t_6, &__pyx_t_5, &__pyx_t_1) < 0) __PYX_ERR(1, 498, __pyx_L5_except_error)
+      __Pyx_XGOTREF(__pyx_t_6);
+      __Pyx_XGOTREF(__pyx_t_5);
+      __Pyx_XGOTREF(__pyx_t_1);
+
+      /* "View.MemoryView":499
+ *             result = struct.unpack(self.view.format, bytesitem)
+ *         except struct.error:
+ *             raise ValueError, "Unable to convert item to object"             # <<<<<<<<<<<<<<
+ *         else:
+ *             if len(self.view.format) == 1:
+ */
+      __Pyx_Raise(__pyx_builtin_ValueError, __pyx_kp_s_Unable_to_convert_item_to_object, 0, 0);
+      __PYX_ERR(1, 499, __pyx_L5_except_error)
+    }
+    goto __pyx_L5_except_error;
+
+    /* "View.MemoryView":496
+ * 
+ *         bytesitem = itemp[:self.view.itemsize]
+ *         try:             # <<<<<<<<<<<<<<
+ *             result = struct.unpack(self.view.format, bytesitem)
+ *         except struct.error:
+ */
+    __pyx_L5_except_error:;
+    __Pyx_XGIVEREF(__pyx_t_2);
+    __Pyx_XGIVEREF(__pyx_t_3);
+    __Pyx_XGIVEREF(__pyx_t_4);
+    __Pyx_ExceptionReset(__pyx_t_2, __pyx_t_3, __pyx_t_4);
+    goto __pyx_L1_error;
+    __pyx_L6_except_return:;
+    __Pyx_XGIVEREF(__pyx_t_2);
+    __Pyx_XGIVEREF(__pyx_t_3);
+    __Pyx_XGIVEREF(__pyx_t_4);
+    __Pyx_ExceptionReset(__pyx_t_2, __pyx_t_3, __pyx_t_4);
+    goto __pyx_L0;
+  }
+
+  /* "View.MemoryView":489
+ *         self.assign_item_from_object(itemp, value)
+ * 
+ *     cdef convert_item_to_object(self, char *itemp):             # <<<<<<<<<<<<<<
+ *         """Only used if instantiated manually by the user, or if Cython doesn't
+ *         know how to convert the type"""
+ */
+
+  /* function exit code */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_XDECREF(__pyx_t_5);
+  __Pyx_XDECREF(__pyx_t_6);
+  __Pyx_XDECREF(__pyx_t_7);
+  __Pyx_AddTraceback("View.MemoryView.memoryview.convert_item_to_object", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = 0;
+  __pyx_L0:;
+  __Pyx_XDECREF(__pyx_v_struct);
+  __Pyx_XDECREF(__pyx_v_bytesitem);
+  __Pyx_XDECREF(__pyx_v_result);
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":505
+ *             return result
+ * 
+ *     cdef assign_item_from_object(self, char *itemp, object value):             # <<<<<<<<<<<<<<
+ *         """Only used if instantiated manually by the user, or if Cython doesn't
+ *         know how to convert the type"""
+ */
+
+static PyObject *__pyx_memoryview_assign_item_from_object(struct __pyx_memoryview_obj *__pyx_v_self, char *__pyx_v_itemp, PyObject *__pyx_v_value) { /* Packs value via struct.pack(self->view.format, ...) and copies the packed bytes to itemp; returns a new reference to None, or NULL on error. */
+  PyObject *__pyx_v_struct = NULL;
+  char __pyx_v_c;
+  PyObject *__pyx_v_bytesvalue = 0;
+  Py_ssize_t __pyx_v_i;
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  int __pyx_t_2;
+  PyObject *__pyx_t_3 = NULL;
+  PyObject *__pyx_t_4 = NULL;
+  PyObject *__pyx_t_5 = NULL;
+  unsigned int __pyx_t_6;
+  Py_ssize_t __pyx_t_7;
+  PyObject *__pyx_t_8 = NULL;
+  char *__pyx_t_9;
+  char *__pyx_t_10;
+  char *__pyx_t_11;
+  char *__pyx_t_12;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("assign_item_from_object", 1);
+
+  /* "View.MemoryView":508
+ *         """Only used if instantiated manually by the user, or if Cython doesn't
+ *         know how to convert the type"""
+ *         import struct             # <<<<<<<<<<<<<<
+ *         cdef char c
+ *         cdef bytes bytesvalue
+ */
+  __pyx_t_1 = __Pyx_ImportDottedModule(__pyx_n_s_struct, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 508, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_v_struct = __pyx_t_1; /* the module reference now belongs to the local variable; the temp is cleared next */
+  __pyx_t_1 = 0;
+
+  /* "View.MemoryView":513
+ *         cdef Py_ssize_t i
+ * 
+ *         if isinstance(value, tuple):             # <<<<<<<<<<<<<<
+ *             bytesvalue = struct.pack(self.view.format, *value)
+ *         else:
+ */
+  __pyx_t_2 = PyTuple_Check(__pyx_v_value); 
+  if (__pyx_t_2) {
+
+    /* "View.MemoryView":514
+ * 
+ *         if isinstance(value, tuple):
+ *             bytesvalue = struct.pack(self.view.format, *value)             # <<<<<<<<<<<<<<
+ *         else:
+ *             bytesvalue = struct.pack(self.view.format, value)
+ */
+    __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_struct, __pyx_n_s_pack); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 514, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_1);
+    __pyx_t_3 = __Pyx_PyBytes_FromString(__pyx_v_self->view.format); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 514, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_3);
+    __pyx_t_4 = PyTuple_New(1); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 514, __pyx_L1_error) /* 1-tuple that will hold the format bytes */
+    __Pyx_GOTREF(__pyx_t_4);
+    __Pyx_GIVEREF(__pyx_t_3);
+    if (__Pyx_PyTuple_SET_ITEM(__pyx_t_4, 0, __pyx_t_3)) __PYX_ERR(1, 514, __pyx_L1_error); /* the format reference is handed to the tuple; the temp is cleared next */
+    __pyx_t_3 = 0;
+    __pyx_t_3 = __Pyx_PySequence_Tuple(__pyx_v_value); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 514, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_3);
+    __pyx_t_5 = PyNumber_Add(__pyx_t_4, __pyx_t_3); if (unlikely(!__pyx_t_5)) __PYX_ERR(1, 514, __pyx_L1_error) /* (format,) + tuple(value): the positional arguments for struct.pack */
+    __Pyx_GOTREF(__pyx_t_5);
+    __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+    __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+    __pyx_t_3 = __Pyx_PyObject_Call(__pyx_t_1, __pyx_t_5, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 514, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_3);
+    __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+    __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
+    if (!(likely(PyBytes_CheckExact(__pyx_t_3))||((__pyx_t_3) == Py_None) || __Pyx_RaiseUnexpectedTypeError("bytes", __pyx_t_3))) __PYX_ERR(1, 514, __pyx_L1_error) /* only an exact bytes object or None may be stored in the 'cdef bytes' variable */
+    __pyx_v_bytesvalue = ((PyObject*)__pyx_t_3);
+    __pyx_t_3 = 0;
+
+    /* "View.MemoryView":513
+ *         cdef Py_ssize_t i
+ * 
+ *         if isinstance(value, tuple):             # <<<<<<<<<<<<<<
+ *             bytesvalue = struct.pack(self.view.format, *value)
+ *         else:
+ */
+    goto __pyx_L3;
+  }
+
+  /* "View.MemoryView":516
+ *             bytesvalue = struct.pack(self.view.format, *value)
+ *         else:
+ *             bytesvalue = struct.pack(self.view.format, value)             # <<<<<<<<<<<<<<
+ * 
+ *         for i, c in enumerate(bytesvalue):
+ */
+  /*else*/ {
+    __pyx_t_5 = __Pyx_PyObject_GetAttrStr(__pyx_v_struct, __pyx_n_s_pack); if (unlikely(!__pyx_t_5)) __PYX_ERR(1, 516, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_5);
+    __pyx_t_1 = __Pyx_PyBytes_FromString(__pyx_v_self->view.format); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 516, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_1);
+    __pyx_t_4 = NULL;
+    __pyx_t_6 = 0;
+    #if CYTHON_UNPACK_METHODS
+    if (likely(PyMethod_Check(__pyx_t_5))) { /* bound method: split it into its self object and underlying function */
+      __pyx_t_4 = PyMethod_GET_SELF(__pyx_t_5);
+      if (likely(__pyx_t_4)) {
+        PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_5);
+        __Pyx_INCREF(__pyx_t_4);
+        __Pyx_INCREF(function);
+        __Pyx_DECREF_SET(__pyx_t_5, function);
+        __pyx_t_6 = 1; /* one extra leading argument (the unpacked self) */
+      }
+    }
+    #endif
+    {
+      PyObject *__pyx_callargs[3] = {__pyx_t_4, __pyx_t_1, __pyx_v_value}; /* slot 0 is the unpacked self, or NULL and skipped by the pointer offset below */
+      __pyx_t_3 = __Pyx_PyObject_FastCall(__pyx_t_5, __pyx_callargs+1-__pyx_t_6, 2+__pyx_t_6);
+      __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0;
+      __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+      if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 516, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_3);
+      __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
+    }
+    if (!(likely(PyBytes_CheckExact(__pyx_t_3))||((__pyx_t_3) == Py_None) || __Pyx_RaiseUnexpectedTypeError("bytes", __pyx_t_3))) __PYX_ERR(1, 516, __pyx_L1_error) /* same exact-bytes-or-None check as in the tuple branch */
+    __pyx_v_bytesvalue = ((PyObject*)__pyx_t_3);
+    __pyx_t_3 = 0;
+  }
+  __pyx_L3:;
+
+  /* "View.MemoryView":518
+ *             bytesvalue = struct.pack(self.view.format, value)
+ * 
+ *         for i, c in enumerate(bytesvalue):             # <<<<<<<<<<<<<<
+ *             itemp[i] = c
+ * 
+ */
+  __pyx_t_7 = 0; /* running enumerate() index */
+  if (unlikely(__pyx_v_bytesvalue == Py_None)) { /* None passed the type check above but cannot be iterated */
+    PyErr_SetString(PyExc_TypeError, "'NoneType' is not iterable");
+    __PYX_ERR(1, 518, __pyx_L1_error)
+  }
+  __Pyx_INCREF(__pyx_v_bytesvalue); /* extra reference held in __pyx_t_8 for the duration of the loop */
+  __pyx_t_8 = __pyx_v_bytesvalue;
+  __pyx_t_10 = PyBytes_AS_STRING(__pyx_t_8);
+  __pyx_t_11 = (__pyx_t_10 + PyBytes_GET_SIZE(__pyx_t_8)); /* one-past-the-end pointer of the packed bytes */
+  for (__pyx_t_12 = __pyx_t_10; __pyx_t_12 < __pyx_t_11; __pyx_t_12++) {
+    __pyx_t_9 = __pyx_t_12;
+    __pyx_v_c = (__pyx_t_9[0]);
+
+    /* "View.MemoryView":519
+ * 
+ *         for i, c in enumerate(bytesvalue):
+ *             itemp[i] = c             # <<<<<<<<<<<<<<
+ * 
+ *     @cname('getbuffer')
+ */
+    __pyx_v_i = __pyx_t_7;
+
+    /* "View.MemoryView":518
+ *             bytesvalue = struct.pack(self.view.format, value)
+ * 
+ *         for i, c in enumerate(bytesvalue):             # <<<<<<<<<<<<<<
+ *             itemp[i] = c
+ * 
+ */
+    __pyx_t_7 = (__pyx_t_7 + 1);
+
+    /* "View.MemoryView":519
+ * 
+ *         for i, c in enumerate(bytesvalue):
+ *             itemp[i] = c             # <<<<<<<<<<<<<<
+ * 
+ *     @cname('getbuffer')
+ */
+    (__pyx_v_itemp[__pyx_v_i]) = __pyx_v_c; /* no bounds check here: one byte is written per packed byte, starting at itemp[0] */
+  }
+  __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0;
+
+  /* "View.MemoryView":505
+ *             return result
+ * 
+ *     cdef assign_item_from_object(self, char *itemp, object value):             # <<<<<<<<<<<<<<
+ *         """Only used if instantiated manually by the user, or if Cython doesn't
+ *         know how to convert the type"""
+ */
+
+  /* function exit code: success returns None; __pyx_L1_error drops live temporaries, adds a traceback frame and returns NULL; __pyx_L0 releases the locals on both paths */
+  __pyx_r = Py_None; __Pyx_INCREF(Py_None);
+  goto __pyx_L0;
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_XDECREF(__pyx_t_3);
+  __Pyx_XDECREF(__pyx_t_4);
+  __Pyx_XDECREF(__pyx_t_5);
+  __Pyx_XDECREF(__pyx_t_8);
+  __Pyx_AddTraceback("View.MemoryView.memoryview.assign_item_from_object", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = 0; /* NULL result signals the error to the caller */
+  __pyx_L0:;
+  __Pyx_XDECREF(__pyx_v_struct);
+  __Pyx_XDECREF(__pyx_v_bytesvalue);
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":521
+ *             itemp[i] = c
+ * 
+ *     @cname('getbuffer')             # <<<<<<<<<<<<<<
+ *     def __getbuffer__(self, Py_buffer *info, int flags):
+ *         if flags & PyBUF_WRITABLE and self.view.readonly:
+ */
+
+/* Python wrapper: casts self to struct __pyx_memoryview_obj * and forwards info and flags to the __getbuffer__ implementation, returning its int result unchanged */
+CYTHON_UNUSED static int __pyx_memoryview_getbuffer(PyObject *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags); /*proto*/
+CYTHON_UNUSED static int __pyx_memoryview_getbuffer(PyObject *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags) {
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  int __pyx_r;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__getbuffer__ (wrapper)", 0);
+  __pyx_kwvalues = __Pyx_KwValues_VARARGS(__pyx_args, __pyx_nargs); /* NOTE(review): __pyx_args/__pyx_nargs are not declared in this function; presumably the macro discards its arguments - confirm against its definition */
+  __pyx_r = __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_8__getbuffer__(((struct __pyx_memoryview_obj *)__pyx_v_self), ((Py_buffer *)__pyx_v_info), ((int)__pyx_v_flags));
+
+  /* function exit code: nothing to release here, the implementation's result is passed straight through */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static int __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_8__getbuffer__(struct __pyx_memoryview_obj *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags) { /* Fills *info from self->view according to flags; returns 0 on success, -1 with an exception raised on failure. */
+  int __pyx_r;
+  __Pyx_RefNannyDeclarations
+  int __pyx_t_1;
+  int __pyx_t_2;
+  Py_ssize_t *__pyx_t_3;
+  char *__pyx_t_4;
+  void *__pyx_t_5;
+  int __pyx_t_6;
+  Py_ssize_t __pyx_t_7;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  if (unlikely(__pyx_v_info == NULL)) { /* a NULL view is rejected before any other work */
+    PyErr_SetString(PyExc_BufferError, "PyObject_GetBuffer: view==NULL argument is obsolete");
+    return -1;
+  }
+  __Pyx_RefNannySetupContext("__getbuffer__", 0);
+  __pyx_v_info->obj = Py_None; __Pyx_INCREF(Py_None); /* placeholder owner; replaced by self further down, released on the error path */
+  __Pyx_GIVEREF(__pyx_v_info->obj);
+
+  /* "View.MemoryView":523
+ *     @cname('getbuffer')
+ *     def __getbuffer__(self, Py_buffer *info, int flags):
+ *         if flags & PyBUF_WRITABLE and self.view.readonly:             # <<<<<<<<<<<<<<
+ *             raise ValueError, "Cannot create writable memory view from read-only memoryview"
+ * 
+ */
+  __pyx_t_2 = ((__pyx_v_flags & PyBUF_WRITABLE) != 0);
+  if (__pyx_t_2) {
+  } else {
+    __pyx_t_1 = __pyx_t_2; /* short-circuit of 'and': readonly is not consulted when WRITABLE was not requested */
+    goto __pyx_L4_bool_binop_done;
+  }
+  __pyx_t_1 = __pyx_v_self->view.readonly;
+  __pyx_L4_bool_binop_done:;
+  if (unlikely(__pyx_t_1)) {
+
+    /* "View.MemoryView":524
+ *     def __getbuffer__(self, Py_buffer *info, int flags):
+ *         if flags & PyBUF_WRITABLE and self.view.readonly:
+ *             raise ValueError, "Cannot create writable memory view from read-only memoryview"             # <<<<<<<<<<<<<<
+ * 
+ *         if flags & PyBUF_ND:
+ */
+    __Pyx_Raise(__pyx_builtin_ValueError, __pyx_kp_s_Cannot_create_writable_memory_vi, 0, 0);
+    __PYX_ERR(1, 524, __pyx_L1_error)
+
+    /* "View.MemoryView":523
+ *     @cname('getbuffer')
+ *     def __getbuffer__(self, Py_buffer *info, int flags):
+ *         if flags & PyBUF_WRITABLE and self.view.readonly:             # <<<<<<<<<<<<<<
+ *             raise ValueError, "Cannot create writable memory view from read-only memoryview"
+ * 
+ */
+  }
+
+  /* "View.MemoryView":526
+ *             raise ValueError, "Cannot create writable memory view from read-only memoryview"
+ * 
+ *         if flags & PyBUF_ND:             # <<<<<<<<<<<<<<
+ *             info.shape = self.view.shape
+ *         else:
+ */
+  __pyx_t_1 = ((__pyx_v_flags & PyBUF_ND) != 0);
+  if (__pyx_t_1) {
+
+    /* "View.MemoryView":527
+ * 
+ *         if flags & PyBUF_ND:
+ *             info.shape = self.view.shape             # <<<<<<<<<<<<<<
+ *         else:
+ *             info.shape = NULL
+ */
+    __pyx_t_3 = __pyx_v_self->view.shape;
+    __pyx_v_info->shape = __pyx_t_3; /* pointer is shared with self->view, the array is not copied */
+
+    /* "View.MemoryView":526
+ *             raise ValueError, "Cannot create writable memory view from read-only memoryview"
+ * 
+ *         if flags & PyBUF_ND:             # <<<<<<<<<<<<<<
+ *             info.shape = self.view.shape
+ *         else:
+ */
+    goto __pyx_L6;
+  }
+
+  /* "View.MemoryView":529
+ *             info.shape = self.view.shape
+ *         else:
+ *             info.shape = NULL             # <<<<<<<<<<<<<<
+ * 
+ *         if flags & PyBUF_STRIDES:
+ */
+  /*else*/ {
+    __pyx_v_info->shape = NULL; /* shape was not requested */
+  }
+  __pyx_L6:;
+
+  /* "View.MemoryView":531
+ *             info.shape = NULL
+ * 
+ *         if flags & PyBUF_STRIDES:             # <<<<<<<<<<<<<<
+ *             info.strides = self.view.strides
+ *         else:
+ */
+  __pyx_t_1 = ((__pyx_v_flags & PyBUF_STRIDES) != 0);
+  if (__pyx_t_1) {
+
+    /* "View.MemoryView":532
+ * 
+ *         if flags & PyBUF_STRIDES:
+ *             info.strides = self.view.strides             # <<<<<<<<<<<<<<
+ *         else:
+ *             info.strides = NULL
+ */
+    __pyx_t_3 = __pyx_v_self->view.strides;
+    __pyx_v_info->strides = __pyx_t_3; /* pointer is shared with self->view, the array is not copied */
+
+    /* "View.MemoryView":531
+ *             info.shape = NULL
+ * 
+ *         if flags & PyBUF_STRIDES:             # <<<<<<<<<<<<<<
+ *             info.strides = self.view.strides
+ *         else:
+ */
+    goto __pyx_L7;
+  }
+
+  /* "View.MemoryView":534
+ *             info.strides = self.view.strides
+ *         else:
+ *             info.strides = NULL             # <<<<<<<<<<<<<<
+ * 
+ *         if flags & PyBUF_INDIRECT:
+ */
+  /*else*/ {
+    __pyx_v_info->strides = NULL; /* strides were not requested */
+  }
+  __pyx_L7:;
+
+  /* "View.MemoryView":536
+ *             info.strides = NULL
+ * 
+ *         if flags & PyBUF_INDIRECT:             # <<<<<<<<<<<<<<
+ *             info.suboffsets = self.view.suboffsets
+ *         else:
+ */
+  __pyx_t_1 = ((__pyx_v_flags & PyBUF_INDIRECT) != 0);
+  if (__pyx_t_1) {
+
+    /* "View.MemoryView":537
+ * 
+ *         if flags & PyBUF_INDIRECT:
+ *             info.suboffsets = self.view.suboffsets             # <<<<<<<<<<<<<<
+ *         else:
+ *             info.suboffsets = NULL
+ */
+    __pyx_t_3 = __pyx_v_self->view.suboffsets;
+    __pyx_v_info->suboffsets = __pyx_t_3; /* pointer is shared with self->view, the array is not copied */
+
+    /* "View.MemoryView":536
+ *             info.strides = NULL
+ * 
+ *         if flags & PyBUF_INDIRECT:             # <<<<<<<<<<<<<<
+ *             info.suboffsets = self.view.suboffsets
+ *         else:
+ */
+    goto __pyx_L8;
+  }
+
+  /* "View.MemoryView":539
+ *             info.suboffsets = self.view.suboffsets
+ *         else:
+ *             info.suboffsets = NULL             # <<<<<<<<<<<<<<
+ * 
+ *         if flags & PyBUF_FORMAT:
+ */
+  /*else*/ {
+    __pyx_v_info->suboffsets = NULL; /* suboffsets were not requested */
+  }
+  __pyx_L8:;
+
+  /* "View.MemoryView":541
+ *             info.suboffsets = NULL
+ * 
+ *         if flags & PyBUF_FORMAT:             # <<<<<<<<<<<<<<
+ *             info.format = self.view.format
+ *         else:
+ */
+  __pyx_t_1 = ((__pyx_v_flags & PyBUF_FORMAT) != 0);
+  if (__pyx_t_1) {
+
+    /* "View.MemoryView":542
+ * 
+ *         if flags & PyBUF_FORMAT:
+ *             info.format = self.view.format             # <<<<<<<<<<<<<<
+ *         else:
+ *             info.format = NULL
+ */
+    __pyx_t_4 = __pyx_v_self->view.format;
+    __pyx_v_info->format = __pyx_t_4; /* pointer is shared with self->view, the string is not copied */
+
+    /* "View.MemoryView":541
+ *             info.suboffsets = NULL
+ * 
+ *         if flags & PyBUF_FORMAT:             # <<<<<<<<<<<<<<
+ *             info.format = self.view.format
+ *         else:
+ */
+    goto __pyx_L9;
+  }
+
+  /* "View.MemoryView":544
+ *             info.format = self.view.format
+ *         else:
+ *             info.format = NULL             # <<<<<<<<<<<<<<
+ * 
+ *         info.buf = self.view.buf
+ */
+  /*else*/ {
+    __pyx_v_info->format = NULL; /* format was not requested */
+  }
+  __pyx_L9:;
+
+  /* "View.MemoryView":546
+ *             info.format = NULL
+ * 
+ *         info.buf = self.view.buf             # <<<<<<<<<<<<<<
+ *         info.ndim = self.view.ndim
+ *         info.itemsize = self.view.itemsize
+ */
+  __pyx_t_5 = __pyx_v_self->view.buf; /* from here on the fields are copied regardless of flags */
+  __pyx_v_info->buf = __pyx_t_5;
+
+  /* "View.MemoryView":547
+ * 
+ *         info.buf = self.view.buf
+ *         info.ndim = self.view.ndim             # <<<<<<<<<<<<<<
+ *         info.itemsize = self.view.itemsize
+ *         info.len = self.view.len
+ */
+  __pyx_t_6 = __pyx_v_self->view.ndim;
+  __pyx_v_info->ndim = __pyx_t_6;
+
+  /* "View.MemoryView":548
+ *         info.buf = self.view.buf
+ *         info.ndim = self.view.ndim
+ *         info.itemsize = self.view.itemsize             # <<<<<<<<<<<<<<
+ *         info.len = self.view.len
+ *         info.readonly = self.view.readonly
+ */
+  __pyx_t_7 = __pyx_v_self->view.itemsize;
+  __pyx_v_info->itemsize = __pyx_t_7;
+
+  /* "View.MemoryView":549
+ *         info.ndim = self.view.ndim
+ *         info.itemsize = self.view.itemsize
+ *         info.len = self.view.len             # <<<<<<<<<<<<<<
+ *         info.readonly = self.view.readonly
+ *         info.obj = self
+ */
+  __pyx_t_7 = __pyx_v_self->view.len;
+  __pyx_v_info->len = __pyx_t_7;
+
+  /* "View.MemoryView":550
+ *         info.itemsize = self.view.itemsize
+ *         info.len = self.view.len
+ *         info.readonly = self.view.readonly             # <<<<<<<<<<<<<<
+ *         info.obj = self
+ * 
+ */
+  __pyx_t_1 = __pyx_v_self->view.readonly;
+  __pyx_v_info->readonly = __pyx_t_1;
+
+  /* "View.MemoryView":551
+ *         info.len = self.view.len
+ *         info.readonly = self.view.readonly
+ *         info.obj = self             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  __Pyx_INCREF((PyObject *)__pyx_v_self); /* new reference to self that info->obj will own */
+  __Pyx_GIVEREF((PyObject *)__pyx_v_self);
+  __Pyx_GOTREF(__pyx_v_info->obj);
+  __Pyx_DECREF(__pyx_v_info->obj); /* release the previous owner (the None placeholder set at entry) */
+  __pyx_v_info->obj = ((PyObject *)__pyx_v_self);
+
+  /* "View.MemoryView":521
+ *             itemp[i] = c
+ * 
+ *     @cname('getbuffer')             # <<<<<<<<<<<<<<
+ *     def __getbuffer__(self, Py_buffer *info, int flags):
+ *         if flags & PyBUF_WRITABLE and self.view.readonly:
+ */
+
+  /* function exit code: 0 on success, -1 on error; info->obj is cleared on error, and on success only if it is still the None placeholder */
+  __pyx_r = 0;
+  goto __pyx_L0;
+  __pyx_L1_error:;
+  __Pyx_AddTraceback("View.MemoryView.memoryview.__getbuffer__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = -1;
+  if (__pyx_v_info->obj != NULL) { /* error: drop whatever owner reference info currently holds */
+    __Pyx_GOTREF(__pyx_v_info->obj);
+    __Pyx_DECREF(__pyx_v_info->obj); __pyx_v_info->obj = 0;
+  }
+  goto __pyx_L2;
+  __pyx_L0:;
+  if (__pyx_v_info->obj == Py_None) { /* success with the placeholder still in place: release it and leave obj NULL */
+    __Pyx_GOTREF(__pyx_v_info->obj);
+    __Pyx_DECREF(__pyx_v_info->obj); __pyx_v_info->obj = 0;
+  }
+  __pyx_L2:;
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":554
+ * 
+ * 
+ *     @property             # <<<<<<<<<<<<<<
+ *     def T(self):
+ *         cdef _memoryviewslice result = memoryview_copy(self)
+ */
+
+/* Python wrapper: getter entry point for memoryview.T; casts self to struct __pyx_memoryview_obj * and returns the implementation's result unchanged */
+static PyObject *__pyx_pw_15View_dot_MemoryView_10memoryview_1T_1__get__(PyObject *__pyx_v_self); /*proto*/
+static PyObject *__pyx_pw_15View_dot_MemoryView_10memoryview_1T_1__get__(PyObject *__pyx_v_self) {
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__get__ (wrapper)", 0);
+  __pyx_kwvalues = __Pyx_KwValues_VARARGS(__pyx_args, __pyx_nargs); /* NOTE(review): __pyx_args/__pyx_nargs are not declared in this function; presumably the macro discards its arguments - confirm against its definition */
+  __pyx_r = __pyx_pf_15View_dot_MemoryView_10memoryview_1T___get__(((struct __pyx_memoryview_obj *)__pyx_v_self));
+
+  /* function exit code: nothing to release here, the implementation's result is passed straight through */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static PyObject *__pyx_pf_15View_dot_MemoryView_10memoryview_1T___get__(struct __pyx_memoryview_obj *__pyx_v_self) { /* Copies self into a new memoryview slice object, transposes that copy's slice and returns it (new reference); NULL on error. */
+  struct __pyx_memoryviewslice_obj *__pyx_v_result = 0;
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  int __pyx_t_2;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("__get__", 1);
+
+  /* "View.MemoryView":556
+ *     @property
+ *     def T(self):
+ *         cdef _memoryviewslice result = memoryview_copy(self)             # <<<<<<<<<<<<<<
+ *         transpose_memslice(&result.from_slice)
+ *         return result
+ */
+  __pyx_t_1 = __pyx_memoryview_copy_object(__pyx_v_self); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 556, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  if (!(likely(((__pyx_t_1) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_1, __pyx_memoryviewslice_type))))) __PYX_ERR(1, 556, __pyx_L1_error) /* accepts None or an instance passing the _memoryviewslice type test */
+  __pyx_v_result = ((struct __pyx_memoryviewslice_obj *)__pyx_t_1); /* the reference moves from the temp to the typed local */
+  __pyx_t_1 = 0;
+
+  /* "View.MemoryView":557
+ *     def T(self):
+ *         cdef _memoryviewslice result = memoryview_copy(self)
+ *         transpose_memslice(&result.from_slice)             # <<<<<<<<<<<<<<
+ *         return result
+ * 
+ */
+  __pyx_t_2 = __pyx_memslice_transpose((&__pyx_v_result->from_slice)); if (unlikely(__pyx_t_2 == ((int)-1))) __PYX_ERR(1, 557, __pyx_L1_error) /* -1 is the error return; NOTE(review): the check above admits None but from_slice is taken unconditionally - presumably memoryview_copy never returns None, confirm */
+
+  /* "View.MemoryView":558
+ *         cdef _memoryviewslice result = memoryview_copy(self)
+ *         transpose_memslice(&result.from_slice)
+ *         return result             # <<<<<<<<<<<<<<
+ * 
+ *     @property
+ */
+  __Pyx_XDECREF(__pyx_r);
+  __Pyx_INCREF((PyObject *)__pyx_v_result); /* the return value gets its own reference; the local's reference is dropped at __pyx_L0 */
+  __pyx_r = ((PyObject *)__pyx_v_result);
+  goto __pyx_L0;
+
+  /* "View.MemoryView":554
+ * 
+ * 
+ *     @property             # <<<<<<<<<<<<<<
+ *     def T(self):
+ *         cdef _memoryviewslice result = memoryview_copy(self)
+ */
+
+  /* function exit code: the error path drops the temp, adds a traceback frame and returns NULL; __pyx_L0 releases the local on both paths */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_AddTraceback("View.MemoryView.memoryview.T.__get__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __pyx_L0:;
+  __Pyx_XDECREF((PyObject *)__pyx_v_result);
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":560
+ *         return result
+ * 
+ *     @property             # <<<<<<<<<<<<<<
+ *     def base(self):
+ *         return self._get_base()
+ */
+
+/* Python wrapper: getter entry point for memoryview.base; casts self to struct __pyx_memoryview_obj * and returns the implementation's result unchanged */
+static PyObject *__pyx_pw_15View_dot_MemoryView_10memoryview_4base_1__get__(PyObject *__pyx_v_self); /*proto*/
+static PyObject *__pyx_pw_15View_dot_MemoryView_10memoryview_4base_1__get__(PyObject *__pyx_v_self) {
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__get__ (wrapper)", 0);
+  __pyx_kwvalues = __Pyx_KwValues_VARARGS(__pyx_args, __pyx_nargs); /* NOTE(review): __pyx_args/__pyx_nargs are not declared in this function; presumably the macro discards its arguments - confirm against its definition */
+  __pyx_r = __pyx_pf_15View_dot_MemoryView_10memoryview_4base___get__(((struct __pyx_memoryview_obj *)__pyx_v_self));
+
+  /* function exit code: nothing to release here, the implementation's result is passed straight through */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static PyObject *__pyx_pf_15View_dot_MemoryView_10memoryview_4base___get__(struct __pyx_memoryview_obj *__pyx_v_self) { /* Returns the result of self._get_base() (new reference), or NULL on error. */
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("__get__", 1);
+
+  /* "View.MemoryView":562
+ *     @property
+ *     def base(self):
+ *         return self._get_base()             # <<<<<<<<<<<<<<
+ * 
+ *     cdef _get_base(self):
+ */
+  __Pyx_XDECREF(__pyx_r);
+  __pyx_t_1 = ((struct __pyx_vtabstruct_memoryview *)__pyx_v_self->__pyx_vtab)->_get_base(__pyx_v_self); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 562, __pyx_L1_error) /* dispatched through the object's vtable slot rather than called directly */
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_r = __pyx_t_1; /* the reference moves from the temp to the return value */
+  __pyx_t_1 = 0;
+  goto __pyx_L0;
+
+  /* "View.MemoryView":560
+ *         return result
+ * 
+ *     @property             # <<<<<<<<<<<<<<
+ *     def base(self):
+ *         return self._get_base()
+ */
+
+  /* function exit code: the error path drops the temp, adds a traceback frame and returns NULL */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_AddTraceback("View.MemoryView.memoryview.base.__get__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":564
+ *         return self._get_base()
+ * 
+ *     cdef _get_base(self):             # <<<<<<<<<<<<<<
+ *         return self.obj
+ * 
+ */
+
+static PyObject *__pyx_memoryview__get_base(struct __pyx_memoryview_obj *__pyx_v_self) { /* Returns a new reference to self->obj; this function has no error path. */
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("_get_base", 1);
+
+  /* "View.MemoryView":565
+ * 
+ *     cdef _get_base(self):
+ *         return self.obj             # <<<<<<<<<<<<<<
+ * 
+ *     @property
+ */
+  __Pyx_XDECREF(__pyx_r);
+  __Pyx_INCREF(__pyx_v_self->obj); /* the caller receives its own reference; self keeps the one it already holds */
+  __pyx_r = __pyx_v_self->obj;
+  goto __pyx_L0;
+
+  /* "View.MemoryView":564
+ *         return self._get_base()
+ * 
+ *     cdef _get_base(self):             # <<<<<<<<<<<<<<
+ *         return self.obj
+ * 
+ */
+
+  /* function exit code: single exit, no temporaries to release */
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":567
+ *         return self.obj
+ * 
+ *     @property             # <<<<<<<<<<<<<<
+ *     def shape(self):
+ *         return tuple([length for length in self.view.shape[:self.view.ndim]])
+ */
+
+/* Python wrapper: getter entry point for memoryview.shape; casts self to struct __pyx_memoryview_obj * and returns the implementation's result unchanged */
+static PyObject *__pyx_pw_15View_dot_MemoryView_10memoryview_5shape_1__get__(PyObject *__pyx_v_self); /*proto*/
+static PyObject *__pyx_pw_15View_dot_MemoryView_10memoryview_5shape_1__get__(PyObject *__pyx_v_self) {
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__get__ (wrapper)", 0);
+  __pyx_kwvalues = __Pyx_KwValues_VARARGS(__pyx_args, __pyx_nargs); /* NOTE(review): __pyx_args/__pyx_nargs are not declared in this function; presumably the macro discards its arguments - confirm against its definition */
+  __pyx_r = __pyx_pf_15View_dot_MemoryView_10memoryview_5shape___get__(((struct __pyx_memoryview_obj *)__pyx_v_self));
+
+  /* function exit code: nothing to release here, the implementation's result is passed straight through */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static PyObject *__pyx_pf_15View_dot_MemoryView_10memoryview_5shape___get__(struct __pyx_memoryview_obj *__pyx_v_self) { /* Builds a tuple of the first view.ndim entries of view.shape; returns a new reference, or NULL on error. */
+  Py_ssize_t __pyx_7genexpr__pyx_v_length;
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  Py_ssize_t *__pyx_t_2;
+  Py_ssize_t *__pyx_t_3;
+  Py_ssize_t *__pyx_t_4;
+  PyObject *__pyx_t_5 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("__get__", 1);
+
+  /* "View.MemoryView":569
+ *     @property
+ *     def shape(self):
+ *         return tuple([length for length in self.view.shape[:self.view.ndim]])             # <<<<<<<<<<<<<<
+ * 
+ *     @property
+ */
+  __Pyx_XDECREF(__pyx_r);
+  { /* enter inner scope */
+    __pyx_t_1 = PyList_New(0); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 569, __pyx_L1_error) /* list being built by the comprehension */
+    __Pyx_GOTREF(__pyx_t_1);
+    __pyx_t_3 = (__pyx_v_self->view.shape + __pyx_v_self->view.ndim); /* one-past-the-end pointer for the shape[:ndim] slice */
+    for (__pyx_t_4 = __pyx_v_self->view.shape; __pyx_t_4 < __pyx_t_3; __pyx_t_4++) {
+      __pyx_t_2 = __pyx_t_4;
+      __pyx_7genexpr__pyx_v_length = (__pyx_t_2[0]);
+      __pyx_t_5 = PyInt_FromSsize_t(__pyx_7genexpr__pyx_v_length); if (unlikely(!__pyx_t_5)) __PYX_ERR(1, 569, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_5);
+      if (unlikely(__Pyx_ListComp_Append(__pyx_t_1, (PyObject*)__pyx_t_5))) __PYX_ERR(1, 569, __pyx_L1_error)
+      __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; /* the temp's reference is released once the append has succeeded */
+    }
+  } /* exit inner scope */
+  __pyx_t_5 = PyList_AsTuple(((PyObject*)__pyx_t_1)); if (unlikely(!__pyx_t_5)) __PYX_ERR(1, 569, __pyx_L1_error) /* tuple(...) around the comprehension result */
+  __Pyx_GOTREF(__pyx_t_5);
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+  __pyx_r = __pyx_t_5; /* the reference moves from the temp to the return value */
+  __pyx_t_5 = 0;
+  goto __pyx_L0;
+
+  /* "View.MemoryView":567
+ *         return self.obj
+ * 
+ *     @property             # <<<<<<<<<<<<<<
+ *     def shape(self):
+ *         return tuple([length for length in self.view.shape[:self.view.ndim]])
+ */
+
+  /* function exit code: the error path drops the list and item temps, adds a traceback frame and returns NULL */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_XDECREF(__pyx_t_5);
+  __Pyx_AddTraceback("View.MemoryView.memoryview.shape.__get__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":571
+ *         return tuple([length for length in self.view.shape[:self.view.ndim]])
+ * 
+ *     @property             # <<<<<<<<<<<<<<
+ *     def strides(self):
+ *         if self.view.strides == NULL:
+ */
+
+/* Python wrapper
+ *
+ * Getter entry point for the memoryview `strides` property. It does no work
+ * of its own: it casts `__pyx_v_self` to `struct __pyx_memoryview_obj *`,
+ * calls the `__pyx_pf_..._7strides___get__` implementation and returns that
+ * result unchanged.
+ */
+static PyObject *__pyx_pw_15View_dot_MemoryView_10memoryview_7strides_1__get__(PyObject *__pyx_v_self); /*proto*/
+static PyObject *__pyx_pw_15View_dot_MemoryView_10memoryview_7strides_1__get__(PyObject *__pyx_v_self) {
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__get__ (wrapper)", 0);
+  __pyx_kwvalues = __Pyx_KwValues_VARARGS(__pyx_args, __pyx_nargs); /* NOTE(review): __pyx_args/__pyx_nargs are not declared in this function; presumably the macro discards its arguments -- confirm against its definition */
+  __pyx_r = __pyx_pf_15View_dot_MemoryView_10memoryview_7strides___get__(((struct __pyx_memoryview_obj *)__pyx_v_self));
+
+  /* function exit code */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static PyObject *__pyx_pf_15View_dot_MemoryView_10memoryview_7strides___get__(struct __pyx_memoryview_obj *__pyx_v_self) { /*
+ * Implementation of the memoryview `strides` getter.
+ *
+ * Raises ValueError when `view.strides` is NULL. Otherwise it appends the
+ * first `view.ndim` stride values to a fresh list and returns that list
+ * converted to a tuple. On the __pyx_L1_error path the live temporaries are
+ * released, a traceback entry is added and NULL is returned.
+ */
+  Py_ssize_t __pyx_8genexpr1__pyx_v_stride;
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  int __pyx_t_1;
+  PyObject *__pyx_t_2 = NULL;
+  Py_ssize_t *__pyx_t_3;
+  Py_ssize_t *__pyx_t_4;
+  Py_ssize_t *__pyx_t_5;
+  PyObject *__pyx_t_6 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("__get__", 1);
+
+  /* "View.MemoryView":573
+ *     @property
+ *     def strides(self):
+ *         if self.view.strides == NULL:             # <<<<<<<<<<<<<<
+ * 
+ *             raise ValueError, "Buffer view does not expose strides"
+ */
+  __pyx_t_1 = (__pyx_v_self->view.strides == NULL);
+  if (unlikely(__pyx_t_1)) {
+
+    /* "View.MemoryView":575
+ *         if self.view.strides == NULL:
+ * 
+ *             raise ValueError, "Buffer view does not expose strides"             # <<<<<<<<<<<<<<
+ * 
+ *         return tuple([stride for stride in self.view.strides[:self.view.ndim]])
+ */
+    __Pyx_Raise(__pyx_builtin_ValueError, __pyx_kp_s_Buffer_view_does_not_expose_stri, 0, 0);
+    __PYX_ERR(1, 575, __pyx_L1_error)
+
+    /* "View.MemoryView":573
+ *     @property
+ *     def strides(self):
+ *         if self.view.strides == NULL:             # <<<<<<<<<<<<<<
+ * 
+ *             raise ValueError, "Buffer view does not expose strides"
+ */
+  }
+
+  /* "View.MemoryView":577
+ *             raise ValueError, "Buffer view does not expose strides"
+ * 
+ *         return tuple([stride for stride in self.view.strides[:self.view.ndim]])             # <<<<<<<<<<<<<<
+ * 
+ *     @property
+ */
+  __Pyx_XDECREF(__pyx_r);
+  { /* enter inner scope */
+    __pyx_t_2 = PyList_New(0); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 577, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_2);
+    __pyx_t_4 = (__pyx_v_self->view.strides + __pyx_v_self->view.ndim); /* one-past-the-end pointer for the first ndim entries */
+    for (__pyx_t_5 = __pyx_v_self->view.strides; __pyx_t_5 < __pyx_t_4; __pyx_t_5++) {
+      __pyx_t_3 = __pyx_t_5;
+      __pyx_8genexpr1__pyx_v_stride = (__pyx_t_3[0]);
+      __pyx_t_6 = PyInt_FromSsize_t(__pyx_8genexpr1__pyx_v_stride); if (unlikely(!__pyx_t_6)) __PYX_ERR(1, 577, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_6);
+      if (unlikely(__Pyx_ListComp_Append(__pyx_t_2, (PyObject*)__pyx_t_6))) __PYX_ERR(1, 577, __pyx_L1_error)
+      __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0;
+    }
+  } /* exit inner scope */
+  __pyx_t_6 = PyList_AsTuple(((PyObject*)__pyx_t_2)); if (unlikely(!__pyx_t_6)) __PYX_ERR(1, 577, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_6);
+  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; /* the intermediate list is no longer needed once the tuple exists */
+  __pyx_r = __pyx_t_6;
+  __pyx_t_6 = 0; /* reference moved into __pyx_r, so the temp is cleared without a DECREF */
+  goto __pyx_L0;
+
+  /* "View.MemoryView":571
+ *         return tuple([length for length in self.view.shape[:self.view.ndim]])
+ * 
+ *     @property             # <<<<<<<<<<<<<<
+ *     def strides(self):
+ *         if self.view.strides == NULL:
+ */
+
+  /* function exit code
+   *
+   * __pyx_L1_error releases whichever temporaries are still set, records a
+   * traceback entry and makes the result NULL. __pyx_L0 is the common exit
+   * that the success path jumps to directly.
+   */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_2);
+  __Pyx_XDECREF(__pyx_t_6);
+  __Pyx_AddTraceback("View.MemoryView.memoryview.strides.__get__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":579
+ *         return tuple([stride for stride in self.view.strides[:self.view.ndim]])
+ * 
+ *     @property             # <<<<<<<<<<<<<<
+ *     def suboffsets(self):
+ *         if self.view.suboffsets == NULL:
+ */
+
+/* Python wrapper
+ *
+ * Getter entry point for the memoryview `suboffsets` property. It casts
+ * `__pyx_v_self` to `struct __pyx_memoryview_obj *`, calls the
+ * `__pyx_pf_..._10suboffsets___get__` implementation and returns that
+ * result unchanged.
+ */
+static PyObject *__pyx_pw_15View_dot_MemoryView_10memoryview_10suboffsets_1__get__(PyObject *__pyx_v_self); /*proto*/
+static PyObject *__pyx_pw_15View_dot_MemoryView_10memoryview_10suboffsets_1__get__(PyObject *__pyx_v_self) {
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__get__ (wrapper)", 0);
+  __pyx_kwvalues = __Pyx_KwValues_VARARGS(__pyx_args, __pyx_nargs); /* NOTE(review): __pyx_args/__pyx_nargs are not declared in this function; presumably the macro discards its arguments -- confirm against its definition */
+  __pyx_r = __pyx_pf_15View_dot_MemoryView_10memoryview_10suboffsets___get__(((struct __pyx_memoryview_obj *)__pyx_v_self));
+
+  /* function exit code */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static PyObject *__pyx_pf_15View_dot_MemoryView_10memoryview_10suboffsets___get__(struct __pyx_memoryview_obj *__pyx_v_self) { /*
+ * Implementation of the memoryview `suboffsets` getter.
+ *
+ * When `view.suboffsets` is NULL it returns the constant tuple
+ * `__pyx_tuple__4` repeated `view.ndim` times (the echoed Cython source
+ * shows this as `(-1,) * ndim`). Otherwise it appends the first `view.ndim`
+ * suboffset values to a fresh list and returns that list converted to a
+ * tuple. On the __pyx_L1_error path NULL is returned after a traceback
+ * entry is added.
+ */
+  Py_ssize_t __pyx_8genexpr2__pyx_v_suboffset;
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  int __pyx_t_1;
+  PyObject *__pyx_t_2 = NULL;
+  Py_ssize_t *__pyx_t_3;
+  Py_ssize_t *__pyx_t_4;
+  Py_ssize_t *__pyx_t_5;
+  PyObject *__pyx_t_6 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("__get__", 1);
+
+  /* "View.MemoryView":581
+ *     @property
+ *     def suboffsets(self):
+ *         if self.view.suboffsets == NULL:             # <<<<<<<<<<<<<<
+ *             return (-1,) * self.view.ndim
+ * 
+ */
+  __pyx_t_1 = (__pyx_v_self->view.suboffsets == NULL);
+  if (__pyx_t_1) {
+
+    /* "View.MemoryView":582
+ *     def suboffsets(self):
+ *         if self.view.suboffsets == NULL:
+ *             return (-1,) * self.view.ndim             # <<<<<<<<<<<<<<
+ * 
+ *         return tuple([suboffset for suboffset in self.view.suboffsets[:self.view.ndim]])
+ */
+    __Pyx_XDECREF(__pyx_r);
+    __pyx_t_2 = __Pyx_PySequence_Multiply(__pyx_tuple__4, __pyx_v_self->view.ndim); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 582, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_2);
+    __pyx_r = __pyx_t_2;
+    __pyx_t_2 = 0; /* reference moved into __pyx_r, so the temp is cleared without a DECREF */
+    goto __pyx_L0;
+
+    /* "View.MemoryView":581
+ *     @property
+ *     def suboffsets(self):
+ *         if self.view.suboffsets == NULL:             # <<<<<<<<<<<<<<
+ *             return (-1,) * self.view.ndim
+ * 
+ */
+  }
+
+  /* "View.MemoryView":584
+ *             return (-1,) * self.view.ndim
+ * 
+ *         return tuple([suboffset for suboffset in self.view.suboffsets[:self.view.ndim]])             # <<<<<<<<<<<<<<
+ * 
+ *     @property
+ */
+  __Pyx_XDECREF(__pyx_r);
+  { /* enter inner scope */
+    __pyx_t_2 = PyList_New(0); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 584, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_2);
+    __pyx_t_4 = (__pyx_v_self->view.suboffsets + __pyx_v_self->view.ndim); /* one-past-the-end pointer for the first ndim entries */
+    for (__pyx_t_5 = __pyx_v_self->view.suboffsets; __pyx_t_5 < __pyx_t_4; __pyx_t_5++) {
+      __pyx_t_3 = __pyx_t_5;
+      __pyx_8genexpr2__pyx_v_suboffset = (__pyx_t_3[0]);
+      __pyx_t_6 = PyInt_FromSsize_t(__pyx_8genexpr2__pyx_v_suboffset); if (unlikely(!__pyx_t_6)) __PYX_ERR(1, 584, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_6);
+      if (unlikely(__Pyx_ListComp_Append(__pyx_t_2, (PyObject*)__pyx_t_6))) __PYX_ERR(1, 584, __pyx_L1_error)
+      __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0;
+    }
+  } /* exit inner scope */
+  __pyx_t_6 = PyList_AsTuple(((PyObject*)__pyx_t_2)); if (unlikely(!__pyx_t_6)) __PYX_ERR(1, 584, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_6);
+  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; /* the intermediate list is no longer needed once the tuple exists */
+  __pyx_r = __pyx_t_6;
+  __pyx_t_6 = 0;
+  goto __pyx_L0;
+
+  /* "View.MemoryView":579
+ *         return tuple([stride for stride in self.view.strides[:self.view.ndim]])
+ * 
+ *     @property             # <<<<<<<<<<<<<<
+ *     def suboffsets(self):
+ *         if self.view.suboffsets == NULL:
+ */
+
+  /* function exit code
+   *
+   * __pyx_L1_error releases whichever temporaries are still set, records a
+   * traceback entry and makes the result NULL. __pyx_L0 is the common exit
+   * that both successful return paths jump to directly.
+   */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_2);
+  __Pyx_XDECREF(__pyx_t_6);
+  __Pyx_AddTraceback("View.MemoryView.memoryview.suboffsets.__get__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":586
+ *         return tuple([suboffset for suboffset in self.view.suboffsets[:self.view.ndim]])
+ * 
+ *     @property             # <<<<<<<<<<<<<<
+ *     def ndim(self):
+ *         return self.view.ndim
+ */
+
+/* Python wrapper
+ *
+ * Getter entry point for the memoryview `ndim` property. It casts
+ * `__pyx_v_self` to `struct __pyx_memoryview_obj *`, calls the
+ * `__pyx_pf_..._4ndim___get__` implementation and returns that result
+ * unchanged.
+ */
+static PyObject *__pyx_pw_15View_dot_MemoryView_10memoryview_4ndim_1__get__(PyObject *__pyx_v_self); /*proto*/
+static PyObject *__pyx_pw_15View_dot_MemoryView_10memoryview_4ndim_1__get__(PyObject *__pyx_v_self) {
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__get__ (wrapper)", 0);
+  __pyx_kwvalues = __Pyx_KwValues_VARARGS(__pyx_args, __pyx_nargs); /* NOTE(review): __pyx_args/__pyx_nargs are not declared in this function; presumably the macro discards its arguments -- confirm against its definition */
+  __pyx_r = __pyx_pf_15View_dot_MemoryView_10memoryview_4ndim___get__(((struct __pyx_memoryview_obj *)__pyx_v_self));
+
+  /* function exit code */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static PyObject *__pyx_pf_15View_dot_MemoryView_10memoryview_4ndim___get__(struct __pyx_memoryview_obj *__pyx_v_self) { /*
+ * Implementation of the memoryview `ndim` getter: converts the C int
+ * `view.ndim` to a Python integer and returns it. If the conversion fails,
+ * the __pyx_L1_error path adds a traceback entry and NULL is returned.
+ */
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("__get__", 1);
+
+  /* "View.MemoryView":588
+ *     @property
+ *     def ndim(self):
+ *         return self.view.ndim             # <<<<<<<<<<<<<<
+ * 
+ *     @property
+ */
+  __Pyx_XDECREF(__pyx_r);
+  __pyx_t_1 = __Pyx_PyInt_From_int(__pyx_v_self->view.ndim); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 588, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_r = __pyx_t_1;
+  __pyx_t_1 = 0; /* reference moved into __pyx_r, so the temp is cleared without a DECREF */
+  goto __pyx_L0;
+
+  /* "View.MemoryView":586
+ *         return tuple([suboffset for suboffset in self.view.suboffsets[:self.view.ndim]])
+ * 
+ *     @property             # <<<<<<<<<<<<<<
+ *     def ndim(self):
+ *         return self.view.ndim
+ */
+
+  /* function exit code
+   *
+   * __pyx_L1_error releases the temporary if set, records a traceback entry
+   * and makes the result NULL. __pyx_L0 is the common exit.
+   */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_AddTraceback("View.MemoryView.memoryview.ndim.__get__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":590
+ *         return self.view.ndim
+ * 
+ *     @property             # <<<<<<<<<<<<<<
+ *     def itemsize(self):
+ *         return self.view.itemsize
+ */
+
+/* Python wrapper
+ *
+ * Getter entry point for the memoryview `itemsize` property. It casts
+ * `__pyx_v_self` to `struct __pyx_memoryview_obj *`, calls the
+ * `__pyx_pf_..._8itemsize___get__` implementation and returns that result
+ * unchanged.
+ */
+static PyObject *__pyx_pw_15View_dot_MemoryView_10memoryview_8itemsize_1__get__(PyObject *__pyx_v_self); /*proto*/
+static PyObject *__pyx_pw_15View_dot_MemoryView_10memoryview_8itemsize_1__get__(PyObject *__pyx_v_self) {
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__get__ (wrapper)", 0);
+  __pyx_kwvalues = __Pyx_KwValues_VARARGS(__pyx_args, __pyx_nargs); /* NOTE(review): __pyx_args/__pyx_nargs are not declared in this function; presumably the macro discards its arguments -- confirm against its definition */
+  __pyx_r = __pyx_pf_15View_dot_MemoryView_10memoryview_8itemsize___get__(((struct __pyx_memoryview_obj *)__pyx_v_self));
+
+  /* function exit code */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static PyObject *__pyx_pf_15View_dot_MemoryView_10memoryview_8itemsize___get__(struct __pyx_memoryview_obj *__pyx_v_self) { /*
+ * Implementation of the memoryview `itemsize` getter: converts
+ * `view.itemsize` to a Python integer with PyInt_FromSsize_t and returns it.
+ * If the conversion fails, the __pyx_L1_error path adds a traceback entry
+ * and NULL is returned.
+ */
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("__get__", 1);
+
+  /* "View.MemoryView":592
+ *     @property
+ *     def itemsize(self):
+ *         return self.view.itemsize             # <<<<<<<<<<<<<<
+ * 
+ *     @property
+ */
+  __Pyx_XDECREF(__pyx_r);
+  __pyx_t_1 = PyInt_FromSsize_t(__pyx_v_self->view.itemsize); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 592, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_r = __pyx_t_1;
+  __pyx_t_1 = 0; /* reference moved into __pyx_r, so the temp is cleared without a DECREF */
+  goto __pyx_L0;
+
+  /* "View.MemoryView":590
+ *         return self.view.ndim
+ * 
+ *     @property             # <<<<<<<<<<<<<<
+ *     def itemsize(self):
+ *         return self.view.itemsize
+ */
+
+  /* function exit code
+   *
+   * __pyx_L1_error releases the temporary if set, records a traceback entry
+   * and makes the result NULL. __pyx_L0 is the common exit.
+   */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_AddTraceback("View.MemoryView.memoryview.itemsize.__get__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":594
+ *         return self.view.itemsize
+ * 
+ *     @property             # <<<<<<<<<<<<<<
+ *     def nbytes(self):
+ *         return self.size * self.view.itemsize
+ */
+
+/* Python wrapper
+ *
+ * Getter entry point for the memoryview `nbytes` property. It casts
+ * `__pyx_v_self` to `struct __pyx_memoryview_obj *`, calls the
+ * `__pyx_pf_..._6nbytes___get__` implementation and returns that result
+ * unchanged.
+ */
+static PyObject *__pyx_pw_15View_dot_MemoryView_10memoryview_6nbytes_1__get__(PyObject *__pyx_v_self); /*proto*/
+static PyObject *__pyx_pw_15View_dot_MemoryView_10memoryview_6nbytes_1__get__(PyObject *__pyx_v_self) {
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__get__ (wrapper)", 0);
+  __pyx_kwvalues = __Pyx_KwValues_VARARGS(__pyx_args, __pyx_nargs); /* NOTE(review): __pyx_args/__pyx_nargs are not declared in this function; presumably the macro discards its arguments -- confirm against its definition */
+  __pyx_r = __pyx_pf_15View_dot_MemoryView_10memoryview_6nbytes___get__(((struct __pyx_memoryview_obj *)__pyx_v_self));
+
+  /* function exit code */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static PyObject *__pyx_pf_15View_dot_MemoryView_10memoryview_6nbytes___get__(struct __pyx_memoryview_obj *__pyx_v_self) { /*
+ * Implementation of the memoryview `nbytes` getter.
+ *
+ * Fetches the `size` attribute of self through a generic attribute lookup,
+ * converts `view.itemsize` to a Python integer, and returns the product
+ * computed by PyNumber_Multiply. Any failing step goes to __pyx_L1_error,
+ * which releases the temporaries, adds a traceback entry and returns NULL.
+ */
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  PyObject *__pyx_t_2 = NULL;
+  PyObject *__pyx_t_3 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("__get__", 1);
+
+  /* "View.MemoryView":596
+ *     @property
+ *     def nbytes(self):
+ *         return self.size * self.view.itemsize             # <<<<<<<<<<<<<<
+ * 
+ *     @property
+ */
+  __Pyx_XDECREF(__pyx_r);
+  __pyx_t_1 = __Pyx_PyObject_GetAttrStr(((PyObject *)__pyx_v_self), __pyx_n_s_size); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 596, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_t_2 = PyInt_FromSsize_t(__pyx_v_self->view.itemsize); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 596, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_2);
+  __pyx_t_3 = PyNumber_Multiply(__pyx_t_1, __pyx_t_2); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 596, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_3);
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; /* both operands are released once the product exists */
+  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+  __pyx_r = __pyx_t_3;
+  __pyx_t_3 = 0; /* reference moved into __pyx_r, so the temp is cleared without a DECREF */
+  goto __pyx_L0;
+
+  /* "View.MemoryView":594
+ *         return self.view.itemsize
+ * 
+ *     @property             # <<<<<<<<<<<<<<
+ *     def nbytes(self):
+ *         return self.size * self.view.itemsize
+ */
+
+  /* function exit code
+   *
+   * __pyx_L1_error releases whichever temporaries are still set, records a
+   * traceback entry and makes the result NULL. __pyx_L0 is the common exit.
+   */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_XDECREF(__pyx_t_2);
+  __Pyx_XDECREF(__pyx_t_3);
+  __Pyx_AddTraceback("View.MemoryView.memoryview.nbytes.__get__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":598
+ *         return self.size * self.view.itemsize
+ * 
+ *     @property             # <<<<<<<<<<<<<<
+ *     def size(self):
+ *         if self._size is None:
+ */
+
+/* Python wrapper
+ *
+ * Getter entry point for the memoryview `size` property. It casts
+ * `__pyx_v_self` to `struct __pyx_memoryview_obj *`, calls the
+ * `__pyx_pf_..._4size___get__` implementation and returns that result
+ * unchanged.
+ */
+static PyObject *__pyx_pw_15View_dot_MemoryView_10memoryview_4size_1__get__(PyObject *__pyx_v_self); /*proto*/
+static PyObject *__pyx_pw_15View_dot_MemoryView_10memoryview_4size_1__get__(PyObject *__pyx_v_self) {
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__get__ (wrapper)", 0);
+  __pyx_kwvalues = __Pyx_KwValues_VARARGS(__pyx_args, __pyx_nargs); /* NOTE(review): __pyx_args/__pyx_nargs are not declared in this function; presumably the macro discards its arguments -- confirm against its definition */
+  __pyx_r = __pyx_pf_15View_dot_MemoryView_10memoryview_4size___get__(((struct __pyx_memoryview_obj *)__pyx_v_self));
+
+  /* function exit code */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static PyObject *__pyx_pf_15View_dot_MemoryView_10memoryview_4size___get__(struct __pyx_memoryview_obj *__pyx_v_self) { /*
+ * Implementation of the memoryview `size` getter.
+ *
+ * While `self->_size` is still Py_None, it multiplies together the first
+ * `view.ndim` entries of `view.shape` (starting from the Python integer 1)
+ * and stores the product in `self->_size`. It then returns a new reference
+ * to `self->_size`. The locals `result` and `length` are released at
+ * __pyx_L0 on both the success and the error path.
+ */
+  PyObject *__pyx_v_result = NULL;
+  PyObject *__pyx_v_length = NULL;
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  int __pyx_t_1;
+  Py_ssize_t *__pyx_t_2;
+  Py_ssize_t *__pyx_t_3;
+  Py_ssize_t *__pyx_t_4;
+  PyObject *__pyx_t_5 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("__get__", 1);
+
+  /* "View.MemoryView":600
+ *     @property
+ *     def size(self):
+ *         if self._size is None:             # <<<<<<<<<<<<<<
+ *             result = 1
+ * 
+ */
+  __pyx_t_1 = (__pyx_v_self->_size == Py_None);
+  if (__pyx_t_1) {
+
+    /* "View.MemoryView":601
+ *     def size(self):
+ *         if self._size is None:
+ *             result = 1             # <<<<<<<<<<<<<<
+ * 
+ *             for length in self.view.shape[:self.view.ndim]:
+ */
+    __Pyx_INCREF(__pyx_int_1);
+    __pyx_v_result = __pyx_int_1;
+
+    /* "View.MemoryView":603
+ *             result = 1
+ * 
+ *             for length in self.view.shape[:self.view.ndim]:             # <<<<<<<<<<<<<<
+ *                 result *= length
+ * 
+ */
+    __pyx_t_3 = (__pyx_v_self->view.shape + __pyx_v_self->view.ndim); /* one-past-the-end pointer for the first ndim entries */
+    for (__pyx_t_4 = __pyx_v_self->view.shape; __pyx_t_4 < __pyx_t_3; __pyx_t_4++) {
+      __pyx_t_2 = __pyx_t_4;
+      __pyx_t_5 = PyInt_FromSsize_t((__pyx_t_2[0])); if (unlikely(!__pyx_t_5)) __PYX_ERR(1, 603, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_5);
+      __Pyx_XDECREF_SET(__pyx_v_length, __pyx_t_5);
+      __pyx_t_5 = 0;
+
+      /* "View.MemoryView":604
+ * 
+ *             for length in self.view.shape[:self.view.ndim]:
+ *                 result *= length             # <<<<<<<<<<<<<<
+ * 
+ *             self._size = result
+ */
+      __pyx_t_5 = PyNumber_InPlaceMultiply(__pyx_v_result, __pyx_v_length); if (unlikely(!__pyx_t_5)) __PYX_ERR(1, 604, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_5);
+      __Pyx_DECREF_SET(__pyx_v_result, __pyx_t_5);
+      __pyx_t_5 = 0;
+    }
+
+    /* "View.MemoryView":606
+ *                 result *= length
+ * 
+ *             self._size = result             # <<<<<<<<<<<<<<
+ * 
+ *         return self._size
+ */
+    __Pyx_INCREF(__pyx_v_result); /* extra reference that will be owned by self->_size */
+    __Pyx_GIVEREF(__pyx_v_result);
+    __Pyx_GOTREF(__pyx_v_self->_size);
+    __Pyx_DECREF(__pyx_v_self->_size); /* release the previous value before overwriting the field */
+    __pyx_v_self->_size = __pyx_v_result;
+
+    /* "View.MemoryView":600
+ *     @property
+ *     def size(self):
+ *         if self._size is None:             # <<<<<<<<<<<<<<
+ *             result = 1
+ * 
+ */
+  }
+
+  /* "View.MemoryView":608
+ *             self._size = result
+ * 
+ *         return self._size             # <<<<<<<<<<<<<<
+ * 
+ *     def __len__(self):
+ */
+  __Pyx_XDECREF(__pyx_r);
+  __Pyx_INCREF(__pyx_v_self->_size);
+  __pyx_r = __pyx_v_self->_size;
+  goto __pyx_L0;
+
+  /* "View.MemoryView":598
+ *         return self.size * self.view.itemsize
+ * 
+ *     @property             # <<<<<<<<<<<<<<
+ *     def size(self):
+ *         if self._size is None:
+ */
+
+  /* function exit code
+   *
+   * __pyx_L1_error releases the temporary if set, records a traceback entry
+   * and makes the result NULL. __pyx_L0 is the common exit, where the
+   * `result` and `length` locals are released.
+   */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_5);
+  __Pyx_AddTraceback("View.MemoryView.memoryview.size.__get__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __pyx_L0:;
+  __Pyx_XDECREF(__pyx_v_result);
+  __Pyx_XDECREF(__pyx_v_length);
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":610
+ *         return self._size
+ * 
+ *     def __len__(self):             # <<<<<<<<<<<<<<
+ *         if self.view.ndim >= 1:
+ *             return self.view.shape[0]
+ */
+
+/* Python wrapper
+ *
+ * Entry point for memoryview `__len__`. It casts `__pyx_v_self` to
+ * `struct __pyx_memoryview_obj *`, calls the `..._10__len__` implementation
+ * and returns its Py_ssize_t result unchanged.
+ */
+static Py_ssize_t __pyx_memoryview___len__(PyObject *__pyx_v_self); /*proto*/
+static Py_ssize_t __pyx_memoryview___len__(PyObject *__pyx_v_self) {
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  Py_ssize_t __pyx_r;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__len__ (wrapper)", 0);
+  __pyx_kwvalues = __Pyx_KwValues_VARARGS(__pyx_args, __pyx_nargs); /* NOTE(review): __pyx_args/__pyx_nargs are not declared in this function; presumably the macro discards its arguments -- confirm against its definition */
+  __pyx_r = __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_10__len__(((struct __pyx_memoryview_obj *)__pyx_v_self));
+
+  /* function exit code */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static Py_ssize_t __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_10__len__(struct __pyx_memoryview_obj *__pyx_v_self) {
+  /* Implementation of memoryview.__len__ ("View.MemoryView":610-614).
+   *
+   * Returns the extent of the first dimension (view.shape[0]) when the view
+   * has at least one dimension, and 0 otherwise. No Python objects are
+   * created, so there is no error path.
+   */
+  const int __pyx_has_dims = (__pyx_v_self->view.ndim >= 1);
+
+  /* "View.MemoryView":612  ->  return self.view.shape[0]
+   * "View.MemoryView":614  ->  return 0
+   */
+  return __pyx_has_dims ? (__pyx_v_self->view.shape[0]) : (Py_ssize_t)0;
+}
+
+/* "View.MemoryView":616
+ *         return 0
+ * 
+ *     def __repr__(self):             # <<<<<<<<<<<<<<
+ *         return "<MemoryView of %r at 0x%x>" % (self.base.__class__.__name__,
+ *                                                id(self))
+ */
+
+/* Python wrapper
+ *
+ * Entry point for memoryview `__repr__`. It casts `__pyx_v_self` to
+ * `struct __pyx_memoryview_obj *`, calls the `..._12__repr__`
+ * implementation and returns that result unchanged.
+ */
+static PyObject *__pyx_memoryview___repr__(PyObject *__pyx_v_self); /*proto*/
+static PyObject *__pyx_memoryview___repr__(PyObject *__pyx_v_self) {
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__repr__ (wrapper)", 0);
+  __pyx_kwvalues = __Pyx_KwValues_VARARGS(__pyx_args, __pyx_nargs); /* NOTE(review): __pyx_args/__pyx_nargs are not declared in this function; presumably the macro discards its arguments -- confirm against its definition */
+  __pyx_r = __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_12__repr__(((struct __pyx_memoryview_obj *)__pyx_v_self));
+
+  /* function exit code */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static PyObject *__pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_12__repr__(struct __pyx_memoryview_obj *__pyx_v_self) { /*
+ * Implementation of memoryview `__repr__`.
+ *
+ * Looks up self.base.__class__.__name__ through three chained attribute
+ * lookups, calls the builtin `id` on self, packs both values into a
+ * 2-tuple and applies it to the format-string constant
+ * `__pyx_kp_s_MemoryView_of_r_at_0x_x` (shown in the echoed source as
+ * "<MemoryView of %r at 0x%x>"). Any failing step goes to __pyx_L1_error,
+ * which releases the temporaries, adds a traceback entry and returns NULL.
+ */
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  PyObject *__pyx_t_2 = NULL;
+  PyObject *__pyx_t_3 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("__repr__", 1);
+
+  /* "View.MemoryView":617
+ * 
+ *     def __repr__(self):
+ *         return "<MemoryView of %r at 0x%x>" % (self.base.__class__.__name__,             # <<<<<<<<<<<<<<
+ *                                                id(self))
+ * 
+ */
+  __Pyx_XDECREF(__pyx_r);
+  __pyx_t_1 = __Pyx_PyObject_GetAttrStr(((PyObject *)__pyx_v_self), __pyx_n_s_base); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 617, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_class); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 617, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_2);
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+  __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_t_2, __pyx_n_s_name_2); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 617, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+
+  /* "View.MemoryView":618
+ *     def __repr__(self):
+ *         return "<MemoryView of %r at 0x%x>" % (self.base.__class__.__name__,
+ *                                                id(self))             # <<<<<<<<<<<<<<
+ * 
+ *     def __str__(self):
+ */
+  __pyx_t_2 = __Pyx_PyObject_CallOneArg(__pyx_builtin_id, ((PyObject *)__pyx_v_self)); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 618, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_2);
+
+  /* "View.MemoryView":617
+ * 
+ *     def __repr__(self):
+ *         return "<MemoryView of %r at 0x%x>" % (self.base.__class__.__name__,             # <<<<<<<<<<<<<<
+ *                                                id(self))
+ * 
+ */
+  __pyx_t_3 = PyTuple_New(2); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 617, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_3);
+  __Pyx_GIVEREF(__pyx_t_1);
+  if (__Pyx_PyTuple_SET_ITEM(__pyx_t_3, 0, __pyx_t_1)) __PYX_ERR(1, 617, __pyx_L1_error);
+  __Pyx_GIVEREF(__pyx_t_2);
+  if (__Pyx_PyTuple_SET_ITEM(__pyx_t_3, 1, __pyx_t_2)) __PYX_ERR(1, 617, __pyx_L1_error);
+  __pyx_t_1 = 0; /* temps are cleared without a DECREF; presumably the tuple now owns both references -- confirm against __Pyx_PyTuple_SET_ITEM */
+  __pyx_t_2 = 0;
+  __pyx_t_2 = __Pyx_PyString_Format(__pyx_kp_s_MemoryView_of_r_at_0x_x, __pyx_t_3); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 617, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_2);
+  __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; /* the argument tuple is no longer needed once the string exists */
+  __pyx_r = __pyx_t_2;
+  __pyx_t_2 = 0;
+  goto __pyx_L0;
+
+  /* "View.MemoryView":616
+ *         return 0
+ * 
+ *     def __repr__(self):             # <<<<<<<<<<<<<<
+ *         return "<MemoryView of %r at 0x%x>" % (self.base.__class__.__name__,
+ *                                                id(self))
+ */
+
+  /* function exit code
+   *
+   * __pyx_L1_error releases whichever temporaries are still set, records a
+   * traceback entry and makes the result NULL. __pyx_L0 is the common exit.
+   */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_XDECREF(__pyx_t_2);
+  __Pyx_XDECREF(__pyx_t_3);
+  __Pyx_AddTraceback("View.MemoryView.memoryview.__repr__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":620
+ *                                                id(self))
+ * 
+ *     def __str__(self):             # <<<<<<<<<<<<<<
+ *         return "<MemoryView of %r object>" % (self.base.__class__.__name__,)
+ * 
+ */
+
+/* Python wrapper
+ *
+ * Entry point for memoryview `__str__`. It casts `__pyx_v_self` to
+ * `struct __pyx_memoryview_obj *`, calls the `..._14__str__` implementation
+ * and returns that result unchanged.
+ */
+static PyObject *__pyx_memoryview___str__(PyObject *__pyx_v_self); /*proto*/
+static PyObject *__pyx_memoryview___str__(PyObject *__pyx_v_self) {
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__str__ (wrapper)", 0);
+  __pyx_kwvalues = __Pyx_KwValues_VARARGS(__pyx_args, __pyx_nargs); /* NOTE(review): __pyx_args/__pyx_nargs are not declared in this function; presumably the macro discards its arguments -- confirm against its definition */
+  __pyx_r = __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_14__str__(((struct __pyx_memoryview_obj *)__pyx_v_self));
+
+  /* function exit code */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static PyObject *__pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_14__str__(struct __pyx_memoryview_obj *__pyx_v_self) { /*
+ * Implementation of memoryview `__str__`.
+ *
+ * Looks up self.base.__class__.__name__ through three chained attribute
+ * lookups, packs the name into a 1-tuple and applies it to the
+ * format-string constant `__pyx_kp_s_MemoryView_of_r_object` (shown in the
+ * echoed source as "<MemoryView of %r object>"). Any failing step goes to
+ * __pyx_L1_error, which releases the temporaries, adds a traceback entry
+ * and returns NULL.
+ */
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  PyObject *__pyx_t_2 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("__str__", 1);
+
+  /* "View.MemoryView":621
+ * 
+ *     def __str__(self):
+ *         return "<MemoryView of %r object>" % (self.base.__class__.__name__,)             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  __Pyx_XDECREF(__pyx_r);
+  __pyx_t_1 = __Pyx_PyObject_GetAttrStr(((PyObject *)__pyx_v_self), __pyx_n_s_base); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 621, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_class); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 621, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_2);
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+  __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_t_2, __pyx_n_s_name_2); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 621, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+  __pyx_t_2 = PyTuple_New(1); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 621, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_2);
+  __Pyx_GIVEREF(__pyx_t_1);
+  if (__Pyx_PyTuple_SET_ITEM(__pyx_t_2, 0, __pyx_t_1)) __PYX_ERR(1, 621, __pyx_L1_error);
+  __pyx_t_1 = 0; /* temp is cleared without a DECREF; presumably the tuple now owns the reference -- confirm against __Pyx_PyTuple_SET_ITEM */
+  __pyx_t_1 = __Pyx_PyString_Format(__pyx_kp_s_MemoryView_of_r_object, __pyx_t_2); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 621, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; /* the argument tuple is no longer needed once the string exists */
+  __pyx_r = __pyx_t_1;
+  __pyx_t_1 = 0;
+  goto __pyx_L0;
+
+  /* "View.MemoryView":620
+ *                                                id(self))
+ * 
+ *     def __str__(self):             # <<<<<<<<<<<<<<
+ *         return "<MemoryView of %r object>" % (self.base.__class__.__name__,)
+ * 
+ */
+
+  /* function exit code
+   *
+   * __pyx_L1_error releases whichever temporaries are still set, records a
+   * traceback entry and makes the result NULL. __pyx_L0 is the common exit.
+   */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_XDECREF(__pyx_t_2);
+  __Pyx_AddTraceback("View.MemoryView.memoryview.__str__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":624
+ * 
+ * 
+ *     def is_c_contig(self):             # <<<<<<<<<<<<<<
+ *         cdef __Pyx_memviewslice *mslice
+ *         cdef __Pyx_memviewslice tmp
+ */
+
+/* Python wrapper
+ *
+ * Entry point for the memoryview method `is_c_contig`. The signature is
+ * selected at compile time by CYTHON_METH_FASTCALL: an argument array plus
+ * count, or an argument tuple. The method takes no arguments: any
+ * positional argument raises through __Pyx_RaiseArgtupleInvalid, and a
+ * non-empty keyword set is rejected when __Pyx_CheckKeywordStrings returns
+ * false. Otherwise it casts self to `struct __pyx_memoryview_obj *` and
+ * returns the result of the `..._16is_c_contig` implementation unchanged.
+ */
+static PyObject *__pyx_memoryview_is_c_contig(PyObject *__pyx_v_self, 
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+); /*proto*/
+static PyObject *__pyx_memoryview_is_c_contig(PyObject *__pyx_v_self, 
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+) {
+  #if !CYTHON_METH_FASTCALL
+  CYTHON_UNUSED Py_ssize_t __pyx_nargs;
+  #endif
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("is_c_contig (wrapper)", 0);
+  #if !CYTHON_METH_FASTCALL
+  #if CYTHON_ASSUME_SAFE_MACROS
+  __pyx_nargs = PyTuple_GET_SIZE(__pyx_args);
+  #else
+  __pyx_nargs = PyTuple_Size(__pyx_args); if (unlikely(__pyx_nargs < 0)) return NULL;
+  #endif
+  #endif
+  __pyx_kwvalues = __Pyx_KwValues_FASTCALL(__pyx_args, __pyx_nargs);
+  if (unlikely(__pyx_nargs > 0)) {
+    __Pyx_RaiseArgtupleInvalid("is_c_contig", 1, 0, 0, __pyx_nargs); return NULL;}
+  if (unlikely(__pyx_kwds) && __Pyx_NumKwargs_FASTCALL(__pyx_kwds) && unlikely(!__Pyx_CheckKeywordStrings(__pyx_kwds, "is_c_contig", 0))) return NULL;
+  __pyx_r = __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_16is_c_contig(((struct __pyx_memoryview_obj *)__pyx_v_self));
+
+  /* function exit code
+   *
+   * NOTE(review): the early `return NULL` statements above leave without
+   * reaching __Pyx_RefNannyFinishContext(); whether that matters depends on
+   * the RefNanny macro definitions, which are not visible here.
+   */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static PyObject *__pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_16is_c_contig(struct __pyx_memoryview_obj *__pyx_v_self) { /*
+ * Implementation of the memoryview method `is_c_contig`.
+ *
+ * Obtains a slice pointer for self from
+ * __pyx_memoryview_get_slice_from_memoryview, passing the stack-allocated
+ * `tmp` as scratch storage, then returns a Python bool built from
+ * __pyx_memviewslice_is_contig(slice, 'C', view.ndim). A NULL slice pointer
+ * or a failed bool conversion goes to __pyx_L1_error, which adds a
+ * traceback entry and returns NULL.
+ */
+  __Pyx_memviewslice *__pyx_v_mslice;
+  __Pyx_memviewslice __pyx_v_tmp;
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  __Pyx_memviewslice *__pyx_t_1;
+  PyObject *__pyx_t_2 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("is_c_contig", 1);
+
+  /* "View.MemoryView":627
+ *         cdef __Pyx_memviewslice *mslice
+ *         cdef __Pyx_memviewslice tmp
+ *         mslice = get_slice_from_memview(self, &tmp)             # <<<<<<<<<<<<<<
+ *         return slice_is_contig(mslice[0], 'C', self.view.ndim)
+ * 
+ */
+  __pyx_t_1 = __pyx_memoryview_get_slice_from_memoryview(__pyx_v_self, (&__pyx_v_tmp)); if (unlikely(__pyx_t_1 == ((__Pyx_memviewslice *)NULL))) __PYX_ERR(1, 627, __pyx_L1_error)
+  __pyx_v_mslice = __pyx_t_1;
+
+  /* "View.MemoryView":628
+ *         cdef __Pyx_memviewslice tmp
+ *         mslice = get_slice_from_memview(self, &tmp)
+ *         return slice_is_contig(mslice[0], 'C', self.view.ndim)             # <<<<<<<<<<<<<<
+ * 
+ *     def is_f_contig(self):
+ */
+  __Pyx_XDECREF(__pyx_r);
+  __pyx_t_2 = __Pyx_PyBool_FromLong(__pyx_memviewslice_is_contig((__pyx_v_mslice[0]), 'C', __pyx_v_self->view.ndim)); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 628, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_2);
+  __pyx_r = __pyx_t_2;
+  __pyx_t_2 = 0; /* reference moved into __pyx_r, so the temp is cleared without a DECREF */
+  goto __pyx_L0;
+
+  /* "View.MemoryView":624
+ * 
+ * 
+ *     def is_c_contig(self):             # <<<<<<<<<<<<<<
+ *         cdef __Pyx_memviewslice *mslice
+ *         cdef __Pyx_memviewslice tmp
+ */
+
+  /* function exit code
+   *
+   * __pyx_L1_error releases the temporary if set, records a traceback entry
+   * and makes the result NULL. __pyx_L0 is the common exit.
+   */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_2);
+  __Pyx_AddTraceback("View.MemoryView.memoryview.is_c_contig", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":630
+ *         return slice_is_contig(mslice[0], 'C', self.view.ndim)
+ * 
+ *     def is_f_contig(self):             # <<<<<<<<<<<<<<
+ *         cdef __Pyx_memviewslice *mslice
+ *         cdef __Pyx_memviewslice tmp
+ */
+
+/* Python wrapper for memoryview.is_f_contig().
+ * Rejects positional arguments, runs any supplied keywords through
+ * __Pyx_CheckKeywordStrings, then forwards self to the typed implementation.
+ * Returns the implementation's result, or NULL when argument checking fails.
+ */
+static PyObject *__pyx_memoryview_is_f_contig(PyObject *__pyx_v_self, 
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+); /*proto*/
+static PyObject *__pyx_memoryview_is_f_contig(PyObject *__pyx_v_self, 
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+) {
+  #if !CYTHON_METH_FASTCALL
+  CYTHON_UNUSED Py_ssize_t __pyx_nargs; /* only a parameter in the FASTCALL signature; derived from the args tuple below */
+  #endif
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("is_f_contig (wrapper)", 0);
+  #if !CYTHON_METH_FASTCALL
+  #if CYTHON_ASSUME_SAFE_MACROS
+  __pyx_nargs = PyTuple_GET_SIZE(__pyx_args);
+  #else
+  __pyx_nargs = PyTuple_Size(__pyx_args); if (unlikely(__pyx_nargs < 0)) return NULL;
+  #endif
+  #endif
+  __pyx_kwvalues = __Pyx_KwValues_FASTCALL(__pyx_args, __pyx_nargs); /* assigned but not read again in this wrapper */
+  if (unlikely(__pyx_nargs > 0)) { /* the method takes nothing besides self */
+    __Pyx_RaiseArgtupleInvalid("is_f_contig", 1, 0, 0, __pyx_nargs); return NULL;}
+  if (unlikely(__pyx_kwds) && __Pyx_NumKwargs_FASTCALL(__pyx_kwds) && unlikely(!__Pyx_CheckKeywordStrings(__pyx_kwds, "is_f_contig", 0))) return NULL; /* NOTE(review): presumably rejects every keyword argument - confirm against __Pyx_CheckKeywordStrings */
+  __pyx_r = __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_18is_f_contig(((struct __pyx_memoryview_obj *)__pyx_v_self));
+
+  /* function exit code: hand back whatever the implementation produced */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static PyObject *__pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_18is_f_contig(struct __pyx_memoryview_obj *__pyx_v_self) { /* Implementation of memoryview.is_f_contig().
+ * Obtains a slice pointer for self and returns a Python bool built from
+ * __pyx_memviewslice_is_contig(slice, 'F', self->view.ndim).
+ * Returns NULL after recording a traceback entry when a step fails.
+ */
+  __Pyx_memviewslice *__pyx_v_mslice;
+  __Pyx_memviewslice __pyx_v_tmp; /* scratch slice whose address is passed to the slice helper */
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  __Pyx_memviewslice *__pyx_t_1;
+  PyObject *__pyx_t_2 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("is_f_contig", 1);
+
+  /* "View.MemoryView":633
+ *         cdef __Pyx_memviewslice *mslice
+ *         cdef __Pyx_memviewslice tmp
+ *         mslice = get_slice_from_memview(self, &tmp)             # <<<<<<<<<<<<<<
+ *         return slice_is_contig(mslice[0], 'F', self.view.ndim)
+ * 
+ */
+  __pyx_t_1 = __pyx_memoryview_get_slice_from_memoryview(__pyx_v_self, (&__pyx_v_tmp)); if (unlikely(__pyx_t_1 == ((__Pyx_memviewslice *)NULL))) __PYX_ERR(1, 633, __pyx_L1_error) /* a NULL slice pointer is treated as failure */
+  __pyx_v_mslice = __pyx_t_1;
+
+  /* "View.MemoryView":634
+ *         cdef __Pyx_memviewslice tmp
+ *         mslice = get_slice_from_memview(self, &tmp)
+ *         return slice_is_contig(mslice[0], 'F', self.view.ndim)             # <<<<<<<<<<<<<<
+ * 
+ *     def copy(self):
+ */
+  __Pyx_XDECREF(__pyx_r); /* __pyx_r has not been assigned since its NULL initialisation */
+  __pyx_t_2 = __Pyx_PyBool_FromLong(__pyx_memviewslice_is_contig((__pyx_v_mslice[0]), 'F', __pyx_v_self->view.ndim)); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 634, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_2);
+  __pyx_r = __pyx_t_2;
+  __pyx_t_2 = 0; /* the result now lives in __pyx_r only */
+  goto __pyx_L0;
+
+  /* "View.MemoryView":630
+ *         return slice_is_contig(mslice[0], 'C', self.view.ndim)
+ * 
+ *     def is_f_contig(self):             # <<<<<<<<<<<<<<
+ *         cdef __Pyx_memviewslice *mslice
+ *         cdef __Pyx_memviewslice tmp
+ */
+
+  /* function exit code: the error label drops the temporary, records a
+ * traceback entry and returns NULL; the success path jumps straight to
+ * __pyx_L0 with the result in __pyx_r.
+ */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_2);
+  __Pyx_AddTraceback("View.MemoryView.memoryview.is_f_contig", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":636
+ *         return slice_is_contig(mslice[0], 'F', self.view.ndim)
+ * 
+ *     def copy(self):             # <<<<<<<<<<<<<<
+ *         cdef __Pyx_memviewslice mslice
+ *         cdef int flags = self.flags & ~PyBUF_F_CONTIGUOUS
+ */
+
+/* Python wrapper for memoryview.copy().
+ * Rejects positional arguments, runs any supplied keywords through
+ * __Pyx_CheckKeywordStrings, then forwards self to the typed implementation.
+ * Returns the implementation's result, or NULL when argument checking fails.
+ */
+static PyObject *__pyx_memoryview_copy(PyObject *__pyx_v_self, 
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+); /*proto*/
+static PyObject *__pyx_memoryview_copy(PyObject *__pyx_v_self, 
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+) {
+  #if !CYTHON_METH_FASTCALL
+  CYTHON_UNUSED Py_ssize_t __pyx_nargs; /* only a parameter in the FASTCALL signature; derived from the args tuple below */
+  #endif
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("copy (wrapper)", 0);
+  #if !CYTHON_METH_FASTCALL
+  #if CYTHON_ASSUME_SAFE_MACROS
+  __pyx_nargs = PyTuple_GET_SIZE(__pyx_args);
+  #else
+  __pyx_nargs = PyTuple_Size(__pyx_args); if (unlikely(__pyx_nargs < 0)) return NULL;
+  #endif
+  #endif
+  __pyx_kwvalues = __Pyx_KwValues_FASTCALL(__pyx_args, __pyx_nargs); /* assigned but not read again in this wrapper */
+  if (unlikely(__pyx_nargs > 0)) { /* the method takes nothing besides self */
+    __Pyx_RaiseArgtupleInvalid("copy", 1, 0, 0, __pyx_nargs); return NULL;}
+  if (unlikely(__pyx_kwds) && __Pyx_NumKwargs_FASTCALL(__pyx_kwds) && unlikely(!__Pyx_CheckKeywordStrings(__pyx_kwds, "copy", 0))) return NULL; /* NOTE(review): presumably rejects every keyword argument - confirm against __Pyx_CheckKeywordStrings */
+  __pyx_r = __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_20copy(((struct __pyx_memoryview_obj *)__pyx_v_self));
+
+  /* function exit code: hand back whatever the implementation produced */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static PyObject *__pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_20copy(struct __pyx_memoryview_obj *__pyx_v_self) { /* Implementation of memoryview.copy().
+ * Takes self's flags with PyBUF_F_CONTIGUOUS cleared and PyBUF_C_CONTIGUOUS
+ * set, asks __pyx_memoryview_copy_new_contig for a new slice in mode "c",
+ * and returns the object that __pyx_memoryview_copy_object_from_slice
+ * builds from it. Returns NULL after recording a traceback entry on failure.
+ */
+  __Pyx_memviewslice __pyx_v_mslice;
+  int __pyx_v_flags;
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  __Pyx_memviewslice __pyx_t_1;
+  PyObject *__pyx_t_2 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("copy", 1);
+
+  /* "View.MemoryView":638
+ *     def copy(self):
+ *         cdef __Pyx_memviewslice mslice
+ *         cdef int flags = self.flags & ~PyBUF_F_CONTIGUOUS             # <<<<<<<<<<<<<<
+ * 
+ *         slice_copy(self, &mslice)
+ */
+  __pyx_v_flags = (__pyx_v_self->flags & (~PyBUF_F_CONTIGUOUS)); /* the C-contiguous bit is OR-ed in at the copy call below */
+
+  /* "View.MemoryView":640
+ *         cdef int flags = self.flags & ~PyBUF_F_CONTIGUOUS
+ * 
+ *         slice_copy(self, &mslice)             # <<<<<<<<<<<<<<
+ *         mslice = slice_copy_contig(&mslice, "c", self.view.ndim,
+ *                                    self.view.itemsize,
+ */
+  __pyx_memoryview_slice_copy(__pyx_v_self, (&__pyx_v_mslice)); /* NOTE(review): presumably fills mslice from self's buffer description - confirm against the helper */
+
+  /* "View.MemoryView":641
+ * 
+ *         slice_copy(self, &mslice)
+ *         mslice = slice_copy_contig(&mslice, "c", self.view.ndim,             # <<<<<<<<<<<<<<
+ *                                    self.view.itemsize,
+ *                                    flags|PyBUF_C_CONTIGUOUS,
+ *
+ * The helper returns the slice struct by value, so failure is detected with
+ * PyErr_Occurred() rather than through the return value.
+ */
+  __pyx_t_1 = __pyx_memoryview_copy_new_contig((&__pyx_v_mslice), ((char *)"c"), __pyx_v_self->view.ndim, __pyx_v_self->view.itemsize, (__pyx_v_flags | PyBUF_C_CONTIGUOUS), __pyx_v_self->dtype_is_object); if (unlikely(PyErr_Occurred())) __PYX_ERR(1, 641, __pyx_L1_error)
+  __pyx_v_mslice = __pyx_t_1; /* mslice is overwritten with the newly produced slice */
+
+  /* "View.MemoryView":646
+ *                                    self.dtype_is_object)
+ * 
+ *         return memoryview_copy_from_slice(self, &mslice)             # <<<<<<<<<<<<<<
+ * 
+ *     def copy_fortran(self):
+ */
+  __Pyx_XDECREF(__pyx_r); /* __pyx_r has not been assigned since its NULL initialisation */
+  __pyx_t_2 = __pyx_memoryview_copy_object_from_slice(__pyx_v_self, (&__pyx_v_mslice)); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 646, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_2);
+  __pyx_r = __pyx_t_2;
+  __pyx_t_2 = 0; /* the result now lives in __pyx_r only */
+  goto __pyx_L0;
+
+  /* "View.MemoryView":636
+ *         return slice_is_contig(mslice[0], 'F', self.view.ndim)
+ * 
+ *     def copy(self):             # <<<<<<<<<<<<<<
+ *         cdef __Pyx_memviewslice mslice
+ *         cdef int flags = self.flags & ~PyBUF_F_CONTIGUOUS
+ */
+
+  /* function exit code: the error label drops the temporary, records a
+ * traceback entry and returns NULL; the success path jumps straight to
+ * __pyx_L0 with the result in __pyx_r.
+ */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_2);
+  __Pyx_AddTraceback("View.MemoryView.memoryview.copy", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":648
+ *         return memoryview_copy_from_slice(self, &mslice)
+ * 
+ *     def copy_fortran(self):             # <<<<<<<<<<<<<<
+ *         cdef __Pyx_memviewslice src, dst
+ *         cdef int flags = self.flags & ~PyBUF_C_CONTIGUOUS
+ */
+
+/* Python wrapper for memoryview.copy_fortran().
+ * Rejects positional arguments, runs any supplied keywords through
+ * __Pyx_CheckKeywordStrings, then forwards self to the typed implementation.
+ * Returns the implementation's result, or NULL when argument checking fails.
+ */
+static PyObject *__pyx_memoryview_copy_fortran(PyObject *__pyx_v_self, 
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+); /*proto*/
+static PyObject *__pyx_memoryview_copy_fortran(PyObject *__pyx_v_self, 
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+) {
+  #if !CYTHON_METH_FASTCALL
+  CYTHON_UNUSED Py_ssize_t __pyx_nargs; /* only a parameter in the FASTCALL signature; derived from the args tuple below */
+  #endif
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("copy_fortran (wrapper)", 0);
+  #if !CYTHON_METH_FASTCALL
+  #if CYTHON_ASSUME_SAFE_MACROS
+  __pyx_nargs = PyTuple_GET_SIZE(__pyx_args);
+  #else
+  __pyx_nargs = PyTuple_Size(__pyx_args); if (unlikely(__pyx_nargs < 0)) return NULL;
+  #endif
+  #endif
+  __pyx_kwvalues = __Pyx_KwValues_FASTCALL(__pyx_args, __pyx_nargs); /* assigned but not read again in this wrapper */
+  if (unlikely(__pyx_nargs > 0)) { /* the method takes nothing besides self */
+    __Pyx_RaiseArgtupleInvalid("copy_fortran", 1, 0, 0, __pyx_nargs); return NULL;}
+  if (unlikely(__pyx_kwds) && __Pyx_NumKwargs_FASTCALL(__pyx_kwds) && unlikely(!__Pyx_CheckKeywordStrings(__pyx_kwds, "copy_fortran", 0))) return NULL; /* NOTE(review): presumably rejects every keyword argument - confirm against __Pyx_CheckKeywordStrings */
+  __pyx_r = __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_22copy_fortran(((struct __pyx_memoryview_obj *)__pyx_v_self));
+
+  /* function exit code: hand back whatever the implementation produced */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static PyObject *__pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_22copy_fortran(struct __pyx_memoryview_obj *__pyx_v_self) { /* Implementation of memoryview.copy_fortran().
+ * Takes self's flags with PyBUF_C_CONTIGUOUS cleared and PyBUF_F_CONTIGUOUS
+ * set, asks __pyx_memoryview_copy_new_contig for a new slice in mode
+ * "fortran", and returns the object that
+ * __pyx_memoryview_copy_object_from_slice builds from it.
+ * Returns NULL after recording a traceback entry on failure.
+ */
+  __Pyx_memviewslice __pyx_v_src;
+  __Pyx_memviewslice __pyx_v_dst;
+  int __pyx_v_flags;
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  __Pyx_memviewslice __pyx_t_1;
+  PyObject *__pyx_t_2 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("copy_fortran", 1);
+
+  /* "View.MemoryView":650
+ *     def copy_fortran(self):
+ *         cdef __Pyx_memviewslice src, dst
+ *         cdef int flags = self.flags & ~PyBUF_C_CONTIGUOUS             # <<<<<<<<<<<<<<
+ * 
+ *         slice_copy(self, &src)
+ */
+  __pyx_v_flags = (__pyx_v_self->flags & (~PyBUF_C_CONTIGUOUS)); /* the F-contiguous bit is OR-ed in at the copy call below */
+
+  /* "View.MemoryView":652
+ *         cdef int flags = self.flags & ~PyBUF_C_CONTIGUOUS
+ * 
+ *         slice_copy(self, &src)             # <<<<<<<<<<<<<<
+ *         dst = slice_copy_contig(&src, "fortran", self.view.ndim,
+ *                                 self.view.itemsize,
+ */
+  __pyx_memoryview_slice_copy(__pyx_v_self, (&__pyx_v_src)); /* NOTE(review): presumably fills src from self's buffer description - confirm against the helper */
+
+  /* "View.MemoryView":653
+ * 
+ *         slice_copy(self, &src)
+ *         dst = slice_copy_contig(&src, "fortran", self.view.ndim,             # <<<<<<<<<<<<<<
+ *                                 self.view.itemsize,
+ *                                 flags|PyBUF_F_CONTIGUOUS,
+ *
+ * The helper returns the slice struct by value, so failure is detected with
+ * PyErr_Occurred() rather than through the return value.
+ */
+  __pyx_t_1 = __pyx_memoryview_copy_new_contig((&__pyx_v_src), ((char *)"fortran"), __pyx_v_self->view.ndim, __pyx_v_self->view.itemsize, (__pyx_v_flags | PyBUF_F_CONTIGUOUS), __pyx_v_self->dtype_is_object); if (unlikely(PyErr_Occurred())) __PYX_ERR(1, 653, __pyx_L1_error)
+  __pyx_v_dst = __pyx_t_1;
+
+  /* "View.MemoryView":658
+ *                                 self.dtype_is_object)
+ * 
+ *         return memoryview_copy_from_slice(self, &dst)             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  __Pyx_XDECREF(__pyx_r); /* __pyx_r has not been assigned since its NULL initialisation */
+  __pyx_t_2 = __pyx_memoryview_copy_object_from_slice(__pyx_v_self, (&__pyx_v_dst)); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 658, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_2);
+  __pyx_r = __pyx_t_2;
+  __pyx_t_2 = 0; /* the result now lives in __pyx_r only */
+  goto __pyx_L0;
+
+  /* "View.MemoryView":648
+ *         return memoryview_copy_from_slice(self, &mslice)
+ * 
+ *     def copy_fortran(self):             # <<<<<<<<<<<<<<
+ *         cdef __Pyx_memviewslice src, dst
+ *         cdef int flags = self.flags & ~PyBUF_C_CONTIGUOUS
+ */
+
+  /* function exit code: the error label drops the temporary, records a
+ * traceback entry and returns NULL; the success path jumps straight to
+ * __pyx_L0 with the result in __pyx_r.
+ */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_2);
+  __Pyx_AddTraceback("View.MemoryView.memoryview.copy_fortran", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "(tree fragment)":1
+ * def __reduce_cython__(self):             # <<<<<<<<<<<<<<
+ *     raise TypeError, "no default __reduce__ due to non-trivial __cinit__"
+ * def __setstate_cython__(self, __pyx_state):
+ */
+
+/* Python wrapper for memoryview.__reduce_cython__().
+ * Rejects positional arguments, runs any supplied keywords through
+ * __Pyx_CheckKeywordStrings, then forwards self to the typed implementation.
+ * Returns the implementation's result, or NULL when argument checking fails.
+ */
+static PyObject *__pyx_pw___pyx_memoryview_1__reduce_cython__(PyObject *__pyx_v_self, 
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+); /*proto*/
+static PyObject *__pyx_pw___pyx_memoryview_1__reduce_cython__(PyObject *__pyx_v_self, 
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+) {
+  #if !CYTHON_METH_FASTCALL
+  CYTHON_UNUSED Py_ssize_t __pyx_nargs; /* only a parameter in the FASTCALL signature; derived from the args tuple below */
+  #endif
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__reduce_cython__ (wrapper)", 0);
+  #if !CYTHON_METH_FASTCALL
+  #if CYTHON_ASSUME_SAFE_MACROS
+  __pyx_nargs = PyTuple_GET_SIZE(__pyx_args);
+  #else
+  __pyx_nargs = PyTuple_Size(__pyx_args); if (unlikely(__pyx_nargs < 0)) return NULL;
+  #endif
+  #endif
+  __pyx_kwvalues = __Pyx_KwValues_FASTCALL(__pyx_args, __pyx_nargs); /* assigned but not read again in this wrapper */
+  if (unlikely(__pyx_nargs > 0)) { /* the method takes nothing besides self */
+    __Pyx_RaiseArgtupleInvalid("__reduce_cython__", 1, 0, 0, __pyx_nargs); return NULL;}
+  if (unlikely(__pyx_kwds) && __Pyx_NumKwargs_FASTCALL(__pyx_kwds) && unlikely(!__Pyx_CheckKeywordStrings(__pyx_kwds, "__reduce_cython__", 0))) return NULL; /* NOTE(review): presumably rejects every keyword argument - confirm against __Pyx_CheckKeywordStrings */
+  __pyx_r = __pyx_pf___pyx_memoryview___reduce_cython__(((struct __pyx_memoryview_obj *)__pyx_v_self));
+
+  /* function exit code: hand back whatever the implementation produced */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static PyObject *__pyx_pf___pyx_memoryview___reduce_cython__(CYTHON_UNUSED struct __pyx_memoryview_obj *__pyx_v_self) { /* Implementation of memoryview.__reduce_cython__().
+ * Unconditionally raises TypeError with the module's
+ * "no default __reduce__ ..." message constant; self is not used.
+ * Every path through this function returns NULL.
+ */
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("__reduce_cython__", 1);
+
+  /* "(tree fragment)":2
+ * def __reduce_cython__(self):
+ *     raise TypeError, "no default __reduce__ due to non-trivial __cinit__"             # <<<<<<<<<<<<<<
+ * def __setstate_cython__(self, __pyx_state):
+ *     raise TypeError, "no default __reduce__ due to non-trivial __cinit__"
+ */
+  __Pyx_Raise(__pyx_builtin_TypeError, __pyx_kp_s_no_default___reduce___due_to_non, 0, 0);
+  __PYX_ERR(1, 2, __pyx_L1_error) /* always taken: control continues at the error label below */
+
+  /* "(tree fragment)":1
+ * def __reduce_cython__(self):             # <<<<<<<<<<<<<<
+ *     raise TypeError, "no default __reduce__ due to non-trivial __cinit__"
+ * def __setstate_cython__(self, __pyx_state):
+ */
+
+  /* function exit code: there is no success label here; the error label is
+ * the only way out and it records a traceback entry before returning NULL.
+ */
+  __pyx_L1_error:;
+  __Pyx_AddTraceback("View.MemoryView.memoryview.__reduce_cython__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "(tree fragment)":3
+ * def __reduce_cython__(self):
+ *     raise TypeError, "no default __reduce__ due to non-trivial __cinit__"
+ * def __setstate_cython__(self, __pyx_state):             # <<<<<<<<<<<<<<
+ *     raise TypeError, "no default __reduce__ due to non-trivial __cinit__"
+ */
+
+/* Python wrapper for memoryview.__setstate_cython__(__pyx_state).
+ * Unpacks exactly one argument, supplied either positionally or as the
+ * keyword __pyx_state, into values[0] and forwards it with self to the typed
+ * implementation. On an argument error it releases values[], records a
+ * traceback entry and returns NULL.
+ */
+static PyObject *__pyx_pw___pyx_memoryview_3__setstate_cython__(PyObject *__pyx_v_self, 
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+); /*proto*/
+static PyObject *__pyx_pw___pyx_memoryview_3__setstate_cython__(PyObject *__pyx_v_self, 
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+) {
+  CYTHON_UNUSED PyObject *__pyx_v___pyx_state = 0;
+  #if !CYTHON_METH_FASTCALL
+  CYTHON_UNUSED Py_ssize_t __pyx_nargs; /* only a parameter in the FASTCALL signature; derived from the args tuple below */
+  #endif
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject* values[1] = {0}; /* one slot: the __pyx_state argument */
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__setstate_cython__ (wrapper)", 0);
+  #if !CYTHON_METH_FASTCALL
+  #if CYTHON_ASSUME_SAFE_MACROS
+  __pyx_nargs = PyTuple_GET_SIZE(__pyx_args);
+  #else
+  __pyx_nargs = PyTuple_Size(__pyx_args); if (unlikely(__pyx_nargs < 0)) return NULL;
+  #endif
+  #endif
+  __pyx_kwvalues = __Pyx_KwValues_FASTCALL(__pyx_args, __pyx_nargs);
+  {
+    PyObject **__pyx_pyargnames[] = {&__pyx_n_s_pyx_state,0}; /* accepted keyword names, 0-terminated */
+    if (__pyx_kwds) { /* keywords were supplied: combine positional and keyword lookup */
+      Py_ssize_t kw_args;
+      switch (__pyx_nargs) { /* zero or one positional argument is acceptable here */
+        case  1: values[0] = __Pyx_Arg_FASTCALL(__pyx_args, 0);
+        CYTHON_FALLTHROUGH;
+        case  0: break;
+        default: goto __pyx_L5_argtuple_error;
+      }
+      kw_args = __Pyx_NumKwargs_FASTCALL(__pyx_kwds);
+      switch (__pyx_nargs) { /* only when nothing came positionally is the keyword looked up */
+        case  0:
+        if (likely((values[0] = __Pyx_GetKwValue_FASTCALL(__pyx_kwds, __pyx_kwvalues, __pyx_n_s_pyx_state)) != 0)) {
+          (void)__Pyx_Arg_NewRef_FASTCALL(values[0]);
+          kw_args--; /* one keyword accounted for */
+        }
+        else if (unlikely(PyErr_Occurred())) __PYX_ERR(1, 3, __pyx_L3_error)
+        else goto __pyx_L5_argtuple_error; /* keyword absent and no error set: the argument is missing */
+      }
+      if (unlikely(kw_args > 0)) { /* keywords remain unconsumed: hand them to the generic keyword parser */
+        const Py_ssize_t kwd_pos_args = __pyx_nargs;
+        if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_kwvalues, __pyx_pyargnames, 0, values + 0, kwd_pos_args, "__setstate_cython__") < 0)) __PYX_ERR(1, 3, __pyx_L3_error)
+      }
+    } else if (unlikely(__pyx_nargs != 1)) { /* no keywords: exactly one positional argument is required */
+      goto __pyx_L5_argtuple_error;
+    } else {
+      values[0] = __Pyx_Arg_FASTCALL(__pyx_args, 0);
+    }
+    __pyx_v___pyx_state = values[0];
+  }
+  goto __pyx_L6_skip; /* success: jump over the argument-count error report */
+  __pyx_L5_argtuple_error:;
+  __Pyx_RaiseArgtupleInvalid("__setstate_cython__", 1, 1, 1, __pyx_nargs); __PYX_ERR(1, 3, __pyx_L3_error)
+  __pyx_L6_skip:;
+  goto __pyx_L4_argument_unpacking_done;
+  __pyx_L3_error:; /* unpacking failed: release whatever values[] holds, then return NULL */
+  {
+    Py_ssize_t __pyx_temp;
+    for (__pyx_temp=0; __pyx_temp < (Py_ssize_t)(sizeof(values)/sizeof(values[0])); ++__pyx_temp) {
+      __Pyx_Arg_XDECREF_FASTCALL(values[__pyx_temp]);
+    }
+  }
+  __Pyx_AddTraceback("View.MemoryView.memoryview.__setstate_cython__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __Pyx_RefNannyFinishContext();
+  return NULL;
+  __pyx_L4_argument_unpacking_done:;
+  __pyx_r = __pyx_pf___pyx_memoryview_2__setstate_cython__(((struct __pyx_memoryview_obj *)__pyx_v_self), __pyx_v___pyx_state);
+
+  /* function exit code: values[] is released on the normal path as well */
+  {
+    Py_ssize_t __pyx_temp;
+    for (__pyx_temp=0; __pyx_temp < (Py_ssize_t)(sizeof(values)/sizeof(values[0])); ++__pyx_temp) {
+      __Pyx_Arg_XDECREF_FASTCALL(values[__pyx_temp]);
+    }
+  }
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static PyObject *__pyx_pf___pyx_memoryview_2__setstate_cython__(CYTHON_UNUSED struct __pyx_memoryview_obj *__pyx_v_self, CYTHON_UNUSED PyObject *__pyx_v___pyx_state) { /* Implementation of memoryview.__setstate_cython__().
+ * Unconditionally raises TypeError with the module's
+ * "no default __reduce__ ..." message constant; neither self nor
+ * __pyx_state is used. Every path through this function returns NULL.
+ */
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("__setstate_cython__", 1);
+
+  /* "(tree fragment)":4
+ *     raise TypeError, "no default __reduce__ due to non-trivial __cinit__"
+ * def __setstate_cython__(self, __pyx_state):
+ *     raise TypeError, "no default __reduce__ due to non-trivial __cinit__"             # <<<<<<<<<<<<<<
+ */
+  __Pyx_Raise(__pyx_builtin_TypeError, __pyx_kp_s_no_default___reduce___due_to_non, 0, 0);
+  __PYX_ERR(1, 4, __pyx_L1_error) /* always taken: control continues at the error label below */
+
+  /* "(tree fragment)":3
+ * def __reduce_cython__(self):
+ *     raise TypeError, "no default __reduce__ due to non-trivial __cinit__"
+ * def __setstate_cython__(self, __pyx_state):             # <<<<<<<<<<<<<<
+ *     raise TypeError, "no default __reduce__ due to non-trivial __cinit__"
+ */
+
+  /* function exit code: there is no success label here; the error label is
+ * the only way out and it records a traceback entry before returning NULL.
+ */
+  __pyx_L1_error:;
+  __Pyx_AddTraceback("View.MemoryView.memoryview.__setstate_cython__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":662
+ * 
+ * @cname('__pyx_memoryview_new')
+ * cdef memoryview_cwrapper(object o, int flags, bint dtype_is_object, __Pyx_TypeInfo *typeinfo):             # <<<<<<<<<<<<<<
+ *     cdef memoryview result = memoryview(o, flags, dtype_is_object)
+ *     result.typeinfo = typeinfo
+ */
+
+static PyObject *__pyx_memoryview_new(PyObject *__pyx_v_o, int __pyx_v_flags, int __pyx_v_dtype_is_object, __Pyx_TypeInfo *__pyx_v_typeinfo) { /* C-level constructor helper for the Cython memoryview type.
+ * Calls the memoryview type with the argument tuple
+ * (o, flags, dtype_is_object), stores typeinfo on the resulting object and
+ * returns that object. Returns 0 (NULL) after recording a traceback entry
+ * when any step fails.
+ */
+  struct __pyx_memoryview_obj *__pyx_v_result = 0;
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  PyObject *__pyx_t_2 = NULL;
+  PyObject *__pyx_t_3 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("memoryview_cwrapper", 1);
+
+  /* "View.MemoryView":663
+ * @cname('__pyx_memoryview_new')
+ * cdef memoryview_cwrapper(object o, int flags, bint dtype_is_object, __Pyx_TypeInfo *typeinfo):
+ *     cdef memoryview result = memoryview(o, flags, dtype_is_object)             # <<<<<<<<<<<<<<
+ *     result.typeinfo = typeinfo
+ *     return result
+ *
+ * The C ints are boxed into Python objects first, then all three arguments
+ * are placed in a fresh 3-tuple that is passed to the type call.
+ */
+  __pyx_t_1 = __Pyx_PyInt_From_int(__pyx_v_flags); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 663, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_t_2 = __Pyx_PyBool_FromLong(__pyx_v_dtype_is_object); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 663, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_2);
+  __pyx_t_3 = PyTuple_New(3); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 663, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_3);
+  __Pyx_INCREF(__pyx_v_o); /* extra reference for the tuple slot; NOTE(review): assumes __Pyx_PyTuple_SET_ITEM steals like PyTuple_SET_ITEM - confirm */
+  __Pyx_GIVEREF(__pyx_v_o);
+  if (__Pyx_PyTuple_SET_ITEM(__pyx_t_3, 0, __pyx_v_o)) __PYX_ERR(1, 663, __pyx_L1_error);
+  __Pyx_GIVEREF(__pyx_t_1);
+  if (__Pyx_PyTuple_SET_ITEM(__pyx_t_3, 1, __pyx_t_1)) __PYX_ERR(1, 663, __pyx_L1_error);
+  __Pyx_GIVEREF(__pyx_t_2);
+  if (__Pyx_PyTuple_SET_ITEM(__pyx_t_3, 2, __pyx_t_2)) __PYX_ERR(1, 663, __pyx_L1_error);
+  __pyx_t_1 = 0; /* the temporaries were handed to the tuple; clear them so the error path does not release them */
+  __pyx_t_2 = 0;
+  __pyx_t_2 = __Pyx_PyObject_Call(((PyObject *)__pyx_memoryview_type), __pyx_t_3, NULL); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 663, __pyx_L1_error) /* __pyx_t_2 is reused for the call result */
+  __Pyx_GOTREF(__pyx_t_2);
+  __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; /* the argument tuple is no longer needed */
+  __pyx_v_result = ((struct __pyx_memoryview_obj *)__pyx_t_2);
+  __pyx_t_2 = 0;
+
+  /* "View.MemoryView":664
+ * cdef memoryview_cwrapper(object o, int flags, bint dtype_is_object, __Pyx_TypeInfo *typeinfo):
+ *     cdef memoryview result = memoryview(o, flags, dtype_is_object)
+ *     result.typeinfo = typeinfo             # <<<<<<<<<<<<<<
+ *     return result
+ * 
+ */
+  __pyx_v_result->typeinfo = __pyx_v_typeinfo; /* plain pointer store */
+
+  /* "View.MemoryView":665
+ *     cdef memoryview result = memoryview(o, flags, dtype_is_object)
+ *     result.typeinfo = typeinfo
+ *     return result             # <<<<<<<<<<<<<<
+ * 
+ * @cname('__pyx_memoryview_check')
+ */
+  __Pyx_XDECREF(__pyx_r); /* __pyx_r has not been assigned since its NULL initialisation */
+  __Pyx_INCREF((PyObject *)__pyx_v_result); /* reference for the return value; the local's own reference is dropped at __pyx_L0 */
+  __pyx_r = ((PyObject *)__pyx_v_result);
+  goto __pyx_L0;
+
+  /* "View.MemoryView":662
+ * 
+ * @cname('__pyx_memoryview_new')
+ * cdef memoryview_cwrapper(object o, int flags, bint dtype_is_object, __Pyx_TypeInfo *typeinfo):             # <<<<<<<<<<<<<<
+ *     cdef memoryview result = memoryview(o, flags, dtype_is_object)
+ *     result.typeinfo = typeinfo
+ */
+
+  /* function exit code: the error label releases any live temporaries and
+ * records a traceback entry; both paths then drop the local reference held
+ * in __pyx_v_result before returning __pyx_r.
+ */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_XDECREF(__pyx_t_2);
+  __Pyx_XDECREF(__pyx_t_3);
+  __Pyx_AddTraceback("View.MemoryView.memoryview_cwrapper", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = 0;
+  __pyx_L0:;
+  __Pyx_XDECREF((PyObject *)__pyx_v_result);
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":668
+ * 
+ * @cname('__pyx_memoryview_check')
+ * cdef inline bint memoryview_check(object o) noexcept:             # <<<<<<<<<<<<<<
+ *     return isinstance(o, memoryview)
+ * 
+ */
+
+static CYTHON_INLINE int __pyx_memoryview_check(PyObject *__pyx_v_o) {
+  /* C counterpart of `isinstance(o, memoryview)` from "View.MemoryView":668-669
+   * (declared `noexcept` in the Cython source): yields the result of
+   * __Pyx_TypeCheck for o against the Cython memoryview type.
+   */
+  return __Pyx_TypeCheck(__pyx_v_o, __pyx_memoryview_type);
+}
+
+/* "View.MemoryView":671
+ *     return isinstance(o, memoryview)
+ * 
+ * cdef tuple _unellipsify(object index, int ndim):             # <<<<<<<<<<<<<<
+ *     """
+ *     Replace all ellipses with full slices and fill incomplete indices with
+ */
+
+static PyObject *_unellipsify(PyObject *__pyx_v_index, int __pyx_v_ndim) {
+  Py_ssize_t __pyx_v_idx;
+  PyObject *__pyx_v_tup = NULL;
+  PyObject *__pyx_v_result = NULL;
+  int __pyx_v_have_slices;
+  int __pyx_v_seen_ellipsis;
+  PyObject *__pyx_v_item = NULL;
+  Py_ssize_t __pyx_v_nslices;
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  int __pyx_t_2;
+  PyObject *__pyx_t_3 = NULL;
+  Py_ssize_t __pyx_t_4;
+  Py_ssize_t __pyx_t_5;
+  Py_UCS4 __pyx_t_6;
+  PyObject *__pyx_t_7 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("_unellipsify", 1);
+
+  /* "View.MemoryView":677
+ *     """
+ *     cdef Py_ssize_t idx
+ *     tup = <tuple>index if isinstance(index, tuple) else (index,)             # <<<<<<<<<<<<<<
+ * 
+ *     result = [slice(None)] * ndim
+ */
+  __pyx_t_2 = PyTuple_Check(__pyx_v_index); 
+  if (__pyx_t_2) {
+    __Pyx_INCREF(((PyObject*)__pyx_v_index));
+    __pyx_t_1 = __pyx_v_index;
+  } else {
+    __pyx_t_3 = PyTuple_New(1); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 677, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_3);
+    __Pyx_INCREF(__pyx_v_index);
+    __Pyx_GIVEREF(__pyx_v_index);
+    if (__Pyx_PyTuple_SET_ITEM(__pyx_t_3, 0, __pyx_v_index)) __PYX_ERR(1, 677, __pyx_L1_error);
+    __pyx_t_1 = __pyx_t_3;
+    __pyx_t_3 = 0;
+  }
+  __pyx_v_tup = ((PyObject*)__pyx_t_1);
+  __pyx_t_1 = 0;
+
+  /* "View.MemoryView":679
+ *     tup = <tuple>index if isinstance(index, tuple) else (index,)
+ * 
+ *     result = [slice(None)] * ndim             # <<<<<<<<<<<<<<
+ *     have_slices = False
+ *     seen_ellipsis = False
+ */
+  __pyx_t_1 = PyList_New(1 * ((__pyx_v_ndim<0) ? 0:__pyx_v_ndim)); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 679, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  { Py_ssize_t __pyx_temp;
+    for (__pyx_temp=0; __pyx_temp < __pyx_v_ndim; __pyx_temp++) {
+      __Pyx_INCREF(__pyx_slice__5);
+      __Pyx_GIVEREF(__pyx_slice__5);
+      if (__Pyx_PyList_SET_ITEM(__pyx_t_1, __pyx_temp, __pyx_slice__5)) __PYX_ERR(1, 679, __pyx_L1_error);
+    }
+  }
+  __pyx_v_result = ((PyObject*)__pyx_t_1);
+  __pyx_t_1 = 0;
+
+  /* "View.MemoryView":680
+ * 
+ *     result = [slice(None)] * ndim
+ *     have_slices = False             # <<<<<<<<<<<<<<
+ *     seen_ellipsis = False
+ *     idx = 0
+ */
+  __pyx_v_have_slices = 0;
+
+  /* "View.MemoryView":681
+ *     result = [slice(None)] * ndim
+ *     have_slices = False
+ *     seen_ellipsis = False             # <<<<<<<<<<<<<<
+ *     idx = 0
+ *     for item in tup:
+ */
+  __pyx_v_seen_ellipsis = 0;
+
+  /* "View.MemoryView":682
+ *     have_slices = False
+ *     seen_ellipsis = False
+ *     idx = 0             # <<<<<<<<<<<<<<
+ *     for item in tup:
+ *         if item is Ellipsis:
+ */
+  __pyx_v_idx = 0;
+
+  /* "View.MemoryView":683
+ *     seen_ellipsis = False
+ *     idx = 0
+ *     for item in tup:             # <<<<<<<<<<<<<<
+ *         if item is Ellipsis:
+ *             if not seen_ellipsis:
+ */
+  if (unlikely(__pyx_v_tup == Py_None)) {
+    PyErr_SetString(PyExc_TypeError, "'NoneType' object is not iterable");
+    __PYX_ERR(1, 683, __pyx_L1_error)
+  }
+  __pyx_t_1 = __pyx_v_tup; __Pyx_INCREF(__pyx_t_1);
+  __pyx_t_4 = 0;
+  for (;;) {
+    {
+      Py_ssize_t __pyx_temp = __Pyx_PyTuple_GET_SIZE(__pyx_t_1);
+      #if !CYTHON_ASSUME_SAFE_MACROS
+      if (unlikely((__pyx_temp < 0))) __PYX_ERR(1, 683, __pyx_L1_error)
+      #endif
+      if (__pyx_t_4 >= __pyx_temp) break;
+    }
+    #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS
+    __pyx_t_3 = PyTuple_GET_ITEM(__pyx_t_1, __pyx_t_4); __Pyx_INCREF(__pyx_t_3); __pyx_t_4++; if (unlikely((0 < 0))) __PYX_ERR(1, 683, __pyx_L1_error)
+    #else
+    __pyx_t_3 = __Pyx_PySequence_ITEM(__pyx_t_1, __pyx_t_4); __pyx_t_4++; if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 683, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_3);
+    #endif
+    __Pyx_XDECREF_SET(__pyx_v_item, __pyx_t_3);
+    __pyx_t_3 = 0;
+
+    /* "View.MemoryView":684
+ *     idx = 0
+ *     for item in tup:
+ *         if item is Ellipsis:             # <<<<<<<<<<<<<<
+ *             if not seen_ellipsis:
+ *                 idx += ndim - len(tup)
+ */
+    __pyx_t_2 = (__pyx_v_item == __pyx_builtin_Ellipsis);
+    if (__pyx_t_2) {
+
+      /* "View.MemoryView":685
+ *     for item in tup:
+ *         if item is Ellipsis:
+ *             if not seen_ellipsis:             # <<<<<<<<<<<<<<
+ *                 idx += ndim - len(tup)
+ *                 seen_ellipsis = True
+ */
+      __pyx_t_2 = (!__pyx_v_seen_ellipsis);
+      if (__pyx_t_2) {
+
+        /* "View.MemoryView":686
+ *         if item is Ellipsis:
+ *             if not seen_ellipsis:
+ *                 idx += ndim - len(tup)             # <<<<<<<<<<<<<<
+ *                 seen_ellipsis = True
+ *             have_slices = True
+ */
+        if (unlikely(__pyx_v_tup == Py_None)) {
+          PyErr_SetString(PyExc_TypeError, "object of type 'NoneType' has no len()");
+          __PYX_ERR(1, 686, __pyx_L1_error)
+        }
+        __pyx_t_5 = __Pyx_PyTuple_GET_SIZE(__pyx_v_tup); if (unlikely(__pyx_t_5 == ((Py_ssize_t)-1))) __PYX_ERR(1, 686, __pyx_L1_error)
+        __pyx_v_idx = (__pyx_v_idx + (__pyx_v_ndim - __pyx_t_5));
+
+        /* "View.MemoryView":687
+ *             if not seen_ellipsis:
+ *                 idx += ndim - len(tup)
+ *                 seen_ellipsis = True             # <<<<<<<<<<<<<<
+ *             have_slices = True
+ *         else:
+ */
+        __pyx_v_seen_ellipsis = 1;
+
+        /* "View.MemoryView":685
+ *     for item in tup:
+ *         if item is Ellipsis:
+ *             if not seen_ellipsis:             # <<<<<<<<<<<<<<
+ *                 idx += ndim - len(tup)
+ *                 seen_ellipsis = True
+ */
+      }
+
+      /* "View.MemoryView":688
+ *                 idx += ndim - len(tup)
+ *                 seen_ellipsis = True
+ *             have_slices = True             # <<<<<<<<<<<<<<
+ *         else:
+ *             if isinstance(item, slice):
+ */
+      __pyx_v_have_slices = 1;
+
+      /* "View.MemoryView":684
+ *     idx = 0
+ *     for item in tup:
+ *         if item is Ellipsis:             # <<<<<<<<<<<<<<
+ *             if not seen_ellipsis:
+ *                 idx += ndim - len(tup)
+ */
+      goto __pyx_L5;
+    }
+
+    /* "View.MemoryView":690
+ *             have_slices = True
+ *         else:
+ *             if isinstance(item, slice):             # <<<<<<<<<<<<<<
+ *                 have_slices = True
+ *             elif not PyIndex_Check(item):
+ */
+    /*else*/ {
+      __pyx_t_2 = PySlice_Check(__pyx_v_item); 
+      if (__pyx_t_2) {
+
+        /* "View.MemoryView":691
+ *         else:
+ *             if isinstance(item, slice):
+ *                 have_slices = True             # <<<<<<<<<<<<<<
+ *             elif not PyIndex_Check(item):
+ *                 raise TypeError, f"Cannot index with type '{type(item)}'"
+ */
+        __pyx_v_have_slices = 1;
+
+        /* "View.MemoryView":690
+ *             have_slices = True
+ *         else:
+ *             if isinstance(item, slice):             # <<<<<<<<<<<<<<
+ *                 have_slices = True
+ *             elif not PyIndex_Check(item):
+ */
+        goto __pyx_L7;
+      }
+
+      /* "View.MemoryView":692
+ *             if isinstance(item, slice):
+ *                 have_slices = True
+ *             elif not PyIndex_Check(item):             # <<<<<<<<<<<<<<
+ *                 raise TypeError, f"Cannot index with type '{type(item)}'"
+ *             result[idx] = item
+ */
+      __pyx_t_2 = (!(PyIndex_Check(__pyx_v_item) != 0));
+      if (unlikely(__pyx_t_2)) {
+
+        /* "View.MemoryView":693
+ *                 have_slices = True
+ *             elif not PyIndex_Check(item):
+ *                 raise TypeError, f"Cannot index with type '{type(item)}'"             # <<<<<<<<<<<<<<
+ *             result[idx] = item
+ *         idx += 1
+ */
+        __pyx_t_3 = PyTuple_New(3); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 693, __pyx_L1_error)
+        __Pyx_GOTREF(__pyx_t_3);
+        __pyx_t_5 = 0;
+        __pyx_t_6 = 127;
+        __Pyx_INCREF(__pyx_kp_u_Cannot_index_with_type);
+        __pyx_t_5 += 24;
+        __Pyx_GIVEREF(__pyx_kp_u_Cannot_index_with_type);
+        PyTuple_SET_ITEM(__pyx_t_3, 0, __pyx_kp_u_Cannot_index_with_type);
+        __pyx_t_7 = __Pyx_PyObject_FormatSimple(((PyObject *)Py_TYPE(__pyx_v_item)), __pyx_empty_unicode); if (unlikely(!__pyx_t_7)) __PYX_ERR(1, 693, __pyx_L1_error)
+        __Pyx_GOTREF(__pyx_t_7);
+        __pyx_t_6 = (__Pyx_PyUnicode_MAX_CHAR_VALUE(__pyx_t_7) > __pyx_t_6) ? __Pyx_PyUnicode_MAX_CHAR_VALUE(__pyx_t_7) : __pyx_t_6;
+        __pyx_t_5 += __Pyx_PyUnicode_GET_LENGTH(__pyx_t_7);
+        __Pyx_GIVEREF(__pyx_t_7);
+        PyTuple_SET_ITEM(__pyx_t_3, 1, __pyx_t_7);
+        __pyx_t_7 = 0;
+        __Pyx_INCREF(__pyx_kp_u__6);
+        __pyx_t_5 += 1;
+        __Pyx_GIVEREF(__pyx_kp_u__6);
+        PyTuple_SET_ITEM(__pyx_t_3, 2, __pyx_kp_u__6);
+        __pyx_t_7 = __Pyx_PyUnicode_Join(__pyx_t_3, 3, __pyx_t_5, __pyx_t_6); if (unlikely(!__pyx_t_7)) __PYX_ERR(1, 693, __pyx_L1_error)
+        __Pyx_GOTREF(__pyx_t_7);
+        __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+        __Pyx_Raise(__pyx_builtin_TypeError, __pyx_t_7, 0, 0);
+        __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0;
+        __PYX_ERR(1, 693, __pyx_L1_error)
+
+        /* "View.MemoryView":692
+ *             if isinstance(item, slice):
+ *                 have_slices = True
+ *             elif not PyIndex_Check(item):             # <<<<<<<<<<<<<<
+ *                 raise TypeError, f"Cannot index with type '{type(item)}'"
+ *             result[idx] = item
+ */
+      }
+      __pyx_L7:;
+
+      /* "View.MemoryView":694
+ *             elif not PyIndex_Check(item):
+ *                 raise TypeError, f"Cannot index with type '{type(item)}'"
+ *             result[idx] = item             # <<<<<<<<<<<<<<
+ *         idx += 1
+ * 
+ */
+      if (unlikely((__Pyx_SetItemInt(__pyx_v_result, __pyx_v_idx, __pyx_v_item, Py_ssize_t, 1, PyInt_FromSsize_t, 1, 1, 1) < 0))) __PYX_ERR(1, 694, __pyx_L1_error)
+    }
+    __pyx_L5:;
+
+    /* "View.MemoryView":695
+ *                 raise TypeError, f"Cannot index with type '{type(item)}'"
+ *             result[idx] = item
+ *         idx += 1             # <<<<<<<<<<<<<<
+ * 
+ *     nslices = ndim - idx
+ */
+    __pyx_v_idx = (__pyx_v_idx + 1);
+
+    /* "View.MemoryView":683
+ *     seen_ellipsis = False
+ *     idx = 0
+ *     for item in tup:             # <<<<<<<<<<<<<<
+ *         if item is Ellipsis:
+ *             if not seen_ellipsis:
+ */
+  }
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+
+  /* "View.MemoryView":697
+ *         idx += 1
+ * 
+ *     nslices = ndim - idx             # <<<<<<<<<<<<<<
+ *     return have_slices or nslices, tuple(result)
+ * 
+ */
+  __pyx_v_nslices = (__pyx_v_ndim - __pyx_v_idx);
+
+  /* "View.MemoryView":698
+ * 
+ *     nslices = ndim - idx
+ *     return have_slices or nslices, tuple(result)             # <<<<<<<<<<<<<<
+ * 
+ * cdef int assert_direct_dimensions(Py_ssize_t *suboffsets, int ndim) except -1:
+ */
+  __Pyx_XDECREF(__pyx_r);
+  if (!__pyx_v_have_slices) {
+  } else {
+    __pyx_t_7 = __Pyx_PyBool_FromLong(__pyx_v_have_slices); if (unlikely(!__pyx_t_7)) __PYX_ERR(1, 698, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_7);
+    __pyx_t_1 = __pyx_t_7;
+    __pyx_t_7 = 0;
+    goto __pyx_L9_bool_binop_done;
+  }
+  __pyx_t_7 = PyInt_FromSsize_t(__pyx_v_nslices); if (unlikely(!__pyx_t_7)) __PYX_ERR(1, 698, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_7);
+  __pyx_t_1 = __pyx_t_7;
+  __pyx_t_7 = 0;
+  __pyx_L9_bool_binop_done:;
+  __pyx_t_7 = PyList_AsTuple(__pyx_v_result); if (unlikely(!__pyx_t_7)) __PYX_ERR(1, 698, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_7);
+  __pyx_t_3 = PyTuple_New(2); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 698, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_3);
+  __Pyx_GIVEREF(__pyx_t_1);
+  if (__Pyx_PyTuple_SET_ITEM(__pyx_t_3, 0, __pyx_t_1)) __PYX_ERR(1, 698, __pyx_L1_error);
+  __Pyx_GIVEREF(__pyx_t_7);
+  if (__Pyx_PyTuple_SET_ITEM(__pyx_t_3, 1, __pyx_t_7)) __PYX_ERR(1, 698, __pyx_L1_error);
+  __pyx_t_1 = 0;
+  __pyx_t_7 = 0;
+  __pyx_r = ((PyObject*)__pyx_t_3);
+  __pyx_t_3 = 0;
+  goto __pyx_L0;
+
+  /* "View.MemoryView":671
+ *     return isinstance(o, memoryview)
+ * 
+ * cdef tuple _unellipsify(object index, int ndim):             # <<<<<<<<<<<<<<
+ *     """
+ *     Replace all ellipses with full slices and fill incomplete indices with
+ */
+
+  /* function exit code */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_XDECREF(__pyx_t_3);
+  __Pyx_XDECREF(__pyx_t_7);
+  __Pyx_AddTraceback("View.MemoryView._unellipsify", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = 0;
+  __pyx_L0:;
+  __Pyx_XDECREF(__pyx_v_tup);
+  __Pyx_XDECREF(__pyx_v_result);
+  __Pyx_XDECREF(__pyx_v_item);
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":700
+ *     return have_slices or nslices, tuple(result)
+ * 
+ * cdef int assert_direct_dimensions(Py_ssize_t *suboffsets, int ndim) except -1:             # <<<<<<<<<<<<<<
+ *     for suboffset in suboffsets[:ndim]:
+ *         if suboffset >= 0:
+ */
+/* assert_direct_dimensions
+ *
+ * Scans the first `ndim` entries of `suboffsets` and fails on the first
+ * entry that is >= 0; per the error text in the quoted source, such an
+ * entry is treated as an indirect dimension, which is not supported.
+ *
+ * Parameters:
+ *   __pyx_v_suboffsets  array read at indices 0 .. ndim-1. It is dereferenced
+ *                       without a NULL or bounds check, so the caller must
+ *                       supply at least `ndim` readable entries.
+ *   __pyx_v_ndim        number of entries to inspect; with 0 the loop body
+ *                       never runs and the function returns 0.
+ *
+ * Returns:
+ *   0 when every inspected entry is negative. -1 after raising ValueError
+ *   ("Indirect dimensions not supported" in the quoted source) and adding a
+ *   traceback entry named "View.MemoryView.assert_direct_dimensions".
+ *
+ * NOTE(review): this looks like Cython-generated output (see the quoted
+ * .pyx lines); hand edits are presumably lost on regeneration - confirm.
+ */
+static int assert_direct_dimensions(Py_ssize_t *__pyx_v_suboffsets, int __pyx_v_ndim) {
+  Py_ssize_t __pyx_v_suboffset;  /* value of the entry currently being tested */
+  int __pyx_r;  /* return value: 0 on success, -1 on error */
+  Py_ssize_t *__pyx_t_1;  /* pointer to the current entry */
+  Py_ssize_t *__pyx_t_2;  /* exclusive end pointer of the scanned range */
+  Py_ssize_t *__pyx_t_3;  /* loop cursor */
+  int __pyx_t_4;  /* result of the `suboffset >= 0` test */
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  /* The three position variables above are passed to __Pyx_AddTraceback at
+   * the error label; presumably __PYX_ERR fills them in (the macro is defined
+   * outside this view) - TODO confirm. */
+  /* "View.MemoryView":701
+ * 
+ * cdef int assert_direct_dimensions(Py_ssize_t *suboffsets, int ndim) except -1:
+ *     for suboffset in suboffsets[:ndim]:             # <<<<<<<<<<<<<<
+ *         if suboffset >= 0:
+ *             raise ValueError, "Indirect dimensions not supported"
+ */
+  __pyx_t_2 = (__pyx_v_suboffsets + __pyx_v_ndim);  /* one past the last entry to scan */
+  for (__pyx_t_3 = __pyx_v_suboffsets; __pyx_t_3 < __pyx_t_2; __pyx_t_3++) {
+    __pyx_t_1 = __pyx_t_3;
+    __pyx_v_suboffset = (__pyx_t_1[0]);  /* load the current entry */
+
+    /* "View.MemoryView":702
+ * cdef int assert_direct_dimensions(Py_ssize_t *suboffsets, int ndim) except -1:
+ *     for suboffset in suboffsets[:ndim]:
+ *         if suboffset >= 0:             # <<<<<<<<<<<<<<
+ *             raise ValueError, "Indirect dimensions not supported"
+ *     return 0  # return type just used as an error flag
+ */
+    __pyx_t_4 = (__pyx_v_suboffset >= 0);
+    if (unlikely(__pyx_t_4)) {  /* first non-negative entry aborts the scan */
+
+      /* "View.MemoryView":703
+ *     for suboffset in suboffsets[:ndim]:
+ *         if suboffset >= 0:
+ *             raise ValueError, "Indirect dimensions not supported"             # <<<<<<<<<<<<<<
+ *     return 0  # return type just used as an error flag
+ * 
+ */
+      __Pyx_Raise(__pyx_builtin_ValueError, __pyx_kp_s_Indirect_dimensions_not_supporte, 0, 0);
+      __PYX_ERR(1, 703, __pyx_L1_error)  /* presumably records the position and jumps to __pyx_L1_error - macro not visible here */
+
+      /* "View.MemoryView":702
+ * cdef int assert_direct_dimensions(Py_ssize_t *suboffsets, int ndim) except -1:
+ *     for suboffset in suboffsets[:ndim]:
+ *         if suboffset >= 0:             # <<<<<<<<<<<<<<
+ *             raise ValueError, "Indirect dimensions not supported"
+ *     return 0  # return type just used as an error flag
+ */
+    }
+  }
+
+  /* "View.MemoryView":704
+ *         if suboffset >= 0:
+ *             raise ValueError, "Indirect dimensions not supported"
+ *     return 0  # return type just used as an error flag             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  __pyx_r = 0;
+  goto __pyx_L0;  /* success path skips the error label below */
+
+  /* "View.MemoryView":700
+ *     return have_slices or nslices, tuple(result)
+ * 
+ * cdef int assert_direct_dimensions(Py_ssize_t *suboffsets, int ndim) except -1:             # <<<<<<<<<<<<<<
+ *     for suboffset in suboffsets[:ndim]:
+ *         if suboffset >= 0:
+ */
+
+  /* function exit code */
+  __pyx_L1_error:;
+  __Pyx_AddTraceback("View.MemoryView.assert_direct_dimensions", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = -1;  /* matches the "except -1" error value in the quoted signature */
+  __pyx_L0:;
+  return __pyx_r;
+}
+
+/* "View.MemoryView":711
+ * 
+ * @cname('__pyx_memview_slice')
+ * cdef memoryview memview_slice(memoryview memview, object indices):             # <<<<<<<<<<<<<<
+ *     cdef int new_ndim = 0, suboffset_dim = -1, dim
+ *     cdef bint negative_step
+ */
+
+static struct __pyx_memoryview_obj *__pyx_memview_slice(struct __pyx_memoryview_obj *__pyx_v_memview, PyObject *__pyx_v_indices) {
+  int __pyx_v_new_ndim;
+  int __pyx_v_suboffset_dim;
+  int __pyx_v_dim;
+  __Pyx_memviewslice __pyx_v_src;
+  __Pyx_memviewslice __pyx_v_dst;
+  __Pyx_memviewslice *__pyx_v_p_src;
+  struct __pyx_memoryviewslice_obj *__pyx_v_memviewsliceobj = 0;
+  __Pyx_memviewslice *__pyx_v_p_dst;
+  int *__pyx_v_p_suboffset_dim;
+  Py_ssize_t __pyx_v_start;
+  Py_ssize_t __pyx_v_stop;
+  Py_ssize_t __pyx_v_step;
+  Py_ssize_t __pyx_v_cindex;
+  int __pyx_v_have_start;
+  int __pyx_v_have_stop;
+  int __pyx_v_have_step;
+  PyObject *__pyx_v_index = NULL;
+  struct __pyx_memoryview_obj *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  int __pyx_t_1;
+  PyObject *__pyx_t_2 = NULL;
+  struct __pyx_memoryview_obj *__pyx_t_3;
+  char *__pyx_t_4;
+  int __pyx_t_5;
+  Py_ssize_t __pyx_t_6;
+  PyObject *(*__pyx_t_7)(PyObject *);
+  PyObject *__pyx_t_8 = NULL;
+  Py_ssize_t __pyx_t_9;
+  int __pyx_t_10;
+  Py_ssize_t __pyx_t_11;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("memview_slice", 1);
+
+  /* "View.MemoryView":712
+ * @cname('__pyx_memview_slice')
+ * cdef memoryview memview_slice(memoryview memview, object indices):
+ *     cdef int new_ndim = 0, suboffset_dim = -1, dim             # <<<<<<<<<<<<<<
+ *     cdef bint negative_step
+ *     cdef __Pyx_memviewslice src, dst
+ */
+  __pyx_v_new_ndim = 0;
+  __pyx_v_suboffset_dim = -1;
+
+  /* "View.MemoryView":719
+ * 
+ * 
+ *     memset(&dst, 0, sizeof(dst))             # <<<<<<<<<<<<<<
+ * 
+ *     cdef _memoryviewslice memviewsliceobj
+ */
+  (void)(memset((&__pyx_v_dst), 0, (sizeof(__pyx_v_dst))));
+
+  /* "View.MemoryView":723
+ *     cdef _memoryviewslice memviewsliceobj
+ * 
+ *     assert memview.view.ndim > 0             # <<<<<<<<<<<<<<
+ * 
+ *     if isinstance(memview, _memoryviewslice):
+ */
+  #ifndef CYTHON_WITHOUT_ASSERTIONS
+  if (unlikely(__pyx_assertions_enabled())) {
+    __pyx_t_1 = (__pyx_v_memview->view.ndim > 0);
+    if (unlikely(!__pyx_t_1)) {
+      __Pyx_Raise(__pyx_builtin_AssertionError, 0, 0, 0);
+      __PYX_ERR(1, 723, __pyx_L1_error)
+    }
+  }
+  #else
+  if ((1)); else __PYX_ERR(1, 723, __pyx_L1_error)
+  #endif
+
+  /* "View.MemoryView":725
+ *     assert memview.view.ndim > 0
+ * 
+ *     if isinstance(memview, _memoryviewslice):             # <<<<<<<<<<<<<<
+ *         memviewsliceobj = memview
+ *         p_src = &memviewsliceobj.from_slice
+ */
+  __pyx_t_1 = __Pyx_TypeCheck(((PyObject *)__pyx_v_memview), __pyx_memoryviewslice_type); 
+  if (__pyx_t_1) {
+
+    /* "View.MemoryView":726
+ * 
+ *     if isinstance(memview, _memoryviewslice):
+ *         memviewsliceobj = memview             # <<<<<<<<<<<<<<
+ *         p_src = &memviewsliceobj.from_slice
+ *     else:
+ */
+    if (!(likely(((((PyObject *)__pyx_v_memview)) == Py_None) || likely(__Pyx_TypeTest(((PyObject *)__pyx_v_memview), __pyx_memoryviewslice_type))))) __PYX_ERR(1, 726, __pyx_L1_error)
+    __pyx_t_2 = ((PyObject *)__pyx_v_memview);
+    __Pyx_INCREF(__pyx_t_2);
+    __pyx_v_memviewsliceobj = ((struct __pyx_memoryviewslice_obj *)__pyx_t_2);
+    __pyx_t_2 = 0;
+
+    /* "View.MemoryView":727
+ *     if isinstance(memview, _memoryviewslice):
+ *         memviewsliceobj = memview
+ *         p_src = &memviewsliceobj.from_slice             # <<<<<<<<<<<<<<
+ *     else:
+ *         slice_copy(memview, &src)
+ */
+    __pyx_v_p_src = (&__pyx_v_memviewsliceobj->from_slice);
+
+    /* "View.MemoryView":725
+ *     assert memview.view.ndim > 0
+ * 
+ *     if isinstance(memview, _memoryviewslice):             # <<<<<<<<<<<<<<
+ *         memviewsliceobj = memview
+ *         p_src = &memviewsliceobj.from_slice
+ */
+    goto __pyx_L3;
+  }
+
+  /* "View.MemoryView":729
+ *         p_src = &memviewsliceobj.from_slice
+ *     else:
+ *         slice_copy(memview, &src)             # <<<<<<<<<<<<<<
+ *         p_src = &src
+ * 
+ */
+  /*else*/ {
+    __pyx_memoryview_slice_copy(__pyx_v_memview, (&__pyx_v_src));
+
+    /* "View.MemoryView":730
+ *     else:
+ *         slice_copy(memview, &src)
+ *         p_src = &src             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+    __pyx_v_p_src = (&__pyx_v_src);
+  }
+  __pyx_L3:;
+
+  /* "View.MemoryView":736
+ * 
+ * 
+ *     dst.memview = p_src.memview             # <<<<<<<<<<<<<<
+ *     dst.data = p_src.data
+ * 
+ */
+  __pyx_t_3 = __pyx_v_p_src->memview;
+  __pyx_v_dst.memview = __pyx_t_3;
+
+  /* "View.MemoryView":737
+ * 
+ *     dst.memview = p_src.memview
+ *     dst.data = p_src.data             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  __pyx_t_4 = __pyx_v_p_src->data;
+  __pyx_v_dst.data = __pyx_t_4;
+
+  /* "View.MemoryView":742
+ * 
+ * 
+ *     cdef __Pyx_memviewslice *p_dst = &dst             # <<<<<<<<<<<<<<
+ *     cdef int *p_suboffset_dim = &suboffset_dim
+ *     cdef Py_ssize_t start, stop, step, cindex
+ */
+  __pyx_v_p_dst = (&__pyx_v_dst);
+
+  /* "View.MemoryView":743
+ * 
+ *     cdef __Pyx_memviewslice *p_dst = &dst
+ *     cdef int *p_suboffset_dim = &suboffset_dim             # <<<<<<<<<<<<<<
+ *     cdef Py_ssize_t start, stop, step, cindex
+ *     cdef bint have_start, have_stop, have_step
+ */
+  __pyx_v_p_suboffset_dim = (&__pyx_v_suboffset_dim);
+
+  /* "View.MemoryView":747
+ *     cdef bint have_start, have_stop, have_step
+ * 
+ *     for dim, index in enumerate(indices):             # <<<<<<<<<<<<<<
+ *         if PyIndex_Check(index):
+ *             cindex = index
+ */
+  __pyx_t_5 = 0;
+  if (likely(PyList_CheckExact(__pyx_v_indices)) || PyTuple_CheckExact(__pyx_v_indices)) {
+    __pyx_t_2 = __pyx_v_indices; __Pyx_INCREF(__pyx_t_2);
+    __pyx_t_6 = 0;
+    __pyx_t_7 = NULL;
+  } else {
+    __pyx_t_6 = -1; __pyx_t_2 = PyObject_GetIter(__pyx_v_indices); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 747, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_2);
+    __pyx_t_7 = __Pyx_PyObject_GetIterNextFunc(__pyx_t_2); if (unlikely(!__pyx_t_7)) __PYX_ERR(1, 747, __pyx_L1_error)
+  }
+  for (;;) {
+    if (likely(!__pyx_t_7)) {
+      if (likely(PyList_CheckExact(__pyx_t_2))) {
+        {
+          Py_ssize_t __pyx_temp = __Pyx_PyList_GET_SIZE(__pyx_t_2);
+          #if !CYTHON_ASSUME_SAFE_MACROS
+          if (unlikely((__pyx_temp < 0))) __PYX_ERR(1, 747, __pyx_L1_error)
+          #endif
+          if (__pyx_t_6 >= __pyx_temp) break;
+        }
+        #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS
+        __pyx_t_8 = PyList_GET_ITEM(__pyx_t_2, __pyx_t_6); __Pyx_INCREF(__pyx_t_8); __pyx_t_6++; if (unlikely((0 < 0))) __PYX_ERR(1, 747, __pyx_L1_error)
+        #else
+        __pyx_t_8 = __Pyx_PySequence_ITEM(__pyx_t_2, __pyx_t_6); __pyx_t_6++; if (unlikely(!__pyx_t_8)) __PYX_ERR(1, 747, __pyx_L1_error)
+        __Pyx_GOTREF(__pyx_t_8);
+        #endif
+      } else {
+        {
+          Py_ssize_t __pyx_temp = __Pyx_PyTuple_GET_SIZE(__pyx_t_2);
+          #if !CYTHON_ASSUME_SAFE_MACROS
+          if (unlikely((__pyx_temp < 0))) __PYX_ERR(1, 747, __pyx_L1_error)
+          #endif
+          if (__pyx_t_6 >= __pyx_temp) break;
+        }
+        #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS
+        __pyx_t_8 = PyTuple_GET_ITEM(__pyx_t_2, __pyx_t_6); __Pyx_INCREF(__pyx_t_8); __pyx_t_6++; if (unlikely((0 < 0))) __PYX_ERR(1, 747, __pyx_L1_error)
+        #else
+        __pyx_t_8 = __Pyx_PySequence_ITEM(__pyx_t_2, __pyx_t_6); __pyx_t_6++; if (unlikely(!__pyx_t_8)) __PYX_ERR(1, 747, __pyx_L1_error)
+        __Pyx_GOTREF(__pyx_t_8);
+        #endif
+      }
+    } else {
+      __pyx_t_8 = __pyx_t_7(__pyx_t_2);
+      if (unlikely(!__pyx_t_8)) {
+        PyObject* exc_type = PyErr_Occurred();
+        if (exc_type) {
+          if (likely(__Pyx_PyErr_GivenExceptionMatches(exc_type, PyExc_StopIteration))) PyErr_Clear();
+          else __PYX_ERR(1, 747, __pyx_L1_error)
+        }
+        break;
+      }
+      __Pyx_GOTREF(__pyx_t_8);
+    }
+    __Pyx_XDECREF_SET(__pyx_v_index, __pyx_t_8);
+    __pyx_t_8 = 0;
+    __pyx_v_dim = __pyx_t_5;
+    __pyx_t_5 = (__pyx_t_5 + 1);
+
+    /* "View.MemoryView":748
+ * 
+ *     for dim, index in enumerate(indices):
+ *         if PyIndex_Check(index):             # <<<<<<<<<<<<<<
+ *             cindex = index
+ *             slice_memviewslice(
+ */
+    __pyx_t_1 = (PyIndex_Check(__pyx_v_index) != 0);
+    if (__pyx_t_1) {
+
+      /* "View.MemoryView":749
+ *     for dim, index in enumerate(indices):
+ *         if PyIndex_Check(index):
+ *             cindex = index             # <<<<<<<<<<<<<<
+ *             slice_memviewslice(
+ *                 p_dst, p_src.shape[dim], p_src.strides[dim], p_src.suboffsets[dim],
+ */
+      __pyx_t_9 = __Pyx_PyIndex_AsSsize_t(__pyx_v_index); if (unlikely((__pyx_t_9 == (Py_ssize_t)-1) && PyErr_Occurred())) __PYX_ERR(1, 749, __pyx_L1_error)
+      __pyx_v_cindex = __pyx_t_9;
+
+      /* "View.MemoryView":750
+ *         if PyIndex_Check(index):
+ *             cindex = index
+ *             slice_memviewslice(             # <<<<<<<<<<<<<<
+ *                 p_dst, p_src.shape[dim], p_src.strides[dim], p_src.suboffsets[dim],
+ *                 dim, new_ndim, p_suboffset_dim,
+ */
+      __pyx_t_10 = __pyx_memoryview_slice_memviewslice(__pyx_v_p_dst, (__pyx_v_p_src->shape[__pyx_v_dim]), (__pyx_v_p_src->strides[__pyx_v_dim]), (__pyx_v_p_src->suboffsets[__pyx_v_dim]), __pyx_v_dim, __pyx_v_new_ndim, __pyx_v_p_suboffset_dim, __pyx_v_cindex, 0, 0, 0, 0, 0, 0); if (unlikely(__pyx_t_10 == ((int)-1))) __PYX_ERR(1, 750, __pyx_L1_error)
+
+      /* "View.MemoryView":748
+ * 
+ *     for dim, index in enumerate(indices):
+ *         if PyIndex_Check(index):             # <<<<<<<<<<<<<<
+ *             cindex = index
+ *             slice_memviewslice(
+ */
+      goto __pyx_L6;
+    }
+
+    /* "View.MemoryView":756
+ *                 0, 0, 0, # have_{start,stop,step}
+ *                 False)
+ *         elif index is None:             # <<<<<<<<<<<<<<
+ *             p_dst.shape[new_ndim] = 1
+ *             p_dst.strides[new_ndim] = 0
+ */
+    __pyx_t_1 = (__pyx_v_index == Py_None);
+    if (__pyx_t_1) {
+
+      /* "View.MemoryView":757
+ *                 False)
+ *         elif index is None:
+ *             p_dst.shape[new_ndim] = 1             # <<<<<<<<<<<<<<
+ *             p_dst.strides[new_ndim] = 0
+ *             p_dst.suboffsets[new_ndim] = -1
+ */
+      (__pyx_v_p_dst->shape[__pyx_v_new_ndim]) = 1;
+
+      /* "View.MemoryView":758
+ *         elif index is None:
+ *             p_dst.shape[new_ndim] = 1
+ *             p_dst.strides[new_ndim] = 0             # <<<<<<<<<<<<<<
+ *             p_dst.suboffsets[new_ndim] = -1
+ *             new_ndim += 1
+ */
+      (__pyx_v_p_dst->strides[__pyx_v_new_ndim]) = 0;
+
+      /* "View.MemoryView":759
+ *             p_dst.shape[new_ndim] = 1
+ *             p_dst.strides[new_ndim] = 0
+ *             p_dst.suboffsets[new_ndim] = -1             # <<<<<<<<<<<<<<
+ *             new_ndim += 1
+ *         else:
+ */
+      (__pyx_v_p_dst->suboffsets[__pyx_v_new_ndim]) = -1L;
+
+      /* "View.MemoryView":760
+ *             p_dst.strides[new_ndim] = 0
+ *             p_dst.suboffsets[new_ndim] = -1
+ *             new_ndim += 1             # <<<<<<<<<<<<<<
+ *         else:
+ *             start = index.start or 0
+ */
+      __pyx_v_new_ndim = (__pyx_v_new_ndim + 1);
+
+      /* "View.MemoryView":756
+ *                 0, 0, 0, # have_{start,stop,step}
+ *                 False)
+ *         elif index is None:             # <<<<<<<<<<<<<<
+ *             p_dst.shape[new_ndim] = 1
+ *             p_dst.strides[new_ndim] = 0
+ */
+      goto __pyx_L6;
+    }
+
+    /* "View.MemoryView":762
+ *             new_ndim += 1
+ *         else:
+ *             start = index.start or 0             # <<<<<<<<<<<<<<
+ *             stop = index.stop or 0
+ *             step = index.step or 0
+ */
+    /*else*/ {
+      __pyx_t_8 = __Pyx_PyObject_GetAttrStr(__pyx_v_index, __pyx_n_s_start); if (unlikely(!__pyx_t_8)) __PYX_ERR(1, 762, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_8);
+      __pyx_t_1 = __Pyx_PyObject_IsTrue(__pyx_t_8); if (unlikely((__pyx_t_1 < 0))) __PYX_ERR(1, 762, __pyx_L1_error)
+      if (!__pyx_t_1) {
+        __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0;
+      } else {
+        __pyx_t_11 = __Pyx_PyIndex_AsSsize_t(__pyx_t_8); if (unlikely((__pyx_t_11 == (Py_ssize_t)-1) && PyErr_Occurred())) __PYX_ERR(1, 762, __pyx_L1_error)
+        __pyx_t_9 = __pyx_t_11;
+        __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0;
+        goto __pyx_L7_bool_binop_done;
+      }
+      __pyx_t_9 = 0;
+      __pyx_L7_bool_binop_done:;
+      __pyx_v_start = __pyx_t_9;
+
+      /* "View.MemoryView":763
+ *         else:
+ *             start = index.start or 0
+ *             stop = index.stop or 0             # <<<<<<<<<<<<<<
+ *             step = index.step or 0
+ * 
+ */
+      __pyx_t_8 = __Pyx_PyObject_GetAttrStr(__pyx_v_index, __pyx_n_s_stop); if (unlikely(!__pyx_t_8)) __PYX_ERR(1, 763, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_8);
+      __pyx_t_1 = __Pyx_PyObject_IsTrue(__pyx_t_8); if (unlikely((__pyx_t_1 < 0))) __PYX_ERR(1, 763, __pyx_L1_error)
+      if (!__pyx_t_1) {
+        __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0;
+      } else {
+        __pyx_t_11 = __Pyx_PyIndex_AsSsize_t(__pyx_t_8); if (unlikely((__pyx_t_11 == (Py_ssize_t)-1) && PyErr_Occurred())) __PYX_ERR(1, 763, __pyx_L1_error)
+        __pyx_t_9 = __pyx_t_11;
+        __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0;
+        goto __pyx_L9_bool_binop_done;
+      }
+      __pyx_t_9 = 0;
+      __pyx_L9_bool_binop_done:;
+      __pyx_v_stop = __pyx_t_9;
+
+      /* "View.MemoryView":764
+ *             start = index.start or 0
+ *             stop = index.stop or 0
+ *             step = index.step or 0             # <<<<<<<<<<<<<<
+ * 
+ *             have_start = index.start is not None
+ */
+      __pyx_t_8 = __Pyx_PyObject_GetAttrStr(__pyx_v_index, __pyx_n_s_step); if (unlikely(!__pyx_t_8)) __PYX_ERR(1, 764, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_8);
+      __pyx_t_1 = __Pyx_PyObject_IsTrue(__pyx_t_8); if (unlikely((__pyx_t_1 < 0))) __PYX_ERR(1, 764, __pyx_L1_error)
+      if (!__pyx_t_1) {
+        __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0;
+      } else {
+        __pyx_t_11 = __Pyx_PyIndex_AsSsize_t(__pyx_t_8); if (unlikely((__pyx_t_11 == (Py_ssize_t)-1) && PyErr_Occurred())) __PYX_ERR(1, 764, __pyx_L1_error)
+        __pyx_t_9 = __pyx_t_11;
+        __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0;
+        goto __pyx_L11_bool_binop_done;
+      }
+      __pyx_t_9 = 0;
+      __pyx_L11_bool_binop_done:;
+      __pyx_v_step = __pyx_t_9;
+
+      /* "View.MemoryView":766
+ *             step = index.step or 0
+ * 
+ *             have_start = index.start is not None             # <<<<<<<<<<<<<<
+ *             have_stop = index.stop is not None
+ *             have_step = index.step is not None
+ */
+      __pyx_t_8 = __Pyx_PyObject_GetAttrStr(__pyx_v_index, __pyx_n_s_start); if (unlikely(!__pyx_t_8)) __PYX_ERR(1, 766, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_8);
+      __pyx_t_1 = (__pyx_t_8 != Py_None);
+      __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0;
+      __pyx_v_have_start = __pyx_t_1;
+
+      /* "View.MemoryView":767
+ * 
+ *             have_start = index.start is not None
+ *             have_stop = index.stop is not None             # <<<<<<<<<<<<<<
+ *             have_step = index.step is not None
+ * 
+ */
+      __pyx_t_8 = __Pyx_PyObject_GetAttrStr(__pyx_v_index, __pyx_n_s_stop); if (unlikely(!__pyx_t_8)) __PYX_ERR(1, 767, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_8);
+      __pyx_t_1 = (__pyx_t_8 != Py_None);
+      __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0;
+      __pyx_v_have_stop = __pyx_t_1;
+
+      /* "View.MemoryView":768
+ *             have_start = index.start is not None
+ *             have_stop = index.stop is not None
+ *             have_step = index.step is not None             # <<<<<<<<<<<<<<
+ * 
+ *             slice_memviewslice(
+ */
+      __pyx_t_8 = __Pyx_PyObject_GetAttrStr(__pyx_v_index, __pyx_n_s_step); if (unlikely(!__pyx_t_8)) __PYX_ERR(1, 768, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_8);
+      __pyx_t_1 = (__pyx_t_8 != Py_None);
+      __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0;
+      __pyx_v_have_step = __pyx_t_1;
+
+      /* "View.MemoryView":770
+ *             have_step = index.step is not None
+ * 
+ *             slice_memviewslice(             # <<<<<<<<<<<<<<
+ *                 p_dst, p_src.shape[dim], p_src.strides[dim], p_src.suboffsets[dim],
+ *                 dim, new_ndim, p_suboffset_dim,
+ */
+      __pyx_t_10 = __pyx_memoryview_slice_memviewslice(__pyx_v_p_dst, (__pyx_v_p_src->shape[__pyx_v_dim]), (__pyx_v_p_src->strides[__pyx_v_dim]), (__pyx_v_p_src->suboffsets[__pyx_v_dim]), __pyx_v_dim, __pyx_v_new_ndim, __pyx_v_p_suboffset_dim, __pyx_v_start, __pyx_v_stop, __pyx_v_step, __pyx_v_have_start, __pyx_v_have_stop, __pyx_v_have_step, 1); if (unlikely(__pyx_t_10 == ((int)-1))) __PYX_ERR(1, 770, __pyx_L1_error)
+
+      /* "View.MemoryView":776
+ *                 have_start, have_stop, have_step,
+ *                 True)
+ *             new_ndim += 1             # <<<<<<<<<<<<<<
+ * 
+ *     if isinstance(memview, _memoryviewslice):
+ */
+      __pyx_v_new_ndim = (__pyx_v_new_ndim + 1);
+    }
+    __pyx_L6:;
+
+    /* "View.MemoryView":747
+ *     cdef bint have_start, have_stop, have_step
+ * 
+ *     for dim, index in enumerate(indices):             # <<<<<<<<<<<<<<
+ *         if PyIndex_Check(index):
+ *             cindex = index
+ */
+  }
+  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+
+  /* "View.MemoryView":778
+ *             new_ndim += 1
+ * 
+ *     if isinstance(memview, _memoryviewslice):             # <<<<<<<<<<<<<<
+ *         return memoryview_fromslice(dst, new_ndim,
+ *                                     memviewsliceobj.to_object_func,
+ */
+  __pyx_t_1 = __Pyx_TypeCheck(((PyObject *)__pyx_v_memview), __pyx_memoryviewslice_type); 
+  if (__pyx_t_1) {
+
+    /* "View.MemoryView":779
+ * 
+ *     if isinstance(memview, _memoryviewslice):
+ *         return memoryview_fromslice(dst, new_ndim,             # <<<<<<<<<<<<<<
+ *                                     memviewsliceobj.to_object_func,
+ *                                     memviewsliceobj.to_dtype_func,
+ */
+    __Pyx_XDECREF((PyObject *)__pyx_r);
+
+    /* "View.MemoryView":780
+ *     if isinstance(memview, _memoryviewslice):
+ *         return memoryview_fromslice(dst, new_ndim,
+ *                                     memviewsliceobj.to_object_func,             # <<<<<<<<<<<<<<
+ *                                     memviewsliceobj.to_dtype_func,
+ *                                     memview.dtype_is_object)
+ */
+    if (unlikely(!__pyx_v_memviewsliceobj)) { __Pyx_RaiseUnboundLocalError("memviewsliceobj"); __PYX_ERR(1, 780, __pyx_L1_error) }
+
+    /* "View.MemoryView":781
+ *         return memoryview_fromslice(dst, new_ndim,
+ *                                     memviewsliceobj.to_object_func,
+ *                                     memviewsliceobj.to_dtype_func,             # <<<<<<<<<<<<<<
+ *                                     memview.dtype_is_object)
+ *     else:
+ */
+    if (unlikely(!__pyx_v_memviewsliceobj)) { __Pyx_RaiseUnboundLocalError("memviewsliceobj"); __PYX_ERR(1, 781, __pyx_L1_error) }
+
+    /* "View.MemoryView":779
+ * 
+ *     if isinstance(memview, _memoryviewslice):
+ *         return memoryview_fromslice(dst, new_ndim,             # <<<<<<<<<<<<<<
+ *                                     memviewsliceobj.to_object_func,
+ *                                     memviewsliceobj.to_dtype_func,
+ */
+    __pyx_t_2 = __pyx_memoryview_fromslice(__pyx_v_dst, __pyx_v_new_ndim, __pyx_v_memviewsliceobj->to_object_func, __pyx_v_memviewsliceobj->to_dtype_func, __pyx_v_memview->dtype_is_object); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 779, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_2);
+    if (!(likely(((__pyx_t_2) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_2, __pyx_memoryview_type))))) __PYX_ERR(1, 779, __pyx_L1_error)
+    __pyx_r = ((struct __pyx_memoryview_obj *)__pyx_t_2);
+    __pyx_t_2 = 0;
+    goto __pyx_L0;
+
+    /* "View.MemoryView":778
+ *             new_ndim += 1
+ * 
+ *     if isinstance(memview, _memoryviewslice):             # <<<<<<<<<<<<<<
+ *         return memoryview_fromslice(dst, new_ndim,
+ *                                     memviewsliceobj.to_object_func,
+ */
+  }
+
+  /* "View.MemoryView":784
+ *                                     memview.dtype_is_object)
+ *     else:
+ *         return memoryview_fromslice(dst, new_ndim, NULL, NULL,             # <<<<<<<<<<<<<<
+ *                                     memview.dtype_is_object)
+ * 
+ */
+  /*else*/ {
+    __Pyx_XDECREF((PyObject *)__pyx_r);
+
+    /* "View.MemoryView":785
+ *     else:
+ *         return memoryview_fromslice(dst, new_ndim, NULL, NULL,
+ *                                     memview.dtype_is_object)             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+    __pyx_t_2 = __pyx_memoryview_fromslice(__pyx_v_dst, __pyx_v_new_ndim, NULL, NULL, __pyx_v_memview->dtype_is_object); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 784, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_2);
+
+    /* "View.MemoryView":784
+ *                                     memview.dtype_is_object)
+ *     else:
+ *         return memoryview_fromslice(dst, new_ndim, NULL, NULL,             # <<<<<<<<<<<<<<
+ *                                     memview.dtype_is_object)
+ * 
+ */
+    if (!(likely(((__pyx_t_2) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_2, __pyx_memoryview_type))))) __PYX_ERR(1, 784, __pyx_L1_error)
+    __pyx_r = ((struct __pyx_memoryview_obj *)__pyx_t_2);
+    __pyx_t_2 = 0;
+    goto __pyx_L0;
+  }
+
+  /* "View.MemoryView":711
+ * 
+ * @cname('__pyx_memview_slice')
+ * cdef memoryview memview_slice(memoryview memview, object indices):             # <<<<<<<<<<<<<<
+ *     cdef int new_ndim = 0, suboffset_dim = -1, dim
+ *     cdef bint negative_step
+ */
+
+  /* function exit code */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_2);
+  __Pyx_XDECREF(__pyx_t_8);
+  __Pyx_AddTraceback("View.MemoryView.memview_slice", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = 0;
+  __pyx_L0:;
+  __Pyx_XDECREF((PyObject *)__pyx_v_memviewsliceobj);
+  __Pyx_XDECREF(__pyx_v_index);
+  __Pyx_XGIVEREF((PyObject *)__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":793
+ * 
+ * @cname('__pyx_memoryview_slice_memviewslice')
+ * cdef int slice_memviewslice(             # <<<<<<<<<<<<<<
+ *         __Pyx_memviewslice *dst,
+ *         Py_ssize_t shape, Py_ssize_t stride, Py_ssize_t suboffset,
+ */
+/* Applies one index (is_slice == 0) or one slice (is_slice != 0) on axis `dim` to *dst: a slice writes strides/shape/suboffsets at slot new_ndim, and both cases advance dst->data (or a pending suboffset entry) by start * stride. Returns 0 on success and -1 after adding a traceback on error. */
+static int __pyx_memoryview_slice_memviewslice(__Pyx_memviewslice *__pyx_v_dst, Py_ssize_t __pyx_v_shape, Py_ssize_t __pyx_v_stride, Py_ssize_t __pyx_v_suboffset, int __pyx_v_dim, int __pyx_v_new_ndim, int *__pyx_v_suboffset_dim, Py_ssize_t __pyx_v_start, Py_ssize_t __pyx_v_stop, Py_ssize_t __pyx_v_step, int __pyx_v_have_start, int __pyx_v_have_stop, int __pyx_v_have_step, int __pyx_v_is_slice) {
+  Py_ssize_t __pyx_v_new_shape;
+  int __pyx_v_negative_step;
+  int __pyx_r;
+  int __pyx_t_1;
+  int __pyx_t_2;
+  int __pyx_t_3;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  #ifdef WITH_THREAD
+  PyGILState_STATE __pyx_gilstate_save;
+  #endif
+  /* Index path: a negative start is wrapped once by adding shape, then 0 <= start < shape is required or an IndexError is reported through __pyx_memoryview_err_dim. */
+  /* "View.MemoryView":813
+ *     cdef bint negative_step
+ * 
+ *     if not is_slice:             # <<<<<<<<<<<<<<
+ * 
+ *         if start < 0:
+ */
+  __pyx_t_1 = (!__pyx_v_is_slice);
+  if (__pyx_t_1) {
+
+    /* "View.MemoryView":815
+ *     if not is_slice:
+ * 
+ *         if start < 0:             # <<<<<<<<<<<<<<
+ *             start += shape
+ *         if not 0 <= start < shape:
+ */
+    __pyx_t_1 = (__pyx_v_start < 0);
+    if (__pyx_t_1) {
+
+      /* "View.MemoryView":816
+ * 
+ *         if start < 0:
+ *             start += shape             # <<<<<<<<<<<<<<
+ *         if not 0 <= start < shape:
+ *             _err_dim(PyExc_IndexError, "Index out of bounds (axis %d)", dim)
+ */
+      __pyx_v_start = (__pyx_v_start + __pyx_v_shape);
+
+      /* "View.MemoryView":815
+ *     if not is_slice:
+ * 
+ *         if start < 0:             # <<<<<<<<<<<<<<
+ *             start += shape
+ *         if not 0 <= start < shape:
+ */
+    }
+
+    /* "View.MemoryView":817
+ *         if start < 0:
+ *             start += shape
+ *         if not 0 <= start < shape:             # <<<<<<<<<<<<<<
+ *             _err_dim(PyExc_IndexError, "Index out of bounds (axis %d)", dim)
+ *     else:
+ */
+    __pyx_t_1 = (0 <= __pyx_v_start);
+    if (__pyx_t_1) {
+      __pyx_t_1 = (__pyx_v_start < __pyx_v_shape);
+    }
+    __pyx_t_2 = (!__pyx_t_1);
+    if (__pyx_t_2) {
+
+      /* "View.MemoryView":818
+ *             start += shape
+ *         if not 0 <= start < shape:
+ *             _err_dim(PyExc_IndexError, "Index out of bounds (axis %d)", dim)             # <<<<<<<<<<<<<<
+ *     else:
+ * 
+ */
+      __pyx_t_3 = __pyx_memoryview_err_dim(PyExc_IndexError, __pyx_kp_s_Index_out_of_bounds_axis_d, __pyx_v_dim); if (unlikely(__pyx_t_3 == ((int)-1))) __PYX_ERR(1, 818, __pyx_L1_error)
+
+      /* "View.MemoryView":817
+ *         if start < 0:
+ *             start += shape
+ *         if not 0 <= start < shape:             # <<<<<<<<<<<<<<
+ *             _err_dim(PyExc_IndexError, "Index out of bounds (axis %d)", dim)
+ *     else:
+ */
+    }
+
+    /* "View.MemoryView":813
+ *     cdef bint negative_step
+ * 
+ *     if not is_slice:             # <<<<<<<<<<<<<<
+ * 
+ *         if start < 0:
+ */
+    goto __pyx_L3;
+  }
+  /* Slice path: normalise step, start and stop, then derive the extent of the resulting dimension. */
+  /* "View.MemoryView":821
+ *     else:
+ * 
+ *         if have_step:             # <<<<<<<<<<<<<<
+ *             negative_step = step < 0
+ *             if step == 0:
+ */
+  /*else*/ {
+    __pyx_t_2 = (__pyx_v_have_step != 0);
+    if (__pyx_t_2) {
+
+      /* "View.MemoryView":822
+ * 
+ *         if have_step:
+ *             negative_step = step < 0             # <<<<<<<<<<<<<<
+ *             if step == 0:
+ *                 _err_dim(PyExc_ValueError, "Step may not be zero (axis %d)", dim)
+ */
+      __pyx_v_negative_step = (__pyx_v_step < 0);
+
+      /* "View.MemoryView":823
+ *         if have_step:
+ *             negative_step = step < 0
+ *             if step == 0:             # <<<<<<<<<<<<<<
+ *                 _err_dim(PyExc_ValueError, "Step may not be zero (axis %d)", dim)
+ *         else:
+ */
+      __pyx_t_2 = (__pyx_v_step == 0);
+      if (__pyx_t_2) {
+
+        /* "View.MemoryView":824
+ *             negative_step = step < 0
+ *             if step == 0:
+ *                 _err_dim(PyExc_ValueError, "Step may not be zero (axis %d)", dim)             # <<<<<<<<<<<<<<
+ *         else:
+ *             negative_step = False
+ */
+        __pyx_t_3 = __pyx_memoryview_err_dim(PyExc_ValueError, __pyx_kp_s_Step_may_not_be_zero_axis_d, __pyx_v_dim); if (unlikely(__pyx_t_3 == ((int)-1))) __PYX_ERR(1, 824, __pyx_L1_error)
+
+        /* "View.MemoryView":823
+ *         if have_step:
+ *             negative_step = step < 0
+ *             if step == 0:             # <<<<<<<<<<<<<<
+ *                 _err_dim(PyExc_ValueError, "Step may not be zero (axis %d)", dim)
+ *         else:
+ */
+      }
+
+      /* "View.MemoryView":821
+ *     else:
+ * 
+ *         if have_step:             # <<<<<<<<<<<<<<
+ *             negative_step = step < 0
+ *             if step == 0:
+ */
+      goto __pyx_L6;
+    }
+
+    /* "View.MemoryView":826
+ *                 _err_dim(PyExc_ValueError, "Step may not be zero (axis %d)", dim)
+ *         else:
+ *             negative_step = False             # <<<<<<<<<<<<<<
+ *             step = 1
+ * 
+ */
+    /*else*/ {
+      __pyx_v_negative_step = 0;
+
+      /* "View.MemoryView":827
+ *         else:
+ *             negative_step = False
+ *             step = 1             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+      __pyx_v_step = 1;
+    }
+    __pyx_L6:;
+    /* Start: an explicit value is wrapped once if negative and then clamped; a missing one defaults to shape - 1 for a negative step, else 0. NOTE(review): a start or stop that is still negative after wrapping is clamped to 0 even when step < 0, whereas Python's slice.indices() would yield -1 -- confirm against upstream Cython; do not hand-patch this generated file. */
+    /* "View.MemoryView":830
+ * 
+ * 
+ *         if have_start:             # <<<<<<<<<<<<<<
+ *             if start < 0:
+ *                 start += shape
+ */
+    __pyx_t_2 = (__pyx_v_have_start != 0);
+    if (__pyx_t_2) {
+
+      /* "View.MemoryView":831
+ * 
+ *         if have_start:
+ *             if start < 0:             # <<<<<<<<<<<<<<
+ *                 start += shape
+ *                 if start < 0:
+ */
+      __pyx_t_2 = (__pyx_v_start < 0);
+      if (__pyx_t_2) {
+
+        /* "View.MemoryView":832
+ *         if have_start:
+ *             if start < 0:
+ *                 start += shape             # <<<<<<<<<<<<<<
+ *                 if start < 0:
+ *                     start = 0
+ */
+        __pyx_v_start = (__pyx_v_start + __pyx_v_shape);
+
+        /* "View.MemoryView":833
+ *             if start < 0:
+ *                 start += shape
+ *                 if start < 0:             # <<<<<<<<<<<<<<
+ *                     start = 0
+ *             elif start >= shape:
+ */
+        __pyx_t_2 = (__pyx_v_start < 0);
+        if (__pyx_t_2) {
+
+          /* "View.MemoryView":834
+ *                 start += shape
+ *                 if start < 0:
+ *                     start = 0             # <<<<<<<<<<<<<<
+ *             elif start >= shape:
+ *                 if negative_step:
+ */
+          __pyx_v_start = 0;
+
+          /* "View.MemoryView":833
+ *             if start < 0:
+ *                 start += shape
+ *                 if start < 0:             # <<<<<<<<<<<<<<
+ *                     start = 0
+ *             elif start >= shape:
+ */
+        }
+
+        /* "View.MemoryView":831
+ * 
+ *         if have_start:
+ *             if start < 0:             # <<<<<<<<<<<<<<
+ *                 start += shape
+ *                 if start < 0:
+ */
+        goto __pyx_L9;
+      }
+
+      /* "View.MemoryView":835
+ *                 if start < 0:
+ *                     start = 0
+ *             elif start >= shape:             # <<<<<<<<<<<<<<
+ *                 if negative_step:
+ *                     start = shape - 1
+ */
+      __pyx_t_2 = (__pyx_v_start >= __pyx_v_shape);
+      if (__pyx_t_2) {
+
+        /* "View.MemoryView":836
+ *                     start = 0
+ *             elif start >= shape:
+ *                 if negative_step:             # <<<<<<<<<<<<<<
+ *                     start = shape - 1
+ *                 else:
+ */
+        if (__pyx_v_negative_step) {
+
+          /* "View.MemoryView":837
+ *             elif start >= shape:
+ *                 if negative_step:
+ *                     start = shape - 1             # <<<<<<<<<<<<<<
+ *                 else:
+ *                     start = shape
+ */
+          __pyx_v_start = (__pyx_v_shape - 1);
+
+          /* "View.MemoryView":836
+ *                     start = 0
+ *             elif start >= shape:
+ *                 if negative_step:             # <<<<<<<<<<<<<<
+ *                     start = shape - 1
+ *                 else:
+ */
+          goto __pyx_L11;
+        }
+
+        /* "View.MemoryView":839
+ *                     start = shape - 1
+ *                 else:
+ *                     start = shape             # <<<<<<<<<<<<<<
+ *         else:
+ *             if negative_step:
+ */
+        /*else*/ {
+          __pyx_v_start = __pyx_v_shape;
+        }
+        __pyx_L11:;
+
+        /* "View.MemoryView":835
+ *                 if start < 0:
+ *                     start = 0
+ *             elif start >= shape:             # <<<<<<<<<<<<<<
+ *                 if negative_step:
+ *                     start = shape - 1
+ */
+      }
+      __pyx_L9:;
+
+      /* "View.MemoryView":830
+ * 
+ * 
+ *         if have_start:             # <<<<<<<<<<<<<<
+ *             if start < 0:
+ *                 start += shape
+ */
+      goto __pyx_L8;
+    }
+
+    /* "View.MemoryView":841
+ *                     start = shape
+ *         else:
+ *             if negative_step:             # <<<<<<<<<<<<<<
+ *                 start = shape - 1
+ *             else:
+ */
+    /*else*/ {
+      if (__pyx_v_negative_step) {
+
+        /* "View.MemoryView":842
+ *         else:
+ *             if negative_step:
+ *                 start = shape - 1             # <<<<<<<<<<<<<<
+ *             else:
+ *                 start = 0
+ */
+        __pyx_v_start = (__pyx_v_shape - 1);
+
+        /* "View.MemoryView":841
+ *                     start = shape
+ *         else:
+ *             if negative_step:             # <<<<<<<<<<<<<<
+ *                 start = shape - 1
+ *             else:
+ */
+        goto __pyx_L12;
+      }
+
+      /* "View.MemoryView":844
+ *                 start = shape - 1
+ *             else:
+ *                 start = 0             # <<<<<<<<<<<<<<
+ * 
+ *         if have_stop:
+ */
+      /*else*/ {
+        __pyx_v_start = 0;
+      }
+      __pyx_L12:;
+    }
+    __pyx_L8:;
+    /* Stop: an explicit value is wrapped once if negative and then clamped to the range [0, shape]; a missing one defaults to -1 for a negative step, else shape. */
+    /* "View.MemoryView":846
+ *                 start = 0
+ * 
+ *         if have_stop:             # <<<<<<<<<<<<<<
+ *             if stop < 0:
+ *                 stop += shape
+ */
+    __pyx_t_2 = (__pyx_v_have_stop != 0);
+    if (__pyx_t_2) {
+
+      /* "View.MemoryView":847
+ * 
+ *         if have_stop:
+ *             if stop < 0:             # <<<<<<<<<<<<<<
+ *                 stop += shape
+ *                 if stop < 0:
+ */
+      __pyx_t_2 = (__pyx_v_stop < 0);
+      if (__pyx_t_2) {
+
+        /* "View.MemoryView":848
+ *         if have_stop:
+ *             if stop < 0:
+ *                 stop += shape             # <<<<<<<<<<<<<<
+ *                 if stop < 0:
+ *                     stop = 0
+ */
+        __pyx_v_stop = (__pyx_v_stop + __pyx_v_shape);
+
+        /* "View.MemoryView":849
+ *             if stop < 0:
+ *                 stop += shape
+ *                 if stop < 0:             # <<<<<<<<<<<<<<
+ *                     stop = 0
+ *             elif stop > shape:
+ */
+        __pyx_t_2 = (__pyx_v_stop < 0);
+        if (__pyx_t_2) {
+
+          /* "View.MemoryView":850
+ *                 stop += shape
+ *                 if stop < 0:
+ *                     stop = 0             # <<<<<<<<<<<<<<
+ *             elif stop > shape:
+ *                 stop = shape
+ */
+          __pyx_v_stop = 0;
+
+          /* "View.MemoryView":849
+ *             if stop < 0:
+ *                 stop += shape
+ *                 if stop < 0:             # <<<<<<<<<<<<<<
+ *                     stop = 0
+ *             elif stop > shape:
+ */
+        }
+
+        /* "View.MemoryView":847
+ * 
+ *         if have_stop:
+ *             if stop < 0:             # <<<<<<<<<<<<<<
+ *                 stop += shape
+ *                 if stop < 0:
+ */
+        goto __pyx_L14;
+      }
+
+      /* "View.MemoryView":851
+ *                 if stop < 0:
+ *                     stop = 0
+ *             elif stop > shape:             # <<<<<<<<<<<<<<
+ *                 stop = shape
+ *         else:
+ */
+      __pyx_t_2 = (__pyx_v_stop > __pyx_v_shape);
+      if (__pyx_t_2) {
+
+        /* "View.MemoryView":852
+ *                     stop = 0
+ *             elif stop > shape:
+ *                 stop = shape             # <<<<<<<<<<<<<<
+ *         else:
+ *             if negative_step:
+ */
+        __pyx_v_stop = __pyx_v_shape;
+
+        /* "View.MemoryView":851
+ *                 if stop < 0:
+ *                     stop = 0
+ *             elif stop > shape:             # <<<<<<<<<<<<<<
+ *                 stop = shape
+ *         else:
+ */
+      }
+      __pyx_L14:;
+
+      /* "View.MemoryView":846
+ *                 start = 0
+ * 
+ *         if have_stop:             # <<<<<<<<<<<<<<
+ *             if stop < 0:
+ *                 stop += shape
+ */
+      goto __pyx_L13;
+    }
+
+    /* "View.MemoryView":854
+ *                 stop = shape
+ *         else:
+ *             if negative_step:             # <<<<<<<<<<<<<<
+ *                 stop = -1
+ *             else:
+ */
+    /*else*/ {
+      if (__pyx_v_negative_step) {
+
+        /* "View.MemoryView":855
+ *         else:
+ *             if negative_step:
+ *                 stop = -1             # <<<<<<<<<<<<<<
+ *             else:
+ *                 stop = shape
+ */
+        __pyx_v_stop = -1L;
+
+        /* "View.MemoryView":854
+ *                 stop = shape
+ *         else:
+ *             if negative_step:             # <<<<<<<<<<<<<<
+ *                 stop = -1
+ *             else:
+ */
+        goto __pyx_L16;
+      }
+
+      /* "View.MemoryView":857
+ *                 stop = -1
+ *             else:
+ *                 stop = shape             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+      /*else*/ {
+        __pyx_v_stop = __pyx_v_shape;
+      }
+      __pyx_L16:;
+    }
+    __pyx_L13:;
+    /* Extent: plain C division of (stop - start) by step (step == 0 was rejected above), bumped by one whenever a remainder is left; a negative result is clamped to 0 further down. */
+    /* "View.MemoryView":861
+ * 
+ *         with cython.cdivision(True):
+ *             new_shape = (stop - start) // step             # <<<<<<<<<<<<<<
+ * 
+ *             if (stop - start) - step * new_shape:
+ */
+    __pyx_v_new_shape = ((__pyx_v_stop - __pyx_v_start) / __pyx_v_step);
+
+    /* "View.MemoryView":863
+ *             new_shape = (stop - start) // step
+ * 
+ *             if (stop - start) - step * new_shape:             # <<<<<<<<<<<<<<
+ *                 new_shape += 1
+ * 
+ */
+    __pyx_t_2 = (((__pyx_v_stop - __pyx_v_start) - (__pyx_v_step * __pyx_v_new_shape)) != 0);
+    if (__pyx_t_2) {
+
+      /* "View.MemoryView":864
+ * 
+ *             if (stop - start) - step * new_shape:
+ *                 new_shape += 1             # <<<<<<<<<<<<<<
+ * 
+ *         if new_shape < 0:
+ */
+      __pyx_v_new_shape = (__pyx_v_new_shape + 1);
+
+      /* "View.MemoryView":863
+ *             new_shape = (stop - start) // step
+ * 
+ *             if (stop - start) - step * new_shape:             # <<<<<<<<<<<<<<
+ *                 new_shape += 1
+ * 
+ */
+    }
+
+    /* "View.MemoryView":866
+ *                 new_shape += 1
+ * 
+ *         if new_shape < 0:             # <<<<<<<<<<<<<<
+ *             new_shape = 0
+ * 
+ */
+    __pyx_t_2 = (__pyx_v_new_shape < 0);
+    if (__pyx_t_2) {
+
+      /* "View.MemoryView":867
+ * 
+ *         if new_shape < 0:
+ *             new_shape = 0             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+      __pyx_v_new_shape = 0;
+
+      /* "View.MemoryView":866
+ *                 new_shape += 1
+ * 
+ *         if new_shape < 0:             # <<<<<<<<<<<<<<
+ *             new_shape = 0
+ * 
+ */
+    }
+    /* Record the new dimension in slot new_ndim of dst: stride scaled by step, the computed extent, and the incoming suboffset. */
+    /* "View.MemoryView":870
+ * 
+ * 
+ *         dst.strides[new_ndim] = stride * step             # <<<<<<<<<<<<<<
+ *         dst.shape[new_ndim] = new_shape
+ *         dst.suboffsets[new_ndim] = suboffset
+ */
+    (__pyx_v_dst->strides[__pyx_v_new_ndim]) = (__pyx_v_stride * __pyx_v_step);
+
+    /* "View.MemoryView":871
+ * 
+ *         dst.strides[new_ndim] = stride * step
+ *         dst.shape[new_ndim] = new_shape             # <<<<<<<<<<<<<<
+ *         dst.suboffsets[new_ndim] = suboffset
+ * 
+ */
+    (__pyx_v_dst->shape[__pyx_v_new_ndim]) = __pyx_v_new_shape;
+
+    /* "View.MemoryView":872
+ *         dst.strides[new_ndim] = stride * step
+ *         dst.shape[new_ndim] = new_shape
+ *         dst.suboffsets[new_ndim] = suboffset             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+    (__pyx_v_dst->suboffsets[__pyx_v_new_ndim]) = __pyx_v_suboffset;
+  }
+  __pyx_L3:;
+  /* Both paths: apply the start offset (start * stride) to dst->data when suboffset_dim[0] is negative, otherwise to dst->suboffsets[suboffset_dim[0]]. */
+  /* "View.MemoryView":875
+ * 
+ * 
+ *     if suboffset_dim[0] < 0:             # <<<<<<<<<<<<<<
+ *         dst.data += start * stride
+ *     else:
+ */
+  __pyx_t_2 = ((__pyx_v_suboffset_dim[0]) < 0);
+  if (__pyx_t_2) {
+
+    /* "View.MemoryView":876
+ * 
+ *     if suboffset_dim[0] < 0:
+ *         dst.data += start * stride             # <<<<<<<<<<<<<<
+ *     else:
+ *         dst.suboffsets[suboffset_dim[0]] += start * stride
+ */
+    __pyx_v_dst->data = (__pyx_v_dst->data + (__pyx_v_start * __pyx_v_stride));
+
+    /* "View.MemoryView":875
+ * 
+ * 
+ *     if suboffset_dim[0] < 0:             # <<<<<<<<<<<<<<
+ *         dst.data += start * stride
+ *     else:
+ */
+    goto __pyx_L19;
+  }
+
+  /* "View.MemoryView":878
+ *         dst.data += start * stride
+ *     else:
+ *         dst.suboffsets[suboffset_dim[0]] += start * stride             # <<<<<<<<<<<<<<
+ * 
+ *     if suboffset >= 0:
+ */
+  /*else*/ {
+    __pyx_t_3 = (__pyx_v_suboffset_dim[0]);
+    (__pyx_v_dst->suboffsets[__pyx_t_3]) = ((__pyx_v_dst->suboffsets[__pyx_t_3]) + (__pyx_v_start * __pyx_v_stride));
+  }
+  __pyx_L19:;
+  /* When suboffset >= 0: an index with new_ndim == 0 reads a pointer through dst->data (cast to char **) and adds suboffset; an index with new_ndim != 0 reports IndexError; a slice stores new_ndim into suboffset_dim[0]. */
+  /* "View.MemoryView":880
+ *         dst.suboffsets[suboffset_dim[0]] += start * stride
+ * 
+ *     if suboffset >= 0:             # <<<<<<<<<<<<<<
+ *         if not is_slice:
+ *             if new_ndim == 0:
+ */
+  __pyx_t_2 = (__pyx_v_suboffset >= 0);
+  if (__pyx_t_2) {
+
+    /* "View.MemoryView":881
+ * 
+ *     if suboffset >= 0:
+ *         if not is_slice:             # <<<<<<<<<<<<<<
+ *             if new_ndim == 0:
+ *                 dst.data = (<char **> dst.data)[0] + suboffset
+ */
+    __pyx_t_2 = (!__pyx_v_is_slice);
+    if (__pyx_t_2) {
+
+      /* "View.MemoryView":882
+ *     if suboffset >= 0:
+ *         if not is_slice:
+ *             if new_ndim == 0:             # <<<<<<<<<<<<<<
+ *                 dst.data = (<char **> dst.data)[0] + suboffset
+ *             else:
+ */
+      __pyx_t_2 = (__pyx_v_new_ndim == 0);
+      if (__pyx_t_2) {
+
+        /* "View.MemoryView":883
+ *         if not is_slice:
+ *             if new_ndim == 0:
+ *                 dst.data = (<char **> dst.data)[0] + suboffset             # <<<<<<<<<<<<<<
+ *             else:
+ *                 _err_dim(PyExc_IndexError, "All dimensions preceding dimension %d "
+ */
+        __pyx_v_dst->data = ((((char **)__pyx_v_dst->data)[0]) + __pyx_v_suboffset);
+
+        /* "View.MemoryView":882
+ *     if suboffset >= 0:
+ *         if not is_slice:
+ *             if new_ndim == 0:             # <<<<<<<<<<<<<<
+ *                 dst.data = (<char **> dst.data)[0] + suboffset
+ *             else:
+ */
+        goto __pyx_L22;
+      }
+
+      /* "View.MemoryView":885
+ *                 dst.data = (<char **> dst.data)[0] + suboffset
+ *             else:
+ *                 _err_dim(PyExc_IndexError, "All dimensions preceding dimension %d "             # <<<<<<<<<<<<<<
+ *                                      "must be indexed and not sliced", dim)
+ *         else:
+ */
+      /*else*/ {
+
+        /* "View.MemoryView":886
+ *             else:
+ *                 _err_dim(PyExc_IndexError, "All dimensions preceding dimension %d "
+ *                                      "must be indexed and not sliced", dim)             # <<<<<<<<<<<<<<
+ *         else:
+ *             suboffset_dim[0] = new_ndim
+ */
+        __pyx_t_3 = __pyx_memoryview_err_dim(PyExc_IndexError, __pyx_kp_s_All_dimensions_preceding_dimensi, __pyx_v_dim); if (unlikely(__pyx_t_3 == ((int)-1))) __PYX_ERR(1, 885, __pyx_L1_error)
+      }
+      __pyx_L22:;
+
+      /* "View.MemoryView":881
+ * 
+ *     if suboffset >= 0:
+ *         if not is_slice:             # <<<<<<<<<<<<<<
+ *             if new_ndim == 0:
+ *                 dst.data = (<char **> dst.data)[0] + suboffset
+ */
+      goto __pyx_L21;
+    }
+
+    /* "View.MemoryView":888
+ *                                      "must be indexed and not sliced", dim)
+ *         else:
+ *             suboffset_dim[0] = new_ndim             # <<<<<<<<<<<<<<
+ * 
+ *     return 0
+ */
+    /*else*/ {
+      (__pyx_v_suboffset_dim[0]) = __pyx_v_new_ndim;
+    }
+    __pyx_L21:;
+
+    /* "View.MemoryView":880
+ *         dst.suboffsets[suboffset_dim[0]] += start * stride
+ * 
+ *     if suboffset >= 0:             # <<<<<<<<<<<<<<
+ *         if not is_slice:
+ *             if new_ndim == 0:
+ */
+  }
+
+  /* "View.MemoryView":890
+ *             suboffset_dim[0] = new_ndim
+ * 
+ *     return 0             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  __pyx_r = 0;
+  goto __pyx_L0;
+
+  /* "View.MemoryView":793
+ * 
+ * @cname('__pyx_memoryview_slice_memviewslice')
+ * cdef int slice_memviewslice(             # <<<<<<<<<<<<<<
+ *         __Pyx_memviewslice *dst,
+ *         Py_ssize_t shape, Py_ssize_t stride, Py_ssize_t suboffset,
+ */
+  /* Error exit sets the result to -1; when WITH_THREAD is defined the traceback call is bracketed by __Pyx_PyGILState_Ensure/Release. The success path jumps straight to __pyx_L0. */
+  /* function exit code */
+  __pyx_L1_error:;
+  #ifdef WITH_THREAD
+  __pyx_gilstate_save = __Pyx_PyGILState_Ensure();
+  #endif
+  __Pyx_AddTraceback("View.MemoryView.slice_memviewslice", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = -1;
+  #ifdef WITH_THREAD
+  __Pyx_PyGILState_Release(__pyx_gilstate_save);
+  #endif
+  __pyx_L0:;
+  return __pyx_r;
+}
+
+/* "View.MemoryView":896
+ * 
+ * @cname('__pyx_pybuffer_index')
+ * cdef char *pybuffer_index(Py_buffer *view, char *bufp, Py_ssize_t index,             # <<<<<<<<<<<<<<
+ *                           Py_ssize_t dim) except NULL:
+ *     cdef Py_ssize_t shape, stride, suboffset = -1
+ */
+
+static char *__pyx_pybuffer_index(Py_buffer *__pyx_v_view, char *__pyx_v_bufp, Py_ssize_t __pyx_v_index, Py_ssize_t __pyx_v_dim) {
+  Py_ssize_t __pyx_v_shape;
+  Py_ssize_t __pyx_v_stride;
+  Py_ssize_t __pyx_v_suboffset;
+  Py_ssize_t __pyx_v_itemsize;
+  char *__pyx_v_resultp;
+  char *__pyx_r;
+  __Pyx_RefNannyDeclarations
+  Py_ssize_t __pyx_t_1;
+  int __pyx_t_2;
+  PyObject *__pyx_t_3 = NULL;
+  Py_UCS4 __pyx_t_4;
+  PyObject *__pyx_t_5 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("pybuffer_index", 1);
+
+  /* "View.MemoryView":898
+ * cdef char *pybuffer_index(Py_buffer *view, char *bufp, Py_ssize_t index,
+ *                           Py_ssize_t dim) except NULL:
+ *     cdef Py_ssize_t shape, stride, suboffset = -1             # <<<<<<<<<<<<<<
+ *     cdef Py_ssize_t itemsize = view.itemsize
+ *     cdef char *resultp
+ */
+  __pyx_v_suboffset = -1L;
+
+  /* "View.MemoryView":899
+ *                           Py_ssize_t dim) except NULL:
+ *     cdef Py_ssize_t shape, stride, suboffset = -1
+ *     cdef Py_ssize_t itemsize = view.itemsize             # <<<<<<<<<<<<<<
+ *     cdef char *resultp
+ * 
+ */
+  __pyx_t_1 = __pyx_v_view->itemsize;
+  __pyx_v_itemsize = __pyx_t_1;
+
+  /* "View.MemoryView":902
+ *     cdef char *resultp
+ * 
+ *     if view.ndim == 0:             # <<<<<<<<<<<<<<
+ *         shape = view.len // itemsize
+ *         stride = itemsize
+ */
+  __pyx_t_2 = (__pyx_v_view->ndim == 0);
+  if (__pyx_t_2) {
+
+    /* "View.MemoryView":903
+ * 
+ *     if view.ndim == 0:
+ *         shape = view.len // itemsize             # <<<<<<<<<<<<<<
+ *         stride = itemsize
+ *     else:
+ */
+    if (unlikely(__pyx_v_itemsize == 0)) {
+      PyErr_SetString(PyExc_ZeroDivisionError, "integer division or modulo by zero");
+      __PYX_ERR(1, 903, __pyx_L1_error)
+    }
+    else if (sizeof(Py_ssize_t) == sizeof(long) && (!(((Py_ssize_t)-1) > 0)) && unlikely(__pyx_v_itemsize == (Py_ssize_t)-1)  && unlikely(__Pyx_UNARY_NEG_WOULD_OVERFLOW(__pyx_v_view->len))) {
+      PyErr_SetString(PyExc_OverflowError, "value too large to perform division");
+      __PYX_ERR(1, 903, __pyx_L1_error)
+    }
+    __pyx_v_shape = __Pyx_div_Py_ssize_t(__pyx_v_view->len, __pyx_v_itemsize);
+
+    /* "View.MemoryView":904
+ *     if view.ndim == 0:
+ *         shape = view.len // itemsize
+ *         stride = itemsize             # <<<<<<<<<<<<<<
+ *     else:
+ *         shape = view.shape[dim]
+ */
+    __pyx_v_stride = __pyx_v_itemsize;
+
+    /* "View.MemoryView":902
+ *     cdef char *resultp
+ * 
+ *     if view.ndim == 0:             # <<<<<<<<<<<<<<
+ *         shape = view.len // itemsize
+ *         stride = itemsize
+ */
+    goto __pyx_L3;
+  }
+
+  /* "View.MemoryView":906
+ *         stride = itemsize
+ *     else:
+ *         shape = view.shape[dim]             # <<<<<<<<<<<<<<
+ *         stride = view.strides[dim]
+ *         if view.suboffsets != NULL:
+ */
+  /*else*/ {
+    __pyx_v_shape = (__pyx_v_view->shape[__pyx_v_dim]);
+
+    /* "View.MemoryView":907
+ *     else:
+ *         shape = view.shape[dim]
+ *         stride = view.strides[dim]             # <<<<<<<<<<<<<<
+ *         if view.suboffsets != NULL:
+ *             suboffset = view.suboffsets[dim]
+ */
+    __pyx_v_stride = (__pyx_v_view->strides[__pyx_v_dim]);
+
+    /* "View.MemoryView":908
+ *         shape = view.shape[dim]
+ *         stride = view.strides[dim]
+ *         if view.suboffsets != NULL:             # <<<<<<<<<<<<<<
+ *             suboffset = view.suboffsets[dim]
+ * 
+ */
+    __pyx_t_2 = (__pyx_v_view->suboffsets != NULL);
+    if (__pyx_t_2) {
+
+      /* "View.MemoryView":909
+ *         stride = view.strides[dim]
+ *         if view.suboffsets != NULL:
+ *             suboffset = view.suboffsets[dim]             # <<<<<<<<<<<<<<
+ * 
+ *     if index < 0:
+ */
+      __pyx_v_suboffset = (__pyx_v_view->suboffsets[__pyx_v_dim]);
+
+      /* "View.MemoryView":908
+ *         shape = view.shape[dim]
+ *         stride = view.strides[dim]
+ *         if view.suboffsets != NULL:             # <<<<<<<<<<<<<<
+ *             suboffset = view.suboffsets[dim]
+ * 
+ */
+    }
+  }
+  __pyx_L3:;
+
+  /* "View.MemoryView":911
+ *             suboffset = view.suboffsets[dim]
+ * 
+ *     if index < 0:             # <<<<<<<<<<<<<<
+ *         index += view.shape[dim]
+ *         if index < 0:
+ */
+  __pyx_t_2 = (__pyx_v_index < 0);
+  if (__pyx_t_2) {
+
+    /* "View.MemoryView":912
+ * 
+ *     if index < 0:
+ *         index += view.shape[dim]             # <<<<<<<<<<<<<<
+ *         if index < 0:
+ *             raise IndexError, f"Out of bounds on buffer access (axis {dim})"
+ */
+    __pyx_v_index = (__pyx_v_index + (__pyx_v_view->shape[__pyx_v_dim]));
+
+    /* "View.MemoryView":913
+ *     if index < 0:
+ *         index += view.shape[dim]
+ *         if index < 0:             # <<<<<<<<<<<<<<
+ *             raise IndexError, f"Out of bounds on buffer access (axis {dim})"
+ * 
+ */
+    __pyx_t_2 = (__pyx_v_index < 0);
+    if (unlikely(__pyx_t_2)) {
+
+      /* "View.MemoryView":914
+ *         index += view.shape[dim]
+ *         if index < 0:
+ *             raise IndexError, f"Out of bounds on buffer access (axis {dim})"             # <<<<<<<<<<<<<<
+ * 
+ *     if index >= shape:
+ */
+      __pyx_t_3 = PyTuple_New(3); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 914, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_3);
+      __pyx_t_1 = 0;
+      __pyx_t_4 = 127;
+      __Pyx_INCREF(__pyx_kp_u_Out_of_bounds_on_buffer_access_a);
+      __pyx_t_1 += 37;
+      __Pyx_GIVEREF(__pyx_kp_u_Out_of_bounds_on_buffer_access_a);
+      PyTuple_SET_ITEM(__pyx_t_3, 0, __pyx_kp_u_Out_of_bounds_on_buffer_access_a);
+      __pyx_t_5 = __Pyx_PyUnicode_From_Py_ssize_t(__pyx_v_dim, 0, ' ', 'd'); if (unlikely(!__pyx_t_5)) __PYX_ERR(1, 914, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_5);
+      __pyx_t_1 += __Pyx_PyUnicode_GET_LENGTH(__pyx_t_5);
+      __Pyx_GIVEREF(__pyx_t_5);
+      PyTuple_SET_ITEM(__pyx_t_3, 1, __pyx_t_5);
+      __pyx_t_5 = 0;
+      __Pyx_INCREF(__pyx_kp_u__7);
+      __pyx_t_1 += 1;
+      __Pyx_GIVEREF(__pyx_kp_u__7);
+      PyTuple_SET_ITEM(__pyx_t_3, 2, __pyx_kp_u__7);
+      __pyx_t_5 = __Pyx_PyUnicode_Join(__pyx_t_3, 3, __pyx_t_1, __pyx_t_4); if (unlikely(!__pyx_t_5)) __PYX_ERR(1, 914, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_5);
+      __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+      __Pyx_Raise(__pyx_builtin_IndexError, __pyx_t_5, 0, 0);
+      __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
+      __PYX_ERR(1, 914, __pyx_L1_error)
+
+      /* "View.MemoryView":913
+ *     if index < 0:
+ *         index += view.shape[dim]
+ *         if index < 0:             # <<<<<<<<<<<<<<
+ *             raise IndexError, f"Out of bounds on buffer access (axis {dim})"
+ * 
+ */
+    }
+
+    /* "View.MemoryView":911
+ *             suboffset = view.suboffsets[dim]
+ * 
+ *     if index < 0:             # <<<<<<<<<<<<<<
+ *         index += view.shape[dim]
+ *         if index < 0:
+ */
+  }
+
+  /* "View.MemoryView":916
+ *             raise IndexError, f"Out of bounds on buffer access (axis {dim})"
+ * 
+ *     if index >= shape:             # <<<<<<<<<<<<<<
+ *         raise IndexError, f"Out of bounds on buffer access (axis {dim})"
+ * 
+ */
+  __pyx_t_2 = (__pyx_v_index >= __pyx_v_shape);
+  if (unlikely(__pyx_t_2)) {
+
+    /* "View.MemoryView":917
+ * 
+ *     if index >= shape:
+ *         raise IndexError, f"Out of bounds on buffer access (axis {dim})"             # <<<<<<<<<<<<<<
+ * 
+ *     resultp = bufp + index * stride
+ */
+    __pyx_t_5 = PyTuple_New(3); if (unlikely(!__pyx_t_5)) __PYX_ERR(1, 917, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_5);
+    __pyx_t_1 = 0;
+    __pyx_t_4 = 127;
+    __Pyx_INCREF(__pyx_kp_u_Out_of_bounds_on_buffer_access_a);
+    __pyx_t_1 += 37;
+    __Pyx_GIVEREF(__pyx_kp_u_Out_of_bounds_on_buffer_access_a);
+    PyTuple_SET_ITEM(__pyx_t_5, 0, __pyx_kp_u_Out_of_bounds_on_buffer_access_a);
+    __pyx_t_3 = __Pyx_PyUnicode_From_Py_ssize_t(__pyx_v_dim, 0, ' ', 'd'); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 917, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_3);
+    __pyx_t_1 += __Pyx_PyUnicode_GET_LENGTH(__pyx_t_3);
+    __Pyx_GIVEREF(__pyx_t_3);
+    PyTuple_SET_ITEM(__pyx_t_5, 1, __pyx_t_3);
+    __pyx_t_3 = 0;
+    __Pyx_INCREF(__pyx_kp_u__7);
+    __pyx_t_1 += 1;
+    __Pyx_GIVEREF(__pyx_kp_u__7);
+    PyTuple_SET_ITEM(__pyx_t_5, 2, __pyx_kp_u__7);
+    __pyx_t_3 = __Pyx_PyUnicode_Join(__pyx_t_5, 3, __pyx_t_1, __pyx_t_4); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 917, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_3);
+    __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
+    __Pyx_Raise(__pyx_builtin_IndexError, __pyx_t_3, 0, 0);
+    __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+    __PYX_ERR(1, 917, __pyx_L1_error)
+
+    /* "View.MemoryView":916
+ *             raise IndexError, f"Out of bounds on buffer access (axis {dim})"
+ * 
+ *     if index >= shape:             # <<<<<<<<<<<<<<
+ *         raise IndexError, f"Out of bounds on buffer access (axis {dim})"
+ * 
+ */
+  }
+
+  /* "View.MemoryView":919
+ *         raise IndexError, f"Out of bounds on buffer access (axis {dim})"
+ * 
+ *     resultp = bufp + index * stride             # <<<<<<<<<<<<<<
+ *     if suboffset >= 0:
+ *         resultp = (<char **> resultp)[0] + suboffset
+ */
+  __pyx_v_resultp = (__pyx_v_bufp + (__pyx_v_index * __pyx_v_stride));
+
+  /* "View.MemoryView":920
+ * 
+ *     resultp = bufp + index * stride
+ *     if suboffset >= 0:             # <<<<<<<<<<<<<<
+ *         resultp = (<char **> resultp)[0] + suboffset
+ * 
+ */
+  __pyx_t_2 = (__pyx_v_suboffset >= 0);
+  if (__pyx_t_2) {
+
+    /* "View.MemoryView":921
+ *     resultp = bufp + index * stride
+ *     if suboffset >= 0:
+ *         resultp = (<char **> resultp)[0] + suboffset             # <<<<<<<<<<<<<<
+ * 
+ *     return resultp
+ */
+    __pyx_v_resultp = ((((char **)__pyx_v_resultp)[0]) + __pyx_v_suboffset);
+
+    /* "View.MemoryView":920
+ * 
+ *     resultp = bufp + index * stride
+ *     if suboffset >= 0:             # <<<<<<<<<<<<<<
+ *         resultp = (<char **> resultp)[0] + suboffset
+ * 
+ */
+  }
+
+  /* "View.MemoryView":923
+ *         resultp = (<char **> resultp)[0] + suboffset
+ * 
+ *     return resultp             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  __pyx_r = __pyx_v_resultp;
+  goto __pyx_L0;
+
+  /* "View.MemoryView":896
+ * 
+ * @cname('__pyx_pybuffer_index')
+ * cdef char *pybuffer_index(Py_buffer *view, char *bufp, Py_ssize_t index,             # <<<<<<<<<<<<<<
+ *                           Py_ssize_t dim) except NULL:
+ *     cdef Py_ssize_t shape, stride, suboffset = -1
+ */
+
+  /* function exit code */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_3);
+  __Pyx_XDECREF(__pyx_t_5);
+  __Pyx_AddTraceback("View.MemoryView.pybuffer_index", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __pyx_L0:;
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":929
+ * 
+ * @cname('__pyx_memslice_transpose')
+ * cdef int transpose_memslice(__Pyx_memviewslice *memslice) except -1 nogil:             # <<<<<<<<<<<<<<
+ *     cdef int ndim = memslice.memview.view.ndim
+ * 
+ */
+
+static int __pyx_memslice_transpose(__Pyx_memviewslice *__pyx_v_memslice) { /* Reverses the dimension order of the slice in place by swapping mirrored shape and strides entries; returns 0 on the normal path and -1 through the error exit. */
+  int __pyx_v_ndim;
+  Py_ssize_t *__pyx_v_shape;
+  Py_ssize_t *__pyx_v_strides;
+  int __pyx_v_i;
+  int __pyx_v_j;
+  int __pyx_r;
+  int __pyx_t_1;
+  Py_ssize_t *__pyx_t_2;
+  long __pyx_t_3;
+  long __pyx_t_4;
+  Py_ssize_t __pyx_t_5;
+  Py_ssize_t __pyx_t_6;
+  int __pyx_t_7;
+  int __pyx_t_8;
+  int __pyx_t_9;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  #ifdef WITH_THREAD
+  PyGILState_STATE __pyx_gilstate_save; /* only used by the error exit at the bottom of this function */
+  #endif
+
+  /* "View.MemoryView":930
+ * @cname('__pyx_memslice_transpose')
+ * cdef int transpose_memslice(__Pyx_memviewslice *memslice) except -1 nogil:
+ *     cdef int ndim = memslice.memview.view.ndim             # <<<<<<<<<<<<<<
+ * 
+ *     cdef Py_ssize_t *shape = memslice.shape
+ */
+  __pyx_t_1 = __pyx_v_memslice->memview->view.ndim; /* dimension count comes from the owning memoryview's Py_buffer, not from the slice struct */
+  __pyx_v_ndim = __pyx_t_1;
+
+  /* "View.MemoryView":932
+ *     cdef int ndim = memslice.memview.view.ndim
+ * 
+ *     cdef Py_ssize_t *shape = memslice.shape             # <<<<<<<<<<<<<<
+ *     cdef Py_ssize_t *strides = memslice.strides
+ * 
+ */
+  __pyx_t_2 = __pyx_v_memslice->shape;
+  __pyx_v_shape = __pyx_t_2; /* alias into the slice: writes through this pointer modify memslice itself */
+
+  /* "View.MemoryView":933
+ * 
+ *     cdef Py_ssize_t *shape = memslice.shape
+ *     cdef Py_ssize_t *strides = memslice.strides             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  __pyx_t_2 = __pyx_v_memslice->strides;
+  __pyx_v_strides = __pyx_t_2; /* alias into the slice, same as shape above */
+
+  /* "View.MemoryView":937
+ * 
+ *     cdef int i, j
+ *     for i in range(ndim // 2):             # <<<<<<<<<<<<<<
+ *         j = ndim - 1 - i
+ *         strides[i], strides[j] = strides[j], strides[i]
+ */
+  __pyx_t_3 = __Pyx_div_long(__pyx_v_ndim, 2); /* number of (i, j) pairs to swap, i.e. the .pyx expression ndim // 2 */
+  __pyx_t_4 = __pyx_t_3; /* loop bound is captured once before the loop starts */
+  for (__pyx_t_1 = 0; __pyx_t_1 < __pyx_t_4; __pyx_t_1+=1) {
+    __pyx_v_i = __pyx_t_1;
+
+    /* "View.MemoryView":938
+ *     cdef int i, j
+ *     for i in range(ndim // 2):
+ *         j = ndim - 1 - i             # <<<<<<<<<<<<<<
+ *         strides[i], strides[j] = strides[j], strides[i]
+ *         shape[i], shape[j] = shape[j], shape[i]
+ */
+    __pyx_v_j = ((__pyx_v_ndim - 1) - __pyx_v_i); /* mirror index counted back from the last dimension */
+
+    /* "View.MemoryView":939
+ *     for i in range(ndim // 2):
+ *         j = ndim - 1 - i
+ *         strides[i], strides[j] = strides[j], strides[i]             # <<<<<<<<<<<<<<
+ *         shape[i], shape[j] = shape[j], shape[i]
+ * 
+ */
+    __pyx_t_5 = (__pyx_v_strides[__pyx_v_j]); /* both values are read into temporaries first ... */
+    __pyx_t_6 = (__pyx_v_strides[__pyx_v_i]);
+    (__pyx_v_strides[__pyx_v_i]) = __pyx_t_5; /* ... and then stored crosswise, completing the swap */
+    (__pyx_v_strides[__pyx_v_j]) = __pyx_t_6;
+
+    /* "View.MemoryView":940
+ *         j = ndim - 1 - i
+ *         strides[i], strides[j] = strides[j], strides[i]
+ *         shape[i], shape[j] = shape[j], shape[i]             # <<<<<<<<<<<<<<
+ * 
+ *         if memslice.suboffsets[i] >= 0 or memslice.suboffsets[j] >= 0:
+ */
+    __pyx_t_6 = (__pyx_v_shape[__pyx_v_j]); /* same read-then-store swap, reusing the two temporaries */
+    __pyx_t_5 = (__pyx_v_shape[__pyx_v_i]);
+    (__pyx_v_shape[__pyx_v_i]) = __pyx_t_6;
+    (__pyx_v_shape[__pyx_v_j]) = __pyx_t_5;
+
+    /* "View.MemoryView":942
+ *         shape[i], shape[j] = shape[j], shape[i]
+ * 
+ *         if memslice.suboffsets[i] >= 0 or memslice.suboffsets[j] >= 0:             # <<<<<<<<<<<<<<
+ *             _err(PyExc_ValueError, "Cannot transpose memoryview with indirect dimensions")
+ * 
+ */
+    __pyx_t_8 = ((__pyx_v_memslice->suboffsets[__pyx_v_i]) >= 0); /* left operand of the `or` */
+    if (!__pyx_t_8) { /* left operand false: fall through and evaluate the right operand */
+    } else {
+      __pyx_t_7 = __pyx_t_8; /* left operand true: short-circuit, skipping the right operand */
+      goto __pyx_L6_bool_binop_done;
+    }
+    __pyx_t_8 = ((__pyx_v_memslice->suboffsets[__pyx_v_j]) >= 0); /* right operand of the `or` */
+    __pyx_t_7 = __pyx_t_8;
+    __pyx_L6_bool_binop_done:;
+    if (__pyx_t_7) {
+
+      /* "View.MemoryView":943
+ * 
+ *         if memslice.suboffsets[i] >= 0 or memslice.suboffsets[j] >= 0:
+ *             _err(PyExc_ValueError, "Cannot transpose memoryview with indirect dimensions")             # <<<<<<<<<<<<<<
+ * 
+ *     return 0
+ */
+      __pyx_t_9 = __pyx_memoryview_err(PyExc_ValueError, __pyx_kp_s_Cannot_transpose_memoryview_with); if (unlikely(__pyx_t_9 == ((int)-1))) __PYX_ERR(1, 943, __pyx_L1_error) /* NOTE(review): this check runs after the swaps above, so shape/strides for pairs up to and including this one stay swapped on the error path; presumably callers discard the slice on failure - confirm */
+
+      /* "View.MemoryView":942
+ *         shape[i], shape[j] = shape[j], shape[i]
+ * 
+ *         if memslice.suboffsets[i] >= 0 or memslice.suboffsets[j] >= 0:             # <<<<<<<<<<<<<<
+ *             _err(PyExc_ValueError, "Cannot transpose memoryview with indirect dimensions")
+ * 
+ */
+    }
+  }
+
+  /* "View.MemoryView":945
+ *             _err(PyExc_ValueError, "Cannot transpose memoryview with indirect dimensions")
+ * 
+ *     return 0             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  __pyx_r = 0; /* normal return value */
+  goto __pyx_L0; /* jump over the error exit */
+
+  /* "View.MemoryView":929
+ * 
+ * @cname('__pyx_memslice_transpose')
+ * cdef int transpose_memslice(__Pyx_memviewslice *memslice) except -1 nogil:             # <<<<<<<<<<<<<<
+ *     cdef int ndim = memslice.memview.view.ndim
+ * 
+ */
+
+  /* function exit code */
+  __pyx_L1_error:;
+  #ifdef WITH_THREAD
+  __pyx_gilstate_save = __Pyx_PyGILState_Ensure(); /* the .pyx function is declared nogil, so the GIL is taken before the traceback call below */
+  #endif
+  __Pyx_AddTraceback("View.MemoryView.transpose_memslice", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = -1; /* error return value, matching `except -1` in the .pyx signature */
+  #ifdef WITH_THREAD
+  __Pyx_PyGILState_Release(__pyx_gilstate_save); /* restores the GIL state saved above */
+  #endif
+  __pyx_L0:;
+  return __pyx_r;
+}
+
+/* "View.MemoryView":963
+ *     cdef int (*to_dtype_func)(char *, object) except 0
+ * 
+ *     def __dealloc__(self):             # <<<<<<<<<<<<<<
+ *         __PYX_XCLEAR_MEMVIEW(&self.from_slice, 1)
+ * 
+ */
+
+/* Python wrapper */
+static void __pyx_memoryviewslice___dealloc__(PyObject *__pyx_v_self); /*proto*/
+static void __pyx_memoryviewslice___dealloc__(PyObject *__pyx_v_self) { /* Wrapper for _memoryviewslice.__dealloc__: casts self to the slice object struct and forwards to the implementation function. */
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__dealloc__ (wrapper)", 0);
+  __pyx_kwvalues = __Pyx_KwValues_VARARGS(__pyx_args, __pyx_nargs); /* NOTE(review): __pyx_args and __pyx_nargs are not declared in this function; presumably __Pyx_KwValues_VARARGS is a macro that discards its arguments - confirm against its definition */
+  __pyx_memoryviewslice___pyx_pf_15View_dot_MemoryView_16_memoryviewslice___dealloc__(((struct __pyx_memoryviewslice_obj *)__pyx_v_self));
+
+  /* function exit code */
+  __Pyx_RefNannyFinishContext();
+}
+
+static void __pyx_memoryviewslice___pyx_pf_15View_dot_MemoryView_16_memoryviewslice___dealloc__(struct __pyx_memoryviewslice_obj *__pyx_v_self) { /* Implementation of _memoryviewslice.__dealloc__: its single statement clears self->from_slice. */
+
+  /* "View.MemoryView":964
+ * 
+ *     def __dealloc__(self):
+ *         __PYX_XCLEAR_MEMVIEW(&self.from_slice, 1)             # <<<<<<<<<<<<<<
+ * 
+ *     cdef convert_item_to_object(self, char *itemp):
+ */
+  __PYX_XCLEAR_MEMVIEW((&__pyx_v_self->from_slice), 1); /* NOTE(review): the second argument (1) is presumably a "have GIL" flag - confirm against the macro definition */
+
+  /* "View.MemoryView":963
+ *     cdef int (*to_dtype_func)(char *, object) except 0
+ * 
+ *     def __dealloc__(self):             # <<<<<<<<<<<<<<
+ *         __PYX_XCLEAR_MEMVIEW(&self.from_slice, 1)
+ * 
+ */
+
+  /* function exit code */
+}
+
+/* "View.MemoryView":966
+ *         __PYX_XCLEAR_MEMVIEW(&self.from_slice, 1)
+ * 
+ *     cdef convert_item_to_object(self, char *itemp):             # <<<<<<<<<<<<<<
+ *         if self.to_object_func != NULL:
+ *             return self.to_object_func(itemp)
+ */
+
+static PyObject *__pyx_memoryviewslice_convert_item_to_object(struct __pyx_memoryviewslice_obj *__pyx_v_self, char *__pyx_v_itemp) { /* Converts the item at itemp to a Python object: uses self->to_object_func when it is set, otherwise the base memoryview converter; returns 0 (NULL) through the error exit. */
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  int __pyx_t_1;
+  PyObject *__pyx_t_2 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("convert_item_to_object", 1);
+
+  /* "View.MemoryView":967
+ * 
+ *     cdef convert_item_to_object(self, char *itemp):
+ *         if self.to_object_func != NULL:             # <<<<<<<<<<<<<<
+ *             return self.to_object_func(itemp)
+ *         else:
+ */
+  __pyx_t_1 = (__pyx_v_self->to_object_func != NULL); /* a converter stored on the slice takes priority over the base implementation */
+  if (__pyx_t_1) {
+
+    /* "View.MemoryView":968
+ *     cdef convert_item_to_object(self, char *itemp):
+ *         if self.to_object_func != NULL:
+ *             return self.to_object_func(itemp)             # <<<<<<<<<<<<<<
+ *         else:
+ *             return memoryview.convert_item_to_object(self, itemp)
+ */
+    __Pyx_XDECREF(__pyx_r); /* __pyx_r has not been assigned yet, so it is still NULL here */
+    __pyx_t_2 = __pyx_v_self->to_object_func(__pyx_v_itemp); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 968, __pyx_L1_error) /* a NULL result is treated as failure */
+    __Pyx_GOTREF(__pyx_t_2);
+    __pyx_r = __pyx_t_2; /* the temporary's reference is handed to the return slot ... */
+    __pyx_t_2 = 0; /* ... and the temporary is cleared so it no longer refers to the returned object */
+    goto __pyx_L0;
+
+    /* "View.MemoryView":967
+ * 
+ *     cdef convert_item_to_object(self, char *itemp):
+ *         if self.to_object_func != NULL:             # <<<<<<<<<<<<<<
+ *             return self.to_object_func(itemp)
+ *         else:
+ */
+  }
+
+  /* "View.MemoryView":970
+ *             return self.to_object_func(itemp)
+ *         else:
+ *             return memoryview.convert_item_to_object(self, itemp)             # <<<<<<<<<<<<<<
+ * 
+ *     cdef assign_item_from_object(self, char *itemp, object value):
+ */
+  /*else*/ {
+    __Pyx_XDECREF(__pyx_r); /* __pyx_r is still NULL on this path as well */
+    __pyx_t_2 = __pyx_memoryview_convert_item_to_object(((struct __pyx_memoryview_obj *)__pyx_v_self), __pyx_v_itemp); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 970, __pyx_L1_error) /* direct call to the base-class function with self cast to the base struct */
+    __Pyx_GOTREF(__pyx_t_2);
+    __pyx_r = __pyx_t_2; /* same hand-over of the reference as in the branch above */
+    __pyx_t_2 = 0;
+    goto __pyx_L0;
+  }
+
+  /* "View.MemoryView":966
+ *         __PYX_XCLEAR_MEMVIEW(&self.from_slice, 1)
+ * 
+ *     cdef convert_item_to_object(self, char *itemp):             # <<<<<<<<<<<<<<
+ *         if self.to_object_func != NULL:
+ *             return self.to_object_func(itemp)
+ */
+
+  /* function exit code */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_2); /* releases the temporary if it still holds a reference */
+  __Pyx_AddTraceback("View.MemoryView._memoryviewslice.convert_item_to_object", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = 0; /* error return value */
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":972
+ *             return memoryview.convert_item_to_object(self, itemp)
+ * 
+ *     cdef assign_item_from_object(self, char *itemp, object value):             # <<<<<<<<<<<<<<
+ *         if self.to_dtype_func != NULL:
+ *             self.to_dtype_func(itemp, value)
+ */
+
+static PyObject *__pyx_memoryviewslice_assign_item_from_object(struct __pyx_memoryviewslice_obj *__pyx_v_self, char *__pyx_v_itemp, PyObject *__pyx_v_value) { /* Stores value at itemp: uses self->to_dtype_func when it is set, otherwise the base memoryview implementation; returns None on the normal path and 0 (NULL) through the error exit. */
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  int __pyx_t_1;
+  int __pyx_t_2;
+  PyObject *__pyx_t_3 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("assign_item_from_object", 1);
+
+  /* "View.MemoryView":973
+ * 
+ *     cdef assign_item_from_object(self, char *itemp, object value):
+ *         if self.to_dtype_func != NULL:             # <<<<<<<<<<<<<<
+ *             self.to_dtype_func(itemp, value)
+ *         else:
+ */
+  __pyx_t_1 = (__pyx_v_self->to_dtype_func != NULL); /* a converter stored on the slice takes priority over the base implementation */
+  if (__pyx_t_1) {
+
+    /* "View.MemoryView":974
+ *     cdef assign_item_from_object(self, char *itemp, object value):
+ *         if self.to_dtype_func != NULL:
+ *             self.to_dtype_func(itemp, value)             # <<<<<<<<<<<<<<
+ *         else:
+ *             memoryview.assign_item_from_object(self, itemp, value)
+ */
+    __pyx_t_2 = __pyx_v_self->to_dtype_func(__pyx_v_itemp, __pyx_v_value); if (unlikely(__pyx_t_2 == ((int)0))) __PYX_ERR(1, 974, __pyx_L1_error) /* a return value of 0 from to_dtype_func is treated as failure */
+
+    /* "View.MemoryView":973
+ * 
+ *     cdef assign_item_from_object(self, char *itemp, object value):
+ *         if self.to_dtype_func != NULL:             # <<<<<<<<<<<<<<
+ *             self.to_dtype_func(itemp, value)
+ *         else:
+ */
+    goto __pyx_L3; /* skip the else branch */
+  }
+
+  /* "View.MemoryView":976
+ *             self.to_dtype_func(itemp, value)
+ *         else:
+ *             memoryview.assign_item_from_object(self, itemp, value)             # <<<<<<<<<<<<<<
+ * 
+ *     cdef _get_base(self):
+ */
+  /*else*/ {
+    __pyx_t_3 = __pyx_memoryview_assign_item_from_object(((struct __pyx_memoryview_obj *)__pyx_v_self), __pyx_v_itemp, __pyx_v_value); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 976, __pyx_L1_error) /* direct call to the base-class function with self cast to the base struct; NULL means failure */
+    __Pyx_GOTREF(__pyx_t_3);
+    __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; /* the base call's result object is not used, so its reference is dropped immediately */
+  }
+  __pyx_L3:;
+
+  /* "View.MemoryView":972
+ *             return memoryview.convert_item_to_object(self, itemp)
+ * 
+ *     cdef assign_item_from_object(self, char *itemp, object value):             # <<<<<<<<<<<<<<
+ *         if self.to_dtype_func != NULL:
+ *             self.to_dtype_func(itemp, value)
+ */
+
+  /* function exit code */
+  __pyx_r = Py_None; __Pyx_INCREF(Py_None); /* normal path: return None with its reference count incremented */
+  goto __pyx_L0;
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_3); /* releases the temporary if it still holds a reference */
+  __Pyx_AddTraceback("View.MemoryView._memoryviewslice.assign_item_from_object", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = 0; /* error return value */
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":978
+ *             memoryview.assign_item_from_object(self, itemp, value)
+ * 
+ *     cdef _get_base(self):             # <<<<<<<<<<<<<<
+ *         return self.from_object
+ * 
+ */
+
+static PyObject *__pyx_memoryviewslice__get_base(struct __pyx_memoryviewslice_obj *__pyx_v_self) { /* Returns self->from_object after incrementing its reference count; this function has no error exit. */
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("_get_base", 1);
+
+  /* "View.MemoryView":979
+ * 
+ *     cdef _get_base(self):
+ *         return self.from_object             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  __Pyx_XDECREF(__pyx_r); /* __pyx_r has not been assigned yet, so it is still NULL here */
+  __Pyx_INCREF(__pyx_v_self->from_object); /* the returned pointer gets its own reference in addition to the one held by self */
+  __pyx_r = __pyx_v_self->from_object;
+  goto __pyx_L0;
+
+  /* "View.MemoryView":978
+ *             memoryview.assign_item_from_object(self, itemp, value)
+ * 
+ *     cdef _get_base(self):             # <<<<<<<<<<<<<<
+ *         return self.from_object
+ * 
+ */
+
+  /* function exit code */
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "(tree fragment)":1
+ * def __reduce_cython__(self):             # <<<<<<<<<<<<<<
+ *     raise TypeError, "no default __reduce__ due to non-trivial __cinit__"
+ * def __setstate_cython__(self, __pyx_state):
+ */
+
+/* Python wrapper */
+static PyObject *__pyx_pw___pyx_memoryviewslice_1__reduce_cython__(PyObject *__pyx_v_self, 
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+); /*proto*/
+static PyObject *__pyx_pw___pyx_memoryviewslice_1__reduce_cython__(PyObject *__pyx_v_self, /* Argument-checking wrapper for __reduce_cython__: returns NULL if any positional or keyword argument is rejected, otherwise forwards self to the implementation function. */
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+) {
+  #if !CYTHON_METH_FASTCALL
+  CYTHON_UNUSED Py_ssize_t __pyx_nargs; /* without FASTCALL the count is not a parameter and is computed from the args tuple below */
+  #endif
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__reduce_cython__ (wrapper)", 0);
+  #if !CYTHON_METH_FASTCALL
+  #if CYTHON_ASSUME_SAFE_MACROS
+  __pyx_nargs = PyTuple_GET_SIZE(__pyx_args);
+  #else
+  __pyx_nargs = PyTuple_Size(__pyx_args); if (unlikely(__pyx_nargs < 0)) return NULL; /* a negative size from PyTuple_Size is treated as failure */
+  #endif
+  #endif
+  __pyx_kwvalues = __Pyx_KwValues_FASTCALL(__pyx_args, __pyx_nargs);
+  if (unlikely(__pyx_nargs > 0)) { /* the method takes no positional arguments */
+    __Pyx_RaiseArgtupleInvalid("__reduce_cython__", 1, 0, 0, __pyx_nargs); return NULL;}
+  if (unlikely(__pyx_kwds) && __Pyx_NumKwargs_FASTCALL(__pyx_kwds) && unlikely(!__Pyx_CheckKeywordStrings(__pyx_kwds, "__reduce_cython__", 0))) return NULL; /* reached only with a non-empty keyword set; returns NULL when __Pyx_CheckKeywordStrings reports failure */
+  __pyx_r = __pyx_pf___pyx_memoryviewslice___reduce_cython__(((struct __pyx_memoryviewslice_obj *)__pyx_v_self));
+
+  /* function exit code */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static PyObject *__pyx_pf___pyx_memoryviewslice___reduce_cython__(CYTHON_UNUSED struct __pyx_memoryviewslice_obj *__pyx_v_self) { /* Implementation of __reduce_cython__: unconditionally raises TypeError and returns NULL; self is unused. */
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("__reduce_cython__", 1);
+
+  /* "(tree fragment)":2
+ * def __reduce_cython__(self):
+ *     raise TypeError, "no default __reduce__ due to non-trivial __cinit__"             # <<<<<<<<<<<<<<
+ * def __setstate_cython__(self, __pyx_state):
+ *     raise TypeError, "no default __reduce__ due to non-trivial __cinit__"
+ */
+  __Pyx_Raise(__pyx_builtin_TypeError, __pyx_kp_s_no_default___reduce___due_to_non, 0, 0); /* the message constant corresponds to the string shown in the source echo above */
+  __PYX_ERR(1, 2, __pyx_L1_error) /* there is no success path: control always continues at the error label */
+
+  /* "(tree fragment)":1
+ * def __reduce_cython__(self):             # <<<<<<<<<<<<<<
+ *     raise TypeError, "no default __reduce__ due to non-trivial __cinit__"
+ * def __setstate_cython__(self, __pyx_state):
+ */
+
+  /* function exit code */
+  __pyx_L1_error:;
+  __Pyx_AddTraceback("View.MemoryView._memoryviewslice.__reduce_cython__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL; /* the only value this function ever returns */
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "(tree fragment)":3
+ * def __reduce_cython__(self):
+ *     raise TypeError, "no default __reduce__ due to non-trivial __cinit__"
+ * def __setstate_cython__(self, __pyx_state):             # <<<<<<<<<<<<<<
+ *     raise TypeError, "no default __reduce__ due to non-trivial __cinit__"
+ */
+
+/* Python wrapper */
+static PyObject *__pyx_pw___pyx_memoryviewslice_3__setstate_cython__(PyObject *__pyx_v_self, 
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+); /*proto*/
+static PyObject *__pyx_pw___pyx_memoryviewslice_3__setstate_cython__(PyObject *__pyx_v_self, /* Argument-parsing wrapper for __setstate_cython__: accepts exactly one argument (__pyx_state), positional or by keyword, then forwards it to the implementation function; returns NULL if parsing fails. */
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+) {
+  CYTHON_UNUSED PyObject *__pyx_v___pyx_state = 0;
+  #if !CYTHON_METH_FASTCALL
+  CYTHON_UNUSED Py_ssize_t __pyx_nargs; /* without FASTCALL the count is not a parameter and is computed from the args tuple below */
+  #endif
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject* values[1] = {0}; /* one slot per declared argument; zero-initialised so the cleanup loops can run on any path */
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__setstate_cython__ (wrapper)", 0);
+  #if !CYTHON_METH_FASTCALL
+  #if CYTHON_ASSUME_SAFE_MACROS
+  __pyx_nargs = PyTuple_GET_SIZE(__pyx_args);
+  #else
+  __pyx_nargs = PyTuple_Size(__pyx_args); if (unlikely(__pyx_nargs < 0)) return NULL; /* a negative size from PyTuple_Size is treated as failure */
+  #endif
+  #endif
+  __pyx_kwvalues = __Pyx_KwValues_FASTCALL(__pyx_args, __pyx_nargs);
+  {
+    PyObject **__pyx_pyargnames[] = {&__pyx_n_s_pyx_state,0}; /* zero-terminated list of accepted keyword names */
+    if (__pyx_kwds) {
+      Py_ssize_t kw_args;
+      switch (__pyx_nargs) { /* with keywords present, zero or one positional argument is accepted */
+        case  1: values[0] = __Pyx_Arg_FASTCALL(__pyx_args, 0);
+        CYTHON_FALLTHROUGH;
+        case  0: break;
+        default: goto __pyx_L5_argtuple_error;
+      }
+      kw_args = __Pyx_NumKwargs_FASTCALL(__pyx_kwds);
+      switch (__pyx_nargs) { /* only the zero-positional case still needs the argument looked up by keyword */
+        case  0:
+        if (likely((values[0] = __Pyx_GetKwValue_FASTCALL(__pyx_kwds, __pyx_kwvalues, __pyx_n_s_pyx_state)) != 0)) {
+          (void)__Pyx_Arg_NewRef_FASTCALL(values[0]); /* NOTE(review): presumably takes the reference that the cleanup loop's XDECREF later releases - confirm against the macro definition */
+          kw_args--; /* one keyword has been consumed */
+        }
+        else if (unlikely(PyErr_Occurred())) __PYX_ERR(1, 3, __pyx_L3_error) /* the lookup itself failed */
+        else goto __pyx_L5_argtuple_error; /* keyword absent and no error pending: the required argument is missing */
+      }
+      if (unlikely(kw_args > 0)) { /* keywords remain after the expected one was consumed */
+        const Py_ssize_t kwd_pos_args = __pyx_nargs;
+        if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_kwvalues, __pyx_pyargnames, 0, values + 0, kwd_pos_args, "__setstate_cython__") < 0)) __PYX_ERR(1, 3, __pyx_L3_error)
+      }
+    } else if (unlikely(__pyx_nargs != 1)) { /* no keywords: exactly one positional argument is required */
+      goto __pyx_L5_argtuple_error;
+    } else {
+      values[0] = __Pyx_Arg_FASTCALL(__pyx_args, 0);
+    }
+    __pyx_v___pyx_state = values[0];
+  }
+  goto __pyx_L6_skip; /* parsing succeeded: jump over the arity-error report */
+  __pyx_L5_argtuple_error:;
+  __Pyx_RaiseArgtupleInvalid("__setstate_cython__", 1, 1, 1, __pyx_nargs); __PYX_ERR(1, 3, __pyx_L3_error)
+  __pyx_L6_skip:;
+  goto __pyx_L4_argument_unpacking_done;
+  __pyx_L3_error:;
+  {
+    Py_ssize_t __pyx_temp;
+    for (__pyx_temp=0; __pyx_temp < (Py_ssize_t)(sizeof(values)/sizeof(values[0])); ++__pyx_temp) { /* error path: release every slot of values */
+      __Pyx_Arg_XDECREF_FASTCALL(values[__pyx_temp]);
+    }
+  }
+  __Pyx_AddTraceback("View.MemoryView._memoryviewslice.__setstate_cython__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __Pyx_RefNannyFinishContext();
+  return NULL;
+  __pyx_L4_argument_unpacking_done:;
+  __pyx_r = __pyx_pf___pyx_memoryviewslice_2__setstate_cython__(((struct __pyx_memoryviewslice_obj *)__pyx_v_self), __pyx_v___pyx_state);
+
+  /* function exit code */
+  {
+    Py_ssize_t __pyx_temp;
+    for (__pyx_temp=0; __pyx_temp < (Py_ssize_t)(sizeof(values)/sizeof(values[0])); ++__pyx_temp) { /* normal path: the same release of values after the implementation has returned */
+      __Pyx_Arg_XDECREF_FASTCALL(values[__pyx_temp]);
+    }
+  }
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static PyObject *__pyx_pf___pyx_memoryviewslice_2__setstate_cython__(CYTHON_UNUSED struct __pyx_memoryviewslice_obj *__pyx_v_self, CYTHON_UNUSED PyObject *__pyx_v___pyx_state) { /* Implementation of __setstate_cython__: unconditionally raises TypeError and returns NULL; both parameters are unused. */
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("__setstate_cython__", 1);
+
+  /* "(tree fragment)":4
+ *     raise TypeError, "no default __reduce__ due to non-trivial __cinit__"
+ * def __setstate_cython__(self, __pyx_state):
+ *     raise TypeError, "no default __reduce__ due to non-trivial __cinit__"             # <<<<<<<<<<<<<<
+ */
+  __Pyx_Raise(__pyx_builtin_TypeError, __pyx_kp_s_no_default___reduce___due_to_non, 0, 0); /* the message constant corresponds to the string shown in the source echo above */
+  __PYX_ERR(1, 4, __pyx_L1_error) /* there is no success path: control always continues at the error label */
+
+  /* "(tree fragment)":3
+ * def __reduce_cython__(self):
+ *     raise TypeError, "no default __reduce__ due to non-trivial __cinit__"
+ * def __setstate_cython__(self, __pyx_state):             # <<<<<<<<<<<<<<
+ *     raise TypeError, "no default __reduce__ due to non-trivial __cinit__"
+ */
+
+  /* function exit code */
+  __pyx_L1_error:;
+  __Pyx_AddTraceback("View.MemoryView._memoryviewslice.__setstate_cython__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL; /* the only value this function ever returns */
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":999
+ * 
+ * @cname('__pyx_memoryview_fromslice')
+ * cdef memoryview_fromslice(__Pyx_memviewslice memviewslice,             # <<<<<<<<<<<<<<
+ *                           int ndim,
+ *                           object (*to_object_func)(char *),
+ */
+
+static PyObject *__pyx_memoryview_fromslice(__Pyx_memviewslice __pyx_v_memviewslice, int __pyx_v_ndim, PyObject *(*__pyx_v_to_object_func)(char *), int (*__pyx_v_to_dtype_func)(char *, PyObject *), int __pyx_v_dtype_is_object) {
+  struct __pyx_memoryviewslice_obj *__pyx_v_result = 0;
+  Py_ssize_t __pyx_v_suboffset;
+  PyObject *__pyx_v_length = NULL;
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  int __pyx_t_1;
+  PyObject *__pyx_t_2 = NULL;
+  PyObject *__pyx_t_3 = NULL;
+  __Pyx_TypeInfo *__pyx_t_4;
+  Py_buffer __pyx_t_5;
+  Py_ssize_t *__pyx_t_6;
+  Py_ssize_t *__pyx_t_7;
+  Py_ssize_t *__pyx_t_8;
+  Py_ssize_t __pyx_t_9;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("memoryview_fromslice", 1);
+
+  /* "View.MemoryView":1007
+ *     cdef _memoryviewslice result
+ * 
+ *     if <PyObject *> memviewslice.memview == Py_None:             # <<<<<<<<<<<<<<
+ *         return None
+ * 
+ */
+  __pyx_t_1 = (((PyObject *)__pyx_v_memviewslice.memview) == Py_None);
+  if (__pyx_t_1) {
+
+    /* "View.MemoryView":1008
+ * 
+ *     if <PyObject *> memviewslice.memview == Py_None:
+ *         return None             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+    __Pyx_XDECREF(__pyx_r);
+    __pyx_r = Py_None; __Pyx_INCREF(Py_None);
+    goto __pyx_L0;
+
+    /* "View.MemoryView":1007
+ *     cdef _memoryviewslice result
+ * 
+ *     if <PyObject *> memviewslice.memview == Py_None:             # <<<<<<<<<<<<<<
+ *         return None
+ * 
+ */
+  }
+
+  /* "View.MemoryView":1013
+ * 
+ * 
+ *     result = _memoryviewslice.__new__(_memoryviewslice, None, 0, dtype_is_object)             # <<<<<<<<<<<<<<
+ * 
+ *     result.from_slice = memviewslice
+ */
+  __pyx_t_2 = __Pyx_PyBool_FromLong(__pyx_v_dtype_is_object); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 1013, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_2);
+  __pyx_t_3 = PyTuple_New(3); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 1013, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_3);
+  __Pyx_INCREF(Py_None);
+  __Pyx_GIVEREF(Py_None);
+  if (__Pyx_PyTuple_SET_ITEM(__pyx_t_3, 0, Py_None)) __PYX_ERR(1, 1013, __pyx_L1_error);
+  __Pyx_INCREF(__pyx_int_0);
+  __Pyx_GIVEREF(__pyx_int_0);
+  if (__Pyx_PyTuple_SET_ITEM(__pyx_t_3, 1, __pyx_int_0)) __PYX_ERR(1, 1013, __pyx_L1_error);
+  __Pyx_GIVEREF(__pyx_t_2);
+  if (__Pyx_PyTuple_SET_ITEM(__pyx_t_3, 2, __pyx_t_2)) __PYX_ERR(1, 1013, __pyx_L1_error);
+  __pyx_t_2 = 0;
+  __pyx_t_2 = ((PyObject *)__pyx_tp_new__memoryviewslice(((PyTypeObject *)__pyx_memoryviewslice_type), __pyx_t_3, NULL)); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 1013, __pyx_L1_error)
+  __Pyx_GOTREF((PyObject *)__pyx_t_2);
+  __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+  __pyx_v_result = ((struct __pyx_memoryviewslice_obj *)__pyx_t_2);
+  __pyx_t_2 = 0;
+
+  /* "View.MemoryView":1015
+ *     result = _memoryviewslice.__new__(_memoryviewslice, None, 0, dtype_is_object)
+ * 
+ *     result.from_slice = memviewslice             # <<<<<<<<<<<<<<
+ *     __PYX_INC_MEMVIEW(&memviewslice, 1)
+ * 
+ */
+  __pyx_v_result->from_slice = __pyx_v_memviewslice;
+
+  /* "View.MemoryView":1016
+ * 
+ *     result.from_slice = memviewslice
+ *     __PYX_INC_MEMVIEW(&memviewslice, 1)             # <<<<<<<<<<<<<<
+ * 
+ *     result.from_object = (<memoryview> memviewslice.memview)._get_base()
+ */
+  __PYX_INC_MEMVIEW((&__pyx_v_memviewslice), 1);
+
+  /* "View.MemoryView":1018
+ *     __PYX_INC_MEMVIEW(&memviewslice, 1)
+ * 
+ *     result.from_object = (<memoryview> memviewslice.memview)._get_base()             # <<<<<<<<<<<<<<
+ *     result.typeinfo = memviewslice.memview.typeinfo
+ * 
+ */
+  __pyx_t_2 = ((struct __pyx_vtabstruct_memoryview *)((struct __pyx_memoryview_obj *)__pyx_v_memviewslice.memview)->__pyx_vtab)->_get_base(((struct __pyx_memoryview_obj *)__pyx_v_memviewslice.memview)); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 1018, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_2);
+  __Pyx_GIVEREF(__pyx_t_2);
+  __Pyx_GOTREF(__pyx_v_result->from_object);
+  __Pyx_DECREF(__pyx_v_result->from_object);
+  __pyx_v_result->from_object = __pyx_t_2;
+  __pyx_t_2 = 0;
+
+  /* "View.MemoryView":1019
+ * 
+ *     result.from_object = (<memoryview> memviewslice.memview)._get_base()
+ *     result.typeinfo = memviewslice.memview.typeinfo             # <<<<<<<<<<<<<<
+ * 
+ *     result.view = memviewslice.memview.view
+ */
+  __pyx_t_4 = __pyx_v_memviewslice.memview->typeinfo;
+  __pyx_v_result->__pyx_base.typeinfo = __pyx_t_4;
+
+  /* "View.MemoryView":1021
+ *     result.typeinfo = memviewslice.memview.typeinfo
+ * 
+ *     result.view = memviewslice.memview.view             # <<<<<<<<<<<<<<
+ *     result.view.buf = <void *> memviewslice.data
+ *     result.view.ndim = ndim
+ */
+  __pyx_t_5 = __pyx_v_memviewslice.memview->view;
+  __pyx_v_result->__pyx_base.view = __pyx_t_5;
+
+  /* "View.MemoryView":1022
+ * 
+ *     result.view = memviewslice.memview.view
+ *     result.view.buf = <void *> memviewslice.data             # <<<<<<<<<<<<<<
+ *     result.view.ndim = ndim
+ *     (<__pyx_buffer *> &result.view).obj = Py_None
+ */
+  __pyx_v_result->__pyx_base.view.buf = ((void *)__pyx_v_memviewslice.data);
+
+  /* "View.MemoryView":1023
+ *     result.view = memviewslice.memview.view
+ *     result.view.buf = <void *> memviewslice.data
+ *     result.view.ndim = ndim             # <<<<<<<<<<<<<<
+ *     (<__pyx_buffer *> &result.view).obj = Py_None
+ *     Py_INCREF(Py_None)
+ */
+  __pyx_v_result->__pyx_base.view.ndim = __pyx_v_ndim;
+
+  /* "View.MemoryView":1024
+ *     result.view.buf = <void *> memviewslice.data
+ *     result.view.ndim = ndim
+ *     (<__pyx_buffer *> &result.view).obj = Py_None             # <<<<<<<<<<<<<<
+ *     Py_INCREF(Py_None)
+ * 
+ */
+  ((Py_buffer *)(&__pyx_v_result->__pyx_base.view))->obj = Py_None;
+
+  /* "View.MemoryView":1025
+ *     result.view.ndim = ndim
+ *     (<__pyx_buffer *> &result.view).obj = Py_None
+ *     Py_INCREF(Py_None)             # <<<<<<<<<<<<<<
+ * 
+ *     if (<memoryview>memviewslice.memview).flags & PyBUF_WRITABLE:
+ */
+  Py_INCREF(Py_None);
+
+  /* "View.MemoryView":1027
+ *     Py_INCREF(Py_None)
+ * 
+ *     if (<memoryview>memviewslice.memview).flags & PyBUF_WRITABLE:             # <<<<<<<<<<<<<<
+ *         result.flags = PyBUF_RECORDS
+ *     else:
+ */
+  __pyx_t_1 = ((((struct __pyx_memoryview_obj *)__pyx_v_memviewslice.memview)->flags & PyBUF_WRITABLE) != 0);
+  if (__pyx_t_1) {
+
+    /* "View.MemoryView":1028
+ * 
+ *     if (<memoryview>memviewslice.memview).flags & PyBUF_WRITABLE:
+ *         result.flags = PyBUF_RECORDS             # <<<<<<<<<<<<<<
+ *     else:
+ *         result.flags = PyBUF_RECORDS_RO
+ */
+    __pyx_v_result->__pyx_base.flags = PyBUF_RECORDS;
+
+    /* "View.MemoryView":1027
+ *     Py_INCREF(Py_None)
+ * 
+ *     if (<memoryview>memviewslice.memview).flags & PyBUF_WRITABLE:             # <<<<<<<<<<<<<<
+ *         result.flags = PyBUF_RECORDS
+ *     else:
+ */
+    goto __pyx_L4;
+  }
+
+  /* "View.MemoryView":1030
+ *         result.flags = PyBUF_RECORDS
+ *     else:
+ *         result.flags = PyBUF_RECORDS_RO             # <<<<<<<<<<<<<<
+ * 
+ *     result.view.shape = <Py_ssize_t *> result.from_slice.shape
+ */
+  /*else*/ {
+    __pyx_v_result->__pyx_base.flags = PyBUF_RECORDS_RO;
+  }
+  __pyx_L4:;
+
+  /* "View.MemoryView":1032
+ *         result.flags = PyBUF_RECORDS_RO
+ * 
+ *     result.view.shape = <Py_ssize_t *> result.from_slice.shape             # <<<<<<<<<<<<<<
+ *     result.view.strides = <Py_ssize_t *> result.from_slice.strides
+ * 
+ */
+  __pyx_v_result->__pyx_base.view.shape = ((Py_ssize_t *)__pyx_v_result->from_slice.shape);
+
+  /* "View.MemoryView":1033
+ * 
+ *     result.view.shape = <Py_ssize_t *> result.from_slice.shape
+ *     result.view.strides = <Py_ssize_t *> result.from_slice.strides             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  __pyx_v_result->__pyx_base.view.strides = ((Py_ssize_t *)__pyx_v_result->from_slice.strides);
+
+  /* "View.MemoryView":1036
+ * 
+ * 
+ *     result.view.suboffsets = NULL             # <<<<<<<<<<<<<<
+ *     for suboffset in result.from_slice.suboffsets[:ndim]:
+ *         if suboffset >= 0:
+ */
+  __pyx_v_result->__pyx_base.view.suboffsets = NULL;
+
+  /* "View.MemoryView":1037
+ * 
+ *     result.view.suboffsets = NULL
+ *     for suboffset in result.from_slice.suboffsets[:ndim]:             # <<<<<<<<<<<<<<
+ *         if suboffset >= 0:
+ *             result.view.suboffsets = <Py_ssize_t *> result.from_slice.suboffsets
+ */
+  __pyx_t_7 = (__pyx_v_result->from_slice.suboffsets + __pyx_v_ndim);
+  for (__pyx_t_8 = __pyx_v_result->from_slice.suboffsets; __pyx_t_8 < __pyx_t_7; __pyx_t_8++) {
+    __pyx_t_6 = __pyx_t_8;
+    __pyx_v_suboffset = (__pyx_t_6[0]);
+
+    /* "View.MemoryView":1038
+ *     result.view.suboffsets = NULL
+ *     for suboffset in result.from_slice.suboffsets[:ndim]:
+ *         if suboffset >= 0:             # <<<<<<<<<<<<<<
+ *             result.view.suboffsets = <Py_ssize_t *> result.from_slice.suboffsets
+ *             break
+ */
+    __pyx_t_1 = (__pyx_v_suboffset >= 0);
+    if (__pyx_t_1) {
+
+      /* "View.MemoryView":1039
+ *     for suboffset in result.from_slice.suboffsets[:ndim]:
+ *         if suboffset >= 0:
+ *             result.view.suboffsets = <Py_ssize_t *> result.from_slice.suboffsets             # <<<<<<<<<<<<<<
+ *             break
+ * 
+ */
+      __pyx_v_result->__pyx_base.view.suboffsets = ((Py_ssize_t *)__pyx_v_result->from_slice.suboffsets);
+
+      /* "View.MemoryView":1040
+ *         if suboffset >= 0:
+ *             result.view.suboffsets = <Py_ssize_t *> result.from_slice.suboffsets
+ *             break             # <<<<<<<<<<<<<<
+ * 
+ *     result.view.len = result.view.itemsize
+ */
+      goto __pyx_L6_break;
+
+      /* "View.MemoryView":1038
+ *     result.view.suboffsets = NULL
+ *     for suboffset in result.from_slice.suboffsets[:ndim]:
+ *         if suboffset >= 0:             # <<<<<<<<<<<<<<
+ *             result.view.suboffsets = <Py_ssize_t *> result.from_slice.suboffsets
+ *             break
+ */
+    }
+  }
+  __pyx_L6_break:;
+
+  /* "View.MemoryView":1042
+ *             break
+ * 
+ *     result.view.len = result.view.itemsize             # <<<<<<<<<<<<<<
+ *     for length in result.view.shape[:ndim]:
+ *         result.view.len *= length
+ */
+  __pyx_t_9 = __pyx_v_result->__pyx_base.view.itemsize;
+  __pyx_v_result->__pyx_base.view.len = __pyx_t_9;
+
+  /* "View.MemoryView":1043
+ * 
+ *     result.view.len = result.view.itemsize
+ *     for length in result.view.shape[:ndim]:             # <<<<<<<<<<<<<<
+ *         result.view.len *= length
+ * 
+ */
+  __pyx_t_7 = (__pyx_v_result->__pyx_base.view.shape + __pyx_v_ndim);
+  for (__pyx_t_8 = __pyx_v_result->__pyx_base.view.shape; __pyx_t_8 < __pyx_t_7; __pyx_t_8++) {
+    __pyx_t_6 = __pyx_t_8;
+    __pyx_t_2 = PyInt_FromSsize_t((__pyx_t_6[0])); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 1043, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_2);
+    __Pyx_XDECREF_SET(__pyx_v_length, __pyx_t_2);
+    __pyx_t_2 = 0;
+
+    /* "View.MemoryView":1044
+ *     result.view.len = result.view.itemsize
+ *     for length in result.view.shape[:ndim]:
+ *         result.view.len *= length             # <<<<<<<<<<<<<<
+ * 
+ *     result.to_object_func = to_object_func
+ */
+    __pyx_t_2 = PyInt_FromSsize_t(__pyx_v_result->__pyx_base.view.len); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 1044, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_2);
+    __pyx_t_3 = PyNumber_InPlaceMultiply(__pyx_t_2, __pyx_v_length); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 1044, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_3);
+    __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+    __pyx_t_9 = __Pyx_PyIndex_AsSsize_t(__pyx_t_3); if (unlikely((__pyx_t_9 == (Py_ssize_t)-1) && PyErr_Occurred())) __PYX_ERR(1, 1044, __pyx_L1_error)
+    __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+    __pyx_v_result->__pyx_base.view.len = __pyx_t_9;
+  }
+
+  /* "View.MemoryView":1046
+ *         result.view.len *= length
+ * 
+ *     result.to_object_func = to_object_func             # <<<<<<<<<<<<<<
+ *     result.to_dtype_func = to_dtype_func
+ * 
+ */
+  __pyx_v_result->to_object_func = __pyx_v_to_object_func;
+
+  /* "View.MemoryView":1047
+ * 
+ *     result.to_object_func = to_object_func
+ *     result.to_dtype_func = to_dtype_func             # <<<<<<<<<<<<<<
+ * 
+ *     return result
+ */
+  __pyx_v_result->to_dtype_func = __pyx_v_to_dtype_func;
+
+  /* "View.MemoryView":1049
+ *     result.to_dtype_func = to_dtype_func
+ * 
+ *     return result             # <<<<<<<<<<<<<<
+ * 
+ * @cname('__pyx_memoryview_get_slice_from_memoryview')
+ */
+  __Pyx_XDECREF(__pyx_r);
+  __Pyx_INCREF((PyObject *)__pyx_v_result);
+  __pyx_r = ((PyObject *)__pyx_v_result);
+  goto __pyx_L0;
+
+  /* "View.MemoryView":999
+ * 
+ * @cname('__pyx_memoryview_fromslice')
+ * cdef memoryview_fromslice(__Pyx_memviewslice memviewslice,             # <<<<<<<<<<<<<<
+ *                           int ndim,
+ *                           object (*to_object_func)(char *),
+ */
+
+  /* function exit code */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_2);
+  __Pyx_XDECREF(__pyx_t_3);
+  __Pyx_AddTraceback("View.MemoryView.memoryview_fromslice", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = 0;
+  __pyx_L0:;
+  __Pyx_XDECREF((PyObject *)__pyx_v_result);
+  __Pyx_XDECREF(__pyx_v_length);
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":1052
+ * 
+ * @cname('__pyx_memoryview_get_slice_from_memoryview')
+ * cdef __Pyx_memviewslice *get_slice_from_memview(memoryview memview,             # <<<<<<<<<<<<<<
+ *                                                    __Pyx_memviewslice *mslice) except NULL:
+ *     cdef _memoryviewslice obj
+ */
+/* Return the slice descriptor for memview: the object's own from_slice when memview is a _memoryviewslice, otherwise *mslice after filling it in from memview; NULL on error. */
+static __Pyx_memviewslice *__pyx_memoryview_get_slice_from_memoryview(struct __pyx_memoryview_obj *__pyx_v_memview, __Pyx_memviewslice *__pyx_v_mslice) {
+  struct __pyx_memoryviewslice_obj *__pyx_v_obj = 0; /* typed alias of memview; only assigned on the _memoryviewslice path */
+  __Pyx_memviewslice *__pyx_r;
+  __Pyx_RefNannyDeclarations
+  int __pyx_t_1;
+  PyObject *__pyx_t_2 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("get_slice_from_memview", 1);
+
+  /* "View.MemoryView":1055
+ *                                                    __Pyx_memviewslice *mslice) except NULL:
+ *     cdef _memoryviewslice obj
+ *     if isinstance(memview, _memoryviewslice):             # <<<<<<<<<<<<<<
+ *         obj = memview
+ *         return &obj.from_slice
+ */
+  __pyx_t_1 = __Pyx_TypeCheck(((PyObject *)__pyx_v_memview), __pyx_memoryviewslice_type); /* the isinstance() test from the .pyx line above */
+  if (__pyx_t_1) {
+
+    /* "View.MemoryView":1056
+ *     cdef _memoryviewslice obj
+ *     if isinstance(memview, _memoryviewslice):
+ *         obj = memview             # <<<<<<<<<<<<<<
+ *         return &obj.from_slice
+ *     else:
+ */
+    if (!(likely(((((PyObject *)__pyx_v_memview)) == Py_None) || likely(__Pyx_TypeTest(((PyObject *)__pyx_v_memview), __pyx_memoryviewslice_type))))) __PYX_ERR(1, 1056, __pyx_L1_error) /* cast guard for the typed assignment: None or a _memoryviewslice passes, anything else jumps to __pyx_L1_error */
+    __pyx_t_2 = ((PyObject *)__pyx_v_memview);
+    __Pyx_INCREF(__pyx_t_2); /* obj holds its own reference until the XDECREF at __pyx_L0 */
+    __pyx_v_obj = ((struct __pyx_memoryviewslice_obj *)__pyx_t_2);
+    __pyx_t_2 = 0;
+
+    /* "View.MemoryView":1057
+ *     if isinstance(memview, _memoryviewslice):
+ *         obj = memview
+ *         return &obj.from_slice             # <<<<<<<<<<<<<<
+ *     else:
+ *         slice_copy(memview, mslice)
+ */
+    __pyx_r = (&__pyx_v_obj->from_slice); /* address of a field inside the object itself; nothing is copied and mslice is not written on this path */
+    goto __pyx_L0;
+
+    /* "View.MemoryView":1055
+ *                                                    __Pyx_memviewslice *mslice) except NULL:
+ *     cdef _memoryviewslice obj
+ *     if isinstance(memview, _memoryviewslice):             # <<<<<<<<<<<<<<
+ *         obj = memview
+ *         return &obj.from_slice
+ */
+  }
+
+  /* "View.MemoryView":1059
+ *         return &obj.from_slice
+ *     else:
+ *         slice_copy(memview, mslice)             # <<<<<<<<<<<<<<
+ *         return mslice
+ * 
+ */
+  /*else*/ {
+    __pyx_memoryview_slice_copy(__pyx_v_memview, __pyx_v_mslice); /* populate the caller-supplied *mslice from memview */
+
+    /* "View.MemoryView":1060
+ *     else:
+ *         slice_copy(memview, mslice)
+ *         return mslice             # <<<<<<<<<<<<<<
+ * 
+ * @cname('__pyx_memoryview_slice_copy')
+ */
+    __pyx_r = __pyx_v_mslice;
+    goto __pyx_L0;
+  }
+
+  /* "View.MemoryView":1052
+ * 
+ * @cname('__pyx_memoryview_get_slice_from_memoryview')
+ * cdef __Pyx_memviewslice *get_slice_from_memview(memoryview memview,             # <<<<<<<<<<<<<<
+ *                                                    __Pyx_memviewslice *mslice) except NULL:
+ *     cdef _memoryviewslice obj
+ */
+
+  /* function exit code: __pyx_L1_error records a traceback entry and sets the NULL result, then falls through to the shared cleanup at __pyx_L0 */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_2);
+  __Pyx_AddTraceback("View.MemoryView.get_slice_from_memview", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL; /* NULL is the error value declared by "except NULL" in the .pyx signature */
+  __pyx_L0:;
+  __Pyx_XDECREF((PyObject *)__pyx_v_obj); /* release obj's reference; obj is still 0 when the else-branch ran */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":1063
+ * 
+ * @cname('__pyx_memoryview_slice_copy')
+ * cdef void slice_copy(memoryview memview, __Pyx_memviewslice *dst) noexcept:             # <<<<<<<<<<<<<<
+ *     cdef int dim
+ *     cdef (Py_ssize_t*) shape, strides, suboffsets
+ */
+/* Fill *dst from memview->view: the memview and buf pointers plus, for each of view.ndim dimensions, shape, strides and suboffsets (-1 when the view has no suboffsets array). */
+static void __pyx_memoryview_slice_copy(struct __pyx_memoryview_obj *__pyx_v_memview, __Pyx_memviewslice *__pyx_v_dst) {
+  int __pyx_v_dim;
+  Py_ssize_t *__pyx_v_shape;
+  Py_ssize_t *__pyx_v_strides;
+  Py_ssize_t *__pyx_v_suboffsets;
+  Py_ssize_t *__pyx_t_1;
+  int __pyx_t_2;
+  int __pyx_t_3;
+  int __pyx_t_4;
+  Py_ssize_t __pyx_t_5;
+  int __pyx_t_6;
+
+  /* "View.MemoryView":1067
+ *     cdef (Py_ssize_t*) shape, strides, suboffsets
+ * 
+ *     shape = memview.view.shape             # <<<<<<<<<<<<<<
+ *     strides = memview.view.strides
+ *     suboffsets = memview.view.suboffsets
+ */
+  __pyx_t_1 = __pyx_v_memview->view.shape;
+  __pyx_v_shape = __pyx_t_1;
+
+  /* "View.MemoryView":1068
+ * 
+ *     shape = memview.view.shape
+ *     strides = memview.view.strides             # <<<<<<<<<<<<<<
+ *     suboffsets = memview.view.suboffsets
+ * 
+ */
+  __pyx_t_1 = __pyx_v_memview->view.strides;
+  __pyx_v_strides = __pyx_t_1;
+
+  /* "View.MemoryView":1069
+ *     shape = memview.view.shape
+ *     strides = memview.view.strides
+ *     suboffsets = memview.view.suboffsets             # <<<<<<<<<<<<<<
+ * 
+ *     dst.memview = <__pyx_memoryview *> memview
+ */
+  __pyx_t_1 = __pyx_v_memview->view.suboffsets;
+  __pyx_v_suboffsets = __pyx_t_1; /* may be NULL; checked per dimension in the loop below */
+
+  /* "View.MemoryView":1071
+ *     suboffsets = memview.view.suboffsets
+ * 
+ *     dst.memview = <__pyx_memoryview *> memview             # <<<<<<<<<<<<<<
+ *     dst.data = <char *> memview.view.buf
+ * 
+ */
+  __pyx_v_dst->memview = ((struct __pyx_memoryview_obj *)__pyx_v_memview); /* pointer is stored as-is; this function performs no INCREF */
+
+  /* "View.MemoryView":1072
+ * 
+ *     dst.memview = <__pyx_memoryview *> memview
+ *     dst.data = <char *> memview.view.buf             # <<<<<<<<<<<<<<
+ * 
+ *     for dim in range(memview.view.ndim):
+ */
+  __pyx_v_dst->data = ((char *)__pyx_v_memview->view.buf);
+
+  /* "View.MemoryView":1074
+ *     dst.data = <char *> memview.view.buf
+ * 
+ *     for dim in range(memview.view.ndim):             # <<<<<<<<<<<<<<
+ *         dst.shape[dim] = shape[dim]
+ *         dst.strides[dim] = strides[dim]
+ */
+  __pyx_t_2 = __pyx_v_memview->view.ndim; /* ndim is read once; the loop runs dim = 0 .. ndim-1 */
+  __pyx_t_3 = __pyx_t_2;
+  for (__pyx_t_4 = 0; __pyx_t_4 < __pyx_t_3; __pyx_t_4+=1) {
+    __pyx_v_dim = __pyx_t_4;
+
+    /* "View.MemoryView":1075
+ * 
+ *     for dim in range(memview.view.ndim):
+ *         dst.shape[dim] = shape[dim]             # <<<<<<<<<<<<<<
+ *         dst.strides[dim] = strides[dim]
+ *         dst.suboffsets[dim] = suboffsets[dim] if suboffsets else -1
+ */
+    (__pyx_v_dst->shape[__pyx_v_dim]) = (__pyx_v_shape[__pyx_v_dim]);
+
+    /* "View.MemoryView":1076
+ *     for dim in range(memview.view.ndim):
+ *         dst.shape[dim] = shape[dim]
+ *         dst.strides[dim] = strides[dim]             # <<<<<<<<<<<<<<
+ *         dst.suboffsets[dim] = suboffsets[dim] if suboffsets else -1
+ * 
+ */
+    (__pyx_v_dst->strides[__pyx_v_dim]) = (__pyx_v_strides[__pyx_v_dim]);
+
+    /* "View.MemoryView":1077
+ *         dst.shape[dim] = shape[dim]
+ *         dst.strides[dim] = strides[dim]
+ *         dst.suboffsets[dim] = suboffsets[dim] if suboffsets else -1             # <<<<<<<<<<<<<<
+ * 
+ * @cname('__pyx_memoryview_copy_object')
+ */
+    __pyx_t_6 = (__pyx_v_suboffsets != 0); /* NULL suboffsets array => record -1 for this dimension */
+    if (__pyx_t_6) {
+      __pyx_t_5 = (__pyx_v_suboffsets[__pyx_v_dim]);
+    } else {
+      __pyx_t_5 = -1L;
+    }
+    (__pyx_v_dst->suboffsets[__pyx_v_dim]) = __pyx_t_5;
+  }
+
+  /* "View.MemoryView":1063
+ * 
+ * @cname('__pyx_memoryview_slice_copy')
+ * cdef void slice_copy(memoryview memview, __Pyx_memviewslice *dst) noexcept:             # <<<<<<<<<<<<<<
+ *     cdef int dim
+ *     cdef (Py_ssize_t*) shape, strides, suboffsets
+ */
+
+  /* function exit code: nothing to release, no Python object temporaries are created in this function */
+}
+
+/* "View.MemoryView":1080
+ * 
+ * @cname('__pyx_memoryview_copy_object')
+ * cdef memoryview_copy(memoryview memview):             # <<<<<<<<<<<<<<
+ *     "Create a new memoryview object"
+ *     cdef __Pyx_memviewslice memviewslice
+ */
+/* Snapshot memview's layout into a stack-local slice descriptor and pass it to __pyx_memoryview_copy_object_from_slice; returns that call's result, or 0 (with a traceback entry added) if it fails. */
+static PyObject *__pyx_memoryview_copy_object(struct __pyx_memoryview_obj *__pyx_v_memview) {
+  __Pyx_memviewslice __pyx_v_memviewslice; /* scratch descriptor; filled by the slice_copy call below for the first view.ndim dimensions */
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("memoryview_copy", 1);
+
+  /* "View.MemoryView":1083
+ *     "Create a new memoryview object"
+ *     cdef __Pyx_memviewslice memviewslice
+ *     slice_copy(memview, &memviewslice)             # <<<<<<<<<<<<<<
+ *     return memoryview_copy_from_slice(memview, &memviewslice)
+ * 
+ */
+  __pyx_memoryview_slice_copy(__pyx_v_memview, (&__pyx_v_memviewslice)); /* returns void, so there is no error check here */
+
+  /* "View.MemoryView":1084
+ *     cdef __Pyx_memviewslice memviewslice
+ *     slice_copy(memview, &memviewslice)
+ *     return memoryview_copy_from_slice(memview, &memviewslice)             # <<<<<<<<<<<<<<
+ * 
+ * @cname('__pyx_memoryview_copy_object_from_slice')
+ */
+  __Pyx_XDECREF(__pyx_r);
+  __pyx_t_1 = __pyx_memoryview_copy_object_from_slice(__pyx_v_memview, (&__pyx_v_memviewslice)); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 1084, __pyx_L1_error) /* a NULL result is treated as failure */
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_r = __pyx_t_1; /* ownership of the temporary moves to the return value */
+  __pyx_t_1 = 0;
+  goto __pyx_L0;
+
+  /* "View.MemoryView":1080
+ * 
+ * @cname('__pyx_memoryview_copy_object')
+ * cdef memoryview_copy(memoryview memview):             # <<<<<<<<<<<<<<
+ *     "Create a new memoryview object"
+ *     cdef __Pyx_memviewslice memviewslice
+ */
+
+  /* function exit code: the error label adds a traceback entry and sets the result to 0 before the shared exit at __pyx_L0 */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_AddTraceback("View.MemoryView.memoryview_copy", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = 0;
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":1087
+ * 
+ * @cname('__pyx_memoryview_copy_object_from_slice')
+ * cdef memoryview_copy_from_slice(memoryview memview, __Pyx_memviewslice *memviewslice):             # <<<<<<<<<<<<<<
+ *     """
+ *     Create a new memoryview object from a given memoryview object and slice.
+ */
+/* Call __pyx_memoryview_fromslice on *memviewslice using memview's view.ndim and dtype_is_object, plus memview's to_object_func/to_dtype_func when memview is a _memoryviewslice (NULL otherwise); returns the result, or 0 on failure. */
+static PyObject *__pyx_memoryview_copy_object_from_slice(struct __pyx_memoryview_obj *__pyx_v_memview, __Pyx_memviewslice *__pyx_v_memviewslice) {
+  PyObject *(*__pyx_v_to_object_func)(char *);
+  int (*__pyx_v_to_dtype_func)(char *, PyObject *);
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  int __pyx_t_1;
+  PyObject *(*__pyx_t_2)(char *);
+  int (*__pyx_t_3)(char *, PyObject *);
+  PyObject *__pyx_t_4 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("memoryview_copy_from_slice", 1);
+
+  /* "View.MemoryView":1094
+ *     cdef int (*to_dtype_func)(char *, object) except 0
+ * 
+ *     if isinstance(memview, _memoryviewslice):             # <<<<<<<<<<<<<<
+ *         to_object_func = (<_memoryviewslice> memview).to_object_func
+ *         to_dtype_func = (<_memoryviewslice> memview).to_dtype_func
+ */
+  __pyx_t_1 = __Pyx_TypeCheck(((PyObject *)__pyx_v_memview), __pyx_memoryviewslice_type); /* only the _memoryviewslice struct has the two callback fields read below */
+  if (__pyx_t_1) {
+
+    /* "View.MemoryView":1095
+ * 
+ *     if isinstance(memview, _memoryviewslice):
+ *         to_object_func = (<_memoryviewslice> memview).to_object_func             # <<<<<<<<<<<<<<
+ *         to_dtype_func = (<_memoryviewslice> memview).to_dtype_func
+ *     else:
+ */
+    __pyx_t_2 = ((struct __pyx_memoryviewslice_obj *)__pyx_v_memview)->to_object_func;
+    __pyx_v_to_object_func = __pyx_t_2;
+
+    /* "View.MemoryView":1096
+ *     if isinstance(memview, _memoryviewslice):
+ *         to_object_func = (<_memoryviewslice> memview).to_object_func
+ *         to_dtype_func = (<_memoryviewslice> memview).to_dtype_func             # <<<<<<<<<<<<<<
+ *     else:
+ *         to_object_func = NULL
+ */
+    __pyx_t_3 = ((struct __pyx_memoryviewslice_obj *)__pyx_v_memview)->to_dtype_func;
+    __pyx_v_to_dtype_func = __pyx_t_3;
+
+    /* "View.MemoryView":1094
+ *     cdef int (*to_dtype_func)(char *, object) except 0
+ * 
+ *     if isinstance(memview, _memoryviewslice):             # <<<<<<<<<<<<<<
+ *         to_object_func = (<_memoryviewslice> memview).to_object_func
+ *         to_dtype_func = (<_memoryviewslice> memview).to_dtype_func
+ */
+    goto __pyx_L3;
+  }
+
+  /* "View.MemoryView":1098
+ *         to_dtype_func = (<_memoryviewslice> memview).to_dtype_func
+ *     else:
+ *         to_object_func = NULL             # <<<<<<<<<<<<<<
+ *         to_dtype_func = NULL
+ * 
+ */
+  /*else*/ {
+    __pyx_v_to_object_func = NULL; /* not a _memoryviewslice: no callbacks to inherit */
+
+    /* "View.MemoryView":1099
+ *     else:
+ *         to_object_func = NULL
+ *         to_dtype_func = NULL             # <<<<<<<<<<<<<<
+ * 
+ *     return memoryview_fromslice(memviewslice[0], memview.view.ndim,
+ */
+    __pyx_v_to_dtype_func = NULL;
+  }
+  __pyx_L3:;
+
+  /* "View.MemoryView":1101
+ *         to_dtype_func = NULL
+ * 
+ *     return memoryview_fromslice(memviewslice[0], memview.view.ndim,             # <<<<<<<<<<<<<<
+ *                                 to_object_func, to_dtype_func,
+ *                                 memview.dtype_is_object)
+ */
+  __Pyx_XDECREF(__pyx_r);
+
+  /* "View.MemoryView":1103
+ *     return memoryview_fromslice(memviewslice[0], memview.view.ndim,
+ *                                 to_object_func, to_dtype_func,
+ *                                 memview.dtype_is_object)             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  __pyx_t_4 = __pyx_memoryview_fromslice((__pyx_v_memviewslice[0]), __pyx_v_memview->view.ndim, __pyx_v_to_object_func, __pyx_v_to_dtype_func, __pyx_v_memview->dtype_is_object); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 1101, __pyx_L1_error) /* the slice struct is passed by value; a failure is reported against .pyx line 1101, where the call expression starts */
+  __Pyx_GOTREF(__pyx_t_4);
+  __pyx_r = __pyx_t_4;
+  __pyx_t_4 = 0;
+  goto __pyx_L0;
+
+  /* "View.MemoryView":1087
+ * 
+ * @cname('__pyx_memoryview_copy_object_from_slice')
+ * cdef memoryview_copy_from_slice(memoryview memview, __Pyx_memviewslice *memviewslice):             # <<<<<<<<<<<<<<
+ *     """
+ *     Create a new memoryview object from a given memoryview object and slice.
+ */
+
+  /* function exit code: the error label adds a traceback entry and sets the result to 0 before the shared exit at __pyx_L0 */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_4);
+  __Pyx_AddTraceback("View.MemoryView.memoryview_copy_from_slice", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = 0;
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":1109
+ * 
+ * 
+ * cdef Py_ssize_t abs_py_ssize_t(Py_ssize_t arg) noexcept nogil:             # <<<<<<<<<<<<<<
+ *     return -arg if arg < 0 else arg
+ * 
+ */
+/* Return the absolute value of arg. NOTE(review): negating the most negative Py_ssize_t is signed overflow in C; presumably callers never pass that value - confirm. */
+static Py_ssize_t abs_py_ssize_t(Py_ssize_t __pyx_v_arg) {
+  Py_ssize_t __pyx_r;
+  Py_ssize_t __pyx_t_1;
+  int __pyx_t_2;
+
+  /* "View.MemoryView":1110
+ * 
+ * cdef Py_ssize_t abs_py_ssize_t(Py_ssize_t arg) noexcept nogil:
+ *     return -arg if arg < 0 else arg             # <<<<<<<<<<<<<<
+ * 
+ * @cname('__pyx_get_best_slice_order')
+ */
+  __pyx_t_2 = (__pyx_v_arg < 0);
+  if (__pyx_t_2) {
+    __pyx_t_1 = (-__pyx_v_arg); /* negative input: flip the sign */
+  } else {
+    __pyx_t_1 = __pyx_v_arg; /* zero or positive input is returned unchanged */
+  }
+  __pyx_r = __pyx_t_1;
+  goto __pyx_L0;
+
+  /* "View.MemoryView":1109
+ * 
+ * 
+ * cdef Py_ssize_t abs_py_ssize_t(Py_ssize_t arg) noexcept nogil:             # <<<<<<<<<<<<<<
+ *     return -arg if arg < 0 else arg
+ * 
+ */
+
+  /* function exit code: single exit label, no cleanup needed */
+  __pyx_L0:;
+  return __pyx_r;
+}
+
+/* "View.MemoryView":1113
+ * 
+ * @cname('__pyx_get_best_slice_order')
+ * cdef char get_best_order(__Pyx_memviewslice *mslice, int ndim) noexcept nogil:             # <<<<<<<<<<<<<<
+ *     """
+ *     Figure out the best memory access order for a given slice.
+ */
+/* Return 'C' or 'F' for mslice: compare |stride| of the last dimension whose extent is > 1 with |stride| of the first such dimension; 'C' when the former is <= the latter (ties, and the case where no extent exceeds 1, give 'C'). */
+static char __pyx_get_best_slice_order(__Pyx_memviewslice *__pyx_v_mslice, int __pyx_v_ndim) {
+  int __pyx_v_i;
+  Py_ssize_t __pyx_v_c_stride;
+  Py_ssize_t __pyx_v_f_stride;
+  char __pyx_r;
+  int __pyx_t_1;
+  int __pyx_t_2;
+  int __pyx_t_3;
+  int __pyx_t_4;
+
+  /* "View.MemoryView":1118
+ *     """
+ *     cdef int i
+ *     cdef Py_ssize_t c_stride = 0             # <<<<<<<<<<<<<<
+ *     cdef Py_ssize_t f_stride = 0
+ * 
+ */
+  __pyx_v_c_stride = 0; /* stays 0 if no dimension has extent > 1 */
+
+  /* "View.MemoryView":1119
+ *     cdef int i
+ *     cdef Py_ssize_t c_stride = 0
+ *     cdef Py_ssize_t f_stride = 0             # <<<<<<<<<<<<<<
+ * 
+ *     for i in range(ndim - 1, -1, -1):
+ */
+  __pyx_v_f_stride = 0; /* stays 0 if no dimension has extent > 1 */
+
+  /* "View.MemoryView":1121
+ *     cdef Py_ssize_t f_stride = 0
+ * 
+ *     for i in range(ndim - 1, -1, -1):             # <<<<<<<<<<<<<<
+ *         if mslice.shape[i] > 1:
+ *             c_stride = mslice.strides[i]
+ */
+  for (__pyx_t_1 = (__pyx_v_ndim - 1); __pyx_t_1 > -1; __pyx_t_1-=1) { /* scan from the last dimension backwards */
+    __pyx_v_i = __pyx_t_1;
+
+    /* "View.MemoryView":1122
+ * 
+ *     for i in range(ndim - 1, -1, -1):
+ *         if mslice.shape[i] > 1:             # <<<<<<<<<<<<<<
+ *             c_stride = mslice.strides[i]
+ *             break
+ */
+    __pyx_t_2 = ((__pyx_v_mslice->shape[__pyx_v_i]) > 1); /* dimensions of extent <= 1 are skipped */
+    if (__pyx_t_2) {
+
+      /* "View.MemoryView":1123
+ *     for i in range(ndim - 1, -1, -1):
+ *         if mslice.shape[i] > 1:
+ *             c_stride = mslice.strides[i]             # <<<<<<<<<<<<<<
+ *             break
+ * 
+ */
+      __pyx_v_c_stride = (__pyx_v_mslice->strides[__pyx_v_i]); /* stride of the last dimension with extent > 1 */
+
+      /* "View.MemoryView":1124
+ *         if mslice.shape[i] > 1:
+ *             c_stride = mslice.strides[i]
+ *             break             # <<<<<<<<<<<<<<
+ * 
+ *     for i in range(ndim):
+ */
+      goto __pyx_L4_break;
+
+      /* "View.MemoryView":1122
+ * 
+ *     for i in range(ndim - 1, -1, -1):
+ *         if mslice.shape[i] > 1:             # <<<<<<<<<<<<<<
+ *             c_stride = mslice.strides[i]
+ *             break
+ */
+    }
+  }
+  __pyx_L4_break:;
+
+  /* "View.MemoryView":1126
+ *             break
+ * 
+ *     for i in range(ndim):             # <<<<<<<<<<<<<<
+ *         if mslice.shape[i] > 1:
+ *             f_stride = mslice.strides[i]
+ */
+  __pyx_t_1 = __pyx_v_ndim;
+  __pyx_t_3 = __pyx_t_1;
+  for (__pyx_t_4 = 0; __pyx_t_4 < __pyx_t_3; __pyx_t_4+=1) { /* scan from the first dimension forwards */
+    __pyx_v_i = __pyx_t_4;
+
+    /* "View.MemoryView":1127
+ * 
+ *     for i in range(ndim):
+ *         if mslice.shape[i] > 1:             # <<<<<<<<<<<<<<
+ *             f_stride = mslice.strides[i]
+ *             break
+ */
+    __pyx_t_2 = ((__pyx_v_mslice->shape[__pyx_v_i]) > 1);
+    if (__pyx_t_2) {
+
+      /* "View.MemoryView":1128
+ *     for i in range(ndim):
+ *         if mslice.shape[i] > 1:
+ *             f_stride = mslice.strides[i]             # <<<<<<<<<<<<<<
+ *             break
+ * 
+ */
+      __pyx_v_f_stride = (__pyx_v_mslice->strides[__pyx_v_i]); /* stride of the first dimension with extent > 1 */
+
+      /* "View.MemoryView":1129
+ *         if mslice.shape[i] > 1:
+ *             f_stride = mslice.strides[i]
+ *             break             # <<<<<<<<<<<<<<
+ * 
+ *     if abs_py_ssize_t(c_stride) <= abs_py_ssize_t(f_stride):
+ */
+      goto __pyx_L7_break;
+
+      /* "View.MemoryView":1127
+ * 
+ *     for i in range(ndim):
+ *         if mslice.shape[i] > 1:             # <<<<<<<<<<<<<<
+ *             f_stride = mslice.strides[i]
+ *             break
+ */
+    }
+  }
+  __pyx_L7_break:;
+
+  /* "View.MemoryView":1131
+ *             break
+ * 
+ *     if abs_py_ssize_t(c_stride) <= abs_py_ssize_t(f_stride):             # <<<<<<<<<<<<<<
+ *         return 'C'
+ *     else:
+ */
+  __pyx_t_2 = (abs_py_ssize_t(__pyx_v_c_stride) <= abs_py_ssize_t(__pyx_v_f_stride)); /* magnitudes are compared, so negative strides are handled; equality selects 'C' */
+  if (__pyx_t_2) {
+
+    /* "View.MemoryView":1132
+ * 
+ *     if abs_py_ssize_t(c_stride) <= abs_py_ssize_t(f_stride):
+ *         return 'C'             # <<<<<<<<<<<<<<
+ *     else:
+ *         return 'F'
+ */
+    __pyx_r = 'C';
+    goto __pyx_L0;
+
+    /* "View.MemoryView":1131
+ *             break
+ * 
+ *     if abs_py_ssize_t(c_stride) <= abs_py_ssize_t(f_stride):             # <<<<<<<<<<<<<<
+ *         return 'C'
+ *     else:
+ */
+  }
+
+  /* "View.MemoryView":1134
+ *         return 'C'
+ *     else:
+ *         return 'F'             # <<<<<<<<<<<<<<
+ * 
+ * @cython.cdivision(True)
+ */
+  /*else*/ {
+    __pyx_r = 'F';
+    goto __pyx_L0;
+  }
+
+  /* "View.MemoryView":1113
+ * 
+ * @cname('__pyx_get_best_slice_order')
+ * cdef char get_best_order(__Pyx_memviewslice *mslice, int ndim) noexcept nogil:             # <<<<<<<<<<<<<<
+ *     """
+ *     Figure out the best memory access order for a given slice.
+ */
+
+  /* function exit code: single exit label, no cleanup needed */
+  __pyx_L0:;
+  return __pyx_r;
+}
+
+/* "View.MemoryView":1137
+ * 
+ * @cython.cdivision(True)
+ * cdef void _copy_strided_to_strided(char *src_data, Py_ssize_t *src_strides,             # <<<<<<<<<<<<<<
+ *                                    char *dst_data, Py_ssize_t *dst_strides,
+ *                                    Py_ssize_t *src_shape, Py_ssize_t *dst_shape,
+ */
+
+static void _copy_strided_to_strided(char *__pyx_v_src_data, Py_ssize_t *__pyx_v_src_strides, char *__pyx_v_dst_data, Py_ssize_t *__pyx_v_dst_strides, Py_ssize_t *__pyx_v_src_shape, Py_ssize_t *__pyx_v_dst_shape, int __pyx_v_ndim, size_t __pyx_v_itemsize) {
+  CYTHON_UNUSED Py_ssize_t __pyx_v_i;
+  CYTHON_UNUSED Py_ssize_t __pyx_v_src_extent;
+  Py_ssize_t __pyx_v_dst_extent;
+  Py_ssize_t __pyx_v_src_stride;
+  Py_ssize_t __pyx_v_dst_stride;
+  int __pyx_t_1;
+  int __pyx_t_2;
+  Py_ssize_t __pyx_t_3;
+  Py_ssize_t __pyx_t_4;
+  Py_ssize_t __pyx_t_5;
+
+  /* "View.MemoryView":1144
+ * 
+ *     cdef Py_ssize_t i
+ *     cdef Py_ssize_t src_extent = src_shape[0]             # <<<<<<<<<<<<<<
+ *     cdef Py_ssize_t dst_extent = dst_shape[0]
+ *     cdef Py_ssize_t src_stride = src_strides[0]
+ */
+  __pyx_v_src_extent = (__pyx_v_src_shape[0]);
+
+  /* "View.MemoryView":1145
+ *     cdef Py_ssize_t i
+ *     cdef Py_ssize_t src_extent = src_shape[0]
+ *     cdef Py_ssize_t dst_extent = dst_shape[0]             # <<<<<<<<<<<<<<
+ *     cdef Py_ssize_t src_stride = src_strides[0]
+ *     cdef Py_ssize_t dst_stride = dst_strides[0]
+ */
+  __pyx_v_dst_extent = (__pyx_v_dst_shape[0]);
+
+  /* "View.MemoryView":1146
+ *     cdef Py_ssize_t src_extent = src_shape[0]
+ *     cdef Py_ssize_t dst_extent = dst_shape[0]
+ *     cdef Py_ssize_t src_stride = src_strides[0]             # <<<<<<<<<<<<<<
+ *     cdef Py_ssize_t dst_stride = dst_strides[0]
+ * 
+ */
+  __pyx_v_src_stride = (__pyx_v_src_strides[0]);
+
+  /* "View.MemoryView":1147
+ *     cdef Py_ssize_t dst_extent = dst_shape[0]
+ *     cdef Py_ssize_t src_stride = src_strides[0]
+ *     cdef Py_ssize_t dst_stride = dst_strides[0]             # <<<<<<<<<<<<<<
+ * 
+ *     if ndim == 1:
+ */
+  __pyx_v_dst_stride = (__pyx_v_dst_strides[0]);
+
+  /* "View.MemoryView":1149
+ *     cdef Py_ssize_t dst_stride = dst_strides[0]
+ * 
+ *     if ndim == 1:             # <<<<<<<<<<<<<<
+ *         if (src_stride > 0 and dst_stride > 0 and
+ *             <size_t> src_stride == itemsize == <size_t> dst_stride):
+ */
+  __pyx_t_1 = (__pyx_v_ndim == 1);
+  if (__pyx_t_1) {
+
+    /* "View.MemoryView":1150
+ * 
+ *     if ndim == 1:
+ *         if (src_stride > 0 and dst_stride > 0 and             # <<<<<<<<<<<<<<
+ *             <size_t> src_stride == itemsize == <size_t> dst_stride):
+ *             memcpy(dst_data, src_data, itemsize * dst_extent)
+ */
+    __pyx_t_2 = (__pyx_v_src_stride > 0);
+    if (__pyx_t_2) {
+    } else {
+      __pyx_t_1 = __pyx_t_2;
+      goto __pyx_L5_bool_binop_done;
+    }
+    __pyx_t_2 = (__pyx_v_dst_stride > 0);
+    if (__pyx_t_2) {
+    } else {
+      __pyx_t_1 = __pyx_t_2;
+      goto __pyx_L5_bool_binop_done;
+    }
+
+    /* "View.MemoryView":1151
+ *     if ndim == 1:
+ *         if (src_stride > 0 and dst_stride > 0 and
+ *             <size_t> src_stride == itemsize == <size_t> dst_stride):             # <<<<<<<<<<<<<<
+ *             memcpy(dst_data, src_data, itemsize * dst_extent)
+ *         else:
+ */
+    __pyx_t_2 = (((size_t)__pyx_v_src_stride) == __pyx_v_itemsize);
+    if (__pyx_t_2) {
+      __pyx_t_2 = (__pyx_v_itemsize == ((size_t)__pyx_v_dst_stride));
+    }
+    __pyx_t_1 = __pyx_t_2;
+    __pyx_L5_bool_binop_done:;
+
+    /* "View.MemoryView":1150
+ * 
+ *     if ndim == 1:
+ *         if (src_stride > 0 and dst_stride > 0 and             # <<<<<<<<<<<<<<
+ *             <size_t> src_stride == itemsize == <size_t> dst_stride):
+ *             memcpy(dst_data, src_data, itemsize * dst_extent)
+ */
+    if (__pyx_t_1) {
+
+      /* "View.MemoryView":1152
+ *         if (src_stride > 0 and dst_stride > 0 and
+ *             <size_t> src_stride == itemsize == <size_t> dst_stride):
+ *             memcpy(dst_data, src_data, itemsize * dst_extent)             # <<<<<<<<<<<<<<
+ *         else:
+ *             for i in range(dst_extent):
+ */
+      (void)(memcpy(__pyx_v_dst_data, __pyx_v_src_data, (__pyx_v_itemsize * __pyx_v_dst_extent)));
+
+      /* "View.MemoryView":1150
+ * 
+ *     if ndim == 1:
+ *         if (src_stride > 0 and dst_stride > 0 and             # <<<<<<<<<<<<<<
+ *             <size_t> src_stride == itemsize == <size_t> dst_stride):
+ *             memcpy(dst_data, src_data, itemsize * dst_extent)
+ */
+      goto __pyx_L4;
+    }
+
+    /* "View.MemoryView":1154
+ *             memcpy(dst_data, src_data, itemsize * dst_extent)
+ *         else:
+ *             for i in range(dst_extent):             # <<<<<<<<<<<<<<
+ *                 memcpy(dst_data, src_data, itemsize)
+ *                 src_data += src_stride
+ */
+    /*else*/ {
+      __pyx_t_3 = __pyx_v_dst_extent;
+      __pyx_t_4 = __pyx_t_3;
+      for (__pyx_t_5 = 0; __pyx_t_5 < __pyx_t_4; __pyx_t_5+=1) {
+        __pyx_v_i = __pyx_t_5;
+
+        /* "View.MemoryView":1155
+ *         else:
+ *             for i in range(dst_extent):
+ *                 memcpy(dst_data, src_data, itemsize)             # <<<<<<<<<<<<<<
+ *                 src_data += src_stride
+ *                 dst_data += dst_stride
+ */
+        (void)(memcpy(__pyx_v_dst_data, __pyx_v_src_data, __pyx_v_itemsize));
+
+        /* "View.MemoryView":1156
+ *             for i in range(dst_extent):
+ *                 memcpy(dst_data, src_data, itemsize)
+ *                 src_data += src_stride             # <<<<<<<<<<<<<<
+ *                 dst_data += dst_stride
+ *     else:
+ */
+        __pyx_v_src_data = (__pyx_v_src_data + __pyx_v_src_stride);
+
+        /* "View.MemoryView":1157
+ *                 memcpy(dst_data, src_data, itemsize)
+ *                 src_data += src_stride
+ *                 dst_data += dst_stride             # <<<<<<<<<<<<<<
+ *     else:
+ *         for i in range(dst_extent):
+ */
+        __pyx_v_dst_data = (__pyx_v_dst_data + __pyx_v_dst_stride);
+      }
+    }
+    __pyx_L4:;
+
+    /* "View.MemoryView":1149
+ *     cdef Py_ssize_t dst_stride = dst_strides[0]
+ * 
+ *     if ndim == 1:             # <<<<<<<<<<<<<<
+ *         if (src_stride > 0 and dst_stride > 0 and
+ *             <size_t> src_stride == itemsize == <size_t> dst_stride):
+ */
+    goto __pyx_L3;
+  }
+
+  /* "View.MemoryView":1159
+ *                 dst_data += dst_stride
+ *     else:
+ *         for i in range(dst_extent):             # <<<<<<<<<<<<<<
+ *             _copy_strided_to_strided(src_data, src_strides + 1,
+ *                                      dst_data, dst_strides + 1,
+ */
+  /*else*/ {
+    __pyx_t_3 = __pyx_v_dst_extent;
+    __pyx_t_4 = __pyx_t_3;
+    for (__pyx_t_5 = 0; __pyx_t_5 < __pyx_t_4; __pyx_t_5+=1) {
+      __pyx_v_i = __pyx_t_5;
+
+      /* "View.MemoryView":1160
+ *     else:
+ *         for i in range(dst_extent):
+ *             _copy_strided_to_strided(src_data, src_strides + 1,             # <<<<<<<<<<<<<<
+ *                                      dst_data, dst_strides + 1,
+ *                                      src_shape + 1, dst_shape + 1,
+ */
+      _copy_strided_to_strided(__pyx_v_src_data, (__pyx_v_src_strides + 1), __pyx_v_dst_data, (__pyx_v_dst_strides + 1), (__pyx_v_src_shape + 1), (__pyx_v_dst_shape + 1), (__pyx_v_ndim - 1), __pyx_v_itemsize);
+
+      /* "View.MemoryView":1164
+ *                                      src_shape + 1, dst_shape + 1,
+ *                                      ndim - 1, itemsize)
+ *             src_data += src_stride             # <<<<<<<<<<<<<<
+ *             dst_data += dst_stride
+ * 
+ */
+      __pyx_v_src_data = (__pyx_v_src_data + __pyx_v_src_stride);
+
+      /* "View.MemoryView":1165
+ *                                      ndim - 1, itemsize)
+ *             src_data += src_stride
+ *             dst_data += dst_stride             # <<<<<<<<<<<<<<
+ * 
+ * cdef void copy_strided_to_strided(__Pyx_memviewslice *src,
+ */
+      __pyx_v_dst_data = (__pyx_v_dst_data + __pyx_v_dst_stride);
+    }
+  }
+  __pyx_L3:;
+
+  /* "View.MemoryView":1137
+ * 
+ * @cython.cdivision(True)
+ * cdef void _copy_strided_to_strided(char *src_data, Py_ssize_t *src_strides,             # <<<<<<<<<<<<<<
+ *                                    char *dst_data, Py_ssize_t *dst_strides,
+ *                                    Py_ssize_t *src_shape, Py_ssize_t *dst_shape,
+ */
+
+  /* function exit code */
+}
+
+/* "View.MemoryView":1167
+ *             dst_data += dst_stride
+ * 
+ * cdef void copy_strided_to_strided(__Pyx_memviewslice *src,             # <<<<<<<<<<<<<<
+ *                                   __Pyx_memviewslice *dst,
+ *                                   int ndim, size_t itemsize) noexcept nogil:
+ */
+
+static void copy_strided_to_strided(__Pyx_memviewslice *__pyx_v_src, __Pyx_memviewslice *__pyx_v_dst, int __pyx_v_ndim, size_t __pyx_v_itemsize) { /* Cython-generated wrapper: copies src into dst by delegating to _copy_strided_to_strided; returns nothing, so no status reaches the caller. */
+
+  /* "View.MemoryView":1170
+ *                                   __Pyx_memviewslice *dst,
+ *                                   int ndim, size_t itemsize) noexcept nogil:
+ *     _copy_strided_to_strided(src.data, src.strides, dst.data, dst.strides,             # <<<<<<<<<<<<<<
+ *                              src.shape, dst.shape, ndim, itemsize)
+ * 
+ */
+  _copy_strided_to_strided(__pyx_v_src->data, __pyx_v_src->strides, __pyx_v_dst->data, __pyx_v_dst->strides, __pyx_v_src->shape, __pyx_v_dst->shape, __pyx_v_ndim, __pyx_v_itemsize); /* unpacks data/strides/shape of both slices; ndim and itemsize are forwarded unchanged */
+
+  /* "View.MemoryView":1167
+ *             dst_data += dst_stride
+ * 
+ * cdef void copy_strided_to_strided(__Pyx_memviewslice *src,             # <<<<<<<<<<<<<<
+ *                                   __Pyx_memviewslice *dst,
+ *                                   int ndim, size_t itemsize) noexcept nogil:
+ */
+
+  /* function exit code */
+}
+
+/* "View.MemoryView":1174
+ * 
+ * @cname('__pyx_memoryview_slice_get_size')
+ * cdef Py_ssize_t slice_get_size(__Pyx_memviewslice *src, int ndim) noexcept nogil:             # <<<<<<<<<<<<<<
+ *     "Return the size of the memory occupied by the slice in number of bytes"
+ *     cdef Py_ssize_t shape, size = src.memview.view.itemsize
+ */
+
+static Py_ssize_t __pyx_memoryview_slice_get_size(__Pyx_memviewslice *__pyx_v_src, int __pyx_v_ndim) { /* Cython-generated: returns itemsize multiplied by the first ndim entries of src->shape (the slice size in bytes, per the echoed docstring). */
+  Py_ssize_t __pyx_v_shape;
+  Py_ssize_t __pyx_v_size;
+  Py_ssize_t __pyx_r;
+  Py_ssize_t __pyx_t_1;
+  Py_ssize_t *__pyx_t_2;
+  Py_ssize_t *__pyx_t_3;
+  Py_ssize_t *__pyx_t_4;
+
+  /* "View.MemoryView":1176
+ * cdef Py_ssize_t slice_get_size(__Pyx_memviewslice *src, int ndim) noexcept nogil:
+ *     "Return the size of the memory occupied by the slice in number of bytes"
+ *     cdef Py_ssize_t shape, size = src.memview.view.itemsize             # <<<<<<<<<<<<<<
+ * 
+ *     for shape in src.shape[:ndim]:
+ */
+  __pyx_t_1 = __pyx_v_src->memview->view.itemsize; /* dereferences src->memview without a NULL check */
+  __pyx_v_size = __pyx_t_1; /* the running product starts at the item size */
+
+  /* "View.MemoryView":1178
+ *     cdef Py_ssize_t shape, size = src.memview.view.itemsize
+ * 
+ *     for shape in src.shape[:ndim]:             # <<<<<<<<<<<<<<
+ *         size *= shape
+ * 
+ */
+  __pyx_t_3 = (__pyx_v_src->shape + __pyx_v_ndim); /* end pointer: one past the last extent to visit */
+  for (__pyx_t_4 = __pyx_v_src->shape; __pyx_t_4 < __pyx_t_3; __pyx_t_4++) { /* with ndim == 0 the body never runs and the result is just itemsize */
+    __pyx_t_2 = __pyx_t_4;
+    __pyx_v_shape = (__pyx_t_2[0]);
+
+    /* "View.MemoryView":1179
+ * 
+ *     for shape in src.shape[:ndim]:
+ *         size *= shape             # <<<<<<<<<<<<<<
+ * 
+ *     return size
+ */
+    __pyx_v_size = (__pyx_v_size * __pyx_v_shape); /* plain multiplication; nothing here guards against overflow */
+  }
+
+  /* "View.MemoryView":1181
+ *         size *= shape
+ * 
+ *     return size             # <<<<<<<<<<<<<<
+ * 
+ * @cname('__pyx_fill_contig_strides_array')
+ */
+  __pyx_r = __pyx_v_size;
+  goto __pyx_L0; /* single exit point; this function has no error path */
+
+  /* "View.MemoryView":1174
+ * 
+ * @cname('__pyx_memoryview_slice_get_size')
+ * cdef Py_ssize_t slice_get_size(__Pyx_memviewslice *src, int ndim) noexcept nogil:             # <<<<<<<<<<<<<<
+ *     "Return the size of the memory occupied by the slice in number of bytes"
+ *     cdef Py_ssize_t shape, size = src.memview.view.itemsize
+ */
+
+  /* function exit code */
+  __pyx_L0:;
+  return __pyx_r;
+}
+
+/* "View.MemoryView":1184
+ * 
+ * @cname('__pyx_fill_contig_strides_array')
+ * cdef Py_ssize_t fill_contig_strides_array(             # <<<<<<<<<<<<<<
+ *                 Py_ssize_t *shape, Py_ssize_t *strides, Py_ssize_t stride,
+ *                 int ndim, char order) noexcept nogil:
+ */
+
+static Py_ssize_t __pyx_fill_contig_strides_array(Py_ssize_t *__pyx_v_shape, Py_ssize_t *__pyx_v_strides, Py_ssize_t __pyx_v_stride, int __pyx_v_ndim, char __pyx_v_order) { /* Cython-generated: fills strides[0..ndim-1] for a contiguous layout, starting from the given stride, and returns that stride multiplied by every extent. */
+  int __pyx_v_idx;
+  Py_ssize_t __pyx_r;
+  int __pyx_t_1;
+  int __pyx_t_2;
+  int __pyx_t_3;
+  int __pyx_t_4;
+
+  /* "View.MemoryView":1193
+ *     cdef int idx
+ * 
+ *     if order == 'F':             # <<<<<<<<<<<<<<
+ *         for idx in range(ndim):
+ *             strides[idx] = stride
+ */
+  __pyx_t_1 = (__pyx_v_order == 'F'); /* only the exact character 'F' selects this branch; every other value takes the else branch below */
+  if (__pyx_t_1) {
+
+    /* "View.MemoryView":1194
+ * 
+ *     if order == 'F':
+ *         for idx in range(ndim):             # <<<<<<<<<<<<<<
+ *             strides[idx] = stride
+ *             stride *= shape[idx]
+ */
+    __pyx_t_2 = __pyx_v_ndim;
+    __pyx_t_3 = __pyx_t_2;
+    for (__pyx_t_4 = 0; __pyx_t_4 < __pyx_t_3; __pyx_t_4+=1) { /* walks dimensions first to last, so dimension 0 receives the initial (smallest) stride */
+      __pyx_v_idx = __pyx_t_4;
+
+      /* "View.MemoryView":1195
+ *     if order == 'F':
+ *         for idx in range(ndim):
+ *             strides[idx] = stride             # <<<<<<<<<<<<<<
+ *             stride *= shape[idx]
+ *     else:
+ */
+      (__pyx_v_strides[__pyx_v_idx]) = __pyx_v_stride;
+
+      /* "View.MemoryView":1196
+ *         for idx in range(ndim):
+ *             strides[idx] = stride
+ *             stride *= shape[idx]             # <<<<<<<<<<<<<<
+ *     else:
+ *         for idx in range(ndim - 1, -1, -1):
+ */
+      __pyx_v_stride = (__pyx_v_stride * (__pyx_v_shape[__pyx_v_idx])); /* the next dimension's stride is this stride times this extent */
+    }
+
+    /* "View.MemoryView":1193
+ *     cdef int idx
+ * 
+ *     if order == 'F':             # <<<<<<<<<<<<<<
+ *         for idx in range(ndim):
+ *             strides[idx] = stride
+ */
+    goto __pyx_L3;
+  }
+
+  /* "View.MemoryView":1198
+ *             stride *= shape[idx]
+ *     else:
+ *         for idx in range(ndim - 1, -1, -1):             # <<<<<<<<<<<<<<
+ *             strides[idx] = stride
+ *             stride *= shape[idx]
+ */
+  /*else*/ {
+    for (__pyx_t_2 = (__pyx_v_ndim - 1); __pyx_t_2 > -1; __pyx_t_2-=1) { /* walks dimensions last to first, so the last dimension receives the initial (smallest) stride */
+      __pyx_v_idx = __pyx_t_2;
+
+      /* "View.MemoryView":1199
+ *     else:
+ *         for idx in range(ndim - 1, -1, -1):
+ *             strides[idx] = stride             # <<<<<<<<<<<<<<
+ *             stride *= shape[idx]
+ * 
+ */
+      (__pyx_v_strides[__pyx_v_idx]) = __pyx_v_stride;
+
+      /* "View.MemoryView":1200
+ *         for idx in range(ndim - 1, -1, -1):
+ *             strides[idx] = stride
+ *             stride *= shape[idx]             # <<<<<<<<<<<<<<
+ * 
+ *     return stride
+ */
+      __pyx_v_stride = (__pyx_v_stride * (__pyx_v_shape[__pyx_v_idx])); /* same accumulation as the 'F' branch, applied in reverse order */
+    }
+  }
+  __pyx_L3:;
+
+  /* "View.MemoryView":1202
+ *             stride *= shape[idx]
+ * 
+ *     return stride             # <<<<<<<<<<<<<<
+ * 
+ * @cname('__pyx_memoryview_copy_data_to_temp')
+ */
+  __pyx_r = __pyx_v_stride; /* initial stride times the product of all ndim extents; unchanged when ndim <= 0 */
+  goto __pyx_L0;
+
+  /* "View.MemoryView":1184
+ * 
+ * @cname('__pyx_fill_contig_strides_array')
+ * cdef Py_ssize_t fill_contig_strides_array(             # <<<<<<<<<<<<<<
+ *                 Py_ssize_t *shape, Py_ssize_t *strides, Py_ssize_t stride,
+ *                 int ndim, char order) noexcept nogil:
+ */
+
+  /* function exit code */
+  __pyx_L0:;
+  return __pyx_r;
+}
+
+/* "View.MemoryView":1205
+ * 
+ * @cname('__pyx_memoryview_copy_data_to_temp')
+ * cdef void *copy_data_to_temp(__Pyx_memviewslice *src,             # <<<<<<<<<<<<<<
+ *                              __Pyx_memviewslice *tmpslice,
+ *                              char order,
+ */
+
+static void *__pyx_memoryview_copy_data_to_temp(__Pyx_memviewslice *__pyx_v_src, __Pyx_memviewslice *__pyx_v_tmpslice, char __pyx_v_order, int __pyx_v_ndim) { /* Cython-generated: mallocs a buffer, copies src's data into it using contiguous strides for `order`, describes the buffer in *tmpslice and returns it; returns NULL after raising MemoryError when malloc fails. */
+  int __pyx_v_i;
+  void *__pyx_v_result;
+  size_t __pyx_v_itemsize;
+  size_t __pyx_v_size;
+  void *__pyx_r;
+  Py_ssize_t __pyx_t_1;
+  int __pyx_t_2;
+  int __pyx_t_3;
+  struct __pyx_memoryview_obj *__pyx_t_4;
+  int __pyx_t_5;
+  int __pyx_t_6;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  #ifdef WITH_THREAD
+  PyGILState_STATE __pyx_gilstate_save; /* only used on the error exit below */
+  #endif
+
+  /* "View.MemoryView":1216
+ *     cdef void *result
+ * 
+ *     cdef size_t itemsize = src.memview.view.itemsize             # <<<<<<<<<<<<<<
+ *     cdef size_t size = slice_get_size(src, ndim)
+ * 
+ */
+  __pyx_t_1 = __pyx_v_src->memview->view.itemsize;
+  __pyx_v_itemsize = __pyx_t_1; /* signed Py_ssize_t stored into an unsigned size_t */
+
+  /* "View.MemoryView":1217
+ * 
+ *     cdef size_t itemsize = src.memview.view.itemsize
+ *     cdef size_t size = slice_get_size(src, ndim)             # <<<<<<<<<<<<<<
+ * 
+ *     result = malloc(size)
+ */
+  __pyx_v_size = __pyx_memoryview_slice_get_size(__pyx_v_src, __pyx_v_ndim); /* total bytes to allocate; the Py_ssize_t result is stored into a size_t */
+
+  /* "View.MemoryView":1219
+ *     cdef size_t size = slice_get_size(src, ndim)
+ * 
+ *     result = malloc(size)             # <<<<<<<<<<<<<<
+ *     if not result:
+ *         _err_no_memory()
+ */
+  __pyx_v_result = malloc(__pyx_v_size); /* NOTE(review): for size == 0 the C standard allows malloc to return NULL, which the check below would report as MemoryError -- confirm callers never pass an empty slice */
+
+  /* "View.MemoryView":1220
+ * 
+ *     result = malloc(size)
+ *     if not result:             # <<<<<<<<<<<<<<
+ *         _err_no_memory()
+ * 
+ */
+  __pyx_t_2 = (!(__pyx_v_result != 0));
+  if (__pyx_t_2) {
+
+    /* "View.MemoryView":1221
+ *     result = malloc(size)
+ *     if not result:
+ *         _err_no_memory()             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+    __pyx_t_3 = __pyx_memoryview_err_no_memory(); if (unlikely(__pyx_t_3 == ((int)-1))) __PYX_ERR(1, 1221, __pyx_L1_error) /* a -1 return from the helper sends control to the error exit before tmpslice is touched */
+
+    /* "View.MemoryView":1220
+ * 
+ *     result = malloc(size)
+ *     if not result:             # <<<<<<<<<<<<<<
+ *         _err_no_memory()
+ * 
+ */
+  }
+
+  /* "View.MemoryView":1224
+ * 
+ * 
+ *     tmpslice.data = <char *> result             # <<<<<<<<<<<<<<
+ *     tmpslice.memview = src.memview
+ *     for i in range(ndim):
+ */
+  __pyx_v_tmpslice->data = ((char *)__pyx_v_result); /* tmpslice now points at the freshly allocated buffer */
+
+  /* "View.MemoryView":1225
+ * 
+ *     tmpslice.data = <char *> result
+ *     tmpslice.memview = src.memview             # <<<<<<<<<<<<<<
+ *     for i in range(ndim):
+ *         tmpslice.shape[i] = src.shape[i]
+ */
+  __pyx_t_4 = __pyx_v_src->memview;
+  __pyx_v_tmpslice->memview = __pyx_t_4; /* the pointer is copied as-is; no reference-count change is made here */
+
+  /* "View.MemoryView":1226
+ *     tmpslice.data = <char *> result
+ *     tmpslice.memview = src.memview
+ *     for i in range(ndim):             # <<<<<<<<<<<<<<
+ *         tmpslice.shape[i] = src.shape[i]
+ *         tmpslice.suboffsets[i] = -1
+ */
+  __pyx_t_3 = __pyx_v_ndim;
+  __pyx_t_5 = __pyx_t_3;
+  for (__pyx_t_6 = 0; __pyx_t_6 < __pyx_t_5; __pyx_t_6+=1) {
+    __pyx_v_i = __pyx_t_6;
+
+    /* "View.MemoryView":1227
+ *     tmpslice.memview = src.memview
+ *     for i in range(ndim):
+ *         tmpslice.shape[i] = src.shape[i]             # <<<<<<<<<<<<<<
+ *         tmpslice.suboffsets[i] = -1
+ * 
+ */
+    (__pyx_v_tmpslice->shape[__pyx_v_i]) = (__pyx_v_src->shape[__pyx_v_i]); /* the temporary keeps the same extents as the source */
+
+    /* "View.MemoryView":1228
+ *     for i in range(ndim):
+ *         tmpslice.shape[i] = src.shape[i]
+ *         tmpslice.suboffsets[i] = -1             # <<<<<<<<<<<<<<
+ * 
+ *     fill_contig_strides_array(&tmpslice.shape[0], &tmpslice.strides[0], itemsize, ndim, order)
+ */
+    (__pyx_v_tmpslice->suboffsets[__pyx_v_i]) = -1L; /* presumably -1 means "no suboffset" as in the buffer protocol -- TODO confirm */
+  }
+
+  /* "View.MemoryView":1230
+ *         tmpslice.suboffsets[i] = -1
+ * 
+ *     fill_contig_strides_array(&tmpslice.shape[0], &tmpslice.strides[0], itemsize, ndim, order)             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  (void)(__pyx_fill_contig_strides_array((&(__pyx_v_tmpslice->shape[0])), (&(__pyx_v_tmpslice->strides[0])), __pyx_v_itemsize, __pyx_v_ndim, __pyx_v_order)); /* writes tmpslice->strides; the returned total is deliberately discarded */
+
+  /* "View.MemoryView":1233
+ * 
+ * 
+ *     for i in range(ndim):             # <<<<<<<<<<<<<<
+ *         if tmpslice.shape[i] == 1:
+ *             tmpslice.strides[i] = 0
+ */
+  __pyx_t_3 = __pyx_v_ndim;
+  __pyx_t_5 = __pyx_t_3;
+  for (__pyx_t_6 = 0; __pyx_t_6 < __pyx_t_5; __pyx_t_6+=1) {
+    __pyx_v_i = __pyx_t_6;
+
+    /* "View.MemoryView":1234
+ * 
+ *     for i in range(ndim):
+ *         if tmpslice.shape[i] == 1:             # <<<<<<<<<<<<<<
+ *             tmpslice.strides[i] = 0
+ * 
+ */
+    __pyx_t_2 = ((__pyx_v_tmpslice->shape[__pyx_v_i]) == 1);
+    if (__pyx_t_2) {
+
+      /* "View.MemoryView":1235
+ *     for i in range(ndim):
+ *         if tmpslice.shape[i] == 1:
+ *             tmpslice.strides[i] = 0             # <<<<<<<<<<<<<<
+ * 
+ *     if slice_is_contig(src[0], order, ndim):
+ */
+      (__pyx_v_tmpslice->strides[__pyx_v_i]) = 0; /* dimensions of extent 1 get stride 0, overriding the value just computed */
+
+      /* "View.MemoryView":1234
+ * 
+ *     for i in range(ndim):
+ *         if tmpslice.shape[i] == 1:             # <<<<<<<<<<<<<<
+ *             tmpslice.strides[i] = 0
+ * 
+ */
+    }
+  }
+
+  /* "View.MemoryView":1237
+ *             tmpslice.strides[i] = 0
+ * 
+ *     if slice_is_contig(src[0], order, ndim):             # <<<<<<<<<<<<<<
+ *         memcpy(result, src.data, size)
+ *     else:
+ */
+  __pyx_t_2 = __pyx_memviewslice_is_contig((__pyx_v_src[0]), __pyx_v_order, __pyx_v_ndim); /* the slice struct is passed by value to the contiguity check */
+  if (__pyx_t_2) {
+
+    /* "View.MemoryView":1238
+ * 
+ *     if slice_is_contig(src[0], order, ndim):
+ *         memcpy(result, src.data, size)             # <<<<<<<<<<<<<<
+ *     else:
+ *         copy_strided_to_strided(src, tmpslice, ndim, itemsize)
+ */
+    (void)(memcpy(__pyx_v_result, __pyx_v_src->data, __pyx_v_size)); /* fast path: the source is reported contiguous in this order, so one bulk copy suffices */
+
+    /* "View.MemoryView":1237
+ *             tmpslice.strides[i] = 0
+ * 
+ *     if slice_is_contig(src[0], order, ndim):             # <<<<<<<<<<<<<<
+ *         memcpy(result, src.data, size)
+ *     else:
+ */
+    goto __pyx_L9;
+  }
+
+  /* "View.MemoryView":1240
+ *         memcpy(result, src.data, size)
+ *     else:
+ *         copy_strided_to_strided(src, tmpslice, ndim, itemsize)             # <<<<<<<<<<<<<<
+ * 
+ *     return result
+ */
+  /*else*/ {
+    copy_strided_to_strided(__pyx_v_src, __pyx_v_tmpslice, __pyx_v_ndim, __pyx_v_itemsize); /* slow path: strided copy from src into the new buffer via tmpslice's strides */
+  }
+  __pyx_L9:;
+
+  /* "View.MemoryView":1242
+ *         copy_strided_to_strided(src, tmpslice, ndim, itemsize)
+ * 
+ *     return result             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  __pyx_r = __pyx_v_result; /* the malloc'ed buffer is handed to the caller; it is not freed in this function */
+  goto __pyx_L0;
+
+  /* "View.MemoryView":1205
+ * 
+ * @cname('__pyx_memoryview_copy_data_to_temp')
+ * cdef void *copy_data_to_temp(__Pyx_memviewslice *src,             # <<<<<<<<<<<<<<
+ *                              __Pyx_memviewslice *tmpslice,
+ *                              char order,
+ */
+
+  /* function exit code */
+  __pyx_L1_error:; /* reached only from the failed-malloc branch, so there is no buffer to release */
+  #ifdef WITH_THREAD
+  __pyx_gilstate_save = __Pyx_PyGILState_Ensure(); /* the GIL state is taken before the traceback helper is called */
+  #endif
+  __Pyx_AddTraceback("View.MemoryView.copy_data_to_temp", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL; /* NULL is the failure value seen by the caller */
+  #ifdef WITH_THREAD
+  __Pyx_PyGILState_Release(__pyx_gilstate_save);
+  #endif
+  __pyx_L0:;
+  return __pyx_r;
+}
+
+/* "View.MemoryView":1247
+ * 
+ * @cname('__pyx_memoryview_err_extents')
+ * cdef int _err_extents(int i, Py_ssize_t extent1,             # <<<<<<<<<<<<<<
+ *                              Py_ssize_t extent2) except -1 with gil:
+ *     raise ValueError, f"got differing extents in dimension {i} (got {extent1} and {extent2})"
+ */
+
+static int __pyx_memoryview_err_extents(int __pyx_v_i, Py_ssize_t __pyx_v_extent1, Py_ssize_t __pyx_v_extent2) { /* Cython-generated: builds the "got differing extents in dimension ..." message, raises ValueError with it, and always returns -1. */
+  int __pyx_r;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  Py_ssize_t __pyx_t_2;
+  Py_UCS4 __pyx_t_3;
+  PyObject *__pyx_t_4 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  #ifdef WITH_THREAD
+  PyGILState_STATE __pyx_gilstate_save = __Pyx_PyGILState_Ensure(); /* GIL state taken on entry (the echoed signature says "with gil") and released at the exit below */
+  #endif
+  __Pyx_RefNannySetupContext("_err_extents", 0);
+
+  /* "View.MemoryView":1249
+ * cdef int _err_extents(int i, Py_ssize_t extent1,
+ *                              Py_ssize_t extent2) except -1 with gil:
+ *     raise ValueError, f"got differing extents in dimension {i} (got {extent1} and {extent2})"             # <<<<<<<<<<<<<<
+ * 
+ * @cname('__pyx_memoryview_err_dim')
+ */
+  __pyx_t_1 = PyTuple_New(7); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 1249, __pyx_L1_error) /* 7 slots: 4 constant fragments interleaved with the 3 formatted numbers */
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_t_2 = 0; /* running total of the final string length, later passed to the join helper */
+  __pyx_t_3 = 127; /* maximum character value passed to the join helper; 127 is the top of the ASCII range */
+  __Pyx_INCREF(__pyx_kp_u_got_differing_extents_in_dimensi);
+  __pyx_t_2 += 35; /* length of "got differing extents in dimension " */
+  __Pyx_GIVEREF(__pyx_kp_u_got_differing_extents_in_dimensi);
+  PyTuple_SET_ITEM(__pyx_t_1, 0, __pyx_kp_u_got_differing_extents_in_dimensi); /* the tuple takes over the reference added by the INCREF above */
+  __pyx_t_4 = __Pyx_PyUnicode_From_int(__pyx_v_i, 0, ' ', 'd'); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 1249, __pyx_L1_error) /* {i} rendered as a decimal string */
+  __Pyx_GOTREF(__pyx_t_4);
+  __pyx_t_2 += __Pyx_PyUnicode_GET_LENGTH(__pyx_t_4);
+  __Pyx_GIVEREF(__pyx_t_4);
+  PyTuple_SET_ITEM(__pyx_t_1, 1, __pyx_t_4);
+  __pyx_t_4 = 0; /* ownership moved into the tuple; cleared so the error exit does not decref it again */
+  __Pyx_INCREF(__pyx_kp_u_got);
+  __pyx_t_2 += 6; /* length of " (got " */
+  __Pyx_GIVEREF(__pyx_kp_u_got);
+  PyTuple_SET_ITEM(__pyx_t_1, 2, __pyx_kp_u_got);
+  __pyx_t_4 = __Pyx_PyUnicode_From_Py_ssize_t(__pyx_v_extent1, 0, ' ', 'd'); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 1249, __pyx_L1_error) /* {extent1} rendered as a decimal string */
+  __Pyx_GOTREF(__pyx_t_4);
+  __pyx_t_2 += __Pyx_PyUnicode_GET_LENGTH(__pyx_t_4);
+  __Pyx_GIVEREF(__pyx_t_4);
+  PyTuple_SET_ITEM(__pyx_t_1, 3, __pyx_t_4);
+  __pyx_t_4 = 0; /* ownership moved into the tuple */
+  __Pyx_INCREF(__pyx_kp_u_and);
+  __pyx_t_2 += 5; /* length of " and " */
+  __Pyx_GIVEREF(__pyx_kp_u_and);
+  PyTuple_SET_ITEM(__pyx_t_1, 4, __pyx_kp_u_and);
+  __pyx_t_4 = __Pyx_PyUnicode_From_Py_ssize_t(__pyx_v_extent2, 0, ' ', 'd'); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 1249, __pyx_L1_error) /* {extent2} rendered as a decimal string */
+  __Pyx_GOTREF(__pyx_t_4);
+  __pyx_t_2 += __Pyx_PyUnicode_GET_LENGTH(__pyx_t_4);
+  __Pyx_GIVEREF(__pyx_t_4);
+  PyTuple_SET_ITEM(__pyx_t_1, 5, __pyx_t_4);
+  __pyx_t_4 = 0; /* ownership moved into the tuple */
+  __Pyx_INCREF(__pyx_kp_u__7);
+  __pyx_t_2 += 1; /* length of the closing ")" */
+  __Pyx_GIVEREF(__pyx_kp_u__7);
+  PyTuple_SET_ITEM(__pyx_t_1, 6, __pyx_kp_u__7);
+  __pyx_t_4 = __Pyx_PyUnicode_Join(__pyx_t_1, 7, __pyx_t_2, __pyx_t_3); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 1249, __pyx_L1_error) /* joins the 7 pieces using the length and max-char values accumulated above */
+  __Pyx_GOTREF(__pyx_t_4);
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; /* the fragment tuple is no longer needed once the message exists */
+  __Pyx_Raise(__pyx_builtin_ValueError, __pyx_t_4, 0, 0); /* raises ValueError with the joined message */
+  __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+  __PYX_ERR(1, 1249, __pyx_L1_error) /* unconditional: the error exit below is the only way out of this function */
+
+  /* "View.MemoryView":1247
+ * 
+ * @cname('__pyx_memoryview_err_extents')
+ * cdef int _err_extents(int i, Py_ssize_t extent1,             # <<<<<<<<<<<<<<
+ *                              Py_ssize_t extent2) except -1 with gil:
+ *     raise ValueError, f"got differing extents in dimension {i} (got {extent1} and {extent2})"
+ */
+
+  /* function exit code */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_1); /* XDECREF tolerates the temporaries already being 0 */
+  __Pyx_XDECREF(__pyx_t_4);
+  __Pyx_AddTraceback("View.MemoryView._err_extents", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = -1; /* every call returns -1, matching the "except -1" in the echoed signature */
+  __Pyx_RefNannyFinishContext();
+  #ifdef WITH_THREAD
+  __Pyx_PyGILState_Release(__pyx_gilstate_save);
+  #endif
+  return __pyx_r;
+}
+
+/* "View.MemoryView":1252
+ * 
+ * @cname('__pyx_memoryview_err_dim')
+ * cdef int _err_dim(PyObject *error, str msg, int dim) except -1 with gil:             # <<<<<<<<<<<<<<
+ *     raise <object>error, msg % dim
+ * 
+ */
+
+static int __pyx_memoryview_err_dim(PyObject *__pyx_v_error, PyObject *__pyx_v_msg, int __pyx_v_dim) { /* Cython-generated: raises `error` with the message `msg % dim` and always returns -1. */
+  int __pyx_r;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  PyObject *__pyx_t_2 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  #ifdef WITH_THREAD
+  PyGILState_STATE __pyx_gilstate_save = __Pyx_PyGILState_Ensure(); /* GIL state taken on entry (the echoed signature says "with gil") and released at the exit below */
+  #endif
+  __Pyx_RefNannySetupContext("_err_dim", 0);
+  __Pyx_INCREF(__pyx_v_msg); /* holds msg for the duration of the call; balanced by the XDECREF at the exit */
+
+  /* "View.MemoryView":1253
+ * @cname('__pyx_memoryview_err_dim')
+ * cdef int _err_dim(PyObject *error, str msg, int dim) except -1 with gil:
+ *     raise <object>error, msg % dim             # <<<<<<<<<<<<<<
+ * 
+ * @cname('__pyx_memoryview_err')
+ */
+  __pyx_t_1 = __Pyx_PyInt_From_int(__pyx_v_dim); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 1253, __pyx_L1_error) /* dim converted to a Python int for the formatting step */
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_t_2 = __Pyx_PyString_FormatSafe(__pyx_v_msg, __pyx_t_1); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 1253, __pyx_L1_error) /* implements `msg % dim` from the echoed source */
+  __Pyx_GOTREF(__pyx_t_2);
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+  __Pyx_Raise(((PyObject *)__pyx_v_error), __pyx_t_2, 0, 0); /* raises the caller-supplied exception object/type with the formatted message */
+  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+  __PYX_ERR(1, 1253, __pyx_L1_error) /* unconditional: the error exit below is the only way out of this function */
+
+  /* "View.MemoryView":1252
+ * 
+ * @cname('__pyx_memoryview_err_dim')
+ * cdef int _err_dim(PyObject *error, str msg, int dim) except -1 with gil:             # <<<<<<<<<<<<<<
+ *     raise <object>error, msg % dim
+ * 
+ */
+
+  /* function exit code */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_1); /* XDECREF tolerates the temporaries already being 0 */
+  __Pyx_XDECREF(__pyx_t_2);
+  __Pyx_AddTraceback("View.MemoryView._err_dim", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = -1; /* every call returns -1, matching the "except -1" in the echoed signature */
+  __Pyx_XDECREF(__pyx_v_msg); /* releases the reference taken on entry */
+  __Pyx_RefNannyFinishContext();
+  #ifdef WITH_THREAD
+  __Pyx_PyGILState_Release(__pyx_gilstate_save);
+  #endif
+  return __pyx_r;
+}
+
+/* "View.MemoryView":1256
+ * 
+ * @cname('__pyx_memoryview_err')
+ * cdef int _err(PyObject *error, str msg) except -1 with gil:             # <<<<<<<<<<<<<<
+ *     raise <object>error, msg
+ * 
+ */
+
+static int __pyx_memoryview_err(PyObject *__pyx_v_error, PyObject *__pyx_v_msg) { /* Cython-generated: raises `error` with `msg` used unmodified and always returns -1. */
+  int __pyx_r;
+  __Pyx_RefNannyDeclarations
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  #ifdef WITH_THREAD
+  PyGILState_STATE __pyx_gilstate_save = __Pyx_PyGILState_Ensure(); /* GIL state taken on entry (the echoed signature says "with gil") and released at the exit below */
+  #endif
+  __Pyx_RefNannySetupContext("_err", 0);
+  __Pyx_INCREF(__pyx_v_msg); /* holds msg for the duration of the call; balanced by the XDECREF at the exit */
+
+  /* "View.MemoryView":1257
+ * @cname('__pyx_memoryview_err')
+ * cdef int _err(PyObject *error, str msg) except -1 with gil:
+ *     raise <object>error, msg             # <<<<<<<<<<<<<<
+ * 
+ * @cname('__pyx_memoryview_err_no_memory')
+ */
+  __Pyx_Raise(((PyObject *)__pyx_v_error), __pyx_v_msg, 0, 0); /* raises the caller-supplied exception object/type with msg as its value */
+  __PYX_ERR(1, 1257, __pyx_L1_error) /* unconditional: the error exit below is the only way out of this function */
+
+  /* "View.MemoryView":1256
+ * 
+ * @cname('__pyx_memoryview_err')
+ * cdef int _err(PyObject *error, str msg) except -1 with gil:             # <<<<<<<<<<<<<<
+ *     raise <object>error, msg
+ * 
+ */
+
+  /* function exit code */
+  __pyx_L1_error:;
+  __Pyx_AddTraceback("View.MemoryView._err", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = -1; /* every call returns -1, matching the "except -1" in the echoed signature */
+  __Pyx_XDECREF(__pyx_v_msg); /* releases the reference taken on entry */
+  __Pyx_RefNannyFinishContext();
+  #ifdef WITH_THREAD
+  __Pyx_PyGILState_Release(__pyx_gilstate_save);
+  #endif
+  return __pyx_r;
+}
+
+/* "View.MemoryView":1260
+ * 
+ * @cname('__pyx_memoryview_err_no_memory')
+ * cdef int _err_no_memory() except -1 with gil:             # <<<<<<<<<<<<<<
+ *     raise MemoryError
+ * 
+ */
+
+static int __pyx_memoryview_err_no_memory(void) { /* Cython-generated: raises MemoryError via PyErr_NoMemory and always returns -1. */
+  int __pyx_r;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  #ifdef WITH_THREAD
+  PyGILState_STATE __pyx_gilstate_save = __Pyx_PyGILState_Ensure(); /* GIL state taken on entry (the echoed signature says "with gil") and released at the exit below */
+  #endif
+
+  /* "View.MemoryView":1261
+ * @cname('__pyx_memoryview_err_no_memory')
+ * cdef int _err_no_memory() except -1 with gil:
+ *     raise MemoryError             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  PyErr_NoMemory(); __PYX_ERR(1, 1261, __pyx_L1_error) /* sets the exception, then jumps unconditionally to the error exit */
+
+  /* "View.MemoryView":1260
+ * 
+ * @cname('__pyx_memoryview_err_no_memory')
+ * cdef int _err_no_memory() except -1 with gil:             # <<<<<<<<<<<<<<
+ *     raise MemoryError
+ * 
+ */
+
+  /* function exit code */
+  __pyx_L1_error:;
+  __Pyx_AddTraceback("View.MemoryView._err_no_memory", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = -1; /* every call returns -1, matching the "except -1" in the echoed signature */
+  #ifdef WITH_THREAD
+  __Pyx_PyGILState_Release(__pyx_gilstate_save);
+  #endif
+  return __pyx_r;
+}
+
+/* "View.MemoryView":1265
+ * 
+ * @cname('__pyx_memoryview_copy_contents')
+ * cdef int memoryview_copy_contents(__Pyx_memviewslice src,             # <<<<<<<<<<<<<<
+ *                                   __Pyx_memviewslice dst,
+ *                                   int src_ndim, int dst_ndim,
+ */
+
+static int __pyx_memoryview_copy_contents(__Pyx_memviewslice __pyx_v_src, __Pyx_memviewslice __pyx_v_dst, int __pyx_v_src_ndim, int __pyx_v_dst_ndim, int __pyx_v_dtype_is_object) {
+  void *__pyx_v_tmpdata;
+  size_t __pyx_v_itemsize;
+  int __pyx_v_i;
+  char __pyx_v_order;
+  int __pyx_v_broadcasting;
+  int __pyx_v_direct_copy;
+  __Pyx_memviewslice __pyx_v_tmp;
+  int __pyx_v_ndim;
+  int __pyx_r;
+  Py_ssize_t __pyx_t_1;
+  int __pyx_t_2;
+  int __pyx_t_3;
+  int __pyx_t_4;
+  int __pyx_t_5;
+  int __pyx_t_6;
+  void *__pyx_t_7;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  #ifdef WITH_THREAD
+  PyGILState_STATE __pyx_gilstate_save;
+  #endif
+
+  /* "View.MemoryView":1273
+ *     Check for overlapping memory and verify the shapes.
+ *     """
+ *     cdef void *tmpdata = NULL             # <<<<<<<<<<<<<<
+ *     cdef size_t itemsize = src.memview.view.itemsize
+ *     cdef int i
+ */
+  __pyx_v_tmpdata = NULL;
+
+  /* "View.MemoryView":1274
+ *     """
+ *     cdef void *tmpdata = NULL
+ *     cdef size_t itemsize = src.memview.view.itemsize             # <<<<<<<<<<<<<<
+ *     cdef int i
+ *     cdef char order = get_best_order(&src, src_ndim)
+ */
+  __pyx_t_1 = __pyx_v_src.memview->view.itemsize;
+  __pyx_v_itemsize = __pyx_t_1;
+
+  /* "View.MemoryView":1276
+ *     cdef size_t itemsize = src.memview.view.itemsize
+ *     cdef int i
+ *     cdef char order = get_best_order(&src, src_ndim)             # <<<<<<<<<<<<<<
+ *     cdef bint broadcasting = False
+ *     cdef bint direct_copy = False
+ */
+  __pyx_v_order = __pyx_get_best_slice_order((&__pyx_v_src), __pyx_v_src_ndim);
+
+  /* "View.MemoryView":1277
+ *     cdef int i
+ *     cdef char order = get_best_order(&src, src_ndim)
+ *     cdef bint broadcasting = False             # <<<<<<<<<<<<<<
+ *     cdef bint direct_copy = False
+ *     cdef __Pyx_memviewslice tmp
+ */
+  __pyx_v_broadcasting = 0;
+
+  /* "View.MemoryView":1278
+ *     cdef char order = get_best_order(&src, src_ndim)
+ *     cdef bint broadcasting = False
+ *     cdef bint direct_copy = False             # <<<<<<<<<<<<<<
+ *     cdef __Pyx_memviewslice tmp
+ * 
+ */
+  __pyx_v_direct_copy = 0;
+
+  /* "View.MemoryView":1281
+ *     cdef __Pyx_memviewslice tmp
+ * 
+ *     if src_ndim < dst_ndim:             # <<<<<<<<<<<<<<
+ *         broadcast_leading(&src, src_ndim, dst_ndim)
+ *     elif dst_ndim < src_ndim:
+ */
+  __pyx_t_2 = (__pyx_v_src_ndim < __pyx_v_dst_ndim);
+  if (__pyx_t_2) {
+
+    /* "View.MemoryView":1282
+ * 
+ *     if src_ndim < dst_ndim:
+ *         broadcast_leading(&src, src_ndim, dst_ndim)             # <<<<<<<<<<<<<<
+ *     elif dst_ndim < src_ndim:
+ *         broadcast_leading(&dst, dst_ndim, src_ndim)
+ */
+    __pyx_memoryview_broadcast_leading((&__pyx_v_src), __pyx_v_src_ndim, __pyx_v_dst_ndim);
+
+    /* "View.MemoryView":1281
+ *     cdef __Pyx_memviewslice tmp
+ * 
+ *     if src_ndim < dst_ndim:             # <<<<<<<<<<<<<<
+ *         broadcast_leading(&src, src_ndim, dst_ndim)
+ *     elif dst_ndim < src_ndim:
+ */
+    goto __pyx_L3;
+  }
+
+  /* "View.MemoryView":1283
+ *     if src_ndim < dst_ndim:
+ *         broadcast_leading(&src, src_ndim, dst_ndim)
+ *     elif dst_ndim < src_ndim:             # <<<<<<<<<<<<<<
+ *         broadcast_leading(&dst, dst_ndim, src_ndim)
+ * 
+ */
+  __pyx_t_2 = (__pyx_v_dst_ndim < __pyx_v_src_ndim);
+  if (__pyx_t_2) {
+
+    /* "View.MemoryView":1284
+ *         broadcast_leading(&src, src_ndim, dst_ndim)
+ *     elif dst_ndim < src_ndim:
+ *         broadcast_leading(&dst, dst_ndim, src_ndim)             # <<<<<<<<<<<<<<
+ * 
+ *     cdef int ndim = max(src_ndim, dst_ndim)
+ */
+    __pyx_memoryview_broadcast_leading((&__pyx_v_dst), __pyx_v_dst_ndim, __pyx_v_src_ndim);
+
+    /* "View.MemoryView":1283
+ *     if src_ndim < dst_ndim:
+ *         broadcast_leading(&src, src_ndim, dst_ndim)
+ *     elif dst_ndim < src_ndim:             # <<<<<<<<<<<<<<
+ *         broadcast_leading(&dst, dst_ndim, src_ndim)
+ * 
+ */
+  }
+  __pyx_L3:;
+
+  /* "View.MemoryView":1286
+ *         broadcast_leading(&dst, dst_ndim, src_ndim)
+ * 
+ *     cdef int ndim = max(src_ndim, dst_ndim)             # <<<<<<<<<<<<<<
+ * 
+ *     for i in range(ndim):
+ */
+  __pyx_t_3 = __pyx_v_dst_ndim;
+  __pyx_t_4 = __pyx_v_src_ndim;
+  __pyx_t_2 = (__pyx_t_3 > __pyx_t_4);
+  if (__pyx_t_2) {
+    __pyx_t_5 = __pyx_t_3;
+  } else {
+    __pyx_t_5 = __pyx_t_4;
+  }
+  __pyx_v_ndim = __pyx_t_5;
+
+  /* "View.MemoryView":1288
+ *     cdef int ndim = max(src_ndim, dst_ndim)
+ * 
+ *     for i in range(ndim):             # <<<<<<<<<<<<<<
+ *         if src.shape[i] != dst.shape[i]:
+ *             if src.shape[i] == 1:
+ */
+  __pyx_t_5 = __pyx_v_ndim;
+  __pyx_t_3 = __pyx_t_5;
+  for (__pyx_t_4 = 0; __pyx_t_4 < __pyx_t_3; __pyx_t_4+=1) {
+    __pyx_v_i = __pyx_t_4;
+
+    /* "View.MemoryView":1289
+ * 
+ *     for i in range(ndim):
+ *         if src.shape[i] != dst.shape[i]:             # <<<<<<<<<<<<<<
+ *             if src.shape[i] == 1:
+ *                 broadcasting = True
+ */
+    __pyx_t_2 = ((__pyx_v_src.shape[__pyx_v_i]) != (__pyx_v_dst.shape[__pyx_v_i]));
+    if (__pyx_t_2) {
+
+      /* "View.MemoryView":1290
+ *     for i in range(ndim):
+ *         if src.shape[i] != dst.shape[i]:
+ *             if src.shape[i] == 1:             # <<<<<<<<<<<<<<
+ *                 broadcasting = True
+ *                 src.strides[i] = 0
+ */
+      __pyx_t_2 = ((__pyx_v_src.shape[__pyx_v_i]) == 1);
+      if (__pyx_t_2) {
+
+        /* "View.MemoryView":1291
+ *         if src.shape[i] != dst.shape[i]:
+ *             if src.shape[i] == 1:
+ *                 broadcasting = True             # <<<<<<<<<<<<<<
+ *                 src.strides[i] = 0
+ *             else:
+ */
+        __pyx_v_broadcasting = 1;
+
+        /* "View.MemoryView":1292
+ *             if src.shape[i] == 1:
+ *                 broadcasting = True
+ *                 src.strides[i] = 0             # <<<<<<<<<<<<<<
+ *             else:
+ *                 _err_extents(i, dst.shape[i], src.shape[i])
+ */
+        (__pyx_v_src.strides[__pyx_v_i]) = 0;
+
+        /* "View.MemoryView":1290
+ *     for i in range(ndim):
+ *         if src.shape[i] != dst.shape[i]:
+ *             if src.shape[i] == 1:             # <<<<<<<<<<<<<<
+ *                 broadcasting = True
+ *                 src.strides[i] = 0
+ */
+        goto __pyx_L7;
+      }
+
+      /* "View.MemoryView":1294
+ *                 src.strides[i] = 0
+ *             else:
+ *                 _err_extents(i, dst.shape[i], src.shape[i])             # <<<<<<<<<<<<<<
+ * 
+ *         if src.suboffsets[i] >= 0:
+ */
+      /*else*/ {
+        __pyx_t_6 = __pyx_memoryview_err_extents(__pyx_v_i, (__pyx_v_dst.shape[__pyx_v_i]), (__pyx_v_src.shape[__pyx_v_i])); if (unlikely(__pyx_t_6 == ((int)-1))) __PYX_ERR(1, 1294, __pyx_L1_error)
+      }
+      __pyx_L7:;
+
+      /* "View.MemoryView":1289
+ * 
+ *     for i in range(ndim):
+ *         if src.shape[i] != dst.shape[i]:             # <<<<<<<<<<<<<<
+ *             if src.shape[i] == 1:
+ *                 broadcasting = True
+ */
+    }
+
+    /* "View.MemoryView":1296
+ *                 _err_extents(i, dst.shape[i], src.shape[i])
+ * 
+ *         if src.suboffsets[i] >= 0:             # <<<<<<<<<<<<<<
+ *             _err_dim(PyExc_ValueError, "Dimension %d is not direct", i)
+ * 
+ */
+    __pyx_t_2 = ((__pyx_v_src.suboffsets[__pyx_v_i]) >= 0);
+    if (__pyx_t_2) {
+
+      /* "View.MemoryView":1297
+ * 
+ *         if src.suboffsets[i] >= 0:
+ *             _err_dim(PyExc_ValueError, "Dimension %d is not direct", i)             # <<<<<<<<<<<<<<
+ * 
+ *     if slices_overlap(&src, &dst, ndim, itemsize):
+ */
+      __pyx_t_6 = __pyx_memoryview_err_dim(PyExc_ValueError, __pyx_kp_s_Dimension_d_is_not_direct, __pyx_v_i); if (unlikely(__pyx_t_6 == ((int)-1))) __PYX_ERR(1, 1297, __pyx_L1_error)
+
+      /* "View.MemoryView":1296
+ *                 _err_extents(i, dst.shape[i], src.shape[i])
+ * 
+ *         if src.suboffsets[i] >= 0:             # <<<<<<<<<<<<<<
+ *             _err_dim(PyExc_ValueError, "Dimension %d is not direct", i)
+ * 
+ */
+    }
+  }
+
+  /* "View.MemoryView":1299
+ *             _err_dim(PyExc_ValueError, "Dimension %d is not direct", i)
+ * 
+ *     if slices_overlap(&src, &dst, ndim, itemsize):             # <<<<<<<<<<<<<<
+ * 
+ *         if not slice_is_contig(src, order, ndim):
+ */
+  __pyx_t_2 = __pyx_slices_overlap((&__pyx_v_src), (&__pyx_v_dst), __pyx_v_ndim, __pyx_v_itemsize);
+  if (__pyx_t_2) {
+
+    /* "View.MemoryView":1301
+ *     if slices_overlap(&src, &dst, ndim, itemsize):
+ * 
+ *         if not slice_is_contig(src, order, ndim):             # <<<<<<<<<<<<<<
+ *             order = get_best_order(&dst, ndim)
+ * 
+ */
+    __pyx_t_2 = (!__pyx_memviewslice_is_contig(__pyx_v_src, __pyx_v_order, __pyx_v_ndim));
+    if (__pyx_t_2) {
+
+      /* "View.MemoryView":1302
+ * 
+ *         if not slice_is_contig(src, order, ndim):
+ *             order = get_best_order(&dst, ndim)             # <<<<<<<<<<<<<<
+ * 
+ *         tmpdata = copy_data_to_temp(&src, &tmp, order, ndim)
+ */
+      __pyx_v_order = __pyx_get_best_slice_order((&__pyx_v_dst), __pyx_v_ndim);
+
+      /* "View.MemoryView":1301
+ *     if slices_overlap(&src, &dst, ndim, itemsize):
+ * 
+ *         if not slice_is_contig(src, order, ndim):             # <<<<<<<<<<<<<<
+ *             order = get_best_order(&dst, ndim)
+ * 
+ */
+    }
+
+    /* "View.MemoryView":1304
+ *             order = get_best_order(&dst, ndim)
+ * 
+ *         tmpdata = copy_data_to_temp(&src, &tmp, order, ndim)             # <<<<<<<<<<<<<<
+ *         src = tmp
+ * 
+ */
+    __pyx_t_7 = __pyx_memoryview_copy_data_to_temp((&__pyx_v_src), (&__pyx_v_tmp), __pyx_v_order, __pyx_v_ndim); if (unlikely(__pyx_t_7 == ((void *)NULL))) __PYX_ERR(1, 1304, __pyx_L1_error)
+    __pyx_v_tmpdata = __pyx_t_7;
+
+    /* "View.MemoryView":1305
+ * 
+ *         tmpdata = copy_data_to_temp(&src, &tmp, order, ndim)
+ *         src = tmp             # <<<<<<<<<<<<<<
+ * 
+ *     if not broadcasting:
+ */
+    __pyx_v_src = __pyx_v_tmp;
+
+    /* "View.MemoryView":1299
+ *             _err_dim(PyExc_ValueError, "Dimension %d is not direct", i)
+ * 
+ *     if slices_overlap(&src, &dst, ndim, itemsize):             # <<<<<<<<<<<<<<
+ * 
+ *         if not slice_is_contig(src, order, ndim):
+ */
+  }
+
+  /* "View.MemoryView":1307
+ *         src = tmp
+ * 
+ *     if not broadcasting:             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  __pyx_t_2 = (!__pyx_v_broadcasting);
+  if (__pyx_t_2) {
+
+    /* "View.MemoryView":1310
+ * 
+ * 
+ *         if slice_is_contig(src, 'C', ndim):             # <<<<<<<<<<<<<<
+ *             direct_copy = slice_is_contig(dst, 'C', ndim)
+ *         elif slice_is_contig(src, 'F', ndim):
+ */
+    __pyx_t_2 = __pyx_memviewslice_is_contig(__pyx_v_src, 'C', __pyx_v_ndim);
+    if (__pyx_t_2) {
+
+      /* "View.MemoryView":1311
+ * 
+ *         if slice_is_contig(src, 'C', ndim):
+ *             direct_copy = slice_is_contig(dst, 'C', ndim)             # <<<<<<<<<<<<<<
+ *         elif slice_is_contig(src, 'F', ndim):
+ *             direct_copy = slice_is_contig(dst, 'F', ndim)
+ */
+      __pyx_v_direct_copy = __pyx_memviewslice_is_contig(__pyx_v_dst, 'C', __pyx_v_ndim);
+
+      /* "View.MemoryView":1310
+ * 
+ * 
+ *         if slice_is_contig(src, 'C', ndim):             # <<<<<<<<<<<<<<
+ *             direct_copy = slice_is_contig(dst, 'C', ndim)
+ *         elif slice_is_contig(src, 'F', ndim):
+ */
+      goto __pyx_L12;
+    }
+
+    /* "View.MemoryView":1312
+ *         if slice_is_contig(src, 'C', ndim):
+ *             direct_copy = slice_is_contig(dst, 'C', ndim)
+ *         elif slice_is_contig(src, 'F', ndim):             # <<<<<<<<<<<<<<
+ *             direct_copy = slice_is_contig(dst, 'F', ndim)
+ * 
+ */
+    __pyx_t_2 = __pyx_memviewslice_is_contig(__pyx_v_src, 'F', __pyx_v_ndim);
+    if (__pyx_t_2) {
+
+      /* "View.MemoryView":1313
+ *             direct_copy = slice_is_contig(dst, 'C', ndim)
+ *         elif slice_is_contig(src, 'F', ndim):
+ *             direct_copy = slice_is_contig(dst, 'F', ndim)             # <<<<<<<<<<<<<<
+ * 
+ *         if direct_copy:
+ */
+      __pyx_v_direct_copy = __pyx_memviewslice_is_contig(__pyx_v_dst, 'F', __pyx_v_ndim);
+
+      /* "View.MemoryView":1312
+ *         if slice_is_contig(src, 'C', ndim):
+ *             direct_copy = slice_is_contig(dst, 'C', ndim)
+ *         elif slice_is_contig(src, 'F', ndim):             # <<<<<<<<<<<<<<
+ *             direct_copy = slice_is_contig(dst, 'F', ndim)
+ * 
+ */
+    }
+    __pyx_L12:;
+
+    /* "View.MemoryView":1315
+ *             direct_copy = slice_is_contig(dst, 'F', ndim)
+ * 
+ *         if direct_copy:             # <<<<<<<<<<<<<<
+ * 
+ *             refcount_copying(&dst, dtype_is_object, ndim, inc=False)
+ */
+    if (__pyx_v_direct_copy) {
+
+      /* "View.MemoryView":1317
+ *         if direct_copy:
+ * 
+ *             refcount_copying(&dst, dtype_is_object, ndim, inc=False)             # <<<<<<<<<<<<<<
+ *             memcpy(dst.data, src.data, slice_get_size(&src, ndim))
+ *             refcount_copying(&dst, dtype_is_object, ndim, inc=True)
+ */
+      __pyx_memoryview_refcount_copying((&__pyx_v_dst), __pyx_v_dtype_is_object, __pyx_v_ndim, 0);
+
+      /* "View.MemoryView":1318
+ * 
+ *             refcount_copying(&dst, dtype_is_object, ndim, inc=False)
+ *             memcpy(dst.data, src.data, slice_get_size(&src, ndim))             # <<<<<<<<<<<<<<
+ *             refcount_copying(&dst, dtype_is_object, ndim, inc=True)
+ *             free(tmpdata)
+ */
+      (void)(memcpy(__pyx_v_dst.data, __pyx_v_src.data, __pyx_memoryview_slice_get_size((&__pyx_v_src), __pyx_v_ndim)));
+
+      /* "View.MemoryView":1319
+ *             refcount_copying(&dst, dtype_is_object, ndim, inc=False)
+ *             memcpy(dst.data, src.data, slice_get_size(&src, ndim))
+ *             refcount_copying(&dst, dtype_is_object, ndim, inc=True)             # <<<<<<<<<<<<<<
+ *             free(tmpdata)
+ *             return 0
+ */
+      __pyx_memoryview_refcount_copying((&__pyx_v_dst), __pyx_v_dtype_is_object, __pyx_v_ndim, 1);
+
+      /* "View.MemoryView":1320
+ *             memcpy(dst.data, src.data, slice_get_size(&src, ndim))
+ *             refcount_copying(&dst, dtype_is_object, ndim, inc=True)
+ *             free(tmpdata)             # <<<<<<<<<<<<<<
+ *             return 0
+ * 
+ */
+      free(__pyx_v_tmpdata);
+
+      /* "View.MemoryView":1321
+ *             refcount_copying(&dst, dtype_is_object, ndim, inc=True)
+ *             free(tmpdata)
+ *             return 0             # <<<<<<<<<<<<<<
+ * 
+ *     if order == 'F' == get_best_order(&dst, ndim):
+ */
+      __pyx_r = 0;
+      goto __pyx_L0;
+
+      /* "View.MemoryView":1315
+ *             direct_copy = slice_is_contig(dst, 'F', ndim)
+ * 
+ *         if direct_copy:             # <<<<<<<<<<<<<<
+ * 
+ *             refcount_copying(&dst, dtype_is_object, ndim, inc=False)
+ */
+    }
+
+    /* "View.MemoryView":1307
+ *         src = tmp
+ * 
+ *     if not broadcasting:             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  }
+
+  /* "View.MemoryView":1323
+ *             return 0
+ * 
+ *     if order == 'F' == get_best_order(&dst, ndim):             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  __pyx_t_2 = (__pyx_v_order == 'F');
+  if (__pyx_t_2) {
+    __pyx_t_2 = ('F' == __pyx_get_best_slice_order((&__pyx_v_dst), __pyx_v_ndim));
+  }
+  if (__pyx_t_2) {
+
+    /* "View.MemoryView":1326
+ * 
+ * 
+ *         transpose_memslice(&src)             # <<<<<<<<<<<<<<
+ *         transpose_memslice(&dst)
+ * 
+ */
+    __pyx_t_5 = __pyx_memslice_transpose((&__pyx_v_src)); if (unlikely(__pyx_t_5 == ((int)-1))) __PYX_ERR(1, 1326, __pyx_L1_error)
+
+    /* "View.MemoryView":1327
+ * 
+ *         transpose_memslice(&src)
+ *         transpose_memslice(&dst)             # <<<<<<<<<<<<<<
+ * 
+ *     refcount_copying(&dst, dtype_is_object, ndim, inc=False)
+ */
+    __pyx_t_5 = __pyx_memslice_transpose((&__pyx_v_dst)); if (unlikely(__pyx_t_5 == ((int)-1))) __PYX_ERR(1, 1327, __pyx_L1_error)
+
+    /* "View.MemoryView":1323
+ *             return 0
+ * 
+ *     if order == 'F' == get_best_order(&dst, ndim):             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  }
+
+  /* "View.MemoryView":1329
+ *         transpose_memslice(&dst)
+ * 
+ *     refcount_copying(&dst, dtype_is_object, ndim, inc=False)             # <<<<<<<<<<<<<<
+ *     copy_strided_to_strided(&src, &dst, ndim, itemsize)
+ *     refcount_copying(&dst, dtype_is_object, ndim, inc=True)
+ */
+  __pyx_memoryview_refcount_copying((&__pyx_v_dst), __pyx_v_dtype_is_object, __pyx_v_ndim, 0);
+
+  /* "View.MemoryView":1330
+ * 
+ *     refcount_copying(&dst, dtype_is_object, ndim, inc=False)
+ *     copy_strided_to_strided(&src, &dst, ndim, itemsize)             # <<<<<<<<<<<<<<
+ *     refcount_copying(&dst, dtype_is_object, ndim, inc=True)
+ * 
+ */
+  copy_strided_to_strided((&__pyx_v_src), (&__pyx_v_dst), __pyx_v_ndim, __pyx_v_itemsize);
+
+  /* "View.MemoryView":1331
+ *     refcount_copying(&dst, dtype_is_object, ndim, inc=False)
+ *     copy_strided_to_strided(&src, &dst, ndim, itemsize)
+ *     refcount_copying(&dst, dtype_is_object, ndim, inc=True)             # <<<<<<<<<<<<<<
+ * 
+ *     free(tmpdata)
+ */
+  __pyx_memoryview_refcount_copying((&__pyx_v_dst), __pyx_v_dtype_is_object, __pyx_v_ndim, 1);
+
+  /* "View.MemoryView":1333
+ *     refcount_copying(&dst, dtype_is_object, ndim, inc=True)
+ * 
+ *     free(tmpdata)             # <<<<<<<<<<<<<<
+ *     return 0
+ * 
+ */
+  free(__pyx_v_tmpdata);
+
+  /* "View.MemoryView":1334
+ * 
+ *     free(tmpdata)
+ *     return 0             # <<<<<<<<<<<<<<
+ * 
+ * @cname('__pyx_memoryview_broadcast_leading')
+ */
+  __pyx_r = 0;
+  goto __pyx_L0;
+
+  /* "View.MemoryView":1265
+ * 
+ * @cname('__pyx_memoryview_copy_contents')
+ * cdef int memoryview_copy_contents(__Pyx_memviewslice src,             # <<<<<<<<<<<<<<
+ *                                   __Pyx_memviewslice dst,
+ *                                   int src_ndim, int dst_ndim,
+ */
+
+  /* function exit code */
+  __pyx_L1_error:;
+  #ifdef WITH_THREAD
+  __pyx_gilstate_save = __Pyx_PyGILState_Ensure();
+  #endif
+  __Pyx_AddTraceback("View.MemoryView.memoryview_copy_contents", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = -1;
+  #ifdef WITH_THREAD
+  __Pyx_PyGILState_Release(__pyx_gilstate_save);
+  #endif
+  __pyx_L0:;
+  return __pyx_r;
+}
+
+/* "View.MemoryView":1337
+ * 
+ * @cname('__pyx_memoryview_broadcast_leading')
+ * cdef void broadcast_leading(__Pyx_memviewslice *mslice,             # <<<<<<<<<<<<<<
+ *                             int ndim,
+ *                             int ndim_other) noexcept nogil:
+ */
+
+/* Right-align the slice's existing dimensions inside a larger rank.
+ *
+ * The first `ndim` entries of shape/strides/suboffsets are moved up by
+ * (ndim_other - ndim) slots; the vacated leading slots are then filled in
+ * as extent-1 dimensions with suboffset -1.
+ */
+static void __pyx_memoryview_broadcast_leading(__Pyx_memviewslice *__pyx_v_mslice, int __pyx_v_ndim, int __pyx_v_ndim_other) {
+  Py_ssize_t *__pyx_v_shape = __pyx_v_mslice->shape;
+  Py_ssize_t *__pyx_v_strides = __pyx_v_mslice->strides;
+  Py_ssize_t *__pyx_v_suboffsets = __pyx_v_mslice->suboffsets;
+  const int __pyx_v_shift = __pyx_v_ndim_other - __pyx_v_ndim;
+  int __pyx_v_dim;
+
+  /* Go from the last dimension down to the first so that no entry is
+   * overwritten before it has been moved to its new slot. */
+  for (__pyx_v_dim = __pyx_v_ndim - 1; __pyx_v_dim >= 0; --__pyx_v_dim) {
+    const int __pyx_v_target = __pyx_v_dim + __pyx_v_shift;
+    __pyx_v_shape[__pyx_v_target] = __pyx_v_shape[__pyx_v_dim];
+    __pyx_v_strides[__pyx_v_target] = __pyx_v_strides[__pyx_v_dim];
+    __pyx_v_suboffsets[__pyx_v_target] = __pyx_v_suboffsets[__pyx_v_dim];
+  }
+
+  /* Fill the new leading dimensions: extent 1, the stride currently stored
+   * in slot 0, and a suboffset of -1. */
+  for (__pyx_v_dim = 0; __pyx_v_dim < __pyx_v_shift; ++__pyx_v_dim) {
+    __pyx_v_shape[__pyx_v_dim] = 1;
+    __pyx_v_strides[__pyx_v_dim] = __pyx_v_strides[0];
+    __pyx_v_suboffsets[__pyx_v_dim] = -1;
+  }
+}
+
+/* "View.MemoryView":1359
+ * 
+ * @cname('__pyx_memoryview_refcount_copying')
+ * cdef void refcount_copying(__Pyx_memviewslice *dst, bint dtype_is_object, int ndim, bint inc) noexcept nogil:             # <<<<<<<<<<<<<<
+ * 
+ *     if dtype_is_object:
+ */
+
+/* Adjust the reference counts of every element of `dst`, but only when the
+ * slice holds Python objects.
+ *
+ * `inc` non-zero increments, zero decrements. For non-object dtypes this is
+ * a no-op. The actual work is delegated to the GIL-acquiring helper.
+ */
+static void __pyx_memoryview_refcount_copying(__Pyx_memviewslice *__pyx_v_dst, int __pyx_v_dtype_is_object, int __pyx_v_ndim, int __pyx_v_inc) {
+  if (!__pyx_v_dtype_is_object) {
+    return;
+  }
+
+  __pyx_memoryview_refcount_objects_in_slice_with_gil(
+      __pyx_v_dst->data,
+      __pyx_v_dst->shape,
+      __pyx_v_dst->strides,
+      __pyx_v_ndim,
+      __pyx_v_inc);
+}
+
+/* "View.MemoryView":1365
+ * 
+ * @cname('__pyx_memoryview_refcount_objects_in_slice_with_gil')
+ * cdef void refcount_objects_in_slice_with_gil(char *data, Py_ssize_t *shape,             # <<<<<<<<<<<<<<
+ *                                              Py_ssize_t *strides, int ndim,
+ *                                              bint inc) noexcept with gil:
+ */
+
+/* GIL-holding entry point for refcount_objects_in_slice().
+ *
+ * When WITH_THREAD is defined, the GIL state is ensured before the call and
+ * released afterwards; all arguments are forwarded unchanged.
+ */
+static void __pyx_memoryview_refcount_objects_in_slice_with_gil(char *__pyx_v_data, Py_ssize_t *__pyx_v_shape, Py_ssize_t *__pyx_v_strides, int __pyx_v_ndim, int __pyx_v_inc) {
+  #ifdef WITH_THREAD
+  PyGILState_STATE __pyx_v_gil_token = __Pyx_PyGILState_Ensure();
+  #endif
+
+  __pyx_memoryview_refcount_objects_in_slice(__pyx_v_data, __pyx_v_shape, __pyx_v_strides, __pyx_v_ndim, __pyx_v_inc);
+
+  #ifdef WITH_THREAD
+  __Pyx_PyGILState_Release(__pyx_v_gil_token);
+  #endif
+}
+
+/* "View.MemoryView":1371
+ * 
+ * @cname('__pyx_memoryview_refcount_objects_in_slice')
+ * cdef void refcount_objects_in_slice(char *data, Py_ssize_t *shape,             # <<<<<<<<<<<<<<
+ *                                     Py_ssize_t *strides, int ndim, bint inc) noexcept:
+ *     cdef Py_ssize_t i
+ */
+
+/* Recursively increment or decrement the refcount of every PyObject* stored
+ * in a strided block.
+ *
+ * data     - address of the first element of the current dimension
+ * shape    - extents, starting at the current dimension
+ * strides  - byte strides, starting at the current dimension
+ * ndim     - number of dimensions still to descend through
+ * inc      - non-zero: Py_INCREF each element; zero: Py_DECREF each element
+ */
+static void __pyx_memoryview_refcount_objects_in_slice(char *__pyx_v_data, Py_ssize_t *__pyx_v_shape, Py_ssize_t *__pyx_v_strides, int __pyx_v_ndim, int __pyx_v_inc) {
+  const Py_ssize_t __pyx_v_step = __pyx_v_strides[0];
+  const Py_ssize_t __pyx_v_count = __pyx_v_shape[0];
+  Py_ssize_t __pyx_v_k;
+
+  if (__pyx_v_ndim != 1) {
+    /* Outer dimension: hand each sub-block to the next level down. */
+    for (__pyx_v_k = 0; __pyx_v_k < __pyx_v_count; ++__pyx_v_k) {
+      __pyx_memoryview_refcount_objects_in_slice(__pyx_v_data, __pyx_v_shape + 1, __pyx_v_strides + 1, __pyx_v_ndim - 1, __pyx_v_inc);
+      __pyx_v_data += __pyx_v_step;
+    }
+    return;
+  }
+
+  /* Innermost dimension: each element slot holds a PyObject pointer. */
+  for (__pyx_v_k = 0; __pyx_v_k < __pyx_v_count; ++__pyx_v_k) {
+    if (__pyx_v_inc) {
+      Py_INCREF((((PyObject **)__pyx_v_data)[0]));
+    } else {
+      Py_DECREF((((PyObject **)__pyx_v_data)[0]));
+    }
+    __pyx_v_data += __pyx_v_step;
+  }
+}
+
+/* "View.MemoryView":1391
+ * 
+ * @cname('__pyx_memoryview_slice_assign_scalar')
+ * cdef void slice_assign_scalar(__Pyx_memviewslice *dst, int ndim,             # <<<<<<<<<<<<<<
+ *                               size_t itemsize, void *item,
+ *                               bint dtype_is_object) noexcept nogil:
+ */
+
+/* Fill every element of `dst` with a copy of the `itemsize` bytes at `item`.
+ *
+ * For object dtypes the elements currently stored are decref'ed before the
+ * fill and the freshly written elements are incref'ed afterwards; for other
+ * dtypes the two refcount calls do nothing.
+ */
+static void __pyx_memoryview_slice_assign_scalar(__Pyx_memviewslice *__pyx_v_dst, int __pyx_v_ndim, size_t __pyx_v_itemsize, void *__pyx_v_item, int __pyx_v_dtype_is_object) {
+  /* Drop the references held by the values about to be overwritten. */
+  __pyx_memoryview_refcount_copying(__pyx_v_dst, __pyx_v_dtype_is_object, __pyx_v_ndim, /* inc = */ 0);
+
+  /* Raw byte-wise fill of the whole slice. */
+  __pyx_memoryview__slice_assign_scalar(
+      __pyx_v_dst->data,
+      __pyx_v_dst->shape,
+      __pyx_v_dst->strides,
+      __pyx_v_ndim,
+      __pyx_v_itemsize,
+      __pyx_v_item);
+
+  /* Take a reference for each newly stored value. */
+  __pyx_memoryview_refcount_copying(__pyx_v_dst, __pyx_v_dtype_is_object, __pyx_v_ndim, /* inc = */ 1);
+}
+
+/* "View.MemoryView":1400
+ * 
+ * @cname('__pyx_memoryview__slice_assign_scalar')
+ * cdef void _slice_assign_scalar(char *data, Py_ssize_t *shape,             # <<<<<<<<<<<<<<
+ *                               Py_ssize_t *strides, int ndim,
+ *                               size_t itemsize, void *item) noexcept nogil:
+ */
+
+/* Recursive worker for slice_assign_scalar(): copy `itemsize` bytes from
+ * `item` into every element of a strided block.
+ *
+ * data     - address of the first element of the current dimension
+ * shape    - extents, starting at the current dimension
+ * strides  - byte strides, starting at the current dimension
+ * ndim     - number of dimensions still to descend through
+ */
+static void __pyx_memoryview__slice_assign_scalar(char *__pyx_v_data, Py_ssize_t *__pyx_v_shape, Py_ssize_t *__pyx_v_strides, int __pyx_v_ndim, size_t __pyx_v_itemsize, void *__pyx_v_item) {
+  const Py_ssize_t __pyx_v_step = __pyx_v_strides[0];
+  const Py_ssize_t __pyx_v_count = __pyx_v_shape[0];
+  Py_ssize_t __pyx_v_k;
+
+  if (__pyx_v_ndim == 1) {
+    /* Innermost dimension: write the scalar into each element slot. */
+    for (__pyx_v_k = 0; __pyx_v_k < __pyx_v_count; ++__pyx_v_k, __pyx_v_data += __pyx_v_step) {
+      (void)(memcpy(__pyx_v_data, __pyx_v_item, __pyx_v_itemsize));
+    }
+  } else {
+    /* Outer dimension: fill each sub-block one level down. */
+    for (__pyx_v_k = 0; __pyx_v_k < __pyx_v_count; ++__pyx_v_k, __pyx_v_data += __pyx_v_step) {
+      __pyx_memoryview__slice_assign_scalar(__pyx_v_data, __pyx_v_shape + 1, __pyx_v_strides + 1, __pyx_v_ndim - 1, __pyx_v_itemsize, __pyx_v_item);
+    }
+  }
+}
+
+/* "(tree fragment)":1
+ * def __pyx_unpickle_Enum(__pyx_type, long __pyx_checksum, __pyx_state):             # <<<<<<<<<<<<<<
+ *     cdef object __pyx_PickleError
+ *     cdef object __pyx_result
+ */
+
+/* Python wrapper: collects __pyx_type, __pyx_checksum and __pyx_state from positional and/or keyword arguments, converts the checksum to a C long, and forwards to the __pyx_pf_ implementation. */
+static PyObject *__pyx_pw_15View_dot_MemoryView_1__pyx_unpickle_Enum(PyObject *__pyx_self, 
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+); /*proto*/
+static PyMethodDef __pyx_mdef_15View_dot_MemoryView_1__pyx_unpickle_Enum = {"__pyx_unpickle_Enum", (PyCFunction)(void*)(__Pyx_PyCFunction_FastCallWithKeywords)__pyx_pw_15View_dot_MemoryView_1__pyx_unpickle_Enum, __Pyx_METH_FASTCALL|METH_KEYWORDS, 0};
+static PyObject *__pyx_pw_15View_dot_MemoryView_1__pyx_unpickle_Enum(PyObject *__pyx_self, 
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+) {
+  PyObject *__pyx_v___pyx_type = 0;
+  long __pyx_v___pyx_checksum;
+  PyObject *__pyx_v___pyx_state = 0;
+  #if !CYTHON_METH_FASTCALL
+  CYTHON_UNUSED Py_ssize_t __pyx_nargs;
+  #endif
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject* values[3] = {0,0,0}; /* [0]=__pyx_type, [1]=__pyx_checksum, [2]=__pyx_state */
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__pyx_unpickle_Enum (wrapper)", 0);
+  #if !CYTHON_METH_FASTCALL
+  #if CYTHON_ASSUME_SAFE_MACROS
+  __pyx_nargs = PyTuple_GET_SIZE(__pyx_args);
+  #else
+  __pyx_nargs = PyTuple_Size(__pyx_args); if (unlikely(__pyx_nargs < 0)) return NULL;
+  #endif
+  #endif
+  __pyx_kwvalues = __Pyx_KwValues_FASTCALL(__pyx_args, __pyx_nargs);
+  {
+    PyObject **__pyx_pyargnames[] = {&__pyx_n_s_pyx_type,&__pyx_n_s_pyx_checksum,&__pyx_n_s_pyx_state,0};
+    if (__pyx_kwds) {
+      Py_ssize_t kw_args;
+      switch (__pyx_nargs) { /* take the positional arguments; cases fall through from the highest index down */
+        case  3: values[2] = __Pyx_Arg_FASTCALL(__pyx_args, 2);
+        CYTHON_FALLTHROUGH;
+        case  2: values[1] = __Pyx_Arg_FASTCALL(__pyx_args, 1);
+        CYTHON_FALLTHROUGH;
+        case  1: values[0] = __Pyx_Arg_FASTCALL(__pyx_args, 0);
+        CYTHON_FALLTHROUGH;
+        case  0: break;
+        default: goto __pyx_L5_argtuple_error;
+      }
+      kw_args = __Pyx_NumKwargs_FASTCALL(__pyx_kwds);
+      switch (__pyx_nargs) { /* look up by keyword every argument that was not supplied positionally */
+        case  0:
+        if (likely((values[0] = __Pyx_GetKwValue_FASTCALL(__pyx_kwds, __pyx_kwvalues, __pyx_n_s_pyx_type)) != 0)) {
+          (void)__Pyx_Arg_NewRef_FASTCALL(values[0]);
+          kw_args--;
+        }
+        else if (unlikely(PyErr_Occurred())) __PYX_ERR(1, 1, __pyx_L3_error)
+        else goto __pyx_L5_argtuple_error;
+        CYTHON_FALLTHROUGH;
+        case  1:
+        if (likely((values[1] = __Pyx_GetKwValue_FASTCALL(__pyx_kwds, __pyx_kwvalues, __pyx_n_s_pyx_checksum)) != 0)) {
+          (void)__Pyx_Arg_NewRef_FASTCALL(values[1]);
+          kw_args--;
+        }
+        else if (unlikely(PyErr_Occurred())) __PYX_ERR(1, 1, __pyx_L3_error)
+        else {
+          __Pyx_RaiseArgtupleInvalid("__pyx_unpickle_Enum", 1, 3, 3, 1); __PYX_ERR(1, 1, __pyx_L3_error)
+        }
+        CYTHON_FALLTHROUGH;
+        case  2:
+        if (likely((values[2] = __Pyx_GetKwValue_FASTCALL(__pyx_kwds, __pyx_kwvalues, __pyx_n_s_pyx_state)) != 0)) {
+          (void)__Pyx_Arg_NewRef_FASTCALL(values[2]);
+          kw_args--;
+        }
+        else if (unlikely(PyErr_Occurred())) __PYX_ERR(1, 1, __pyx_L3_error)
+        else {
+          __Pyx_RaiseArgtupleInvalid("__pyx_unpickle_Enum", 1, 3, 3, 2); __PYX_ERR(1, 1, __pyx_L3_error)
+        }
+      }
+      if (unlikely(kw_args > 0)) { /* keywords remain after the lookups above: hand them to __Pyx_ParseOptionalKeywords */
+        const Py_ssize_t kwd_pos_args = __pyx_nargs;
+        if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_kwvalues, __pyx_pyargnames, 0, values + 0, kwd_pos_args, "__pyx_unpickle_Enum") < 0)) __PYX_ERR(1, 1, __pyx_L3_error)
+      }
+    } else if (unlikely(__pyx_nargs != 3)) { /* no keywords given: exactly three positional arguments are required */
+      goto __pyx_L5_argtuple_error;
+    } else {
+      values[0] = __Pyx_Arg_FASTCALL(__pyx_args, 0);
+      values[1] = __Pyx_Arg_FASTCALL(__pyx_args, 1);
+      values[2] = __Pyx_Arg_FASTCALL(__pyx_args, 2);
+    }
+    __pyx_v___pyx_type = values[0];
+    __pyx_v___pyx_checksum = __Pyx_PyInt_As_long(values[1]); if (unlikely((__pyx_v___pyx_checksum == (long)-1) && PyErr_Occurred())) __PYX_ERR(1, 1, __pyx_L3_error) /* -1 only counts as a failure when an exception is pending */
+    __pyx_v___pyx_state = values[2];
+  }
+  goto __pyx_L6_skip;
+  __pyx_L5_argtuple_error:;
+  __Pyx_RaiseArgtupleInvalid("__pyx_unpickle_Enum", 1, 3, 3, __pyx_nargs); __PYX_ERR(1, 1, __pyx_L3_error)
+  __pyx_L6_skip:;
+  goto __pyx_L4_argument_unpacking_done;
+  __pyx_L3_error:; /* argument-parsing failure: run __Pyx_Arg_XDECREF_FASTCALL over values[], add a traceback entry, return NULL */
+  {
+    Py_ssize_t __pyx_temp;
+    for (__pyx_temp=0; __pyx_temp < (Py_ssize_t)(sizeof(values)/sizeof(values[0])); ++__pyx_temp) {
+      __Pyx_Arg_XDECREF_FASTCALL(values[__pyx_temp]);
+    }
+  }
+  __Pyx_AddTraceback("View.MemoryView.__pyx_unpickle_Enum", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __Pyx_RefNannyFinishContext();
+  return NULL;
+  __pyx_L4_argument_unpacking_done:;
+  __pyx_r = __pyx_pf_15View_dot_MemoryView___pyx_unpickle_Enum(__pyx_self, __pyx_v___pyx_type, __pyx_v___pyx_checksum, __pyx_v___pyx_state);
+
+  /* function exit code: run __Pyx_Arg_XDECREF_FASTCALL over values[] and return the implementation's result */
+  {
+    Py_ssize_t __pyx_temp;
+    for (__pyx_temp=0; __pyx_temp < (Py_ssize_t)(sizeof(values)/sizeof(values[0])); ++__pyx_temp) {
+      __Pyx_Arg_XDECREF_FASTCALL(values[__pyx_temp]);
+    }
+  }
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static PyObject *__pyx_pf_15View_dot_MemoryView___pyx_unpickle_Enum(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v___pyx_type, long __pyx_v___pyx_checksum, PyObject *__pyx_v___pyx_state) {
+  PyObject *__pyx_v___pyx_PickleError = 0;
+  PyObject *__pyx_v___pyx_result = 0;
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  int __pyx_t_2;
+  PyObject *__pyx_t_3 = NULL;
+  PyObject *__pyx_t_4 = NULL;
+  unsigned int __pyx_t_5;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("__pyx_unpickle_Enum", 1);
+  /* Implementation of __pyx_unpickle_Enum: rejects a checksum that is not in __pyx_tuple__8, builds the object by calling the __new__ attribute of __pyx_MemviewEnum_type with __pyx_type, applies __pyx_state through __pyx_unpickle_Enum__set_state when it is not None, and returns a new reference to the result (NULL on error). */
+  /* "(tree fragment)":4
+ *     cdef object __pyx_PickleError
+ *     cdef object __pyx_result
+ *     if __pyx_checksum not in (0x82a3537, 0x6ae9995, 0xb068931):             # <<<<<<<<<<<<<<
+ *         from pickle import PickleError as __pyx_PickleError
+ *         raise __pyx_PickleError, "Incompatible checksums (0x%x vs (0x82a3537, 0x6ae9995, 0xb068931) = (name))" % __pyx_checksum
+ */
+  __pyx_t_1 = __Pyx_PyInt_From_long(__pyx_v___pyx_checksum); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 4, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_t_2 = (__Pyx_PySequence_ContainsTF(__pyx_t_1, __pyx_tuple__8, Py_NE)); if (unlikely((__pyx_t_2 < 0))) __PYX_ERR(1, 4, __pyx_L1_error) /* Py_NE implements the "not in" test; a negative result is an error */
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+  if (__pyx_t_2) {
+
+    /* "(tree fragment)":5
+ *     cdef object __pyx_result
+ *     if __pyx_checksum not in (0x82a3537, 0x6ae9995, 0xb068931):
+ *         from pickle import PickleError as __pyx_PickleError             # <<<<<<<<<<<<<<
+ *         raise __pyx_PickleError, "Incompatible checksums (0x%x vs (0x82a3537, 0x6ae9995, 0xb068931) = (name))" % __pyx_checksum
+ *     __pyx_result = Enum.__new__(__pyx_type)
+ */
+    __pyx_t_1 = PyList_New(1); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 5, __pyx_L1_error) /* one-element "fromlist" passed to __Pyx_Import below */
+    __Pyx_GOTREF(__pyx_t_1);
+    __Pyx_INCREF(__pyx_n_s_PickleError);
+    __Pyx_GIVEREF(__pyx_n_s_PickleError);
+    if (__Pyx_PyList_SET_ITEM(__pyx_t_1, 0, __pyx_n_s_PickleError)) __PYX_ERR(1, 5, __pyx_L1_error);
+    __pyx_t_3 = __Pyx_Import(__pyx_n_s_pickle, __pyx_t_1, 0); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 5, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_3);
+    __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+    __pyx_t_1 = __Pyx_ImportFrom(__pyx_t_3, __pyx_n_s_PickleError); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 5, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_1);
+    __Pyx_INCREF(__pyx_t_1); /* the local variable takes its own reference; the temporary's reference is dropped two lines below */
+    __pyx_v___pyx_PickleError = __pyx_t_1;
+    __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+    __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+
+    /* "(tree fragment)":6
+ *     if __pyx_checksum not in (0x82a3537, 0x6ae9995, 0xb068931):
+ *         from pickle import PickleError as __pyx_PickleError
+ *         raise __pyx_PickleError, "Incompatible checksums (0x%x vs (0x82a3537, 0x6ae9995, 0xb068931) = (name))" % __pyx_checksum             # <<<<<<<<<<<<<<
+ *     __pyx_result = Enum.__new__(__pyx_type)
+ *     if __pyx_state is not None:
+ */
+    __pyx_t_3 = __Pyx_PyInt_From_long(__pyx_v___pyx_checksum); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 6, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_3);
+    __pyx_t_1 = __Pyx_PyString_Format(__pyx_kp_s_Incompatible_checksums_0x_x_vs_0, __pyx_t_3); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 6, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_1);
+    __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+    __Pyx_Raise(__pyx_v___pyx_PickleError, __pyx_t_1, 0, 0);
+    __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+    __PYX_ERR(1, 6, __pyx_L1_error) /* unconditional: after the raise, control leaves through the error exit */
+
+    /* "(tree fragment)":4
+ *     cdef object __pyx_PickleError
+ *     cdef object __pyx_result
+ *     if __pyx_checksum not in (0x82a3537, 0x6ae9995, 0xb068931):             # <<<<<<<<<<<<<<
+ *         from pickle import PickleError as __pyx_PickleError
+ *         raise __pyx_PickleError, "Incompatible checksums (0x%x vs (0x82a3537, 0x6ae9995, 0xb068931) = (name))" % __pyx_checksum
+ */
+  }
+
+  /* "(tree fragment)":7
+ *         from pickle import PickleError as __pyx_PickleError
+ *         raise __pyx_PickleError, "Incompatible checksums (0x%x vs (0x82a3537, 0x6ae9995, 0xb068931) = (name))" % __pyx_checksum
+ *     __pyx_result = Enum.__new__(__pyx_type)             # <<<<<<<<<<<<<<
+ *     if __pyx_state is not None:
+ *         __pyx_unpickle_Enum__set_state(<Enum> __pyx_result, __pyx_state)
+ */
+  __pyx_t_3 = __Pyx_PyObject_GetAttrStr(((PyObject *)__pyx_MemviewEnum_type), __pyx_n_s_new); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 7, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_3);
+  __pyx_t_4 = NULL;
+  __pyx_t_5 = 0;
+  #if CYTHON_UNPACK_METHODS
+  if (likely(PyMethod_Check(__pyx_t_3))) { /* bound method: split it into its self object (__pyx_t_4) and the underlying function */
+    __pyx_t_4 = PyMethod_GET_SELF(__pyx_t_3);
+    if (likely(__pyx_t_4)) {
+      PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_3);
+      __Pyx_INCREF(__pyx_t_4);
+      __Pyx_INCREF(function);
+      __Pyx_DECREF_SET(__pyx_t_3, function);
+      __pyx_t_5 = 1;
+    }
+  }
+  #endif
+  {
+    PyObject *__pyx_callargs[2] = {__pyx_t_4, __pyx_v___pyx_type};
+    __pyx_t_1 = __Pyx_PyObject_FastCall(__pyx_t_3, __pyx_callargs+1-__pyx_t_5, 1+__pyx_t_5); /* slot 0 (self) is included only when __pyx_t_5 == 1 */
+    __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0;
+    if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 7, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_1);
+    __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+  }
+  __pyx_v___pyx_result = __pyx_t_1;
+  __pyx_t_1 = 0;
+
+  /* "(tree fragment)":8
+ *         raise __pyx_PickleError, "Incompatible checksums (0x%x vs (0x82a3537, 0x6ae9995, 0xb068931) = (name))" % __pyx_checksum
+ *     __pyx_result = Enum.__new__(__pyx_type)
+ *     if __pyx_state is not None:             # <<<<<<<<<<<<<<
+ *         __pyx_unpickle_Enum__set_state(<Enum> __pyx_result, __pyx_state)
+ *     return __pyx_result
+ */
+  __pyx_t_2 = (__pyx_v___pyx_state != Py_None);
+  if (__pyx_t_2) {
+
+    /* "(tree fragment)":9
+ *     __pyx_result = Enum.__new__(__pyx_type)
+ *     if __pyx_state is not None:
+ *         __pyx_unpickle_Enum__set_state(<Enum> __pyx_result, __pyx_state)             # <<<<<<<<<<<<<<
+ *     return __pyx_result
+ * cdef __pyx_unpickle_Enum__set_state(Enum __pyx_result, tuple __pyx_state):
+ */
+    if (!(likely(PyTuple_CheckExact(__pyx_v___pyx_state))||((__pyx_v___pyx_state) == Py_None) || __Pyx_RaiseUnexpectedTypeError("tuple", __pyx_v___pyx_state))) __PYX_ERR(1, 9, __pyx_L1_error) /* state must be an exact tuple; the None alternative cannot occur inside this branch */
+    __pyx_t_1 = __pyx_unpickle_Enum__set_state(((struct __pyx_MemviewEnum_obj *)__pyx_v___pyx_result), ((PyObject*)__pyx_v___pyx_state)); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 9, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_1);
+    __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+
+    /* "(tree fragment)":8
+ *         raise __pyx_PickleError, "Incompatible checksums (0x%x vs (0x82a3537, 0x6ae9995, 0xb068931) = (name))" % __pyx_checksum
+ *     __pyx_result = Enum.__new__(__pyx_type)
+ *     if __pyx_state is not None:             # <<<<<<<<<<<<<<
+ *         __pyx_unpickle_Enum__set_state(<Enum> __pyx_result, __pyx_state)
+ *     return __pyx_result
+ */
+  }
+
+  /* "(tree fragment)":10
+ *     if __pyx_state is not None:
+ *         __pyx_unpickle_Enum__set_state(<Enum> __pyx_result, __pyx_state)
+ *     return __pyx_result             # <<<<<<<<<<<<<<
+ * cdef __pyx_unpickle_Enum__set_state(Enum __pyx_result, tuple __pyx_state):
+ *     __pyx_result.name = __pyx_state[0]
+ */
+  __Pyx_XDECREF(__pyx_r);
+  __Pyx_INCREF(__pyx_v___pyx_result); /* the return value gets its own reference; the local's reference is released at __pyx_L0 */
+  __pyx_r = __pyx_v___pyx_result;
+  goto __pyx_L0;
+
+  /* "(tree fragment)":1
+ * def __pyx_unpickle_Enum(__pyx_type, long __pyx_checksum, __pyx_state):             # <<<<<<<<<<<<<<
+ *     cdef object __pyx_PickleError
+ *     cdef object __pyx_result
+ */
+
+  /* function exit code */
+  __pyx_L1_error:; /* error exit: drop any live temporaries, add a traceback entry, return NULL */
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_XDECREF(__pyx_t_3);
+  __Pyx_XDECREF(__pyx_t_4);
+  __Pyx_AddTraceback("View.MemoryView.__pyx_unpickle_Enum", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __pyx_L0:; /* common exit for success and error: release the local variables */
+  __Pyx_XDECREF(__pyx_v___pyx_PickleError);
+  __Pyx_XDECREF(__pyx_v___pyx_result);
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "(tree fragment)":11
+ *         __pyx_unpickle_Enum__set_state(<Enum> __pyx_result, __pyx_state)
+ *     return __pyx_result
+ * cdef __pyx_unpickle_Enum__set_state(Enum __pyx_result, tuple __pyx_state):             # <<<<<<<<<<<<<<
+ *     __pyx_result.name = __pyx_state[0]
+ *     if len(__pyx_state) > 1 and hasattr(__pyx_result, '__dict__'):
+ */
+
+static PyObject *__pyx_unpickle_Enum__set_state(struct __pyx_MemviewEnum_obj *__pyx_v___pyx_result, PyObject *__pyx_v___pyx_state) {
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  int __pyx_t_2;
+  Py_ssize_t __pyx_t_3;
+  int __pyx_t_4;
+  PyObject *__pyx_t_5 = NULL;
+  PyObject *__pyx_t_6 = NULL;
+  PyObject *__pyx_t_7 = NULL;
+  unsigned int __pyx_t_8;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("__pyx_unpickle_Enum__set_state", 1);
+  /* Stores __pyx_state[0] in __pyx_result->name; when the state tuple has more than one item and the object has a __dict__ attribute, also calls __dict__.update(__pyx_state[1]). Returns a new reference to None, or 0 on error. */
+  /* "(tree fragment)":12
+ *     return __pyx_result
+ * cdef __pyx_unpickle_Enum__set_state(Enum __pyx_result, tuple __pyx_state):
+ *     __pyx_result.name = __pyx_state[0]             # <<<<<<<<<<<<<<
+ *     if len(__pyx_state) > 1 and hasattr(__pyx_result, '__dict__'):
+ *         __pyx_result.__dict__.update(__pyx_state[1])
+ */
+  if (unlikely(__pyx_v___pyx_state == Py_None)) { /* a tuple-typed argument can still be None at runtime: raise TypeError instead of indexing it */
+    PyErr_SetString(PyExc_TypeError, "'NoneType' object is not subscriptable");
+    __PYX_ERR(1, 12, __pyx_L1_error)
+  }
+  __pyx_t_1 = __Pyx_GetItemInt_Tuple(__pyx_v___pyx_state, 0, long, 1, __Pyx_PyInt_From_long, 0, 0, 1); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 12, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __Pyx_GIVEREF(__pyx_t_1);
+  __Pyx_GOTREF(__pyx_v___pyx_result->name);
+  __Pyx_DECREF(__pyx_v___pyx_result->name); /* release the previous name before the field is overwritten */
+  __pyx_v___pyx_result->name = __pyx_t_1;
+  __pyx_t_1 = 0;
+
+  /* "(tree fragment)":13
+ * cdef __pyx_unpickle_Enum__set_state(Enum __pyx_result, tuple __pyx_state):
+ *     __pyx_result.name = __pyx_state[0]
+ *     if len(__pyx_state) > 1 and hasattr(__pyx_result, '__dict__'):             # <<<<<<<<<<<<<<
+ *         __pyx_result.__dict__.update(__pyx_state[1])
+ */
+  if (unlikely(__pyx_v___pyx_state == Py_None)) {
+    PyErr_SetString(PyExc_TypeError, "object of type 'NoneType' has no len()");
+    __PYX_ERR(1, 13, __pyx_L1_error)
+  }
+  __pyx_t_3 = __Pyx_PyTuple_GET_SIZE(__pyx_v___pyx_state); if (unlikely(__pyx_t_3 == ((Py_ssize_t)-1))) __PYX_ERR(1, 13, __pyx_L1_error)
+  __pyx_t_4 = (__pyx_t_3 > 1);
+  if (__pyx_t_4) { /* short-circuit "and": the hasattr test below runs only when len(state) > 1 */
+  } else {
+    __pyx_t_2 = __pyx_t_4;
+    goto __pyx_L4_bool_binop_done;
+  }
+  __pyx_t_4 = __Pyx_HasAttr(((PyObject *)__pyx_v___pyx_result), __pyx_n_s_dict); if (unlikely(__pyx_t_4 == ((int)-1))) __PYX_ERR(1, 13, __pyx_L1_error)
+  __pyx_t_2 = __pyx_t_4;
+  __pyx_L4_bool_binop_done:;
+  if (__pyx_t_2) {
+
+    /* "(tree fragment)":14
+ *     __pyx_result.name = __pyx_state[0]
+ *     if len(__pyx_state) > 1 and hasattr(__pyx_result, '__dict__'):
+ *         __pyx_result.__dict__.update(__pyx_state[1])             # <<<<<<<<<<<<<<
+ */
+    __pyx_t_5 = __Pyx_PyObject_GetAttrStr(((PyObject *)__pyx_v___pyx_result), __pyx_n_s_dict); if (unlikely(!__pyx_t_5)) __PYX_ERR(1, 14, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_5);
+    __pyx_t_6 = __Pyx_PyObject_GetAttrStr(__pyx_t_5, __pyx_n_s_update); if (unlikely(!__pyx_t_6)) __PYX_ERR(1, 14, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_6);
+    __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
+    if (unlikely(__pyx_v___pyx_state == Py_None)) {
+      PyErr_SetString(PyExc_TypeError, "'NoneType' object is not subscriptable");
+      __PYX_ERR(1, 14, __pyx_L1_error)
+    }
+    __pyx_t_5 = __Pyx_GetItemInt_Tuple(__pyx_v___pyx_state, 1, long, 1, __Pyx_PyInt_From_long, 0, 0, 1); if (unlikely(!__pyx_t_5)) __PYX_ERR(1, 14, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_5);
+    __pyx_t_7 = NULL;
+    __pyx_t_8 = 0;
+    #if CYTHON_UNPACK_METHODS
+    if (likely(PyMethod_Check(__pyx_t_6))) { /* bound method: split it into its self object (__pyx_t_7) and the underlying function */
+      __pyx_t_7 = PyMethod_GET_SELF(__pyx_t_6);
+      if (likely(__pyx_t_7)) {
+        PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_6);
+        __Pyx_INCREF(__pyx_t_7);
+        __Pyx_INCREF(function);
+        __Pyx_DECREF_SET(__pyx_t_6, function);
+        __pyx_t_8 = 1;
+      }
+    }
+    #endif
+    {
+      PyObject *__pyx_callargs[2] = {__pyx_t_7, __pyx_t_5};
+      __pyx_t_1 = __Pyx_PyObject_FastCall(__pyx_t_6, __pyx_callargs+1-__pyx_t_8, 1+__pyx_t_8); /* slot 0 (self) is included only when __pyx_t_8 == 1 */
+      __Pyx_XDECREF(__pyx_t_7); __pyx_t_7 = 0;
+      __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
+      if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 14, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_1);
+      __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0;
+    }
+    __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+
+    /* "(tree fragment)":13
+ * cdef __pyx_unpickle_Enum__set_state(Enum __pyx_result, tuple __pyx_state):
+ *     __pyx_result.name = __pyx_state[0]
+ *     if len(__pyx_state) > 1 and hasattr(__pyx_result, '__dict__'):             # <<<<<<<<<<<<<<
+ *         __pyx_result.__dict__.update(__pyx_state[1])
+ */
+  }
+
+  /* "(tree fragment)":11
+ *         __pyx_unpickle_Enum__set_state(<Enum> __pyx_result, __pyx_state)
+ *     return __pyx_result
+ * cdef __pyx_unpickle_Enum__set_state(Enum __pyx_result, tuple __pyx_state):             # <<<<<<<<<<<<<<
+ *     __pyx_result.name = __pyx_state[0]
+ *     if len(__pyx_state) > 1 and hasattr(__pyx_result, '__dict__'):
+ */
+
+  /* function exit code */
+  __pyx_r = Py_None; __Pyx_INCREF(Py_None); /* success: the function has no explicit return value, so it returns None */
+  goto __pyx_L0;
+  __pyx_L1_error:; /* error exit: drop any live temporaries, add a traceback entry, return 0 */
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_XDECREF(__pyx_t_5);
+  __Pyx_XDECREF(__pyx_t_6);
+  __Pyx_XDECREF(__pyx_t_7);
+  __Pyx_AddTraceback("View.MemoryView.__pyx_unpickle_Enum__set_state", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = 0;
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":287
+ * 
+ *         @property
+ *         cdef inline npy_intp itemsize(self) noexcept nogil:             # <<<<<<<<<<<<<<
+ *             return PyDataType_ELSIZE(self)
+ * 
+ */
+
+static CYTHON_INLINE npy_intp __pyx_f_5numpy_5dtype_8itemsize_itemsize(PyArray_Descr *__pyx_v_self) {
+  npy_intp __pyx_r;
+  /* Inline getter for dtype.itemsize: returns PyDataType_ELSIZE(__pyx_v_self) as npy_intp. The body has no error exit. */
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":288
+ *         @property
+ *         cdef inline npy_intp itemsize(self) noexcept nogil:
+ *             return PyDataType_ELSIZE(self)             # <<<<<<<<<<<<<<
+ * 
+ *         @property
+ */
+  __pyx_r = PyDataType_ELSIZE(__pyx_v_self);
+  goto __pyx_L0;
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":287
+ * 
+ *         @property
+ *         cdef inline npy_intp itemsize(self) noexcept nogil:             # <<<<<<<<<<<<<<
+ *             return PyDataType_ELSIZE(self)
+ * 
+ */
+
+  /* function exit code */
+  __pyx_L0:;
+  return __pyx_r;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":291
+ * 
+ *         @property
+ *         cdef inline npy_intp alignment(self) noexcept nogil:             # <<<<<<<<<<<<<<
+ *             return PyDataType_ALIGNMENT(self)
+ * 
+ */
+
+static CYTHON_INLINE npy_intp __pyx_f_5numpy_5dtype_9alignment_alignment(PyArray_Descr *__pyx_v_self) {
+  npy_intp __pyx_r;
+  /* Inline getter for dtype.alignment: returns PyDataType_ALIGNMENT(__pyx_v_self) as npy_intp. The body has no error exit. */
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":292
+ *         @property
+ *         cdef inline npy_intp alignment(self) noexcept nogil:
+ *             return PyDataType_ALIGNMENT(self)             # <<<<<<<<<<<<<<
+ * 
+ *         # Use fields/names with care as they may be NULL.  You must check
+ */
+  __pyx_r = PyDataType_ALIGNMENT(__pyx_v_self);
+  goto __pyx_L0;
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":291
+ * 
+ *         @property
+ *         cdef inline npy_intp alignment(self) noexcept nogil:             # <<<<<<<<<<<<<<
+ *             return PyDataType_ALIGNMENT(self)
+ * 
+ */
+
+  /* function exit code */
+  __pyx_L0:;
+  return __pyx_r;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":297
+ *         # for this using PyDataType_HASFIELDS.
+ *         @property
+ *         cdef inline object fields(self):             # <<<<<<<<<<<<<<
+ *             return <object>PyDataType_FIELDS(self)
+ * 
+ */
+
+static CYTHON_INLINE PyObject *__pyx_f_5numpy_5dtype_6fields_fields(PyArray_Descr *__pyx_v_self) {
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1;
+  __Pyx_RefNannySetupContext("fields", 1);
+  /* Inline getter for dtype.fields: returns the pointer from PyDataType_FIELDS(__pyx_v_self) after taking a reference on it. NOTE(review): the .pxd comment says this value may be NULL and there is no NULL check before the INCREF below; callers presumably must test PyDataType_HASFIELDS first - confirm. */
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":298
+ *         @property
+ *         cdef inline object fields(self):
+ *             return <object>PyDataType_FIELDS(self)             # <<<<<<<<<<<<<<
+ * 
+ *         @property
+ */
+  __Pyx_XDECREF(__pyx_r);
+  __pyx_t_1 = PyDataType_FIELDS(__pyx_v_self);
+  __Pyx_INCREF(((PyObject *)__pyx_t_1)); /* the caller receives its own reference to the object */
+  __pyx_r = ((PyObject *)__pyx_t_1);
+  goto __pyx_L0;
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":297
+ *         # for this using PyDataType_HASFIELDS.
+ *         @property
+ *         cdef inline object fields(self):             # <<<<<<<<<<<<<<
+ *             return <object>PyDataType_FIELDS(self)
+ * 
+ */
+
+  /* function exit code */
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":301
+ * 
+ *         @property
+ *         cdef inline tuple names(self):             # <<<<<<<<<<<<<<
+ *             return <tuple>PyDataType_NAMES(self)
+ * 
+ */
+
+static CYTHON_INLINE PyObject *__pyx_f_5numpy_5dtype_5names_names(PyArray_Descr *__pyx_v_self) {
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1;
+  __Pyx_RefNannySetupContext("names", 1);
+  /* Inline getter for dtype.names: returns the pointer from PyDataType_NAMES(__pyx_v_self) after taking a reference on it. NOTE(review): the .pxd comment says fields/names may be NULL and there is no NULL check before the INCREF below; callers presumably must test PyDataType_HASFIELDS first - confirm. */
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":302
+ *         @property
+ *         cdef inline tuple names(self):
+ *             return <tuple>PyDataType_NAMES(self)             # <<<<<<<<<<<<<<
+ * 
+ *         # Use PyDataType_HASSUBARRAY to test whether this field is
+ */
+  __Pyx_XDECREF(__pyx_r);
+  __pyx_t_1 = PyDataType_NAMES(__pyx_v_self);
+  __Pyx_INCREF(((PyObject*)__pyx_t_1)); /* the caller receives its own reference to the object */
+  __pyx_r = ((PyObject*)__pyx_t_1);
+  goto __pyx_L0;
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":301
+ * 
+ *         @property
+ *         cdef inline tuple names(self):             # <<<<<<<<<<<<<<
+ *             return <tuple>PyDataType_NAMES(self)
+ * 
+ */
+
+  /* function exit code */
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":308
+ *         # this field via the inline helper method PyDataType_SHAPE.
+ *         @property
+ *         cdef inline PyArray_ArrayDescr* subarray(self) noexcept nogil:             # <<<<<<<<<<<<<<
+ *             return PyDataType_SUBARRAY(self)
+ * 
+ */
+
+static CYTHON_INLINE PyArray_ArrayDescr *__pyx_f_5numpy_5dtype_8subarray_subarray(PyArray_Descr *__pyx_v_self) {
+  PyArray_ArrayDescr *__pyx_r;
+  /* Inline getter for dtype.subarray: returns the PyArray_ArrayDescr pointer given by PyDataType_SUBARRAY(__pyx_v_self). The body has no error exit and does no reference counting. */
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":309
+ *         @property
+ *         cdef inline PyArray_ArrayDescr* subarray(self) noexcept nogil:
+ *             return PyDataType_SUBARRAY(self)             # <<<<<<<<<<<<<<
+ * 
+ *         @property
+ */
+  __pyx_r = PyDataType_SUBARRAY(__pyx_v_self);
+  goto __pyx_L0;
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":308
+ *         # this field via the inline helper method PyDataType_SHAPE.
+ *         @property
+ *         cdef inline PyArray_ArrayDescr* subarray(self) noexcept nogil:             # <<<<<<<<<<<<<<
+ *             return PyDataType_SUBARRAY(self)
+ * 
+ */
+
+  /* function exit code */
+  __pyx_L0:;
+  return __pyx_r;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":312
+ * 
+ *         @property
+ *         cdef inline npy_uint64 flags(self) noexcept nogil:             # <<<<<<<<<<<<<<
+ *             """The data types flags."""
+ *             return PyDataType_FLAGS(self)
+ */
+
+static CYTHON_INLINE npy_uint64 __pyx_f_5numpy_5dtype_5flags_flags(PyArray_Descr *__pyx_v_self) {
+  npy_uint64 __pyx_r;
+  /* Inline getter for dtype.flags: returns PyDataType_FLAGS(__pyx_v_self) as npy_uint64. The body has no error exit. */
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":314
+ *         cdef inline npy_uint64 flags(self) noexcept nogil:
+ *             """The data types flags."""
+ *             return PyDataType_FLAGS(self)             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  __pyx_r = PyDataType_FLAGS(__pyx_v_self);
+  goto __pyx_L0;
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":312
+ * 
+ *         @property
+ *         cdef inline npy_uint64 flags(self) noexcept nogil:             # <<<<<<<<<<<<<<
+ *             """The data types flags."""
+ *             return PyDataType_FLAGS(self)
+ */
+
+  /* function exit code */
+  __pyx_L0:;
+  return __pyx_r;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":324
+ * 
+ *         @property
+ *         cdef inline int numiter(self) noexcept nogil:             # <<<<<<<<<<<<<<
+ *             """The number of arrays that need to be broadcast to the same shape."""
+ *             return PyArray_MultiIter_NUMITER(self)
+ */
+
+static CYTHON_INLINE int __pyx_f_5numpy_9broadcast_7numiter_numiter(PyArrayMultiIterObject *__pyx_v_self) {
+  int __pyx_r;
+  /* Inline getter for broadcast.numiter: returns PyArray_MultiIter_NUMITER(__pyx_v_self) as int. The body has no error exit. */
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":326
+ *         cdef inline int numiter(self) noexcept nogil:
+ *             """The number of arrays that need to be broadcast to the same shape."""
+ *             return PyArray_MultiIter_NUMITER(self)             # <<<<<<<<<<<<<<
+ * 
+ *         @property
+ */
+  __pyx_r = PyArray_MultiIter_NUMITER(__pyx_v_self);
+  goto __pyx_L0;
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":324
+ * 
+ *         @property
+ *         cdef inline int numiter(self) noexcept nogil:             # <<<<<<<<<<<<<<
+ *             """The number of arrays that need to be broadcast to the same shape."""
+ *             return PyArray_MultiIter_NUMITER(self)
+ */
+
+  /* function exit code */
+  __pyx_L0:;
+  return __pyx_r;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":329
+ * 
+ *         @property
+ *         cdef inline npy_intp size(self) noexcept nogil:             # <<<<<<<<<<<<<<
+ *             """The total broadcasted size."""
+ *             return PyArray_MultiIter_SIZE(self)
+ */
+
+static CYTHON_INLINE npy_intp __pyx_f_5numpy_9broadcast_4size_size(PyArrayMultiIterObject *__pyx_v_self) {
+  npy_intp __pyx_r;
+  /* Inline getter for broadcast.size: returns PyArray_MultiIter_SIZE(__pyx_v_self) as npy_intp. The body has no error exit. */
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":331
+ *         cdef inline npy_intp size(self) noexcept nogil:
+ *             """The total broadcasted size."""
+ *             return PyArray_MultiIter_SIZE(self)             # <<<<<<<<<<<<<<
+ * 
+ *         @property
+ */
+  __pyx_r = PyArray_MultiIter_SIZE(__pyx_v_self);
+  goto __pyx_L0;
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":329
+ * 
+ *         @property
+ *         cdef inline npy_intp size(self) noexcept nogil:             # <<<<<<<<<<<<<<
+ *             """The total broadcasted size."""
+ *             return PyArray_MultiIter_SIZE(self)
+ */
+
+  /* function exit code */
+  __pyx_L0:;
+  return __pyx_r;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":334
+ * 
+ *         @property
+ *         cdef inline npy_intp index(self) noexcept nogil:             # <<<<<<<<<<<<<<
+ *             """The current (1-d) index into the broadcasted result."""
+ *             return PyArray_MultiIter_INDEX(self)
+ */
+
+static CYTHON_INLINE npy_intp __pyx_f_5numpy_9broadcast_5index_index(PyArrayMultiIterObject *__pyx_v_self) {
+  npy_intp __pyx_r;
+  /* Inline getter for broadcast.index: returns PyArray_MultiIter_INDEX(__pyx_v_self) as npy_intp. The body has no error exit. */
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":336
+ *         cdef inline npy_intp index(self) noexcept nogil:
+ *             """The current (1-d) index into the broadcasted result."""
+ *             return PyArray_MultiIter_INDEX(self)             # <<<<<<<<<<<<<<
+ * 
+ *         @property
+ */
+  __pyx_r = PyArray_MultiIter_INDEX(__pyx_v_self);
+  goto __pyx_L0;
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":334
+ * 
+ *         @property
+ *         cdef inline npy_intp index(self) noexcept nogil:             # <<<<<<<<<<<<<<
+ *             """The current (1-d) index into the broadcasted result."""
+ *             return PyArray_MultiIter_INDEX(self)
+ */
+
+  /* function exit code */
+  __pyx_L0:;
+  return __pyx_r;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":339
+ * Generated C for broadcast.nd: returns PyArray_MultiIter_NDIM(self) as a C int; there is no error path.
+ *         @property
+ *         cdef inline int nd(self) noexcept nogil:             # <<<<<<<<<<<<<<
+ *             """The number of dimensions in the broadcasted result."""
+ *             return PyArray_MultiIter_NDIM(self)
+ */
+
+static CYTHON_INLINE int __pyx_f_5numpy_9broadcast_2nd_nd(PyArrayMultiIterObject *__pyx_v_self) {
+  int __pyx_r;
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":341
+ *         cdef inline int nd(self) noexcept nogil:
+ *             """The number of dimensions in the broadcasted result."""
+ *             return PyArray_MultiIter_NDIM(self)             # <<<<<<<<<<<<<<
+ * 
+ *         @property
+ */
+  __pyx_r = PyArray_MultiIter_NDIM(__pyx_v_self);
+  goto __pyx_L0;
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":339
+ * 
+ *         @property
+ *         cdef inline int nd(self) noexcept nogil:             # <<<<<<<<<<<<<<
+ *             """The number of dimensions in the broadcasted result."""
+ *             return PyArray_MultiIter_NDIM(self)
+ */
+
+  /* function exit code: only one exit path; no temporaries or references to release */
+  __pyx_L0:;
+  return __pyx_r;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":344
+ * Generated C for broadcast.dimensions: returns the npy_intp* produced by PyArray_MultiIter_DIMS(self); there is no error path.
+ *         @property
+ *         cdef inline npy_intp* dimensions(self) noexcept nogil:             # <<<<<<<<<<<<<<
+ *             """The shape of the broadcasted result."""
+ *             return PyArray_MultiIter_DIMS(self)
+ */
+
+static CYTHON_INLINE npy_intp *__pyx_f_5numpy_9broadcast_10dimensions_dimensions(PyArrayMultiIterObject *__pyx_v_self) {
+  npy_intp *__pyx_r;
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":346
+ *         cdef inline npy_intp* dimensions(self) noexcept nogil:
+ *             """The shape of the broadcasted result."""
+ *             return PyArray_MultiIter_DIMS(self)             # <<<<<<<<<<<<<<
+ * 
+ *         @property
+ */
+  __pyx_r = PyArray_MultiIter_DIMS(__pyx_v_self);
+  goto __pyx_L0;
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":344
+ * 
+ *         @property
+ *         cdef inline npy_intp* dimensions(self) noexcept nogil:             # <<<<<<<<<<<<<<
+ *             """The shape of the broadcasted result."""
+ *             return PyArray_MultiIter_DIMS(self)
+ */
+
+  /* function exit code: only one exit path; no temporaries or references to release */
+  __pyx_L0:;
+  return __pyx_r;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":349
+ * Generated C for broadcast.iters: returns the void** produced by PyArray_MultiIter_ITERS(self); there is no error path.
+ *         @property
+ *         cdef inline void** iters(self) noexcept nogil:             # <<<<<<<<<<<<<<
+ *             """An array of iterator objects that holds the iterators for the arrays to be broadcast together.
+ *             On return, the iterators are adjusted for broadcasting."""
+ */
+
+static CYTHON_INLINE void **__pyx_f_5numpy_9broadcast_5iters_iters(PyArrayMultiIterObject *__pyx_v_self) {
+  void **__pyx_r;
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":352
+ *             """An array of iterator objects that holds the iterators for the arrays to be broadcast together.
+ *             On return, the iterators are adjusted for broadcasting."""
+ *             return PyArray_MultiIter_ITERS(self)             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  __pyx_r = PyArray_MultiIter_ITERS(__pyx_v_self);
+  goto __pyx_L0;
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":349
+ * 
+ *         @property
+ *         cdef inline void** iters(self) noexcept nogil:             # <<<<<<<<<<<<<<
+ *             """An array of iterator objects that holds the iterators for the arrays to be broadcast together.
+ *             On return, the iterators are adjusted for broadcasting."""
+ */
+
+  /* function exit code: only one exit path; no temporaries or references to release */
+  __pyx_L0:;
+  return __pyx_r;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":367
+ * Generated C for ndarray.base: returns the PyObject* from PyArray_BASE(self) as-is; this function does not INCREF it.
+ *         @property
+ *         cdef inline PyObject* base(self) noexcept nogil:             # <<<<<<<<<<<<<<
+ *             """Returns a borrowed reference to the object owning the data/memory.
+ *             """
+ */
+
+static CYTHON_INLINE PyObject *__pyx_f_5numpy_7ndarray_4base_base(PyArrayObject *__pyx_v_self) {
+  PyObject *__pyx_r;
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":370
+ *             """Returns a borrowed reference to the object owning the data/memory.
+ *             """
+ *             return PyArray_BASE(self)             # <<<<<<<<<<<<<<
+ * 
+ *         @property
+ */
+  __pyx_r = PyArray_BASE(__pyx_v_self);
+  goto __pyx_L0;
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":367
+ * 
+ *         @property
+ *         cdef inline PyObject* base(self) noexcept nogil:             # <<<<<<<<<<<<<<
+ *             """Returns a borrowed reference to the object owning the data/memory.
+ *             """
+ */
+
+  /* function exit code: only one exit path; the pointer is returned without any refcount change */
+  __pyx_L0:;
+  return __pyx_r;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":373
+ * Generated C for ndarray.descr: reads PyArray_DESCR(self), INCREFs it, and returns it, so the caller receives its own reference.
+ *         @property
+ *         cdef inline dtype descr(self):             # <<<<<<<<<<<<<<
+ *             """Returns an owned reference to the dtype of the array.
+ *             """
+ */
+
+static CYTHON_INLINE PyArray_Descr *__pyx_f_5numpy_7ndarray_5descr_descr(PyArrayObject *__pyx_v_self) {
+  PyArray_Descr *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyArray_Descr *__pyx_t_1;
+  __Pyx_RefNannySetupContext("descr", 1);
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":376
+ *             """Returns an owned reference to the dtype of the array.
+ *             """
+ *             return <dtype>PyArray_DESCR(self)             # <<<<<<<<<<<<<<
+ * (__pyx_r is still NULL at this point, so the XDECREF emitted below has no object to release.)
+ *         @property
+ */
+  __Pyx_XDECREF((PyObject *)__pyx_r);
+  __pyx_t_1 = PyArray_DESCR(__pyx_v_self);
+  __Pyx_INCREF((PyObject *)((PyArray_Descr *)__pyx_t_1));
+  __pyx_r = ((PyArray_Descr *)__pyx_t_1);
+  goto __pyx_L0;
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":373
+ * 
+ *         @property
+ *         cdef inline dtype descr(self):             # <<<<<<<<<<<<<<
+ *             """Returns an owned reference to the dtype of the array.
+ *             """
+ */
+
+  /* function exit code: mark the result as given to the caller (XGIVEREF) and close the RefNanny context */
+  __pyx_L0:;
+  __Pyx_XGIVEREF((PyObject *)__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":379
+ * Generated C for ndarray.ndim: returns PyArray_NDIM(self) as a C int; there is no error path.
+ *         @property
+ *         cdef inline int ndim(self) noexcept nogil:             # <<<<<<<<<<<<<<
+ *             """Returns the number of dimensions in the array.
+ *             """
+ */
+
+static CYTHON_INLINE int __pyx_f_5numpy_7ndarray_4ndim_ndim(PyArrayObject *__pyx_v_self) {
+  int __pyx_r;
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":382
+ *             """Returns the number of dimensions in the array.
+ *             """
+ *             return PyArray_NDIM(self)             # <<<<<<<<<<<<<<
+ * 
+ *         @property
+ */
+  __pyx_r = PyArray_NDIM(__pyx_v_self);
+  goto __pyx_L0;
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":379
+ * 
+ *         @property
+ *         cdef inline int ndim(self) noexcept nogil:             # <<<<<<<<<<<<<<
+ *             """Returns the number of dimensions in the array.
+ *             """
+ */
+
+  /* function exit code: only one exit path; no temporaries or references to release */
+  __pyx_L0:;
+  return __pyx_r;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":385
+ * Generated C for ndarray.shape: returns the npy_intp* produced by PyArray_DIMS(self); there is no error path.
+ *         @property
+ *         cdef inline npy_intp *shape(self) noexcept nogil:             # <<<<<<<<<<<<<<
+ *             """Returns a pointer to the dimensions/shape of the array.
+ *             The number of elements matches the number of dimensions of the array (ndim).
+ */
+
+static CYTHON_INLINE npy_intp *__pyx_f_5numpy_7ndarray_5shape_shape(PyArrayObject *__pyx_v_self) {
+  npy_intp *__pyx_r;
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":390
+ *             Can return NULL for 0-dimensional arrays.
+ *             """
+ *             return PyArray_DIMS(self)             # <<<<<<<<<<<<<<
+ * 
+ *         @property
+ */
+  __pyx_r = PyArray_DIMS(__pyx_v_self);
+  goto __pyx_L0;
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":385
+ * 
+ *         @property
+ *         cdef inline npy_intp *shape(self) noexcept nogil:             # <<<<<<<<<<<<<<
+ *             """Returns a pointer to the dimensions/shape of the array.
+ *             The number of elements matches the number of dimensions of the array (ndim).
+ */
+
+  /* function exit code: only one exit path; no temporaries or references to release */
+  __pyx_L0:;
+  return __pyx_r;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":393
+ * Generated C for ndarray.strides: returns the npy_intp* produced by PyArray_STRIDES(self); there is no error path.
+ *         @property
+ *         cdef inline npy_intp *strides(self) noexcept nogil:             # <<<<<<<<<<<<<<
+ *             """Returns a pointer to the strides of the array.
+ *             The number of elements matches the number of dimensions of the array (ndim).
+ */
+
+static CYTHON_INLINE npy_intp *__pyx_f_5numpy_7ndarray_7strides_strides(PyArrayObject *__pyx_v_self) {
+  npy_intp *__pyx_r;
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":397
+ *             The number of elements matches the number of dimensions of the array (ndim).
+ *             """
+ *             return PyArray_STRIDES(self)             # <<<<<<<<<<<<<<
+ * 
+ *         @property
+ */
+  __pyx_r = PyArray_STRIDES(__pyx_v_self);
+  goto __pyx_L0;
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":393
+ * 
+ *         @property
+ *         cdef inline npy_intp *strides(self) noexcept nogil:             # <<<<<<<<<<<<<<
+ *             """Returns a pointer to the strides of the array.
+ *             The number of elements matches the number of dimensions of the array (ndim).
+ */
+
+  /* function exit code: only one exit path; no temporaries or references to release */
+  __pyx_L0:;
+  return __pyx_r;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":400
+ * Generated C for ndarray.size: returns PyArray_SIZE(self) as npy_intp; there is no error path.
+ *         @property
+ *         cdef inline npy_intp size(self) noexcept nogil:             # <<<<<<<<<<<<<<
+ *             """Returns the total size (in number of elements) of the array.
+ *             """
+ */
+
+static CYTHON_INLINE npy_intp __pyx_f_5numpy_7ndarray_4size_size(PyArrayObject *__pyx_v_self) {
+  npy_intp __pyx_r;
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":403
+ *             """Returns the total size (in number of elements) of the array.
+ *             """
+ *             return PyArray_SIZE(self)             # <<<<<<<<<<<<<<
+ * 
+ *         @property
+ */
+  __pyx_r = PyArray_SIZE(__pyx_v_self);
+  goto __pyx_L0;
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":400
+ * 
+ *         @property
+ *         cdef inline npy_intp size(self) noexcept nogil:             # <<<<<<<<<<<<<<
+ *             """Returns the total size (in number of elements) of the array.
+ *             """
+ */
+
+  /* function exit code: only one exit path; no temporaries or references to release */
+  __pyx_L0:;
+  return __pyx_r;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":406
+ * Generated C for ndarray.data: returns the char* produced by PyArray_BYTES(self); there is no error path.
+ *         @property
+ *         cdef inline char* data(self) noexcept nogil:             # <<<<<<<<<<<<<<
+ *             """The pointer to the data buffer as a char*.
+ *             This is provided for legacy reasons to avoid direct struct field access.
+ */
+
+static CYTHON_INLINE char *__pyx_f_5numpy_7ndarray_4data_data(PyArrayObject *__pyx_v_self) {
+  char *__pyx_r;
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":412
+ *             of `PyArray_DATA()` instead, which returns a 'void*'.
+ *             """
+ *             return PyArray_BYTES(self)             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  __pyx_r = PyArray_BYTES(__pyx_v_self);
+  goto __pyx_L0;
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":406
+ * 
+ *         @property
+ *         cdef inline char* data(self) noexcept nogil:             # <<<<<<<<<<<<<<
+ *             """The pointer to the data buffer as a char*.
+ *             This is provided for legacy reasons to avoid direct struct field access.
+ */
+
+  /* function exit code: only one exit path; no temporaries or references to release */
+  __pyx_L0:;
+  return __pyx_r;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":824
+ * ctypedef long double complex clongdouble_t
+ * 
+ * cdef inline object PyArray_MultiIterNew1(a):             # <<<<<<<<<<<<<<
+ *     return PyArray_MultiIterNew(1, <void*>a)
+ * Generated C: calls PyArray_MultiIterNew(1, a) and returns the resulting object; a NULL result is sent to __pyx_L1_error and 0 is returned.
+ */
+
+static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew1(PyObject *__pyx_v_a) {
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("PyArray_MultiIterNew1", 1);
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":825
+ * 
+ * cdef inline object PyArray_MultiIterNew1(a):
+ *     return PyArray_MultiIterNew(1, <void*>a)             # <<<<<<<<<<<<<<
+ * 
+ * cdef inline object PyArray_MultiIterNew2(a, b):
+ */
+  __Pyx_XDECREF(__pyx_r);
+  __pyx_t_1 = PyArray_MultiIterNew(1, ((void *)__pyx_v_a)); if (unlikely(!__pyx_t_1)) __PYX_ERR(2, 825, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_r = __pyx_t_1;
+  __pyx_t_1 = 0;
+  goto __pyx_L0;
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":824
+ * ctypedef long double complex clongdouble_t
+ * 
+ * cdef inline object PyArray_MultiIterNew1(a):             # <<<<<<<<<<<<<<
+ *     return PyArray_MultiIterNew(1, <void*>a)
+ * 
+ */
+
+  /* function exit code: __pyx_L1_error drops the temp, adds a traceback entry and sets the result to 0; __pyx_L0 is the shared return */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_AddTraceback("numpy.PyArray_MultiIterNew1", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = 0;
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":827
+ *     return PyArray_MultiIterNew(1, <void*>a)
+ * 
+ * cdef inline object PyArray_MultiIterNew2(a, b):             # <<<<<<<<<<<<<<
+ *     return PyArray_MultiIterNew(2, <void*>a, <void*>b)
+ * Generated C: calls PyArray_MultiIterNew(2, a, b) and returns the resulting object; a NULL result is sent to __pyx_L1_error and 0 is returned.
+ */
+
+static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew2(PyObject *__pyx_v_a, PyObject *__pyx_v_b) {
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("PyArray_MultiIterNew2", 1);
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":828
+ * 
+ * cdef inline object PyArray_MultiIterNew2(a, b):
+ *     return PyArray_MultiIterNew(2, <void*>a, <void*>b)             # <<<<<<<<<<<<<<
+ * 
+ * cdef inline object PyArray_MultiIterNew3(a, b, c):
+ */
+  __Pyx_XDECREF(__pyx_r);
+  __pyx_t_1 = PyArray_MultiIterNew(2, ((void *)__pyx_v_a), ((void *)__pyx_v_b)); if (unlikely(!__pyx_t_1)) __PYX_ERR(2, 828, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_r = __pyx_t_1;
+  __pyx_t_1 = 0;
+  goto __pyx_L0;
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":827
+ *     return PyArray_MultiIterNew(1, <void*>a)
+ * 
+ * cdef inline object PyArray_MultiIterNew2(a, b):             # <<<<<<<<<<<<<<
+ *     return PyArray_MultiIterNew(2, <void*>a, <void*>b)
+ * 
+ */
+
+  /* function exit code: __pyx_L1_error drops the temp, adds a traceback entry and sets the result to 0; __pyx_L0 is the shared return */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_AddTraceback("numpy.PyArray_MultiIterNew2", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = 0;
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":830
+ *     return PyArray_MultiIterNew(2, <void*>a, <void*>b)
+ * 
+ * cdef inline object PyArray_MultiIterNew3(a, b, c):             # <<<<<<<<<<<<<<
+ *     return PyArray_MultiIterNew(3, <void*>a, <void*>b, <void*> c)
+ * Generated C: calls PyArray_MultiIterNew(3, a, b, c) and returns the resulting object; a NULL result is sent to __pyx_L1_error and 0 is returned.
+ */
+
+static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew3(PyObject *__pyx_v_a, PyObject *__pyx_v_b, PyObject *__pyx_v_c) {
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("PyArray_MultiIterNew3", 1);
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":831
+ * 
+ * cdef inline object PyArray_MultiIterNew3(a, b, c):
+ *     return PyArray_MultiIterNew(3, <void*>a, <void*>b, <void*> c)             # <<<<<<<<<<<<<<
+ * 
+ * cdef inline object PyArray_MultiIterNew4(a, b, c, d):
+ */
+  __Pyx_XDECREF(__pyx_r);
+  __pyx_t_1 = PyArray_MultiIterNew(3, ((void *)__pyx_v_a), ((void *)__pyx_v_b), ((void *)__pyx_v_c)); if (unlikely(!__pyx_t_1)) __PYX_ERR(2, 831, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_r = __pyx_t_1;
+  __pyx_t_1 = 0;
+  goto __pyx_L0;
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":830
+ *     return PyArray_MultiIterNew(2, <void*>a, <void*>b)
+ * 
+ * cdef inline object PyArray_MultiIterNew3(a, b, c):             # <<<<<<<<<<<<<<
+ *     return PyArray_MultiIterNew(3, <void*>a, <void*>b, <void*> c)
+ * 
+ */
+
+  /* function exit code: __pyx_L1_error drops the temp, adds a traceback entry and sets the result to 0; __pyx_L0 is the shared return */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_AddTraceback("numpy.PyArray_MultiIterNew3", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = 0;
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":833
+ *     return PyArray_MultiIterNew(3, <void*>a, <void*>b, <void*> c)
+ * 
+ * cdef inline object PyArray_MultiIterNew4(a, b, c, d):             # <<<<<<<<<<<<<<
+ *     return PyArray_MultiIterNew(4, <void*>a, <void*>b, <void*>c, <void*> d)
+ * Generated C: calls PyArray_MultiIterNew(4, a, b, c, d) and returns the resulting object; a NULL result is sent to __pyx_L1_error and 0 is returned.
+ */
+
+static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew4(PyObject *__pyx_v_a, PyObject *__pyx_v_b, PyObject *__pyx_v_c, PyObject *__pyx_v_d) {
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("PyArray_MultiIterNew4", 1);
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":834
+ * 
+ * cdef inline object PyArray_MultiIterNew4(a, b, c, d):
+ *     return PyArray_MultiIterNew(4, <void*>a, <void*>b, <void*>c, <void*> d)             # <<<<<<<<<<<<<<
+ * 
+ * cdef inline object PyArray_MultiIterNew5(a, b, c, d, e):
+ */
+  __Pyx_XDECREF(__pyx_r);
+  __pyx_t_1 = PyArray_MultiIterNew(4, ((void *)__pyx_v_a), ((void *)__pyx_v_b), ((void *)__pyx_v_c), ((void *)__pyx_v_d)); if (unlikely(!__pyx_t_1)) __PYX_ERR(2, 834, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_r = __pyx_t_1;
+  __pyx_t_1 = 0;
+  goto __pyx_L0;
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":833
+ *     return PyArray_MultiIterNew(3, <void*>a, <void*>b, <void*> c)
+ * 
+ * cdef inline object PyArray_MultiIterNew4(a, b, c, d):             # <<<<<<<<<<<<<<
+ *     return PyArray_MultiIterNew(4, <void*>a, <void*>b, <void*>c, <void*> d)
+ * 
+ */
+
+  /* function exit code: __pyx_L1_error drops the temp, adds a traceback entry and sets the result to 0; __pyx_L0 is the shared return */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_AddTraceback("numpy.PyArray_MultiIterNew4", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = 0;
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":836
+ *     return PyArray_MultiIterNew(4, <void*>a, <void*>b, <void*>c, <void*> d)
+ * 
+ * cdef inline object PyArray_MultiIterNew5(a, b, c, d, e):             # <<<<<<<<<<<<<<
+ *     return PyArray_MultiIterNew(5, <void*>a, <void*>b, <void*>c, <void*> d, <void*> e)
+ * Generated C: calls PyArray_MultiIterNew(5, a, b, c, d, e) and returns the resulting object; a NULL result is sent to __pyx_L1_error and 0 is returned.
+ */
+
+static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew5(PyObject *__pyx_v_a, PyObject *__pyx_v_b, PyObject *__pyx_v_c, PyObject *__pyx_v_d, PyObject *__pyx_v_e) {
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("PyArray_MultiIterNew5", 1);
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":837
+ * 
+ * cdef inline object PyArray_MultiIterNew5(a, b, c, d, e):
+ *     return PyArray_MultiIterNew(5, <void*>a, <void*>b, <void*>c, <void*> d, <void*> e)             # <<<<<<<<<<<<<<
+ * 
+ * cdef inline tuple PyDataType_SHAPE(dtype d):
+ */
+  __Pyx_XDECREF(__pyx_r);
+  __pyx_t_1 = PyArray_MultiIterNew(5, ((void *)__pyx_v_a), ((void *)__pyx_v_b), ((void *)__pyx_v_c), ((void *)__pyx_v_d), ((void *)__pyx_v_e)); if (unlikely(!__pyx_t_1)) __PYX_ERR(2, 837, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_r = __pyx_t_1;
+  __pyx_t_1 = 0;
+  goto __pyx_L0;
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":836
+ *     return PyArray_MultiIterNew(4, <void*>a, <void*>b, <void*>c, <void*> d)
+ * 
+ * cdef inline object PyArray_MultiIterNew5(a, b, c, d, e):             # <<<<<<<<<<<<<<
+ *     return PyArray_MultiIterNew(5, <void*>a, <void*>b, <void*>c, <void*> d, <void*> e)
+ * 
+ */
+
+  /* function exit code: __pyx_L1_error drops the temp, adds a traceback entry and sets the result to 0; __pyx_L0 is the shared return */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_AddTraceback("numpy.PyArray_MultiIterNew5", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = 0;
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":839
+ *     return PyArray_MultiIterNew(5, <void*>a, <void*>b, <void*>c, <void*> d, <void*> e)
+ * Generated C for PyDataType_SHAPE: returns d.subarray.shape when PyDataType_HASSUBARRAY(d) is true, else the shared empty tuple; both are INCREF'd first.
+ * cdef inline tuple PyDataType_SHAPE(dtype d):             # <<<<<<<<<<<<<<
+ *     if PyDataType_HASSUBARRAY(d):
+ *         return <tuple>d.subarray.shape
+ */
+
+static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyDataType_SHAPE(PyArray_Descr *__pyx_v_d) {
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  int __pyx_t_1;
+  PyObject *__pyx_t_2;
+  __Pyx_RefNannySetupContext("PyDataType_SHAPE", 1);
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":840
+ * 
+ * cdef inline tuple PyDataType_SHAPE(dtype d):
+ *     if PyDataType_HASSUBARRAY(d):             # <<<<<<<<<<<<<<
+ *         return <tuple>d.subarray.shape
+ *     else:
+ */
+  __pyx_t_1 = PyDataType_HASSUBARRAY(__pyx_v_d);
+  if (__pyx_t_1) {
+
+    /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":841
+ * cdef inline tuple PyDataType_SHAPE(dtype d):
+ *     if PyDataType_HASSUBARRAY(d):
+ *         return <tuple>d.subarray.shape             # <<<<<<<<<<<<<<
+ *     else:
+ *         return ()
+ */
+    __Pyx_XDECREF(__pyx_r);
+    __pyx_t_2 = __pyx_f_5numpy_5dtype_8subarray_subarray(__pyx_v_d)->shape;
+    __Pyx_INCREF(((PyObject*)__pyx_t_2));
+    __pyx_r = ((PyObject*)__pyx_t_2);
+    goto __pyx_L0;
+
+    /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":840
+ * 
+ * cdef inline tuple PyDataType_SHAPE(dtype d):
+ *     if PyDataType_HASSUBARRAY(d):             # <<<<<<<<<<<<<<
+ *         return <tuple>d.subarray.shape
+ *     else:
+ */
+  }
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":843
+ *         return <tuple>d.subarray.shape
+ *     else:
+ *         return ()             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  /*else*/ {
+    __Pyx_XDECREF(__pyx_r);
+    __Pyx_INCREF(__pyx_empty_tuple);
+    __pyx_r = __pyx_empty_tuple;
+    goto __pyx_L0;
+  }
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":839
+ *     return PyArray_MultiIterNew(5, <void*>a, <void*>b, <void*>c, <void*> d, <void*> e)
+ * 
+ * cdef inline tuple PyDataType_SHAPE(dtype d):             # <<<<<<<<<<<<<<
+ *     if PyDataType_HASSUBARRAY(d):
+ *         return <tuple>d.subarray.shape
+ */
+
+  /* function exit code: both branches arrive here with an INCREF'd result; there is no error label in this function */
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1027
+ *     int _import_umath() except -1
+ * Generated C for set_array_base: INCREFs base, then calls PyArray_SetBaseObject(arr, base); a -1 result is sent to __pyx_L1_error.
+ * cdef inline void set_array_base(ndarray arr, object base) except *:             # <<<<<<<<<<<<<<
+ *     Py_INCREF(base) # important to do this before stealing the reference below!
+ *     PyArray_SetBaseObject(arr, base)
+ */
+
+static CYTHON_INLINE void __pyx_f_5numpy_set_array_base(PyArrayObject *__pyx_v_arr, PyObject *__pyx_v_base) {
+  int __pyx_t_1;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1028
+ * 
+ * cdef inline void set_array_base(ndarray arr, object base) except *:
+ *     Py_INCREF(base) # important to do this before stealing the reference below!             # <<<<<<<<<<<<<<
+ *     PyArray_SetBaseObject(arr, base)
+ * 
+ */
+  Py_INCREF(__pyx_v_base);
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1029
+ * cdef inline void set_array_base(ndarray arr, object base) except *:
+ *     Py_INCREF(base) # important to do this before stealing the reference below!
+ *     PyArray_SetBaseObject(arr, base)             # <<<<<<<<<<<<<<
+ * 
+ * cdef inline object get_array_base(ndarray arr):
+ */
+  __pyx_t_1 = PyArray_SetBaseObject(__pyx_v_arr, __pyx_v_base); if (unlikely(__pyx_t_1 == ((int)-1))) __PYX_ERR(2, 1029, __pyx_L1_error)
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1027
+ *     int _import_umath() except -1
+ * 
+ * cdef inline void set_array_base(ndarray arr, object base) except *:             # <<<<<<<<<<<<<<
+ *     Py_INCREF(base) # important to do this before stealing the reference below!
+ *     PyArray_SetBaseObject(arr, base)
+ */
+
+  /* function exit code: success jumps past the error label; the error label only adds a traceback entry (the function returns void) */
+  goto __pyx_L0;
+  __pyx_L1_error:;
+  __Pyx_AddTraceback("numpy.set_array_base", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_L0:;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1031
+ *     PyArray_SetBaseObject(arr, base)
+ * Generated C for get_array_base: reads PyArray_BASE(arr); returns Py_None if it is NULL, otherwise the base object; the result is INCREF'd either way.
+ * cdef inline object get_array_base(ndarray arr):             # <<<<<<<<<<<<<<
+ *     base = PyArray_BASE(arr)
+ *     if base is NULL:
+ */
+
+static CYTHON_INLINE PyObject *__pyx_f_5numpy_get_array_base(PyArrayObject *__pyx_v_arr) {
+  PyObject *__pyx_v_base;
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  int __pyx_t_1;
+  __Pyx_RefNannySetupContext("get_array_base", 1);
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1032
+ * 
+ * cdef inline object get_array_base(ndarray arr):
+ *     base = PyArray_BASE(arr)             # <<<<<<<<<<<<<<
+ *     if base is NULL:
+ *         return None
+ */
+  __pyx_v_base = PyArray_BASE(__pyx_v_arr);
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1033
+ * cdef inline object get_array_base(ndarray arr):
+ *     base = PyArray_BASE(arr)
+ *     if base is NULL:             # <<<<<<<<<<<<<<
+ *         return None
+ *     return <object>base
+ */
+  __pyx_t_1 = (__pyx_v_base == NULL);
+  if (__pyx_t_1) {
+
+    /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1034
+ *     base = PyArray_BASE(arr)
+ *     if base is NULL:
+ *         return None             # <<<<<<<<<<<<<<
+ *     return <object>base
+ * 
+ */
+    __Pyx_XDECREF(__pyx_r);
+    __pyx_r = Py_None; __Pyx_INCREF(Py_None);
+    goto __pyx_L0;
+
+    /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1033
+ * cdef inline object get_array_base(ndarray arr):
+ *     base = PyArray_BASE(arr)
+ *     if base is NULL:             # <<<<<<<<<<<<<<
+ *         return None
+ *     return <object>base
+ */
+  }
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1035
+ *     if base is NULL:
+ *         return None
+ *     return <object>base             # <<<<<<<<<<<<<<
+ * 
+ * # Versions of the import_* functions which are more suitable for
+ */
+  __Pyx_XDECREF(__pyx_r);
+  __Pyx_INCREF(((PyObject *)__pyx_v_base));
+  __pyx_r = ((PyObject *)__pyx_v_base);
+  goto __pyx_L0;
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1031
+ *     PyArray_SetBaseObject(arr, base)
+ * 
+ * cdef inline object get_array_base(ndarray arr):             # <<<<<<<<<<<<<<
+ *     base = PyArray_BASE(arr)
+ *     if base is NULL:
+ */
+
+  /* function exit code: both return paths arrive here with an INCREF'd result; there is no error label in this function */
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1039
+ * # Versions of the import_* functions which are more suitable for
+ * # Cython code.
+ * cdef inline int import_array() except -1:             # <<<<<<<<<<<<<<
+ *     try:
+ *         __pyx_import_array()
+ */
+
+static CYTHON_INLINE int __pyx_f_5numpy_import_array(void) { /* generated body of numpy's cdef import_array(): returns 0 on success, -1 with a Python exception set on failure */
+  int __pyx_r;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  PyObject *__pyx_t_2 = NULL;
+  PyObject *__pyx_t_3 = NULL;
+  int __pyx_t_4;
+  PyObject *__pyx_t_5 = NULL;
+  PyObject *__pyx_t_6 = NULL;
+  PyObject *__pyx_t_7 = NULL;
+  PyObject *__pyx_t_8 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("import_array", 1);
+  /* Temporaries: __pyx_t_1..3 hold the exception state saved on entry, __pyx_t_5..7 receive the exception caught from _import_array(), __pyx_t_8 is the replacement ImportError. */
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1040
+ * # Cython code.
+ * cdef inline int import_array() except -1:
+ *     try:             # <<<<<<<<<<<<<<
+ *         __pyx_import_array()
+ *     except Exception:
+ */
+  {
+    __Pyx_PyThreadState_declare
+    __Pyx_PyThreadState_assign
+    __Pyx_ExceptionSave(&__pyx_t_1, &__pyx_t_2, &__pyx_t_3); /* saved here, handed back via __Pyx_ExceptionReset on the error path */
+    __Pyx_XGOTREF(__pyx_t_1);
+    __Pyx_XGOTREF(__pyx_t_2);
+    __Pyx_XGOTREF(__pyx_t_3);
+    /*try:*/ {
+
+      /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1041
+ * cdef inline int import_array() except -1:
+ *     try:
+ *         __pyx_import_array()             # <<<<<<<<<<<<<<
+ *     except Exception:
+ *         raise ImportError("numpy._core.multiarray failed to import")
+ */
+      __pyx_t_4 = _import_array(); if (unlikely(__pyx_t_4 == ((int)-1))) __PYX_ERR(2, 1041, __pyx_L3_error) /* -1 is the failure value: go to the except handler */
+
+      /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1040
+ * # Cython code.
+ * cdef inline int import_array() except -1:
+ *     try:             # <<<<<<<<<<<<<<
+ *         __pyx_import_array()
+ *     except Exception:
+ */
+    }
+    __Pyx_XDECREF(__pyx_t_1); __pyx_t_1 = 0; /* success path: drop the saved exception state and skip the handler */
+    __Pyx_XDECREF(__pyx_t_2); __pyx_t_2 = 0;
+    __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0;
+    goto __pyx_L8_try_end;
+    __pyx_L3_error:;
+    /* Handler entry: only a pending error matching Exception is translated; any other error skips the if-body and reaches the goto after it with the error still pending. */
+    /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1042
+ *     try:
+ *         __pyx_import_array()
+ *     except Exception:             # <<<<<<<<<<<<<<
+ *         raise ImportError("numpy._core.multiarray failed to import")
+ * 
+ */
+    __pyx_t_4 = __Pyx_PyErr_ExceptionMatches(((PyObject *)(&((PyTypeObject*)PyExc_Exception)[0])));
+    if (__pyx_t_4) {
+      __Pyx_AddTraceback("numpy.import_array", __pyx_clineno, __pyx_lineno, __pyx_filename);
+      if (__Pyx_GetException(&__pyx_t_5, &__pyx_t_6, &__pyx_t_7) < 0) __PYX_ERR(2, 1042, __pyx_L5_except_error)
+      __Pyx_XGOTREF(__pyx_t_5);
+      __Pyx_XGOTREF(__pyx_t_6);
+      __Pyx_XGOTREF(__pyx_t_7);
+      /* The caught exception now lives in __pyx_t_5..7 (released at __pyx_L1_error); an ImportError built from the constant args tuple __pyx_tuple__9 is raised in its place. */
+      /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1043
+ *         __pyx_import_array()
+ *     except Exception:
+ *         raise ImportError("numpy._core.multiarray failed to import")             # <<<<<<<<<<<<<<
+ * 
+ * cdef inline int import_umath() except -1:
+ */
+      __pyx_t_8 = __Pyx_PyObject_Call(__pyx_builtin_ImportError, __pyx_tuple__9, NULL); if (unlikely(!__pyx_t_8)) __PYX_ERR(2, 1043, __pyx_L5_except_error)
+      __Pyx_GOTREF(__pyx_t_8);
+      __Pyx_Raise(__pyx_t_8, 0, 0, 0);
+      __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0;
+      __PYX_ERR(2, 1043, __pyx_L5_except_error)
+    }
+    goto __pyx_L5_except_error;
+
+    /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1040
+ * # Cython code.
+ * cdef inline int import_array() except -1:
+ *     try:             # <<<<<<<<<<<<<<
+ *         __pyx_import_array()
+ *     except Exception:
+ */
+    __pyx_L5_except_error:;
+    __Pyx_XGIVEREF(__pyx_t_1);
+    __Pyx_XGIVEREF(__pyx_t_2);
+    __Pyx_XGIVEREF(__pyx_t_3);
+    __Pyx_ExceptionReset(__pyx_t_1, __pyx_t_2, __pyx_t_3); /* hand back the exception state saved on entry */
+    goto __pyx_L1_error;
+    __pyx_L8_try_end:;
+  }
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1039
+ * # Versions of the import_* functions which are more suitable for
+ * # Cython code.
+ * cdef inline int import_array() except -1:             # <<<<<<<<<<<<<<
+ *     try:
+ *         __pyx_import_array()
+ */
+
+  /* function exit code */
+  __pyx_r = 0; /* success */
+  goto __pyx_L0;
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_5); /* error exit: release whatever temporaries are still held */
+  __Pyx_XDECREF(__pyx_t_6);
+  __Pyx_XDECREF(__pyx_t_7);
+  __Pyx_XDECREF(__pyx_t_8);
+  __Pyx_AddTraceback("numpy.import_array", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = -1; /* matches the `except -1` declaration echoed above */
+  __pyx_L0:;
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1045
+ *         raise ImportError("numpy._core.multiarray failed to import")
+ * 
+ * cdef inline int import_umath() except -1:             # <<<<<<<<<<<<<<
+ *     try:
+ *         _import_umath()
+ */
+
+static CYTHON_INLINE int __pyx_f_5numpy_import_umath(void) { /* generated body of numpy's cdef import_umath(): returns 0 on success, -1 with a Python exception set on failure */
+  int __pyx_r;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  PyObject *__pyx_t_2 = NULL;
+  PyObject *__pyx_t_3 = NULL;
+  int __pyx_t_4;
+  PyObject *__pyx_t_5 = NULL;
+  PyObject *__pyx_t_6 = NULL;
+  PyObject *__pyx_t_7 = NULL;
+  PyObject *__pyx_t_8 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("import_umath", 1);
+  /* Temporaries: __pyx_t_1..3 hold the exception state saved on entry, __pyx_t_5..7 receive the exception caught from _import_umath(), __pyx_t_8 is the replacement ImportError. */
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1046
+ * 
+ * cdef inline int import_umath() except -1:
+ *     try:             # <<<<<<<<<<<<<<
+ *         _import_umath()
+ *     except Exception:
+ */
+  {
+    __Pyx_PyThreadState_declare
+    __Pyx_PyThreadState_assign
+    __Pyx_ExceptionSave(&__pyx_t_1, &__pyx_t_2, &__pyx_t_3); /* saved here, handed back via __Pyx_ExceptionReset on the error path */
+    __Pyx_XGOTREF(__pyx_t_1);
+    __Pyx_XGOTREF(__pyx_t_2);
+    __Pyx_XGOTREF(__pyx_t_3);
+    /*try:*/ {
+
+      /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1047
+ * cdef inline int import_umath() except -1:
+ *     try:
+ *         _import_umath()             # <<<<<<<<<<<<<<
+ *     except Exception:
+ *         raise ImportError("numpy._core.umath failed to import")
+ */
+      __pyx_t_4 = _import_umath(); if (unlikely(__pyx_t_4 == ((int)-1))) __PYX_ERR(2, 1047, __pyx_L3_error) /* -1 is the failure value: go to the except handler */
+
+      /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1046
+ * 
+ * cdef inline int import_umath() except -1:
+ *     try:             # <<<<<<<<<<<<<<
+ *         _import_umath()
+ *     except Exception:
+ */
+    }
+    __Pyx_XDECREF(__pyx_t_1); __pyx_t_1 = 0; /* success path: drop the saved exception state and skip the handler */
+    __Pyx_XDECREF(__pyx_t_2); __pyx_t_2 = 0;
+    __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0;
+    goto __pyx_L8_try_end;
+    __pyx_L3_error:;
+    /* Handler entry: only a pending error matching Exception is translated; any other error skips the if-body and reaches the goto after it with the error still pending. */
+    /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1048
+ *     try:
+ *         _import_umath()
+ *     except Exception:             # <<<<<<<<<<<<<<
+ *         raise ImportError("numpy._core.umath failed to import")
+ * 
+ */
+    __pyx_t_4 = __Pyx_PyErr_ExceptionMatches(((PyObject *)(&((PyTypeObject*)PyExc_Exception)[0])));
+    if (__pyx_t_4) {
+      __Pyx_AddTraceback("numpy.import_umath", __pyx_clineno, __pyx_lineno, __pyx_filename);
+      if (__Pyx_GetException(&__pyx_t_5, &__pyx_t_6, &__pyx_t_7) < 0) __PYX_ERR(2, 1048, __pyx_L5_except_error)
+      __Pyx_XGOTREF(__pyx_t_5);
+      __Pyx_XGOTREF(__pyx_t_6);
+      __Pyx_XGOTREF(__pyx_t_7);
+      /* The caught exception now lives in __pyx_t_5..7 (released at __pyx_L1_error); an ImportError built from the constant args tuple __pyx_tuple__10 is raised in its place. */
+      /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1049
+ *         _import_umath()
+ *     except Exception:
+ *         raise ImportError("numpy._core.umath failed to import")             # <<<<<<<<<<<<<<
+ * 
+ * cdef inline int import_ufunc() except -1:
+ */
+      __pyx_t_8 = __Pyx_PyObject_Call(__pyx_builtin_ImportError, __pyx_tuple__10, NULL); if (unlikely(!__pyx_t_8)) __PYX_ERR(2, 1049, __pyx_L5_except_error)
+      __Pyx_GOTREF(__pyx_t_8);
+      __Pyx_Raise(__pyx_t_8, 0, 0, 0);
+      __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0;
+      __PYX_ERR(2, 1049, __pyx_L5_except_error)
+    }
+    goto __pyx_L5_except_error;
+
+    /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1046
+ * 
+ * cdef inline int import_umath() except -1:
+ *     try:             # <<<<<<<<<<<<<<
+ *         _import_umath()
+ *     except Exception:
+ */
+    __pyx_L5_except_error:;
+    __Pyx_XGIVEREF(__pyx_t_1);
+    __Pyx_XGIVEREF(__pyx_t_2);
+    __Pyx_XGIVEREF(__pyx_t_3);
+    __Pyx_ExceptionReset(__pyx_t_1, __pyx_t_2, __pyx_t_3); /* hand back the exception state saved on entry */
+    goto __pyx_L1_error;
+    __pyx_L8_try_end:;
+  }
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1045
+ *         raise ImportError("numpy._core.multiarray failed to import")
+ * 
+ * cdef inline int import_umath() except -1:             # <<<<<<<<<<<<<<
+ *     try:
+ *         _import_umath()
+ */
+
+  /* function exit code */
+  __pyx_r = 0; /* success */
+  goto __pyx_L0;
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_5); /* error exit: release whatever temporaries are still held */
+  __Pyx_XDECREF(__pyx_t_6);
+  __Pyx_XDECREF(__pyx_t_7);
+  __Pyx_XDECREF(__pyx_t_8);
+  __Pyx_AddTraceback("numpy.import_umath", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = -1; /* matches the `except -1` declaration echoed above */
+  __pyx_L0:;
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1051
+ *         raise ImportError("numpy._core.umath failed to import")
+ * 
+ * cdef inline int import_ufunc() except -1:             # <<<<<<<<<<<<<<
+ *     try:
+ *         _import_umath()
+ */
+
+static CYTHON_INLINE int __pyx_f_5numpy_import_ufunc(void) { /* generated body of numpy's cdef import_ufunc(): returns 0 on success, -1 with a Python exception set on failure */
+  int __pyx_r;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  PyObject *__pyx_t_2 = NULL;
+  PyObject *__pyx_t_3 = NULL;
+  int __pyx_t_4;
+  PyObject *__pyx_t_5 = NULL;
+  PyObject *__pyx_t_6 = NULL;
+  PyObject *__pyx_t_7 = NULL;
+  PyObject *__pyx_t_8 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("import_ufunc", 1);
+  /* Despite its name this function calls _import_umath() and raises from the same args tuple (__pyx_tuple__10) as import_umath; only the traceback label "numpy.import_ufunc" differs. */
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1052
+ * 
+ * cdef inline int import_ufunc() except -1:
+ *     try:             # <<<<<<<<<<<<<<
+ *         _import_umath()
+ *     except Exception:
+ */
+  {
+    __Pyx_PyThreadState_declare
+    __Pyx_PyThreadState_assign
+    __Pyx_ExceptionSave(&__pyx_t_1, &__pyx_t_2, &__pyx_t_3); /* saved here, handed back via __Pyx_ExceptionReset on the error path */
+    __Pyx_XGOTREF(__pyx_t_1);
+    __Pyx_XGOTREF(__pyx_t_2);
+    __Pyx_XGOTREF(__pyx_t_3);
+    /*try:*/ {
+
+      /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1053
+ * cdef inline int import_ufunc() except -1:
+ *     try:
+ *         _import_umath()             # <<<<<<<<<<<<<<
+ *     except Exception:
+ *         raise ImportError("numpy._core.umath failed to import")
+ */
+      __pyx_t_4 = _import_umath(); if (unlikely(__pyx_t_4 == ((int)-1))) __PYX_ERR(2, 1053, __pyx_L3_error) /* -1 is the failure value: go to the except handler */
+
+      /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1052
+ * 
+ * cdef inline int import_ufunc() except -1:
+ *     try:             # <<<<<<<<<<<<<<
+ *         _import_umath()
+ *     except Exception:
+ */
+    }
+    __Pyx_XDECREF(__pyx_t_1); __pyx_t_1 = 0; /* success path: drop the saved exception state and skip the handler */
+    __Pyx_XDECREF(__pyx_t_2); __pyx_t_2 = 0;
+    __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0;
+    goto __pyx_L8_try_end;
+    __pyx_L3_error:;
+    /* Handler entry: only a pending error matching Exception is translated; any other error skips the if-body and reaches the goto after it with the error still pending. */
+    /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1054
+ *     try:
+ *         _import_umath()
+ *     except Exception:             # <<<<<<<<<<<<<<
+ *         raise ImportError("numpy._core.umath failed to import")
+ * 
+ */
+    __pyx_t_4 = __Pyx_PyErr_ExceptionMatches(((PyObject *)(&((PyTypeObject*)PyExc_Exception)[0])));
+    if (__pyx_t_4) {
+      __Pyx_AddTraceback("numpy.import_ufunc", __pyx_clineno, __pyx_lineno, __pyx_filename);
+      if (__Pyx_GetException(&__pyx_t_5, &__pyx_t_6, &__pyx_t_7) < 0) __PYX_ERR(2, 1054, __pyx_L5_except_error)
+      __Pyx_XGOTREF(__pyx_t_5);
+      __Pyx_XGOTREF(__pyx_t_6);
+      __Pyx_XGOTREF(__pyx_t_7);
+      /* The caught exception now lives in __pyx_t_5..7 (released at __pyx_L1_error); an ImportError built from the constant args tuple __pyx_tuple__10 is raised in its place. */
+      /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1055
+ *         _import_umath()
+ *     except Exception:
+ *         raise ImportError("numpy._core.umath failed to import")             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+      __pyx_t_8 = __Pyx_PyObject_Call(__pyx_builtin_ImportError, __pyx_tuple__10, NULL); if (unlikely(!__pyx_t_8)) __PYX_ERR(2, 1055, __pyx_L5_except_error)
+      __Pyx_GOTREF(__pyx_t_8);
+      __Pyx_Raise(__pyx_t_8, 0, 0, 0);
+      __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0;
+      __PYX_ERR(2, 1055, __pyx_L5_except_error)
+    }
+    goto __pyx_L5_except_error;
+
+    /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1052
+ * 
+ * cdef inline int import_ufunc() except -1:
+ *     try:             # <<<<<<<<<<<<<<
+ *         _import_umath()
+ *     except Exception:
+ */
+    __pyx_L5_except_error:;
+    __Pyx_XGIVEREF(__pyx_t_1);
+    __Pyx_XGIVEREF(__pyx_t_2);
+    __Pyx_XGIVEREF(__pyx_t_3);
+    __Pyx_ExceptionReset(__pyx_t_1, __pyx_t_2, __pyx_t_3); /* hand back the exception state saved on entry */
+    goto __pyx_L1_error;
+    __pyx_L8_try_end:;
+  }
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1051
+ *         raise ImportError("numpy._core.umath failed to import")
+ * 
+ * cdef inline int import_ufunc() except -1:             # <<<<<<<<<<<<<<
+ *     try:
+ *         _import_umath()
+ */
+
+  /* function exit code */
+  __pyx_r = 0; /* success */
+  goto __pyx_L0;
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_5); /* error exit: release whatever temporaries are still held */
+  __Pyx_XDECREF(__pyx_t_6);
+  __Pyx_XDECREF(__pyx_t_7);
+  __Pyx_XDECREF(__pyx_t_8);
+  __Pyx_AddTraceback("numpy.import_ufunc", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = -1; /* matches the `except -1` declaration echoed above */
+  __pyx_L0:;
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1058
+ * 
+ * 
+ * cdef inline bint is_timedelta64_object(object obj) noexcept:             # <<<<<<<<<<<<<<
+ *     """
+ *     Cython equivalent of `isinstance(obj, np.timedelta64)`
+ */
+
+static CYTHON_INLINE int __pyx_f_5numpy_is_timedelta64_object(PyObject *__pyx_v_obj) {
+  int __pyx_r;
+  /* Returns the result of PyObject_TypeCheck(obj, &PyTimedeltaArrType_Type); no reference counts are touched and there is no error path. */
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1070
+ *     bool
+ *     """
+ *     return PyObject_TypeCheck(obj, &PyTimedeltaArrType_Type)             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  __pyx_r = PyObject_TypeCheck(__pyx_v_obj, (&PyTimedeltaArrType_Type));
+  goto __pyx_L0;
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1058
+ * 
+ * 
+ * cdef inline bint is_timedelta64_object(object obj) noexcept:             # <<<<<<<<<<<<<<
+ *     """
+ *     Cython equivalent of `isinstance(obj, np.timedelta64)`
+ */
+
+  /* function exit code */
+  __pyx_L0:;
+  return __pyx_r;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1073
+ * 
+ * 
+ * cdef inline bint is_datetime64_object(object obj) noexcept:             # <<<<<<<<<<<<<<
+ *     """
+ *     Cython equivalent of `isinstance(obj, np.datetime64)`
+ */
+
+static CYTHON_INLINE int __pyx_f_5numpy_is_datetime64_object(PyObject *__pyx_v_obj) {
+  int __pyx_r;
+  /* Returns the result of PyObject_TypeCheck(obj, &PyDatetimeArrType_Type); no reference counts are touched and there is no error path. */
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1085
+ *     bool
+ *     """
+ *     return PyObject_TypeCheck(obj, &PyDatetimeArrType_Type)             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  __pyx_r = PyObject_TypeCheck(__pyx_v_obj, (&PyDatetimeArrType_Type));
+  goto __pyx_L0;
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1073
+ * 
+ * 
+ * cdef inline bint is_datetime64_object(object obj) noexcept:             # <<<<<<<<<<<<<<
+ *     """
+ *     Cython equivalent of `isinstance(obj, np.datetime64)`
+ */
+
+  /* function exit code */
+  __pyx_L0:;
+  return __pyx_r;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1088
+ * 
+ * 
+ * cdef inline npy_datetime get_datetime64_value(object obj) noexcept nogil:             # <<<<<<<<<<<<<<
+ *     """
+ *     returns the int64 value underlying scalar numpy datetime64 object
+ */
+
+static CYTHON_INLINE npy_datetime __pyx_f_5numpy_get_datetime64_value(PyObject *__pyx_v_obj) {
+  npy_datetime __pyx_r;
+  /* Reads the obval field after an unchecked cast to PyDatetimeScalarObject*; nothing here verifies the object's type, so the caller must pass a matching scalar. */
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1095
+ *     also needed.  That can be found using `get_datetime64_unit`.
+ *     """
+ *     return (<PyDatetimeScalarObject*>obj).obval             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  __pyx_r = ((PyDatetimeScalarObject *)__pyx_v_obj)->obval;
+  goto __pyx_L0;
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1088
+ * 
+ * 
+ * cdef inline npy_datetime get_datetime64_value(object obj) noexcept nogil:             # <<<<<<<<<<<<<<
+ *     """
+ *     returns the int64 value underlying scalar numpy datetime64 object
+ */
+
+  /* function exit code */
+  __pyx_L0:;
+  return __pyx_r;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1098
+ * 
+ * 
+ * cdef inline npy_timedelta get_timedelta64_value(object obj) noexcept nogil:             # <<<<<<<<<<<<<<
+ *     """
+ *     returns the int64 value underlying scalar numpy timedelta64 object
+ */
+
+static CYTHON_INLINE npy_timedelta __pyx_f_5numpy_get_timedelta64_value(PyObject *__pyx_v_obj) {
+  npy_timedelta __pyx_r;
+  /* Reads the obval field after an unchecked cast to PyTimedeltaScalarObject*; nothing here verifies the object's type, so the caller must pass a matching scalar. */
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1102
+ *     returns the int64 value underlying scalar numpy timedelta64 object
+ *     """
+ *     return (<PyTimedeltaScalarObject*>obj).obval             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  __pyx_r = ((PyTimedeltaScalarObject *)__pyx_v_obj)->obval;
+  goto __pyx_L0;
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1098
+ * 
+ * 
+ * cdef inline npy_timedelta get_timedelta64_value(object obj) noexcept nogil:             # <<<<<<<<<<<<<<
+ *     """
+ *     returns the int64 value underlying scalar numpy timedelta64 object
+ */
+
+  /* function exit code */
+  __pyx_L0:;
+  return __pyx_r;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1105
+ * 
+ * 
+ * cdef inline NPY_DATETIMEUNIT get_datetime64_unit(object obj) noexcept nogil:             # <<<<<<<<<<<<<<
+ *     """
+ *     returns the unit part of the dtype for a numpy datetime64 object.
+ */
+
+static CYTHON_INLINE NPY_DATETIMEUNIT __pyx_f_5numpy_get_datetime64_unit(PyObject *__pyx_v_obj) {
+  NPY_DATETIMEUNIT __pyx_r;
+  /* Reads obmeta.base after an unchecked cast to PyDatetimeScalarObject* and converts it to NPY_DATETIMEUNIT; nothing here verifies the object's type. */
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1109
+ *     returns the unit part of the dtype for a numpy datetime64 object.
+ *     """
+ *     return <NPY_DATETIMEUNIT>(<PyDatetimeScalarObject*>obj).obmeta.base             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  __pyx_r = ((NPY_DATETIMEUNIT)((PyDatetimeScalarObject *)__pyx_v_obj)->obmeta.base);
+  goto __pyx_L0;
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1105
+ * 
+ * 
+ * cdef inline NPY_DATETIMEUNIT get_datetime64_unit(object obj) noexcept nogil:             # <<<<<<<<<<<<<<
+ *     """
+ *     returns the unit part of the dtype for a numpy datetime64 object.
+ */
+
+  /* function exit code */
+  __pyx_L0:;
+  return __pyx_r;
+}
+
+/* "fairseq/data/data_utils_fast.pyx":16
+ * 
+ * 
+ * cdef _is_batch_full(long num_sentences, long num_tokens, long max_tokens, long max_sentences):             # <<<<<<<<<<<<<<
+ *     if num_sentences == 0:
+ *         return 0
+ */
+
+static PyObject *__pyx_f_7fairseq_4data_15data_utils_fast__is_batch_full(long __pyx_v_num_sentences, long __pyx_v_num_tokens, long __pyx_v_max_tokens, long __pyx_v_max_sentences) {
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  int __pyx_t_1;
+  int __pyx_t_2;
+  __Pyx_RefNannySetupContext("_is_batch_full", 1);
+  /* Returns a new reference to the cached Python int 0 or 1. The .pyx declares no C return type, so the result is a PyObject* rather than a C int. No path in this function jumps to an error label. */
+  /* "fairseq/data/data_utils_fast.pyx":17
+ * 
+ * cdef _is_batch_full(long num_sentences, long num_tokens, long max_tokens, long max_sentences):
+ *     if num_sentences == 0:             # <<<<<<<<<<<<<<
+ *         return 0
+ *     if max_sentences > 0 and num_sentences == max_sentences:
+ */
+  __pyx_t_1 = (__pyx_v_num_sentences == 0);
+  if (__pyx_t_1) {
+    /* Rule 1: with zero sentences the answer is always 0, whatever the limits are. */
+    /* "fairseq/data/data_utils_fast.pyx":18
+ * cdef _is_batch_full(long num_sentences, long num_tokens, long max_tokens, long max_sentences):
+ *     if num_sentences == 0:
+ *         return 0             # <<<<<<<<<<<<<<
+ *     if max_sentences > 0 and num_sentences == max_sentences:
+ *         return 1
+ */
+    __Pyx_XDECREF(__pyx_r); /* __pyx_r is still NULL here, so this releases nothing */
+    __Pyx_INCREF(__pyx_int_0);
+    __pyx_r = __pyx_int_0;
+    goto __pyx_L0;
+
+    /* "fairseq/data/data_utils_fast.pyx":17
+ * 
+ * cdef _is_batch_full(long num_sentences, long num_tokens, long max_tokens, long max_sentences):
+ *     if num_sentences == 0:             # <<<<<<<<<<<<<<
+ *         return 0
+ *     if max_sentences > 0 and num_sentences == max_sentences:
+ */
+  }
+
+  /* "fairseq/data/data_utils_fast.pyx":19
+ *     if num_sentences == 0:
+ *         return 0
+ *     if max_sentences > 0 and num_sentences == max_sentences:             # <<<<<<<<<<<<<<
+ *         return 1
+ *     if max_tokens > 0 and num_tokens > max_tokens:
+ */
+  __pyx_t_2 = (__pyx_v_max_sentences > 0);
+  if (__pyx_t_2) {
+  } else { /* left operand false: short-circuit the `and` */
+    __pyx_t_1 = __pyx_t_2;
+    goto __pyx_L5_bool_binop_done;
+  }
+  __pyx_t_2 = (__pyx_v_num_sentences == __pyx_v_max_sentences);
+  __pyx_t_1 = __pyx_t_2;
+  __pyx_L5_bool_binop_done:;
+  if (__pyx_t_1) {
+    /* Rule 2: the sentence limit applies only when max_sentences > 0, and it is tested with equality (==), not >=. */
+    /* "fairseq/data/data_utils_fast.pyx":20
+ *         return 0
+ *     if max_sentences > 0 and num_sentences == max_sentences:
+ *         return 1             # <<<<<<<<<<<<<<
+ *     if max_tokens > 0 and num_tokens > max_tokens:
+ *         return 1
+ */
+    __Pyx_XDECREF(__pyx_r);
+    __Pyx_INCREF(__pyx_int_1);
+    __pyx_r = __pyx_int_1;
+    goto __pyx_L0;
+
+    /* "fairseq/data/data_utils_fast.pyx":19
+ *     if num_sentences == 0:
+ *         return 0
+ *     if max_sentences > 0 and num_sentences == max_sentences:             # <<<<<<<<<<<<<<
+ *         return 1
+ *     if max_tokens > 0 and num_tokens > max_tokens:
+ */
+  }
+
+  /* "fairseq/data/data_utils_fast.pyx":21
+ *     if max_sentences > 0 and num_sentences == max_sentences:
+ *         return 1
+ *     if max_tokens > 0 and num_tokens > max_tokens:             # <<<<<<<<<<<<<<
+ *         return 1
+ *     return 0
+ */
+  __pyx_t_2 = (__pyx_v_max_tokens > 0);
+  if (__pyx_t_2) {
+  } else { /* left operand false: short-circuit the `and` */
+    __pyx_t_1 = __pyx_t_2;
+    goto __pyx_L8_bool_binop_done;
+  }
+  __pyx_t_2 = (__pyx_v_num_tokens > __pyx_v_max_tokens);
+  __pyx_t_1 = __pyx_t_2;
+  __pyx_L8_bool_binop_done:;
+  if (__pyx_t_1) {
+    /* Rule 3: the token limit applies only when max_tokens > 0, and it is tested with strict >, so num_tokens == max_tokens does not return 1. */
+    /* "fairseq/data/data_utils_fast.pyx":22
+ *         return 1
+ *     if max_tokens > 0 and num_tokens > max_tokens:
+ *         return 1             # <<<<<<<<<<<<<<
+ *     return 0
+ * 
+ */
+    __Pyx_XDECREF(__pyx_r);
+    __Pyx_INCREF(__pyx_int_1);
+    __pyx_r = __pyx_int_1;
+    goto __pyx_L0;
+
+    /* "fairseq/data/data_utils_fast.pyx":21
+ *     if max_sentences > 0 and num_sentences == max_sentences:
+ *         return 1
+ *     if max_tokens > 0 and num_tokens > max_tokens:             # <<<<<<<<<<<<<<
+ *         return 1
+ *     return 0
+ */
+  }
+
+  /* "fairseq/data/data_utils_fast.pyx":23
+ *     if max_tokens > 0 and num_tokens > max_tokens:
+ *         return 1
+ *     return 0             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  __Pyx_XDECREF(__pyx_r);
+  __Pyx_INCREF(__pyx_int_0);
+  __pyx_r = __pyx_int_0;
+  goto __pyx_L0;
+
+  /* "fairseq/data/data_utils_fast.pyx":16
+ * 
+ * 
+ * cdef _is_batch_full(long num_sentences, long num_tokens, long max_tokens, long max_sentences):             # <<<<<<<<<<<<<<
+ *     if num_sentences == 0:
+ *         return 0
+ */
+
+  /* function exit code */
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "fairseq/data/data_utils_fast.pyx":27
+ * 
+ * @cython.cdivision(True)
+ * cpdef list batch_by_size_fast(             # <<<<<<<<<<<<<<
+ *     np.ndarray[DTYPE_t, ndim=1] indices,
+ *     num_tokens_fn,
+ */
+
+static PyObject *__pyx_pw_7fairseq_4data_15data_utils_fast_1batch_by_size_fast(PyObject *__pyx_self, 
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+); /*proto*/
+static PyObject *__pyx_f_7fairseq_4data_15data_utils_fast_batch_by_size_fast(PyArrayObject *__pyx_v_indices, PyObject *__pyx_v_num_tokens_fn, long __pyx_v_max_tokens, long __pyx_v_max_sentences, int __pyx_v_bsz_mult, CYTHON_UNUSED int __pyx_skip_dispatch) {
+  long __pyx_v_sample_len;
+  PyObject *__pyx_v_sample_lens = 0;
+  PyObject *__pyx_v_batch = 0;
+  PyObject *__pyx_v_batches = 0;
+  long __pyx_v_mod_len;
+  long __pyx_v_i;
+  long __pyx_v_idx;
+  long __pyx_v_num_tokens;
+  __Pyx_memviewslice __pyx_v_indices_view = { 0, 0, { 0 }, { 0 }, { 0 } };
+  __Pyx_LocalBuf_ND __pyx_pybuffernd_indices;
+  __Pyx_Buffer __pyx_pybuffer_indices;
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  __Pyx_memviewslice __pyx_t_2 = { 0, 0, { 0 }, { 0 }, { 0 } };
+  Py_ssize_t __pyx_t_3;
+  Py_ssize_t __pyx_t_4;
+  long __pyx_t_5;
+  Py_ssize_t __pyx_t_6;
+  int __pyx_t_7;
+  PyObject *__pyx_t_8 = NULL;
+  PyObject *__pyx_t_9 = NULL;
+  PyObject *__pyx_t_10 = NULL;
+  unsigned int __pyx_t_11;
+  long __pyx_t_12;
+  int __pyx_t_13;
+  long __pyx_t_14;
+  long __pyx_t_15;
+  int __pyx_t_16;
+  int __pyx_t_17;
+  PyObject *__pyx_t_18 = NULL;
+  PyObject *__pyx_t_19 = NULL;
+  Py_ssize_t __pyx_t_20;
+  Py_ssize_t __pyx_t_21;
+  Py_ssize_t __pyx_t_22;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("batch_by_size_fast", 1);
+  __pyx_pybuffer_indices.pybuffer.buf = NULL;
+  __pyx_pybuffer_indices.refcount = 0;
+  __pyx_pybuffernd_indices.data = NULL;
+  __pyx_pybuffernd_indices.rcbuffer = &__pyx_pybuffer_indices;
+  {
+    __Pyx_BufFmt_StackElem __pyx_stack[1];
+    if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_indices.rcbuffer->pybuffer, (PyObject*)__pyx_v_indices, &__Pyx_TypeInfo_nn___pyx_t_7fairseq_4data_15data_utils_fast_DTYPE_t, PyBUF_FORMAT| PyBUF_STRIDES, 1, 0, __pyx_stack) == -1)) __PYX_ERR(0, 27, __pyx_L1_error)
+  }
+  __pyx_pybuffernd_indices.diminfo[0].strides = __pyx_pybuffernd_indices.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_indices.diminfo[0].shape = __pyx_pybuffernd_indices.rcbuffer->pybuffer.shape[0];
+
+  /* "fairseq/data/data_utils_fast.pyx":34
+ *     int bsz_mult,
+ * ):
+ *     cdef long sample_len = 0             # <<<<<<<<<<<<<<
+ *     cdef list sample_lens = []
+ *     cdef list batch = []
+ */
+  __pyx_v_sample_len = 0;
+
+  /* "fairseq/data/data_utils_fast.pyx":35
+ * ):
+ *     cdef long sample_len = 0
+ *     cdef list sample_lens = []             # <<<<<<<<<<<<<<
+ *     cdef list batch = []
+ *     cdef list batches = []
+ */
+  __pyx_t_1 = PyList_New(0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 35, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_v_sample_lens = ((PyObject*)__pyx_t_1);
+  __pyx_t_1 = 0;
+
+  /* "fairseq/data/data_utils_fast.pyx":36
+ *     cdef long sample_len = 0
+ *     cdef list sample_lens = []
+ *     cdef list batch = []             # <<<<<<<<<<<<<<
+ *     cdef list batches = []
+ *     cdef long mod_len
+ */
+  __pyx_t_1 = PyList_New(0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 36, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_v_batch = ((PyObject*)__pyx_t_1);
+  __pyx_t_1 = 0;
+
+  /* "fairseq/data/data_utils_fast.pyx":37
+ *     cdef list sample_lens = []
+ *     cdef list batch = []
+ *     cdef list batches = []             # <<<<<<<<<<<<<<
+ *     cdef long mod_len
+ *     cdef long i
+ */
+  __pyx_t_1 = PyList_New(0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 37, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_v_batches = ((PyObject*)__pyx_t_1);
+  __pyx_t_1 = 0;
+
+  /* "fairseq/data/data_utils_fast.pyx":42
+ *     cdef long idx
+ *     cdef long num_tokens
+ *     cdef DTYPE_t[:] indices_view = indices             # <<<<<<<<<<<<<<
+ * 
+ *     for i in range(len(indices_view)):
+ */
+  __pyx_t_2 = __Pyx_PyObject_to_MemoryviewSlice_ds_nn___pyx_t_7fairseq_4data_15data_utils_fast_DTYPE_t(((PyObject *)__pyx_v_indices), PyBUF_WRITABLE); if (unlikely(!__pyx_t_2.memview)) __PYX_ERR(0, 42, __pyx_L1_error)
+  __pyx_v_indices_view = __pyx_t_2;
+  __pyx_t_2.memview = NULL;
+  __pyx_t_2.data = NULL;
+
+  /* "fairseq/data/data_utils_fast.pyx":44
+ *     cdef DTYPE_t[:] indices_view = indices
+ * 
+ *     for i in range(len(indices_view)):             # <<<<<<<<<<<<<<
+ *         idx = indices_view[i]
+ *         num_tokens = num_tokens_fn(idx)
+ */
+  __pyx_t_3 = __Pyx_MemoryView_Len(__pyx_v_indices_view); 
+  __pyx_t_4 = __pyx_t_3;
+  for (__pyx_t_5 = 0; __pyx_t_5 < __pyx_t_4; __pyx_t_5+=1) {
+    __pyx_v_i = __pyx_t_5;
+
+    /* "fairseq/data/data_utils_fast.pyx":45
+ * 
+ *     for i in range(len(indices_view)):
+ *         idx = indices_view[i]             # <<<<<<<<<<<<<<
+ *         num_tokens = num_tokens_fn(idx)
+ *         sample_lens.append(num_tokens)
+ */
+    __pyx_t_6 = __pyx_v_i;
+    __pyx_t_7 = -1;
+    if (__pyx_t_6 < 0) {
+      __pyx_t_6 += __pyx_v_indices_view.shape[0];
+      if (unlikely(__pyx_t_6 < 0)) __pyx_t_7 = 0;
+    } else if (unlikely(__pyx_t_6 >= __pyx_v_indices_view.shape[0])) __pyx_t_7 = 0;
+    if (unlikely(__pyx_t_7 != -1)) {
+      __Pyx_RaiseBufferIndexError(__pyx_t_7);
+      __PYX_ERR(0, 45, __pyx_L1_error)
+    }
+    __pyx_v_idx = (*((__pyx_t_7fairseq_4data_15data_utils_fast_DTYPE_t *) ( /* dim=0 */ (__pyx_v_indices_view.data + __pyx_t_6 * __pyx_v_indices_view.strides[0]) )));
+
+    /* "fairseq/data/data_utils_fast.pyx":46
+ *     for i in range(len(indices_view)):
+ *         idx = indices_view[i]
+ *         num_tokens = num_tokens_fn(idx)             # <<<<<<<<<<<<<<
+ *         sample_lens.append(num_tokens)
+ *         sample_len = max(sample_len, num_tokens)
+ */
+    __pyx_t_8 = __Pyx_PyInt_From_long(__pyx_v_idx); if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 46, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_8);
+    __Pyx_INCREF(__pyx_v_num_tokens_fn);
+    __pyx_t_9 = __pyx_v_num_tokens_fn; __pyx_t_10 = NULL;
+    __pyx_t_11 = 0;
+    #if CYTHON_UNPACK_METHODS
+    if (unlikely(PyMethod_Check(__pyx_t_9))) {
+      __pyx_t_10 = PyMethod_GET_SELF(__pyx_t_9);
+      if (likely(__pyx_t_10)) {
+        PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_9);
+        __Pyx_INCREF(__pyx_t_10);
+        __Pyx_INCREF(function);
+        __Pyx_DECREF_SET(__pyx_t_9, function);
+        __pyx_t_11 = 1;
+      }
+    }
+    #endif
+    {
+      PyObject *__pyx_callargs[2] = {__pyx_t_10, __pyx_t_8};
+      __pyx_t_1 = __Pyx_PyObject_FastCall(__pyx_t_9, __pyx_callargs+1-__pyx_t_11, 1+__pyx_t_11);
+      __Pyx_XDECREF(__pyx_t_10); __pyx_t_10 = 0;
+      __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0;
+      if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 46, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_1);
+      __Pyx_DECREF(__pyx_t_9); __pyx_t_9 = 0;
+    }
+    __pyx_t_12 = __Pyx_PyInt_As_long(__pyx_t_1); if (unlikely((__pyx_t_12 == (long)-1) && PyErr_Occurred())) __PYX_ERR(0, 46, __pyx_L1_error)
+    __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+    __pyx_v_num_tokens = __pyx_t_12;
+
+    /* "fairseq/data/data_utils_fast.pyx":47
+ *         idx = indices_view[i]
+ *         num_tokens = num_tokens_fn(idx)
+ *         sample_lens.append(num_tokens)             # <<<<<<<<<<<<<<
+ *         sample_len = max(sample_len, num_tokens)
+ * 
+ */
+    __pyx_t_1 = __Pyx_PyInt_From_long(__pyx_v_num_tokens); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 47, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_1);
+    __pyx_t_13 = __Pyx_PyList_Append(__pyx_v_sample_lens, __pyx_t_1); if (unlikely(__pyx_t_13 == ((int)-1))) __PYX_ERR(0, 47, __pyx_L1_error)
+    __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+
+    /* "fairseq/data/data_utils_fast.pyx":48
+ *         num_tokens = num_tokens_fn(idx)
+ *         sample_lens.append(num_tokens)
+ *         sample_len = max(sample_len, num_tokens)             # <<<<<<<<<<<<<<
+ * 
+ *         assert max_tokens <= 0 or sample_len <= max_tokens, (
+ */
+    __pyx_t_12 = __pyx_v_num_tokens;
+    __pyx_t_14 = __pyx_v_sample_len;
+    __pyx_t_16 = (__pyx_t_12 > __pyx_t_14);
+    if (__pyx_t_16) {
+      __pyx_t_15 = __pyx_t_12;
+    } else {
+      __pyx_t_15 = __pyx_t_14;
+    }
+    __pyx_v_sample_len = __pyx_t_15;
+
+    /* "fairseq/data/data_utils_fast.pyx":50
+ *         sample_len = max(sample_len, num_tokens)
+ * 
+ *         assert max_tokens <= 0 or sample_len <= max_tokens, (             # <<<<<<<<<<<<<<
+ *             "sentence at index {} of size {} exceeds max_tokens "
+ *             "limit of {}!".format(idx, sample_len, max_tokens)
+ */
+    #ifndef CYTHON_WITHOUT_ASSERTIONS
+    if (unlikely(__pyx_assertions_enabled())) {
+      __pyx_t_17 = (__pyx_v_max_tokens <= 0);
+      if (!__pyx_t_17) {
+      } else {
+        __pyx_t_16 = __pyx_t_17;
+        goto __pyx_L5_bool_binop_done;
+      }
+      __pyx_t_17 = (__pyx_v_sample_len <= __pyx_v_max_tokens);
+      __pyx_t_16 = __pyx_t_17;
+      __pyx_L5_bool_binop_done:;
+      if (unlikely(!__pyx_t_16)) {
+
+        /* "fairseq/data/data_utils_fast.pyx":52
+ *         assert max_tokens <= 0 or sample_len <= max_tokens, (
+ *             "sentence at index {} of size {} exceeds max_tokens "
+ *             "limit of {}!".format(idx, sample_len, max_tokens)             # <<<<<<<<<<<<<<
+ *         )
+ *         num_tokens = (len(batch) + 1) * sample_len
+ */
+        __pyx_t_9 = __Pyx_PyObject_GetAttrStr(__pyx_kp_u_sentence_at_index_of_size_exceed, __pyx_n_s_format); if (unlikely(!__pyx_t_9)) __PYX_ERR(0, 52, __pyx_L1_error)
+        __Pyx_GOTREF(__pyx_t_9);
+        __pyx_t_8 = __Pyx_PyInt_From_long(__pyx_v_idx); if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 52, __pyx_L1_error)
+        __Pyx_GOTREF(__pyx_t_8);
+        __pyx_t_10 = __Pyx_PyInt_From_long(__pyx_v_sample_len); if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 52, __pyx_L1_error)
+        __Pyx_GOTREF(__pyx_t_10);
+        __pyx_t_18 = __Pyx_PyInt_From_long(__pyx_v_max_tokens); if (unlikely(!__pyx_t_18)) __PYX_ERR(0, 52, __pyx_L1_error)
+        __Pyx_GOTREF(__pyx_t_18);
+        __pyx_t_19 = NULL;
+        __pyx_t_11 = 0;
+        #if CYTHON_UNPACK_METHODS
+        if (likely(PyMethod_Check(__pyx_t_9))) {
+          __pyx_t_19 = PyMethod_GET_SELF(__pyx_t_9);
+          if (likely(__pyx_t_19)) {
+            PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_9);
+            __Pyx_INCREF(__pyx_t_19);
+            __Pyx_INCREF(function);
+            __Pyx_DECREF_SET(__pyx_t_9, function);
+            __pyx_t_11 = 1;
+          }
+        }
+        #endif
+        {
+          PyObject *__pyx_callargs[4] = {__pyx_t_19, __pyx_t_8, __pyx_t_10, __pyx_t_18};
+          __pyx_t_1 = __Pyx_PyObject_FastCall(__pyx_t_9, __pyx_callargs+1-__pyx_t_11, 3+__pyx_t_11);
+          __Pyx_XDECREF(__pyx_t_19); __pyx_t_19 = 0;
+          __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0;
+          __Pyx_DECREF(__pyx_t_10); __pyx_t_10 = 0;
+          __Pyx_DECREF(__pyx_t_18); __pyx_t_18 = 0;
+          if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 52, __pyx_L1_error)
+          __Pyx_GOTREF(__pyx_t_1);
+          __Pyx_DECREF(__pyx_t_9); __pyx_t_9 = 0;
+        }
+        __pyx_t_9 = PyTuple_Pack(1, __pyx_t_1); if (unlikely(!__pyx_t_9)) __PYX_ERR(0, 52, __pyx_L1_error)
+        __Pyx_GOTREF(__pyx_t_9);
+        __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+        __Pyx_Raise(__pyx_builtin_AssertionError, __pyx_t_9, 0, 0);
+        __Pyx_DECREF(__pyx_t_9); __pyx_t_9 = 0;
+        __PYX_ERR(0, 50, __pyx_L1_error)
+      }
+    }
+    #else
+    if ((1)); else __PYX_ERR(0, 50, __pyx_L1_error)
+    #endif
+
+    /* "fairseq/data/data_utils_fast.pyx":54
+ *             "limit of {}!".format(idx, sample_len, max_tokens)
+ *         )
+ *         num_tokens = (len(batch) + 1) * sample_len             # <<<<<<<<<<<<<<
+ * 
+ *         if _is_batch_full(len(batch), num_tokens, max_tokens, max_sentences):
+ */
+    __pyx_t_20 = __Pyx_PyList_GET_SIZE(__pyx_v_batch); if (unlikely(__pyx_t_20 == ((Py_ssize_t)-1))) __PYX_ERR(0, 54, __pyx_L1_error)
+    __pyx_v_num_tokens = ((__pyx_t_20 + 1) * __pyx_v_sample_len);
+
+    /* "fairseq/data/data_utils_fast.pyx":56
+ *         num_tokens = (len(batch) + 1) * sample_len
+ * 
+ *         if _is_batch_full(len(batch), num_tokens, max_tokens, max_sentences):             # <<<<<<<<<<<<<<
+ *             mod_len = max(
+ *                 bsz_mult * (len(batch) // bsz_mult),
+ */
+    __pyx_t_20 = __Pyx_PyList_GET_SIZE(__pyx_v_batch); if (unlikely(__pyx_t_20 == ((Py_ssize_t)-1))) __PYX_ERR(0, 56, __pyx_L1_error)
+    __pyx_t_9 = __pyx_f_7fairseq_4data_15data_utils_fast__is_batch_full(__pyx_t_20, __pyx_v_num_tokens, __pyx_v_max_tokens, __pyx_v_max_sentences); if (unlikely(!__pyx_t_9)) __PYX_ERR(0, 56, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_9);
+    __pyx_t_16 = __Pyx_PyObject_IsTrue(__pyx_t_9); if (unlikely((__pyx_t_16 < 0))) __PYX_ERR(0, 56, __pyx_L1_error)
+    __Pyx_DECREF(__pyx_t_9); __pyx_t_9 = 0;
+    if (__pyx_t_16) {
+
+      /* "fairseq/data/data_utils_fast.pyx":59
+ *             mod_len = max(
+ *                 bsz_mult * (len(batch) // bsz_mult),
+ *                 len(batch) % bsz_mult,             # <<<<<<<<<<<<<<
+ *             )
+ *             batches.append(batch[:mod_len])
+ */
+      __pyx_t_20 = __Pyx_PyList_GET_SIZE(__pyx_v_batch); if (unlikely(__pyx_t_20 == ((Py_ssize_t)-1))) __PYX_ERR(0, 59, __pyx_L1_error)
+      __pyx_t_21 = (__pyx_t_20 % __pyx_v_bsz_mult);
+
+      /* "fairseq/data/data_utils_fast.pyx":58
+ *         if _is_batch_full(len(batch), num_tokens, max_tokens, max_sentences):
+ *             mod_len = max(
+ *                 bsz_mult * (len(batch) // bsz_mult),             # <<<<<<<<<<<<<<
+ *                 len(batch) % bsz_mult,
+ *             )
+ */
+      __pyx_t_20 = __Pyx_PyList_GET_SIZE(__pyx_v_batch); if (unlikely(__pyx_t_20 == ((Py_ssize_t)-1))) __PYX_ERR(0, 58, __pyx_L1_error)
+      __pyx_t_22 = (__pyx_v_bsz_mult * (__pyx_t_20 / __pyx_v_bsz_mult));
+
+      /* "fairseq/data/data_utils_fast.pyx":59
+ *             mod_len = max(
+ *                 bsz_mult * (len(batch) // bsz_mult),
+ *                 len(batch) % bsz_mult,             # <<<<<<<<<<<<<<
+ *             )
+ *             batches.append(batch[:mod_len])
+ */
+      __pyx_t_16 = (__pyx_t_21 > __pyx_t_22);
+      if (__pyx_t_16) {
+        __pyx_t_20 = __pyx_t_21;
+      } else {
+        __pyx_t_20 = __pyx_t_22;
+      }
+      __pyx_v_mod_len = __pyx_t_20;
+
+      /* "fairseq/data/data_utils_fast.pyx":61
+ *                 len(batch) % bsz_mult,
+ *             )
+ *             batches.append(batch[:mod_len])             # <<<<<<<<<<<<<<
+ *             batch = batch[mod_len:]
+ *             sample_lens = sample_lens[mod_len:]
+ */
+      __pyx_t_9 = __Pyx_PyList_GetSlice(__pyx_v_batch, 0, __pyx_v_mod_len); if (unlikely(!__pyx_t_9)) __PYX_ERR(0, 61, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_9);
+      __pyx_t_13 = __Pyx_PyList_Append(__pyx_v_batches, __pyx_t_9); if (unlikely(__pyx_t_13 == ((int)-1))) __PYX_ERR(0, 61, __pyx_L1_error)
+      __Pyx_DECREF(__pyx_t_9); __pyx_t_9 = 0;
+
+      /* "fairseq/data/data_utils_fast.pyx":62
+ *             )
+ *             batches.append(batch[:mod_len])
+ *             batch = batch[mod_len:]             # <<<<<<<<<<<<<<
+ *             sample_lens = sample_lens[mod_len:]
+ *             sample_len = max(sample_lens) if len(sample_lens) > 0 else 0
+ */
+      __pyx_t_9 = __Pyx_PyList_GetSlice(__pyx_v_batch, __pyx_v_mod_len, PY_SSIZE_T_MAX); if (unlikely(!__pyx_t_9)) __PYX_ERR(0, 62, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_9);
+      __Pyx_DECREF_SET(__pyx_v_batch, ((PyObject*)__pyx_t_9));
+      __pyx_t_9 = 0;
+
+      /* "fairseq/data/data_utils_fast.pyx":63
+ *             batches.append(batch[:mod_len])
+ *             batch = batch[mod_len:]
+ *             sample_lens = sample_lens[mod_len:]             # <<<<<<<<<<<<<<
+ *             sample_len = max(sample_lens) if len(sample_lens) > 0 else 0
+ *         batch.append(idx)
+ */
+      __pyx_t_9 = __Pyx_PyList_GetSlice(__pyx_v_sample_lens, __pyx_v_mod_len, PY_SSIZE_T_MAX); if (unlikely(!__pyx_t_9)) __PYX_ERR(0, 63, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_9);
+      __Pyx_DECREF_SET(__pyx_v_sample_lens, ((PyObject*)__pyx_t_9));
+      __pyx_t_9 = 0;
+
+      /* "fairseq/data/data_utils_fast.pyx":64
+ *             batch = batch[mod_len:]
+ *             sample_lens = sample_lens[mod_len:]
+ *             sample_len = max(sample_lens) if len(sample_lens) > 0 else 0             # <<<<<<<<<<<<<<
+ *         batch.append(idx)
+ *     if len(batch) > 0:
+ */
+      __pyx_t_20 = __Pyx_PyList_GET_SIZE(__pyx_v_sample_lens); if (unlikely(__pyx_t_20 == ((Py_ssize_t)-1))) __PYX_ERR(0, 64, __pyx_L1_error)
+      __pyx_t_16 = (__pyx_t_20 > 0);
+      if (__pyx_t_16) {
+        __pyx_t_9 = __Pyx_PyObject_CallOneArg(__pyx_builtin_max, __pyx_v_sample_lens); if (unlikely(!__pyx_t_9)) __PYX_ERR(0, 64, __pyx_L1_error)
+        __Pyx_GOTREF(__pyx_t_9);
+        __pyx_t_12 = __Pyx_PyInt_As_long(__pyx_t_9); if (unlikely((__pyx_t_12 == (long)-1) && PyErr_Occurred())) __PYX_ERR(0, 64, __pyx_L1_error)
+        __Pyx_DECREF(__pyx_t_9); __pyx_t_9 = 0;
+        __pyx_t_15 = __pyx_t_12;
+      } else {
+        __pyx_t_15 = 0;
+      }
+      __pyx_v_sample_len = __pyx_t_15;
+
+      /* "fairseq/data/data_utils_fast.pyx":56
+ *         num_tokens = (len(batch) + 1) * sample_len
+ * 
+ *         if _is_batch_full(len(batch), num_tokens, max_tokens, max_sentences):             # <<<<<<<<<<<<<<
+ *             mod_len = max(
+ *                 bsz_mult * (len(batch) // bsz_mult),
+ */
+    }
+
+    /* "fairseq/data/data_utils_fast.pyx":65
+ *             sample_lens = sample_lens[mod_len:]
+ *             sample_len = max(sample_lens) if len(sample_lens) > 0 else 0
+ *         batch.append(idx)             # <<<<<<<<<<<<<<
+ *     if len(batch) > 0:
+ *         batches.append(batch)
+ */
+    __pyx_t_9 = __Pyx_PyInt_From_long(__pyx_v_idx); if (unlikely(!__pyx_t_9)) __PYX_ERR(0, 65, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_9);
+    __pyx_t_13 = __Pyx_PyList_Append(__pyx_v_batch, __pyx_t_9); if (unlikely(__pyx_t_13 == ((int)-1))) __PYX_ERR(0, 65, __pyx_L1_error)
+    __Pyx_DECREF(__pyx_t_9); __pyx_t_9 = 0;
+  }
+
+  /* "fairseq/data/data_utils_fast.pyx":66
+ *             sample_len = max(sample_lens) if len(sample_lens) > 0 else 0
+ *         batch.append(idx)
+ *     if len(batch) > 0:             # <<<<<<<<<<<<<<
+ *         batches.append(batch)
+ *     return batches
+ */
+  __pyx_t_3 = __Pyx_PyList_GET_SIZE(__pyx_v_batch); if (unlikely(__pyx_t_3 == ((Py_ssize_t)-1))) __PYX_ERR(0, 66, __pyx_L1_error)
+  __pyx_t_16 = (__pyx_t_3 > 0);
+  if (__pyx_t_16) {
+
+    /* "fairseq/data/data_utils_fast.pyx":67
+ *         batch.append(idx)
+ *     if len(batch) > 0:
+ *         batches.append(batch)             # <<<<<<<<<<<<<<
+ *     return batches
+ * 
+ */
+    __pyx_t_13 = __Pyx_PyList_Append(__pyx_v_batches, __pyx_v_batch); if (unlikely(__pyx_t_13 == ((int)-1))) __PYX_ERR(0, 67, __pyx_L1_error)
+
+    /* "fairseq/data/data_utils_fast.pyx":66
+ *             sample_len = max(sample_lens) if len(sample_lens) > 0 else 0
+ *         batch.append(idx)
+ *     if len(batch) > 0:             # <<<<<<<<<<<<<<
+ *         batches.append(batch)
+ *     return batches
+ */
+  }
+
+  /* "fairseq/data/data_utils_fast.pyx":68
+ *     if len(batch) > 0:
+ *         batches.append(batch)
+ *     return batches             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  __Pyx_XDECREF(__pyx_r);
+  __Pyx_INCREF(__pyx_v_batches);
+  __pyx_r = __pyx_v_batches;
+  goto __pyx_L0;
+
+  /* "fairseq/data/data_utils_fast.pyx":27
+ * 
+ * @cython.cdivision(True)
+ * cpdef list batch_by_size_fast(             # <<<<<<<<<<<<<<
+ *     np.ndarray[DTYPE_t, ndim=1] indices,
+ *     num_tokens_fn,
+ */
+
+  /* function exit code */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_1);
+  __PYX_XCLEAR_MEMVIEW(&__pyx_t_2, 1);
+  __Pyx_XDECREF(__pyx_t_8);
+  __Pyx_XDECREF(__pyx_t_9);
+  __Pyx_XDECREF(__pyx_t_10);
+  __Pyx_XDECREF(__pyx_t_18);
+  __Pyx_XDECREF(__pyx_t_19);
+  { PyObject *__pyx_type, *__pyx_value, *__pyx_tb;
+    __Pyx_PyThreadState_declare
+    __Pyx_PyThreadState_assign
+    __Pyx_ErrFetch(&__pyx_type, &__pyx_value, &__pyx_tb);
+    __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_indices.rcbuffer->pybuffer);
+  __Pyx_ErrRestore(__pyx_type, __pyx_value, __pyx_tb);}
+  __Pyx_AddTraceback("fairseq.data.data_utils_fast.batch_by_size_fast", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = 0;
+  goto __pyx_L2;
+  __pyx_L0:;
+  __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_indices.rcbuffer->pybuffer);
+  __pyx_L2:;
+  __Pyx_XDECREF(__pyx_v_sample_lens);
+  __Pyx_XDECREF(__pyx_v_batch);
+  __Pyx_XDECREF(__pyx_v_batches);
+  __PYX_XCLEAR_MEMVIEW(&__pyx_v_indices_view, 1);
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* Python wrapper: unpacks (indices, num_tokens_fn, max_tokens, max_sentences, bsz_mult) from positional and/or keyword arguments, converts the three numeric ones to C integers, type-checks indices, then calls the __pyx_pf_ implementation. */
+static PyObject *__pyx_pw_7fairseq_4data_15data_utils_fast_1batch_by_size_fast(PyObject *__pyx_self, 
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+); /*proto*/
+static PyMethodDef __pyx_mdef_7fairseq_4data_15data_utils_fast_1batch_by_size_fast = {"batch_by_size_fast", (PyCFunction)(void*)(__Pyx_PyCFunction_FastCallWithKeywords)__pyx_pw_7fairseq_4data_15data_utils_fast_1batch_by_size_fast, __Pyx_METH_FASTCALL|METH_KEYWORDS, 0};
+static PyObject *__pyx_pw_7fairseq_4data_15data_utils_fast_1batch_by_size_fast(PyObject *__pyx_self, 
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+) {
+  PyArrayObject *__pyx_v_indices = 0;
+  PyObject *__pyx_v_num_tokens_fn = 0;
+  long __pyx_v_max_tokens;
+  long __pyx_v_max_sentences;
+  int __pyx_v_bsz_mult;
+  #if !CYTHON_METH_FASTCALL
+  CYTHON_UNUSED Py_ssize_t __pyx_nargs;
+  #endif
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject* values[5] = {0,0,0,0,0}; /* one slot per declared argument, in the order listed in __pyx_pyargnames below */
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("batch_by_size_fast (wrapper)", 0);
+  #if !CYTHON_METH_FASTCALL
+  #if CYTHON_ASSUME_SAFE_MACROS
+  __pyx_nargs = PyTuple_GET_SIZE(__pyx_args);
+  #else
+  __pyx_nargs = PyTuple_Size(__pyx_args); if (unlikely(__pyx_nargs < 0)) return NULL;
+  #endif
+  #endif
+  __pyx_kwvalues = __Pyx_KwValues_FASTCALL(__pyx_args, __pyx_nargs);
+  {
+    PyObject **__pyx_pyargnames[] = {&__pyx_n_s_indices,&__pyx_n_s_num_tokens_fn,&__pyx_n_s_max_tokens,&__pyx_n_s_max_sentences,&__pyx_n_s_bsz_mult,0};
+    if (__pyx_kwds) { /* keywords present: copy the positionals first, then fill the remaining slots by name */
+      Py_ssize_t kw_args;
+      switch (__pyx_nargs) { /* deliberate fall-through: N positionals fill values[N-1] down to values[0] */
+        case  5: values[4] = __Pyx_Arg_FASTCALL(__pyx_args, 4);
+        CYTHON_FALLTHROUGH;
+        case  4: values[3] = __Pyx_Arg_FASTCALL(__pyx_args, 3);
+        CYTHON_FALLTHROUGH;
+        case  3: values[2] = __Pyx_Arg_FASTCALL(__pyx_args, 2);
+        CYTHON_FALLTHROUGH;
+        case  2: values[1] = __Pyx_Arg_FASTCALL(__pyx_args, 1);
+        CYTHON_FALLTHROUGH;
+        case  1: values[0] = __Pyx_Arg_FASTCALL(__pyx_args, 0);
+        CYTHON_FALLTHROUGH;
+        case  0: break;
+        default: goto __pyx_L5_argtuple_error;
+      }
+      kw_args = __Pyx_NumKwargs_FASTCALL(__pyx_kwds);
+      switch (__pyx_nargs) { /* enter at the first slot not given positionally and fall through to the last; every argument is required */
+        case  0:
+        if (likely((values[0] = __Pyx_GetKwValue_FASTCALL(__pyx_kwds, __pyx_kwvalues, __pyx_n_s_indices)) != 0)) {
+          (void)__Pyx_Arg_NewRef_FASTCALL(values[0]);
+          kw_args--;
+        }
+        else if (unlikely(PyErr_Occurred())) __PYX_ERR(0, 27, __pyx_L3_error)
+        else goto __pyx_L5_argtuple_error;
+        CYTHON_FALLTHROUGH;
+        case  1:
+        if (likely((values[1] = __Pyx_GetKwValue_FASTCALL(__pyx_kwds, __pyx_kwvalues, __pyx_n_s_num_tokens_fn)) != 0)) {
+          (void)__Pyx_Arg_NewRef_FASTCALL(values[1]);
+          kw_args--;
+        }
+        else if (unlikely(PyErr_Occurred())) __PYX_ERR(0, 27, __pyx_L3_error)
+        else {
+          __Pyx_RaiseArgtupleInvalid("batch_by_size_fast", 1, 5, 5, 1); __PYX_ERR(0, 27, __pyx_L3_error)
+        }
+        CYTHON_FALLTHROUGH;
+        case  2:
+        if (likely((values[2] = __Pyx_GetKwValue_FASTCALL(__pyx_kwds, __pyx_kwvalues, __pyx_n_s_max_tokens)) != 0)) {
+          (void)__Pyx_Arg_NewRef_FASTCALL(values[2]);
+          kw_args--;
+        }
+        else if (unlikely(PyErr_Occurred())) __PYX_ERR(0, 27, __pyx_L3_error)
+        else {
+          __Pyx_RaiseArgtupleInvalid("batch_by_size_fast", 1, 5, 5, 2); __PYX_ERR(0, 27, __pyx_L3_error)
+        }
+        CYTHON_FALLTHROUGH;
+        case  3:
+        if (likely((values[3] = __Pyx_GetKwValue_FASTCALL(__pyx_kwds, __pyx_kwvalues, __pyx_n_s_max_sentences)) != 0)) {
+          (void)__Pyx_Arg_NewRef_FASTCALL(values[3]);
+          kw_args--;
+        }
+        else if (unlikely(PyErr_Occurred())) __PYX_ERR(0, 27, __pyx_L3_error)
+        else {
+          __Pyx_RaiseArgtupleInvalid("batch_by_size_fast", 1, 5, 5, 3); __PYX_ERR(0, 27, __pyx_L3_error)
+        }
+        CYTHON_FALLTHROUGH;
+        case  4:
+        if (likely((values[4] = __Pyx_GetKwValue_FASTCALL(__pyx_kwds, __pyx_kwvalues, __pyx_n_s_bsz_mult)) != 0)) {
+          (void)__Pyx_Arg_NewRef_FASTCALL(values[4]);
+          kw_args--;
+        }
+        else if (unlikely(PyErr_Occurred())) __PYX_ERR(0, 27, __pyx_L3_error)
+        else {
+          __Pyx_RaiseArgtupleInvalid("batch_by_size_fast", 1, 5, 5, 4); __PYX_ERR(0, 27, __pyx_L3_error)
+        }
+      }
+      if (unlikely(kw_args > 0)) { /* keywords still unaccounted for after the named lookups above; handed to __Pyx_ParseOptionalKeywords (helper defined outside this chunk) */
+        const Py_ssize_t kwd_pos_args = __pyx_nargs;
+        if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_kwvalues, __pyx_pyargnames, 0, values + 0, kwd_pos_args, "batch_by_size_fast") < 0)) __PYX_ERR(0, 27, __pyx_L3_error)
+      }
+    } else if (unlikely(__pyx_nargs != 5)) { /* no keywords: exactly five positionals are required */
+      goto __pyx_L5_argtuple_error;
+    } else {
+      values[0] = __Pyx_Arg_FASTCALL(__pyx_args, 0);
+      values[1] = __Pyx_Arg_FASTCALL(__pyx_args, 1);
+      values[2] = __Pyx_Arg_FASTCALL(__pyx_args, 2);
+      values[3] = __Pyx_Arg_FASTCALL(__pyx_args, 3);
+      values[4] = __Pyx_Arg_FASTCALL(__pyx_args, 4);
+    }
+    __pyx_v_indices = ((PyArrayObject *)values[0]); /* plain cast here; the actual ndarray type test happens after unpacking */
+    __pyx_v_num_tokens_fn = values[1];
+    __pyx_v_max_tokens = __Pyx_PyInt_As_long(values[2]); if (unlikely((__pyx_v_max_tokens == (long)-1) && PyErr_Occurred())) __PYX_ERR(0, 30, __pyx_L3_error)
+    __pyx_v_max_sentences = __Pyx_PyInt_As_long(values[3]); if (unlikely((__pyx_v_max_sentences == (long)-1) && PyErr_Occurred())) __PYX_ERR(0, 31, __pyx_L3_error)
+    __pyx_v_bsz_mult = __Pyx_PyInt_As_int(values[4]); if (unlikely((__pyx_v_bsz_mult == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 32, __pyx_L3_error)
+  }
+  goto __pyx_L6_skip;
+  __pyx_L5_argtuple_error:;
+  __Pyx_RaiseArgtupleInvalid("batch_by_size_fast", 1, 5, 5, __pyx_nargs); __PYX_ERR(0, 27, __pyx_L3_error)
+  __pyx_L6_skip:;
+  goto __pyx_L4_argument_unpacking_done;
+  __pyx_L3_error:; /* argument-unpacking failure: release values[], add a traceback entry, return NULL */
+  {
+    Py_ssize_t __pyx_temp;
+    for (__pyx_temp=0; __pyx_temp < (Py_ssize_t)(sizeof(values)/sizeof(values[0])); ++__pyx_temp) {
+      __Pyx_Arg_XDECREF_FASTCALL(values[__pyx_temp]);
+    }
+  }
+  __Pyx_AddTraceback("fairseq.data.data_utils_fast.batch_by_size_fast", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __Pyx_RefNannyFinishContext();
+  return NULL;
+  __pyx_L4_argument_unpacking_done:;
+  if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_indices), __pyx_ptype_5numpy_ndarray, 1, "indices", 0))) __PYX_ERR(0, 28, __pyx_L1_error)
+  __pyx_r = __pyx_pf_7fairseq_4data_15data_utils_fast_batch_by_size_fast(__pyx_self, __pyx_v_indices, __pyx_v_num_tokens_fn, __pyx_v_max_tokens, __pyx_v_max_sentences, __pyx_v_bsz_mult);
+
+  /* function exit code: the success path and the type-test failure path both release values[] before returning __pyx_r */
+  goto __pyx_L0;
+  __pyx_L1_error:;
+  __pyx_r = NULL;
+  __pyx_L0:;
+  {
+    Py_ssize_t __pyx_temp;
+    for (__pyx_temp=0; __pyx_temp < (Py_ssize_t)(sizeof(values)/sizeof(values[0])); ++__pyx_temp) {
+      __Pyx_Arg_XDECREF_FASTCALL(values[__pyx_temp]);
+    }
+  }
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static PyObject *__pyx_pf_7fairseq_4data_15data_utils_fast_batch_by_size_fast(CYTHON_UNUSED PyObject *__pyx_self, PyArrayObject *__pyx_v_indices, PyObject *__pyx_v_num_tokens_fn, long __pyx_v_max_tokens, long __pyx_v_max_sentences, int __pyx_v_bsz_mult) { /* Python-facing body: validates and acquires a buffer on indices, calls the C-level batch_by_size_fast, and releases the buffer on every exit path; returns the callee's result or NULL on error. */
+  __Pyx_LocalBuf_ND __pyx_pybuffernd_indices;
+  __Pyx_Buffer __pyx_pybuffer_indices;
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("batch_by_size_fast", 1);
+  __pyx_pybuffer_indices.pybuffer.buf = NULL; /* buffer starts out empty so the release calls on the exit paths are valid even if acquisition fails */
+  __pyx_pybuffer_indices.refcount = 0;
+  __pyx_pybuffernd_indices.data = NULL;
+  __pyx_pybuffernd_indices.rcbuffer = &__pyx_pybuffer_indices;
+  {
+    __Pyx_BufFmt_StackElem __pyx_stack[1];
+    if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_indices.rcbuffer->pybuffer, (PyObject*)__pyx_v_indices, &__Pyx_TypeInfo_nn___pyx_t_7fairseq_4data_15data_utils_fast_DTYPE_t, PyBUF_FORMAT| PyBUF_STRIDES, 1, 0, __pyx_stack) == -1)) __PYX_ERR(0, 27, __pyx_L1_error) /* requests a DTYPE_t buffer with format and strides; the literal 1 is presumably the expected ndim (the .pyx declares ndim=1) - TODO confirm against the helper */
+  }
+  __pyx_pybuffernd_indices.diminfo[0].strides = __pyx_pybuffernd_indices.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_indices.diminfo[0].shape = __pyx_pybuffernd_indices.rcbuffer->pybuffer.shape[0];
+  __Pyx_XDECREF(__pyx_r);
+  __pyx_t_1 = __pyx_f_7fairseq_4data_15data_utils_fast_batch_by_size_fast(__pyx_v_indices, __pyx_v_num_tokens_fn, __pyx_v_max_tokens, __pyx_v_max_sentences, __pyx_v_bsz_mult, 0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 27, __pyx_L1_error) /* NOTE(review): the trailing 0 is presumably the __pyx_skip_dispatch argument; the callee's signature is outside this chunk */
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_r = __pyx_t_1; /* ownership of the result moves from the temporary to the return slot */
+  __pyx_t_1 = 0;
+  goto __pyx_L0;
+
+  /* function exit code: the error path fetches the pending exception, releases the buffer, restores the exception, then adds a traceback entry and returns NULL */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_1);
+  { PyObject *__pyx_type, *__pyx_value, *__pyx_tb;
+    __Pyx_PyThreadState_declare
+    __Pyx_PyThreadState_assign
+    __Pyx_ErrFetch(&__pyx_type, &__pyx_value, &__pyx_tb);
+    __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_indices.rcbuffer->pybuffer);
+  __Pyx_ErrRestore(__pyx_type, __pyx_value, __pyx_tb);}
+  __Pyx_AddTraceback("fairseq.data.data_utils_fast.batch_by_size_fast", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  goto __pyx_L2;
+  __pyx_L0:;
+  __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_indices.rcbuffer->pybuffer);
+  __pyx_L2:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "fairseq/data/data_utils_fast.pyx":71
+ * 
+ * 
+ * cdef _find_valid_shape(             # <<<<<<<<<<<<<<
+ *     DTYPE_t[:, :] shapes_view,
+ *     long num_sentences,
+ */
+
+static PyObject *__pyx_f_7fairseq_4data_15data_utils_fast__find_valid_shape(__Pyx_memviewslice __pyx_v_shapes_view, long __pyx_v_num_sentences, long __pyx_v_num_tokens) { /* Scans the rows of shapes_view in order and returns, as a new Python int, the first row index i with num_sentences <= shapes_view[i][0] and num_tokens <= shapes_view[i][1]; returns the cached -1 object if no row qualifies, or NULL with an exception set on an index error. */
+  Py_ssize_t __pyx_v_i;
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  Py_ssize_t __pyx_t_1;
+  Py_ssize_t __pyx_t_2;
+  Py_ssize_t __pyx_t_3;
+  int __pyx_t_4;
+  Py_ssize_t __pyx_t_5;
+  Py_ssize_t __pyx_t_6;
+  int __pyx_t_7;
+  int __pyx_t_8;
+  PyObject *__pyx_t_9 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("_find_valid_shape", 1);
+
+  /* "fairseq/data/data_utils_fast.pyx":77
+ * ):
+ *     """Return index of first valid shape of -1 if none is found."""
+ *     for i in range(shapes_view.shape[0]):             # <<<<<<<<<<<<<<
+ *         if num_sentences <= shapes_view[i][0] and num_tokens <= shapes_view[i][1]:
+ *             return i
+ */
+  __pyx_t_1 = (__pyx_v_shapes_view.shape[0]); /* row count is read once, before the loop starts */
+  __pyx_t_2 = __pyx_t_1;
+  for (__pyx_t_3 = 0; __pyx_t_3 < __pyx_t_2; __pyx_t_3+=1) {
+    __pyx_v_i = __pyx_t_3;
+
+    /* "fairseq/data/data_utils_fast.pyx":78
+ *     """Return index of first valid shape of -1 if none is found."""
+ *     for i in range(shapes_view.shape[0]):
+ *         if num_sentences <= shapes_view[i][0] and num_tokens <= shapes_view[i][1]:             # <<<<<<<<<<<<<<
+ *             return i
+ *     return -1
+ */
+    __pyx_t_5 = __pyx_v_i; /* row index for shapes_view[i][0] */
+    __pyx_t_6 = 0; /* column index 0 */
+    __pyx_t_7 = -1; /* bounds-check result: -1 means both indices are in range, otherwise the offending dimension (0 or 1) */
+    if (__pyx_t_5 < 0) {
+      __pyx_t_5 += __pyx_v_shapes_view.shape[0];
+      if (unlikely(__pyx_t_5 < 0)) __pyx_t_7 = 0;
+    } else if (unlikely(__pyx_t_5 >= __pyx_v_shapes_view.shape[0])) __pyx_t_7 = 0;
+    if (__pyx_t_6 < 0) {
+      __pyx_t_6 += __pyx_v_shapes_view.shape[1];
+      if (unlikely(__pyx_t_6 < 0)) __pyx_t_7 = 1;
+    } else if (unlikely(__pyx_t_6 >= __pyx_v_shapes_view.shape[1])) __pyx_t_7 = 1;
+    if (unlikely(__pyx_t_7 != -1)) {
+      __Pyx_RaiseBufferIndexError(__pyx_t_7);
+      __PYX_ERR(0, 78, __pyx_L1_error)
+    }
+    __pyx_t_8 = (__pyx_v_num_sentences <= (*((__pyx_t_7fairseq_4data_15data_utils_fast_DTYPE_t *) ( /* dim=1 */ (( /* dim=0 */ (__pyx_v_shapes_view.data + __pyx_t_5 * __pyx_v_shapes_view.strides[0]) ) + __pyx_t_6 * __pyx_v_shapes_view.strides[1]) ))));
+    if (__pyx_t_8) { /* short-circuit 'and': the second comparison is evaluated only when the first one held */
+    } else {
+      __pyx_t_4 = __pyx_t_8;
+      goto __pyx_L6_bool_binop_done;
+    }
+    __pyx_t_6 = __pyx_v_i; /* temporaries swap roles here: t_6 is now the row index, t_5 the column index */
+    __pyx_t_5 = 1; /* column index 1 */
+    __pyx_t_7 = -1;
+    if (__pyx_t_6 < 0) {
+      __pyx_t_6 += __pyx_v_shapes_view.shape[0];
+      if (unlikely(__pyx_t_6 < 0)) __pyx_t_7 = 0;
+    } else if (unlikely(__pyx_t_6 >= __pyx_v_shapes_view.shape[0])) __pyx_t_7 = 0;
+    if (__pyx_t_5 < 0) {
+      __pyx_t_5 += __pyx_v_shapes_view.shape[1];
+      if (unlikely(__pyx_t_5 < 0)) __pyx_t_7 = 1;
+    } else if (unlikely(__pyx_t_5 >= __pyx_v_shapes_view.shape[1])) __pyx_t_7 = 1;
+    if (unlikely(__pyx_t_7 != -1)) {
+      __Pyx_RaiseBufferIndexError(__pyx_t_7);
+      __PYX_ERR(0, 78, __pyx_L1_error)
+    }
+    __pyx_t_8 = (__pyx_v_num_tokens <= (*((__pyx_t_7fairseq_4data_15data_utils_fast_DTYPE_t *) ( /* dim=1 */ (( /* dim=0 */ (__pyx_v_shapes_view.data + __pyx_t_6 * __pyx_v_shapes_view.strides[0]) ) + __pyx_t_5 * __pyx_v_shapes_view.strides[1]) ))));
+    __pyx_t_4 = __pyx_t_8;
+    __pyx_L6_bool_binop_done:;
+    if (__pyx_t_4) {
+
+      /* "fairseq/data/data_utils_fast.pyx":79
+ *     for i in range(shapes_view.shape[0]):
+ *         if num_sentences <= shapes_view[i][0] and num_tokens <= shapes_view[i][1]:
+ *             return i             # <<<<<<<<<<<<<<
+ *     return -1
+ * 
+ */
+      __Pyx_XDECREF(__pyx_r);
+      __pyx_t_9 = PyInt_FromSsize_t(__pyx_v_i); if (unlikely(!__pyx_t_9)) __PYX_ERR(0, 79, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_9);
+      __pyx_r = __pyx_t_9;
+      __pyx_t_9 = 0;
+      goto __pyx_L0;
+
+      /* "fairseq/data/data_utils_fast.pyx":78
+ *     """Return index of first valid shape of -1 if none is found."""
+ *     for i in range(shapes_view.shape[0]):
+ *         if num_sentences <= shapes_view[i][0] and num_tokens <= shapes_view[i][1]:             # <<<<<<<<<<<<<<
+ *             return i
+ *     return -1
+ */
+    }
+  }
+
+  /* "fairseq/data/data_utils_fast.pyx":80
+ *         if num_sentences <= shapes_view[i][0] and num_tokens <= shapes_view[i][1]:
+ *             return i
+ *     return -1             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  __Pyx_XDECREF(__pyx_r);
+  __Pyx_INCREF(__pyx_int_neg_1);
+  __pyx_r = __pyx_int_neg_1;
+  goto __pyx_L0;
+
+  /* "fairseq/data/data_utils_fast.pyx":71
+ * 
+ * 
+ * cdef _find_valid_shape(             # <<<<<<<<<<<<<<
+ *     DTYPE_t[:, :] shapes_view,
+ *     long num_sentences,
+ */
+
+  /* function exit code: on error drop the temporary, add a traceback entry and return NULL */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_9);
+  __Pyx_AddTraceback("fairseq.data.data_utils_fast._find_valid_shape", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = 0;
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "fairseq/data/data_utils_fast.pyx":84
+ * 
+ * @cython.cdivision(True)
+ * cpdef list batch_fixed_shapes_fast(             # <<<<<<<<<<<<<<
+ *     np.ndarray[DTYPE_t, ndim=1] indices,
+ *     num_tokens_fn,
+ */
+
+static PyObject *__pyx_pw_7fairseq_4data_15data_utils_fast_3batch_fixed_shapes_fast(PyObject *__pyx_self, 
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+); /*proto*/
+static PyObject *__pyx_f_7fairseq_4data_15data_utils_fast_batch_fixed_shapes_fast(PyArrayObject *__pyx_v_indices, PyObject *__pyx_v_num_tokens_fn, PyArrayObject *__pyx_v_fixed_shapes_sorted, CYTHON_UNUSED int __pyx_skip_dispatch) {
+  long __pyx_v_sample_len;
+  PyObject *__pyx_v_sample_lens = 0;
+  PyObject *__pyx_v_batch = 0;
+  PyObject *__pyx_v_batches = 0;
+  long __pyx_v_i;
+  long __pyx_v_idx;
+  long __pyx_v_num_tokens;
+  __Pyx_memviewslice __pyx_v_indices_view = { 0, 0, { 0 }, { 0 }, { 0 } };
+  __Pyx_memviewslice __pyx_v_shapes_view = { 0, 0, { 0 }, { 0 }, { 0 } };
+  PyObject *__pyx_v_shape_idx = NULL;
+  __Pyx_LocalBuf_ND __pyx_pybuffernd_fixed_shapes_sorted;
+  __Pyx_Buffer __pyx_pybuffer_fixed_shapes_sorted;
+  __Pyx_LocalBuf_ND __pyx_pybuffernd_indices;
+  __Pyx_Buffer __pyx_pybuffer_indices;
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  __Pyx_memviewslice __pyx_t_2 = { 0, 0, { 0 }, { 0 }, { 0 } };
+  __Pyx_memviewslice __pyx_t_3 = { 0, 0, { 0 }, { 0 }, { 0 } };
+  Py_ssize_t __pyx_t_4;
+  Py_ssize_t __pyx_t_5;
+  long __pyx_t_6;
+  Py_ssize_t __pyx_t_7;
+  int __pyx_t_8;
+  PyObject *__pyx_t_9 = NULL;
+  PyObject *__pyx_t_10 = NULL;
+  PyObject *__pyx_t_11 = NULL;
+  unsigned int __pyx_t_12;
+  long __pyx_t_13;
+  int __pyx_t_14;
+  long __pyx_t_15;
+  long __pyx_t_16;
+  int __pyx_t_17;
+  Py_ssize_t __pyx_t_18;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("batch_fixed_shapes_fast", 1);
+  __pyx_pybuffer_indices.pybuffer.buf = NULL;
+  __pyx_pybuffer_indices.refcount = 0;
+  __pyx_pybuffernd_indices.data = NULL;
+  __pyx_pybuffernd_indices.rcbuffer = &__pyx_pybuffer_indices;
+  __pyx_pybuffer_fixed_shapes_sorted.pybuffer.buf = NULL;
+  __pyx_pybuffer_fixed_shapes_sorted.refcount = 0;
+  __pyx_pybuffernd_fixed_shapes_sorted.data = NULL;
+  __pyx_pybuffernd_fixed_shapes_sorted.rcbuffer = &__pyx_pybuffer_fixed_shapes_sorted;
+  {
+    __Pyx_BufFmt_StackElem __pyx_stack[1];
+    if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_indices.rcbuffer->pybuffer, (PyObject*)__pyx_v_indices, &__Pyx_TypeInfo_nn___pyx_t_7fairseq_4data_15data_utils_fast_DTYPE_t, PyBUF_FORMAT| PyBUF_STRIDES, 1, 0, __pyx_stack) == -1)) __PYX_ERR(0, 84, __pyx_L1_error)
+  }
+  __pyx_pybuffernd_indices.diminfo[0].strides = __pyx_pybuffernd_indices.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_indices.diminfo[0].shape = __pyx_pybuffernd_indices.rcbuffer->pybuffer.shape[0];
+  {
+    __Pyx_BufFmt_StackElem __pyx_stack[1];
+    if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_fixed_shapes_sorted.rcbuffer->pybuffer, (PyObject*)__pyx_v_fixed_shapes_sorted, &__Pyx_TypeInfo_nn___pyx_t_7fairseq_4data_15data_utils_fast_DTYPE_t, PyBUF_FORMAT| PyBUF_STRIDES, 2, 0, __pyx_stack) == -1)) __PYX_ERR(0, 84, __pyx_L1_error)
+  }
+  __pyx_pybuffernd_fixed_shapes_sorted.diminfo[0].strides = __pyx_pybuffernd_fixed_shapes_sorted.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_fixed_shapes_sorted.diminfo[0].shape = __pyx_pybuffernd_fixed_shapes_sorted.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_fixed_shapes_sorted.diminfo[1].strides = __pyx_pybuffernd_fixed_shapes_sorted.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_fixed_shapes_sorted.diminfo[1].shape = __pyx_pybuffernd_fixed_shapes_sorted.rcbuffer->pybuffer.shape[1];
+
+  /* "fairseq/data/data_utils_fast.pyx":89
+ *     np.ndarray[DTYPE_t, ndim=2] fixed_shapes_sorted,
+ * ):
+ *     cdef long sample_len = 0             # <<<<<<<<<<<<<<
+ *     cdef list sample_lens = []
+ *     cdef list batch = []
+ */
+  __pyx_v_sample_len = 0;
+
+  /* "fairseq/data/data_utils_fast.pyx":90
+ * ):
+ *     cdef long sample_len = 0
+ *     cdef list sample_lens = []             # <<<<<<<<<<<<<<
+ *     cdef list batch = []
+ *     cdef list batches = []
+ */
+  __pyx_t_1 = PyList_New(0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 90, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_v_sample_lens = ((PyObject*)__pyx_t_1);
+  __pyx_t_1 = 0;
+
+  /* "fairseq/data/data_utils_fast.pyx":91
+ *     cdef long sample_len = 0
+ *     cdef list sample_lens = []
+ *     cdef list batch = []             # <<<<<<<<<<<<<<
+ *     cdef list batches = []
+ *     cdef long mod_len
+ */
+  __pyx_t_1 = PyList_New(0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 91, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_v_batch = ((PyObject*)__pyx_t_1);
+  __pyx_t_1 = 0;
+
+  /* "fairseq/data/data_utils_fast.pyx":92
+ *     cdef list sample_lens = []
+ *     cdef list batch = []
+ *     cdef list batches = []             # <<<<<<<<<<<<<<
+ *     cdef long mod_len
+ *     cdef long i
+ */
+  __pyx_t_1 = PyList_New(0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 92, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_v_batches = ((PyObject*)__pyx_t_1);
+  __pyx_t_1 = 0;
+
+  /* "fairseq/data/data_utils_fast.pyx":97
+ *     cdef long idx
+ *     cdef long num_tokens
+ *     cdef DTYPE_t[:] indices_view = indices             # <<<<<<<<<<<<<<
+ *     cdef DTYPE_t[:, :] shapes_view = fixed_shapes_sorted
+ * 
+ */
+  __pyx_t_2 = __Pyx_PyObject_to_MemoryviewSlice_ds_nn___pyx_t_7fairseq_4data_15data_utils_fast_DTYPE_t(((PyObject *)__pyx_v_indices), PyBUF_WRITABLE); if (unlikely(!__pyx_t_2.memview)) __PYX_ERR(0, 97, __pyx_L1_error)
+  __pyx_v_indices_view = __pyx_t_2;
+  __pyx_t_2.memview = NULL;
+  __pyx_t_2.data = NULL;
+
+  /* "fairseq/data/data_utils_fast.pyx":98
+ *     cdef long num_tokens
+ *     cdef DTYPE_t[:] indices_view = indices
+ *     cdef DTYPE_t[:, :] shapes_view = fixed_shapes_sorted             # <<<<<<<<<<<<<<
+ * 
+ *     for i in range(len(indices_view)):
+ */
+  __pyx_t_3 = __Pyx_PyObject_to_MemoryviewSlice_dsds_nn___pyx_t_7fairseq_4data_15data_utils_fast_DTYPE_t(((PyObject *)__pyx_v_fixed_shapes_sorted), PyBUF_WRITABLE); if (unlikely(!__pyx_t_3.memview)) __PYX_ERR(0, 98, __pyx_L1_error)
+  __pyx_v_shapes_view = __pyx_t_3;
+  __pyx_t_3.memview = NULL;
+  __pyx_t_3.data = NULL;
+
+  /* "fairseq/data/data_utils_fast.pyx":100
+ *     cdef DTYPE_t[:, :] shapes_view = fixed_shapes_sorted
+ * 
+ *     for i in range(len(indices_view)):             # <<<<<<<<<<<<<<
+ *         idx = indices_view[i]
+ *         num_tokens = num_tokens_fn(idx)
+ */
+  __pyx_t_4 = __Pyx_MemoryView_Len(__pyx_v_indices_view); 
+  __pyx_t_5 = __pyx_t_4;
+  for (__pyx_t_6 = 0; __pyx_t_6 < __pyx_t_5; __pyx_t_6+=1) {
+    __pyx_v_i = __pyx_t_6;
+
+    /* "fairseq/data/data_utils_fast.pyx":101
+ * 
+ *     for i in range(len(indices_view)):
+ *         idx = indices_view[i]             # <<<<<<<<<<<<<<
+ *         num_tokens = num_tokens_fn(idx)
+ *         sample_lens.append(num_tokens)
+ */
+    __pyx_t_7 = __pyx_v_i;
+    __pyx_t_8 = -1;
+    if (__pyx_t_7 < 0) {
+      __pyx_t_7 += __pyx_v_indices_view.shape[0];
+      if (unlikely(__pyx_t_7 < 0)) __pyx_t_8 = 0;
+    } else if (unlikely(__pyx_t_7 >= __pyx_v_indices_view.shape[0])) __pyx_t_8 = 0;
+    if (unlikely(__pyx_t_8 != -1)) {
+      __Pyx_RaiseBufferIndexError(__pyx_t_8);
+      __PYX_ERR(0, 101, __pyx_L1_error)
+    }
+    __pyx_v_idx = (*((__pyx_t_7fairseq_4data_15data_utils_fast_DTYPE_t *) ( /* dim=0 */ (__pyx_v_indices_view.data + __pyx_t_7 * __pyx_v_indices_view.strides[0]) )));
+
+    /* "fairseq/data/data_utils_fast.pyx":102
+ *     for i in range(len(indices_view)):
+ *         idx = indices_view[i]
+ *         num_tokens = num_tokens_fn(idx)             # <<<<<<<<<<<<<<
+ *         sample_lens.append(num_tokens)
+ *         sample_len = max(sample_len, num_tokens)
+ */
+    __pyx_t_9 = __Pyx_PyInt_From_long(__pyx_v_idx); if (unlikely(!__pyx_t_9)) __PYX_ERR(0, 102, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_9);
+    __Pyx_INCREF(__pyx_v_num_tokens_fn);
+    __pyx_t_10 = __pyx_v_num_tokens_fn; __pyx_t_11 = NULL;
+    __pyx_t_12 = 0;
+    #if CYTHON_UNPACK_METHODS
+    if (unlikely(PyMethod_Check(__pyx_t_10))) {
+      __pyx_t_11 = PyMethod_GET_SELF(__pyx_t_10);
+      if (likely(__pyx_t_11)) {
+        PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_10);
+        __Pyx_INCREF(__pyx_t_11);
+        __Pyx_INCREF(function);
+        __Pyx_DECREF_SET(__pyx_t_10, function);
+        __pyx_t_12 = 1;
+      }
+    }
+    #endif
+    {
+      PyObject *__pyx_callargs[2] = {__pyx_t_11, __pyx_t_9};
+      __pyx_t_1 = __Pyx_PyObject_FastCall(__pyx_t_10, __pyx_callargs+1-__pyx_t_12, 1+__pyx_t_12);
+      __Pyx_XDECREF(__pyx_t_11); __pyx_t_11 = 0;
+      __Pyx_DECREF(__pyx_t_9); __pyx_t_9 = 0;
+      if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 102, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_1);
+      __Pyx_DECREF(__pyx_t_10); __pyx_t_10 = 0;
+    }
+    __pyx_t_13 = __Pyx_PyInt_As_long(__pyx_t_1); if (unlikely((__pyx_t_13 == (long)-1) && PyErr_Occurred())) __PYX_ERR(0, 102, __pyx_L1_error)
+    __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+    __pyx_v_num_tokens = __pyx_t_13;
+
+    /* "fairseq/data/data_utils_fast.pyx":103
+ *         idx = indices_view[i]
+ *         num_tokens = num_tokens_fn(idx)
+ *         sample_lens.append(num_tokens)             # <<<<<<<<<<<<<<
+ *         sample_len = max(sample_len, num_tokens)
+ * 
+ */
+    __pyx_t_1 = __Pyx_PyInt_From_long(__pyx_v_num_tokens); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 103, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_1);
+    __pyx_t_14 = __Pyx_PyList_Append(__pyx_v_sample_lens, __pyx_t_1); if (unlikely(__pyx_t_14 == ((int)-1))) __PYX_ERR(0, 103, __pyx_L1_error)
+    __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+
+    /* "fairseq/data/data_utils_fast.pyx":104
+ *         num_tokens = num_tokens_fn(idx)
+ *         sample_lens.append(num_tokens)
+ *         sample_len = max(sample_len, num_tokens)             # <<<<<<<<<<<<<<
+ * 
+ *         shape_idx = _find_valid_shape(shapes_view, len(batch) + 1, sample_len)
+ */
+    __pyx_t_13 = __pyx_v_num_tokens;
+    __pyx_t_15 = __pyx_v_sample_len;
+    __pyx_t_17 = (__pyx_t_13 > __pyx_t_15);
+    if (__pyx_t_17) {
+      __pyx_t_16 = __pyx_t_13;
+    } else {
+      __pyx_t_16 = __pyx_t_15;
+    }
+    __pyx_v_sample_len = __pyx_t_16;
+
+    /* "fairseq/data/data_utils_fast.pyx":106
+ *         sample_len = max(sample_len, num_tokens)
+ * 
+ *         shape_idx = _find_valid_shape(shapes_view, len(batch) + 1, sample_len)             # <<<<<<<<<<<<<<
+ *         if shape_idx == -1:
+ *             batches.append(batch)
+ */
+    __pyx_t_18 = __Pyx_PyList_GET_SIZE(__pyx_v_batch); if (unlikely(__pyx_t_18 == ((Py_ssize_t)-1))) __PYX_ERR(0, 106, __pyx_L1_error)
+    __pyx_t_1 = __pyx_f_7fairseq_4data_15data_utils_fast__find_valid_shape(__pyx_v_shapes_view, (__pyx_t_18 + 1), __pyx_v_sample_len); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 106, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_1);
+    __Pyx_XDECREF_SET(__pyx_v_shape_idx, __pyx_t_1);
+    __pyx_t_1 = 0;
+
+    /* "fairseq/data/data_utils_fast.pyx":107
+ * 
+ *         shape_idx = _find_valid_shape(shapes_view, len(batch) + 1, sample_len)
+ *         if shape_idx == -1:             # <<<<<<<<<<<<<<
+ *             batches.append(batch)
+ *             batch = []
+ */
+    __pyx_t_17 = (__Pyx_PyInt_BoolEqObjC(__pyx_v_shape_idx, __pyx_int_neg_1, -1L, 0)); if (unlikely((__pyx_t_17 < 0))) __PYX_ERR(0, 107, __pyx_L1_error)
+    if (__pyx_t_17) {
+
+      /* "fairseq/data/data_utils_fast.pyx":108
+ *         shape_idx = _find_valid_shape(shapes_view, len(batch) + 1, sample_len)
+ *         if shape_idx == -1:
+ *             batches.append(batch)             # <<<<<<<<<<<<<<
+ *             batch = []
+ *             sample_lens = []
+ */
+      __pyx_t_14 = __Pyx_PyList_Append(__pyx_v_batches, __pyx_v_batch); if (unlikely(__pyx_t_14 == ((int)-1))) __PYX_ERR(0, 108, __pyx_L1_error)
+
+      /* "fairseq/data/data_utils_fast.pyx":109
+ *         if shape_idx == -1:
+ *             batches.append(batch)
+ *             batch = []             # <<<<<<<<<<<<<<
+ *             sample_lens = []
+ *             sample_len = 0
+ */
+      __pyx_t_1 = PyList_New(0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 109, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_1);
+      __Pyx_DECREF_SET(__pyx_v_batch, ((PyObject*)__pyx_t_1));
+      __pyx_t_1 = 0;
+
+      /* "fairseq/data/data_utils_fast.pyx":110
+ *             batches.append(batch)
+ *             batch = []
+ *             sample_lens = []             # <<<<<<<<<<<<<<
+ *             sample_len = 0
+ *             shapes_view = fixed_shapes_sorted
+ */
+      __pyx_t_1 = PyList_New(0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 110, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_1);
+      __Pyx_DECREF_SET(__pyx_v_sample_lens, ((PyObject*)__pyx_t_1));
+      __pyx_t_1 = 0;
+
+      /* "fairseq/data/data_utils_fast.pyx":111
+ *             batch = []
+ *             sample_lens = []
+ *             sample_len = 0             # <<<<<<<<<<<<<<
+ *             shapes_view = fixed_shapes_sorted
+ *         elif shape_idx > 0:
+ */
+      __pyx_v_sample_len = 0;
+
+      /* "fairseq/data/data_utils_fast.pyx":112
+ *             sample_lens = []
+ *             sample_len = 0
+ *             shapes_view = fixed_shapes_sorted             # <<<<<<<<<<<<<<
+ *         elif shape_idx > 0:
+ *             # small optimization for the next call to _find_valid_shape
+ */
+      __pyx_t_3 = __Pyx_PyObject_to_MemoryviewSlice_dsds_nn___pyx_t_7fairseq_4data_15data_utils_fast_DTYPE_t(((PyObject *)__pyx_v_fixed_shapes_sorted), PyBUF_WRITABLE); if (unlikely(!__pyx_t_3.memview)) __PYX_ERR(0, 112, __pyx_L1_error)
+      __PYX_XCLEAR_MEMVIEW(&__pyx_v_shapes_view, 1);
+      __pyx_v_shapes_view = __pyx_t_3;
+      __pyx_t_3.memview = NULL;
+      __pyx_t_3.data = NULL;
+
+      /* "fairseq/data/data_utils_fast.pyx":107
+ * 
+ *         shape_idx = _find_valid_shape(shapes_view, len(batch) + 1, sample_len)
+ *         if shape_idx == -1:             # <<<<<<<<<<<<<<
+ *             batches.append(batch)
+ *             batch = []
+ */
+      goto __pyx_L5;
+    }
+
+    /* "fairseq/data/data_utils_fast.pyx":113
+ *             sample_len = 0
+ *             shapes_view = fixed_shapes_sorted
+ *         elif shape_idx > 0:             # <<<<<<<<<<<<<<
+ *             # small optimization for the next call to _find_valid_shape
+ *             shapes_view = shapes_view[shape_idx:]
+ */
+    __pyx_t_1 = PyObject_RichCompare(__pyx_v_shape_idx, __pyx_int_0, Py_GT); __Pyx_XGOTREF(__pyx_t_1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 113, __pyx_L1_error)
+    __pyx_t_17 = __Pyx_PyObject_IsTrue(__pyx_t_1); if (unlikely((__pyx_t_17 < 0))) __PYX_ERR(0, 113, __pyx_L1_error)
+    __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+    if (__pyx_t_17) {
+
+      /* "fairseq/data/data_utils_fast.pyx":115
+ *         elif shape_idx > 0:
+ *             # small optimization for the next call to _find_valid_shape
+ *             shapes_view = shapes_view[shape_idx:]             # <<<<<<<<<<<<<<
+ * 
+ *         batch.append(idx)
+ */
+      __pyx_t_18 = __Pyx_PyIndex_AsSsize_t(__pyx_v_shape_idx); if (unlikely((__pyx_t_18 == (Py_ssize_t)-1) && PyErr_Occurred())) __PYX_ERR(0, 115, __pyx_L1_error)
+      __pyx_t_3.data = __pyx_v_shapes_view.data;
+      __pyx_t_3.memview = __pyx_v_shapes_view.memview;
+      __PYX_INC_MEMVIEW(&__pyx_t_3, 1);
+      __pyx_t_8 = -1;
+      if (unlikely(__pyx_memoryview_slice_memviewslice(
+    &__pyx_t_3,
+    __pyx_v_shapes_view.shape[0], __pyx_v_shapes_view.strides[0], __pyx_v_shapes_view.suboffsets[0],
+    0,
+    0,
+    &__pyx_t_8,
+    __pyx_t_18,
+    0,
+    0,
+    1,
+    0,
+    0,
+    1) < 0))
+{
+    __PYX_ERR(0, 115, __pyx_L1_error)
+}
+
+__pyx_t_3.shape[1] = __pyx_v_shapes_view.shape[1];
+__pyx_t_3.strides[1] = __pyx_v_shapes_view.strides[1];
+    __pyx_t_3.suboffsets[1] = -1;
+
+__PYX_XCLEAR_MEMVIEW(&__pyx_v_shapes_view, 1);
+      __pyx_v_shapes_view = __pyx_t_3;
+      __pyx_t_3.memview = NULL;
+      __pyx_t_3.data = NULL;
+
+      /* "fairseq/data/data_utils_fast.pyx":113
+ *             sample_len = 0
+ *             shapes_view = fixed_shapes_sorted
+ *         elif shape_idx > 0:             # <<<<<<<<<<<<<<
+ *             # small optimization for the next call to _find_valid_shape
+ *             shapes_view = shapes_view[shape_idx:]
+ */
+    }
+    __pyx_L5:;
+
+    /* "fairseq/data/data_utils_fast.pyx":117
+ *             shapes_view = shapes_view[shape_idx:]
+ * 
+ *         batch.append(idx)             # <<<<<<<<<<<<<<
+ * 
+ *     if len(batch) > 0:
+ */
+    __pyx_t_1 = __Pyx_PyInt_From_long(__pyx_v_idx); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 117, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_1);
+    __pyx_t_14 = __Pyx_PyList_Append(__pyx_v_batch, __pyx_t_1); if (unlikely(__pyx_t_14 == ((int)-1))) __PYX_ERR(0, 117, __pyx_L1_error)
+    __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+  }
+
+  /* "fairseq/data/data_utils_fast.pyx":119
+ *         batch.append(idx)
+ * 
+ *     if len(batch) > 0:             # <<<<<<<<<<<<<<
+ *         batches.append(batch)
+ * 
+ */
+  __pyx_t_4 = __Pyx_PyList_GET_SIZE(__pyx_v_batch); if (unlikely(__pyx_t_4 == ((Py_ssize_t)-1))) __PYX_ERR(0, 119, __pyx_L1_error)
+  __pyx_t_17 = (__pyx_t_4 > 0);
+  if (__pyx_t_17) {
+
+    /* "fairseq/data/data_utils_fast.pyx":120
+ * 
+ *     if len(batch) > 0:
+ *         batches.append(batch)             # <<<<<<<<<<<<<<
+ * 
+ *     return batches
+ */
+    __pyx_t_14 = __Pyx_PyList_Append(__pyx_v_batches, __pyx_v_batch); if (unlikely(__pyx_t_14 == ((int)-1))) __PYX_ERR(0, 120, __pyx_L1_error)
+
+    /* "fairseq/data/data_utils_fast.pyx":119
+ *         batch.append(idx)
+ * 
+ *     if len(batch) > 0:             # <<<<<<<<<<<<<<
+ *         batches.append(batch)
+ * 
+ */
+  }
+
+  /* "fairseq/data/data_utils_fast.pyx":122
+ *         batches.append(batch)
+ * 
+ *     return batches             # <<<<<<<<<<<<<<
+ */
+  __Pyx_XDECREF(__pyx_r);
+  __Pyx_INCREF(__pyx_v_batches);
+  __pyx_r = __pyx_v_batches;
+  goto __pyx_L0;
+
+  /* "fairseq/data/data_utils_fast.pyx":84
+ * 
+ * @cython.cdivision(True)
+ * cpdef list batch_fixed_shapes_fast(             # <<<<<<<<<<<<<<
+ *     np.ndarray[DTYPE_t, ndim=1] indices,
+ *     num_tokens_fn,
+ */
+
+  /* function exit code */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_1);
+  __PYX_XCLEAR_MEMVIEW(&__pyx_t_2, 1);
+  __PYX_XCLEAR_MEMVIEW(&__pyx_t_3, 1);
+  __Pyx_XDECREF(__pyx_t_9);
+  __Pyx_XDECREF(__pyx_t_10);
+  __Pyx_XDECREF(__pyx_t_11);
+  { PyObject *__pyx_type, *__pyx_value, *__pyx_tb;
+    __Pyx_PyThreadState_declare
+    __Pyx_PyThreadState_assign
+    __Pyx_ErrFetch(&__pyx_type, &__pyx_value, &__pyx_tb);
+    __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_fixed_shapes_sorted.rcbuffer->pybuffer);
+    __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_indices.rcbuffer->pybuffer);
+  __Pyx_ErrRestore(__pyx_type, __pyx_value, __pyx_tb);}
+  __Pyx_AddTraceback("fairseq.data.data_utils_fast.batch_fixed_shapes_fast", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = 0;
+  goto __pyx_L2;
+  __pyx_L0:;
+  __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_fixed_shapes_sorted.rcbuffer->pybuffer);
+  __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_indices.rcbuffer->pybuffer);
+  __pyx_L2:;
+  __Pyx_XDECREF(__pyx_v_sample_lens);
+  __Pyx_XDECREF(__pyx_v_batch);
+  __Pyx_XDECREF(__pyx_v_batches);
+  __PYX_XCLEAR_MEMVIEW(&__pyx_v_indices_view, 1);
+  __PYX_XCLEAR_MEMVIEW(&__pyx_v_shapes_view, 1);
+  __Pyx_XDECREF(__pyx_v_shape_idx);
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* Python wrapper */
+/* Forward declaration of the Python-callable entry point for
+ * batch_fixed_shapes_fast. Its parameter list is chosen at compile time:
+ * (args array, nargs, kwds) when CYTHON_METH_FASTCALL is set, otherwise
+ * (args, kwds). */
+static PyObject *__pyx_pw_7fairseq_4data_15data_utils_fast_3batch_fixed_shapes_fast(PyObject *__pyx_self, 
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+); /*proto*/
+/* Method-table entry exposing the wrapper under the Python name
+ * "batch_fixed_shapes_fast"; registered with keyword support and no
+ * docstring (last field is 0). */
+static PyMethodDef __pyx_mdef_7fairseq_4data_15data_utils_fast_3batch_fixed_shapes_fast = {"batch_fixed_shapes_fast", (PyCFunction)(void*)(__Pyx_PyCFunction_FastCallWithKeywords)__pyx_pw_7fairseq_4data_15data_utils_fast_3batch_fixed_shapes_fast, __Pyx_METH_FASTCALL|METH_KEYWORDS, 0};
+/* Python-level wrapper for batch_fixed_shapes_fast.
+ *
+ * Unpacks the three arguments (indices, num_tokens_fn, fixed_shapes_sorted),
+ * accepted positionally or by keyword, type-checks the two array arguments
+ * against the numpy ndarray type, and forwards them to the
+ * __pyx_pf_..._2batch_fixed_shapes_fast implementation.
+ *
+ * Returns whatever the implementation returns, or NULL when argument
+ * unpacking or type checking fails. */
+static PyObject *__pyx_pw_7fairseq_4data_15data_utils_fast_3batch_fixed_shapes_fast(PyObject *__pyx_self, 
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+) {
+  PyArrayObject *__pyx_v_indices = 0;
+  PyObject *__pyx_v_num_tokens_fn = 0;
+  PyArrayObject *__pyx_v_fixed_shapes_sorted = 0;
+  #if !CYTHON_METH_FASTCALL
+  CYTHON_UNUSED Py_ssize_t __pyx_nargs;
+  #endif
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  /* One slot per declared parameter, in declaration order. */
+  PyObject* values[3] = {0,0,0};
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("batch_fixed_shapes_fast (wrapper)", 0);
+  /* Without the fastcall convention the positional count has to be read
+   * from the argument tuple. */
+  #if !CYTHON_METH_FASTCALL
+  #if CYTHON_ASSUME_SAFE_MACROS
+  __pyx_nargs = PyTuple_GET_SIZE(__pyx_args);
+  #else
+  __pyx_nargs = PyTuple_Size(__pyx_args); if (unlikely(__pyx_nargs < 0)) return NULL;
+  #endif
+  #endif
+  __pyx_kwvalues = __Pyx_KwValues_FASTCALL(__pyx_args, __pyx_nargs);
+  {
+    PyObject **__pyx_pyargnames[] = {&__pyx_n_s_indices,&__pyx_n_s_num_tokens_fn,&__pyx_n_s_fixed_shapes_sorted,0};
+    if (__pyx_kwds) {
+      Py_ssize_t kw_args;
+      /* First pass: copy the positional arguments that were supplied; the
+       * fall-through fills slots from the highest index downwards. */
+      switch (__pyx_nargs) {
+        case  3: values[2] = __Pyx_Arg_FASTCALL(__pyx_args, 2);
+        CYTHON_FALLTHROUGH;
+        case  2: values[1] = __Pyx_Arg_FASTCALL(__pyx_args, 1);
+        CYTHON_FALLTHROUGH;
+        case  1: values[0] = __Pyx_Arg_FASTCALL(__pyx_args, 0);
+        CYTHON_FALLTHROUGH;
+        case  0: break;
+        default: goto __pyx_L5_argtuple_error;
+      }
+      kw_args = __Pyx_NumKwargs_FASTCALL(__pyx_kwds);
+      /* Second pass: starting at the first slot not filled positionally,
+       * look each remaining parameter up by keyword. All three are
+       * required, so a missing keyword is an error. */
+      switch (__pyx_nargs) {
+        case  0:
+        if (likely((values[0] = __Pyx_GetKwValue_FASTCALL(__pyx_kwds, __pyx_kwvalues, __pyx_n_s_indices)) != 0)) {
+          (void)__Pyx_Arg_NewRef_FASTCALL(values[0]);
+          kw_args--;
+        }
+        else if (unlikely(PyErr_Occurred())) __PYX_ERR(0, 84, __pyx_L3_error)
+        else goto __pyx_L5_argtuple_error;
+        CYTHON_FALLTHROUGH;
+        case  1:
+        if (likely((values[1] = __Pyx_GetKwValue_FASTCALL(__pyx_kwds, __pyx_kwvalues, __pyx_n_s_num_tokens_fn)) != 0)) {
+          (void)__Pyx_Arg_NewRef_FASTCALL(values[1]);
+          kw_args--;
+        }
+        else if (unlikely(PyErr_Occurred())) __PYX_ERR(0, 84, __pyx_L3_error)
+        else {
+          __Pyx_RaiseArgtupleInvalid("batch_fixed_shapes_fast", 1, 3, 3, 1); __PYX_ERR(0, 84, __pyx_L3_error)
+        }
+        CYTHON_FALLTHROUGH;
+        case  2:
+        if (likely((values[2] = __Pyx_GetKwValue_FASTCALL(__pyx_kwds, __pyx_kwvalues, __pyx_n_s_fixed_shapes_sorted)) != 0)) {
+          (void)__Pyx_Arg_NewRef_FASTCALL(values[2]);
+          kw_args--;
+        }
+        else if (unlikely(PyErr_Occurred())) __PYX_ERR(0, 84, __pyx_L3_error)
+        else {
+          __Pyx_RaiseArgtupleInvalid("batch_fixed_shapes_fast", 1, 3, 3, 2); __PYX_ERR(0, 84, __pyx_L3_error)
+        }
+      }
+      /* Keywords left over after the named lookups are handed to the
+       * generic keyword parser (presumably to report unexpected or
+       * duplicate names -- confirm against the helper). */
+      if (unlikely(kw_args > 0)) {
+        const Py_ssize_t kwd_pos_args = __pyx_nargs;
+        if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_kwvalues, __pyx_pyargnames, 0, values + 0, kwd_pos_args, "batch_fixed_shapes_fast") < 0)) __PYX_ERR(0, 84, __pyx_L3_error)
+      }
+    } else if (unlikely(__pyx_nargs != 3)) {
+      /* No keywords at all: exactly three positional arguments are needed. */
+      goto __pyx_L5_argtuple_error;
+    } else {
+      values[0] = __Pyx_Arg_FASTCALL(__pyx_args, 0);
+      values[1] = __Pyx_Arg_FASTCALL(__pyx_args, 1);
+      values[2] = __Pyx_Arg_FASTCALL(__pyx_args, 2);
+    }
+    __pyx_v_indices = ((PyArrayObject *)values[0]);
+    __pyx_v_num_tokens_fn = values[1];
+    __pyx_v_fixed_shapes_sorted = ((PyArrayObject *)values[2]);
+  }
+  goto __pyx_L6_skip;
+  __pyx_L5_argtuple_error:;
+  __Pyx_RaiseArgtupleInvalid("batch_fixed_shapes_fast", 1, 3, 3, __pyx_nargs); __PYX_ERR(0, 84, __pyx_L3_error)
+  __pyx_L6_skip:;
+  goto __pyx_L4_argument_unpacking_done;
+  /* Unpacking failed: drop whatever was collected in values[], record the
+   * traceback entry and report the error to the caller. */
+  __pyx_L3_error:;
+  {
+    Py_ssize_t __pyx_temp;
+    for (__pyx_temp=0; __pyx_temp < (Py_ssize_t)(sizeof(values)/sizeof(values[0])); ++__pyx_temp) {
+      __Pyx_Arg_XDECREF_FASTCALL(values[__pyx_temp]);
+    }
+  }
+  __Pyx_AddTraceback("fairseq.data.data_utils_fast.batch_fixed_shapes_fast", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __Pyx_RefNannyFinishContext();
+  return NULL;
+  __pyx_L4_argument_unpacking_done:;
+  /* Both array parameters must pass the ndarray type test before the
+   * implementation is entered. */
+  if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_indices), __pyx_ptype_5numpy_ndarray, 1, "indices", 0))) __PYX_ERR(0, 85, __pyx_L1_error)
+  if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_fixed_shapes_sorted), __pyx_ptype_5numpy_ndarray, 1, "fixed_shapes_sorted", 0))) __PYX_ERR(0, 87, __pyx_L1_error)
+  __pyx_r = __pyx_pf_7fairseq_4data_15data_utils_fast_2batch_fixed_shapes_fast(__pyx_self, __pyx_v_indices, __pyx_v_num_tokens_fn, __pyx_v_fixed_shapes_sorted);
+
+  /* function exit code */
+  goto __pyx_L0;
+  __pyx_L1_error:;
+  __pyx_r = NULL;
+  __pyx_L0:;
+  /* Shared exit for success and type-check failure: release values[]. */
+  {
+    Py_ssize_t __pyx_temp;
+    for (__pyx_temp=0; __pyx_temp < (Py_ssize_t)(sizeof(values)/sizeof(values[0])); ++__pyx_temp) {
+      __Pyx_Arg_XDECREF_FASTCALL(values[__pyx_temp]);
+    }
+  }
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* Implementation half of the Python entry point for batch_fixed_shapes_fast.
+ *
+ * Acquires and validates buffers on both array arguments (one dimension for
+ * indices, two for fixed_shapes_sorted), calls the C-level
+ * __pyx_f_..._batch_fixed_shapes_fast function, and releases the buffers on
+ * every exit path.
+ *
+ * Returns the object produced by the C-level function, or NULL after adding
+ * a traceback entry when buffer acquisition or the call fails. */
+static PyObject *__pyx_pf_7fairseq_4data_15data_utils_fast_2batch_fixed_shapes_fast(CYTHON_UNUSED PyObject *__pyx_self, PyArrayObject *__pyx_v_indices, PyObject *__pyx_v_num_tokens_fn, PyArrayObject *__pyx_v_fixed_shapes_sorted) {
+  __Pyx_LocalBuf_ND __pyx_pybuffernd_fixed_shapes_sorted;
+  __Pyx_Buffer __pyx_pybuffer_fixed_shapes_sorted;
+  __Pyx_LocalBuf_ND __pyx_pybuffernd_indices;
+  __Pyx_Buffer __pyx_pybuffer_indices;
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("batch_fixed_shapes_fast", 1);
+  /* Put both buffer records into a known empty state before anything can
+   * jump to the error path, which releases them unconditionally. */
+  __pyx_pybuffer_indices.pybuffer.buf = NULL;
+  __pyx_pybuffer_indices.refcount = 0;
+  __pyx_pybuffernd_indices.data = NULL;
+  __pyx_pybuffernd_indices.rcbuffer = &__pyx_pybuffer_indices;
+  __pyx_pybuffer_fixed_shapes_sorted.pybuffer.buf = NULL;
+  __pyx_pybuffer_fixed_shapes_sorted.refcount = 0;
+  __pyx_pybuffernd_fixed_shapes_sorted.data = NULL;
+  __pyx_pybuffernd_fixed_shapes_sorted.rcbuffer = &__pyx_pybuffer_fixed_shapes_sorted;
+  /* Acquire the indices buffer and cache its stride and extent. */
+  {
+    __Pyx_BufFmt_StackElem __pyx_stack[1];
+    if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_indices.rcbuffer->pybuffer, (PyObject*)__pyx_v_indices, &__Pyx_TypeInfo_nn___pyx_t_7fairseq_4data_15data_utils_fast_DTYPE_t, PyBUF_FORMAT| PyBUF_STRIDES, 1, 0, __pyx_stack) == -1)) __PYX_ERR(0, 84, __pyx_L1_error)
+  }
+  __pyx_pybuffernd_indices.diminfo[0].strides = __pyx_pybuffernd_indices.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_indices.diminfo[0].shape = __pyx_pybuffernd_indices.rcbuffer->pybuffer.shape[0];
+  /* Acquire the fixed_shapes_sorted buffer and cache strides and extents
+   * for both of its dimensions. */
+  {
+    __Pyx_BufFmt_StackElem __pyx_stack[1];
+    if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_fixed_shapes_sorted.rcbuffer->pybuffer, (PyObject*)__pyx_v_fixed_shapes_sorted, &__Pyx_TypeInfo_nn___pyx_t_7fairseq_4data_15data_utils_fast_DTYPE_t, PyBUF_FORMAT| PyBUF_STRIDES, 2, 0, __pyx_stack) == -1)) __PYX_ERR(0, 84, __pyx_L1_error)
+  }
+  __pyx_pybuffernd_fixed_shapes_sorted.diminfo[0].strides = __pyx_pybuffernd_fixed_shapes_sorted.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_fixed_shapes_sorted.diminfo[0].shape = __pyx_pybuffernd_fixed_shapes_sorted.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_fixed_shapes_sorted.diminfo[1].strides = __pyx_pybuffernd_fixed_shapes_sorted.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_fixed_shapes_sorted.diminfo[1].shape = __pyx_pybuffernd_fixed_shapes_sorted.rcbuffer->pybuffer.shape[1];
+  __Pyx_XDECREF(__pyx_r);
+  /* Delegate to the C-level function; the trailing 0 is presumably the
+   * skip-dispatch flag of a cpdef function -- confirm against its
+   * definition. */
+  __pyx_t_1 = __pyx_f_7fairseq_4data_15data_utils_fast_batch_fixed_shapes_fast(__pyx_v_indices, __pyx_v_num_tokens_fn, __pyx_v_fixed_shapes_sorted, 0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 84, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_r = __pyx_t_1;
+  __pyx_t_1 = 0;
+  goto __pyx_L0;
+
+  /* function exit code */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_1);
+  /* The pending exception is stashed while the buffers are released and
+   * restored afterwards, so the release cannot disturb it. */
+  { PyObject *__pyx_type, *__pyx_value, *__pyx_tb;
+    __Pyx_PyThreadState_declare
+    __Pyx_PyThreadState_assign
+    __Pyx_ErrFetch(&__pyx_type, &__pyx_value, &__pyx_tb);
+    __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_fixed_shapes_sorted.rcbuffer->pybuffer);
+    __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_indices.rcbuffer->pybuffer);
+  __Pyx_ErrRestore(__pyx_type, __pyx_value, __pyx_tb);}
+  __Pyx_AddTraceback("fairseq.data.data_utils_fast.batch_fixed_shapes_fast", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  goto __pyx_L2;
+  /* Success path: release both buffers before returning the result. */
+  __pyx_L0:;
+  __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_fixed_shapes_sorted.rcbuffer->pybuffer);
+  __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_indices.rcbuffer->pybuffer);
+  __pyx_L2:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+static struct __pyx_vtabstruct_array __pyx_vtable_array;
+
+/* tp_new slot for the array type.
+ *
+ * Allocates the instance (through the type's tp_alloc, or through the base
+ * object's tp_new when the type is flagged abstract), installs the vtable
+ * pointer, initialises the `mode` and `_format` fields to None, and then
+ * runs __cinit__ with the constructor arguments `a` / `k`.
+ *
+ * Returns the new object, or NULL if allocation or __cinit__ fails. */
+static PyObject *__pyx_tp_new_array(PyTypeObject *t, PyObject *a, PyObject *k) {
+  struct __pyx_array_obj *p;
+  PyObject *o;
+  #if CYTHON_COMPILING_IN_LIMITED_API
+  allocfunc alloc_func = (allocfunc)PyType_GetSlot(t, Py_tp_alloc);
+  o = alloc_func(t, 0);
+  #else
+  if (likely(!__Pyx_PyType_HasFeature(t, Py_TPFLAGS_IS_ABSTRACT))) {
+    o = (*t->tp_alloc)(t, 0);
+  } else {
+    o = (PyObject *) PyBaseObject_Type.tp_new(t, __pyx_empty_tuple, 0);
+  }
+  #endif
+  /* Allocation can fail in either build configuration, so the NULL check
+   * has to sit outside the conditional block; otherwise the limited-API
+   * branch would dereference a NULL pointer just below. */
+  if (unlikely(!o)) return 0;
+  p = ((struct __pyx_array_obj *)o);
+  p->__pyx_vtab = __pyx_vtabptr_array;
+  p->mode = ((PyObject*)Py_None); Py_INCREF(Py_None);
+  p->_format = ((PyObject*)Py_None); Py_INCREF(Py_None);
+  if (unlikely(__pyx_array___cinit__(o, a, k) < 0)) goto bad;
+  return o;
+  bad:
+  /* __cinit__ failed: drop the half-constructed object. */
+  Py_DECREF(o); o = 0;
+  return NULL;
+}
+
+/* tp_dealloc slot for the array type: runs the finalizer (when enabled),
+ * then the type's __dealloc__, clears the two object fields and frees the
+ * instance memory. */
+static void __pyx_tp_dealloc_array(PyObject *o) {
+  struct __pyx_array_obj *p = (struct __pyx_array_obj *)o;
+  #if CYTHON_USE_TP_FINALIZE
+  /* Only call the finalizer when the type has one, the object has not been
+   * finalized already, and this function is the type's own tp_dealloc. */
+  if (unlikely((PY_VERSION_HEX >= 0x03080000 || __Pyx_PyType_HasFeature(Py_TYPE(o), Py_TPFLAGS_HAVE_FINALIZE)) && __Pyx_PyObject_GetSlot(o, tp_finalize, destructor)) && (!PyType_IS_GC(Py_TYPE(o)) || !__Pyx_PyObject_GC_IsFinalized(o))) {
+    if (__Pyx_PyObject_GetSlot(o, tp_dealloc, destructor) == __pyx_tp_dealloc_array) {
+      /* A non-zero result aborts deallocation (presumably because the
+       * finalizer resurrected the object -- see the CPython docs). */
+      if (PyObject_CallFinalizerFromDealloc(o)) return;
+    }
+  }
+  #endif
+  {
+    /* Preserve any pending exception across __dealloc__, and hold an extra
+     * reference for the duration of the call (presumably so code run from
+     * __dealloc__ cannot trigger a second deallocation). */
+    PyObject *etype, *eval, *etb;
+    PyErr_Fetch(&etype, &eval, &etb);
+    __Pyx_SET_REFCNT(o, Py_REFCNT(o) + 1);
+    __pyx_array___dealloc__(o);
+    __Pyx_SET_REFCNT(o, Py_REFCNT(o) - 1);
+    PyErr_Restore(etype, eval, etb);
+  }
+  Py_CLEAR(p->mode);
+  Py_CLEAR(p->_format);
+  /* Release the instance memory through the type's tp_free. */
+  #if CYTHON_USE_TYPE_SLOTS || CYTHON_COMPILING_IN_PYPY
+  (*Py_TYPE(o)->tp_free)(o);
+  #else
+  {
+    freefunc tp_free = (freefunc)PyType_GetSlot(Py_TYPE(o), Py_tp_free);
+    if (tp_free) tp_free(o);
+  }
+  #endif
+}
+/* sq_item slot: integer indexing routed through the mapping protocol. The C
+ * index is boxed as a Python integer and passed to the type's mp_subscript.
+ * Returns the subscript result, or 0 if the index object cannot be created. */
+static PyObject *__pyx_sq_item_array(PyObject *o, Py_ssize_t i) {
+  PyObject *item;
+  PyObject *index_obj = PyInt_FromSsize_t(i);
+  if (!index_obj) {
+    return 0;
+  }
+  item = Py_TYPE(o)->tp_as_mapping->mp_subscript(o, index_obj);
+  /* The boxed index is only needed for the duration of the lookup. */
+  Py_DECREF(index_obj);
+  return item;
+}
+
+/* mp_ass_subscript slot: item assignment is forwarded to __setitem__; item
+ * deletion (v == NULL) is rejected with NotImplementedError and -1. */
+static int __pyx_mp_ass_subscript_array(PyObject *o, PyObject *i, PyObject *v) {
+  if (!v) {
+    /* Deletion request: report the unsupported operation by type name. */
+    __Pyx_TypeName type_name = __Pyx_PyType_GetName(Py_TYPE(o));
+    PyErr_Format(PyExc_NotImplementedError,
+      "Subscript deletion not supported by " __Pyx_FMT_TYPENAME, type_name);
+    __Pyx_DECREF_TypeName(type_name);
+    return -1;
+  }
+  return __pyx_array___setitem__(o, i, v);
+}
+
+/* tp_getattro slot: try generic attribute lookup first and fall back to the
+ * type's __getattr__ only when that lookup failed with AttributeError. */
+static PyObject *__pyx_tp_getattro_array(PyObject *o, PyObject *n) {
+  PyObject *attr = __Pyx_PyObject_GenericGetAttr(o, n);
+  if (attr) {
+    return attr;
+  }
+  if (!PyErr_ExceptionMatches(PyExc_AttributeError)) {
+    /* Some other error is pending: propagate the failed lookup unchanged. */
+    return attr;
+  }
+  PyErr_Clear();
+  return __pyx_array___getattr__(o, n);
+}
+
+/* Getter for the `memview` property: forwards to the property's __get__
+ * wrapper; the closure pointer `x` is not used. */
+static PyObject *__pyx_getprop___pyx_array_memview(PyObject *o, CYTHON_UNUSED void *x) {
+  PyObject *memview = __pyx_pw_15View_dot_MemoryView_5array_7memview_1__get__(o);
+  return memview;
+}
+
+/* Method table for the array type: __getattr__ (single-argument, registered
+ * with METH_COEXIST) plus the __reduce_cython__ / __setstate_cython__
+ * wrappers. The all-zero entry terminates the table. */
+static PyMethodDef __pyx_methods_array[] = {
+  {"__getattr__", (PyCFunction)__pyx_array___getattr__, METH_O|METH_COEXIST, 0},
+  {"__reduce_cython__", (PyCFunction)(void*)(__Pyx_PyCFunction_FastCallWithKeywords)__pyx_pw___pyx_array_1__reduce_cython__, __Pyx_METH_FASTCALL|METH_KEYWORDS, 0},
+  {"__setstate_cython__", (PyCFunction)(void*)(__Pyx_PyCFunction_FastCallWithKeywords)__pyx_pw___pyx_array_3__setstate_cython__, __Pyx_METH_FASTCALL|METH_KEYWORDS, 0},
+  {0, 0, 0, 0}
+};
+
+/* Property table for the array type: a single `memview` attribute with a
+ * getter and no setter, followed by the all-zero terminator. */
+static struct PyGetSetDef __pyx_getsets_array[] = {
+  {(char *)"memview", __pyx_getprop___pyx_array_memview, 0, (char *)0, 0},
+  {0, 0, 0, 0, 0}
+};
+#if CYTHON_USE_TYPE_SPECS
+#if !CYTHON_COMPILING_IN_LIMITED_API
+
+/* Buffer-protocol slots for the array type (type-spec build): only
+ * bf_getbuffer is provided and bf_releasebuffer is left unset. The leading
+ * zero entries are compiled in only when PY_MAJOR_VERSION < 3. */
+static PyBufferProcs __pyx_tp_as_buffer_array = {
+  #if PY_MAJOR_VERSION < 3
+  0, /*bf_getreadbuffer*/
+  #endif
+  #if PY_MAJOR_VERSION < 3
+  0, /*bf_getwritebuffer*/
+  #endif
+  #if PY_MAJOR_VERSION < 3
+  0, /*bf_getsegcount*/
+  #endif
+  #if PY_MAJOR_VERSION < 3
+  0, /*bf_getcharbuffer*/
+  #endif
+  __pyx_array_getbuffer, /*bf_getbuffer*/
+  0, /*bf_releasebuffer*/
+};
+#endif
+/* Slot list used when the array type is created from a PyType_Spec. The same
+ * __len__ and item handlers back both the sequence and mapping protocols;
+ * the getbuffer slot is included only when Py_bf_getbuffer is defined. The
+ * {0, 0} entry terminates the list. */
+static PyType_Slot __pyx_type___pyx_array_slots[] = {
+  {Py_tp_dealloc, (void *)__pyx_tp_dealloc_array},
+  {Py_sq_length, (void *)__pyx_array___len__},
+  {Py_sq_item, (void *)__pyx_sq_item_array},
+  {Py_mp_length, (void *)__pyx_array___len__},
+  {Py_mp_subscript, (void *)__pyx_array___getitem__},
+  {Py_mp_ass_subscript, (void *)__pyx_mp_ass_subscript_array},
+  {Py_tp_getattro, (void *)__pyx_tp_getattro_array},
+  #if defined(Py_bf_getbuffer)
+  {Py_bf_getbuffer, (void *)__pyx_array_getbuffer},
+  #endif
+  {Py_tp_methods, (void *)__pyx_methods_array},
+  {Py_tp_getset, (void *)__pyx_getsets_array},
+  {Py_tp_new, (void *)__pyx_tp_new_array},
+  {0, 0},
+};
+/* Type specification for the array type: qualified name, instance size
+ * (struct __pyx_array_obj), item size 0, type flags, and the slot list
+ * defined above. */
+static PyType_Spec __pyx_type___pyx_array_spec = {
+  "fairseq.data.data_utils_fast.array",
+  sizeof(struct __pyx_array_obj),
+  0,
+  Py_TPFLAGS_DEFAULT|Py_TPFLAGS_HAVE_VERSION_TAG|Py_TPFLAGS_CHECKTYPES|Py_TPFLAGS_HAVE_NEWBUFFER|Py_TPFLAGS_BASETYPE|Py_TPFLAGS_SEQUENCE,
+  __pyx_type___pyx_array_slots,
+};
+#else
+
+/* Sequence-protocol slots for the statically defined array type: only
+ * length and integer item access are implemented; every other slot is 0. */
+static PySequenceMethods __pyx_tp_as_sequence_array = {
+  __pyx_array___len__, /*sq_length*/
+  0, /*sq_concat*/
+  0, /*sq_repeat*/
+  __pyx_sq_item_array, /*sq_item*/
+  0, /*sq_slice*/
+  0, /*sq_ass_item*/
+  0, /*sq_ass_slice*/
+  0, /*sq_contains*/
+  0, /*sq_inplace_concat*/
+  0, /*sq_inplace_repeat*/
+};
+
+/* Mapping-protocol slots for the statically defined array type: length,
+ * subscript read (__getitem__) and subscript assignment. */
+static PyMappingMethods __pyx_tp_as_mapping_array = {
+  __pyx_array___len__, /*mp_length*/
+  __pyx_array___getitem__, /*mp_subscript*/
+  __pyx_mp_ass_subscript_array, /*mp_ass_subscript*/
+};
+
+/* Buffer-protocol slots for the statically defined array type: only
+ * bf_getbuffer is provided and bf_releasebuffer is left unset. The leading
+ * zero entries are compiled in only when PY_MAJOR_VERSION < 3. */
+static PyBufferProcs __pyx_tp_as_buffer_array = {
+  #if PY_MAJOR_VERSION < 3
+  0, /*bf_getreadbuffer*/
+  #endif
+  #if PY_MAJOR_VERSION < 3
+  0, /*bf_getwritebuffer*/
+  #endif
+  #if PY_MAJOR_VERSION < 3
+  0, /*bf_getsegcount*/
+  #endif
+  #if PY_MAJOR_VERSION < 3
+  0, /*bf_getcharbuffer*/
+  #endif
+  __pyx_array_getbuffer, /*bf_getbuffer*/
+  0, /*bf_releasebuffer*/
+};
+
+/* Statically initialized type object for `array`, compiled in the #else branch
+ * that is the alternative to the PyType_Spec definition.
+ * The initializer is positional: every entry must stay in the order of the
+ * PyTypeObject fields, and the preprocessor guards add or drop fields to match
+ * the interpreter version being compiled against. */
+static PyTypeObject __pyx_type___pyx_array = {
+  PyVarObject_HEAD_INIT(0, 0)
+  "fairseq.data.data_utils_fast.""array", /*tp_name*/
+  sizeof(struct __pyx_array_obj), /*tp_basicsize*/
+  0, /*tp_itemsize*/
+  __pyx_tp_dealloc_array, /*tp_dealloc*/
+  #if PY_VERSION_HEX < 0x030800b4
+  0, /*tp_print*/
+  #endif
+  #if PY_VERSION_HEX >= 0x030800b4
+  0, /*tp_vectorcall_offset*/
+  #endif
+  0, /*tp_getattr*/
+  0, /*tp_setattr*/
+  #if PY_MAJOR_VERSION < 3
+  0, /*tp_compare*/
+  #endif
+  #if PY_MAJOR_VERSION >= 3
+  0, /*tp_as_async*/
+  #endif
+  0, /*tp_repr*/
+  0, /*tp_as_number*/
+  &__pyx_tp_as_sequence_array, /*tp_as_sequence*/
+  &__pyx_tp_as_mapping_array, /*tp_as_mapping*/
+  0, /*tp_hash*/
+  0, /*tp_call*/
+  0, /*tp_str*/
+  __pyx_tp_getattro_array, /*tp_getattro*/
+  0, /*tp_setattro*/
+  &__pyx_tp_as_buffer_array, /*tp_as_buffer*/
+  Py_TPFLAGS_DEFAULT|Py_TPFLAGS_HAVE_VERSION_TAG|Py_TPFLAGS_CHECKTYPES|Py_TPFLAGS_HAVE_NEWBUFFER|Py_TPFLAGS_BASETYPE|Py_TPFLAGS_SEQUENCE, /*tp_flags*/
+  0, /*tp_doc*/
+  0, /*tp_traverse*/
+  0, /*tp_clear*/
+  0, /*tp_richcompare*/
+  0, /*tp_weaklistoffset*/
+  0, /*tp_iter*/
+  0, /*tp_iternext*/
+  __pyx_methods_array, /*tp_methods*/
+  0, /*tp_members*/
+  __pyx_getsets_array, /*tp_getset*/
+  0, /*tp_base*/
+  0, /*tp_dict*/
+  0, /*tp_descr_get*/
+  0, /*tp_descr_set*/
+  #if !CYTHON_USE_TYPE_SPECS
+  0, /*tp_dictoffset*/
+  #endif
+  0, /*tp_init*/
+  0, /*tp_alloc*/
+  __pyx_tp_new_array, /*tp_new*/
+  0, /*tp_free*/
+  0, /*tp_is_gc*/
+  0, /*tp_bases*/
+  0, /*tp_mro*/
+  0, /*tp_cache*/
+  0, /*tp_subclasses*/
+  0, /*tp_weaklist*/
+  0, /*tp_del*/
+  0, /*tp_version_tag*/
+  #if PY_VERSION_HEX >= 0x030400a1
+  #if CYTHON_USE_TP_FINALIZE
+  0, /*tp_finalize*/
+  #else
+  NULL, /*tp_finalize*/
+  #endif
+  #endif
+  #if PY_VERSION_HEX >= 0x030800b1 && (!CYTHON_COMPILING_IN_PYPY || PYPY_VERSION_NUM >= 0x07030800)
+  0, /*tp_vectorcall*/
+  #endif
+  #if __PYX_NEED_TP_PRINT_SLOT == 1
+  0, /*tp_print*/
+  #endif
+  #if PY_VERSION_HEX >= 0x030C0000
+  0, /*tp_watched*/
+  #endif
+  #if PY_VERSION_HEX >= 0x030d00A4
+  0, /*tp_versions_used*/
+  #endif
+  #if CYTHON_COMPILING_IN_PYPY && PY_VERSION_HEX >= 0x03090000 && PY_VERSION_HEX < 0x030a0000
+  0, /*tp_pypy_flags*/
+  #endif
+};
+#endif
+
+/* tp_new for the Enum extension type.
+ *
+ * Allocates an instance of `t` and sets its `name` field to an owned reference
+ * to None. The argument tuple `a` and keyword dict `k` are not used.
+ *
+ * Returns a new reference, or 0 (NULL) when allocation fails.
+ */
+static PyObject *__pyx_tp_new_Enum(PyTypeObject *t, CYTHON_UNUSED PyObject *a, CYTHON_UNUSED PyObject *k) {
+  struct __pyx_MemviewEnum_obj *p;
+  PyObject *o;
+  #if CYTHON_COMPILING_IN_LIMITED_API
+  /* Struct fields are not accessible here, so fetch tp_alloc through the slot API. */
+  allocfunc alloc_func = (allocfunc)PyType_GetSlot(t, Py_tp_alloc);
+  o = alloc_func(t, 0);
+  #else
+  if (likely(!__Pyx_PyType_HasFeature(t, Py_TPFLAGS_IS_ABSTRACT))) {
+    o = (*t->tp_alloc)(t, 0);
+  } else {
+    /* Abstract types are routed through object.__new__ instead of tp_alloc. */
+    o = (PyObject *) PyBaseObject_Type.tp_new(t, __pyx_empty_tuple, 0);
+  }
+  #endif
+  /* The failure check must cover both allocation paths: previously it lived
+   * inside the #else branch, leaving the limited-API path to dereference NULL. */
+  if (unlikely(!o)) return 0;
+  p = ((struct __pyx_MemviewEnum_obj *)o);
+  p->name = Py_None; Py_INCREF(Py_None);
+  return o;
+}
+
+/* tp_dealloc for the Enum type.
+ * Order: optional finalizer call, GC untrack, release of `name`, then tp_free. */
+static void __pyx_tp_dealloc_Enum(PyObject *o) {
+  struct __pyx_MemviewEnum_obj *p = (struct __pyx_MemviewEnum_obj *)o;
+  #if CYTHON_USE_TP_FINALIZE
+  /* Run tp_finalize first when one is set, the object has not been finalized
+   * yet, and the type's tp_dealloc is this very function; stop here if the
+   * finalizer call returns non-zero. */
+  if (unlikely((PY_VERSION_HEX >= 0x03080000 || __Pyx_PyType_HasFeature(Py_TYPE(o), Py_TPFLAGS_HAVE_FINALIZE)) && __Pyx_PyObject_GetSlot(o, tp_finalize, destructor)) && !__Pyx_PyObject_GC_IsFinalized(o)) {
+    if (__Pyx_PyObject_GetSlot(o, tp_dealloc, destructor) == __pyx_tp_dealloc_Enum) {
+      if (PyObject_CallFinalizerFromDealloc(o)) return;
+    }
+  }
+  #endif
+  PyObject_GC_UnTrack(o);
+  Py_CLEAR(p->name);
+  #if CYTHON_USE_TYPE_SLOTS || CYTHON_COMPILING_IN_PYPY
+  (*Py_TYPE(o)->tp_free)(o);
+  #else
+  {
+    /* Without direct slot access, look tp_free up through PyType_GetSlot. */
+    freefunc tp_free = (freefunc)PyType_GetSlot(Py_TYPE(o), Py_tp_free);
+    if (tp_free) tp_free(o);
+  }
+  #endif
+}
+
+/* tp_traverse for Enum: reports the `name` member to the visitor when it is
+ * set and returns the visitor's result; returns 0 when there is nothing to visit. */
+static int __pyx_tp_traverse_Enum(PyObject *o, visitproc v, void *a) {
+  struct __pyx_MemviewEnum_obj *self = (struct __pyx_MemviewEnum_obj *)o;
+  if (!self->name) {
+    return 0;
+  }
+  /* A zero result and "continue to return 0" are the same thing here. */
+  return (*v)(self->name, a);
+}
+
+/* tp_clear for Enum: swaps `name` for an owned reference to None and then
+ * drops the reference that was held before. Always returns 0. */
+static int __pyx_tp_clear_Enum(PyObject *o) {
+  struct __pyx_MemviewEnum_obj *self = (struct __pyx_MemviewEnum_obj *)o;
+  PyObject *previous = ((PyObject*)self->name);
+  Py_INCREF(Py_None);
+  self->name = Py_None;
+  /* Release last, once the field already points at a valid object. */
+  Py_XDECREF(previous);
+  return 0;
+}
+
+/* Method-table adapter for `__repr__`: ignores the extra argument and
+ * forwards to the Enum repr implementation. */
+static PyObject *__pyx_specialmethod___pyx_MemviewEnum___repr__(PyObject *self, CYTHON_UNUSED PyObject *arg) {
+  return __pyx_MemviewEnum___repr__(self);
+}
+
+/* Method table for Enum: `__repr__` (no arguments, METH_COEXIST) plus the
+ * generated `__reduce_cython__` / `__setstate_cython__` entries, which use the
+ * fastcall-with-keywords calling convention. The all-zero entry ends the table. */
+static PyMethodDef __pyx_methods_Enum[] = {
+  {"__repr__", (PyCFunction)__pyx_specialmethod___pyx_MemviewEnum___repr__, METH_NOARGS|METH_COEXIST, 0},
+  {"__reduce_cython__", (PyCFunction)(void*)(__Pyx_PyCFunction_FastCallWithKeywords)__pyx_pw___pyx_MemviewEnum_1__reduce_cython__, __Pyx_METH_FASTCALL|METH_KEYWORDS, 0},
+  {"__setstate_cython__", (PyCFunction)(void*)(__Pyx_PyCFunction_FastCallWithKeywords)__pyx_pw___pyx_MemviewEnum_3__setstate_cython__, __Pyx_METH_FASTCALL|METH_KEYWORDS, 0},
+  {0, 0, 0, 0}
+};
+#if CYTHON_USE_TYPE_SPECS
+/* Slot table for the Enum type in its PyType_Spec form: dealloc, repr, GC
+ * traverse/clear, methods, init and new. {0, 0} ends the table. */
+static PyType_Slot __pyx_type___pyx_MemviewEnum_slots[] = {
+  {Py_tp_dealloc, (void *)__pyx_tp_dealloc_Enum},
+  {Py_tp_repr, (void *)__pyx_MemviewEnum___repr__},
+  {Py_tp_traverse, (void *)__pyx_tp_traverse_Enum},
+  {Py_tp_clear, (void *)__pyx_tp_clear_Enum},
+  {Py_tp_methods, (void *)__pyx_methods_Enum},
+  {Py_tp_init, (void *)__pyx_MemviewEnum___init__},
+  {Py_tp_new, (void *)__pyx_tp_new_Enum},
+  {0, 0},
+};
+/* PyType_Spec describing the Enum type: qualified name, instance size, item
+ * size (0), type flags (including Py_TPFLAGS_HAVE_GC), and its slot table. */
+static PyType_Spec __pyx_type___pyx_MemviewEnum_spec = {
+  "fairseq.data.data_utils_fast.Enum",
+  sizeof(struct __pyx_MemviewEnum_obj),
+  0,
+  Py_TPFLAGS_DEFAULT|Py_TPFLAGS_HAVE_VERSION_TAG|Py_TPFLAGS_CHECKTYPES|Py_TPFLAGS_HAVE_NEWBUFFER|Py_TPFLAGS_BASETYPE|Py_TPFLAGS_HAVE_GC,
+  __pyx_type___pyx_MemviewEnum_slots,
+};
+#else
+
+/* Statically initialized type object for Enum, compiled when
+ * CYTHON_USE_TYPE_SPECS is off (the #else branch of the spec definition).
+ * The initializer is positional: entries must stay in PyTypeObject field order,
+ * and the preprocessor guards add or drop fields per interpreter version. */
+static PyTypeObject __pyx_type___pyx_MemviewEnum = {
+  PyVarObject_HEAD_INIT(0, 0)
+  "fairseq.data.data_utils_fast.""Enum", /*tp_name*/
+  sizeof(struct __pyx_MemviewEnum_obj), /*tp_basicsize*/
+  0, /*tp_itemsize*/
+  __pyx_tp_dealloc_Enum, /*tp_dealloc*/
+  #if PY_VERSION_HEX < 0x030800b4
+  0, /*tp_print*/
+  #endif
+  #if PY_VERSION_HEX >= 0x030800b4
+  0, /*tp_vectorcall_offset*/
+  #endif
+  0, /*tp_getattr*/
+  0, /*tp_setattr*/
+  #if PY_MAJOR_VERSION < 3
+  0, /*tp_compare*/
+  #endif
+  #if PY_MAJOR_VERSION >= 3
+  0, /*tp_as_async*/
+  #endif
+  __pyx_MemviewEnum___repr__, /*tp_repr*/
+  0, /*tp_as_number*/
+  0, /*tp_as_sequence*/
+  0, /*tp_as_mapping*/
+  0, /*tp_hash*/
+  0, /*tp_call*/
+  0, /*tp_str*/
+  0, /*tp_getattro*/
+  0, /*tp_setattro*/
+  0, /*tp_as_buffer*/
+  Py_TPFLAGS_DEFAULT|Py_TPFLAGS_HAVE_VERSION_TAG|Py_TPFLAGS_CHECKTYPES|Py_TPFLAGS_HAVE_NEWBUFFER|Py_TPFLAGS_BASETYPE|Py_TPFLAGS_HAVE_GC, /*tp_flags*/
+  0, /*tp_doc*/
+  __pyx_tp_traverse_Enum, /*tp_traverse*/
+  __pyx_tp_clear_Enum, /*tp_clear*/
+  0, /*tp_richcompare*/
+  0, /*tp_weaklistoffset*/
+  0, /*tp_iter*/
+  0, /*tp_iternext*/
+  __pyx_methods_Enum, /*tp_methods*/
+  0, /*tp_members*/
+  0, /*tp_getset*/
+  0, /*tp_base*/
+  0, /*tp_dict*/
+  0, /*tp_descr_get*/
+  0, /*tp_descr_set*/
+  #if !CYTHON_USE_TYPE_SPECS
+  0, /*tp_dictoffset*/
+  #endif
+  __pyx_MemviewEnum___init__, /*tp_init*/
+  0, /*tp_alloc*/
+  __pyx_tp_new_Enum, /*tp_new*/
+  0, /*tp_free*/
+  0, /*tp_is_gc*/
+  0, /*tp_bases*/
+  0, /*tp_mro*/
+  0, /*tp_cache*/
+  0, /*tp_subclasses*/
+  0, /*tp_weaklist*/
+  0, /*tp_del*/
+  0, /*tp_version_tag*/
+  #if PY_VERSION_HEX >= 0x030400a1
+  #if CYTHON_USE_TP_FINALIZE
+  0, /*tp_finalize*/
+  #else
+  NULL, /*tp_finalize*/
+  #endif
+  #endif
+  #if PY_VERSION_HEX >= 0x030800b1 && (!CYTHON_COMPILING_IN_PYPY || PYPY_VERSION_NUM >= 0x07030800)
+  0, /*tp_vectorcall*/
+  #endif
+  #if __PYX_NEED_TP_PRINT_SLOT == 1
+  0, /*tp_print*/
+  #endif
+  #if PY_VERSION_HEX >= 0x030C0000
+  0, /*tp_watched*/
+  #endif
+  #if PY_VERSION_HEX >= 0x030d00A4
+  0, /*tp_versions_used*/
+  #endif
+  #if CYTHON_COMPILING_IN_PYPY && PY_VERSION_HEX >= 0x03090000 && PY_VERSION_HEX < 0x030a0000
+  0, /*tp_pypy_flags*/
+  #endif
+};
+#endif
+/* File-local storage for the memoryview type's vtable struct.
+ * NOTE(review): presumably what __pyx_vtabptr_memoryview points at - confirm at module init. */
+static struct __pyx_vtabstruct_memoryview __pyx_vtable_memoryview;
+
+/* tp_new for the memoryview extension type.
+ *
+ * Allocates an instance of `t`, installs the vtable pointer, sets the object
+ * fields (`obj`, `_size`, `_array_interface`) to owned references to None,
+ * clears `view.obj`, and then runs __cinit__ with the caller's arguments.
+ *
+ * Returns a new reference, or NULL when allocation or __cinit__ fails; on a
+ * __cinit__ failure the partially built object is released.
+ */
+static PyObject *__pyx_tp_new_memoryview(PyTypeObject *t, PyObject *a, PyObject *k) {
+  struct __pyx_memoryview_obj *p;
+  PyObject *o;
+  #if CYTHON_COMPILING_IN_LIMITED_API
+  /* Struct fields are not accessible here, so fetch tp_alloc through the slot API. */
+  allocfunc alloc_func = (allocfunc)PyType_GetSlot(t, Py_tp_alloc);
+  o = alloc_func(t, 0);
+  #else
+  if (likely(!__Pyx_PyType_HasFeature(t, Py_TPFLAGS_IS_ABSTRACT))) {
+    o = (*t->tp_alloc)(t, 0);
+  } else {
+    /* Abstract types are routed through object.__new__ instead of tp_alloc. */
+    o = (PyObject *) PyBaseObject_Type.tp_new(t, __pyx_empty_tuple, 0);
+  }
+  #endif
+  /* The failure check must cover both allocation paths: previously it lived
+   * inside the #else branch, leaving the limited-API path to dereference NULL. */
+  if (unlikely(!o)) return 0;
+  p = ((struct __pyx_memoryview_obj *)o);
+  p->__pyx_vtab = __pyx_vtabptr_memoryview;
+  p->obj = Py_None; Py_INCREF(Py_None);
+  p->_size = Py_None; Py_INCREF(Py_None);
+  p->_array_interface = Py_None; Py_INCREF(Py_None);
+  p->view.obj = NULL;
+  if (unlikely(__pyx_memoryview___cinit__(o, a, k) < 0)) goto bad;
+  return o;
+  bad:
+  Py_DECREF(o); o = 0;
+  return NULL;
+}
+
+/* tp_dealloc for the memoryview type.
+ * Order: optional finalizer call, GC untrack, the type's __dealloc__, release
+ * of the owned object fields, then tp_free. */
+static void __pyx_tp_dealloc_memoryview(PyObject *o) {
+  struct __pyx_memoryview_obj *p = (struct __pyx_memoryview_obj *)o;
+  #if CYTHON_USE_TP_FINALIZE
+  /* Run tp_finalize first when one is set, the object has not been finalized
+   * yet, and the type's tp_dealloc is this very function; stop here if the
+   * finalizer call returns non-zero. */
+  if (unlikely((PY_VERSION_HEX >= 0x03080000 || __Pyx_PyType_HasFeature(Py_TYPE(o), Py_TPFLAGS_HAVE_FINALIZE)) && __Pyx_PyObject_GetSlot(o, tp_finalize, destructor)) && !__Pyx_PyObject_GC_IsFinalized(o)) {
+    if (__Pyx_PyObject_GetSlot(o, tp_dealloc, destructor) == __pyx_tp_dealloc_memoryview) {
+      if (PyObject_CallFinalizerFromDealloc(o)) return;
+    }
+  }
+  #endif
+  PyObject_GC_UnTrack(o);
+  {
+    /* Save any pending exception around __dealloc__ and restore it afterwards.
+     * The refcount is raised by one for the duration of the call and lowered
+     * again right after. */
+    PyObject *etype, *eval, *etb;
+    PyErr_Fetch(&etype, &eval, &etb);
+    __Pyx_SET_REFCNT(o, Py_REFCNT(o) + 1);
+    __pyx_memoryview___dealloc__(o);
+    __Pyx_SET_REFCNT(o, Py_REFCNT(o) - 1);
+    PyErr_Restore(etype, eval, etb);
+  }
+  Py_CLEAR(p->obj);
+  Py_CLEAR(p->_size);
+  Py_CLEAR(p->_array_interface);
+  #if CYTHON_USE_TYPE_SLOTS || CYTHON_COMPILING_IN_PYPY
+  (*Py_TYPE(o)->tp_free)(o);
+  #else
+  {
+    /* Without direct slot access, look tp_free up through PyType_GetSlot. */
+    freefunc tp_free = (freefunc)PyType_GetSlot(Py_TYPE(o), Py_tp_free);
+    if (tp_free) tp_free(o);
+  }
+  #endif
+}
+
+/* tp_traverse for memoryview: offers `obj`, `_size`, `_array_interface` and
+ * `view.obj` to the visitor, in that order, skipping members that are NULL.
+ * Stops at, and returns, the first non-zero visitor result; otherwise returns 0. */
+static int __pyx_tp_traverse_memoryview(PyObject *o, visitproc v, void *a) {
+  struct __pyx_memoryview_obj *self = (struct __pyx_memoryview_obj *)o;
+  int status = 0;
+  if (self->obj) {
+    status = (*v)(self->obj, a);
+  }
+  /* Each later member is only looked at while no visit has failed. */
+  if (!status && self->_size) {
+    status = (*v)(self->_size, a);
+  }
+  if (!status && self->_array_interface) {
+    status = (*v)(self->_array_interface, a);
+  }
+  if (!status && self->view.obj) {
+    status = (*v)(self->view.obj, a);
+  }
+  return status;
+}
+
+/* tp_clear for memoryview: resets `obj`, `_size` and `_array_interface` to
+ * owned references to None (dropping what they held before, one field at a
+ * time), then clears `view.obj`. Always returns 0. */
+static int __pyx_tp_clear_memoryview(PyObject *o) {
+  struct __pyx_memoryview_obj *self = (struct __pyx_memoryview_obj *)o;
+  PyObject *previous;
+
+  previous = ((PyObject*)self->obj);
+  Py_INCREF(Py_None);
+  self->obj = Py_None;
+  Py_XDECREF(previous);
+
+  previous = ((PyObject*)self->_size);
+  Py_INCREF(Py_None);
+  self->_size = Py_None;
+  Py_XDECREF(previous);
+
+  previous = ((PyObject*)self->_array_interface);
+  Py_INCREF(Py_None);
+  self->_array_interface = Py_None;
+  Py_XDECREF(previous);
+
+  Py_CLEAR(self->view.obj);
+  return 0;
+}
+/* sq_item for memoryview: boxes the C index `i` as a Python integer and
+ * delegates to the type's mp_subscript. Returns the subscript result, or 0 if
+ * the index object could not be created. */
+static PyObject *__pyx_sq_item_memoryview(PyObject *o, Py_ssize_t i) {
+  PyObject *item;
+  PyObject *index = PyInt_FromSsize_t(i);
+  if (!index) {
+    return 0;
+  }
+  item = Py_TYPE(o)->tp_as_mapping->mp_subscript(o, index);
+  /* The boxed index is only needed for the lookup itself. */
+  Py_DECREF(index);
+  return item;
+}
+
+/* mp_ass_subscript for memoryview.
+ * A non-NULL `v` is an assignment and is forwarded to __setitem__. A NULL `v`
+ * is a deletion request, which is rejected with NotImplementedError (returns -1). */
+static int __pyx_mp_ass_subscript_memoryview(PyObject *o, PyObject *i, PyObject *v) {
+  if (!v) {
+    __Pyx_TypeName o_type_name = __Pyx_PyType_GetName(Py_TYPE(o));
+    PyErr_Format(PyExc_NotImplementedError,
+      "Subscript deletion not supported by " __Pyx_FMT_TYPENAME, o_type_name);
+    __Pyx_DECREF_TypeName(o_type_name);
+    return -1;
+  }
+  return __pyx_memoryview___setitem__(o, i, v);
+}
+
+/* Getter registered for the "T" attribute in __pyx_getsets_memoryview; the
+ * closure pointer `x` is unused and the call forwards to the generated __get__. */
+static PyObject *__pyx_getprop___pyx_memoryview_T(PyObject *o, CYTHON_UNUSED void *x) {
+  return __pyx_pw_15View_dot_MemoryView_10memoryview_1T_1__get__(o);
+}
+
+/* Getter registered for the "base" attribute in __pyx_getsets_memoryview; the
+ * closure pointer `x` is unused and the call forwards to the generated __get__. */
+static PyObject *__pyx_getprop___pyx_memoryview_base(PyObject *o, CYTHON_UNUSED void *x) {
+  return __pyx_pw_15View_dot_MemoryView_10memoryview_4base_1__get__(o);
+}
+
+/* Getter registered for the "shape" attribute in __pyx_getsets_memoryview; the
+ * closure pointer `x` is unused and the call forwards to the generated __get__. */
+static PyObject *__pyx_getprop___pyx_memoryview_shape(PyObject *o, CYTHON_UNUSED void *x) {
+  return __pyx_pw_15View_dot_MemoryView_10memoryview_5shape_1__get__(o);
+}
+
+/* Getter registered for the "strides" attribute in __pyx_getsets_memoryview; the
+ * closure pointer `x` is unused and the call forwards to the generated __get__. */
+static PyObject *__pyx_getprop___pyx_memoryview_strides(PyObject *o, CYTHON_UNUSED void *x) {
+  return __pyx_pw_15View_dot_MemoryView_10memoryview_7strides_1__get__(o);
+}
+
+/* Getter registered for the "suboffsets" attribute in __pyx_getsets_memoryview; the
+ * closure pointer `x` is unused and the call forwards to the generated __get__. */
+static PyObject *__pyx_getprop___pyx_memoryview_suboffsets(PyObject *o, CYTHON_UNUSED void *x) {
+  return __pyx_pw_15View_dot_MemoryView_10memoryview_10suboffsets_1__get__(o);
+}
+
+/* Getter registered for the "ndim" attribute in __pyx_getsets_memoryview; the
+ * closure pointer `x` is unused and the call forwards to the generated __get__. */
+static PyObject *__pyx_getprop___pyx_memoryview_ndim(PyObject *o, CYTHON_UNUSED void *x) {
+  return __pyx_pw_15View_dot_MemoryView_10memoryview_4ndim_1__get__(o);
+}
+
+/* Getter registered for the "itemsize" attribute in __pyx_getsets_memoryview; the
+ * closure pointer `x` is unused and the call forwards to the generated __get__. */
+static PyObject *__pyx_getprop___pyx_memoryview_itemsize(PyObject *o, CYTHON_UNUSED void *x) {
+  return __pyx_pw_15View_dot_MemoryView_10memoryview_8itemsize_1__get__(o);
+}
+
+/* Getter registered for the "nbytes" attribute in __pyx_getsets_memoryview; the
+ * closure pointer `x` is unused and the call forwards to the generated __get__. */
+static PyObject *__pyx_getprop___pyx_memoryview_nbytes(PyObject *o, CYTHON_UNUSED void *x) {
+  return __pyx_pw_15View_dot_MemoryView_10memoryview_6nbytes_1__get__(o);
+}
+
+/* Getter registered for the "size" attribute in __pyx_getsets_memoryview; the
+ * closure pointer `x` is unused and the call forwards to the generated __get__. */
+static PyObject *__pyx_getprop___pyx_memoryview_size(PyObject *o, CYTHON_UNUSED void *x) {
+  return __pyx_pw_15View_dot_MemoryView_10memoryview_4size_1__get__(o);
+}
+
+/* Method-table adapter for `__repr__`: ignores the extra argument and
+ * forwards to the memoryview repr implementation. */
+static PyObject *__pyx_specialmethod___pyx_memoryview___repr__(PyObject *self, CYTHON_UNUSED PyObject *arg) {
+  return __pyx_memoryview___repr__(self);
+}
+
+/* Method table for memoryview. `__repr__` takes no arguments (METH_COEXIST);
+ * every other entry uses the fastcall-with-keywords calling convention.
+ * The all-zero entry ends the table. */
+static PyMethodDef __pyx_methods_memoryview[] = {
+  {"__repr__", (PyCFunction)__pyx_specialmethod___pyx_memoryview___repr__, METH_NOARGS|METH_COEXIST, 0},
+  {"is_c_contig", (PyCFunction)(void*)(__Pyx_PyCFunction_FastCallWithKeywords)__pyx_memoryview_is_c_contig, __Pyx_METH_FASTCALL|METH_KEYWORDS, 0},
+  {"is_f_contig", (PyCFunction)(void*)(__Pyx_PyCFunction_FastCallWithKeywords)__pyx_memoryview_is_f_contig, __Pyx_METH_FASTCALL|METH_KEYWORDS, 0},
+  {"copy", (PyCFunction)(void*)(__Pyx_PyCFunction_FastCallWithKeywords)__pyx_memoryview_copy, __Pyx_METH_FASTCALL|METH_KEYWORDS, 0},
+  {"copy_fortran", (PyCFunction)(void*)(__Pyx_PyCFunction_FastCallWithKeywords)__pyx_memoryview_copy_fortran, __Pyx_METH_FASTCALL|METH_KEYWORDS, 0},
+  {"__reduce_cython__", (PyCFunction)(void*)(__Pyx_PyCFunction_FastCallWithKeywords)__pyx_pw___pyx_memoryview_1__reduce_cython__, __Pyx_METH_FASTCALL|METH_KEYWORDS, 0},
+  {"__setstate_cython__", (PyCFunction)(void*)(__Pyx_PyCFunction_FastCallWithKeywords)__pyx_pw___pyx_memoryview_3__setstate_cython__, __Pyx_METH_FASTCALL|METH_KEYWORDS, 0},
+  {0, 0, 0, 0}
+};
+
+/* Property table for memoryview. Every entry has a getter and a 0 setter, so
+ * the attributes are read-only; doc string and closure are 0 as well.
+ * The all-zero entry ends the table. */
+static struct PyGetSetDef __pyx_getsets_memoryview[] = {
+  {(char *)"T", __pyx_getprop___pyx_memoryview_T, 0, (char *)0, 0},
+  {(char *)"base", __pyx_getprop___pyx_memoryview_base, 0, (char *)0, 0},
+  {(char *)"shape", __pyx_getprop___pyx_memoryview_shape, 0, (char *)0, 0},
+  {(char *)"strides", __pyx_getprop___pyx_memoryview_strides, 0, (char *)0, 0},
+  {(char *)"suboffsets", __pyx_getprop___pyx_memoryview_suboffsets, 0, (char *)0, 0},
+  {(char *)"ndim", __pyx_getprop___pyx_memoryview_ndim, 0, (char *)0, 0},
+  {(char *)"itemsize", __pyx_getprop___pyx_memoryview_itemsize, 0, (char *)0, 0},
+  {(char *)"nbytes", __pyx_getprop___pyx_memoryview_nbytes, 0, (char *)0, 0},
+  {(char *)"size", __pyx_getprop___pyx_memoryview_size, 0, (char *)0, 0},
+  {0, 0, 0, 0, 0}
+};
+#if CYTHON_USE_TYPE_SPECS
+#if !CYTHON_COMPILING_IN_LIMITED_API
+
+/* Buffer protocol table for memoryview in the type-spec build; compiled only
+ * when CYTHON_COMPILING_IN_LIMITED_API is off. The four leading slots exist
+ * only when PY_MAJOR_VERSION < 3; bf_getbuffer is provided, bf_releasebuffer is 0. */
+static PyBufferProcs __pyx_tp_as_buffer_memoryview = {
+  #if PY_MAJOR_VERSION < 3
+  0, /*bf_getreadbuffer*/
+  #endif
+  #if PY_MAJOR_VERSION < 3
+  0, /*bf_getwritebuffer*/
+  #endif
+  #if PY_MAJOR_VERSION < 3
+  0, /*bf_getsegcount*/
+  #endif
+  #if PY_MAJOR_VERSION < 3
+  0, /*bf_getcharbuffer*/
+  #endif
+  __pyx_memoryview_getbuffer, /*bf_getbuffer*/
+  0, /*bf_releasebuffer*/
+};
+#endif
+/* Slot table for the memoryview type in its PyType_Spec form. The same length
+ * function serves both the sequence and the mapping protocol.
+ * {0, 0} ends the table. */
+static PyType_Slot __pyx_type___pyx_memoryview_slots[] = {
+  {Py_tp_dealloc, (void *)__pyx_tp_dealloc_memoryview},
+  {Py_tp_repr, (void *)__pyx_memoryview___repr__},
+  {Py_sq_length, (void *)__pyx_memoryview___len__},
+  {Py_sq_item, (void *)__pyx_sq_item_memoryview},
+  {Py_mp_length, (void *)__pyx_memoryview___len__},
+  {Py_mp_subscript, (void *)__pyx_memoryview___getitem__},
+  {Py_mp_ass_subscript, (void *)__pyx_mp_ass_subscript_memoryview},
+  {Py_tp_str, (void *)__pyx_memoryview___str__},
+  /* The buffer slot is listed only when the headers define its slot id. */
+  #if defined(Py_bf_getbuffer)
+  {Py_bf_getbuffer, (void *)__pyx_memoryview_getbuffer},
+  #endif
+  {Py_tp_traverse, (void *)__pyx_tp_traverse_memoryview},
+  {Py_tp_clear, (void *)__pyx_tp_clear_memoryview},
+  {Py_tp_methods, (void *)__pyx_methods_memoryview},
+  {Py_tp_getset, (void *)__pyx_getsets_memoryview},
+  {Py_tp_new, (void *)__pyx_tp_new_memoryview},
+  {0, 0},
+};
+/* PyType_Spec describing the memoryview type: qualified name, instance size,
+ * item size (0), type flags (including Py_TPFLAGS_HAVE_GC), and its slot table. */
+static PyType_Spec __pyx_type___pyx_memoryview_spec = {
+  "fairseq.data.data_utils_fast.memoryview",
+  sizeof(struct __pyx_memoryview_obj),
+  0,
+  Py_TPFLAGS_DEFAULT|Py_TPFLAGS_HAVE_VERSION_TAG|Py_TPFLAGS_CHECKTYPES|Py_TPFLAGS_HAVE_NEWBUFFER|Py_TPFLAGS_BASETYPE|Py_TPFLAGS_HAVE_GC,
+  __pyx_type___pyx_memoryview_slots,
+};
+#else
+
+/* Sequence protocol table for memoryview. Entries are positional; only
+ * sq_length and sq_item are implemented, every other slot is left 0. */
+static PySequenceMethods __pyx_tp_as_sequence_memoryview = {
+  __pyx_memoryview___len__, /*sq_length*/
+  0, /*sq_concat*/
+  0, /*sq_repeat*/
+  __pyx_sq_item_memoryview, /*sq_item*/
+  0, /*sq_slice*/
+  0, /*sq_ass_item*/
+  0, /*sq_ass_slice*/
+  0, /*sq_contains*/
+  0, /*sq_inplace_concat*/
+  0, /*sq_inplace_repeat*/
+};
+
+/* Mapping protocol table for memoryview: length, subscript read, and subscript
+ * assignment (positional initializer). */
+static PyMappingMethods __pyx_tp_as_mapping_memoryview = {
+  __pyx_memoryview___len__, /*mp_length*/
+  __pyx_memoryview___getitem__, /*mp_subscript*/
+  __pyx_mp_ass_subscript_memoryview, /*mp_ass_subscript*/
+};
+
+/* Buffer protocol table for memoryview in the static-type build (#else branch).
+ * The four leading slots exist only when PY_MAJOR_VERSION < 3; bf_getbuffer is
+ * provided and bf_releasebuffer is 0. */
+static PyBufferProcs __pyx_tp_as_buffer_memoryview = {
+  #if PY_MAJOR_VERSION < 3
+  0, /*bf_getreadbuffer*/
+  #endif
+  #if PY_MAJOR_VERSION < 3
+  0, /*bf_getwritebuffer*/
+  #endif
+  #if PY_MAJOR_VERSION < 3
+  0, /*bf_getsegcount*/
+  #endif
+  #if PY_MAJOR_VERSION < 3
+  0, /*bf_getcharbuffer*/
+  #endif
+  __pyx_memoryview_getbuffer, /*bf_getbuffer*/
+  0, /*bf_releasebuffer*/
+};
+
+/* Statically initialized type object for memoryview, compiled when
+ * CYTHON_USE_TYPE_SPECS is off (the #else branch of the spec definition).
+ * The initializer is positional: entries must stay in PyTypeObject field order,
+ * and the preprocessor guards add or drop fields per interpreter version. */
+static PyTypeObject __pyx_type___pyx_memoryview = {
+  PyVarObject_HEAD_INIT(0, 0)
+  "fairseq.data.data_utils_fast.""memoryview", /*tp_name*/
+  sizeof(struct __pyx_memoryview_obj), /*tp_basicsize*/
+  0, /*tp_itemsize*/
+  __pyx_tp_dealloc_memoryview, /*tp_dealloc*/
+  #if PY_VERSION_HEX < 0x030800b4
+  0, /*tp_print*/
+  #endif
+  #if PY_VERSION_HEX >= 0x030800b4
+  0, /*tp_vectorcall_offset*/
+  #endif
+  0, /*tp_getattr*/
+  0, /*tp_setattr*/
+  #if PY_MAJOR_VERSION < 3
+  0, /*tp_compare*/
+  #endif
+  #if PY_MAJOR_VERSION >= 3
+  0, /*tp_as_async*/
+  #endif
+  __pyx_memoryview___repr__, /*tp_repr*/
+  0, /*tp_as_number*/
+  &__pyx_tp_as_sequence_memoryview, /*tp_as_sequence*/
+  &__pyx_tp_as_mapping_memoryview, /*tp_as_mapping*/
+  0, /*tp_hash*/
+  0, /*tp_call*/
+  __pyx_memoryview___str__, /*tp_str*/
+  0, /*tp_getattro*/
+  0, /*tp_setattro*/
+  &__pyx_tp_as_buffer_memoryview, /*tp_as_buffer*/
+  Py_TPFLAGS_DEFAULT|Py_TPFLAGS_HAVE_VERSION_TAG|Py_TPFLAGS_CHECKTYPES|Py_TPFLAGS_HAVE_NEWBUFFER|Py_TPFLAGS_BASETYPE|Py_TPFLAGS_HAVE_GC, /*tp_flags*/
+  0, /*tp_doc*/
+  __pyx_tp_traverse_memoryview, /*tp_traverse*/
+  __pyx_tp_clear_memoryview, /*tp_clear*/
+  0, /*tp_richcompare*/
+  0, /*tp_weaklistoffset*/
+  0, /*tp_iter*/
+  0, /*tp_iternext*/
+  __pyx_methods_memoryview, /*tp_methods*/
+  0, /*tp_members*/
+  __pyx_getsets_memoryview, /*tp_getset*/
+  0, /*tp_base*/
+  0, /*tp_dict*/
+  0, /*tp_descr_get*/
+  0, /*tp_descr_set*/
+  #if !CYTHON_USE_TYPE_SPECS
+  0, /*tp_dictoffset*/
+  #endif
+  0, /*tp_init*/
+  0, /*tp_alloc*/
+  __pyx_tp_new_memoryview, /*tp_new*/
+  0, /*tp_free*/
+  0, /*tp_is_gc*/
+  0, /*tp_bases*/
+  0, /*tp_mro*/
+  0, /*tp_cache*/
+  0, /*tp_subclasses*/
+  0, /*tp_weaklist*/
+  0, /*tp_del*/
+  0, /*tp_version_tag*/
+  #if PY_VERSION_HEX >= 0x030400a1
+  #if CYTHON_USE_TP_FINALIZE
+  0, /*tp_finalize*/
+  #else
+  NULL, /*tp_finalize*/
+  #endif
+  #endif
+  #if PY_VERSION_HEX >= 0x030800b1 && (!CYTHON_COMPILING_IN_PYPY || PYPY_VERSION_NUM >= 0x07030800)
+  0, /*tp_vectorcall*/
+  #endif
+  #if __PYX_NEED_TP_PRINT_SLOT == 1
+  0, /*tp_print*/
+  #endif
+  #if PY_VERSION_HEX >= 0x030C0000
+  0, /*tp_watched*/
+  #endif
+  #if PY_VERSION_HEX >= 0x030d00A4
+  0, /*tp_versions_used*/
+  #endif
+  #if CYTHON_COMPILING_IN_PYPY && PY_VERSION_HEX >= 0x03090000 && PY_VERSION_HEX < 0x030a0000
+  0, /*tp_pypy_flags*/
+  #endif
+};
+#endif
+/* File-local storage for the _memoryviewslice type's vtable struct.
+ * NOTE(review): presumably what __pyx_vtabptr__memoryviewslice points at - confirm at module init. */
+static struct __pyx_vtabstruct__memoryviewslice __pyx_vtable__memoryviewslice;
+
+/* tp_new for _memoryviewslice.
+ * Builds the object through the memoryview tp_new (which also runs the base
+ * __cinit__ with `a` and `k`), then initializes the slice-specific members.
+ * Returns a new reference, or 0 if the base constructor failed. */
+static PyObject *__pyx_tp_new__memoryviewslice(PyTypeObject *t, PyObject *a, PyObject *k) {
+  struct __pyx_memoryviewslice_obj *p;
+  PyObject *o = __pyx_tp_new_memoryview(t, a, k);
+  if (unlikely(!o)) return 0;
+  p = ((struct __pyx_memoryviewslice_obj *)o);
+  /* Replace the base vtable pointer installed by the memoryview tp_new. */
+  p->__pyx_base.__pyx_vtab = (struct __pyx_vtabstruct_memoryview*)__pyx_vtabptr__memoryviewslice;
+  /* Placement new: construct the C++ slice struct in the already-allocated storage. */
+  new((void*)&(p->from_slice)) __Pyx_memviewslice();
+  p->from_object = Py_None; Py_INCREF(Py_None);
+  p->from_slice.memview = NULL;
+  return o;
+}
+
+/* tp_dealloc for _memoryviewslice.
+ * Order: optional finalizer call, GC untrack, the type's __dealloc__, teardown
+ * of the slice-specific members, then hand-off to the memoryview tp_dealloc. */
+static void __pyx_tp_dealloc__memoryviewslice(PyObject *o) {
+  struct __pyx_memoryviewslice_obj *p = (struct __pyx_memoryviewslice_obj *)o;
+  #if CYTHON_USE_TP_FINALIZE
+  /* Run tp_finalize first when one is set, the object has not been finalized
+   * yet, and the type's tp_dealloc is this very function; stop here if the
+   * finalizer call returns non-zero. */
+  if (unlikely((PY_VERSION_HEX >= 0x03080000 || __Pyx_PyType_HasFeature(Py_TYPE(o), Py_TPFLAGS_HAVE_FINALIZE)) && __Pyx_PyObject_GetSlot(o, tp_finalize, destructor)) && !__Pyx_PyObject_GC_IsFinalized(o)) {
+    if (__Pyx_PyObject_GetSlot(o, tp_dealloc, destructor) == __pyx_tp_dealloc__memoryviewslice) {
+      if (PyObject_CallFinalizerFromDealloc(o)) return;
+    }
+  }
+  #endif
+  PyObject_GC_UnTrack(o);
+  {
+    /* Save any pending exception around __dealloc__ and restore it afterwards.
+     * The refcount is raised by one for the duration of the call and lowered
+     * again right after. */
+    PyObject *etype, *eval, *etb;
+    PyErr_Fetch(&etype, &eval, &etb);
+    __Pyx_SET_REFCNT(o, Py_REFCNT(o) + 1);
+    __pyx_memoryviewslice___dealloc__(o);
+    __Pyx_SET_REFCNT(o, Py_REFCNT(o) - 1);
+    PyErr_Restore(etype, eval, etb);
+  }
+  __Pyx_call_destructor(p->from_slice);
+  Py_CLEAR(p->from_object);
+  /* Re-track before delegating: the base dealloc begins with its own
+   * PyObject_GC_UnTrack call. */
+  PyObject_GC_Track(o);
+  __pyx_tp_dealloc_memoryview(o);
+}
+
+/* tp_traverse for _memoryviewslice: traverses the memoryview base members
+ * first, then offers `from_object` to the visitor when it is set.
+ * Returns the first non-zero result encountered, otherwise 0. */
+static int __pyx_tp_traverse__memoryviewslice(PyObject *o, visitproc v, void *a) {
+  struct __pyx_memoryviewslice_obj *self = (struct __pyx_memoryviewslice_obj *)o;
+  int status = __pyx_tp_traverse_memoryview(o, v, a);
+  if (status) {
+    return status;
+  }
+  if (!self->from_object) {
+    return 0;
+  }
+  return (*v)(self->from_object, a);
+}
+
+/* tp_clear for _memoryviewslice: clears the memoryview base members, resets
+ * `from_object` to an owned reference to None (dropping the old value), and
+ * finally clears the `from_slice` memoryview slice. Always returns 0. */
+static int __pyx_tp_clear__memoryviewslice(PyObject *o) {
+  struct __pyx_memoryviewslice_obj *self = (struct __pyx_memoryviewslice_obj *)o;
+  PyObject *previous;
+
+  __pyx_tp_clear_memoryview(o);
+
+  previous = ((PyObject*)self->from_object);
+  Py_INCREF(Py_None);
+  self->from_object = Py_None;
+  Py_XDECREF(previous);
+
+  __PYX_XCLEAR_MEMVIEW(&self->from_slice, 1);
+  return 0;
+}
+
+/* Method table for _memoryviewslice: only the generated `__reduce_cython__` /
+ * `__setstate_cython__` entries (fastcall-with-keywords convention).
+ * The all-zero entry ends the table. */
+static PyMethodDef __pyx_methods__memoryviewslice[] = {
+  {"__reduce_cython__", (PyCFunction)(void*)(__Pyx_PyCFunction_FastCallWithKeywords)__pyx_pw___pyx_memoryviewslice_1__reduce_cython__, __Pyx_METH_FASTCALL|METH_KEYWORDS, 0},
+  {"__setstate_cython__", (PyCFunction)(void*)(__Pyx_PyCFunction_FastCallWithKeywords)__pyx_pw___pyx_memoryviewslice_3__setstate_cython__, __Pyx_METH_FASTCALL|METH_KEYWORDS, 0},
+  {0, 0, 0, 0}
+};
+#if CYTHON_USE_TYPE_SPECS
+/* Slot table for the _memoryviewslice type in its PyType_Spec form: dealloc,
+ * doc string, GC traverse/clear, methods and new. {0, 0} ends the table. */
+static PyType_Slot __pyx_type___pyx_memoryviewslice_slots[] = {
+  {Py_tp_dealloc, (void *)__pyx_tp_dealloc__memoryviewslice},
+  {Py_tp_doc, (void *)PyDoc_STR("Internal class for passing memoryview slices to Python")},
+  {Py_tp_traverse, (void *)__pyx_tp_traverse__memoryviewslice},
+  {Py_tp_clear, (void *)__pyx_tp_clear__memoryviewslice},
+  {Py_tp_methods, (void *)__pyx_methods__memoryviewslice},
+  {Py_tp_new, (void *)__pyx_tp_new__memoryviewslice},
+  {0, 0},
+};
+/* PyType_Spec describing the _memoryviewslice type: qualified name, instance
+ * size, item size (0), type flags (HAVE_GC and SEQUENCE among them), and its
+ * slot table. */
+static PyType_Spec __pyx_type___pyx_memoryviewslice_spec = {
+  "fairseq.data.data_utils_fast._memoryviewslice",
+  sizeof(struct __pyx_memoryviewslice_obj),
+  0,
+  Py_TPFLAGS_DEFAULT|Py_TPFLAGS_HAVE_VERSION_TAG|Py_TPFLAGS_CHECKTYPES|Py_TPFLAGS_HAVE_NEWBUFFER|Py_TPFLAGS_BASETYPE|Py_TPFLAGS_HAVE_GC|Py_TPFLAGS_SEQUENCE,
+  __pyx_type___pyx_memoryviewslice_slots,
+};
+#else
+
+/* Statically initialized type object for _memoryviewslice, compiled when
+ * CYTHON_USE_TYPE_SPECS is off (the #else branch of the spec definition).
+ * The initializer is positional: entries must stay in PyTypeObject field order,
+ * and the preprocessor guards add or drop fields per interpreter version.
+ * tp_repr and tp_str are filled in explicitly only for PyPy builds; elsewhere
+ * they are left 0. */
+static PyTypeObject __pyx_type___pyx_memoryviewslice = {
+  PyVarObject_HEAD_INIT(0, 0)
+  "fairseq.data.data_utils_fast.""_memoryviewslice", /*tp_name*/
+  sizeof(struct __pyx_memoryviewslice_obj), /*tp_basicsize*/
+  0, /*tp_itemsize*/
+  __pyx_tp_dealloc__memoryviewslice, /*tp_dealloc*/
+  #if PY_VERSION_HEX < 0x030800b4
+  0, /*tp_print*/
+  #endif
+  #if PY_VERSION_HEX >= 0x030800b4
+  0, /*tp_vectorcall_offset*/
+  #endif
+  0, /*tp_getattr*/
+  0, /*tp_setattr*/
+  #if PY_MAJOR_VERSION < 3
+  0, /*tp_compare*/
+  #endif
+  #if PY_MAJOR_VERSION >= 3
+  0, /*tp_as_async*/
+  #endif
+  #if CYTHON_COMPILING_IN_PYPY || 0
+  __pyx_memoryview___repr__, /*tp_repr*/
+  #else
+  0, /*tp_repr*/
+  #endif
+  0, /*tp_as_number*/
+  0, /*tp_as_sequence*/
+  0, /*tp_as_mapping*/
+  0, /*tp_hash*/
+  0, /*tp_call*/
+  #if CYTHON_COMPILING_IN_PYPY || 0
+  __pyx_memoryview___str__, /*tp_str*/
+  #else
+  0, /*tp_str*/
+  #endif
+  0, /*tp_getattro*/
+  0, /*tp_setattro*/
+  0, /*tp_as_buffer*/
+  Py_TPFLAGS_DEFAULT|Py_TPFLAGS_HAVE_VERSION_TAG|Py_TPFLAGS_CHECKTYPES|Py_TPFLAGS_HAVE_NEWBUFFER|Py_TPFLAGS_BASETYPE|Py_TPFLAGS_HAVE_GC|Py_TPFLAGS_SEQUENCE, /*tp_flags*/
+  PyDoc_STR("Internal class for passing memoryview slices to Python"), /*tp_doc*/
+  __pyx_tp_traverse__memoryviewslice, /*tp_traverse*/
+  __pyx_tp_clear__memoryviewslice, /*tp_clear*/
+  0, /*tp_richcompare*/
+  0, /*tp_weaklistoffset*/
+  0, /*tp_iter*/
+  0, /*tp_iternext*/
+  __pyx_methods__memoryviewslice, /*tp_methods*/
+  0, /*tp_members*/
+  0, /*tp_getset*/
+  0, /*tp_base*/
+  0, /*tp_dict*/
+  0, /*tp_descr_get*/
+  0, /*tp_descr_set*/
+  #if !CYTHON_USE_TYPE_SPECS
+  0, /*tp_dictoffset*/
+  #endif
+  0, /*tp_init*/
+  0, /*tp_alloc*/
+  __pyx_tp_new__memoryviewslice, /*tp_new*/
+  0, /*tp_free*/
+  0, /*tp_is_gc*/
+  0, /*tp_bases*/
+  0, /*tp_mro*/
+  0, /*tp_cache*/
+  0, /*tp_subclasses*/
+  0, /*tp_weaklist*/
+  0, /*tp_del*/
+  0, /*tp_version_tag*/
+  #if PY_VERSION_HEX >= 0x030400a1
+  #if CYTHON_USE_TP_FINALIZE
+  0, /*tp_finalize*/
+  #else
+  NULL, /*tp_finalize*/
+  #endif
+  #endif
+  #if PY_VERSION_HEX >= 0x030800b1 && (!CYTHON_COMPILING_IN_PYPY || PYPY_VERSION_NUM >= 0x07030800)
+  0, /*tp_vectorcall*/
+  #endif
+  #if __PYX_NEED_TP_PRINT_SLOT == 1
+  0, /*tp_print*/
+  #endif
+  #if PY_VERSION_HEX >= 0x030C0000
+  0, /*tp_watched*/
+  #endif
+  #if PY_VERSION_HEX >= 0x030d00A4
+  0, /*tp_versions_used*/
+  #endif
+  #if CYTHON_COMPILING_IN_PYPY && PY_VERSION_HEX >= 0x03090000 && PY_VERSION_HEX < 0x030a0000
+  0, /*tp_pypy_flags*/
+  #endif
+};
+#endif
+
+/* Method table containing only the all-zero terminator entry, i.e. it
+ * registers no functions. */
+static PyMethodDef __pyx_methods[] = {
+  {0, 0, 0, 0}
+};
+/* Define CYTHON_SMALL_CODE unless it is already defined: empty for clang,
+ * __attribute__((cold)) for GCC 4.3 or newer, and empty for anything else.
+ * clang is tested first, so it never takes the __GNUC__ branch. */
+#ifndef CYTHON_SMALL_CODE
+#if defined(__clang__)
+    #define CYTHON_SMALL_CODE
+#elif defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))
+    #define CYTHON_SMALL_CODE __attribute__((cold))
+#else
+    #define CYTHON_SMALL_CODE
+#endif
+#endif
+/* #### Code section: pystring_table ### */
+
+static int __Pyx_CreateStringTabAndInitStrings(void) {
+  __Pyx_StringTabEntry __pyx_string_tab[] = {
+    {&__pyx_kp_u_, __pyx_k_, sizeof(__pyx_k_), 0, 1, 0, 0},
+    {&__pyx_n_s_ASCII, __pyx_k_ASCII, sizeof(__pyx_k_ASCII), 0, 0, 1, 1},
+    {&__pyx_kp_s_All_dimensions_preceding_dimensi, __pyx_k_All_dimensions_preceding_dimensi, sizeof(__pyx_k_All_dimensions_preceding_dimensi), 0, 0, 1, 0},
+    {&__pyx_n_s_AssertionError, __pyx_k_AssertionError, sizeof(__pyx_k_AssertionError), 0, 0, 1, 1},
+    {&__pyx_kp_s_Buffer_view_does_not_expose_stri, __pyx_k_Buffer_view_does_not_expose_stri, sizeof(__pyx_k_Buffer_view_does_not_expose_stri), 0, 0, 1, 0},
+    {&__pyx_kp_s_Can_only_create_a_buffer_that_is, __pyx_k_Can_only_create_a_buffer_that_is, sizeof(__pyx_k_Can_only_create_a_buffer_that_is), 0, 0, 1, 0},
+    {&__pyx_kp_s_Cannot_assign_to_read_only_memor, __pyx_k_Cannot_assign_to_read_only_memor, sizeof(__pyx_k_Cannot_assign_to_read_only_memor), 0, 0, 1, 0},
+    {&__pyx_kp_s_Cannot_create_writable_memory_vi, __pyx_k_Cannot_create_writable_memory_vi, sizeof(__pyx_k_Cannot_create_writable_memory_vi), 0, 0, 1, 0},
+    {&__pyx_kp_u_Cannot_index_with_type, __pyx_k_Cannot_index_with_type, sizeof(__pyx_k_Cannot_index_with_type), 0, 1, 0, 0},
+    {&__pyx_kp_s_Cannot_transpose_memoryview_with, __pyx_k_Cannot_transpose_memoryview_with, sizeof(__pyx_k_Cannot_transpose_memoryview_with), 0, 0, 1, 0},
+    {&__pyx_n_s_DTYPE, __pyx_k_DTYPE, sizeof(__pyx_k_DTYPE), 0, 0, 1, 1},
+    {&__pyx_kp_s_Dimension_d_is_not_direct, __pyx_k_Dimension_d_is_not_direct, sizeof(__pyx_k_Dimension_d_is_not_direct), 0, 0, 1, 0},
+    {&__pyx_n_s_Ellipsis, __pyx_k_Ellipsis, sizeof(__pyx_k_Ellipsis), 0, 0, 1, 1},
+    {&__pyx_kp_s_Empty_shape_tuple_for_cython_arr, __pyx_k_Empty_shape_tuple_for_cython_arr, sizeof(__pyx_k_Empty_shape_tuple_for_cython_arr), 0, 0, 1, 0},
+    {&__pyx_n_s_ImportError, __pyx_k_ImportError, sizeof(__pyx_k_ImportError), 0, 0, 1, 1},
+    {&__pyx_kp_s_Incompatible_checksums_0x_x_vs_0, __pyx_k_Incompatible_checksums_0x_x_vs_0, sizeof(__pyx_k_Incompatible_checksums_0x_x_vs_0), 0, 0, 1, 0},
+    {&__pyx_n_s_IndexError, __pyx_k_IndexError, sizeof(__pyx_k_IndexError), 0, 0, 1, 1},
+    {&__pyx_kp_s_Index_out_of_bounds_axis_d, __pyx_k_Index_out_of_bounds_axis_d, sizeof(__pyx_k_Index_out_of_bounds_axis_d), 0, 0, 1, 0},
+    {&__pyx_kp_s_Indirect_dimensions_not_supporte, __pyx_k_Indirect_dimensions_not_supporte, sizeof(__pyx_k_Indirect_dimensions_not_supporte), 0, 0, 1, 0},
+    {&__pyx_kp_u_Invalid_mode_expected_c_or_fortr, __pyx_k_Invalid_mode_expected_c_or_fortr, sizeof(__pyx_k_Invalid_mode_expected_c_or_fortr), 0, 1, 0, 0},
+    {&__pyx_kp_u_Invalid_shape_in_axis, __pyx_k_Invalid_shape_in_axis, sizeof(__pyx_k_Invalid_shape_in_axis), 0, 1, 0, 0},
+    {&__pyx_n_s_MemoryError, __pyx_k_MemoryError, sizeof(__pyx_k_MemoryError), 0, 0, 1, 1},
+    {&__pyx_kp_s_MemoryView_of_r_at_0x_x, __pyx_k_MemoryView_of_r_at_0x_x, sizeof(__pyx_k_MemoryView_of_r_at_0x_x), 0, 0, 1, 0},
+    {&__pyx_kp_s_MemoryView_of_r_object, __pyx_k_MemoryView_of_r_object, sizeof(__pyx_k_MemoryView_of_r_object), 0, 0, 1, 0},
+    {&__pyx_n_b_O, __pyx_k_O, sizeof(__pyx_k_O), 0, 0, 0, 1},
+    {&__pyx_kp_u_Out_of_bounds_on_buffer_access_a, __pyx_k_Out_of_bounds_on_buffer_access_a, sizeof(__pyx_k_Out_of_bounds_on_buffer_access_a), 0, 1, 0, 0},
+    {&__pyx_n_s_PickleError, __pyx_k_PickleError, sizeof(__pyx_k_PickleError), 0, 0, 1, 1},
+    {&__pyx_n_s_Sequence, __pyx_k_Sequence, sizeof(__pyx_k_Sequence), 0, 0, 1, 1},
+    {&__pyx_kp_s_Step_may_not_be_zero_axis_d, __pyx_k_Step_may_not_be_zero_axis_d, sizeof(__pyx_k_Step_may_not_be_zero_axis_d), 0, 0, 1, 0},
+    {&__pyx_n_s_TypeError, __pyx_k_TypeError, sizeof(__pyx_k_TypeError), 0, 0, 1, 1},
+    {&__pyx_kp_s_Unable_to_convert_item_to_object, __pyx_k_Unable_to_convert_item_to_object, sizeof(__pyx_k_Unable_to_convert_item_to_object), 0, 0, 1, 0},
+    {&__pyx_n_s_ValueError, __pyx_k_ValueError, sizeof(__pyx_k_ValueError), 0, 0, 1, 1},
+    {&__pyx_n_s_View_MemoryView, __pyx_k_View_MemoryView, sizeof(__pyx_k_View_MemoryView), 0, 0, 1, 1},
+    {&__pyx_kp_u__2, __pyx_k__2, sizeof(__pyx_k__2), 0, 1, 0, 0},
+    {&__pyx_n_s__26, __pyx_k__26, sizeof(__pyx_k__26), 0, 0, 1, 1},
+    {&__pyx_n_s__3, __pyx_k__3, sizeof(__pyx_k__3), 0, 0, 1, 1},
+    {&__pyx_kp_u__6, __pyx_k__6, sizeof(__pyx_k__6), 0, 1, 0, 0},
+    {&__pyx_kp_u__7, __pyx_k__7, sizeof(__pyx_k__7), 0, 1, 0, 0},
+    {&__pyx_n_s_abc, __pyx_k_abc, sizeof(__pyx_k_abc), 0, 0, 1, 1},
+    {&__pyx_n_s_allocate_buffer, __pyx_k_allocate_buffer, sizeof(__pyx_k_allocate_buffer), 0, 0, 1, 1},
+    {&__pyx_kp_u_and, __pyx_k_and, sizeof(__pyx_k_and), 0, 1, 0, 0},
+    {&__pyx_n_s_asyncio_coroutines, __pyx_k_asyncio_coroutines, sizeof(__pyx_k_asyncio_coroutines), 0, 0, 1, 1},
+    {&__pyx_n_s_base, __pyx_k_base, sizeof(__pyx_k_base), 0, 0, 1, 1},
+    {&__pyx_n_s_batch_by_size_fast, __pyx_k_batch_by_size_fast, sizeof(__pyx_k_batch_by_size_fast), 0, 0, 1, 1},
+    {&__pyx_n_s_batch_fixed_shapes_fast, __pyx_k_batch_fixed_shapes_fast, sizeof(__pyx_k_batch_fixed_shapes_fast), 0, 0, 1, 1},
+    {&__pyx_n_s_bsz_mult, __pyx_k_bsz_mult, sizeof(__pyx_k_bsz_mult), 0, 0, 1, 1},
+    {&__pyx_n_s_c, __pyx_k_c, sizeof(__pyx_k_c), 0, 0, 1, 1},
+    {&__pyx_n_u_c, __pyx_k_c, sizeof(__pyx_k_c), 0, 1, 0, 1},
+    {&__pyx_n_s_class, __pyx_k_class, sizeof(__pyx_k_class), 0, 0, 1, 1},
+    {&__pyx_n_s_class_getitem, __pyx_k_class_getitem, sizeof(__pyx_k_class_getitem), 0, 0, 1, 1},
+    {&__pyx_n_s_cline_in_traceback, __pyx_k_cline_in_traceback, sizeof(__pyx_k_cline_in_traceback), 0, 0, 1, 1},
+    {&__pyx_n_s_collections, __pyx_k_collections, sizeof(__pyx_k_collections), 0, 0, 1, 1},
+    {&__pyx_kp_s_collections_abc, __pyx_k_collections_abc, sizeof(__pyx_k_collections_abc), 0, 0, 1, 0},
+    {&__pyx_kp_s_contiguous_and_direct, __pyx_k_contiguous_and_direct, sizeof(__pyx_k_contiguous_and_direct), 0, 0, 1, 0},
+    {&__pyx_kp_s_contiguous_and_indirect, __pyx_k_contiguous_and_indirect, sizeof(__pyx_k_contiguous_and_indirect), 0, 0, 1, 0},
+    {&__pyx_n_s_count, __pyx_k_count, sizeof(__pyx_k_count), 0, 0, 1, 1},
+    {&__pyx_n_s_dict, __pyx_k_dict, sizeof(__pyx_k_dict), 0, 0, 1, 1},
+    {&__pyx_kp_u_disable, __pyx_k_disable, sizeof(__pyx_k_disable), 0, 1, 0, 0},
+    {&__pyx_n_s_dtype_is_object, __pyx_k_dtype_is_object, sizeof(__pyx_k_dtype_is_object), 0, 0, 1, 1},
+    {&__pyx_kp_u_enable, __pyx_k_enable, sizeof(__pyx_k_enable), 0, 1, 0, 0},
+    {&__pyx_n_s_encode, __pyx_k_encode, sizeof(__pyx_k_encode), 0, 0, 1, 1},
+    {&__pyx_n_s_enumerate, __pyx_k_enumerate, sizeof(__pyx_k_enumerate), 0, 0, 1, 1},
+    {&__pyx_n_s_error, __pyx_k_error, sizeof(__pyx_k_error), 0, 0, 1, 1},
+    {&__pyx_n_s_fairseq_data_data_utils_fast, __pyx_k_fairseq_data_data_utils_fast, sizeof(__pyx_k_fairseq_data_data_utils_fast), 0, 0, 1, 1},
+    {&__pyx_kp_s_fairseq_data_data_utils_fast_pyx, __pyx_k_fairseq_data_data_utils_fast_pyx, sizeof(__pyx_k_fairseq_data_data_utils_fast_pyx), 0, 0, 1, 0},
+    {&__pyx_n_s_fixed_shapes_sorted, __pyx_k_fixed_shapes_sorted, sizeof(__pyx_k_fixed_shapes_sorted), 0, 0, 1, 1},
+    {&__pyx_n_s_flags, __pyx_k_flags, sizeof(__pyx_k_flags), 0, 0, 1, 1},
+    {&__pyx_n_s_format, __pyx_k_format, sizeof(__pyx_k_format), 0, 0, 1, 1},
+    {&__pyx_n_s_fortran, __pyx_k_fortran, sizeof(__pyx_k_fortran), 0, 0, 1, 1},
+    {&__pyx_n_u_fortran, __pyx_k_fortran, sizeof(__pyx_k_fortran), 0, 1, 0, 1},
+    {&__pyx_kp_u_gc, __pyx_k_gc, sizeof(__pyx_k_gc), 0, 1, 0, 0},
+    {&__pyx_n_s_getstate, __pyx_k_getstate, sizeof(__pyx_k_getstate), 0, 0, 1, 1},
+    {&__pyx_kp_u_got, __pyx_k_got, sizeof(__pyx_k_got), 0, 1, 0, 0},
+    {&__pyx_kp_u_got_differing_extents_in_dimensi, __pyx_k_got_differing_extents_in_dimensi, sizeof(__pyx_k_got_differing_extents_in_dimensi), 0, 1, 0, 0},
+    {&__pyx_n_s_id, __pyx_k_id, sizeof(__pyx_k_id), 0, 0, 1, 1},
+    {&__pyx_n_s_import, __pyx_k_import, sizeof(__pyx_k_import), 0, 0, 1, 1},
+    {&__pyx_n_s_index, __pyx_k_index, sizeof(__pyx_k_index), 0, 0, 1, 1},
+    {&__pyx_n_s_indices, __pyx_k_indices, sizeof(__pyx_k_indices), 0, 0, 1, 1},
+    {&__pyx_n_s_initializing, __pyx_k_initializing, sizeof(__pyx_k_initializing), 0, 0, 1, 1},
+    {&__pyx_n_s_int64, __pyx_k_int64, sizeof(__pyx_k_int64), 0, 0, 1, 1},
+    {&__pyx_n_s_is_coroutine, __pyx_k_is_coroutine, sizeof(__pyx_k_is_coroutine), 0, 0, 1, 1},
+    {&__pyx_kp_u_isenabled, __pyx_k_isenabled, sizeof(__pyx_k_isenabled), 0, 1, 0, 0},
+    {&__pyx_n_s_itemsize, __pyx_k_itemsize, sizeof(__pyx_k_itemsize), 0, 0, 1, 1},
+    {&__pyx_kp_s_itemsize_0_for_cython_array, __pyx_k_itemsize_0_for_cython_array, sizeof(__pyx_k_itemsize_0_for_cython_array), 0, 0, 1, 0},
+    {&__pyx_n_s_main, __pyx_k_main, sizeof(__pyx_k_main), 0, 0, 1, 1},
+    {&__pyx_n_s_max, __pyx_k_max, sizeof(__pyx_k_max), 0, 0, 1, 1},
+    {&__pyx_n_s_max_sentences, __pyx_k_max_sentences, sizeof(__pyx_k_max_sentences), 0, 0, 1, 1},
+    {&__pyx_n_s_max_tokens, __pyx_k_max_tokens, sizeof(__pyx_k_max_tokens), 0, 0, 1, 1},
+    {&__pyx_n_s_memview, __pyx_k_memview, sizeof(__pyx_k_memview), 0, 0, 1, 1},
+    {&__pyx_n_s_mode, __pyx_k_mode, sizeof(__pyx_k_mode), 0, 0, 1, 1},
+    {&__pyx_n_s_name, __pyx_k_name, sizeof(__pyx_k_name), 0, 0, 1, 1},
+    {&__pyx_n_s_name_2, __pyx_k_name_2, sizeof(__pyx_k_name_2), 0, 0, 1, 1},
+    {&__pyx_n_s_ndim, __pyx_k_ndim, sizeof(__pyx_k_ndim), 0, 0, 1, 1},
+    {&__pyx_n_s_new, __pyx_k_new, sizeof(__pyx_k_new), 0, 0, 1, 1},
+    {&__pyx_kp_s_no_default___reduce___due_to_non, __pyx_k_no_default___reduce___due_to_non, sizeof(__pyx_k_no_default___reduce___due_to_non), 0, 0, 1, 0},
+    {&__pyx_n_s_np, __pyx_k_np, sizeof(__pyx_k_np), 0, 0, 1, 1},
+    {&__pyx_n_s_num_tokens_fn, __pyx_k_num_tokens_fn, sizeof(__pyx_k_num_tokens_fn), 0, 0, 1, 1},
+    {&__pyx_n_s_numpy, __pyx_k_numpy, sizeof(__pyx_k_numpy), 0, 0, 1, 1},
+    {&__pyx_kp_u_numpy__core_multiarray_failed_to, __pyx_k_numpy__core_multiarray_failed_to, sizeof(__pyx_k_numpy__core_multiarray_failed_to), 0, 1, 0, 0},
+    {&__pyx_kp_u_numpy__core_umath_failed_to_impo, __pyx_k_numpy__core_umath_failed_to_impo, sizeof(__pyx_k_numpy__core_umath_failed_to_impo), 0, 1, 0, 0},
+    {&__pyx_n_s_obj, __pyx_k_obj, sizeof(__pyx_k_obj), 0, 0, 1, 1},
+    {&__pyx_n_s_pack, __pyx_k_pack, sizeof(__pyx_k_pack), 0, 0, 1, 1},
+    {&__pyx_n_s_pickle, __pyx_k_pickle, sizeof(__pyx_k_pickle), 0, 0, 1, 1},
+    {&__pyx_n_s_pyx_PickleError, __pyx_k_pyx_PickleError, sizeof(__pyx_k_pyx_PickleError), 0, 0, 1, 1},
+    {&__pyx_n_s_pyx_checksum, __pyx_k_pyx_checksum, sizeof(__pyx_k_pyx_checksum), 0, 0, 1, 1},
+    {&__pyx_n_s_pyx_result, __pyx_k_pyx_result, sizeof(__pyx_k_pyx_result), 0, 0, 1, 1},
+    {&__pyx_n_s_pyx_state, __pyx_k_pyx_state, sizeof(__pyx_k_pyx_state), 0, 0, 1, 1},
+    {&__pyx_n_s_pyx_type, __pyx_k_pyx_type, sizeof(__pyx_k_pyx_type), 0, 0, 1, 1},
+    {&__pyx_n_s_pyx_unpickle_Enum, __pyx_k_pyx_unpickle_Enum, sizeof(__pyx_k_pyx_unpickle_Enum), 0, 0, 1, 1},
+    {&__pyx_n_s_pyx_vtable, __pyx_k_pyx_vtable, sizeof(__pyx_k_pyx_vtable), 0, 0, 1, 1},
+    {&__pyx_n_s_range, __pyx_k_range, sizeof(__pyx_k_range), 0, 0, 1, 1},
+    {&__pyx_n_s_reduce, __pyx_k_reduce, sizeof(__pyx_k_reduce), 0, 0, 1, 1},
+    {&__pyx_n_s_reduce_cython, __pyx_k_reduce_cython, sizeof(__pyx_k_reduce_cython), 0, 0, 1, 1},
+    {&__pyx_n_s_reduce_ex, __pyx_k_reduce_ex, sizeof(__pyx_k_reduce_ex), 0, 0, 1, 1},
+    {&__pyx_n_s_register, __pyx_k_register, sizeof(__pyx_k_register), 0, 0, 1, 1},
+    {&__pyx_kp_u_sentence_at_index_of_size_exceed, __pyx_k_sentence_at_index_of_size_exceed, sizeof(__pyx_k_sentence_at_index_of_size_exceed), 0, 1, 0, 0},
+    {&__pyx_n_s_setstate, __pyx_k_setstate, sizeof(__pyx_k_setstate), 0, 0, 1, 1},
+    {&__pyx_n_s_setstate_cython, __pyx_k_setstate_cython, sizeof(__pyx_k_setstate_cython), 0, 0, 1, 1},
+    {&__pyx_n_s_shape, __pyx_k_shape, sizeof(__pyx_k_shape), 0, 0, 1, 1},
+    {&__pyx_n_s_size, __pyx_k_size, sizeof(__pyx_k_size), 0, 0, 1, 1},
+    {&__pyx_n_s_spec, __pyx_k_spec, sizeof(__pyx_k_spec), 0, 0, 1, 1},
+    {&__pyx_n_s_start, __pyx_k_start, sizeof(__pyx_k_start), 0, 0, 1, 1},
+    {&__pyx_n_s_step, __pyx_k_step, sizeof(__pyx_k_step), 0, 0, 1, 1},
+    {&__pyx_n_s_stop, __pyx_k_stop, sizeof(__pyx_k_stop), 0, 0, 1, 1},
+    {&__pyx_kp_s_strided_and_direct, __pyx_k_strided_and_direct, sizeof(__pyx_k_strided_and_direct), 0, 0, 1, 0},
+    {&__pyx_kp_s_strided_and_direct_or_indirect, __pyx_k_strided_and_direct_or_indirect, sizeof(__pyx_k_strided_and_direct_or_indirect), 0, 0, 1, 0},
+    {&__pyx_kp_s_strided_and_indirect, __pyx_k_strided_and_indirect, sizeof(__pyx_k_strided_and_indirect), 0, 0, 1, 0},
+    {&__pyx_kp_s_stringsource, __pyx_k_stringsource, sizeof(__pyx_k_stringsource), 0, 0, 1, 0},
+    {&__pyx_n_s_struct, __pyx_k_struct, sizeof(__pyx_k_struct), 0, 0, 1, 1},
+    {&__pyx_n_s_sys, __pyx_k_sys, sizeof(__pyx_k_sys), 0, 0, 1, 1},
+    {&__pyx_n_s_test, __pyx_k_test, sizeof(__pyx_k_test), 0, 0, 1, 1},
+    {&__pyx_kp_s_unable_to_allocate_array_data, __pyx_k_unable_to_allocate_array_data, sizeof(__pyx_k_unable_to_allocate_array_data), 0, 0, 1, 0},
+    {&__pyx_kp_s_unable_to_allocate_shape_and_str, __pyx_k_unable_to_allocate_shape_and_str, sizeof(__pyx_k_unable_to_allocate_shape_and_str), 0, 0, 1, 0},
+    {&__pyx_n_s_unpack, __pyx_k_unpack, sizeof(__pyx_k_unpack), 0, 0, 1, 1},
+    {&__pyx_n_s_update, __pyx_k_update, sizeof(__pyx_k_update), 0, 0, 1, 1},
+    {&__pyx_n_s_version_info, __pyx_k_version_info, sizeof(__pyx_k_version_info), 0, 0, 1, 1},
+    {0, 0, 0, 0, 0, 0, 0}
+  };
+  return __Pyx_InitStrings(__pyx_string_tab);
+}
+/* #### Code section: cached_builtins ###
+ *
+ * __Pyx_InitCachedBuiltins: looks up each builtin name listed below with
+ * __Pyx_GetBuiltinName() and stores the result in the matching
+ * __pyx_builtin_* variable.  A NULL lookup result is routed through
+ * __PYX_ERR() to the __pyx_L1_error label.
+ *
+ * Returns 0 when every lookup succeeded, -1 via the error label.
+ *
+ * NOTE(review): __PYX_ERR is defined outside this chunk; its first two
+ * arguments look like a source-file index and a source line number --
+ * confirm against the macro definition.
+ */
+static CYTHON_SMALL_CODE int __Pyx_InitCachedBuiltins(void) {
+  __pyx_builtin_range = __Pyx_GetBuiltinName(__pyx_n_s_range); if (!__pyx_builtin_range) __PYX_ERR(0, 44, __pyx_L1_error)
+  __pyx_builtin_AssertionError = __Pyx_GetBuiltinName(__pyx_n_s_AssertionError); if (!__pyx_builtin_AssertionError) __PYX_ERR(0, 50, __pyx_L1_error)
+  __pyx_builtin_max = __Pyx_GetBuiltinName(__pyx_n_s_max); if (!__pyx_builtin_max) __PYX_ERR(0, 64, __pyx_L1_error)
+  __pyx_builtin___import__ = __Pyx_GetBuiltinName(__pyx_n_s_import); if (!__pyx_builtin___import__) __PYX_ERR(1, 100, __pyx_L1_error)
+  __pyx_builtin_ValueError = __Pyx_GetBuiltinName(__pyx_n_s_ValueError); if (!__pyx_builtin_ValueError) __PYX_ERR(1, 141, __pyx_L1_error)
+  __pyx_builtin_MemoryError = __Pyx_GetBuiltinName(__pyx_n_s_MemoryError); if (!__pyx_builtin_MemoryError) __PYX_ERR(1, 156, __pyx_L1_error)
+  __pyx_builtin_enumerate = __Pyx_GetBuiltinName(__pyx_n_s_enumerate); if (!__pyx_builtin_enumerate) __PYX_ERR(1, 159, __pyx_L1_error)
+  __pyx_builtin_TypeError = __Pyx_GetBuiltinName(__pyx_n_s_TypeError); if (!__pyx_builtin_TypeError) __PYX_ERR(1, 2, __pyx_L1_error)
+  __pyx_builtin_Ellipsis = __Pyx_GetBuiltinName(__pyx_n_s_Ellipsis); if (!__pyx_builtin_Ellipsis) __PYX_ERR(1, 408, __pyx_L1_error)
+  __pyx_builtin_id = __Pyx_GetBuiltinName(__pyx_n_s_id); if (!__pyx_builtin_id) __PYX_ERR(1, 618, __pyx_L1_error)
+  __pyx_builtin_IndexError = __Pyx_GetBuiltinName(__pyx_n_s_IndexError); if (!__pyx_builtin_IndexError) __PYX_ERR(1, 914, __pyx_L1_error)
+  __pyx_builtin_ImportError = __Pyx_GetBuiltinName(__pyx_n_s_ImportError); if (!__pyx_builtin_ImportError) __PYX_ERR(2, 1043, __pyx_L1_error)
+  return 0;
+  __pyx_L1_error:;
+  return -1;
+}
+/* #### Code section: cached_constants ###
+ *
+ * __Pyx_InitCachedConstants: builds the constant tuples, the slice object
+ * and the code objects below, storing each in its __pyx_tuple__N,
+ * __pyx_slice__N or __pyx_codeobj__N variable.  The comment in front of
+ * each group quotes the Cython source line the constant belongs to.
+ *
+ * Any failed allocation goes through __PYX_ERR() to __pyx_L1_error.
+ * Returns 0 on success and -1 on failure; the refnanny context is
+ * finished on both paths.
+ *
+ * NOTE(review): the tuples reference __pyx_int_* and interned-string
+ * objects that are created by other init functions, so this presumably
+ * has to run after them -- the call order is not visible in this chunk.
+ */
+
+static CYTHON_SMALL_CODE int __Pyx_InitCachedConstants(void) {
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__Pyx_InitCachedConstants", 0);
+
+  /* "View.MemoryView":582
+ *     def suboffsets(self):
+ *         if self.view.suboffsets == NULL:
+ *             return (-1,) * self.view.ndim             # <<<<<<<<<<<<<<
+ * 
+ *         return tuple([suboffset for suboffset in self.view.suboffsets[:self.view.ndim]])
+ */
+  __pyx_tuple__4 = PyTuple_New(1); if (unlikely(!__pyx_tuple__4)) __PYX_ERR(1, 582, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_tuple__4);
+  __Pyx_INCREF(__pyx_int_neg_1);
+  __Pyx_GIVEREF(__pyx_int_neg_1);
+  if (__Pyx_PyTuple_SET_ITEM(__pyx_tuple__4, 0, __pyx_int_neg_1)) __PYX_ERR(1, 582, __pyx_L1_error);
+  __Pyx_GIVEREF(__pyx_tuple__4);
+
+  /* "View.MemoryView":679
+ *     tup = <tuple>index if isinstance(index, tuple) else (index,)
+ * 
+ *     result = [slice(None)] * ndim             # <<<<<<<<<<<<<<
+ *     have_slices = False
+ *     seen_ellipsis = False
+ */
+  __pyx_slice__5 = PySlice_New(Py_None, Py_None, Py_None); if (unlikely(!__pyx_slice__5)) __PYX_ERR(1, 679, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_slice__5);
+  __Pyx_GIVEREF(__pyx_slice__5);
+
+  /* "(tree fragment)":4
+ *     cdef object __pyx_PickleError
+ *     cdef object __pyx_result
+ *     if __pyx_checksum not in (0x82a3537, 0x6ae9995, 0xb068931):             # <<<<<<<<<<<<<<
+ *         from pickle import PickleError as __pyx_PickleError
+ *         raise __pyx_PickleError, "Incompatible checksums (0x%x vs (0x82a3537, 0x6ae9995, 0xb068931) = (name))" % __pyx_checksum
+ */
+  __pyx_tuple__8 = PyTuple_Pack(3, __pyx_int_136983863, __pyx_int_112105877, __pyx_int_184977713); if (unlikely(!__pyx_tuple__8)) __PYX_ERR(1, 4, __pyx_L1_error) /* decimal forms of 0x82a3537, 0x6ae9995, 0xb068931 */
+  __Pyx_GOTREF(__pyx_tuple__8);
+  __Pyx_GIVEREF(__pyx_tuple__8);
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1043
+ *         __pyx_import_array()
+ *     except Exception:
+ *         raise ImportError("numpy._core.multiarray failed to import")             # <<<<<<<<<<<<<<
+ * 
+ * cdef inline int import_umath() except -1:
+ */
+  __pyx_tuple__9 = PyTuple_Pack(1, __pyx_kp_u_numpy__core_multiarray_failed_to); if (unlikely(!__pyx_tuple__9)) __PYX_ERR(2, 1043, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_tuple__9);
+  __Pyx_GIVEREF(__pyx_tuple__9);
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1049
+ *         _import_umath()
+ *     except Exception:
+ *         raise ImportError("numpy._core.umath failed to import")             # <<<<<<<<<<<<<<
+ * 
+ * cdef inline int import_ufunc() except -1:
+ */
+  __pyx_tuple__10 = PyTuple_Pack(1, __pyx_kp_u_numpy__core_umath_failed_to_impo); if (unlikely(!__pyx_tuple__10)) __PYX_ERR(2, 1049, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_tuple__10);
+  __Pyx_GIVEREF(__pyx_tuple__10);
+
+  /* "View.MemoryView":100
+ * cdef object __pyx_collections_abc_Sequence "__pyx_collections_abc_Sequence"
+ * try:
+ *     if __import__("sys").version_info >= (3, 3):             # <<<<<<<<<<<<<<
+ *         __pyx_collections_abc_Sequence = __import__("collections.abc").abc.Sequence
+ *     else:
+ */
+  __pyx_tuple__11 = PyTuple_Pack(1, __pyx_n_s_sys); if (unlikely(!__pyx_tuple__11)) __PYX_ERR(1, 100, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_tuple__11);
+  __Pyx_GIVEREF(__pyx_tuple__11);
+  __pyx_tuple__12 = PyTuple_Pack(2, __pyx_int_3, __pyx_int_3); if (unlikely(!__pyx_tuple__12)) __PYX_ERR(1, 100, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_tuple__12);
+  __Pyx_GIVEREF(__pyx_tuple__12);
+
+  /* "View.MemoryView":101
+ * try:
+ *     if __import__("sys").version_info >= (3, 3):
+ *         __pyx_collections_abc_Sequence = __import__("collections.abc").abc.Sequence             # <<<<<<<<<<<<<<
+ *     else:
+ *         __pyx_collections_abc_Sequence = __import__("collections").Sequence
+ */
+  __pyx_tuple__13 = PyTuple_Pack(1, __pyx_kp_s_collections_abc); if (unlikely(!__pyx_tuple__13)) __PYX_ERR(1, 101, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_tuple__13);
+  __Pyx_GIVEREF(__pyx_tuple__13);
+
+  /* "View.MemoryView":103
+ *         __pyx_collections_abc_Sequence = __import__("collections.abc").abc.Sequence
+ *     else:
+ *         __pyx_collections_abc_Sequence = __import__("collections").Sequence             # <<<<<<<<<<<<<<
+ * except:
+ * 
+ */
+  __pyx_tuple__14 = PyTuple_Pack(1, __pyx_n_s_collections); if (unlikely(!__pyx_tuple__14)) __PYX_ERR(1, 103, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_tuple__14);
+  __Pyx_GIVEREF(__pyx_tuple__14);
+
+  /* "View.MemoryView":309
+ *         return self.name
+ * 
+ * cdef generic = Enum("<strided and direct or indirect>")             # <<<<<<<<<<<<<<
+ * cdef strided = Enum("<strided and direct>") # default
+ * cdef indirect = Enum("<strided and indirect>")
+ */
+  __pyx_tuple__15 = PyTuple_Pack(1, __pyx_kp_s_strided_and_direct_or_indirect); if (unlikely(!__pyx_tuple__15)) __PYX_ERR(1, 309, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_tuple__15);
+  __Pyx_GIVEREF(__pyx_tuple__15);
+
+  /* "View.MemoryView":310
+ * 
+ * cdef generic = Enum("<strided and direct or indirect>")
+ * cdef strided = Enum("<strided and direct>") # default             # <<<<<<<<<<<<<<
+ * cdef indirect = Enum("<strided and indirect>")
+ * 
+ */
+  __pyx_tuple__16 = PyTuple_Pack(1, __pyx_kp_s_strided_and_direct); if (unlikely(!__pyx_tuple__16)) __PYX_ERR(1, 310, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_tuple__16);
+  __Pyx_GIVEREF(__pyx_tuple__16);
+
+  /* "View.MemoryView":311
+ * cdef generic = Enum("<strided and direct or indirect>")
+ * cdef strided = Enum("<strided and direct>") # default
+ * cdef indirect = Enum("<strided and indirect>")             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  __pyx_tuple__17 = PyTuple_Pack(1, __pyx_kp_s_strided_and_indirect); if (unlikely(!__pyx_tuple__17)) __PYX_ERR(1, 311, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_tuple__17);
+  __Pyx_GIVEREF(__pyx_tuple__17);
+
+  /* "View.MemoryView":314
+ * 
+ * 
+ * cdef contiguous = Enum("<contiguous and direct>")             # <<<<<<<<<<<<<<
+ * cdef indirect_contiguous = Enum("<contiguous and indirect>")
+ * 
+ */
+  __pyx_tuple__18 = PyTuple_Pack(1, __pyx_kp_s_contiguous_and_direct); if (unlikely(!__pyx_tuple__18)) __PYX_ERR(1, 314, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_tuple__18);
+  __Pyx_GIVEREF(__pyx_tuple__18);
+
+  /* "View.MemoryView":315
+ * 
+ * cdef contiguous = Enum("<contiguous and direct>")
+ * cdef indirect_contiguous = Enum("<contiguous and indirect>")             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  __pyx_tuple__19 = PyTuple_Pack(1, __pyx_kp_s_contiguous_and_indirect); if (unlikely(!__pyx_tuple__19)) __PYX_ERR(1, 315, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_tuple__19);
+  __Pyx_GIVEREF(__pyx_tuple__19);
+
+  /* "(tree fragment)":1
+ * def __pyx_unpickle_Enum(__pyx_type, long __pyx_checksum, __pyx_state):             # <<<<<<<<<<<<<<
+ *     cdef object __pyx_PickleError
+ *     cdef object __pyx_result
+ */
+  __pyx_tuple__20 = PyTuple_Pack(5, __pyx_n_s_pyx_type, __pyx_n_s_pyx_checksum, __pyx_n_s_pyx_state, __pyx_n_s_pyx_PickleError, __pyx_n_s_pyx_result); if (unlikely(!__pyx_tuple__20)) __PYX_ERR(1, 1, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_tuple__20);
+  __Pyx_GIVEREF(__pyx_tuple__20);
+  __pyx_codeobj__21 = (PyObject*)__Pyx_PyCode_New(3, 0, 0, 5, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__20, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_stringsource, __pyx_n_s_pyx_unpickle_Enum, 1, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__21)) __PYX_ERR(1, 1, __pyx_L1_error)
+
+  /* "fairseq/data/data_utils_fast.pyx":27
+ * 
+ * @cython.cdivision(True)
+ * cpdef list batch_by_size_fast(             # <<<<<<<<<<<<<<
+ *     np.ndarray[DTYPE_t, ndim=1] indices,
+ *     num_tokens_fn,
+ */
+  __pyx_tuple__22 = PyTuple_Pack(5, __pyx_n_s_indices, __pyx_n_s_num_tokens_fn, __pyx_n_s_max_tokens, __pyx_n_s_max_sentences, __pyx_n_s_bsz_mult); if (unlikely(!__pyx_tuple__22)) __PYX_ERR(0, 27, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_tuple__22);
+  __Pyx_GIVEREF(__pyx_tuple__22);
+  __pyx_codeobj__23 = (PyObject*)__Pyx_PyCode_New(5, 0, 0, 5, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__22, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_fairseq_data_data_utils_fast_pyx, __pyx_n_s_batch_by_size_fast, 27, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__23)) __PYX_ERR(0, 27, __pyx_L1_error)
+
+  /* "fairseq/data/data_utils_fast.pyx":84
+ * 
+ * @cython.cdivision(True)
+ * cpdef list batch_fixed_shapes_fast(             # <<<<<<<<<<<<<<
+ *     np.ndarray[DTYPE_t, ndim=1] indices,
+ *     num_tokens_fn,
+ */
+  __pyx_tuple__24 = PyTuple_Pack(3, __pyx_n_s_indices, __pyx_n_s_num_tokens_fn, __pyx_n_s_fixed_shapes_sorted); if (unlikely(!__pyx_tuple__24)) __PYX_ERR(0, 84, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_tuple__24);
+  __Pyx_GIVEREF(__pyx_tuple__24);
+  __pyx_codeobj__25 = (PyObject*)__Pyx_PyCode_New(3, 0, 0, 3, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__24, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_fairseq_data_data_utils_fast_pyx, __pyx_n_s_batch_fixed_shapes_fast, 84, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__25)) __PYX_ERR(0, 84, __pyx_L1_error)
+  __Pyx_RefNannyFinishContext();
+  return 0;
+  __pyx_L1_error:;
+  __Pyx_RefNannyFinishContext();
+  return -1;
+}
+/* #### Code section: init_constants ###
+ *
+ * __Pyx_InitConstants: first calls __Pyx_CreateStringTabAndInitStrings(),
+ * then creates the integer objects __pyx_int_0, __pyx_int_1, __pyx_int_3,
+ * the three __pyx_int_<checksum> values and __pyx_int_neg_1 with
+ * PyInt_FromLong().
+ *
+ * Returns 0 on success; the first failing step goes through __PYX_ERR()
+ * to __pyx_L1_error and the function returns -1.
+ */
+
+static CYTHON_SMALL_CODE int __Pyx_InitConstants(void) {
+  if (__Pyx_CreateStringTabAndInitStrings() < 0) __PYX_ERR(0, 1, __pyx_L1_error);
+  __pyx_int_0 = PyInt_FromLong(0); if (unlikely(!__pyx_int_0)) __PYX_ERR(0, 1, __pyx_L1_error)
+  __pyx_int_1 = PyInt_FromLong(1); if (unlikely(!__pyx_int_1)) __PYX_ERR(0, 1, __pyx_L1_error)
+  __pyx_int_3 = PyInt_FromLong(3); if (unlikely(!__pyx_int_3)) __PYX_ERR(0, 1, __pyx_L1_error)
+  __pyx_int_112105877 = PyInt_FromLong(112105877L); if (unlikely(!__pyx_int_112105877)) __PYX_ERR(0, 1, __pyx_L1_error) /* 0x6ae9995 */
+  __pyx_int_136983863 = PyInt_FromLong(136983863L); if (unlikely(!__pyx_int_136983863)) __PYX_ERR(0, 1, __pyx_L1_error) /* 0x82a3537 */
+  __pyx_int_184977713 = PyInt_FromLong(184977713L); if (unlikely(!__pyx_int_184977713)) __PYX_ERR(0, 1, __pyx_L1_error) /* 0xb068931 */
+  __pyx_int_neg_1 = PyInt_FromLong(-1); if (unlikely(!__pyx_int_neg_1)) __PYX_ERR(0, 1, __pyx_L1_error)
+  return 0;
+  __pyx_L1_error:;
+  return -1;
+}
+/* #### Code section: init_globals ###
+ *
+ * __Pyx_InitGlobals: runs two initialisation steps, each followed by a
+ * PyErr_Occurred() check that goes through __PYX_ERR() to __pyx_L1_error:
+ *   1. __Pyx_init_assertions_enabled()
+ *   2. numpy's _import_array(), compiled in only when NPY_FEATURE_VERSION
+ *      is defined and NO_IMPORT_ARRAY is not.
+ *
+ * Returns 0 on success, -1 via the error label.
+ */
+
+static CYTHON_SMALL_CODE int __Pyx_InitGlobals(void) {
+  /* AssertionsEnabled.init */
+  if (likely(__Pyx_init_assertions_enabled() == 0)); else /* empty then-branch: the PyErr_Occurred() check below is the else body */
+
+if (unlikely(PyErr_Occurred())) __PYX_ERR(0, 1, __pyx_L1_error)
+
+  /* NumpyImportArray.init */
+  /*
+ * Cython has automatically inserted a call to _import_array since
+ * you didn't include one when you cimported numpy. To disable this
+ * add the line
+ *   <void>numpy._import_array
+ */
+#ifdef NPY_FEATURE_VERSION
+#ifndef NO_IMPORT_ARRAY
+if (unlikely(_import_array() == -1)) {
+    PyErr_SetString(PyExc_ImportError, "numpy.core.multiarray failed to import "
+    "(auto-generated because you didn't call 'numpy.import_array()' after cimporting numpy; "
+    "use '<void>numpy._import_array' to disable if you are certain you don't need it).");
+}
+#endif
+#endif
+
+if (unlikely(PyErr_Occurred())) __PYX_ERR(0, 1, __pyx_L1_error) /* unconditional: also catches the ImportError set just above */
+
+  return 0;
+  __pyx_L1_error:;
+  return -1;
+}
+/* #### Code section: init_module ### */
+
+static CYTHON_SMALL_CODE int __Pyx_modinit_global_init_code(void); /*proto*/
+static CYTHON_SMALL_CODE int __Pyx_modinit_variable_export_code(void); /*proto*/
+static CYTHON_SMALL_CODE int __Pyx_modinit_function_export_code(void); /*proto*/
+static CYTHON_SMALL_CODE int __Pyx_modinit_type_init_code(void); /*proto*/
+static CYTHON_SMALL_CODE int __Pyx_modinit_type_import_code(void); /*proto*/
+static CYTHON_SMALL_CODE int __Pyx_modinit_variable_import_code(void); /*proto*/
+static CYTHON_SMALL_CODE int __Pyx_modinit_function_import_code(void); /*proto*/
+
+static int __Pyx_modinit_global_init_code(void) {
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__Pyx_modinit_global_init_code", 0);
+  /*--- Global init code ---
+   * Sets each of the six module-level object variables below to Py_None
+   * and takes one reference to Py_None per assignment.  There is no
+   * failure path: the function always returns 0.
+   */
+  __pyx_collections_abc_Sequence = Py_None; Py_INCREF(Py_None);
+  generic = Py_None; Py_INCREF(Py_None);
+  strided = Py_None; Py_INCREF(Py_None);
+  indirect = Py_None; Py_INCREF(Py_None);
+  contiguous = Py_None; Py_INCREF(Py_None);
+  indirect_contiguous = Py_None; Py_INCREF(Py_None);
+  __Pyx_RefNannyFinishContext();
+  return 0;
+}
+
+static int __Pyx_modinit_variable_export_code(void) {
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__Pyx_modinit_variable_export_code", 0);
+  /*--- Variable export code ---
+   * Nothing is emitted here for this module: the function only opens and
+   * closes its refnanny context and returns 0.
+   */
+  __Pyx_RefNannyFinishContext();
+  return 0;
+}
+
+static int __Pyx_modinit_function_export_code(void) {
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__Pyx_modinit_function_export_code", 0);
+  /*--- Function export code ---
+   * Nothing is emitted here for this module: the function only opens and
+   * closes its refnanny context and returns 0.
+   */
+  __Pyx_RefNannyFinishContext();
+  return 0;
+}
+
+static int __Pyx_modinit_type_init_code(void) {
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("__Pyx_modinit_type_init_code", 0);
+  /*--- Type init code ---
+   * Initialises four extension types in this order: array, MemviewEnum,
+   * memoryview and memoryviewslice.
+   *
+   * For each type:
+   *   - with CYTHON_USE_TYPE_SPECS the type object is created from its
+   *     spec via __Pyx_PyType_FromModuleAndSpec() and then passed to
+   *     __Pyx_fix_up_extension_type_from_spec(); otherwise the pointer is
+   *     set to the static type struct and __Pyx_PyType_Ready() is called;
+   *   - array, memoryview and memoryviewslice get a vtable filled in and
+   *     attached with __Pyx_SetVtable() (MemviewEnum has none here);
+   *   - outside the Limited API, __Pyx_setup_reduce() is called.
+   *
+   * Returns 0 on success.  Any failure goes through __PYX_ERR() to
+   * __pyx_L1_error, which drops the temporary __pyx_t_1 and returns -1.
+   */
+  __pyx_vtabptr_array = &__pyx_vtable_array; /* ---- array type ---- */
+  __pyx_vtable_array.get_memview = (PyObject *(*)(struct __pyx_array_obj *))__pyx_array_get_memview;
+  #if CYTHON_USE_TYPE_SPECS
+  __pyx_array_type = (PyTypeObject *) __Pyx_PyType_FromModuleAndSpec(__pyx_m, &__pyx_type___pyx_array_spec, NULL); if (unlikely(!__pyx_array_type)) __PYX_ERR(1, 114, __pyx_L1_error)
+  #if !CYTHON_COMPILING_IN_LIMITED_API
+  __pyx_array_type->tp_as_buffer = &__pyx_tp_as_buffer_array;
+  if (!__pyx_array_type->tp_as_buffer->bf_releasebuffer && __pyx_array_type->tp_base->tp_as_buffer && __pyx_array_type->tp_base->tp_as_buffer->bf_releasebuffer) {
+    __pyx_array_type->tp_as_buffer->bf_releasebuffer = __pyx_array_type->tp_base->tp_as_buffer->bf_releasebuffer; /* fall back to the base type's bf_releasebuffer */
+  }
+  #elif defined(Py_bf_getbuffer) && defined(Py_bf_releasebuffer)
+  /* PY_VERSION_HEX >= 0x03090000 || Py_LIMITED_API >= 0x030B0000 */
+  #elif defined(_MSC_VER)
+  #pragma message ("The buffer protocol is not supported in the Limited C-API < 3.11.")
+  #else
+  #warning "The buffer protocol is not supported in the Limited C-API < 3.11."
+  #endif
+  if (__Pyx_fix_up_extension_type_from_spec(&__pyx_type___pyx_array_spec, __pyx_array_type) < 0) __PYX_ERR(1, 114, __pyx_L1_error)
+  #else
+  __pyx_array_type = &__pyx_type___pyx_array;
+  #endif
+  #if !CYTHON_COMPILING_IN_LIMITED_API
+  #endif
+  #if !CYTHON_USE_TYPE_SPECS
+  if (__Pyx_PyType_Ready(__pyx_array_type) < 0) __PYX_ERR(1, 114, __pyx_L1_error)
+  #endif
+  #if PY_MAJOR_VERSION < 3
+  __pyx_array_type->tp_print = 0;
+  #endif
+  if (__Pyx_SetVtable(__pyx_array_type, __pyx_vtabptr_array) < 0) __PYX_ERR(1, 114, __pyx_L1_error)
+  #if !CYTHON_COMPILING_IN_LIMITED_API
+  if (__Pyx_MergeVtables(__pyx_array_type) < 0) __PYX_ERR(1, 114, __pyx_L1_error)
+  #endif
+  #if !CYTHON_COMPILING_IN_LIMITED_API
+  if (__Pyx_setup_reduce((PyObject *) __pyx_array_type) < 0) __PYX_ERR(1, 114, __pyx_L1_error)
+  #endif
+  #if CYTHON_USE_TYPE_SPECS
+  __pyx_MemviewEnum_type = (PyTypeObject *) __Pyx_PyType_FromModuleAndSpec(__pyx_m, &__pyx_type___pyx_MemviewEnum_spec, NULL); if (unlikely(!__pyx_MemviewEnum_type)) __PYX_ERR(1, 302, __pyx_L1_error) /* ---- MemviewEnum type ---- */
+  if (__Pyx_fix_up_extension_type_from_spec(&__pyx_type___pyx_MemviewEnum_spec, __pyx_MemviewEnum_type) < 0) __PYX_ERR(1, 302, __pyx_L1_error)
+  #else
+  __pyx_MemviewEnum_type = &__pyx_type___pyx_MemviewEnum;
+  #endif
+  #if !CYTHON_COMPILING_IN_LIMITED_API
+  #endif
+  #if !CYTHON_USE_TYPE_SPECS
+  if (__Pyx_PyType_Ready(__pyx_MemviewEnum_type) < 0) __PYX_ERR(1, 302, __pyx_L1_error)
+  #endif
+  #if PY_MAJOR_VERSION < 3
+  __pyx_MemviewEnum_type->tp_print = 0;
+  #endif
+  #if !CYTHON_COMPILING_IN_LIMITED_API
+  if ((CYTHON_USE_TYPE_SLOTS && CYTHON_USE_PYTYPE_LOOKUP) && likely(!__pyx_MemviewEnum_type->tp_dictoffset && __pyx_MemviewEnum_type->tp_getattro == PyObject_GenericGetAttr)) {
+    __pyx_MemviewEnum_type->tp_getattro = __Pyx_PyObject_GenericGetAttr; /* swap in Cython's getattr only when the type has no dict offset and still uses the generic one */
+  }
+  #endif
+  #if !CYTHON_COMPILING_IN_LIMITED_API
+  if (__Pyx_setup_reduce((PyObject *) __pyx_MemviewEnum_type) < 0) __PYX_ERR(1, 302, __pyx_L1_error)
+  #endif
+  __pyx_vtabptr_memoryview = &__pyx_vtable_memoryview; /* ---- memoryview type ---- */
+  __pyx_vtable_memoryview.get_item_pointer = (char *(*)(struct __pyx_memoryview_obj *, PyObject *))__pyx_memoryview_get_item_pointer;
+  __pyx_vtable_memoryview.is_slice = (PyObject *(*)(struct __pyx_memoryview_obj *, PyObject *))__pyx_memoryview_is_slice;
+  __pyx_vtable_memoryview.setitem_slice_assignment = (PyObject *(*)(struct __pyx_memoryview_obj *, PyObject *, PyObject *))__pyx_memoryview_setitem_slice_assignment;
+  __pyx_vtable_memoryview.setitem_slice_assign_scalar = (PyObject *(*)(struct __pyx_memoryview_obj *, struct __pyx_memoryview_obj *, PyObject *))__pyx_memoryview_setitem_slice_assign_scalar;
+  __pyx_vtable_memoryview.setitem_indexed = (PyObject *(*)(struct __pyx_memoryview_obj *, PyObject *, PyObject *))__pyx_memoryview_setitem_indexed;
+  __pyx_vtable_memoryview.convert_item_to_object = (PyObject *(*)(struct __pyx_memoryview_obj *, char *))__pyx_memoryview_convert_item_to_object;
+  __pyx_vtable_memoryview.assign_item_from_object = (PyObject *(*)(struct __pyx_memoryview_obj *, char *, PyObject *))__pyx_memoryview_assign_item_from_object;
+  __pyx_vtable_memoryview._get_base = (PyObject *(*)(struct __pyx_memoryview_obj *))__pyx_memoryview__get_base;
+  #if CYTHON_USE_TYPE_SPECS
+  __pyx_memoryview_type = (PyTypeObject *) __Pyx_PyType_FromModuleAndSpec(__pyx_m, &__pyx_type___pyx_memoryview_spec, NULL); if (unlikely(!__pyx_memoryview_type)) __PYX_ERR(1, 337, __pyx_L1_error)
+  #if !CYTHON_COMPILING_IN_LIMITED_API
+  __pyx_memoryview_type->tp_as_buffer = &__pyx_tp_as_buffer_memoryview;
+  if (!__pyx_memoryview_type->tp_as_buffer->bf_releasebuffer && __pyx_memoryview_type->tp_base->tp_as_buffer && __pyx_memoryview_type->tp_base->tp_as_buffer->bf_releasebuffer) {
+    __pyx_memoryview_type->tp_as_buffer->bf_releasebuffer = __pyx_memoryview_type->tp_base->tp_as_buffer->bf_releasebuffer;
+  }
+  #elif defined(Py_bf_getbuffer) && defined(Py_bf_releasebuffer)
+  /* PY_VERSION_HEX >= 0x03090000 || Py_LIMITED_API >= 0x030B0000 */
+  #elif defined(_MSC_VER)
+  #pragma message ("The buffer protocol is not supported in the Limited C-API < 3.11.")
+  #else
+  #warning "The buffer protocol is not supported in the Limited C-API < 3.11."
+  #endif
+  if (__Pyx_fix_up_extension_type_from_spec(&__pyx_type___pyx_memoryview_spec, __pyx_memoryview_type) < 0) __PYX_ERR(1, 337, __pyx_L1_error)
+  #else
+  __pyx_memoryview_type = &__pyx_type___pyx_memoryview;
+  #endif
+  #if !CYTHON_COMPILING_IN_LIMITED_API
+  #endif
+  #if !CYTHON_USE_TYPE_SPECS
+  if (__Pyx_PyType_Ready(__pyx_memoryview_type) < 0) __PYX_ERR(1, 337, __pyx_L1_error)
+  #endif
+  #if PY_MAJOR_VERSION < 3
+  __pyx_memoryview_type->tp_print = 0;
+  #endif
+  #if !CYTHON_COMPILING_IN_LIMITED_API
+  if ((CYTHON_USE_TYPE_SLOTS && CYTHON_USE_PYTYPE_LOOKUP) && likely(!__pyx_memoryview_type->tp_dictoffset && __pyx_memoryview_type->tp_getattro == PyObject_GenericGetAttr)) {
+    __pyx_memoryview_type->tp_getattro = __Pyx_PyObject_GenericGetAttr;
+  }
+  #endif
+  if (__Pyx_SetVtable(__pyx_memoryview_type, __pyx_vtabptr_memoryview) < 0) __PYX_ERR(1, 337, __pyx_L1_error)
+  #if !CYTHON_COMPILING_IN_LIMITED_API
+  if (__Pyx_MergeVtables(__pyx_memoryview_type) < 0) __PYX_ERR(1, 337, __pyx_L1_error)
+  #endif
+  #if !CYTHON_COMPILING_IN_LIMITED_API
+  if (__Pyx_setup_reduce((PyObject *) __pyx_memoryview_type) < 0) __PYX_ERR(1, 337, __pyx_L1_error)
+  #endif
+  __pyx_vtabptr__memoryviewslice = &__pyx_vtable__memoryviewslice; /* ---- memoryviewslice type (base: memoryview) ---- */
+  __pyx_vtable__memoryviewslice.__pyx_base = *__pyx_vtabptr_memoryview; /* start from a copy of the memoryview vtable; three entries are overridden below */
+  __pyx_vtable__memoryviewslice.__pyx_base.convert_item_to_object = (PyObject *(*)(struct __pyx_memoryview_obj *, char *))__pyx_memoryviewslice_convert_item_to_object;
+  __pyx_vtable__memoryviewslice.__pyx_base.assign_item_from_object = (PyObject *(*)(struct __pyx_memoryview_obj *, char *, PyObject *))__pyx_memoryviewslice_assign_item_from_object;
+  __pyx_vtable__memoryviewslice.__pyx_base._get_base = (PyObject *(*)(struct __pyx_memoryview_obj *))__pyx_memoryviewslice__get_base;
+  #if CYTHON_USE_TYPE_SPECS
+  __pyx_t_1 = PyTuple_Pack(1, (PyObject *)__pyx_memoryview_type); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 952, __pyx_L1_error) /* one-element tuple holding the memoryview type, passed to the spec call below */
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_memoryviewslice_type = (PyTypeObject *) __Pyx_PyType_FromModuleAndSpec(__pyx_m, &__pyx_type___pyx_memoryviewslice_spec, __pyx_t_1);
+  __Pyx_XDECREF(__pyx_t_1); __pyx_t_1 = 0; /* tuple is released before the NULL check so it is not leaked on failure */
+  if (unlikely(!__pyx_memoryviewslice_type)) __PYX_ERR(1, 952, __pyx_L1_error)
+  if (__Pyx_fix_up_extension_type_from_spec(&__pyx_type___pyx_memoryviewslice_spec, __pyx_memoryviewslice_type) < 0) __PYX_ERR(1, 952, __pyx_L1_error)
+  #else
+  __pyx_memoryviewslice_type = &__pyx_type___pyx_memoryviewslice;
+  #endif
+  #if !CYTHON_COMPILING_IN_LIMITED_API
+  __pyx_memoryviewslice_type->tp_base = __pyx_memoryview_type;
+  #endif
+  #if !CYTHON_USE_TYPE_SPECS
+  if (__Pyx_PyType_Ready(__pyx_memoryviewslice_type) < 0) __PYX_ERR(1, 952, __pyx_L1_error)
+  #endif
+  #if PY_MAJOR_VERSION < 3
+  __pyx_memoryviewslice_type->tp_print = 0;
+  #endif
+  #if !CYTHON_COMPILING_IN_LIMITED_API
+  if ((CYTHON_USE_TYPE_SLOTS && CYTHON_USE_PYTYPE_LOOKUP) && likely(!__pyx_memoryviewslice_type->tp_dictoffset && __pyx_memoryviewslice_type->tp_getattro == PyObject_GenericGetAttr)) {
+    __pyx_memoryviewslice_type->tp_getattro = __Pyx_PyObject_GenericGetAttr;
+  }
+  #endif
+  if (__Pyx_SetVtable(__pyx_memoryviewslice_type, __pyx_vtabptr__memoryviewslice) < 0) __PYX_ERR(1, 952, __pyx_L1_error)
+  #if !CYTHON_COMPILING_IN_LIMITED_API
+  if (__Pyx_MergeVtables(__pyx_memoryviewslice_type) < 0) __PYX_ERR(1, 952, __pyx_L1_error)
+  #endif
+  #if !CYTHON_COMPILING_IN_LIMITED_API
+  if (__Pyx_setup_reduce((PyObject *) __pyx_memoryviewslice_type) < 0) __PYX_ERR(1, 952, __pyx_L1_error)
+  #endif
+  __Pyx_RefNannyFinishContext();
+  return 0;
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_RefNannyFinishContext();
+  return -1;
+}
+
+static int __Pyx_modinit_type_import_code(void) { /*
+   * Looks up the externally defined type objects this module refers to
+   * (the builtin "type" and a set of numpy types) through
+   * __Pyx_ImportType_3_0_12 and stores each result in its __pyx_ptype_*
+   * global. Takes no arguments.
+   * Returns 0 when every import and lookup succeeded, -1 after the first
+   * failure (control then reaches __pyx_L1_error below).
+   */
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL; /* holds the module currently being searched */
+  int __pyx_lineno = 0; /* position bookkeeping; presumably written by __PYX_ERR (defined elsewhere) */
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("__Pyx_modinit_type_import_code", 0);
+  /*--- Type import code ---*/
+  __pyx_t_1 = PyImport_ImportModule(__Pyx_BUILTIN_MODULE_NAME); if (unlikely(!__pyx_t_1)) __PYX_ERR(3, 9, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_ptype_7cpython_4type_type = __Pyx_ImportType_3_0_12(__pyx_t_1, __Pyx_BUILTIN_MODULE_NAME, "type", 
+  #if defined(PYPY_VERSION_NUM) && PYPY_VERSION_NUM < 0x050B0000 /* PyPy below 0x050B0000: pass the PyTypeObject size */
+  sizeof(PyTypeObject), __PYX_GET_STRUCT_ALIGNMENT_3_0_12(PyTypeObject),
+  #elif CYTHON_COMPILING_IN_LIMITED_API /* Limited API build: also the PyTypeObject size */
+  sizeof(PyTypeObject), __PYX_GET_STRUCT_ALIGNMENT_3_0_12(PyTypeObject),
+  #else /* every other build: pass the PyHeapTypeObject size */
+  sizeof(PyHeapTypeObject), __PYX_GET_STRUCT_ALIGNMENT_3_0_12(PyHeapTypeObject),
+  #endif
+  __Pyx_ImportType_CheckSize_Warn_3_0_12); if (!__pyx_ptype_7cpython_4type_type) __PYX_ERR(3, 9, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; /* done with the builtins module; slot reset so the error path sees no stale pointer */
+  __pyx_t_1 = PyImport_ImportModule("numpy"); if (unlikely(!__pyx_t_1)) __PYX_ERR(2, 272, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1); /* below: dtype/flatiter/broadcast/ndarray/ufunc pass their own struct size with the _Ignore mode; generic..character pass sizeof(PyObject) with the _Warn mode */
+  __pyx_ptype_5numpy_dtype = __Pyx_ImportType_3_0_12(__pyx_t_1, "numpy", "dtype", sizeof(PyArray_Descr), __PYX_GET_STRUCT_ALIGNMENT_3_0_12(PyArray_Descr),__Pyx_ImportType_CheckSize_Ignore_3_0_12); if (!__pyx_ptype_5numpy_dtype) __PYX_ERR(2, 272, __pyx_L1_error)
+  __pyx_ptype_5numpy_flatiter = __Pyx_ImportType_3_0_12(__pyx_t_1, "numpy", "flatiter", sizeof(PyArrayIterObject), __PYX_GET_STRUCT_ALIGNMENT_3_0_12(PyArrayIterObject),__Pyx_ImportType_CheckSize_Ignore_3_0_12); if (!__pyx_ptype_5numpy_flatiter) __PYX_ERR(2, 317, __pyx_L1_error)
+  __pyx_ptype_5numpy_broadcast = __Pyx_ImportType_3_0_12(__pyx_t_1, "numpy", "broadcast", sizeof(PyArrayMultiIterObject), __PYX_GET_STRUCT_ALIGNMENT_3_0_12(PyArrayMultiIterObject),__Pyx_ImportType_CheckSize_Ignore_3_0_12); if (!__pyx_ptype_5numpy_broadcast) __PYX_ERR(2, 321, __pyx_L1_error)
+  __pyx_ptype_5numpy_ndarray = __Pyx_ImportType_3_0_12(__pyx_t_1, "numpy", "ndarray", sizeof(PyArrayObject), __PYX_GET_STRUCT_ALIGNMENT_3_0_12(PyArrayObject),__Pyx_ImportType_CheckSize_Ignore_3_0_12); if (!__pyx_ptype_5numpy_ndarray) __PYX_ERR(2, 360, __pyx_L1_error)
+  __pyx_ptype_5numpy_generic = __Pyx_ImportType_3_0_12(__pyx_t_1, "numpy", "generic", sizeof(PyObject), __PYX_GET_STRUCT_ALIGNMENT_3_0_12(PyObject),__Pyx_ImportType_CheckSize_Warn_3_0_12); if (!__pyx_ptype_5numpy_generic) __PYX_ERR(2, 865, __pyx_L1_error)
+  __pyx_ptype_5numpy_number = __Pyx_ImportType_3_0_12(__pyx_t_1, "numpy", "number", sizeof(PyObject), __PYX_GET_STRUCT_ALIGNMENT_3_0_12(PyObject),__Pyx_ImportType_CheckSize_Warn_3_0_12); if (!__pyx_ptype_5numpy_number) __PYX_ERR(2, 867, __pyx_L1_error)
+  __pyx_ptype_5numpy_integer = __Pyx_ImportType_3_0_12(__pyx_t_1, "numpy", "integer", sizeof(PyObject), __PYX_GET_STRUCT_ALIGNMENT_3_0_12(PyObject),__Pyx_ImportType_CheckSize_Warn_3_0_12); if (!__pyx_ptype_5numpy_integer) __PYX_ERR(2, 869, __pyx_L1_error)
+  __pyx_ptype_5numpy_signedinteger = __Pyx_ImportType_3_0_12(__pyx_t_1, "numpy", "signedinteger", sizeof(PyObject), __PYX_GET_STRUCT_ALIGNMENT_3_0_12(PyObject),__Pyx_ImportType_CheckSize_Warn_3_0_12); if (!__pyx_ptype_5numpy_signedinteger) __PYX_ERR(2, 871, __pyx_L1_error)
+  __pyx_ptype_5numpy_unsignedinteger = __Pyx_ImportType_3_0_12(__pyx_t_1, "numpy", "unsignedinteger", sizeof(PyObject), __PYX_GET_STRUCT_ALIGNMENT_3_0_12(PyObject),__Pyx_ImportType_CheckSize_Warn_3_0_12); if (!__pyx_ptype_5numpy_unsignedinteger) __PYX_ERR(2, 873, __pyx_L1_error)
+  __pyx_ptype_5numpy_inexact = __Pyx_ImportType_3_0_12(__pyx_t_1, "numpy", "inexact", sizeof(PyObject), __PYX_GET_STRUCT_ALIGNMENT_3_0_12(PyObject),__Pyx_ImportType_CheckSize_Warn_3_0_12); if (!__pyx_ptype_5numpy_inexact) __PYX_ERR(2, 875, __pyx_L1_error)
+  __pyx_ptype_5numpy_floating = __Pyx_ImportType_3_0_12(__pyx_t_1, "numpy", "floating", sizeof(PyObject), __PYX_GET_STRUCT_ALIGNMENT_3_0_12(PyObject),__Pyx_ImportType_CheckSize_Warn_3_0_12); if (!__pyx_ptype_5numpy_floating) __PYX_ERR(2, 877, __pyx_L1_error)
+  __pyx_ptype_5numpy_complexfloating = __Pyx_ImportType_3_0_12(__pyx_t_1, "numpy", "complexfloating", sizeof(PyObject), __PYX_GET_STRUCT_ALIGNMENT_3_0_12(PyObject),__Pyx_ImportType_CheckSize_Warn_3_0_12); if (!__pyx_ptype_5numpy_complexfloating) __PYX_ERR(2, 879, __pyx_L1_error)
+  __pyx_ptype_5numpy_flexible = __Pyx_ImportType_3_0_12(__pyx_t_1, "numpy", "flexible", sizeof(PyObject), __PYX_GET_STRUCT_ALIGNMENT_3_0_12(PyObject),__Pyx_ImportType_CheckSize_Warn_3_0_12); if (!__pyx_ptype_5numpy_flexible) __PYX_ERR(2, 881, __pyx_L1_error)
+  __pyx_ptype_5numpy_character = __Pyx_ImportType_3_0_12(__pyx_t_1, "numpy", "character", sizeof(PyObject), __PYX_GET_STRUCT_ALIGNMENT_3_0_12(PyObject),__Pyx_ImportType_CheckSize_Warn_3_0_12); if (!__pyx_ptype_5numpy_character) __PYX_ERR(2, 883, __pyx_L1_error)
+  __pyx_ptype_5numpy_ufunc = __Pyx_ImportType_3_0_12(__pyx_t_1, "numpy", "ufunc", sizeof(PyUFuncObject), __PYX_GET_STRUCT_ALIGNMENT_3_0_12(PyUFuncObject),__Pyx_ImportType_CheckSize_Ignore_3_0_12); if (!__pyx_ptype_5numpy_ufunc) __PYX_ERR(2, 947, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; /* done with the numpy module */
+  __Pyx_RefNannyFinishContext();
+  return 0;
+  __pyx_L1_error:; /* failure path: release whatever module reference __pyx_t_1 still holds (X variant, presumably NULL-tolerant) */
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_RefNannyFinishContext();
+  return -1;
+}
+
+static int __Pyx_modinit_variable_import_code(void) { /*
+   * Module-init stage reserved for importing C variables. The section
+   * below is empty for this module, so the function only opens and closes
+   * its refnanny context. Always returns 0.
+   */
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__Pyx_modinit_variable_import_code", 0);
+  /*--- Variable import code ---*/
+  __Pyx_RefNannyFinishContext();
+  return 0;
+}
+
+static int __Pyx_modinit_function_import_code(void) { /*
+   * Module-init stage reserved for importing C functions. The section
+   * below is empty for this module, so the function only opens and closes
+   * its refnanny context. Always returns 0.
+   */
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__Pyx_modinit_function_import_code", 0);
+  /*--- Function import code ---*/
+  __Pyx_RefNannyFinishContext();
+  return 0;
+}
+
+
+#if PY_MAJOR_VERSION >= 3 /* module definition tables; compiled for Python 3 only */
+#if CYTHON_PEP489_MULTI_PHASE_INIT /* slot table exists only for multi-phase init builds */
+static PyObject* __pyx_pymod_create(PyObject *spec, PyModuleDef *def); /*proto*/
+static int __pyx_pymod_exec_data_utils_fast(PyObject* module); /*proto*/
+static PyModuleDef_Slot __pyx_moduledef_slots[] = {
+  {Py_mod_create, (void*)__pyx_pymod_create}, /* module creation hook */
+  {Py_mod_exec, (void*)__pyx_pymod_exec_data_utils_fast}, /* module execution hook */
+  {0, NULL} /* terminating entry */
+};
+#endif
+
+#ifdef __cplusplus /* C++: wrap the definition in an anonymous namespace instead of marking it static */
+namespace {
+  struct PyModuleDef __pyx_moduledef =
+  #else
+  static struct PyModuleDef __pyx_moduledef =
+  #endif
+  {
+      PyModuleDef_HEAD_INIT,
+      "data_utils_fast", /* m_name */
+      0, /* m_doc */
+    #if CYTHON_PEP489_MULTI_PHASE_INIT
+      0, /* m_size */
+    #elif CYTHON_USE_MODULE_STATE
+      sizeof(__pyx_mstate), /* m_size */
+    #else
+      -1, /* m_size */
+    #endif
+      __pyx_methods /* m_methods */,
+    #if CYTHON_PEP489_MULTI_PHASE_INIT /* this position carries the slot table in multi-phase builds, NULL otherwise */
+      __pyx_moduledef_slots, /* m_slots */
+    #else
+      NULL, /* m_reload */
+    #endif
+    #if CYTHON_USE_MODULE_STATE /* traverse/clear callbacks are supplied only when module state is in use */
+      __pyx_m_traverse, /* m_traverse */
+      __pyx_m_clear, /* m_clear */
+      NULL /* m_free */
+    #else
+      NULL, /* m_traverse */
+      NULL, /* m_clear */
+      NULL /* m_free */
+    #endif
+  };
+  #ifdef __cplusplus
+} /* anonymous namespace */
+#endif
+#endif
+
+#ifndef CYTHON_NO_PYINIT_EXPORT /* default: reuse PyMODINIT_FUNC unchanged as the init-function declarator */
+#define __Pyx_PyMODINIT_FUNC PyMODINIT_FUNC
+#elif PY_MAJOR_VERSION < 3 /* CYTHON_NO_PYINIT_EXPORT defined, Python 2: init function declared as returning void */
+#ifdef __cplusplus /* C++ builds add extern "C" */
+#define __Pyx_PyMODINIT_FUNC extern "C" void
+#else
+#define __Pyx_PyMODINIT_FUNC void
+#endif
+#else /* CYTHON_NO_PYINIT_EXPORT defined, Python 3: init function declared as returning PyObject * */
+#ifdef __cplusplus /* C++ builds add extern "C" */
+#define __Pyx_PyMODINIT_FUNC extern "C" PyObject *
+#else
+#define __Pyx_PyMODINIT_FUNC PyObject *
+#endif
+#endif
+
+
+#if PY_MAJOR_VERSION < 3
+__Pyx_PyMODINIT_FUNC initdata_utils_fast(void) CYTHON_SMALL_CODE; /*proto*/
+__Pyx_PyMODINIT_FUNC initdata_utils_fast(void)
+#else
+__Pyx_PyMODINIT_FUNC PyInit_data_utils_fast(void) CYTHON_SMALL_CODE; /*proto*/
+__Pyx_PyMODINIT_FUNC PyInit_data_utils_fast(void)
+#if CYTHON_PEP489_MULTI_PHASE_INIT
+{
+  return PyModuleDef_Init(&__pyx_moduledef);
+}
+static CYTHON_SMALL_CODE int __Pyx_check_single_interpreter(void) {
+    /* Remembers, in a function-local static, the interpreter observed on the
+     * first call and rejects calls made from any other interpreter.
+     * Returns 0 when the call is accepted. Returns -1 when the interpreter
+     * differs from the remembered one (an ImportError is set here) or, on
+     * the ID-based path, when the very first ID lookup yields -1. */
+    #if PY_VERSION_HEX >= 0x030700A1
+    static PY_INT64_T first_seen_id = -1;
+    PY_INT64_T this_id = PyInterpreterState_GetID(PyThreadState_Get()->interp);
+    if (first_seen_id == -1) {
+        /* Nothing recorded yet: store the ID; a value of -1 is reported as failure. */
+        first_seen_id = this_id;
+        if (unlikely(this_id == -1)) return -1;
+        return 0;
+    }
+    if (likely(first_seen_id == this_id)) return 0;
+    #else
+    static PyInterpreterState *first_seen = NULL;
+    PyInterpreterState *this_interp = PyThreadState_Get()->interp;
+    if (!first_seen) {
+        /* Nothing recorded yet: store the interpreter pointer and accept. */
+        first_seen = this_interp;
+        return 0;
+    }
+    if (likely(first_seen == this_interp)) return 0;
+    #endif
+    /* Only reached when the current interpreter is not the remembered one. */
+    PyErr_SetString(
+        PyExc_ImportError,
+        "Interpreter change detected - this module can only be loaded into one interpreter per process.");
+    return -1;
+}
+/* Copy one attribute of an import spec onto the module being created.
+ *
+ *   spec       - object on which the attribute `from_name` is looked up.
+ *   module / moddict
+ *              - destination: the module object itself when compiling
+ *                against the Limited API, otherwise the dict written to
+ *                with PyDict_SetItemString().
+ *   from_name  - attribute name read from `spec`.
+ *   to_name    - name the value is stored under in the destination.
+ *   allow_none - when zero, a value of None is skipped instead of stored.
+ *
+ * Returns the result of the store (0 on success, negative on failure),
+ * 0 when the attribute is missing (the AttributeError is cleared) or the
+ * value was skipped, and -1 when the lookup failed with another exception.
+ */
+#if CYTHON_COMPILING_IN_LIMITED_API
+static CYTHON_SMALL_CODE int __Pyx_copy_spec_to_module(PyObject *spec, PyObject *module, const char* from_name, const char* to_name, int allow_none)
+#else
+static CYTHON_SMALL_CODE int __Pyx_copy_spec_to_module(PyObject *spec, PyObject *moddict, const char* from_name, const char* to_name, int allow_none)
+#endif
+{
+    PyObject *value = PyObject_GetAttrString(spec, from_name);
+    int result = 0;
+    if (likely(value)) {
+        if (allow_none || value != Py_None) {
+#if CYTHON_COMPILING_IN_LIMITED_API
+            /* PyModule_AddObject() steals a reference, but only on success.
+             * Hand it a reference of its own so that the unconditional
+             * Py_DECREF() below releases exactly the one obtained from
+             * PyObject_GetAttrString() on both the success and failure paths. */
+            Py_INCREF(value);
+            result = PyModule_AddObject(module, to_name, value);
+            if (unlikely(result < 0)) Py_DECREF(value);
+#else
+            result = PyDict_SetItemString(moddict, to_name, value);
+#endif
+        }
+        Py_DECREF(value);
+    } else if (PyErr_ExceptionMatches(PyExc_AttributeError)) {
+        /* A spec without this attribute is not an error: nothing is copied. */
+        PyErr_Clear();
+    } else {
+        result = -1;
+    }
+    return result;
+}
+static CYTHON_SMALL_CODE PyObject* __pyx_pymod_create(PyObject *spec, PyModuleDef *def) {
+    /* Create-slot callback: builds the module object for `spec`.
+     * `def` is unused. Returns a new reference to the already existing
+     * module when __pyx_m is set, otherwise a fresh module named after
+     * spec.name with selected spec attributes copied onto it.
+     * Returns NULL on any failure. */
+    static const struct {
+        const char *spec_attr;   /* attribute read from the spec */
+        const char *module_attr; /* name it is stored under on the module */
+        int allow_none;          /* whether a None value is copied as well */
+    } copied_attrs[] = {
+        {"loader", "__loader__", 1},
+        {"origin", "__file__", 1},
+        {"parent", "__package__", 1},
+        {"submodule_search_locations", "__path__", 0}
+    };
+    PyObject *new_module, *target, *name_obj;
+    size_t idx;
+    CYTHON_UNUSED_VAR(def);
+    if (__Pyx_check_single_interpreter())
+        return NULL;
+    if (__pyx_m)
+        return __Pyx_NewRef(__pyx_m);
+    name_obj = PyObject_GetAttrString(spec, "name");
+    if (unlikely(!name_obj))
+        return NULL;
+    new_module = PyModule_NewObject(name_obj);
+    Py_DECREF(name_obj);
+    if (unlikely(!new_module))
+        return NULL;
+#if CYTHON_COMPILING_IN_LIMITED_API
+    /* Limited API: attributes are added to the module object directly. */
+    target = new_module;
+#else
+    target = PyModule_GetDict(new_module);
+    if (unlikely(!target)) {
+        Py_DECREF(new_module);
+        return NULL;
+    }
+#endif
+    /* Copy the attributes in table order, stopping at the first failure. */
+    for (idx = 0; idx < sizeof(copied_attrs) / sizeof(copied_attrs[0]); idx++) {
+        if (unlikely(__Pyx_copy_spec_to_module(spec, target, copied_attrs[idx].spec_attr, copied_attrs[idx].module_attr, copied_attrs[idx].allow_none) < 0)) {
+            Py_DECREF(new_module);
+            return NULL;
+        }
+    }
+    return new_module;
+}
+
+
+static CYTHON_SMALL_CODE int __pyx_pymod_exec_data_utils_fast(PyObject *__pyx_pyinit_module)
+#endif
+#endif
+{
+  int stringtab_initialized = 0;
+  #if CYTHON_USE_MODULE_STATE
+  int pystate_addmodule_run = 0;
+  #endif
+  PyObject *__pyx_t_1 = NULL;
+  PyObject *__pyx_t_2 = NULL;
+  PyObject *__pyx_t_3 = NULL;
+  PyObject *__pyx_t_4 = NULL;
+  PyObject *__pyx_t_5 = NULL;
+  int __pyx_t_6;
+  PyObject *__pyx_t_7 = NULL;
+  static PyThread_type_lock __pyx_t_8[8];
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannyDeclarations
+  #if CYTHON_PEP489_MULTI_PHASE_INIT
+  if (__pyx_m) {
+    if (__pyx_m == __pyx_pyinit_module) return 0;
+    PyErr_SetString(PyExc_RuntimeError, "Module 'data_utils_fast' has already been imported. Re-initialisation is not supported.");
+    return -1;
+  }
+  #elif PY_MAJOR_VERSION >= 3
+  if (__pyx_m) return __Pyx_NewRef(__pyx_m);
+  #endif
+  /*--- Module creation code ---*/
+  #if CYTHON_PEP489_MULTI_PHASE_INIT
+  __pyx_m = __pyx_pyinit_module;
+  Py_INCREF(__pyx_m);
+  #else
+  #if PY_MAJOR_VERSION < 3
+  __pyx_m = Py_InitModule4("data_utils_fast", __pyx_methods, 0, 0, PYTHON_API_VERSION); Py_XINCREF(__pyx_m);
+  if (unlikely(!__pyx_m)) __PYX_ERR(0, 1, __pyx_L1_error)
+  #elif CYTHON_USE_MODULE_STATE
+  __pyx_t_1 = PyModule_Create(&__pyx_moduledef); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 1, __pyx_L1_error)
+  {
+    int add_module_result = PyState_AddModule(__pyx_t_1, &__pyx_moduledef);
+    __pyx_t_1 = 0; /* transfer ownership from __pyx_t_1 to "data_utils_fast" pseudovariable */
+    if (unlikely((add_module_result < 0))) __PYX_ERR(0, 1, __pyx_L1_error)
+    pystate_addmodule_run = 1;
+  }
+  #else
+  __pyx_m = PyModule_Create(&__pyx_moduledef);
+  if (unlikely(!__pyx_m)) __PYX_ERR(0, 1, __pyx_L1_error)
+  #endif
+  #endif
+  CYTHON_UNUSED_VAR(__pyx_t_1);
+  __pyx_d = PyModule_GetDict(__pyx_m); if (unlikely(!__pyx_d)) __PYX_ERR(0, 1, __pyx_L1_error)
+  Py_INCREF(__pyx_d);
+  __pyx_b = __Pyx_PyImport_AddModuleRef(__Pyx_BUILTIN_MODULE_NAME); if (unlikely(!__pyx_b)) __PYX_ERR(0, 1, __pyx_L1_error)
+  __pyx_cython_runtime = __Pyx_PyImport_AddModuleRef((const char *) "cython_runtime"); if (unlikely(!__pyx_cython_runtime)) __PYX_ERR(0, 1, __pyx_L1_error)
+  if (PyObject_SetAttrString(__pyx_m, "__builtins__", __pyx_b) < 0) __PYX_ERR(0, 1, __pyx_L1_error)
+  #if CYTHON_REFNANNY
+__Pyx_RefNanny = __Pyx_RefNannyImportAPI("refnanny");
+if (!__Pyx_RefNanny) {
+  PyErr_Clear();
+  __Pyx_RefNanny = __Pyx_RefNannyImportAPI("Cython.Runtime.refnanny");
+  if (!__Pyx_RefNanny)
+      Py_FatalError("failed to import 'refnanny' module");
+}
+#endif
+  __Pyx_RefNannySetupContext("__Pyx_PyMODINIT_FUNC PyInit_data_utils_fast(void)", 0);
+  if (__Pyx_check_binary_version(__PYX_LIMITED_VERSION_HEX, __Pyx_get_runtime_version(), CYTHON_COMPILING_IN_LIMITED_API) < 0) __PYX_ERR(0, 1, __pyx_L1_error)
+  #ifdef __Pxy_PyFrame_Initialize_Offsets
+  __Pxy_PyFrame_Initialize_Offsets();
+  #endif
+  __pyx_empty_tuple = PyTuple_New(0); if (unlikely(!__pyx_empty_tuple)) __PYX_ERR(0, 1, __pyx_L1_error)
+  __pyx_empty_bytes = PyBytes_FromStringAndSize("", 0); if (unlikely(!__pyx_empty_bytes)) __PYX_ERR(0, 1, __pyx_L1_error)
+  __pyx_empty_unicode = PyUnicode_FromStringAndSize("", 0); if (unlikely(!__pyx_empty_unicode)) __PYX_ERR(0, 1, __pyx_L1_error)
+  #ifdef __Pyx_CyFunction_USED
+  if (__pyx_CyFunction_init(__pyx_m) < 0) __PYX_ERR(0, 1, __pyx_L1_error)
+  #endif
+  #ifdef __Pyx_FusedFunction_USED
+  if (__pyx_FusedFunction_init(__pyx_m) < 0) __PYX_ERR(0, 1, __pyx_L1_error)
+  #endif
+  #ifdef __Pyx_Coroutine_USED
+  if (__pyx_Coroutine_init(__pyx_m) < 0) __PYX_ERR(0, 1, __pyx_L1_error)
+  #endif
+  #ifdef __Pyx_Generator_USED
+  if (__pyx_Generator_init(__pyx_m) < 0) __PYX_ERR(0, 1, __pyx_L1_error)
+  #endif
+  #ifdef __Pyx_AsyncGen_USED
+  if (__pyx_AsyncGen_init(__pyx_m) < 0) __PYX_ERR(0, 1, __pyx_L1_error)
+  #endif
+  #ifdef __Pyx_StopAsyncIteration_USED
+  if (__pyx_StopAsyncIteration_init(__pyx_m) < 0) __PYX_ERR(0, 1, __pyx_L1_error)
+  #endif
+  /*--- Library function declarations ---*/
+  /*--- Threads initialization code ---*/
+  #if defined(WITH_THREAD) && PY_VERSION_HEX < 0x030700F0 && defined(__PYX_FORCE_INIT_THREADS) && __PYX_FORCE_INIT_THREADS
+  PyEval_InitThreads();
+  #endif
+  /*--- Initialize various global constants etc. ---*/
+  if (__Pyx_InitConstants() < 0) __PYX_ERR(0, 1, __pyx_L1_error)
+  stringtab_initialized = 1;
+  if (__Pyx_InitGlobals() < 0) __PYX_ERR(0, 1, __pyx_L1_error)
+  #if PY_MAJOR_VERSION < 3 && (__PYX_DEFAULT_STRING_ENCODING_IS_ASCII || __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT)
+  if (__Pyx_init_sys_getdefaultencoding_params() < 0) __PYX_ERR(0, 1, __pyx_L1_error)
+  #endif
+  if (__pyx_module_is_main_fairseq__data__data_utils_fast) {
+    if (PyObject_SetAttr(__pyx_m, __pyx_n_s_name_2, __pyx_n_s_main) < 0) __PYX_ERR(0, 1, __pyx_L1_error)
+  }
+  #if PY_MAJOR_VERSION >= 3
+  {
+    PyObject *modules = PyImport_GetModuleDict(); if (unlikely(!modules)) __PYX_ERR(0, 1, __pyx_L1_error)
+    if (!PyDict_GetItemString(modules, "fairseq.data.data_utils_fast")) {
+      if (unlikely((PyDict_SetItemString(modules, "fairseq.data.data_utils_fast", __pyx_m) < 0))) __PYX_ERR(0, 1, __pyx_L1_error)
+    }
+  }
+  #endif
+  /*--- Builtin init code ---*/
+  if (__Pyx_InitCachedBuiltins() < 0) __PYX_ERR(0, 1, __pyx_L1_error)
+  /*--- Constants init code ---*/
+  if (__Pyx_InitCachedConstants() < 0) __PYX_ERR(0, 1, __pyx_L1_error)
+  /*--- Global type/function init code ---*/
+  (void)__Pyx_modinit_global_init_code();
+  (void)__Pyx_modinit_variable_export_code();
+  (void)__Pyx_modinit_function_export_code();
+  if (unlikely((__Pyx_modinit_type_init_code() < 0))) __PYX_ERR(0, 1, __pyx_L1_error)
+  if (unlikely((__Pyx_modinit_type_import_code() < 0))) __PYX_ERR(0, 1, __pyx_L1_error)
+  (void)__Pyx_modinit_variable_import_code();
+  (void)__Pyx_modinit_function_import_code();
+  /*--- Execution code ---*/
+  #if defined(__Pyx_Generator_USED) || defined(__Pyx_Coroutine_USED)
+  if (__Pyx_patch_abc() < 0) __PYX_ERR(0, 1, __pyx_L1_error)
+  #endif
+
+  /* "View.MemoryView":99
+ * 
+ * cdef object __pyx_collections_abc_Sequence "__pyx_collections_abc_Sequence"
+ * try:             # <<<<<<<<<<<<<<
+ *     if __import__("sys").version_info >= (3, 3):
+ *         __pyx_collections_abc_Sequence = __import__("collections.abc").abc.Sequence
+ */
+  {
+    __Pyx_PyThreadState_declare
+    __Pyx_PyThreadState_assign
+    __Pyx_ExceptionSave(&__pyx_t_1, &__pyx_t_2, &__pyx_t_3);
+    __Pyx_XGOTREF(__pyx_t_1);
+    __Pyx_XGOTREF(__pyx_t_2);
+    __Pyx_XGOTREF(__pyx_t_3);
+    /*try:*/ {
+
+      /* "View.MemoryView":100
+ * cdef object __pyx_collections_abc_Sequence "__pyx_collections_abc_Sequence"
+ * try:
+ *     if __import__("sys").version_info >= (3, 3):             # <<<<<<<<<<<<<<
+ *         __pyx_collections_abc_Sequence = __import__("collections.abc").abc.Sequence
+ *     else:
+ */
+      __pyx_t_4 = __Pyx_PyObject_Call(__pyx_builtin___import__, __pyx_tuple__11, NULL); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 100, __pyx_L2_error)
+      __Pyx_GOTREF(__pyx_t_4);
+      __pyx_t_5 = __Pyx_PyObject_GetAttrStr(__pyx_t_4, __pyx_n_s_version_info); if (unlikely(!__pyx_t_5)) __PYX_ERR(1, 100, __pyx_L2_error)
+      __Pyx_GOTREF(__pyx_t_5);
+      __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+      __pyx_t_4 = PyObject_RichCompare(__pyx_t_5, __pyx_tuple__12, Py_GE); __Pyx_XGOTREF(__pyx_t_4); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 100, __pyx_L2_error)
+      __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
+      __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_4); if (unlikely((__pyx_t_6 < 0))) __PYX_ERR(1, 100, __pyx_L2_error)
+      __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+      if (__pyx_t_6) {
+
+        /* "View.MemoryView":101
+ * try:
+ *     if __import__("sys").version_info >= (3, 3):
+ *         __pyx_collections_abc_Sequence = __import__("collections.abc").abc.Sequence             # <<<<<<<<<<<<<<
+ *     else:
+ *         __pyx_collections_abc_Sequence = __import__("collections").Sequence
+ */
+        __pyx_t_4 = __Pyx_PyObject_Call(__pyx_builtin___import__, __pyx_tuple__13, NULL); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 101, __pyx_L2_error)
+        __Pyx_GOTREF(__pyx_t_4);
+        __pyx_t_5 = __Pyx_PyObject_GetAttrStr(__pyx_t_4, __pyx_n_s_abc); if (unlikely(!__pyx_t_5)) __PYX_ERR(1, 101, __pyx_L2_error)
+        __Pyx_GOTREF(__pyx_t_5);
+        __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+        __pyx_t_4 = __Pyx_PyObject_GetAttrStr(__pyx_t_5, __pyx_n_s_Sequence); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 101, __pyx_L2_error)
+        __Pyx_GOTREF(__pyx_t_4);
+        __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
+        __Pyx_XGOTREF(__pyx_collections_abc_Sequence);
+        __Pyx_DECREF_SET(__pyx_collections_abc_Sequence, __pyx_t_4);
+        __Pyx_GIVEREF(__pyx_t_4);
+        __pyx_t_4 = 0;
+
+        /* "View.MemoryView":100
+ * cdef object __pyx_collections_abc_Sequence "__pyx_collections_abc_Sequence"
+ * try:
+ *     if __import__("sys").version_info >= (3, 3):             # <<<<<<<<<<<<<<
+ *         __pyx_collections_abc_Sequence = __import__("collections.abc").abc.Sequence
+ *     else:
+ */
+        goto __pyx_L8;
+      }
+
+      /* "View.MemoryView":103
+ *         __pyx_collections_abc_Sequence = __import__("collections.abc").abc.Sequence
+ *     else:
+ *         __pyx_collections_abc_Sequence = __import__("collections").Sequence             # <<<<<<<<<<<<<<
+ * except:
+ * 
+ */
+      /*else*/ {
+        __pyx_t_4 = __Pyx_PyObject_Call(__pyx_builtin___import__, __pyx_tuple__14, NULL); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 103, __pyx_L2_error)
+        __Pyx_GOTREF(__pyx_t_4);
+        __pyx_t_5 = __Pyx_PyObject_GetAttrStr(__pyx_t_4, __pyx_n_s_Sequence); if (unlikely(!__pyx_t_5)) __PYX_ERR(1, 103, __pyx_L2_error)
+        __Pyx_GOTREF(__pyx_t_5);
+        __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+        __Pyx_XGOTREF(__pyx_collections_abc_Sequence);
+        __Pyx_DECREF_SET(__pyx_collections_abc_Sequence, __pyx_t_5);
+        __Pyx_GIVEREF(__pyx_t_5);
+        __pyx_t_5 = 0;
+      }
+      __pyx_L8:;
+
+      /* "View.MemoryView":99
+ * 
+ * cdef object __pyx_collections_abc_Sequence "__pyx_collections_abc_Sequence"
+ * try:             # <<<<<<<<<<<<<<
+ *     if __import__("sys").version_info >= (3, 3):
+ *         __pyx_collections_abc_Sequence = __import__("collections.abc").abc.Sequence
+ */
+    }
+    __Pyx_XDECREF(__pyx_t_1); __pyx_t_1 = 0;
+    __Pyx_XDECREF(__pyx_t_2); __pyx_t_2 = 0;
+    __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0;
+    goto __pyx_L7_try_end;
+    __pyx_L2_error:;
+    __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0;
+    __Pyx_XDECREF(__pyx_t_5); __pyx_t_5 = 0;
+
+    /* "View.MemoryView":104
+ *     else:
+ *         __pyx_collections_abc_Sequence = __import__("collections").Sequence
+ * except:             # <<<<<<<<<<<<<<
+ * 
+ *     __pyx_collections_abc_Sequence = None
+ */
+    /*except:*/ {
+      __Pyx_AddTraceback("View.MemoryView", __pyx_clineno, __pyx_lineno, __pyx_filename);
+      if (__Pyx_GetException(&__pyx_t_5, &__pyx_t_4, &__pyx_t_7) < 0) __PYX_ERR(1, 104, __pyx_L4_except_error)
+      __Pyx_XGOTREF(__pyx_t_5);
+      __Pyx_XGOTREF(__pyx_t_4);
+      __Pyx_XGOTREF(__pyx_t_7);
+
+      /* "View.MemoryView":106
+ * except:
+ * 
+ *     __pyx_collections_abc_Sequence = None             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+      __Pyx_INCREF(Py_None);
+      __Pyx_XGOTREF(__pyx_collections_abc_Sequence);
+      __Pyx_DECREF_SET(__pyx_collections_abc_Sequence, Py_None);
+      __Pyx_GIVEREF(Py_None);
+      __Pyx_XDECREF(__pyx_t_5); __pyx_t_5 = 0;
+      __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0;
+      __Pyx_XDECREF(__pyx_t_7); __pyx_t_7 = 0;
+      goto __pyx_L3_exception_handled;
+    }
+
+    /* "View.MemoryView":99
+ * 
+ * cdef object __pyx_collections_abc_Sequence "__pyx_collections_abc_Sequence"
+ * try:             # <<<<<<<<<<<<<<
+ *     if __import__("sys").version_info >= (3, 3):
+ *         __pyx_collections_abc_Sequence = __import__("collections.abc").abc.Sequence
+ */
+    __pyx_L4_except_error:;
+    __Pyx_XGIVEREF(__pyx_t_1);
+    __Pyx_XGIVEREF(__pyx_t_2);
+    __Pyx_XGIVEREF(__pyx_t_3);
+    __Pyx_ExceptionReset(__pyx_t_1, __pyx_t_2, __pyx_t_3);
+    goto __pyx_L1_error;
+    __pyx_L3_exception_handled:;
+    __Pyx_XGIVEREF(__pyx_t_1);
+    __Pyx_XGIVEREF(__pyx_t_2);
+    __Pyx_XGIVEREF(__pyx_t_3);
+    __Pyx_ExceptionReset(__pyx_t_1, __pyx_t_2, __pyx_t_3);
+    __pyx_L7_try_end:;
+  }
+
+  /* "View.MemoryView":241
+ * 
+ * 
+ *     try:             # <<<<<<<<<<<<<<
+ *         count = __pyx_collections_abc_Sequence.count
+ *         index = __pyx_collections_abc_Sequence.index
+ */
+  {
+    __Pyx_PyThreadState_declare
+    __Pyx_PyThreadState_assign
+    __Pyx_ExceptionSave(&__pyx_t_3, &__pyx_t_2, &__pyx_t_1);
+    __Pyx_XGOTREF(__pyx_t_3);
+    __Pyx_XGOTREF(__pyx_t_2);
+    __Pyx_XGOTREF(__pyx_t_1);
+    /*try:*/ {
+
+      /* "View.MemoryView":242
+ * 
+ *     try:
+ *         count = __pyx_collections_abc_Sequence.count             # <<<<<<<<<<<<<<
+ *         index = __pyx_collections_abc_Sequence.index
+ *     except:
+ */
+      __pyx_t_7 = __Pyx_PyObject_GetAttrStr(__pyx_collections_abc_Sequence, __pyx_n_s_count); if (unlikely(!__pyx_t_7)) __PYX_ERR(1, 242, __pyx_L11_error)
+      __Pyx_GOTREF(__pyx_t_7);
+      if (__Pyx_SetItemOnTypeDict(__pyx_array_type, __pyx_n_s_count, __pyx_t_7) < 0) __PYX_ERR(1, 242, __pyx_L11_error)
+      __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0;
+      PyType_Modified(__pyx_array_type);
+
+      /* "View.MemoryView":243
+ *     try:
+ *         count = __pyx_collections_abc_Sequence.count
+ *         index = __pyx_collections_abc_Sequence.index             # <<<<<<<<<<<<<<
+ *     except:
+ *         pass
+ */
+      __pyx_t_7 = __Pyx_PyObject_GetAttrStr(__pyx_collections_abc_Sequence, __pyx_n_s_index); if (unlikely(!__pyx_t_7)) __PYX_ERR(1, 243, __pyx_L11_error)
+      __Pyx_GOTREF(__pyx_t_7);
+      if (__Pyx_SetItemOnTypeDict(__pyx_array_type, __pyx_n_s_index, __pyx_t_7) < 0) __PYX_ERR(1, 243, __pyx_L11_error)
+      __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0;
+      PyType_Modified(__pyx_array_type);
+
+      /* "View.MemoryView":241
+ * 
+ * 
+ *     try:             # <<<<<<<<<<<<<<
+ *         count = __pyx_collections_abc_Sequence.count
+ *         index = __pyx_collections_abc_Sequence.index
+ */
+    }
+    __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0;
+    __Pyx_XDECREF(__pyx_t_2); __pyx_t_2 = 0;
+    __Pyx_XDECREF(__pyx_t_1); __pyx_t_1 = 0;
+    goto __pyx_L16_try_end;
+    __pyx_L11_error:;
+    __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0;
+    __Pyx_XDECREF(__pyx_t_5); __pyx_t_5 = 0;
+    __Pyx_XDECREF(__pyx_t_7); __pyx_t_7 = 0;
+
+    /* "View.MemoryView":244
+ *         count = __pyx_collections_abc_Sequence.count
+ *         index = __pyx_collections_abc_Sequence.index
+ *     except:             # <<<<<<<<<<<<<<
+ *         pass
+ * 
+ */
+    /*except:*/ {
+      __Pyx_ErrRestore(0,0,0);
+      goto __pyx_L12_exception_handled;
+    }
+    __pyx_L12_exception_handled:;
+    __Pyx_XGIVEREF(__pyx_t_3);
+    __Pyx_XGIVEREF(__pyx_t_2);
+    __Pyx_XGIVEREF(__pyx_t_1);
+    __Pyx_ExceptionReset(__pyx_t_3, __pyx_t_2, __pyx_t_1);
+    __pyx_L16_try_end:;
+  }
+
+  /* "View.MemoryView":309
+ *         return self.name
+ * 
+ * cdef generic = Enum("<strided and direct or indirect>")             # <<<<<<<<<<<<<<
+ * cdef strided = Enum("<strided and direct>") # default
+ * cdef indirect = Enum("<strided and indirect>")
+ */
+  __pyx_t_7 = __Pyx_PyObject_Call(((PyObject *)__pyx_MemviewEnum_type), __pyx_tuple__15, NULL); if (unlikely(!__pyx_t_7)) __PYX_ERR(1, 309, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_7);
+  __Pyx_XGOTREF(generic);
+  __Pyx_DECREF_SET(generic, __pyx_t_7);
+  __Pyx_GIVEREF(__pyx_t_7);
+  __pyx_t_7 = 0;
+
+  /* "View.MemoryView":310
+ * 
+ * cdef generic = Enum("<strided and direct or indirect>")
+ * cdef strided = Enum("<strided and direct>") # default             # <<<<<<<<<<<<<<
+ * cdef indirect = Enum("<strided and indirect>")
+ * 
+ */
+  __pyx_t_7 = __Pyx_PyObject_Call(((PyObject *)__pyx_MemviewEnum_type), __pyx_tuple__16, NULL); if (unlikely(!__pyx_t_7)) __PYX_ERR(1, 310, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_7);
+  __Pyx_XGOTREF(strided);
+  __Pyx_DECREF_SET(strided, __pyx_t_7);
+  __Pyx_GIVEREF(__pyx_t_7);
+  __pyx_t_7 = 0;
+
+  /* "View.MemoryView":311
+ * cdef generic = Enum("<strided and direct or indirect>")
+ * cdef strided = Enum("<strided and direct>") # default
+ * cdef indirect = Enum("<strided and indirect>")             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  __pyx_t_7 = __Pyx_PyObject_Call(((PyObject *)__pyx_MemviewEnum_type), __pyx_tuple__17, NULL); if (unlikely(!__pyx_t_7)) __PYX_ERR(1, 311, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_7);
+  __Pyx_XGOTREF(indirect);
+  __Pyx_DECREF_SET(indirect, __pyx_t_7);
+  __Pyx_GIVEREF(__pyx_t_7);
+  __pyx_t_7 = 0;
+
+  /* "View.MemoryView":314
+ * 
+ * 
+ * cdef contiguous = Enum("<contiguous and direct>")             # <<<<<<<<<<<<<<
+ * cdef indirect_contiguous = Enum("<contiguous and indirect>")
+ * 
+ */
+  __pyx_t_7 = __Pyx_PyObject_Call(((PyObject *)__pyx_MemviewEnum_type), __pyx_tuple__18, NULL); if (unlikely(!__pyx_t_7)) __PYX_ERR(1, 314, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_7);
+  __Pyx_XGOTREF(contiguous);
+  __Pyx_DECREF_SET(contiguous, __pyx_t_7);
+  __Pyx_GIVEREF(__pyx_t_7);
+  __pyx_t_7 = 0;
+
+  /* "View.MemoryView":315
+ * 
+ * cdef contiguous = Enum("<contiguous and direct>")
+ * cdef indirect_contiguous = Enum("<contiguous and indirect>")             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  __pyx_t_7 = __Pyx_PyObject_Call(((PyObject *)__pyx_MemviewEnum_type), __pyx_tuple__19, NULL); if (unlikely(!__pyx_t_7)) __PYX_ERR(1, 315, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_7);
+  __Pyx_XGOTREF(indirect_contiguous);
+  __Pyx_DECREF_SET(indirect_contiguous, __pyx_t_7);
+  __Pyx_GIVEREF(__pyx_t_7);
+  __pyx_t_7 = 0;
+
+  /* "View.MemoryView":323
+ * 
+ * 
+ * cdef int __pyx_memoryview_thread_locks_used = 0             # <<<<<<<<<<<<<<
+ * cdef PyThread_type_lock[8] __pyx_memoryview_thread_locks = [
+ *     PyThread_allocate_lock(),
+ */
+  __pyx_memoryview_thread_locks_used = 0;
+
+  /* "View.MemoryView":324
+ * 
+ * cdef int __pyx_memoryview_thread_locks_used = 0
+ * cdef PyThread_type_lock[8] __pyx_memoryview_thread_locks = [             # <<<<<<<<<<<<<<
+ *     PyThread_allocate_lock(),
+ *     PyThread_allocate_lock(),
+ */
+  __pyx_t_8[0] = PyThread_allocate_lock();
+  __pyx_t_8[1] = PyThread_allocate_lock();
+  __pyx_t_8[2] = PyThread_allocate_lock();
+  __pyx_t_8[3] = PyThread_allocate_lock();
+  __pyx_t_8[4] = PyThread_allocate_lock();
+  __pyx_t_8[5] = PyThread_allocate_lock();
+  __pyx_t_8[6] = PyThread_allocate_lock();
+  __pyx_t_8[7] = PyThread_allocate_lock();
+  memcpy(&(__pyx_memoryview_thread_locks[0]), __pyx_t_8, sizeof(__pyx_memoryview_thread_locks[0]) * (8));
+
+  /* "View.MemoryView":982
+ * 
+ * 
+ *     try:             # <<<<<<<<<<<<<<
+ *         count = __pyx_collections_abc_Sequence.count
+ *         index = __pyx_collections_abc_Sequence.index
+ */
+  {
+    __Pyx_PyThreadState_declare
+    __Pyx_PyThreadState_assign
+    __Pyx_ExceptionSave(&__pyx_t_1, &__pyx_t_2, &__pyx_t_3);
+    __Pyx_XGOTREF(__pyx_t_1);
+    __Pyx_XGOTREF(__pyx_t_2);
+    __Pyx_XGOTREF(__pyx_t_3);
+    /*try:*/ {
+
+      /* "View.MemoryView":983
+ * 
+ *     try:
+ *         count = __pyx_collections_abc_Sequence.count             # <<<<<<<<<<<<<<
+ *         index = __pyx_collections_abc_Sequence.index
+ *     except:
+ */
+      __pyx_t_7 = __Pyx_PyObject_GetAttrStr(__pyx_collections_abc_Sequence, __pyx_n_s_count); if (unlikely(!__pyx_t_7)) __PYX_ERR(1, 983, __pyx_L17_error)
+      __Pyx_GOTREF(__pyx_t_7);
+      if (__Pyx_SetItemOnTypeDict(__pyx_memoryviewslice_type, __pyx_n_s_count, __pyx_t_7) < 0) __PYX_ERR(1, 983, __pyx_L17_error)
+      __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0;
+      PyType_Modified(__pyx_memoryviewslice_type);
+
+      /* "View.MemoryView":984
+ *     try:
+ *         count = __pyx_collections_abc_Sequence.count
+ *         index = __pyx_collections_abc_Sequence.index             # <<<<<<<<<<<<<<
+ *     except:
+ *         pass
+ */
+      __pyx_t_7 = __Pyx_PyObject_GetAttrStr(__pyx_collections_abc_Sequence, __pyx_n_s_index); if (unlikely(!__pyx_t_7)) __PYX_ERR(1, 984, __pyx_L17_error)
+      __Pyx_GOTREF(__pyx_t_7);
+      if (__Pyx_SetItemOnTypeDict(__pyx_memoryviewslice_type, __pyx_n_s_index, __pyx_t_7) < 0) __PYX_ERR(1, 984, __pyx_L17_error)
+      __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0;
+      PyType_Modified(__pyx_memoryviewslice_type);
+
+      /* "View.MemoryView":982
+ * 
+ * 
+ *     try:             # <<<<<<<<<<<<<<
+ *         count = __pyx_collections_abc_Sequence.count
+ *         index = __pyx_collections_abc_Sequence.index
+ */
+    }
+    __Pyx_XDECREF(__pyx_t_1); __pyx_t_1 = 0;
+    __Pyx_XDECREF(__pyx_t_2); __pyx_t_2 = 0;
+    __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0;
+    goto __pyx_L22_try_end;
+    __pyx_L17_error:;
+    __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0;
+    __Pyx_XDECREF(__pyx_t_5); __pyx_t_5 = 0;
+    __Pyx_XDECREF(__pyx_t_7); __pyx_t_7 = 0;
+
+    /* "View.MemoryView":985
+ *         count = __pyx_collections_abc_Sequence.count
+ *         index = __pyx_collections_abc_Sequence.index
+ *     except:             # <<<<<<<<<<<<<<
+ *         pass
+ * 
+ */
+    /*except:*/ {
+      __Pyx_ErrRestore(0,0,0);
+      goto __pyx_L18_exception_handled;
+    }
+    __pyx_L18_exception_handled:;
+    __Pyx_XGIVEREF(__pyx_t_1);
+    __Pyx_XGIVEREF(__pyx_t_2);
+    __Pyx_XGIVEREF(__pyx_t_3);
+    __Pyx_ExceptionReset(__pyx_t_1, __pyx_t_2, __pyx_t_3);
+    __pyx_L22_try_end:;
+  }
+
+  /* "View.MemoryView":988
+ *         pass
+ * 
+ * try:             # <<<<<<<<<<<<<<
+ *     if __pyx_collections_abc_Sequence:
+ * 
+ */
+  {
+    __Pyx_PyThreadState_declare
+    __Pyx_PyThreadState_assign
+    __Pyx_ExceptionSave(&__pyx_t_3, &__pyx_t_2, &__pyx_t_1);
+    __Pyx_XGOTREF(__pyx_t_3);
+    __Pyx_XGOTREF(__pyx_t_2);
+    __Pyx_XGOTREF(__pyx_t_1);
+    /*try:*/ {
+
+      /* "View.MemoryView":989
+ * 
+ * try:
+ *     if __pyx_collections_abc_Sequence:             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+      __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_collections_abc_Sequence); if (unlikely((__pyx_t_6 < 0))) __PYX_ERR(1, 989, __pyx_L23_error)
+      if (__pyx_t_6) {
+
+        /* "View.MemoryView":993
+ * 
+ * 
+ *         __pyx_collections_abc_Sequence.register(_memoryviewslice)             # <<<<<<<<<<<<<<
+ *         __pyx_collections_abc_Sequence.register(array)
+ * except:
+ */
+        __pyx_t_7 = __Pyx_PyObject_GetAttrStr(__pyx_collections_abc_Sequence, __pyx_n_s_register); if (unlikely(!__pyx_t_7)) __PYX_ERR(1, 993, __pyx_L23_error)
+        __Pyx_GOTREF(__pyx_t_7);
+        __pyx_t_4 = __Pyx_PyObject_CallOneArg(__pyx_t_7, ((PyObject *)__pyx_memoryviewslice_type)); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 993, __pyx_L23_error)
+        __Pyx_GOTREF(__pyx_t_4);
+        __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0;
+        __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+
+        /* "View.MemoryView":994
+ * 
+ *         __pyx_collections_abc_Sequence.register(_memoryviewslice)
+ *         __pyx_collections_abc_Sequence.register(array)             # <<<<<<<<<<<<<<
+ * except:
+ *     pass  # ignore failure, it's a minor issue
+ */
+        __pyx_t_4 = __Pyx_PyObject_GetAttrStr(__pyx_collections_abc_Sequence, __pyx_n_s_register); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 994, __pyx_L23_error)
+        __Pyx_GOTREF(__pyx_t_4);
+        __pyx_t_7 = __Pyx_PyObject_CallOneArg(__pyx_t_4, ((PyObject *)__pyx_array_type)); if (unlikely(!__pyx_t_7)) __PYX_ERR(1, 994, __pyx_L23_error)
+        __Pyx_GOTREF(__pyx_t_7);
+        __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+        __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0;
+
+        /* "View.MemoryView":989
+ * 
+ * try:
+ *     if __pyx_collections_abc_Sequence:             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+      }
+
+      /* "View.MemoryView":988
+ *         pass
+ * 
+ * try:             # <<<<<<<<<<<<<<
+ *     if __pyx_collections_abc_Sequence:
+ * 
+ */
+    }
+    __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0;
+    __Pyx_XDECREF(__pyx_t_2); __pyx_t_2 = 0;
+    __Pyx_XDECREF(__pyx_t_1); __pyx_t_1 = 0;
+    goto __pyx_L28_try_end;
+    __pyx_L23_error:;
+    __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0;
+    __Pyx_XDECREF(__pyx_t_5); __pyx_t_5 = 0;
+    __Pyx_XDECREF(__pyx_t_7); __pyx_t_7 = 0;
+
+    /* "View.MemoryView":995
+ *         __pyx_collections_abc_Sequence.register(_memoryviewslice)
+ *         __pyx_collections_abc_Sequence.register(array)
+ * except:             # <<<<<<<<<<<<<<
+ *     pass  # ignore failure, it's a minor issue
+ * 
+ */
+    /*except:*/ {
+      __Pyx_ErrRestore(0,0,0);
+      goto __pyx_L24_exception_handled;
+    }
+    __pyx_L24_exception_handled:;
+    __Pyx_XGIVEREF(__pyx_t_3);
+    __Pyx_XGIVEREF(__pyx_t_2);
+    __Pyx_XGIVEREF(__pyx_t_1);
+    __Pyx_ExceptionReset(__pyx_t_3, __pyx_t_2, __pyx_t_1);
+    __pyx_L28_try_end:;
+  }
+
+  /* "(tree fragment)":1
+ * def __pyx_unpickle_Enum(__pyx_type, long __pyx_checksum, __pyx_state):             # <<<<<<<<<<<<<<
+ *     cdef object __pyx_PickleError
+ *     cdef object __pyx_result
+ */
+  __pyx_t_7 = PyCFunction_NewEx(&__pyx_mdef_15View_dot_MemoryView_1__pyx_unpickle_Enum, NULL, __pyx_n_s_View_MemoryView); if (unlikely(!__pyx_t_7)) __PYX_ERR(1, 1, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_7);
+  if (PyDict_SetItem(__pyx_d, __pyx_n_s_pyx_unpickle_Enum, __pyx_t_7) < 0) __PYX_ERR(1, 1, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0;
+
+  /* "fairseq/data/data_utils_fast.pyx":7
+ * # LICENSE file in the root directory of this source tree.
+ * 
+ * import numpy as np             # <<<<<<<<<<<<<<
+ * 
+ * cimport cython
+ */
+  __pyx_t_7 = __Pyx_ImportDottedModule(__pyx_n_s_numpy, NULL); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 7, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_7);
+  if (PyDict_SetItem(__pyx_d, __pyx_n_s_np, __pyx_t_7) < 0) __PYX_ERR(0, 7, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0;
+
+  /* "fairseq/data/data_utils_fast.pyx":12
+ * cimport numpy as np
+ * 
+ * DTYPE = np.int64             # <<<<<<<<<<<<<<
+ * ctypedef np.int64_t DTYPE_t
+ * 
+ */
+  __Pyx_GetModuleGlobalName(__pyx_t_7, __pyx_n_s_np); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 12, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_7);
+  __pyx_t_4 = __Pyx_PyObject_GetAttrStr(__pyx_t_7, __pyx_n_s_int64); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 12, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_4);
+  __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0;
+  if (PyDict_SetItem(__pyx_d, __pyx_n_s_DTYPE, __pyx_t_4) < 0) __PYX_ERR(0, 12, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+
+  /* "fairseq/data/data_utils_fast.pyx":27
+ * 
+ * @cython.cdivision(True)
+ * cpdef list batch_by_size_fast(             # <<<<<<<<<<<<<<
+ *     np.ndarray[DTYPE_t, ndim=1] indices,
+ *     num_tokens_fn,
+ */
+  __pyx_t_4 = __Pyx_CyFunction_New(&__pyx_mdef_7fairseq_4data_15data_utils_fast_1batch_by_size_fast, 0, __pyx_n_s_batch_by_size_fast, NULL, __pyx_n_s_fairseq_data_data_utils_fast, __pyx_d, ((PyObject *)__pyx_codeobj__23)); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 27, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_4);
+  if (PyDict_SetItem(__pyx_d, __pyx_n_s_batch_by_size_fast, __pyx_t_4) < 0) __PYX_ERR(0, 27, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+
+  /* "fairseq/data/data_utils_fast.pyx":84
+ * 
+ * @cython.cdivision(True)
+ * cpdef list batch_fixed_shapes_fast(             # <<<<<<<<<<<<<<
+ *     np.ndarray[DTYPE_t, ndim=1] indices,
+ *     num_tokens_fn,
+ */
+  __pyx_t_4 = __Pyx_CyFunction_New(&__pyx_mdef_7fairseq_4data_15data_utils_fast_3batch_fixed_shapes_fast, 0, __pyx_n_s_batch_fixed_shapes_fast, NULL, __pyx_n_s_fairseq_data_data_utils_fast, __pyx_d, ((PyObject *)__pyx_codeobj__25)); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 84, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_4);
+  if (PyDict_SetItem(__pyx_d, __pyx_n_s_batch_fixed_shapes_fast, __pyx_t_4) < 0) __PYX_ERR(0, 84, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+
+  /* "fairseq/data/data_utils_fast.pyx":1
+ * # cython: language_level=3             # <<<<<<<<<<<<<<
+ * # Copyright (c) Facebook, Inc. and its affiliates.
+ * #
+ */
+  __pyx_t_4 = __Pyx_PyDict_NewPresized(0); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 1, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_4);
+  if (PyDict_SetItem(__pyx_d, __pyx_n_s_test, __pyx_t_4) < 0) __PYX_ERR(0, 1, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+
+  /*--- Wrapped vars code ---*/
+
+  goto __pyx_L0;
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_4);
+  __Pyx_XDECREF(__pyx_t_5);
+  __Pyx_XDECREF(__pyx_t_7);
+  if (__pyx_m) {
+    if (__pyx_d && stringtab_initialized) {
+      __Pyx_AddTraceback("init fairseq.data.data_utils_fast", __pyx_clineno, __pyx_lineno, __pyx_filename);
+    }
+    #if !CYTHON_USE_MODULE_STATE
+    Py_CLEAR(__pyx_m);
+    #else
+    Py_DECREF(__pyx_m);
+    if (pystate_addmodule_run) {
+      PyObject *tp, *value, *tb;
+      PyErr_Fetch(&tp, &value, &tb);
+      PyState_RemoveModule(&__pyx_moduledef);
+      PyErr_Restore(tp, value, tb);
+    }
+    #endif
+  } else if (!PyErr_Occurred()) {
+    PyErr_SetString(PyExc_ImportError, "init fairseq.data.data_utils_fast");
+  }
+  __pyx_L0:;
+  __Pyx_RefNannyFinishContext();
+  #if CYTHON_PEP489_MULTI_PHASE_INIT
+  return (__pyx_m != NULL) ? 0 : -1;
+  #elif PY_MAJOR_VERSION >= 3
+  return __pyx_m;
+  #else
+  return;
+  #endif
+}
+/* #### Code section: cleanup_globals ### */
+/* #### Code section: cleanup_module ### */
+/* #### Code section: main_method ### */
+/* #### Code section: utility_code_pragmas ### */
+#ifdef _MSC_VER
+#pragma warning( push )
+/* Warning 4127: conditional expression is constant
+ * Cython uses constant conditional expressions to allow inline functions to be optimized at
+ * compile time, so this warning is not useful.
+ */
+#pragma warning( disable : 4127 )
+#endif
+
+
+
+/* #### Code section: utility_code_def ### */
+
+/* --- Runtime support code --- */
+/* Refnanny */
+#if CYTHON_REFNANNY
+static __Pyx_RefNannyAPIStruct *__Pyx_RefNannyImportAPI(const char *modname) {
+    /* Import module `modname` and read its "RefNannyAPI" attribute, which is decoded
+     * as a raw pointer to the refnanny API table. Returns NULL when the import, the
+     * attribute lookup, or the pointer conversion yields nothing. */
+    void *api_ptr = NULL;
+    PyObject *api_attr = NULL;
+    PyObject *module = PyImport_ImportModule(modname);
+    if (module) {
+        api_attr = PyObject_GetAttrString(module, "RefNannyAPI");
+        if (api_attr) {
+            api_ptr = PyLong_AsVoidPtr(api_attr);
+        }
+    }
+    /* Both temporaries are released on every path. */
+    Py_XDECREF(api_attr);
+    Py_XDECREF(module);
+    return (__Pyx_RefNannyAPIStruct *)api_ptr;
+}
+#endif
+
+/* PyErrExceptionMatches */
+#if CYTHON_FAST_THREAD_STATE
+static int __Pyx_PyErr_ExceptionMatchesTuple(PyObject *exc_type, PyObject *tuple) {
+    /* Return 1 when `exc_type` matches any entry of `tuple`, otherwise 0. */
+    const Py_ssize_t count = PyTuple_GET_SIZE(tuple);
+    Py_ssize_t idx;
+#if PY_MAJOR_VERSION >= 3
+    /* First pass: pointer identity only, before any full match is attempted. */
+    for (idx = 0; idx < count; ++idx) {
+        if (PyTuple_GET_ITEM(tuple, idx) == exc_type) {
+            return 1;
+        }
+    }
+#endif
+    /* Second pass: full exception matching against each entry. */
+    for (idx = 0; idx < count; ++idx) {
+        if (__Pyx_PyErr_GivenExceptionMatches(exc_type, PyTuple_GET_ITEM(tuple, idx))) {
+            return 1;
+        }
+    }
+    return 0;
+}
+static CYTHON_INLINE int __Pyx_PyErr_ExceptionMatchesInState(PyThreadState* tstate, PyObject* err) {  /*
+ * Check whether the exception pending on `tstate` matches `err`.
+ *
+ * `err` is either a single object or a tuple of candidates (see the PyTuple_Check below).
+ * Returns 1 right away when the pending exception's type is identical to `err`, 0 when no
+ * exception is pending, and otherwise whatever the tuple / single-type helper reports.
+ */
+    int result;
+    PyObject *exc_type;
+#if PY_VERSION_HEX >= 0x030C00A6
+    PyObject *current_exception = tstate->current_exception;  /* this branch reads the exception instance and derives its type */
+    if (unlikely(!current_exception)) return 0;
+    exc_type = (PyObject*) Py_TYPE(current_exception);
+    if (exc_type == err) return 1;
+#else
+    exc_type = tstate->curexc_type;  /* this branch reads the exception type straight from the thread state */
+    if (exc_type == err) return 1;  /* identity is tested before the NULL check in this branch */
+    if (unlikely(!exc_type)) return 0;
+#endif
+    #if CYTHON_AVOID_BORROWED_REFS
+    Py_INCREF(exc_type);  /* hold our own reference while the comparison runs */
+    #endif
+    if (unlikely(PyTuple_Check(err))) {
+        result = __Pyx_PyErr_ExceptionMatchesTuple(exc_type, err);
+    } else {
+        result = __Pyx_PyErr_GivenExceptionMatches(exc_type, err);
+    }
+    #if CYTHON_AVOID_BORROWED_REFS
+    Py_DECREF(exc_type);  /* balances the Py_INCREF taken above */
+    #endif
+    return result;
+}
+#endif
+
+/* PyErrFetchRestore */
+#if CYTHON_FAST_THREAD_STATE
+static CYTHON_INLINE void __Pyx_ErrRestoreInState(PyThreadState *tstate, PyObject *type, PyObject *value, PyObject *tb) {  /*
+ * Make (type, value, tb) the pending exception on `tstate`, releasing whatever was pending before.
+ *
+ * No new references are taken on the arguments: each one is either stored directly in the
+ * thread state or released here, so the caller gives up its references.
+ */
+#if PY_VERSION_HEX >= 0x030C00A6
+    PyObject *tmp_value;
+    assert(type == NULL || (value != NULL && type == (PyObject*) Py_TYPE(value)));  /* only the instance is stored in this branch, so `type` must agree with it */
+    if (value) {
+        #if CYTHON_COMPILING_IN_CPYTHON
+        if (unlikely(((PyBaseExceptionObject*) value)->traceback != tb))  /* skip the call when this traceback is already attached */
+        #endif
+            PyException_SetTraceback(value, tb);
+    }
+    tmp_value = tstate->current_exception;
+    tstate->current_exception = value;
+    Py_XDECREF(tmp_value);  /* the previous exception is dropped only after the new one is in place */
+    Py_XDECREF(type);  /* type and tb are not stored in this branch, so the passed-in references are released */
+    Py_XDECREF(tb);
+#else
+    PyObject *tmp_type, *tmp_value, *tmp_tb;
+    tmp_type = tstate->curexc_type;
+    tmp_value = tstate->curexc_value;
+    tmp_tb = tstate->curexc_traceback;
+    tstate->curexc_type = type;
+    tstate->curexc_value = value;
+    tstate->curexc_traceback = tb;
+    Py_XDECREF(tmp_type);  /* the old triple is released only after the new one is installed */
+    Py_XDECREF(tmp_value);
+    Py_XDECREF(tmp_tb);
+#endif
+}
+static CYTHON_INLINE void __Pyx_ErrFetchInState(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb) {  /*
+ * Move the pending exception out of `tstate` into *type, *value and *tb, and clear the
+ * corresponding thread-state field(s) so nothing remains pending afterwards.
+ */
+#if PY_VERSION_HEX >= 0x030C00A6
+    PyObject* exc_value;
+    exc_value = tstate->current_exception;
+    tstate->current_exception = 0;
+    *value = exc_value;  /* the thread state's reference to the instance is handed to the caller */
+    *type = NULL;  /* type and tb stay NULL when no exception was pending */
+    *tb = NULL;
+    if (exc_value) {
+        *type = (PyObject*) Py_TYPE(exc_value);
+        Py_INCREF(*type);  /* the type is derived from the instance, so it needs its own reference */
+        #if CYTHON_COMPILING_IN_CPYTHON
+        *tb = ((PyBaseExceptionObject*) exc_value)->traceback;
+        Py_XINCREF(*tb);  /* the traceback may be absent, hence the X variant */
+        #else
+        *tb = PyException_GetTraceback(exc_value);
+        #endif
+    }
+#else
+    *type = tstate->curexc_type;  /* the three stored references are handed over as they are */
+    *value = tstate->curexc_value;
+    *tb = tstate->curexc_traceback;
+    tstate->curexc_type = 0;
+    tstate->curexc_value = 0;
+    tstate->curexc_traceback = 0;
+#endif
+}
+#endif
+
+/* PyObjectGetAttrStr */
+#if CYTHON_USE_TYPE_SLOTS
+static CYTHON_INLINE PyObject* __Pyx_PyObject_GetAttrStr(PyObject* obj, PyObject* attr_name) {
+    /* Attribute lookup that calls the type's getattr slot directly when one is set,
+     * and falls back to PyObject_GetAttr() otherwise. */
+    PyTypeObject *obj_type = Py_TYPE(obj);
+    getattrofunc getattro_slot = obj_type->tp_getattro;
+    if (likely(getattro_slot != NULL)) {
+        return getattro_slot(obj, attr_name);
+    }
+#if PY_MAJOR_VERSION < 3
+    /* Python 2 only: the older char*-based slot is tried next. */
+    if (likely(obj_type->tp_getattr != NULL)) {
+        return obj_type->tp_getattr(obj, PyString_AS_STRING(attr_name));
+    }
+#endif
+    return PyObject_GetAttr(obj, attr_name);
+}
+#endif
+
+/* PyObjectGetAttrStrNoError */
+#if __PYX_LIMITED_VERSION_HEX < 0x030d00A1
+static void __Pyx_PyObject_GetAttrStr_ClearAttributeError(void) {
+    /* Clear the pending exception, but only when it matches AttributeError;
+     * any other pending exception is left untouched. */
+    __Pyx_PyThreadState_declare
+    __Pyx_PyThreadState_assign
+    if (unlikely(!__Pyx_PyErr_ExceptionMatches(PyExc_AttributeError))) {
+        return;
+    }
+    __Pyx_PyErr_Clear();
+}
+#endif
+static CYTHON_INLINE PyObject* __Pyx_PyObject_GetAttrStrNoError(PyObject* obj, PyObject* attr_name) {  /*
+ * Look up `attr_name` on `obj` and return the result (NULL when the lookup produced nothing).
+ *
+ * On the generic fallback path a failed lookup is followed by
+ * __Pyx_PyObject_GetAttrStr_ClearAttributeError(), which clears a pending AttributeError.
+ */
+    PyObject *result;
+#if __PYX_LIMITED_VERSION_HEX >= 0x030d00A1
+    (void) PyObject_GetOptionalAttr(obj, attr_name, &result);  /* the status code is deliberately discarded; only `result` is returned */
+    return result;
+#else
+#if CYTHON_COMPILING_IN_CPYTHON && CYTHON_USE_TYPE_SLOTS && PY_VERSION_HEX >= 0x030700B1
+    PyTypeObject* tp = Py_TYPE(obj);
+    if (likely(tp->tp_getattro == PyObject_GenericGetAttr)) {  /* shortcut taken only for types that use the generic getattr slot */
+        return _PyObject_GenericGetAttrWithDict(obj, attr_name, NULL, 1);  /* NOTE(review): the trailing 1 presumably asks CPython to suppress AttributeError - confirm */
+    }
+#endif
+    result = __Pyx_PyObject_GetAttrStr(obj, attr_name);
+    if (unlikely(!result)) {
+        __Pyx_PyObject_GetAttrStr_ClearAttributeError();
+    }
+    return result;
+#endif
+}
+
+/* GetBuiltinName */
+static PyObject *__Pyx_GetBuiltinName(PyObject *name) {
+    /* Look `name` up on the builtins object `__pyx_b`. When the lookup yields nothing
+     * and no exception is pending, a NameError is raised. Returns the looked-up
+     * object, or NULL on failure. */
+    PyObject *builtin = __Pyx_PyObject_GetAttrStrNoError(__pyx_b, name);
+    if (likely(builtin) || PyErr_Occurred()) {
+        /* Either found, or an error is already pending: nothing to add. */
+        return builtin;
+    }
+#if PY_MAJOR_VERSION >= 3
+    PyErr_Format(PyExc_NameError, "name '%U' is not defined", name);
+#else
+    PyErr_Format(PyExc_NameError, "name '%.200s' is not defined", PyString_AS_STRING(name));
+#endif
+    return builtin;
+}
+
+/* TupleAndListFromArray */
+#if CYTHON_COMPILING_IN_CPYTHON
+static CYTHON_INLINE void __Pyx_copy_object_array(PyObject *const *CYTHON_RESTRICT src, PyObject** CYTHON_RESTRICT dest, Py_ssize_t length) {
+    /* Copy `length` object pointers from `src` to `dest`, taking a new reference
+     * on each copied object. */
+    Py_ssize_t idx = 0;
+    while (idx < length) {
+        PyObject *item = src[idx];
+        dest[idx] = item;
+        Py_INCREF(item);
+        ++idx;
+    }
+}
+static CYTHON_INLINE PyObject *
+__Pyx_PyTuple_FromArray(PyObject *const *src, Py_ssize_t n)
+{
+    /* Build a tuple from the first `n` entries of `src`. For n <= 0 a new reference
+     * to the shared empty tuple is returned. Returns NULL when PyTuple_New() fails. */
+    PyObject *result;
+    if (n > 0) {
+        result = PyTuple_New(n);
+        if (likely(result != NULL)) {
+            /* Fill the freshly allocated item slots directly. */
+            __Pyx_copy_object_array(src, ((PyTupleObject*)result)->ob_item, n);
+        }
+        return result;
+    }
+    Py_INCREF(__pyx_empty_tuple);
+    return __pyx_empty_tuple;
+}
+static CYTHON_INLINE PyObject *
+__Pyx_PyList_FromArray(PyObject *const *src, Py_ssize_t n)
+{
+    /* Build a list from the first `n` entries of `src`; n <= 0 gives a new empty
+     * list. Returns NULL when PyList_New() fails. */
+    PyObject *result = PyList_New((n > 0) ? n : 0);
+    if (likely(result != NULL) && n > 0) {
+        /* Fill the freshly allocated item slots directly. */
+        __Pyx_copy_object_array(src, ((PyListObject*)result)->ob_item, n);
+    }
+    return result;
+}
+#endif
+
+/* BytesEquals */
+static CYTHON_INLINE int __Pyx_PyBytes_Equals(PyObject* s1, PyObject* s2, int equals) {  /*
+ * Compare s1 and s2 for equality or inequality, with fast paths for exact bytes objects.
+ *
+ * `equals` is tested against Py_EQ and Py_NE below to select the sense of the comparison.
+ * Returns the truth value of the comparison; the generic fallback returns -1 when
+ * PyObject_RichCompare fails.
+ */
+#if CYTHON_COMPILING_IN_PYPY || CYTHON_COMPILING_IN_LIMITED_API
+    return PyObject_RichCompareBool(s1, s2, equals);
+#else
+    if (s1 == s2) {  /* the very same object */
+        return (equals == Py_EQ);
+    } else if (PyBytes_CheckExact(s1) & PyBytes_CheckExact(s2)) {  /* bitwise &: both checks are always evaluated */
+        const char *ps1, *ps2;
+        Py_ssize_t length = PyBytes_GET_SIZE(s1);
+        if (length != PyBytes_GET_SIZE(s2))
+            return (equals == Py_NE);
+        ps1 = PyBytes_AS_STRING(s1);
+        ps2 = PyBytes_AS_STRING(s2);
+        if (ps1[0] != ps2[0]) {  /* cheap first-byte test before anything more expensive */
+            return (equals == Py_NE);
+        } else if (length == 1) {  /* equal lengths and equal first byte settle the one-byte case */
+            return (equals == Py_EQ);
+        } else {
+            int result;
+#if CYTHON_USE_UNICODE_INTERNALS && (PY_VERSION_HEX < 0x030B0000)
+            Py_hash_t hash1, hash2;
+            hash1 = ((PyBytesObject*)s1)->ob_shash;
+            hash2 = ((PyBytesObject*)s2)->ob_shash;
+            if (hash1 != hash2 && hash1 != -1 && hash2 != -1) {  /* differing stored hashes decide inequality; -1 is excluded (NOTE(review): presumably the not-yet-computed marker - confirm) */
+                return (equals == Py_NE);
+            }
+#endif
+            result = memcmp(ps1, ps2, (size_t)length);
+            return (equals == Py_EQ) ? (result == 0) : (result != 0);
+        }
+    } else if ((s1 == Py_None) & PyBytes_CheckExact(s2)) {  /* None versus exact bytes: reported as unequal without a rich compare */
+        return (equals == Py_NE);
+    } else if ((s2 == Py_None) & PyBytes_CheckExact(s1)) {
+        return (equals == Py_NE);
+    } else {  /* everything else goes through the generic rich comparison */
+        int result;
+        PyObject* py_result = PyObject_RichCompare(s1, s2, equals);
+        if (!py_result)
+            return -1;
+        result = __Pyx_PyObject_IsTrue(py_result);
+        Py_DECREF(py_result);
+        return result;
+    }
+#endif
+}
+
+/* UnicodeEquals */
+static CYTHON_INLINE int __Pyx_PyUnicode_Equals(PyObject* s1, PyObject* s2, int equals) {  /*
+ * Compare s1 and s2 for equality or inequality, with fast paths for exact unicode objects.
+ *
+ * `equals` is tested against Py_EQ and Py_NE to select the sense of the comparison.
+ * Returns the truth value of the comparison, or -1 when a conversion, the READY step or
+ * the generic PyObject_RichCompare fallback fails.
+ */
+#if CYTHON_COMPILING_IN_PYPY || CYTHON_COMPILING_IN_LIMITED_API
+    return PyObject_RichCompareBool(s1, s2, equals);
+#else
+#if PY_MAJOR_VERSION < 3
+    PyObject* owned_ref = NULL;  /* holds a temporary unicode conversion; released on the exit paths below */
+#endif
+    int s1_is_unicode, s2_is_unicode;
+    if (s1 == s2) {  /* the very same object */
+        goto return_eq;
+    }
+    s1_is_unicode = PyUnicode_CheckExact(s1);
+    s2_is_unicode = PyUnicode_CheckExact(s2);
+#if PY_MAJOR_VERSION < 3
+    if ((s1_is_unicode & (!s2_is_unicode)) && PyString_CheckExact(s2)) {  /* mixed unicode / str: convert the str side to unicode first */
+        owned_ref = PyUnicode_FromObject(s2);
+        if (unlikely(!owned_ref))
+            return -1;
+        s2 = owned_ref;
+        s2_is_unicode = 1;
+    } else if ((s2_is_unicode & (!s1_is_unicode)) && PyString_CheckExact(s1)) {
+        owned_ref = PyUnicode_FromObject(s1);
+        if (unlikely(!owned_ref))
+            return -1;
+        s1 = owned_ref;
+        s1_is_unicode = 1;
+    } else if (((!s2_is_unicode) & (!s1_is_unicode))) {  /* neither side is exact unicode: defer to the bytes comparison */
+        return __Pyx_PyBytes_Equals(s1, s2, equals);
+    }
+#endif
+    if (s1_is_unicode & s2_is_unicode) {
+        Py_ssize_t length;
+        int kind;
+        void *data1, *data2;
+        if (unlikely(__Pyx_PyUnicode_READY(s1) < 0) || unlikely(__Pyx_PyUnicode_READY(s2) < 0))
+            return -1;
+        length = __Pyx_PyUnicode_GET_LENGTH(s1);
+        if (length != __Pyx_PyUnicode_GET_LENGTH(s2)) {
+            goto return_ne;
+        }
+#if CYTHON_USE_UNICODE_INTERNALS
+        {
+            Py_hash_t hash1, hash2;
+        #if CYTHON_PEP393_ENABLED
+            hash1 = ((PyASCIIObject*)s1)->hash;
+            hash2 = ((PyASCIIObject*)s2)->hash;
+        #else
+            hash1 = ((PyUnicodeObject*)s1)->hash;
+            hash2 = ((PyUnicodeObject*)s2)->hash;
+        #endif
+            if (hash1 != hash2 && hash1 != -1 && hash2 != -1) {  /* differing stored hashes decide inequality; -1 is excluded (NOTE(review): presumably the not-yet-computed marker - confirm) */
+                goto return_ne;
+            }
+        }
+#endif
+        kind = __Pyx_PyUnicode_KIND(s1);
+        if (kind != __Pyx_PyUnicode_KIND(s2)) {  /* strings of different kinds are reported as unequal */
+            goto return_ne;
+        }
+        data1 = __Pyx_PyUnicode_DATA(s1);
+        data2 = __Pyx_PyUnicode_DATA(s2);
+        if (__Pyx_PyUnicode_READ(kind, data1, 0) != __Pyx_PyUnicode_READ(kind, data2, 0)) {  /* cheap first-character test before the full memcmp */
+            goto return_ne;
+        } else if (length == 1) {
+            goto return_eq;
+        } else {
+            int result = memcmp(data1, data2, (size_t)(length * kind));  /* NOTE(review): the byte count assumes `kind` is the per-character width in bytes - confirm */
+            #if PY_MAJOR_VERSION < 3
+            Py_XDECREF(owned_ref);
+            #endif
+            return (equals == Py_EQ) ? (result == 0) : (result != 0);
+        }
+    } else if ((s1 == Py_None) & s2_is_unicode) {  /* None versus exact unicode: reported as unequal without a rich compare */
+        goto return_ne;
+    } else if ((s2 == Py_None) & s1_is_unicode) {
+        goto return_ne;
+    } else {  /* everything else goes through the generic rich comparison */
+        int result;
+        PyObject* py_result = PyObject_RichCompare(s1, s2, equals);
+        #if PY_MAJOR_VERSION < 3
+        Py_XDECREF(owned_ref);
+        #endif
+        if (!py_result)
+            return -1;
+        result = __Pyx_PyObject_IsTrue(py_result);
+        Py_DECREF(py_result);
+        return result;
+    }
+return_eq:  /* shared exit for the "objects are equal" outcome */
+    #if PY_MAJOR_VERSION < 3
+    Py_XDECREF(owned_ref);
+    #endif
+    return (equals == Py_EQ);
+return_ne:  /* shared exit for the "objects differ" outcome */
+    #if PY_MAJOR_VERSION < 3
+    Py_XDECREF(owned_ref);
+    #endif
+    return (equals == Py_NE);
+#endif
+}
+
+/* fastcall */
+#if CYTHON_METH_FASTCALL
+static CYTHON_INLINE PyObject * __Pyx_GetKwValue_FASTCALL(PyObject *kwnames, PyObject *const *kwvalues, PyObject *s)
+{
+    /* Find keyword name `s` in the tuple `kwnames` and return the entry of `kwvalues`
+     * at the same index, without taking a new reference. Returns NULL when the name
+     * is not present or when a string comparison fails. */
+    const Py_ssize_t nkw = PyTuple_GET_SIZE(kwnames);
+    Py_ssize_t idx;
+    /* First pass: pointer identity only. */
+    for (idx = 0; idx < nkw; ++idx) {
+        if (PyTuple_GET_ITEM(kwnames, idx) == s) {
+            return kwvalues[idx];
+        }
+    }
+    /* Second pass: compare by content. */
+    for (idx = 0; idx < nkw; ++idx) {
+        int cmp = __Pyx_PyUnicode_Equals(s, PyTuple_GET_ITEM(kwnames, idx), Py_EQ);
+        if (likely(cmp == 0)) {
+            continue;
+        }
+        if (unlikely(cmp < 0)) {
+            return NULL;
+        }
+        return kwvalues[idx];
+    }
+    return NULL;
+}
+#if CYTHON_COMPILING_IN_CPYTHON && PY_VERSION_HEX >= 0x030d0000
+CYTHON_UNUSED static PyObject *__Pyx_KwargsAsDict_FASTCALL(PyObject *kwnames, PyObject *const *kwvalues) {
+    /* Build a new dict that maps each name in the tuple `kwnames` to the entry of
+     * `kwvalues` at the same index. Returns NULL when allocation or insertion fails. */
+    const Py_ssize_t nkwargs = PyTuple_GET_SIZE(kwnames);
+    Py_ssize_t idx;
+    PyObject *result = PyDict_New();
+    if (unlikely(result == NULL)) {
+        return NULL;
+    }
+    for (idx = 0; idx < nkwargs; ++idx) {
+        PyObject *kwname = PyTuple_GET_ITEM(kwnames, idx);
+        if (unlikely(PyDict_SetItem(result, kwname, kwvalues[idx]) < 0)) {
+            /* Discard the partially filled dict. */
+            Py_DECREF(result);
+            return NULL;
+        }
+    }
+    return result;
+}
+#endif
+#endif
+
+/* RaiseArgTupleInvalid */
+static void __Pyx_RaiseArgtupleInvalid(
+    const char* func_name,
+    int exact,
+    Py_ssize_t num_min,
+    Py_ssize_t num_max,
+    Py_ssize_t num_found)
+{
+    /* Raise TypeError describing a wrong number of positional arguments.
+     * The reported bound is `num_min` when too few arguments were given and
+     * `num_max` otherwise; `exact` switches the wording to "exactly". */
+    const int too_few = (num_found < num_min);
+    const Py_ssize_t num_expected = too_few ? num_min : num_max;
+    const char *quantifier = exact ? "exactly" : (too_few ? "at least" : "at most");
+    const char *plural_suffix = (num_expected == 1) ? "" : "s";
+    PyErr_Format(PyExc_TypeError,
+                 "%.200s() takes %.8s %" CYTHON_FORMAT_SSIZE_T "d positional argument%.1s (%" CYTHON_FORMAT_SSIZE_T "d given)",
+                 func_name, quantifier, num_expected,
+                 plural_suffix, num_found);
+}
+
+/* RaiseDoubleKeywords */
+static void __Pyx_RaiseDoubleKeywordsError(
+    const char* func_name,
+    PyObject* kw_name)
+{
+    /* Raise TypeError reporting that keyword argument `kw_name` received more than
+     * one value in a call to `func_name`. */
+#if PY_MAJOR_VERSION >= 3
+    PyErr_Format(PyExc_TypeError,
+        "%s() got multiple values for keyword argument '%U'",
+        func_name, kw_name);
+#else
+    PyErr_Format(PyExc_TypeError,
+        "%s() got multiple values for keyword argument '%s'",
+        func_name, PyString_AsString(kw_name));
+#endif
+}
+
+/* ParseKeywords */
+static int __Pyx_ParseOptionalKeywords(
+    PyObject *kwds,
+    PyObject *const *kwvalues,
+    PyObject **argnames[],
+    PyObject *kwds2,
+    PyObject *values[],
+    Py_ssize_t num_pos_args,
+    const char* function_name)
+{  /*
+ * Distribute the keyword arguments of a call into `values`.
+ *
+ * kwds          - keyword arguments: a tuple of names (the fastcall form, with the matching
+ *                 values in `kwvalues`) or a dict walked with PyDict_Next.
+ * kwvalues      - values parallel to `kwds`; only read in the tuple form.
+ * argnames      - NULL-terminated array of pointers to the accepted argument names. The first
+ *                 `num_pos_args` entries were already supplied positionally.
+ * kwds2         - optional dict that receives keywords matching no accepted name; when it is
+ *                 NULL such a keyword is an error.
+ * values        - output array, indexed like `argnames`.
+ * function_name - used in the error messages only.
+ *
+ * Returns 0 on success and -1 on failure.
+ */
+    PyObject *key = 0, *value = 0;
+    Py_ssize_t pos = 0;
+    PyObject*** name;
+    PyObject*** first_kw_arg = argnames + num_pos_args;  /* names before this point were filled positionally */
+    int kwds_is_tuple = CYTHON_METH_FASTCALL && likely(PyTuple_Check(kwds));
+    while (1) {
+        Py_XDECREF(key); key = NULL;  /* release whatever the previous iteration still owned */
+        Py_XDECREF(value); value = NULL;
+        if (kwds_is_tuple) {
+            Py_ssize_t size;
+#if CYTHON_ASSUME_SAFE_MACROS
+            size = PyTuple_GET_SIZE(kwds);
+#else
+            size = PyTuple_Size(kwds);
+            if (size < 0) goto bad;
+#endif
+            if (pos >= size) break;
+#if CYTHON_AVOID_BORROWED_REFS
+            key = __Pyx_PySequence_ITEM(kwds, pos);
+            if (!key) goto bad;
+#elif CYTHON_ASSUME_SAFE_MACROS
+            key = PyTuple_GET_ITEM(kwds, pos);
+#else
+            key = PyTuple_GetItem(kwds, pos);
+            if (!key) goto bad;
+#endif
+            value = kwvalues[pos];
+            pos++;
+        }
+        else
+        {
+            if (!PyDict_Next(kwds, &pos, &key, &value)) break;
+#if CYTHON_AVOID_BORROWED_REFS
+            Py_INCREF(key);
+#endif
+        }
+        name = first_kw_arg;
+        while (*name && (**name != key)) name++;  /* fast path: match the key by pointer identity */
+        if (*name) {
+            values[name-argnames] = value;
+#if CYTHON_AVOID_BORROWED_REFS
+            Py_INCREF(value);
+            Py_DECREF(key);
+#endif
+            key = NULL;  /* cleared so the release at the top of the loop does not touch them */
+            value = NULL;
+            continue;
+        }
+#if !CYTHON_AVOID_BORROWED_REFS
+        Py_INCREF(key);
+#endif
+        Py_INCREF(value);  /* from here on key and value are owned; every later exit releases them */
+        name = first_kw_arg;
+        #if PY_MAJOR_VERSION < 3
+        if (likely(PyString_Check(key))) {
+            while (*name) {
+                if ((CYTHON_COMPILING_IN_PYPY || PyString_GET_SIZE(**name) == PyString_GET_SIZE(key))
+                        && _PyString_Eq(**name, key)) {
+                    values[name-argnames] = value;
+#if CYTHON_AVOID_BORROWED_REFS
+                    value = NULL;
+#endif
+                    break;
+                }
+                name++;
+            }
+            if (*name) continue;
+            else {
+                PyObject*** argname = argnames;
+                while (argname != first_kw_arg) {  /* a keyword naming an argument that was already given positionally */
+                    if ((**argname == key) || (
+                            (CYTHON_COMPILING_IN_PYPY || PyString_GET_SIZE(**argname) == PyString_GET_SIZE(key))
+                             && _PyString_Eq(**argname, key))) {
+                        goto arg_passed_twice;
+                    }
+                    argname++;
+                }
+            }
+        } else
+        #endif
+        if (likely(PyUnicode_Check(key))) {
+            while (*name) {  /* slow path: match the key by string content */
+                int cmp = (
+                #if !CYTHON_COMPILING_IN_PYPY && PY_MAJOR_VERSION >= 3
+                    (__Pyx_PyUnicode_GET_LENGTH(**name) != __Pyx_PyUnicode_GET_LENGTH(key)) ? 1 :
+                #endif
+                    PyUnicode_Compare(**name, key)
+                );
+                if (cmp < 0 && unlikely(PyErr_Occurred())) goto bad;  /* a negative result alone is not an error; check the error indicator */
+                if (cmp == 0) {
+                    values[name-argnames] = value;
+#if CYTHON_AVOID_BORROWED_REFS
+                    value = NULL;
+#endif
+                    break;
+                }
+                name++;
+            }
+            if (*name) continue;
+            else {
+                PyObject*** argname = argnames;
+                while (argname != first_kw_arg) {  /* a keyword naming an argument that was already given positionally */
+                    int cmp = (**argname == key) ? 0 :
+                    #if !CYTHON_COMPILING_IN_PYPY && PY_MAJOR_VERSION >= 3
+                        (__Pyx_PyUnicode_GET_LENGTH(**argname) != __Pyx_PyUnicode_GET_LENGTH(key)) ? 1 :
+                    #endif
+                        PyUnicode_Compare(**argname, key);
+                    if (cmp < 0 && unlikely(PyErr_Occurred())) goto bad;
+                    if (cmp == 0) goto arg_passed_twice;
+                    argname++;
+                }
+            }
+        } else
+            goto invalid_keyword_type;
+        if (kwds2) {  /* unknown keyword: collect it if the caller supplied a dict, otherwise reject it */
+            if (unlikely(PyDict_SetItem(kwds2, key, value))) goto bad;
+        } else {
+            goto invalid_keyword;
+        }
+    }
+    Py_XDECREF(key);
+    Py_XDECREF(value);
+    return 0;
+arg_passed_twice:
+    __Pyx_RaiseDoubleKeywordsError(function_name, key);
+    goto bad;
+invalid_keyword_type:
+    PyErr_Format(PyExc_TypeError,
+        "%.200s() keywords must be strings", function_name);
+    goto bad;
+invalid_keyword:
+    #if PY_MAJOR_VERSION < 3
+    PyErr_Format(PyExc_TypeError,
+        "%.200s() got an unexpected keyword argument '%.200s'",
+        function_name, PyString_AsString(key));
+    #else
+    PyErr_Format(PyExc_TypeError,
+        "%s() got an unexpected keyword argument '%U'",
+        function_name, key);
+    #endif
+bad:  /* common failure exit; invalid_keyword falls through into it */
+    Py_XDECREF(key);
+    Py_XDECREF(value);
+    return -1;
+}
+
+/* ArgTypeTest */
+static int __Pyx__ArgTypeTest(PyObject *obj, PyTypeObject *type, const char *name, int exact)
+{
+    /* Check that argument `obj` (called `name` in messages) is acceptable for `type`.
+     * Returns 1 when the check passes. Otherwise sets an exception and returns 0:
+     * SystemError when `type` is NULL, TypeError for a type mismatch. */
+    __Pyx_TypeName expected_name;
+    __Pyx_TypeName actual_name;
+    if (unlikely(type == NULL)) {
+        PyErr_SetString(PyExc_SystemError, "Missing type object");
+        return 0;
+    }
+    if (!exact) {
+        if (likely(__Pyx_TypeCheck(obj, type))) {
+            return 1;
+        }
+    }
+#if PY_MAJOR_VERSION == 2
+    /* Exact mode accepts only the Python 2 basestring case here. */
+    else if ((type == &PyBaseString_Type) && likely(__Pyx_PyBaseString_CheckExact(obj))) {
+        return 1;
+    }
+#endif
+    /* Mismatch: report both the expected and the actual type name. */
+    expected_name = __Pyx_PyType_GetName(type);
+    actual_name = __Pyx_PyType_GetName(Py_TYPE(obj));
+    PyErr_Format(PyExc_TypeError,
+        "Argument '%.200s' has incorrect type (expected " __Pyx_FMT_TYPENAME
+        ", got " __Pyx_FMT_TYPENAME ")", name, expected_name, actual_name);
+    __Pyx_DECREF_TypeName(expected_name);
+    __Pyx_DECREF_TypeName(actual_name);
+    return 0;
+}
+
+/* RaiseException */
+#if PY_MAJOR_VERSION < 3
+static void __Pyx_Raise(PyObject *type, PyObject *value, PyObject *tb, PyObject *cause) {  /*
+ * Python 2 variant: make (type, value, tb) the pending exception.
+ *
+ * `cause` is unused in this variant. A NULL or Py_None `value` / `tb` is treated as absent.
+ * `type` may be a class or an instance; an instance must not come with a separate value.
+ * On invalid input a TypeError is set instead and the references taken here are released.
+ */
+    __Pyx_PyThreadState_declare
+    CYTHON_UNUSED_VAR(cause);
+    Py_XINCREF(type);  /* own references are taken up front; they are handed to __Pyx_ErrRestore or released at raise_error */
+    if (!value || value == Py_None)
+        value = NULL;
+    else
+        Py_INCREF(value);
+    if (!tb || tb == Py_None)
+        tb = NULL;
+    else {
+        Py_INCREF(tb);
+        if (!PyTraceBack_Check(tb)) {
+            PyErr_SetString(PyExc_TypeError,
+                "raise: arg 3 must be a traceback or None");
+            goto raise_error;
+        }
+    }
+    if (PyType_Check(type)) {
+#if CYTHON_COMPILING_IN_PYPY
+        if (!value) {
+            Py_INCREF(Py_None);
+            value = Py_None;
+        }
+#endif
+        PyErr_NormalizeException(&type, &value, &tb);
+    } else {
+        if (value) {
+            PyErr_SetString(PyExc_TypeError,
+                "instance exception may not have a separate value");
+            goto raise_error;
+        }
+        value = type;  /* the instance becomes the value, reusing the reference taken at the top */
+        type = (PyObject*) Py_TYPE(type);  /* and its class becomes the type */
+        Py_INCREF(type);
+        if (!PyType_IsSubtype((PyTypeObject *)type, (PyTypeObject *)PyExc_BaseException)) {
+            PyErr_SetString(PyExc_TypeError,
+                "raise: exception class must be a subclass of BaseException");
+            goto raise_error;
+        }
+    }
+    __Pyx_PyThreadState_assign
+    __Pyx_ErrRestore(type, value, tb);
+    return;
+raise_error:  /* failure exit: drop every reference still held */
+    Py_XDECREF(value);
+    Py_XDECREF(type);
+    Py_XDECREF(tb);
+    return;
+}
+#else
+/*
+ * Python 3 variant of the raise helper: sets the current exception from
+ * (type, value, tb) and optionally attaches `cause` to the exception instance.
+ *
+ *   type  - exception class or exception instance
+ *   value - constructor argument(s) or an instance; None is treated as absent
+ *   tb    - traceback object; None is treated as absent
+ *   cause - NULL (no cause handling), None, an exception class or an instance
+ *
+ * Invalid arguments result in a TypeError being set instead.  All arguments
+ * are borrowed; only `owned_instance` (an instance created here) is owned and
+ * released before returning.
+ */
+static void __Pyx_Raise(PyObject *type, PyObject *value, PyObject *tb, PyObject *cause) {
+    PyObject* owned_instance = NULL;
+    if (tb == Py_None) {
+        tb = 0;
+    } else if (tb && !PyTraceBack_Check(tb)) {
+        PyErr_SetString(PyExc_TypeError,
+            "raise: arg 3 must be a traceback or None");
+        goto bad;
+    }
+    if (value == Py_None)
+        value = 0;
+    if (PyExceptionInstance_Check(type)) {
+        /* `type` is already an instance: it becomes the value, and its class
+           becomes the exception type. */
+        if (value) {
+            PyErr_SetString(PyExc_TypeError,
+                "instance exception may not have a separate value");
+            goto bad;
+        }
+        value = type;
+        type = (PyObject*) Py_TYPE(value);
+    } else if (PyExceptionClass_Check(type)) {
+        PyObject *instance_class = NULL;
+        if (value && PyExceptionInstance_Check(value)) {
+            /* `value` is an exception instance: use it directly if its class
+               is `type` or a subclass of it (then raise as that subclass). */
+            instance_class = (PyObject*) Py_TYPE(value);
+            if (instance_class != type) {
+                int is_subclass = PyObject_IsSubclass(instance_class, type);
+                if (!is_subclass) {
+                    instance_class = NULL;
+                } else if (unlikely(is_subclass == -1)) {
+                    goto bad;
+                } else {
+                    type = instance_class;
+                }
+            }
+        }
+        if (!instance_class) {
+            /* No usable instance: build one by calling type(*args), where
+               args is (), `value` itself if it is a tuple, or (value,). */
+            PyObject *args;
+            if (!value)
+                args = PyTuple_New(0);
+            else if (PyTuple_Check(value)) {
+                Py_INCREF(value);
+                args = value;
+            } else
+                args = PyTuple_Pack(1, value);
+            if (!args)
+                goto bad;
+            owned_instance = PyObject_Call(type, args, NULL);
+            Py_DECREF(args);
+            if (!owned_instance)
+                goto bad;
+            value = owned_instance;
+            if (!PyExceptionInstance_Check(value)) {
+                PyErr_Format(PyExc_TypeError,
+                             "calling %R should have returned an instance of "
+                             "BaseException, not %R",
+                             type, Py_TYPE(value));
+                goto bad;
+            }
+        }
+    } else {
+        PyErr_SetString(PyExc_TypeError,
+            "raise: exception class must be a subclass of BaseException");
+        goto bad;
+    }
+    if (cause) {
+        /* Resolve `cause` to an owned instance reference (or NULL for None)
+           and pass it to PyException_SetCause(). */
+        PyObject *fixed_cause;
+        if (cause == Py_None) {
+            fixed_cause = NULL;
+        } else if (PyExceptionClass_Check(cause)) {
+            fixed_cause = PyObject_CallObject(cause, NULL);
+            if (fixed_cause == NULL)
+                goto bad;
+        } else if (PyExceptionInstance_Check(cause)) {
+            fixed_cause = cause;
+            Py_INCREF(fixed_cause);
+        } else {
+            PyErr_SetString(PyExc_TypeError,
+                            "exception causes must derive from "
+                            "BaseException");
+            goto bad;
+        }
+        PyException_SetCause(value, fixed_cause);
+    }
+    PyErr_SetObject(type, value);
+    if (tb) {
+        /* Attach the caller-supplied traceback; the mechanism depends on the
+           Python version and on whether thread-state fields are accessed directly. */
+      #if PY_VERSION_HEX >= 0x030C00A6
+        PyException_SetTraceback(value, tb);
+      #elif CYTHON_FAST_THREAD_STATE
+        PyThreadState *tstate = __Pyx_PyThreadState_Current;
+        PyObject* tmp_tb = tstate->curexc_traceback;
+        if (tb != tmp_tb) {
+            Py_INCREF(tb);
+            tstate->curexc_traceback = tb;
+            Py_XDECREF(tmp_tb);
+        }
+#else
+        PyObject *tmp_type, *tmp_value, *tmp_tb;
+        PyErr_Fetch(&tmp_type, &tmp_value, &tmp_tb);
+        Py_INCREF(tb);
+        PyErr_Restore(tmp_type, tmp_value, tb);
+        Py_XDECREF(tmp_tb);
+#endif
+    }
+bad:
+    /* Reached on success (fall-through) and on error alike: release the
+       instance created above, if any. */
+    Py_XDECREF(owned_instance);
+    return;
+}
+#endif
+
+/* PyFunctionFastCall */
+#if CYTHON_FAST_PYCALL && !CYTHON_VECTORCALL
+/*
+ * Run code object `co` in a fresh frame with `na` positional arguments taken
+ * from the C array `args`, without keyword arguments.
+ *
+ * Returns the result of PyEval_EvalFrameEx() (NULL on error), or NULL if the
+ * frame could not be created.  `globals` must be non-NULL.
+ */
+static PyObject* __Pyx_PyFunction_FastCallNoKw(PyCodeObject *co, PyObject **args, Py_ssize_t na,
+                                               PyObject *globals) {
+    PyFrameObject *f;
+    PyThreadState *tstate = __Pyx_PyThreadState_Current;
+    PyObject **fastlocals;
+    Py_ssize_t i;
+    PyObject *result;
+    assert(globals != NULL);
+    /* XXX Perhaps we should create a specialized
+       PyFrame_New() that doesn't take locals, but does
+       take builtins without sanity checking them.
+       */
+    assert(tstate != NULL);
+    f = PyFrame_New(tstate, co, globals, NULL);
+    if (f == NULL) {
+        return NULL;
+    }
+    /* Copy the arguments into the frame's local slots, one new reference each. */
+    fastlocals = __Pyx_PyFrame_GetLocalsplus(f);
+    for (i = 0; i < na; i++) {
+        Py_INCREF(*args);
+        fastlocals[i] = *args++;
+    }
+    result = PyEval_EvalFrameEx(f,0);
+    /* The recursion depth is bumped around the frame release
+       (presumably to guard deallocation that runs Python code - TODO confirm). */
+    ++tstate->recursion_depth;
+    Py_DECREF(f);
+    --tstate->recursion_depth;
+    return result;
+}
+/*
+ * Call the Python function `func` with `nargs` positional arguments from the
+ * C array `args` and optional keyword arguments in the dict `kwargs`
+ * (may be NULL).
+ *
+ * Uses __Pyx_PyFunction_FastCallNoKw() when there are no keywords and the code
+ * object has exactly the flags CO_OPTIMIZED | CO_NEWLOCALS | CO_NOFREE;
+ * otherwise falls back to PyEval_EvalCodeEx().  The whole call is bracketed by
+ * Py_EnterRecursiveCall()/Py_LeaveRecursiveCall().
+ *
+ * Returns a new reference, or NULL with an exception set.
+ */
+static PyObject *__Pyx_PyFunction_FastCallDict(PyObject *func, PyObject **args, Py_ssize_t nargs, PyObject *kwargs) {
+    PyCodeObject *co = (PyCodeObject *)PyFunction_GET_CODE(func);
+    PyObject *globals = PyFunction_GET_GLOBALS(func);
+    PyObject *argdefs = PyFunction_GET_DEFAULTS(func);
+    PyObject *closure;
+#if PY_MAJOR_VERSION >= 3
+    PyObject *kwdefs;
+#endif
+    PyObject *kwtuple, **k;
+    PyObject **d;
+    Py_ssize_t nd;
+    Py_ssize_t nk;
+    PyObject *result;
+    assert(kwargs == NULL || PyDict_Check(kwargs));
+    nk = kwargs ? PyDict_Size(kwargs) : 0;
+    #if PY_MAJOR_VERSION < 3
+    if (unlikely(Py_EnterRecursiveCall((char*)" while calling a Python object"))) {
+        return NULL;
+    }
+    #else
+    if (unlikely(Py_EnterRecursiveCall(" while calling a Python object"))) {
+        return NULL;
+    }
+    #endif
+    /* Fast path: no keyword arguments and a "plain" code object. */
+    if (
+#if PY_MAJOR_VERSION >= 3
+            co->co_kwonlyargcount == 0 &&
+#endif
+            likely(kwargs == NULL || nk == 0) &&
+            co->co_flags == (CO_OPTIMIZED | CO_NEWLOCALS | CO_NOFREE)) {
+        if (argdefs == NULL && co->co_argcount == nargs) {
+            result = __Pyx_PyFunction_FastCallNoKw(co, args, nargs, globals);
+            goto done;
+        }
+        else if (nargs == 0 && argdefs != NULL
+                 && co->co_argcount == Py_SIZE(argdefs)) {
+            /* function called with no arguments, but all parameters have
+               a default value: use default values as arguments .*/
+            args = &PyTuple_GET_ITEM(argdefs, 0);
+            result =__Pyx_PyFunction_FastCallNoKw(co, args, Py_SIZE(argdefs), globals);
+            goto done;
+        }
+    }
+    /* General path: flatten kwargs into a tuple of alternating key/value
+       entries, as expected by PyEval_EvalCodeEx(). */
+    if (kwargs != NULL) {
+        Py_ssize_t pos, i;
+        kwtuple = PyTuple_New(2 * nk);
+        if (kwtuple == NULL) {
+            result = NULL;
+            goto done;
+        }
+        k = &PyTuple_GET_ITEM(kwtuple, 0);
+        pos = i = 0;
+        while (PyDict_Next(kwargs, &pos, &k[i], &k[i+1])) {
+            /* PyDict_Next() yields borrowed references; the tuple owns its items. */
+            Py_INCREF(k[i]);
+            Py_INCREF(k[i+1]);
+            i += 2;
+        }
+        nk = i / 2;
+    }
+    else {
+        kwtuple = NULL;
+        k = NULL;
+    }
+    closure = PyFunction_GET_CLOSURE(func);
+#if PY_MAJOR_VERSION >= 3
+    kwdefs = PyFunction_GET_KW_DEFAULTS(func);
+#endif
+    /* Positional defaults as a C array plus count. */
+    if (argdefs != NULL) {
+        d = &PyTuple_GET_ITEM(argdefs, 0);
+        nd = Py_SIZE(argdefs);
+    }
+    else {
+        d = NULL;
+        nd = 0;
+    }
+#if PY_MAJOR_VERSION >= 3
+    result = PyEval_EvalCodeEx((PyObject*)co, globals, (PyObject *)NULL,
+                               args, (int)nargs,
+                               k, (int)nk,
+                               d, (int)nd, kwdefs, closure);
+#else
+    result = PyEval_EvalCodeEx(co, globals, (PyObject *)NULL,
+                               args, (int)nargs,
+                               k, (int)nk,
+                               d, (int)nd, closure);
+#endif
+    Py_XDECREF(kwtuple);
+done:
+    /* Every path that passed Py_EnterRecursiveCall() leaves through here. */
+    Py_LeaveRecursiveCall();
+    return result;
+}
+#endif
+
+/* PyObjectCall */
+#if CYTHON_COMPILING_IN_CPYTHON
+/*
+ * Call `func` with the positional tuple `arg` and keyword dict `kw` by
+ * invoking the type's tp_call slot directly.
+ *
+ * Falls back to PyObject_Call() when the type has no tp_call.  Returns a new
+ * reference or NULL; if the callee returned NULL without setting an
+ * exception, a SystemError is set.
+ */
+static CYTHON_INLINE PyObject* __Pyx_PyObject_Call(PyObject *func, PyObject *arg, PyObject *kw) {
+    PyObject *result;
+    ternaryfunc call = Py_TYPE(func)->tp_call;
+    if (unlikely(!call))
+        return PyObject_Call(func, arg, kw);
+    /* Python 2 declares the message parameter as non-const char*, hence the cast. */
+    #if PY_MAJOR_VERSION < 3
+    if (unlikely(Py_EnterRecursiveCall((char*)" while calling a Python object")))
+        return NULL;
+    #else
+    if (unlikely(Py_EnterRecursiveCall(" while calling a Python object")))
+        return NULL;
+    #endif
+    result = (*call)(func, arg, kw);
+    Py_LeaveRecursiveCall();
+    if (unlikely(!result) && unlikely(!PyErr_Occurred())) {
+        PyErr_SetString(
+            PyExc_SystemError,
+            "NULL result without error in PyObject_Call");
+    }
+    return result;
+}
+#endif
+
+/* PyObjectCallMethO */
+#if CYTHON_COMPILING_IN_CPYTHON
+/*
+ * Call the C function wrapped by `func` directly as cfunc(self, arg), under
+ * recursion-depth protection.
+ *
+ * The function pointer and bound `self` are read via the
+ * __Pyx_CyOrPyCFunction_GET_* accessors.  Returns a new reference or NULL; if
+ * the callee returned NULL without setting an exception, a SystemError is set.
+ */
+static CYTHON_INLINE PyObject* __Pyx_PyObject_CallMethO(PyObject *func, PyObject *arg) {
+    PyObject *self, *result;
+    PyCFunction cfunc;
+    cfunc = __Pyx_CyOrPyCFunction_GET_FUNCTION(func);
+    self = __Pyx_CyOrPyCFunction_GET_SELF(func);
+    /* Python 2 declares the message parameter as non-const char*, hence the cast. */
+    #if PY_MAJOR_VERSION < 3
+    if (unlikely(Py_EnterRecursiveCall((char*)" while calling a Python object")))
+        return NULL;
+    #else
+    if (unlikely(Py_EnterRecursiveCall(" while calling a Python object")))
+        return NULL;
+    #endif
+    result = cfunc(self, arg);
+    Py_LeaveRecursiveCall();
+    if (unlikely(!result) && unlikely(!PyErr_Occurred())) {
+        PyErr_SetString(
+            PyExc_SystemError,
+            "NULL result without error in PyObject_Call");
+    }
+    return result;
+}
+#endif
+
+/* PyObjectFastCall */
+#if PY_VERSION_HEX < 0x03090000 || CYTHON_COMPILING_IN_LIMITED_API
+/*
+ * Generic call path: pack the `nargs` entries of the C array `args` into a
+ * new tuple and dispatch through __Pyx_PyObject_Call() with `kwargs`.
+ * Returns a new reference, or NULL on failure.
+ */
+static PyObject* __Pyx_PyObject_FastCall_fallback(PyObject *func, PyObject **args, size_t nargs, PyObject *kwargs) {
+    PyObject *packed;
+    PyObject *retval = NULL;
+    size_t idx = 0;
+    packed = PyTuple_New((Py_ssize_t)nargs);
+    if (unlikely(!packed)) return NULL;
+    while (idx < nargs) {
+        /* The tuple takes over the reference acquired here. */
+        Py_INCREF(args[idx]);
+        if (__Pyx_PyTuple_SET_ITEM(packed, (Py_ssize_t)idx, args[idx]) < 0) goto cleanup;
+        idx++;
+    }
+    retval = __Pyx_PyObject_Call(func, packed, kwargs);
+  cleanup:
+    Py_DECREF(packed);
+    return retval;
+}
+#endif
+/*
+ * Call `func` with positional arguments from the C array `args` and optional
+ * keyword dict `kwargs` (may be NULL), choosing the cheapest call mechanism
+ * that the build configuration and the callee allow.
+ *
+ * `_nargs` is passed through __Pyx_PyVectorcall_NARGS() to obtain the actual
+ * argument count (presumably stripping a vectorcall offset flag - TODO confirm
+ * against the macro definition).
+ *
+ * Returns a new reference, or NULL with an exception set.
+ */
+static CYTHON_INLINE PyObject* __Pyx_PyObject_FastCallDict(PyObject *func, PyObject **args, size_t _nargs, PyObject *kwargs) {
+    Py_ssize_t nargs = __Pyx_PyVectorcall_NARGS(_nargs);
+#if CYTHON_COMPILING_IN_CPYTHON
+    /* C functions flagged METH_NOARGS / METH_O are called directly. */
+    if (nargs == 0 && kwargs == NULL) {
+        if (__Pyx_CyOrPyCFunction_Check(func) && likely( __Pyx_CyOrPyCFunction_GET_FLAGS(func) & METH_NOARGS))
+            return __Pyx_PyObject_CallMethO(func, NULL);
+    }
+    else if (nargs == 1 && kwargs == NULL) {
+        if (__Pyx_CyOrPyCFunction_Check(func) && likely( __Pyx_CyOrPyCFunction_GET_FLAGS(func) & METH_O))
+            return __Pyx_PyObject_CallMethO(func, args[0]);
+    }
+#endif
+    /* Before Python 3.8b1: use the private fast-call entry points. */
+    #if PY_VERSION_HEX < 0x030800B1
+    #if CYTHON_FAST_PYCCALL
+    if (PyCFunction_Check(func)) {
+        if (kwargs) {
+            return _PyCFunction_FastCallDict(func, args, nargs, kwargs);
+        } else {
+            return _PyCFunction_FastCallKeywords(func, args, nargs, NULL);
+        }
+    }
+    #if PY_VERSION_HEX >= 0x030700A1
+    if (!kwargs && __Pyx_IS_TYPE(func, &PyMethodDescr_Type)) {
+        return _PyMethodDescr_FastCallKeywords(func, args, nargs, NULL);
+    }
+    #endif
+    #endif
+    #if CYTHON_FAST_PYCALL
+    if (PyFunction_Check(func)) {
+        return __Pyx_PyFunction_FastCallDict(func, args, nargs, kwargs);
+    }
+    #endif
+    #endif
+    /* Without keywords, use the callee's vectorcall function if it has one. */
+    if (kwargs == NULL) {
+        #if CYTHON_VECTORCALL
+        #if PY_VERSION_HEX < 0x03090000
+        vectorcallfunc f = _PyVectorcall_Function(func);
+        #else
+        vectorcallfunc f = PyVectorcall_Function(func);
+        #endif
+        if (f) {
+            return f(func, args, (size_t)nargs, NULL);
+        }
+        #elif defined(__Pyx_CyFunction_USED) && CYTHON_BACKPORT_VECTORCALL
+        if (__Pyx_CyFunction_CheckExact(func)) {
+            __pyx_vectorcallfunc f = __Pyx_CyFunction_func_vectorcall(func);
+            if (f) return f(func, args, (size_t)nargs, NULL);
+        }
+        #endif
+    }
+    /* No positional arguments: reuse the shared empty tuple. */
+    if (nargs == 0) {
+        return __Pyx_PyObject_Call(func, __pyx_empty_tuple, kwargs);
+    }
+    #if PY_VERSION_HEX >= 0x03090000 && !CYTHON_COMPILING_IN_LIMITED_API
+    return PyObject_VectorcallDict(func, args, (size_t)nargs, kwargs);
+    #else
+    return __Pyx_PyObject_FastCall_fallback(func, args, (size_t)nargs, kwargs);
+    #endif
+}
+
+/* RaiseUnexpectedTypeError */
+/*
+ * Set a TypeError reading "Expected <expected>, got <type name of obj>".
+ * Always returns 0.
+ */
+static int __Pyx_RaiseUnexpectedTypeError(const char *expected, PyObject *obj) {
+    __Pyx_TypeName actual_name;
+    actual_name = __Pyx_PyType_GetName(Py_TYPE(obj));
+    PyErr_Format(
+        PyExc_TypeError,
+        "Expected %s, got " __Pyx_FMT_TYPENAME,
+        expected,
+        actual_name);
+    __Pyx_DECREF_TypeName(actual_name);
+    return 0;
+}
+
+/* CIntToDigits */
+/* Lookup table: entry n (0..99) is the two ASCII characters at offset 2*n,
+   i.e. the zero-padded two-digit decimal rendering of n. */
+static const char DIGIT_PAIRS_10[2*10*10+1] = {
+    "00010203040506070809"
+    "10111213141516171819"
+    "20212223242526272829"
+    "30313233343536373839"
+    "40414243444546474849"
+    "50515253545556575859"
+    "60616263646566676869"
+    "70717273747576777879"
+    "80818283848586878889"
+    "90919293949596979899"
+};
+/* Lookup table: entry n (0..63) is the two ASCII characters at offset 2*n,
+   i.e. the zero-padded two-digit octal rendering of n. */
+static const char DIGIT_PAIRS_8[2*8*8+1] = {
+    "0001020304050607"
+    "1011121314151617"
+    "2021222324252627"
+    "3031323334353637"
+    "4041424344454647"
+    "5051525354555657"
+    "6061626364656667"
+    "7071727374757677"
+};
+/* Hex digit characters: lowercase at offsets 0..15, uppercase at 16..31. */
+static const char DIGITS_HEX[2*16+1] = {
+    "0123456789abcdef"
+    "0123456789ABCDEF"
+};
+
+/* BuildPyUnicode */
+/*
+ * Build a unicode string of total length `ulength` from the `clength` ASCII
+ * characters in `chars`, right-aligned.
+ *
+ * The leading `ulength - clength` positions are filled with `padding_char`;
+ * if `prepend_sign` is non-zero, the first of those positions holds '-'
+ * instead.
+ *
+ * Returns a new reference, or NULL on failure.
+ */
+static PyObject* __Pyx_PyUnicode_BuildFromAscii(Py_ssize_t ulength, char* chars, int clength,
+                                                int prepend_sign, char padding_char) {
+    PyObject *uval;
+    /* Number of characters that precede the digits (sign + padding). */
+    Py_ssize_t uoffset = ulength - clength;
+#if CYTHON_USE_UNICODE_INTERNALS
+    /* Direct path: allocate the string and write the characters in place. */
+    Py_ssize_t i;
+#if CYTHON_PEP393_ENABLED
+    void *udata;
+    uval = PyUnicode_New(ulength, 127);
+    if (unlikely(!uval)) return NULL;
+    udata = PyUnicode_DATA(uval);
+#else
+    Py_UNICODE *udata;
+    uval = PyUnicode_FromUnicode(NULL, ulength);
+    if (unlikely(!uval)) return NULL;
+    udata = PyUnicode_AS_UNICODE(uval);
+#endif
+    if (uoffset > 0) {
+        i = 0;
+        if (prepend_sign) {
+            __Pyx_PyUnicode_WRITE(PyUnicode_1BYTE_KIND, udata, 0, '-');
+            i++;
+        }
+        for (; i < uoffset; i++) {
+            __Pyx_PyUnicode_WRITE(PyUnicode_1BYTE_KIND, udata, i, padding_char);
+        }
+    }
+    for (i=0; i < clength; i++) {
+        __Pyx_PyUnicode_WRITE(PyUnicode_1BYTE_KIND, udata, uoffset+i, chars[i]);
+    }
+#else
+    /* Portable path: compose sign + padding + digits from separate objects. */
+    {
+        PyObject *sign = NULL, *padding = NULL;
+        uval = NULL;
+        if (uoffset > 0) {
+            prepend_sign = !!prepend_sign;
+            if (uoffset > prepend_sign) {
+                padding = PyUnicode_FromOrdinal(padding_char);
+                if (likely(padding) && uoffset > prepend_sign + 1) {
+                    /* More than one padding character: repeat the single one. */
+                    PyObject *tmp;
+                    PyObject *repeat = PyInt_FromSsize_t(uoffset - prepend_sign);
+                    if (unlikely(!repeat)) goto done_or_error;
+                    tmp = PyNumber_Multiply(padding, repeat);
+                    Py_DECREF(repeat);
+                    Py_DECREF(padding);
+                    padding = tmp;
+                }
+                if (unlikely(!padding)) goto done_or_error;
+            }
+            if (prepend_sign) {
+                sign = PyUnicode_FromOrdinal('-');
+                if (unlikely(!sign)) goto done_or_error;
+            }
+        }
+        uval = PyUnicode_DecodeASCII(chars, clength, NULL);
+        if (likely(uval) && padding) {
+            PyObject *tmp = PyNumber_Add(padding, uval);
+            Py_DECREF(uval);
+            uval = tmp;
+        }
+        if (likely(uval) && sign) {
+            PyObject *tmp = PyNumber_Add(sign, uval);
+            Py_DECREF(uval);
+            uval = tmp;
+        }
+done_or_error:
+        /* Shared exit for success and failure; uval is NULL on failure. */
+        Py_XDECREF(padding);
+        Py_XDECREF(sign);
+    }
+#endif
+    return uval;
+}
+
+/* CIntToPyUnicode */
+/*
+ * Format the C int `value` as a unicode string.
+ *
+ *   width        - minimum length of the result
+ *   padding_char - character used to pad up to `width`
+ *   format_char  - 'd' (decimal), 'o' (octal), 'x' or 'X' (hex, lower/upper)
+ *
+ * Digits are generated from the end of a stack buffer backwards; the final
+ * string is built by __Pyx_PyUnicode_BuildFromAscii() (or
+ * PyUnicode_FromOrdinal() for a single character).  Returns a new reference,
+ * or NULL on failure.
+ */
+static CYTHON_INLINE PyObject* __Pyx_PyUnicode_From_int(int value, Py_ssize_t width, char padding_char, char format_char) {
+    char digits[sizeof(int)*3+2];
+    char *dpos, *end = digits + sizeof(int)*3+2;
+    const char *hex_digits = DIGITS_HEX;
+    Py_ssize_t length, ulength;
+    int prepend_sign, last_one_off;
+    int remaining;
+#ifdef __Pyx_HAS_GCC_DIAGNOSTIC
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wconversion"
+#endif
+    const int neg_one = (int) -1, const_zero = (int) 0;
+#ifdef __Pyx_HAS_GCC_DIAGNOSTIC
+#pragma GCC diagnostic pop
+#endif
+    /* Signedness probe on the value type: true only if -1 converts to a
+       value greater than 0. */
+    const int is_unsigned = neg_one > const_zero;
+    if (format_char == 'X') {
+        /* Switch to the uppercase half of DIGITS_HEX and share the 'x' case. */
+        hex_digits += 16;
+        format_char = 'x';
+    }
+    remaining = value;
+    last_one_off = 0;
+    dpos = end;
+    do {
+        int digit_pos;
+        switch (format_char) {
+        case 'o':
+            /* Two octal digits per iteration via the pair table. */
+            digit_pos = abs((int)(remaining % (8*8)));
+            remaining = (int) (remaining / (8*8));
+            dpos -= 2;
+            memcpy(dpos, DIGIT_PAIRS_8 + digit_pos * 2, 2);
+            last_one_off = (digit_pos < 8);
+            break;
+        case 'd':
+            /* Two decimal digits per iteration via the pair table. */
+            digit_pos = abs((int)(remaining % (10*10)));
+            remaining = (int) (remaining / (10*10));
+            dpos -= 2;
+            memcpy(dpos, DIGIT_PAIRS_10 + digit_pos * 2, 2);
+            last_one_off = (digit_pos < 10);
+            break;
+        case 'x':
+            *(--dpos) = hex_digits[abs((int)(remaining % 16))];
+            remaining = (int) (remaining / 16);
+            break;
+        default:
+            assert(0);
+            break;
+        }
+    } while (unlikely(remaining != 0));
+    /* If the last pair written had a leading '0', skip that character. */
+    assert(!last_one_off || *dpos == '0');
+    dpos += last_one_off;
+    length = end - dpos;
+    ulength = length;
+    prepend_sign = 0;
+    if (!is_unsigned && value <= neg_one) {
+        /* Negative value: either put '-' directly in front of the digits, or
+           ask the builder to place it before the padding. */
+        if (padding_char == ' ' || width <= length + 1) {
+            *(--dpos) = '-';
+            ++length;
+        } else {
+            prepend_sign = 1;
+        }
+        ++ulength;
+    }
+    if (width > ulength) {
+        ulength = width;
+    }
+    if (ulength == 1) {
+        return PyUnicode_FromOrdinal(*dpos);
+    }
+    return __Pyx_PyUnicode_BuildFromAscii(ulength, dpos, (int) length, prepend_sign, padding_char);
+}
+
+/* CIntToPyUnicode */
+/*
+ * Format the Py_ssize_t `value` as a unicode string.
+ *
+ *   width        - minimum length of the result
+ *   padding_char - character used to pad up to `width`
+ *   format_char  - 'd' (decimal), 'o' (octal), 'x' or 'X' (hex, lower/upper)
+ *
+ * Digits are generated from the end of a stack buffer backwards; the final
+ * string is built by __Pyx_PyUnicode_BuildFromAscii() (or
+ * PyUnicode_FromOrdinal() for a single character).  Returns a new reference,
+ * or NULL on failure.
+ */
+static CYTHON_INLINE PyObject* __Pyx_PyUnicode_From_Py_ssize_t(Py_ssize_t value, Py_ssize_t width, char padding_char, char format_char) {
+    char digits[sizeof(Py_ssize_t)*3+2];
+    char *dpos, *end = digits + sizeof(Py_ssize_t)*3+2;
+    const char *hex_digits = DIGITS_HEX;
+    Py_ssize_t length, ulength;
+    int prepend_sign, last_one_off;
+    Py_ssize_t remaining;
+#ifdef __Pyx_HAS_GCC_DIAGNOSTIC
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wconversion"
+#endif
+    const Py_ssize_t neg_one = (Py_ssize_t) -1, const_zero = (Py_ssize_t) 0;
+#ifdef __Pyx_HAS_GCC_DIAGNOSTIC
+#pragma GCC diagnostic pop
+#endif
+    /* Signedness probe on the value type: true only if -1 converts to a
+       value greater than 0. */
+    const int is_unsigned = neg_one > const_zero;
+    if (format_char == 'X') {
+        /* Switch to the uppercase half of DIGITS_HEX and share the 'x' case. */
+        hex_digits += 16;
+        format_char = 'x';
+    }
+    remaining = value;
+    last_one_off = 0;
+    dpos = end;
+    do {
+        int digit_pos;
+        switch (format_char) {
+        case 'o':
+            /* Two octal digits per iteration via the pair table. */
+            digit_pos = abs((int)(remaining % (8*8)));
+            remaining = (Py_ssize_t) (remaining / (8*8));
+            dpos -= 2;
+            memcpy(dpos, DIGIT_PAIRS_8 + digit_pos * 2, 2);
+            last_one_off = (digit_pos < 8);
+            break;
+        case 'd':
+            /* Two decimal digits per iteration via the pair table. */
+            digit_pos = abs((int)(remaining % (10*10)));
+            remaining = (Py_ssize_t) (remaining / (10*10));
+            dpos -= 2;
+            memcpy(dpos, DIGIT_PAIRS_10 + digit_pos * 2, 2);
+            last_one_off = (digit_pos < 10);
+            break;
+        case 'x':
+            *(--dpos) = hex_digits[abs((int)(remaining % 16))];
+            remaining = (Py_ssize_t) (remaining / 16);
+            break;
+        default:
+            assert(0);
+            break;
+        }
+    } while (unlikely(remaining != 0));
+    /* If the last pair written had a leading '0', skip that character. */
+    assert(!last_one_off || *dpos == '0');
+    dpos += last_one_off;
+    length = end - dpos;
+    ulength = length;
+    prepend_sign = 0;
+    if (!is_unsigned && value <= neg_one) {
+        /* Negative value: either put '-' directly in front of the digits, or
+           ask the builder to place it before the padding. */
+        if (padding_char == ' ' || width <= length + 1) {
+            *(--dpos) = '-';
+            ++length;
+        } else {
+            prepend_sign = 1;
+        }
+        ++ulength;
+    }
+    if (width > ulength) {
+        ulength = width;
+    }
+    if (ulength == 1) {
+        return PyUnicode_FromOrdinal(*dpos);
+    }
+    return __Pyx_PyUnicode_BuildFromAscii(ulength, dpos, (int) length, prepend_sign, padding_char);
+}
+
+/* JoinPyUnicode */
+/*
+ * Concatenate the first `value_count` unicode items of `value_tuple`.
+ *
+ *   result_ulength - length used to allocate the result string
+ *   max_char       - maximum code point, used to pick the result's storage kind
+ *
+ * With unicode internals available, the result is preallocated and each item
+ * is copied into place; otherwise the call is delegated to PyUnicode_Join()
+ * with an empty separator.  Returns a new reference, or NULL with an
+ * exception set.
+ */
+static PyObject* __Pyx_PyUnicode_Join(PyObject* value_tuple, Py_ssize_t value_count, Py_ssize_t result_ulength,
+                                      Py_UCS4 max_char) {
+#if CYTHON_USE_UNICODE_INTERNALS && CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS
+    PyObject *result_uval;
+    int result_ukind, kind_shift;
+    Py_ssize_t i, char_pos;
+    void *result_udata;
+    CYTHON_MAYBE_UNUSED_VAR(max_char);
+#if CYTHON_PEP393_ENABLED
+    result_uval = PyUnicode_New(result_ulength, max_char);
+    if (unlikely(!result_uval)) return NULL;
+    result_ukind = (max_char <= 255) ? PyUnicode_1BYTE_KIND : (max_char <= 65535) ? PyUnicode_2BYTE_KIND : PyUnicode_4BYTE_KIND;
+    /* kind_shift is log2 of the per-character byte width (0, 1 or 2). */
+    kind_shift = (result_ukind == PyUnicode_4BYTE_KIND) ? 2 : result_ukind - 1;
+    result_udata = PyUnicode_DATA(result_uval);
+#else
+    result_uval = PyUnicode_FromUnicode(NULL, result_ulength);
+    if (unlikely(!result_uval)) return NULL;
+    result_ukind = sizeof(Py_UNICODE);
+    kind_shift = (result_ukind == 4) ? 2 : result_ukind - 1;
+    result_udata = PyUnicode_AS_UNICODE(result_uval);
+#endif
+    assert(kind_shift == 2 || kind_shift == 1 || kind_shift == 0);
+    char_pos = 0;
+    for (i=0; i < value_count; i++) {
+        int ukind;
+        Py_ssize_t ulength;
+        void *udata;
+        PyObject *uval = PyTuple_GET_ITEM(value_tuple, i);
+        if (unlikely(__Pyx_PyUnicode_READY(uval)))
+            goto bad;
+        ulength = __Pyx_PyUnicode_GET_LENGTH(uval);
+        if (unlikely(!ulength))
+            continue;
+        /* Guard against overflow of the byte offset computed below. */
+        if (unlikely((PY_SSIZE_T_MAX >> kind_shift) - ulength < char_pos))
+            goto overflow;
+        ukind = __Pyx_PyUnicode_KIND(uval);
+        udata = __Pyx_PyUnicode_DATA(uval);
+        if (!CYTHON_PEP393_ENABLED || ukind == result_ukind) {
+            /* Same character width: raw byte copy. */
+            memcpy((char *)result_udata + (char_pos << kind_shift), udata, (size_t) (ulength << kind_shift));
+        } else {
+            /* Different character width: copy with conversion. */
+            #if PY_VERSION_HEX >= 0x030d0000
+            if (unlikely(PyUnicode_CopyCharacters(result_uval, char_pos, uval, 0, ulength) < 0)) goto bad;
+            #elif CYTHON_COMPILING_IN_CPYTHON && PY_VERSION_HEX >= 0x030300F0 || defined(_PyUnicode_FastCopyCharacters)
+            _PyUnicode_FastCopyCharacters(result_uval, char_pos, uval, 0, ulength);
+            #else
+            Py_ssize_t j;
+            for (j=0; j < ulength; j++) {
+                Py_UCS4 uchar = __Pyx_PyUnicode_READ(ukind, udata, j);
+                __Pyx_PyUnicode_WRITE(result_ukind, result_udata, char_pos+j, uchar);
+            }
+            #endif
+        }
+        char_pos += ulength;
+    }
+    return result_uval;
+overflow:
+    PyErr_SetString(PyExc_OverflowError, "join() result is too long for a Python string");
+bad:
+    Py_DECREF(result_uval);
+    return NULL;
+#else
+    CYTHON_UNUSED_VAR(max_char);
+    CYTHON_UNUSED_VAR(result_ulength);
+    CYTHON_UNUSED_VAR(value_count);
+    return PyUnicode_Join(__pyx_empty_unicode, value_tuple);
+#endif
+}
+
+/* GetAttr */
+/*
+ * Look up attribute `n` on object `o`.
+ *
+ * When type slots are used and `n` is a string object, the lookup goes
+ * through __Pyx_PyObject_GetAttrStr(); otherwise through PyObject_GetAttr().
+ * Returns whatever the chosen lookup returns.
+ */
+static CYTHON_INLINE PyObject *__Pyx_GetAttr(PyObject *o, PyObject *n) {
+#if CYTHON_USE_TYPE_SLOTS
+    /* The `if` below guards only the return statement that follows the #endif. */
+#if PY_MAJOR_VERSION >= 3
+    if (likely(PyUnicode_Check(n)))
+#else
+    if (likely(PyString_Check(n)))
+#endif
+        return __Pyx_PyObject_GetAttrStr(o, n);
+#endif
+    return PyObject_GetAttr(o, n);
+}
+
+/* GetItemInt */
+/*
+ * Generic subscript: return o[j] via PyObject_GetItem().
+ * Consumes the reference to `j`; a NULL `j` yields NULL without any lookup.
+ */
+static PyObject *__Pyx_GetItemInt_Generic(PyObject *o, PyObject* j) {
+    PyObject *item = NULL;
+    if (likely(j)) {
+        item = PyObject_GetItem(o, j);
+        Py_DECREF(j);
+    }
+    return item;
+}
+/*
+ * Return a new reference to item `i` of the list `o`.
+ *
+ *   wraparound  - non-zero: a negative `i` is offset by the list size
+ *   boundscheck - non-zero: validate the index before direct access
+ *
+ * If the bounds check fails, the lookup is retried through
+ * __Pyx_GetItemInt_Generic() with the original index.  Without safe macros or
+ * with borrowed references avoided, PySequence_GetItem() is used instead.
+ */
+static CYTHON_INLINE PyObject *__Pyx_GetItemInt_List_Fast(PyObject *o, Py_ssize_t i,
+                                                              CYTHON_NCP_UNUSED int wraparound,
+                                                              CYTHON_NCP_UNUSED int boundscheck) {
+#if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS
+    Py_ssize_t wrapped_i = i;
+    /* Bitwise & rather than &&: both operands are evaluated. */
+    if (wraparound & unlikely(i < 0)) {
+        wrapped_i += PyList_GET_SIZE(o);
+    }
+    if ((!boundscheck) || likely(__Pyx_is_valid_index(wrapped_i, PyList_GET_SIZE(o)))) {
+        PyObject *r = PyList_GET_ITEM(o, wrapped_i);
+        Py_INCREF(r);
+        return r;
+    }
+    return __Pyx_GetItemInt_Generic(o, PyInt_FromSsize_t(i));
+#else
+    return PySequence_GetItem(o, i);
+#endif
+}
+/*
+ * Return a new reference to item `i` of the tuple `o`.
+ *
+ *   wraparound  - non-zero: a negative `i` is offset by the tuple size
+ *   boundscheck - non-zero: validate the index before direct access
+ *
+ * If the bounds check fails, the lookup is retried through
+ * __Pyx_GetItemInt_Generic() with the original index.  Without safe macros or
+ * with borrowed references avoided, PySequence_GetItem() is used instead.
+ */
+static CYTHON_INLINE PyObject *__Pyx_GetItemInt_Tuple_Fast(PyObject *o, Py_ssize_t i,
+                                                              CYTHON_NCP_UNUSED int wraparound,
+                                                              CYTHON_NCP_UNUSED int boundscheck) {
+#if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS
+    Py_ssize_t wrapped_i = i;
+    /* Bitwise & rather than &&: both operands are evaluated. */
+    if (wraparound & unlikely(i < 0)) {
+        wrapped_i += PyTuple_GET_SIZE(o);
+    }
+    if ((!boundscheck) || likely(__Pyx_is_valid_index(wrapped_i, PyTuple_GET_SIZE(o)))) {
+        PyObject *r = PyTuple_GET_ITEM(o, wrapped_i);
+        Py_INCREF(r);
+        return r;
+    }
+    return __Pyx_GetItemInt_Generic(o, PyInt_FromSsize_t(i));
+#else
+    return PySequence_GetItem(o, i);
+#endif
+}
+/*
+ * Return a new reference to o[i] for an arbitrary object `o` and C index `i`.
+ *
+ *   is_list     - non-zero: treat `o` as a list without checking its type
+ *   wraparound  - non-zero: a negative `i` is offset by the container length
+ *   boundscheck - non-zero: validate the index before direct list/tuple access
+ *
+ * Exact lists and tuples are indexed directly; other objects go through their
+ * mp_subscript or sq_item slot.  Anything not handled there falls through to
+ * __Pyx_GetItemInt_Generic().  Returns NULL with an exception set on failure.
+ */
+static CYTHON_INLINE PyObject *__Pyx_GetItemInt_Fast(PyObject *o, Py_ssize_t i, int is_list,
+                                                     CYTHON_NCP_UNUSED int wraparound,
+                                                     CYTHON_NCP_UNUSED int boundscheck) {
+#if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS && CYTHON_USE_TYPE_SLOTS
+    if (is_list || PyList_CheckExact(o)) {
+        Py_ssize_t n = ((!wraparound) | likely(i >= 0)) ? i : i + PyList_GET_SIZE(o);
+        if ((!boundscheck) || (likely(__Pyx_is_valid_index(n, PyList_GET_SIZE(o))))) {
+            PyObject *r = PyList_GET_ITEM(o, n);
+            Py_INCREF(r);
+            return r;
+        }
+    }
+    else if (PyTuple_CheckExact(o)) {
+        Py_ssize_t n = ((!wraparound) | likely(i >= 0)) ? i : i + PyTuple_GET_SIZE(o);
+        if ((!boundscheck) || likely(__Pyx_is_valid_index(n, PyTuple_GET_SIZE(o)))) {
+            PyObject *r = PyTuple_GET_ITEM(o, n);
+            Py_INCREF(r);
+            return r;
+        }
+    } else {
+        PyMappingMethods *mm = Py_TYPE(o)->tp_as_mapping;
+        PySequenceMethods *sm = Py_TYPE(o)->tp_as_sequence;
+        /* The mapping slot takes precedence and receives the index as an object. */
+        if (mm && mm->mp_subscript) {
+            PyObject *r, *key = PyInt_FromSsize_t(i);
+            if (unlikely(!key)) return NULL;
+            r = mm->mp_subscript(o, key);
+            Py_DECREF(key);
+            return r;
+        }
+        if (likely(sm && sm->sq_item)) {
+            if (wraparound && unlikely(i < 0) && likely(sm->sq_length)) {
+                Py_ssize_t l = sm->sq_length(o);
+                if (likely(l >= 0)) {
+                    i += l;
+                } else {
+                    /* A failing sq_length is tolerated only for OverflowError:
+                       the error is cleared and the negative index is passed on unchanged. */
+                    if (!PyErr_ExceptionMatches(PyExc_OverflowError))
+                        return NULL;
+                    PyErr_Clear();
+                }
+            }
+            return sm->sq_item(o, i);
+        }
+    }
+#else
+    if (is_list || !PyMapping_Check(o)) {
+        return PySequence_GetItem(o, i);
+    }
+#endif
+    /* Reached when a list/tuple bounds check failed or no slot applied. */
+    return __Pyx_GetItemInt_Generic(o, PyInt_FromSsize_t(i));
+}
+
+/* PyObjectCallOneArg */
+/*
+ * Call `func` with the single positional argument `arg`.
+ * The argument sits at index 1 of a two-slot array so that the slot in front
+ * of it is available, as signalled by __Pyx_PY_VECTORCALL_ARGUMENTS_OFFSET.
+ */
+static CYTHON_INLINE PyObject* __Pyx_PyObject_CallOneArg(PyObject *func, PyObject *arg) {
+    PyObject *call_args[2];
+    call_args[0] = NULL;
+    call_args[1] = arg;
+    return __Pyx_PyObject_FastCall(func, call_args + 1, 1 | __Pyx_PY_VECTORCALL_ARGUMENTS_OFFSET);
+}
+
+/* ObjectGetItem */
+#if CYTHON_USE_TYPE_SLOTS
+/*
+ * Subscript `obj` with `index` after converting it to a Py_ssize_t.
+ * A conversion that fails with OverflowError is re-reported as IndexError;
+ * any other conversion error is left in place.  Returns NULL on error.
+ */
+static PyObject *__Pyx_PyObject_GetIndex(PyObject *obj, PyObject *index) {
+    PyObject *pending_error = NULL;
+    Py_ssize_t position = __Pyx_PyIndex_AsSsize_t(index);
+    /* -1 is ambiguous: it is a real index unless an exception is pending. */
+    if (unlikely(position == -1)) {
+        pending_error = PyErr_Occurred();
+    }
+    if (likely(!pending_error)) {
+        return __Pyx_GetItemInt_Fast(obj, position, 0, 1, 1);
+    }
+    if (PyErr_GivenExceptionMatches(pending_error, PyExc_OverflowError)) {
+        __Pyx_TypeName index_type_name = __Pyx_PyType_GetName(Py_TYPE(index));
+        PyErr_Clear();
+        PyErr_Format(PyExc_IndexError,
+            "cannot fit '" __Pyx_FMT_TYPENAME "' into an index-sized integer", index_type_name);
+        __Pyx_DECREF_TypeName(index_type_name);
+    }
+    return NULL;
+}
+/*
+ * Last-resort subscript: if `obj` is a type, look up the attribute named by
+ * __pyx_n_s_class_getitem on it and, if found, call it with `key`.
+ * Otherwise set a "not subscriptable" TypeError and return NULL.
+ */
+static PyObject *__Pyx_PyObject_GetItem_Slow(PyObject *obj, PyObject *key) {
+    __Pyx_TypeName obj_type_name;
+    if (likely(PyType_Check(obj))) {
+        PyObject *class_getitem = __Pyx_PyObject_GetAttrStrNoError(obj, __pyx_n_s_class_getitem);
+        if (class_getitem) {
+            PyObject *item = __Pyx_PyObject_CallOneArg(class_getitem, key);
+            Py_DECREF(class_getitem);
+            return item;
+        }
+        /* Lookup returned NULL: clear any pending error and report the TypeError below. */
+        PyErr_Clear();
+    }
+    obj_type_name = __Pyx_PyType_GetName(Py_TYPE(obj));
+    PyErr_Format(PyExc_TypeError,
+        "'" __Pyx_FMT_TYPENAME "' object is not subscriptable", obj_type_name);
+    __Pyx_DECREF_TypeName(obj_type_name);
+    return NULL;
+}
+/*
+ * Return obj[key], dispatching on the type's slots: mp_subscript first, then
+ * the sequence protocol (via an integer index), then the slow path.
+ */
+static PyObject *__Pyx_PyObject_GetItem(PyObject *obj, PyObject *key) {
+    PyTypeObject *obj_type = Py_TYPE(obj);
+    PyMappingMethods *as_mapping = obj_type->tp_as_mapping;
+    PySequenceMethods *as_sequence = obj_type->tp_as_sequence;
+    if (likely(as_mapping && as_mapping->mp_subscript)) {
+        return as_mapping->mp_subscript(obj, key);
+    }
+    if (unlikely(!(as_sequence && as_sequence->sq_item))) {
+        return __Pyx_PyObject_GetItem_Slow(obj, key);
+    }
+    return __Pyx_PyObject_GetIndex(obj, key);
+}
+#endif
+
+/* KeywordStringCheck */
+/*
+ * Validate the keyword arguments passed to `function_name`.
+ *
+ *   kw         - keyword container: a dict, or (when CYTHON_METH_FASTCALL is
+ *                set) possibly a tuple of keyword names
+ *   kw_allowed - non-zero if the function accepts arbitrary keywords
+ *
+ * Returns 1 if the keywords are acceptable.  Returns 0 with a TypeError set
+ * if a key is not a string or if a keyword is present while none are allowed;
+ * also returns 0 if a tuple access fails in the non-macro code path.
+ */
+static int __Pyx_CheckKeywordStrings(
+    PyObject *kw,
+    const char* function_name,
+    int kw_allowed)
+{
+    PyObject* key = 0;
+    Py_ssize_t pos = 0;
+#if CYTHON_COMPILING_IN_PYPY
+    /* PyPy: only check for the presence of a keyword; no key type check. */
+    if (!kw_allowed && PyDict_Next(kw, &pos, &key, 0))
+        goto invalid_keyword;
+    return 1;
+#else
+    if (CYTHON_METH_FASTCALL && likely(PyTuple_Check(kw))) {
+        /* Fastcall convention: `kw` is a tuple of keyword names. */
+        Py_ssize_t kwsize;
+#if CYTHON_ASSUME_SAFE_MACROS
+        kwsize = PyTuple_GET_SIZE(kw);
+#else
+        kwsize = PyTuple_Size(kw);
+        if (kwsize < 0) return 0;
+#endif
+        if (unlikely(kwsize == 0))
+            return 1;
+        if (!kw_allowed) {
+            /* Report the first keyword name as the unexpected one. */
+#if CYTHON_ASSUME_SAFE_MACROS
+            key = PyTuple_GET_ITEM(kw, 0);
+#else
+            key = PyTuple_GetItem(kw, pos);
+            if (!key) return 0;
+#endif
+            goto invalid_keyword;
+        }
+        /* Key types are checked here only for Python versions before 3.9. */
+#if PY_VERSION_HEX < 0x03090000
+        for (pos = 0; pos < kwsize; pos++) {
+#if CYTHON_ASSUME_SAFE_MACROS
+            key = PyTuple_GET_ITEM(kw, pos);
+#else
+            key = PyTuple_GetItem(kw, pos);
+            if (!key) return 0;
+#endif
+            if (unlikely(!PyUnicode_Check(key)))
+                goto invalid_keyword_type;
+        }
+#endif
+        return 1;
+    }
+    /* Dict case: every key must be a string (str or unicode on Python 2). */
+    while (PyDict_Next(kw, &pos, &key, 0)) {
+        #if PY_MAJOR_VERSION < 3
+        if (unlikely(!PyString_Check(key)))
+        #endif
+            if (unlikely(!PyUnicode_Check(key)))
+                goto invalid_keyword_type;
+    }
+    /* `key` is non-NULL here iff the dict had at least one entry. */
+    if (!kw_allowed && unlikely(key))
+        goto invalid_keyword;
+    return 1;
+invalid_keyword_type:
+    PyErr_Format(PyExc_TypeError,
+        "%.200s() keywords must be strings", function_name);
+    return 0;
+#endif
+invalid_keyword:
+    #if PY_MAJOR_VERSION < 3
+    PyErr_Format(PyExc_TypeError,
+        "%.200s() got an unexpected keyword argument '%.200s'",
+        function_name, PyString_AsString(key));
+    #else
+    PyErr_Format(PyExc_TypeError,
+        "%s() got an unexpected keyword argument '%U'",
+        function_name, key);
+    #endif
+    return 0;
+}
+
+/* DivInt[Py_ssize_t]: floor division, i.e. the quotient is rounded toward negative infinity. */
+static CYTHON_INLINE Py_ssize_t __Pyx_div_Py_ssize_t(Py_ssize_t a, Py_ssize_t b) {
+    const Py_ssize_t truncated = a / b;  /* C integer division rounds toward zero */
+    const Py_ssize_t remainder = a - truncated*b;
+    /* Step one lower when there is a remainder whose sign differs from the divisor's. */
+    return ((remainder != 0) && ((remainder ^ b) < 0)) ? truncated - 1 : truncated;
+}
+
+/* GetAttr3: three-argument getattr(o, n, d) that yields the default 'd' when the attribute is missing. */
+#if __PYX_LIMITED_VERSION_HEX < 0x030d00A1
+static PyObject *__Pyx_GetAttr3Default(PyObject *d) {  /* fallback used after a failed attribute lookup */
+    __Pyx_PyThreadState_declare
+    __Pyx_PyThreadState_assign
+    if (unlikely(!__Pyx_PyErr_ExceptionMatches(PyExc_AttributeError)))
+        return NULL;  /* any error other than AttributeError stays set for the caller */
+    __Pyx_PyErr_Clear();  /* swallow the AttributeError */
+    Py_INCREF(d);  /* hand back a new reference to the default */
+    return d;
+}
+#endif
+static CYTHON_INLINE PyObject *__Pyx_GetAttr3(PyObject *o, PyObject *n, PyObject *d) {
+    PyObject *r;  /* looked-up attribute, or NULL */
+#if __PYX_LIMITED_VERSION_HEX >= 0x030d00A1
+    int res = PyObject_GetOptionalAttr(o, n, &r);
+    return (res != 0) ? r : __Pyx_NewRef(d);  /* res == 0 selects the default; any other result returns r as is */
+#else
+  #if CYTHON_USE_TYPE_SLOTS
+    if (likely(PyString_Check(n))) {
+        r = __Pyx_PyObject_GetAttrStrNoError(o, n);
+        if (unlikely(!r) && likely(!PyErr_Occurred())) {
+            r = __Pyx_NewRef(d);  /* NULL result without a pending error: fall back to the default */
+        }
+        return r;
+    }
+  #endif
+    r = PyObject_GetAttr(o, n);  /* generic path */
+    return (likely(r)) ? r : __Pyx_GetAttr3Default(d);  /* on failure, let the helper decide between default and NULL */
+#endif
+}
+
+/* PyDictVersioning */
+#if CYTHON_USE_DICT_VERSIONS && CYTHON_USE_TYPE_SLOTS
+static CYTHON_INLINE PY_UINT64_T __Pyx_get_tp_dict_version(PyObject *obj) {
+    PyObject *dict = Py_TYPE(obj)->tp_dict;  /* dict of obj's type; may be NULL */
+    return likely(dict) ? __PYX_GET_DICT_VERSION(dict) : 0;  /* 0 stands for "no type dict" */
+}
+static CYTHON_INLINE PY_UINT64_T __Pyx_get_object_dict_version(PyObject *obj) {
+    PyObject **dictptr = NULL;  /* address of obj's instance-dict slot, if the type declares one */
+    Py_ssize_t offset = Py_TYPE(obj)->tp_dictoffset;
+    if (offset) {
+#if CYTHON_COMPILING_IN_CPYTHON  /* a positive offset is applied to obj directly; otherwise the runtime is asked */
+        dictptr = (likely(offset > 0)) ? (PyObject **) ((char *)obj + offset) : _PyObject_GetDictPtr(obj);
+#else
+        dictptr = _PyObject_GetDictPtr(obj);
+#endif
+    }
+    return (dictptr && *dictptr) ? __PYX_GET_DICT_VERSION(*dictptr) : 0;  /* 0 when there is no instance dict */
+}
+static CYTHON_INLINE int __Pyx_object_dict_version_matches(PyObject* obj, PY_UINT64_T tp_dict_version, PY_UINT64_T obj_dict_version) {
+    PyObject *dict = Py_TYPE(obj)->tp_dict;
+    if (unlikely(!dict) || unlikely(tp_dict_version != __PYX_GET_DICT_VERSION(dict)))
+        return 0;  /* type dict is missing or its version differs from the given one */
+    return obj_dict_version == __Pyx_get_object_dict_version(obj);  /* the instance dict version must match as well */
+}
+#endif
+
+/* GetModuleGlobalName: look 'name' up in __pyx_d (or on __pyx_m) and, if not found there, via __Pyx_GetBuiltinName(). */
+#if CYTHON_USE_DICT_VERSIONS
+static PyObject *__Pyx__GetModuleGlobalName(PyObject *name, PY_UINT64_T *dict_version, PyObject **dict_cached_value)
+#else
+static CYTHON_INLINE PyObject *__Pyx__GetModuleGlobalName(PyObject *name)
+#endif
+{
+    PyObject *result;
+#if !CYTHON_AVOID_BORROWED_REFS
+#if CYTHON_COMPILING_IN_CPYTHON && PY_VERSION_HEX >= 0x030500A1 && PY_VERSION_HEX < 0x030d0000
+    result = _PyDict_GetItem_KnownHash(__pyx_d, name, ((PyASCIIObject *) name)->hash);  /* reuses the hash stored in the string object */
+    __PYX_UPDATE_DICT_CACHE(__pyx_d, result, *dict_cached_value, *dict_version)
+    if (likely(result)) {
+        return __Pyx_NewRef(result);  /* the dict lookup result is borrowed; return a new reference */
+    } else if (unlikely(PyErr_Occurred())) {
+        return NULL;  /* the lookup itself raised */
+    }
+#elif CYTHON_COMPILING_IN_LIMITED_API
+    if (unlikely(!__pyx_m)) {
+        return NULL;
+    }
+    result = PyObject_GetAttr(__pyx_m, name);  /* attribute lookup on __pyx_m instead of a dict lookup */
+    if (likely(result)) {
+        return result;  /* PyObject_GetAttr() already returned a new reference */
+    }
+#else
+    result = PyDict_GetItem(__pyx_d, name);
+    __PYX_UPDATE_DICT_CACHE(__pyx_d, result, *dict_cached_value, *dict_version)
+    if (likely(result)) {
+        return __Pyx_NewRef(result);
+    }
+#endif
+#else
+    result = PyObject_GetItem(__pyx_d, name);  /* NOTE(review): this is already a new reference and __Pyx_NewRef() below adds another -- confirm intended */
+    __PYX_UPDATE_DICT_CACHE(__pyx_d, result, *dict_cached_value, *dict_version)
+    if (likely(result)) {
+        return __Pyx_NewRef(result);
+    }
+    PyErr_Clear();  /* drop the failed-lookup error before trying the fallback */
+#endif
+    return __Pyx_GetBuiltinName(name);  /* not found above: defer to __Pyx_GetBuiltinName() */
+}
+
+/* RaiseTooManyValuesToUnpack: set a ValueError saying that more than 'expected' values were supplied for unpacking. */
+static CYTHON_INLINE void __Pyx_RaiseTooManyValuesError(Py_ssize_t expected) {
+    PyErr_Format(PyExc_ValueError,
+                 "too many values to unpack (expected %" CYTHON_FORMAT_SSIZE_T "d)", expected);
+}
+
+/* RaiseNeedMoreValuesToUnpack: set a ValueError saying that more than 'index' values are needed for unpacking. */
+static CYTHON_INLINE void __Pyx_RaiseNeedMoreValuesError(Py_ssize_t index) {
+    const char *plural_suffix = (index != 1) ? "s" : "";  /* "1 value", but "0 values" / "2 values" */
+    PyErr_Format(PyExc_ValueError,
+                 "need more than %" CYTHON_FORMAT_SSIZE_T "d value%.1s to unpack", index, plural_suffix);
+}
+
+/* RaiseNoneIterError: set the TypeError reported when None is used where an iterable is required. */
+static CYTHON_INLINE void __Pyx_RaiseNoneNotIterableError(void) {
+    PyErr_SetString(PyExc_TypeError, "'NoneType' object is not iterable");
+}
+
+/* ExtTypeTest: returns 1 if 'obj' passes __Pyx_TypeCheck against 'type', else 0 with an exception set. */
+static CYTHON_INLINE int __Pyx_TypeTest(PyObject *obj, PyTypeObject *type) {
+    if (likely(type)) {
+        __Pyx_TypeName actual_name;
+        __Pyx_TypeName wanted_name;
+        if (likely(__Pyx_TypeCheck(obj, type)))
+            return 1;
+        /* Mismatch: name both the actual and the required type in a TypeError. */
+        actual_name = __Pyx_PyType_GetName(Py_TYPE(obj));
+        wanted_name = __Pyx_PyType_GetName(type);
+        PyErr_Format(PyExc_TypeError, "Cannot convert " __Pyx_FMT_TYPENAME " to " __Pyx_FMT_TYPENAME,
+                     actual_name, wanted_name);
+        __Pyx_DECREF_TypeName(actual_name);
+        __Pyx_DECREF_TypeName(wanted_name);
+        return 0;
+    }
+    PyErr_SetString(PyExc_SystemError, "Missing type object");
+    return 0;
+}
+
+/* GetTopmostException: find the first exc_info stack entry, starting at tstate->exc_info, that holds an exception. */
+#if CYTHON_USE_EXC_INFO_STACK && CYTHON_FAST_THREAD_STATE
+static _PyErr_StackItem *
+__Pyx_PyErr_GetTopmostException(PyThreadState *tstate)
+{
+    _PyErr_StackItem *entry = tstate->exc_info;
+    /* Skip entries whose value is NULL or None; the last entry is returned even if it is empty. */
+    for (; entry->previous_item != NULL; entry = entry->previous_item) {
+        if (entry->exc_value != NULL && entry->exc_value != Py_None)
+            break;
+    }
+    return entry;
+}
+#endif
+
+/* SaveResetException: read out / write back the exception triple kept in the thread state's exc_info. */
+#if CYTHON_FAST_THREAD_STATE
+static CYTHON_INLINE void __Pyx__ExceptionSave(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb) {
+  #if CYTHON_USE_EXC_INFO_STACK && PY_VERSION_HEX >= 0x030B00a4  /* only exc_value is read from the stack item here */
+    _PyErr_StackItem *exc_info = __Pyx_PyErr_GetTopmostException(tstate);
+    PyObject *exc_value = exc_info->exc_value;
+    if (exc_value == NULL || exc_value == Py_None) {
+        *value = NULL;  /* no exception stored: report all three outputs as NULL */
+        *type = NULL;
+        *tb = NULL;
+    } else {
+        *value = exc_value;
+        Py_INCREF(*value);
+        *type = (PyObject*) Py_TYPE(exc_value);  /* the type is derived from the exception instance */
+        Py_INCREF(*type);
+        *tb = PyException_GetTraceback(exc_value);
+    }
+  #elif CYTHON_USE_EXC_INFO_STACK  /* the stack item carries type, value and traceback separately */
+    _PyErr_StackItem *exc_info = __Pyx_PyErr_GetTopmostException(tstate);
+    *type = exc_info->exc_type;
+    *value = exc_info->exc_value;
+    *tb = exc_info->exc_traceback;
+    Py_XINCREF(*type);  /* outputs get their own references (any of them may be NULL) */
+    Py_XINCREF(*value);
+    Py_XINCREF(*tb);
+  #else  /* no exc_info stack: the triple is read directly from the thread state */
+    *type = tstate->exc_type;
+    *value = tstate->exc_value;
+    *tb = tstate->exc_traceback;
+    Py_XINCREF(*type);
+    Py_XINCREF(*value);
+    Py_XINCREF(*tb);
+  #endif
+}
+static CYTHON_INLINE void __Pyx__ExceptionReset(PyThreadState *tstate, PyObject *type, PyObject *value, PyObject *tb) {
+  #if CYTHON_USE_EXC_INFO_STACK && PY_VERSION_HEX >= 0x030B00a4  /* writes the given triple back; the arguments' references are consumed */
+    _PyErr_StackItem *exc_info = tstate->exc_info;
+    PyObject *tmp_value = exc_info->exc_value;
+    exc_info->exc_value = value;  /* the reference to 'value' is handed over to the stack item */
+    Py_XDECREF(tmp_value);  /* release the previous value only after the new one is in place */
+    Py_XDECREF(type);  /* 'type' and 'tb' are not stored in this branch, so their references are dropped */
+    Py_XDECREF(tb);
+  #else
+    PyObject *tmp_type, *tmp_value, *tmp_tb;  /* previous triple, released at the end */
+    #if CYTHON_USE_EXC_INFO_STACK
+    _PyErr_StackItem *exc_info = tstate->exc_info;
+    tmp_type = exc_info->exc_type;
+    tmp_value = exc_info->exc_value;
+    tmp_tb = exc_info->exc_traceback;
+    exc_info->exc_type = type;
+    exc_info->exc_value = value;
+    exc_info->exc_traceback = tb;
+    #else
+    tmp_type = tstate->exc_type;
+    tmp_value = tstate->exc_value;
+    tmp_tb = tstate->exc_traceback;
+    tstate->exc_type = type;
+    tstate->exc_value = value;
+    tstate->exc_traceback = tb;
+    #endif
+    Py_XDECREF(tmp_type);  /* old objects are released only after the new ones have been stored */
+    Py_XDECREF(tmp_value);
+    Py_XDECREF(tmp_tb);
+  #endif
+}
+#endif
+
+/* GetException: take the pending error, normalize it, return it to the caller and store it in exc_info. */
+#if CYTHON_FAST_THREAD_STATE
+static int __Pyx__GetException(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb)
+#else
+static int __Pyx_GetException(PyObject **type, PyObject **value, PyObject **tb)
+#endif
+{  /* returns 0 with the three outputs filled in, or -1 with all three outputs set to 0 */
+    PyObject *local_type = NULL, *local_value, *local_tb = NULL;
+#if CYTHON_FAST_THREAD_STATE
+    PyObject *tmp_type, *tmp_value, *tmp_tb;  /* previous exc_info contents, released at the end */
+  #if PY_VERSION_HEX >= 0x030C00A6
+    local_value = tstate->current_exception;  /* take the pending exception out of the thread state */
+    tstate->current_exception = 0;
+    if (likely(local_value)) {
+        local_type = (PyObject*) Py_TYPE(local_value);
+        Py_INCREF(local_type);
+        local_tb = PyException_GetTraceback(local_value);
+    }
+  #else
+    local_type = tstate->curexc_type;  /* take the pending error triple out of the thread state */
+    local_value = tstate->curexc_value;
+    local_tb = tstate->curexc_traceback;
+    tstate->curexc_type = 0;
+    tstate->curexc_value = 0;
+    tstate->curexc_traceback = 0;
+  #endif
+#else
+    PyErr_Fetch(&local_type, &local_value, &local_tb);
+#endif
+    PyErr_NormalizeException(&local_type, &local_value, &local_tb);
+#if CYTHON_FAST_THREAD_STATE && PY_VERSION_HEX >= 0x030C00A6  /* each variant below checks whether an error is pending after normalization */
+    if (unlikely(tstate->current_exception))
+#elif CYTHON_FAST_THREAD_STATE
+    if (unlikely(tstate->curexc_type))
+#else
+    if (unlikely(PyErr_Occurred()))
+#endif
+        goto bad;
+    #if PY_MAJOR_VERSION >= 3
+    if (local_tb) {
+        if (unlikely(PyException_SetTraceback(local_value, local_tb) < 0))  /* attach the traceback to the exception instance */
+            goto bad;
+    }
+    #endif
+    Py_XINCREF(local_tb);  /* second set of references: one goes to the caller, one is stored (or released) below */
+    Py_XINCREF(local_type);
+    Py_XINCREF(local_value);
+    *type = local_type;
+    *value = local_value;
+    *tb = local_tb;
+#if CYTHON_FAST_THREAD_STATE
+    #if CYTHON_USE_EXC_INFO_STACK
+    {
+        _PyErr_StackItem *exc_info = tstate->exc_info;
+      #if PY_VERSION_HEX >= 0x030B00a4
+        tmp_value = exc_info->exc_value;
+        exc_info->exc_value = local_value;  /* only the value is stored in this branch */
+        tmp_type = NULL;
+        tmp_tb = NULL;
+        Py_XDECREF(local_type);  /* type and traceback are not stored here, so their second reference is dropped */
+        Py_XDECREF(local_tb);
+      #else
+        tmp_type = exc_info->exc_type;
+        tmp_value = exc_info->exc_value;
+        tmp_tb = exc_info->exc_traceback;
+        exc_info->exc_type = local_type;
+        exc_info->exc_value = local_value;
+        exc_info->exc_traceback = local_tb;
+      #endif
+    }
+    #else
+    tmp_type = tstate->exc_type;
+    tmp_value = tstate->exc_value;
+    tmp_tb = tstate->exc_traceback;
+    tstate->exc_type = local_type;
+    tstate->exc_value = local_value;
+    tstate->exc_traceback = local_tb;
+    #endif
+    Py_XDECREF(tmp_type);  /* release what was stored before */
+    Py_XDECREF(tmp_value);
+    Py_XDECREF(tmp_tb);
+#else
+    PyErr_SetExcInfo(local_type, local_value, local_tb);
+#endif
+    return 0;
+bad:  /* failure: clear the outputs and release the objects taken above */
+    *type = 0;
+    *value = 0;
+    *tb = 0;
+    Py_XDECREF(local_type);
+    Py_XDECREF(local_value);
+    Py_XDECREF(local_tb);
+    return -1;
+}
+
+/* SwapException: exchange the caller's exception triple with the one stored in the thread state's exc_info. */
+#if CYTHON_FAST_THREAD_STATE
+static CYTHON_INLINE void __Pyx__ExceptionSwap(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb) {
+    PyObject *tmp_type, *tmp_value, *tmp_tb;  /* the previously stored triple, returned through the arguments */
+  #if CYTHON_USE_EXC_INFO_STACK && PY_VERSION_HEX >= 0x030B00a4
+    _PyErr_StackItem *exc_info = tstate->exc_info;
+    tmp_value = exc_info->exc_value;
+    exc_info->exc_value = *value;  /* only the value is stored in this branch */
+    if (tmp_value == NULL || tmp_value == Py_None) {
+        Py_XDECREF(tmp_value);  /* an old None (or NULL) is reported to the caller as "no exception" */
+        tmp_value = NULL;
+        tmp_type = NULL;
+        tmp_tb = NULL;
+    } else {
+        tmp_type = (PyObject*) Py_TYPE(tmp_value);  /* type and traceback are derived from the old exception instance */
+        Py_INCREF(tmp_type);
+        #if CYTHON_COMPILING_IN_CPYTHON
+        tmp_tb = ((PyBaseExceptionObject*) tmp_value)->traceback;  /* direct struct access instead of the API call */
+        Py_XINCREF(tmp_tb);
+        #else
+        tmp_tb = PyException_GetTraceback(tmp_value);
+        #endif
+    }
+  #elif CYTHON_USE_EXC_INFO_STACK
+    _PyErr_StackItem *exc_info = tstate->exc_info;
+    tmp_type = exc_info->exc_type;
+    tmp_value = exc_info->exc_value;
+    tmp_tb = exc_info->exc_traceback;
+    exc_info->exc_type = *type;
+    exc_info->exc_value = *value;
+    exc_info->exc_traceback = *tb;
+  #else
+    tmp_type = tstate->exc_type;
+    tmp_value = tstate->exc_value;
+    tmp_tb = tstate->exc_traceback;
+    tstate->exc_type = *type;
+    tstate->exc_value = *value;
+    tstate->exc_traceback = *tb;
+  #endif
+    *type = tmp_type;  /* hand the previous triple back to the caller */
+    *value = tmp_value;
+    *tb = tmp_tb;
+}
+#else
+static CYTHON_INLINE void __Pyx_ExceptionSwap(PyObject **type, PyObject **value, PyObject **tb) {  /* same exchange via the public API */
+    PyObject *tmp_type, *tmp_value, *tmp_tb;
+    PyErr_GetExcInfo(&tmp_type, &tmp_value, &tmp_tb);
+    PyErr_SetExcInfo(*type, *value, *tb);
+    *type = tmp_type;
+    *value = tmp_value;
+    *tb = tmp_tb;
+}
+#endif
+
+/* Import: import module 'name' at the given relative 'level'; returns a new reference, or NULL on failure. */
+static PyObject *__Pyx_Import(PyObject *name, PyObject *from_list, int level) {
+    PyObject *module = 0;
+    PyObject *empty_dict = 0;  /* passed as the "locals" argument of the import call */
+    PyObject *empty_list = 0;
+    #if PY_MAJOR_VERSION < 3
+    PyObject *py_import;
+    py_import = __Pyx_PyObject_GetAttrStr(__pyx_b, __pyx_n_s_import);  /* presumably builtins.__import__ -- confirm against the string table */
+    if (unlikely(!py_import))
+        goto bad;
+    if (!from_list) {
+        empty_list = PyList_New(0);  /* a missing from_list is replaced by an empty list */
+        if (unlikely(!empty_list))
+            goto bad;
+        from_list = empty_list;
+    }
+    #endif
+    empty_dict = PyDict_New();
+    if (unlikely(!empty_dict))
+        goto bad;
+    {
+        #if PY_MAJOR_VERSION >= 3
+        if (level == -1) {  /* -1: try a relative import first, then an absolute one */
+            if (strchr(__Pyx_MODULE_NAME, '.') != NULL) {  /* only attempted when this module's own name is dotted */
+                module = PyImport_ImportModuleLevelObject(
+                    name, __pyx_d, empty_dict, from_list, 1);
+                if (unlikely(!module)) {
+                    if (unlikely(!PyErr_ExceptionMatches(PyExc_ImportError)))
+                        goto bad;  /* errors other than ImportError are propagated */
+                    PyErr_Clear();  /* relative import failed: fall through to the absolute import */
+                }
+            }
+            level = 0;
+        }
+        #endif
+        if (!module) {
+            #if PY_MAJOR_VERSION < 3
+            PyObject *py_level = PyInt_FromLong(level);
+            if (unlikely(!py_level))
+                goto bad;
+            module = PyObject_CallFunctionObjArgs(py_import,
+                name, __pyx_d, empty_dict, from_list, py_level, (PyObject *)NULL);
+            Py_DECREF(py_level);
+            #else
+            module = PyImport_ImportModuleLevelObject(
+                name, __pyx_d, empty_dict, from_list, level);
+            #endif
+        }
+    }
+bad:  /* shared exit for success and failure: release the temporaries, return whatever 'module' holds */
+    Py_XDECREF(empty_dict);
+    Py_XDECREF(empty_list);
+    #if PY_MAJOR_VERSION < 3
+    Py_XDECREF(py_import);
+    #endif
+    return module;
+}
+
+/* ImportDottedModule */
+#if PY_MAJOR_VERSION >= 3
+/* Raise "No module named '<first count parts, dot-joined>'" for a failed dotted import; always returns NULL. */
+static PyObject *__Pyx__ImportDottedModule_Error(PyObject *name, PyObject *parts_tuple, Py_ssize_t count) {
+    PyObject *partial_name = NULL, *slice = NULL, *sep = NULL;
+    if (unlikely(PyErr_Occurred())) {
+        PyErr_Clear();
+    }
+    if (likely(PyTuple_GET_SIZE(parts_tuple) == count)) {
+        /* 'name' is borrowed from the caller: own a reference so the cleanup below cannot release the caller's. */
+        Py_INCREF(name);
+        partial_name = name;
+    } else {
+        slice = PySequence_GetSlice(parts_tuple, 0, count);
+        if (unlikely(!slice))
+            goto bad;
+        sep = PyUnicode_FromStringAndSize(".", 1);
+        if (unlikely(!sep))
+            goto bad;
+        partial_name = PyUnicode_Join(sep, slice);
+        if (unlikely(!partial_name))  /* '%U' below must not be given NULL */
+            goto bad;
+    }
+    /* This function is only compiled for Python 3, so the message is always formatted with '%U'. */
+    PyErr_Format(
+#if PY_VERSION_HEX >= 0x030600B1
+        PyExc_ModuleNotFoundError,
+#else
+        PyExc_ImportError,
+#endif
+        "No module named '%U'", partial_name);
+bad:
+    Py_XDECREF(sep);
+    Py_XDECREF(slice);
+    Py_XDECREF(partial_name);
+    return NULL;
+}
+#endif
+#if PY_MAJOR_VERSION >= 3
+static PyObject *__Pyx__ImportDottedModule_Lookup(PyObject *name) {  /* fetch an already-imported module by its full name */
+    PyObject *imported_module;
+#if PY_VERSION_HEX < 0x030700A1 || (CYTHON_COMPILING_IN_PYPY && PYPY_VERSION_NUM  < 0x07030400)
+    PyObject *modules = PyImport_GetModuleDict();
+    if (unlikely(!modules))
+        return NULL;
+    imported_module = __Pyx_PyDict_GetItemStr(modules, name);
+    Py_XINCREF(imported_module);  /* this path takes its own reference on the looked-up entry, if any */
+#else
+    imported_module = PyImport_GetModule(name);
+#endif
+    return imported_module;  /* may be NULL */
+}
+#endif
+#if PY_MAJOR_VERSION >= 3
+static PyObject *__Pyx_ImportDottedModule_WalkParts(PyObject *module, PyObject *name, PyObject *parts_tuple) {  /* consumes the reference to 'module' */
+    Py_ssize_t i, nparts;
+    nparts = PyTuple_GET_SIZE(parts_tuple);
+    for (i=1; i < nparts && module; i++) {  /* starts at index 1; each step replaces 'module' by its attribute parts_tuple[i] */
+        PyObject *part, *submodule;
+#if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS
+        part = PyTuple_GET_ITEM(parts_tuple, i);  /* borrowed */
+#else
+        part = PySequence_ITEM(parts_tuple, i);  /* new reference, released below */
+#endif
+        submodule = __Pyx_PyObject_GetAttrStrNoError(module, part);
+#if !(CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS)
+        Py_DECREF(part);
+#endif
+        Py_DECREF(module);
+        module = submodule;  /* NULL here ends the loop */
+    }
+    if (unlikely(!module)) {
+        return __Pyx__ImportDottedModule_Error(name, parts_tuple, i);  /* 'i' was already incremented past the failing part */
+    }
+    return module;
+}
+#endif
+static PyObject *__Pyx__ImportDottedModule(PyObject *name, PyObject *parts_tuple) {  /* import 'name' and return the module it names */
+#if PY_MAJOR_VERSION < 3
+    PyObject *module, *from_list, *star = __pyx_n_s__3;  /* presumably the string "*" -- confirm against the string table */
+    CYTHON_UNUSED_VAR(parts_tuple);
+    from_list = PyList_New(1);
+    if (unlikely(!from_list))
+        return NULL;
+    Py_INCREF(star);  /* PyList_SET_ITEM() takes over this reference */
+    PyList_SET_ITEM(from_list, 0, star);
+    module = __Pyx_Import(name, from_list, 0);
+    Py_DECREF(from_list);
+    return module;
+#else
+    PyObject *imported_module;
+    PyObject *module = __Pyx_Import(name, NULL, 0);
+    if (!parts_tuple || unlikely(!module))
+        return module;  /* no parts given, or the import failed: return the import result as is */
+    imported_module = __Pyx__ImportDottedModule_Lookup(name);  /* first choice: the entry registered under the full name */
+    if (likely(imported_module)) {
+        Py_DECREF(module);
+        return imported_module;
+    }
+    PyErr_Clear();
+    return __Pyx_ImportDottedModule_WalkParts(module, name, parts_tuple);  /* otherwise resolve the parts by attribute access */
+#endif
+}
+static PyObject *__Pyx_ImportDottedModule(PyObject *name, PyObject *parts_tuple) {  /* entry point: reuse an already-imported module if possible */
+#if CYTHON_COMPILING_IN_CPYTHON && PY_VERSION_HEX >= 0x030400B1
+    PyObject *module = __Pyx__ImportDottedModule_Lookup(name);
+    if (likely(module)) {
+        PyObject *spec = __Pyx_PyObject_GetAttrStrNoError(module, __pyx_n_s_spec);  /* presumably the module's "__spec__" attribute */
+        if (likely(spec)) {
+            PyObject *unsafe = __Pyx_PyObject_GetAttrStrNoError(spec, __pyx_n_s_initializing);  /* presumably the spec's "_initializing" flag */
+            if (likely(!unsafe || !__Pyx_PyObject_IsTrue(unsafe))) {
+                Py_DECREF(spec);
+                spec = NULL;  /* flag absent or false: 'spec' is reset to mark the module as usable */
+            }
+            Py_XDECREF(unsafe);
+        }
+        if (likely(!spec)) {
+            PyErr_Clear();
+            return module;  /* fast path: return the module found by the lookup */
+        }
+        Py_DECREF(spec);  /* flag was true: discard the lookup result and do a regular import below */
+        Py_DECREF(module);
+    } else if (PyErr_Occurred()) {
+        PyErr_Clear();
+    }
+#endif
+    return __Pyx__ImportDottedModule(name, parts_tuple);
+}
+
+/* FastTypeChecks */
+#if CYTHON_COMPILING_IN_CPYTHON
+static int __Pyx_InBases(PyTypeObject *a, PyTypeObject *b) {
+    PyTypeObject *ancestor = a;  /* walks the tp_base chain; 'a' itself is never compared against 'b' */
+    while (ancestor != NULL) {
+        ancestor = __Pyx_PyType_GetSlot(ancestor, tp_base, PyTypeObject*);
+        if (ancestor == b) return 1;
+    }
+    return (b == &PyBaseObject_Type) ? 1 : 0;  /* chain exhausted: only the base object type still matches */
+}
+static CYTHON_INLINE int __Pyx_IsSubtype(PyTypeObject *a, PyTypeObject *b) {
+    PyObject *const target = (PyObject *)b;
+    PyObject *resolution_order;
+    Py_ssize_t idx, count;
+    if (a == b) return 1;
+    resolution_order = a->tp_mro;
+    if (unlikely(!resolution_order))
+        return __Pyx_InBases(a, b);  /* no MRO tuple available: walk the tp_base chain instead */
+    count = PyTuple_GET_SIZE(resolution_order);
+    for (idx = 0; idx < count; idx++) {
+        if (PyTuple_GET_ITEM(resolution_order, idx) == target)
+            return 1;
+    }
+    return 0;
+}
+static CYTHON_INLINE int __Pyx_IsAnySubtype2(PyTypeObject *cls, PyTypeObject *a, PyTypeObject *b) {  /* 1 if 'cls' is 'a' or 'b' or derives from either */
+    PyObject *mro;
+    if (cls == a || cls == b) return 1;
+    mro = cls->tp_mro;
+    if (likely(mro)) {
+        Py_ssize_t i, n;
+        n = PyTuple_GET_SIZE(mro);
+        for (i = 0; i < n; i++) {  /* a single pass over the MRO tests both candidates */
+            PyObject *base = PyTuple_GET_ITEM(mro, i);
+            if (base == (PyObject *)a || base == (PyObject *)b)
+                return 1;
+        }
+        return 0;
+    }
+    return __Pyx_InBases(cls, a) || __Pyx_InBases(cls, b);  /* no MRO tuple: walk the tp_base chain for each candidate */
+}
+#if PY_MAJOR_VERSION == 2  /* variant based on PyObject_IsSubclass(), with the pending error saved and restored around it */
+static int __Pyx_inner_PyErr_GivenExceptionMatches2(PyObject *err, PyObject* exc_type1, PyObject* exc_type2) {
+    PyObject *exception, *value, *tb;
+    int res;
+    __Pyx_PyThreadState_declare
+    __Pyx_PyThreadState_assign
+    __Pyx_ErrFetch(&exception, &value, &tb);  /* set the pending error aside while the subclass checks run */
+    res = exc_type1 ? PyObject_IsSubclass(err, exc_type1) : 0;  /* exc_type1 may be NULL, exc_type2 may not */
+    if (unlikely(res == -1)) {
+        PyErr_WriteUnraisable(err);  /* a failing subclass check is reported and then treated as "no match" */
+        res = 0;
+    }
+    if (!res) {
+        res = PyObject_IsSubclass(err, exc_type2);
+        if (unlikely(res == -1)) {
+            PyErr_WriteUnraisable(err);
+            res = 0;
+        }
+    }
+    __Pyx_ErrRestore(exception, value, tb);
+    return res;
+}
+#else  /* variant based on the type-pointer comparisons of __Pyx_IsSubtype() / __Pyx_IsAnySubtype2() */
+static CYTHON_INLINE int __Pyx_inner_PyErr_GivenExceptionMatches2(PyObject *err, PyObject* exc_type1, PyObject *exc_type2) {
+    if (exc_type1) {
+        return __Pyx_IsAnySubtype2((PyTypeObject*)err, (PyTypeObject*)exc_type1, (PyTypeObject*)exc_type2);
+    } else {
+        return __Pyx_IsSubtype((PyTypeObject*)err, (PyTypeObject*)exc_type2);  /* only one candidate to test */
+    }
+}
+#endif
+static int __Pyx_PyErr_GivenExceptionMatchesTuple(PyObject *exc_type, PyObject *tuple) {  /* 1 if 'exc_type' matches any entry of 'tuple' */
+    Py_ssize_t i, n;
+    assert(PyExceptionClass_Check(exc_type));
+    n = PyTuple_GET_SIZE(tuple);
+#if PY_MAJOR_VERSION >= 3
+    for (i=0; i<n; i++) {  /* first pass: identity comparison only */
+        if (exc_type == PyTuple_GET_ITEM(tuple, i)) return 1;
+    }
+#endif
+    for (i=0; i<n; i++) {  /* second pass: subclass check for each entry */
+        PyObject *t = PyTuple_GET_ITEM(tuple, i);
+        #if PY_MAJOR_VERSION < 3
+        if (likely(exc_type == t)) return 1;
+        #endif
+        if (likely(PyExceptionClass_Check(t))) {
+            if (__Pyx_inner_PyErr_GivenExceptionMatches2(exc_type, NULL, t)) return 1;
+        } else {  /* entries that are not exception classes are skipped */
+        }
+    }
+    return 0;
+}
+static CYTHON_INLINE int __Pyx_PyErr_GivenExceptionMatches(PyObject *err, PyObject* exc_type) {  /* fast paths in front of PyErr_GivenExceptionMatches() */
+    if (likely(err == exc_type)) return 1;
+    if (likely(PyExceptionClass_Check(err))) {
+        if (likely(PyExceptionClass_Check(exc_type))) {
+            return __Pyx_inner_PyErr_GivenExceptionMatches2(err, NULL, exc_type);  /* class against class */
+        } else if (likely(PyTuple_Check(exc_type))) {
+            return __Pyx_PyErr_GivenExceptionMatchesTuple(err, exc_type);  /* class against a tuple of candidates */
+        } else {
+        }
+    }
+    return PyErr_GivenExceptionMatches(err, exc_type);  /* everything else is left to the C-API function */
+}
+static CYTHON_INLINE int __Pyx_PyErr_GivenExceptionMatches2(PyObject *err, PyObject *exc_type1, PyObject *exc_type2) {  /* matches either of two classes */
+    assert(PyExceptionClass_Check(exc_type1));
+    assert(PyExceptionClass_Check(exc_type2));
+    if (likely(err == exc_type1 || err == exc_type2)) return 1;
+    if (likely(PyExceptionClass_Check(err))) {
+        return __Pyx_inner_PyErr_GivenExceptionMatches2(err, exc_type1, exc_type2);
+    }
+    return (PyErr_GivenExceptionMatches(err, exc_type1) || PyErr_GivenExceptionMatches(err, exc_type2));  /* 'err' is not an exception class */
+}
+#endif
+
+/* PySequenceMultiply: compute seq * mul for a C integer 'mul'. */
+static PyObject* __Pyx_PySequence_Multiply_Generic(PyObject *seq, Py_ssize_t mul) {
+    PyObject *product = NULL, *factor = PyInt_FromSsize_t(mul);  /* box the count for the number protocol */
+    if (likely(factor)) {
+        product = PyNumber_Multiply(seq, factor);
+        Py_DECREF(factor);
+    }
+    return product;  /* NULL when boxing or the multiplication failed */
+}
+static CYTHON_INLINE PyObject* __Pyx_PySequence_Multiply(PyObject *seq, Py_ssize_t mul) {  /* seq * mul, preferring the type's sq_repeat slot */
+#if CYTHON_USE_TYPE_SLOTS
+    PyTypeObject *type = Py_TYPE(seq);
+    if (likely(type->tp_as_sequence && type->tp_as_sequence->sq_repeat)) {
+        return type->tp_as_sequence->sq_repeat(seq, mul);  /* direct slot call, no boxed integer needed */
+    } else
+#endif
+    {
+        return __Pyx_PySequence_Multiply_Generic(seq, mul);  /* no slot (or slots not used): go through PyNumber_Multiply() */
+    }
+}
+
+/* SetItemInt: o[j] = v through PyObject_SetItem(); releases one reference to 'j', which may be NULL. */
+static int __Pyx_SetItemInt_Generic(PyObject *o, PyObject *j, PyObject *v) {
+    int status;
+    if (unlikely(j == NULL)) return -1;  /* the caller failed to create the index object */
+    status = PyObject_SetItem(o, j, v);
+    Py_DECREF(j);
+    return status;
+}
+static CYTHON_INLINE int __Pyx_SetItemInt_Fast(PyObject *o, Py_ssize_t i, PyObject *v, int is_list,
+                                               CYTHON_NCP_UNUSED int wraparound, CYTHON_NCP_UNUSED int boundscheck) {
+#if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS && CYTHON_USE_TYPE_SLOTS
+    if (is_list || PyList_CheckExact(o)) {  /* list fast path: write the item slot directly */
+        Py_ssize_t n = (!wraparound) ? i : ((likely(i >= 0)) ? i : i + PyList_GET_SIZE(o));  /* negative index counts from the end */
+        if ((!boundscheck) || likely(__Pyx_is_valid_index(n, PyList_GET_SIZE(o)))) {
+            PyObject* old = PyList_GET_ITEM(o, n);
+            Py_INCREF(v);
+            PyList_SET_ITEM(o, n, v);
+            Py_DECREF(old);  /* released only after the new item is in place */
+            return 1;
+        }
+    } else {
+        PyMappingMethods *mm = Py_TYPE(o)->tp_as_mapping;
+        PySequenceMethods *sm = Py_TYPE(o)->tp_as_sequence;
+        if (mm && mm->mp_ass_subscript) {  /* the mapping slot is tried before the sequence slot */
+            int r;
+            PyObject *key = PyInt_FromSsize_t(i);
+            if (unlikely(!key)) return -1;
+            r = mm->mp_ass_subscript(o, key, v);
+            Py_DECREF(key);
+            return r;
+        }
+        if (likely(sm && sm->sq_ass_item)) {
+            if (wraparound && unlikely(i < 0) && likely(sm->sq_length)) {
+                Py_ssize_t l = sm->sq_length(o);
+                if (likely(l >= 0)) {
+                    i += l;
+                } else {
+                    if (!PyErr_ExceptionMatches(PyExc_OverflowError))
+                        return -1;
+                    PyErr_Clear();  /* an OverflowError from sq_length is ignored and 'i' is passed on unchanged */
+                }
+            }
+            return sm->sq_ass_item(o, i, v);
+        }
+    }
+#else
+    if (is_list || !PyMapping_Check(o))
+    {
+        return PySequence_SetItem(o, i, v);
+    }
+#endif
+    return __Pyx_SetItemInt_Generic(o, PyInt_FromSsize_t(i), v);  /* fallback, also reached when the list bounds check fails */
+}
+
+/* RaiseUnboundLocalError: set an UnboundLocalError naming the local variable 'varname'. */
+static CYTHON_INLINE void __Pyx_RaiseUnboundLocalError(const char *varname) {
+    PyErr_Format(PyExc_UnboundLocalError, "local variable '%s' referenced before assignment", varname);
+}
+
+/* DivInt[long]: floor division, i.e. the quotient is rounded toward negative infinity. */
+static CYTHON_INLINE long __Pyx_div_long(long a, long b) {
+    const long truncated = a / b;  /* C integer division rounds toward zero */
+    const long remainder = a - truncated*b;
+    /* Step one lower when there is a remainder whose sign differs from the divisor's. */
+    return ((remainder != 0) && ((remainder ^ b) < 0)) ? truncated - 1 : truncated;
+}
+
+/* ImportFrom: fetch attribute 'name' of 'module', falling back to the imported module "<module name><sep><name>". */
+static PyObject* __Pyx_ImportFrom(PyObject* module, PyObject* name) {
+    PyObject* value = __Pyx_PyObject_GetAttrStr(module, name);
+    if (unlikely(!value) && PyErr_ExceptionMatches(PyExc_AttributeError)) {  /* attribute missing: try the registry of imported modules */
+        const char* module_name_str = 0;
+        PyObject* module_name = 0;
+        PyObject* module_dot = 0;
+        PyObject* full_name = 0;
+        PyErr_Clear();
+        module_name_str = PyModule_GetName(module);
+        if (unlikely(!module_name_str)) { goto modbad; }
+        module_name = PyUnicode_FromString(module_name_str);
+        if (unlikely(!module_name)) { goto modbad; }
+        module_dot = PyUnicode_Concat(module_name, __pyx_kp_u__2);  /* presumably appends "." -- confirm against the string table */
+        if (unlikely(!module_dot)) { goto modbad; }
+        full_name = PyUnicode_Concat(module_dot, name);
+        if (unlikely(!full_name)) { goto modbad; }
+        #if PY_VERSION_HEX < 0x030700A1 || (CYTHON_COMPILING_IN_PYPY && PYPY_VERSION_NUM  < 0x07030400)
+        {
+            PyObject *modules = PyImport_GetModuleDict();
+            if (unlikely(!modules))
+                goto modbad;
+            value = PyObject_GetItem(modules, full_name);
+        }
+        #else
+        value = PyImport_GetModule(full_name);
+        #endif
+      modbad:  /* common cleanup of the temporary strings, reached on success and on failure */
+        Py_XDECREF(full_name);
+        Py_XDECREF(module_dot);
+        Py_XDECREF(module_name);
+    }
+    if (unlikely(!value)) {  /* whatever went wrong above is reported as an ImportError */
+        PyErr_Format(PyExc_ImportError,
+        #if PY_MAJOR_VERSION < 3
+            "cannot import name %.230s", PyString_AS_STRING(name));
+        #else
+            "cannot import name %S", name);
+        #endif
+    }
+    return value;
+}
+
+/* HasAttr: hasattr(o, n) -- 1 if the lookup succeeds, 0 if it fails, -1 with a TypeError for a non-string name. */
+static CYTHON_INLINE int __Pyx_HasAttr(PyObject *o, PyObject *n) {
+    PyObject *attr;
+    if (unlikely(!__Pyx_PyBaseString_Check(n))) {
+        PyErr_SetString(PyExc_TypeError, "hasattr(): attribute name must be string");
+        return -1;
+    }
+    attr = __Pyx_GetAttr(o, n);
+    if (attr) {
+        /* Only existence matters, so the value is released right away. */
+        Py_DECREF(attr);
+        return 1;
+    }
+    /* Any lookup failure, whatever the exception, is reported as "no such attribute". */
+    PyErr_Clear();
+    return 0;
+}
+
+/* IsLittleEndian: runtime byte-order probe; non-zero when the lowest-order byte of a 32-bit word is stored first. */
+static CYTHON_INLINE int __Pyx_Is_Little_Endian(void)
+{
+  union {
+    uint8_t bytes[4];
+    uint32_t word;
+  } probe;
+  probe.word = 0x01020304;  /* the lowest-order byte of this pattern is 0x04 */
+  return probe.bytes[0] == 0x04;
+}
+
+/* BufferFormatCheck: helpers for parsing a buffer format string and describing its type characters. */
+static void __Pyx_BufFmt_Init(__Pyx_BufFmt_Context* ctx,
+                              __Pyx_BufFmt_StackElem* stack,
+                              __Pyx_TypeInfo* type) {  /* reset 'ctx' to its start state for 'type', using 'stack' as field stack */
+  stack[0].field = &ctx->root;  /* the bottom stack entry always refers to the synthetic root field */
+  stack[0].parent_offset = 0;
+  ctx->root.type = type;
+  ctx->root.name = "buffer dtype";
+  ctx->root.offset = 0;
+  ctx->head = stack;
+  ctx->head->field = &ctx->root;
+  ctx->fmt_offset = 0;
+  ctx->head->parent_offset = 0;
+  ctx->new_packmode = '@';
+  ctx->enc_packmode = '@';
+  ctx->new_count = 1;
+  ctx->enc_count = 0;
+  ctx->enc_type = 0;
+  ctx->is_complex = 0;
+  ctx->is_valid_array = 0;
+  ctx->struct_alignment = 0;
+  while (type->typegroup == 'S') {  /* descend through first fields, pushing one stack entry per level of typegroup 'S' */
+    ++ctx->head;
+    ctx->head->field = type->fields;
+    ctx->head->parent_offset = 0;
+    type = type->fields->type;
+  }
+}
+static int __Pyx_BufFmt_ParseNumber(const char** ts) {
+    /* Parse the unsigned decimal number at *ts and advance *ts past its digits.
+     * Returns the parsed value, or -1 (leaving *ts untouched) when *ts does not
+     * start with a digit. */
+    const char *cursor = *ts;
+    int value = 0;
+    if (!(*cursor >= '0' && *cursor <= '9'))
+        return -1;
+    do {
+        value = value * 10 + (*cursor - '0');
+        cursor++;
+    } while (*cursor >= '0' && *cursor <= '9');
+    *ts = cursor;
+    return value;
+}
+static int __Pyx_BufFmt_ExpectNumber(const char **ts) {  /* like __Pyx_BufFmt_ParseNumber(), but sets a ValueError when no number is found */
+    int number = __Pyx_BufFmt_ParseNumber(ts);
+    if (number == -1)  /* -1: *ts does not start with a digit; the offending character goes into the message */
+        PyErr_Format(PyExc_ValueError,\
+                     "Does not understand character buffer dtype format string ('%c')", **ts);
+    return number;  /* still -1 in the error case */
+}
+static void __Pyx_BufFmt_RaiseUnexpectedChar(char ch) {  /* set a ValueError that quotes the unexpected format character */
+  PyErr_Format(PyExc_ValueError,
+               "Unexpected format string character: '%c'", ch);
+}
+static const char* __Pyx_BufFmt_DescribeTypeChar(char ch, int is_complex) {  /* human-readable description of a format type character */
+  switch (ch) {
+    case '?': return "'bool'";
+    case 'c': return "'char'";
+    case 'b': return "'signed char'";
+    case 'B': return "'unsigned char'";
+    case 'h': return "'short'";
+    case 'H': return "'unsigned short'";
+    case 'i': return "'int'";
+    case 'I': return "'unsigned int'";
+    case 'l': return "'long'";
+    case 'L': return "'unsigned long'";
+    case 'q': return "'long long'";
+    case 'Q': return "'unsigned long long'";
+    case 'f': return (is_complex ? "'complex float'" : "'float'");  /* 'is_complex' only affects the floating-point codes */
+    case 'd': return (is_complex ? "'complex double'" : "'double'");
+    case 'g': return (is_complex ? "'complex long double'" : "'long double'");
+    case 'T': return "a struct";
+    case 'O': return "Python object";
+    case 'P': return "a pointer";
+    case 's': case 'p': return "a string";
+    case 0: return "end";  /* NUL terminator of the format string */
+    default: return "unparsable format string";
+  }
+}
+static size_t __Pyx_BufFmt_TypeCharToStandardSize(char ch, int is_complex) {  /* fixed per-code sizes in bytes; 0 with a ValueError set on failure */
+  switch (ch) {
+    case '?': case 'c': case 'b': case 'B': case 's': case 'p': return 1;
+    case 'h': case 'H': return 2;
+    case 'i': case 'I': case 'l': case 'L': return 4;
+    case 'q': case 'Q': return 8;
+    case 'f': return (is_complex ? 8 : 4);  /* complex values take twice the size of one component */
+    case 'd': return (is_complex ? 16 : 8);
+    case 'g': {
+      PyErr_SetString(PyExc_ValueError, "Python does not define a standard format string size for long double ('g')..");
+      return 0;
+    }
+    case 'O': case 'P': return sizeof(void*);
+    default:
+      __Pyx_BufFmt_RaiseUnexpectedChar(ch);
+      return 0;
+    }
+}
+static size_t __Pyx_BufFmt_TypeCharToNativeSize(char ch, int is_complex) {  /* sizes taken from the compiler's C types; 0 with a ValueError set on failure */
+  switch (ch) {
+    case '?': case 'c': case 'b': case 'B': case 's': case 'p': return 1;
+    case 'h': case 'H': return sizeof(short);
+    case 'i': case 'I': return sizeof(int);
+    case 'l': case 'L': return sizeof(long);
+    #ifdef HAVE_LONG_LONG  /* without long long support, 'q'/'Q' fall through to the error case */
+    case 'q': case 'Q': return sizeof(PY_LONG_LONG);
+    #endif
+    case 'f': return sizeof(float) * (is_complex ? 2 : 1);  /* complex values take twice the size of one component */
+    case 'd': return sizeof(double) * (is_complex ? 2 : 1);
+    case 'g': return sizeof(long double) * (is_complex ? 2 : 1);
+    case 'O': case 'P': return sizeof(void*);
+    default: {
+      __Pyx_BufFmt_RaiseUnexpectedChar(ch);
+      return 0;
+    }
+  }
+}
+/* Probe structs: a leading char followed by the type of interest. The
+   struct size minus the size of the member type is what the function below
+   reports for that type. */
+typedef struct { char c; short x; } __Pyx_st_short;
+typedef struct { char c; int x; } __Pyx_st_int;
+typedef struct { char c; long x; } __Pyx_st_long;
+typedef struct { char c; float x; } __Pyx_st_float;
+typedef struct { char c; double x; } __Pyx_st_double;
+typedef struct { char c; long double x; } __Pyx_st_longdouble;
+typedef struct { char c; void *x; } __Pyx_st_void_p;
+#ifdef HAVE_LONG_LONG
+typedef struct { char c; PY_LONG_LONG x; } __Pyx_st_longlong;
+#endif
+/* Offset multiple at which a field of format character `ch` is placed in
+   native ('@') mode. `is_complex` is ignored. Yields 0 after calling
+   __Pyx_BufFmt_RaiseUnexpectedChar for any unlisted character. */
+static size_t __Pyx_BufFmt_TypeCharToAlignment(char ch, int is_complex) {
+  size_t boundary = 0;
+  CYTHON_UNUSED_VAR(is_complex);
+  switch (ch) {
+    case '?': case 'c': case 'b': case 'B': case 's': case 'p':
+      boundary = 1;
+      break;
+    case 'h': case 'H':
+      boundary = sizeof(__Pyx_st_short) - sizeof(short);
+      break;
+    case 'i': case 'I':
+      boundary = sizeof(__Pyx_st_int) - sizeof(int);
+      break;
+    case 'l': case 'L':
+      boundary = sizeof(__Pyx_st_long) - sizeof(long);
+      break;
+#ifdef HAVE_LONG_LONG
+    case 'q': case 'Q':
+      boundary = sizeof(__Pyx_st_longlong) - sizeof(PY_LONG_LONG);
+      break;
+#endif
+    case 'f':
+      boundary = sizeof(__Pyx_st_float) - sizeof(float);
+      break;
+    case 'd':
+      boundary = sizeof(__Pyx_st_double) - sizeof(double);
+      break;
+    case 'g':
+      boundary = sizeof(__Pyx_st_longdouble) - sizeof(long double);
+      break;
+    case 'P': case 'O':
+      boundary = sizeof(__Pyx_st_void_p) - sizeof(void*);
+      break;
+    default:
+      __Pyx_BufFmt_RaiseUnexpectedChar(ch);
+      break;
+  }
+  return boundary;
+}
+/* These are for computing the padding at the end of the struct to align
+   on the first member of the struct. This will probably be the same as above,
+   but we don't have any guarantees.
+ */
+/* Probe structs: the type of interest followed by a trailing char. The
+   struct size minus the size of the member type is what the function below
+   reports for that type. */
+typedef struct { short x; char c; } __Pyx_pad_short;
+typedef struct { int x; char c; } __Pyx_pad_int;
+typedef struct { long x; char c; } __Pyx_pad_long;
+typedef struct { float x; char c; } __Pyx_pad_float;
+typedef struct { double x; char c; } __Pyx_pad_double;
+typedef struct { long double x; char c; } __Pyx_pad_longdouble;
+typedef struct { void *x; char c; } __Pyx_pad_void_p;
+#ifdef HAVE_LONG_LONG
+typedef struct { PY_LONG_LONG x; char c; } __Pyx_pad_longlong;
+#endif
+/* Trailing-padding granularity associated with a first struct member of
+   format character `ch`. `is_complex` is ignored. Yields 0 after calling
+   __Pyx_BufFmt_RaiseUnexpectedChar for any unlisted character. */
+static size_t __Pyx_BufFmt_TypeCharToPadding(char ch, int is_complex) {
+  size_t tail = 0;
+  CYTHON_UNUSED_VAR(is_complex);
+  switch (ch) {
+    case '?': case 'c': case 'b': case 'B': case 's': case 'p':
+      tail = 1;
+      break;
+    case 'h': case 'H':
+      tail = sizeof(__Pyx_pad_short) - sizeof(short);
+      break;
+    case 'i': case 'I':
+      tail = sizeof(__Pyx_pad_int) - sizeof(int);
+      break;
+    case 'l': case 'L':
+      tail = sizeof(__Pyx_pad_long) - sizeof(long);
+      break;
+#ifdef HAVE_LONG_LONG
+    case 'q': case 'Q':
+      tail = sizeof(__Pyx_pad_longlong) - sizeof(PY_LONG_LONG);
+      break;
+#endif
+    case 'f':
+      tail = sizeof(__Pyx_pad_float) - sizeof(float);
+      break;
+    case 'd':
+      tail = sizeof(__Pyx_pad_double) - sizeof(double);
+      break;
+    case 'g':
+      tail = sizeof(__Pyx_pad_longdouble) - sizeof(long double);
+      break;
+    case 'P': case 'O':
+      tail = sizeof(__Pyx_pad_void_p) - sizeof(void*);
+      break;
+    default:
+      __Pyx_BufFmt_RaiseUnexpectedChar(ch);
+      break;
+  }
+  return tail;
+}
+/* Map format character `ch` to a one-letter type-group code:
+   'H' for 'c'; 'I' for signed integers and 's'/'p'; 'U' for '?' and the
+   unsigned integers; 'C' or 'R' for floats depending on `is_complex`;
+   'O' and 'P' map to themselves. Yields 0 after calling
+   __Pyx_BufFmt_RaiseUnexpectedChar for any unlisted character. */
+static char __Pyx_BufFmt_TypeCharToGroup(char ch, int is_complex) {
+  char code = 0;
+  switch (ch) {
+    case 'c':
+      code = 'H';
+      break;
+    case 'b': case 'h': case 'i':
+    case 'l': case 'q': case 's': case 'p':
+      code = 'I';
+      break;
+    case '?': case 'B': case 'H': case 'I': case 'L': case 'Q':
+      code = 'U';
+      break;
+    case 'f': case 'd': case 'g':
+      code = is_complex ? 'C' : 'R';
+      break;
+    case 'O':
+      code = 'O';
+      break;
+    case 'P':
+      code = 'P';
+      break;
+    default:
+      __Pyx_BufFmt_RaiseUnexpectedChar(ch);
+      break;
+  }
+  return code;
+}
+/* Set a ValueError describing a dtype mismatch between the pending format
+   type (ctx->enc_type / ctx->is_complex) and what ctx->head expects. When
+   the expected field is nested, the message also names the parent type and
+   the field. */
+static void __Pyx_BufFmt_RaiseExpected(__Pyx_BufFmt_Context* ctx) {
+  const char* got = __Pyx_BufFmt_DescribeTypeChar(ctx->enc_type, ctx->is_complex);
+  if (ctx->head != NULL && ctx->head->field != &ctx->root) {
+    /* Nested field: the enclosing field is one stack slot below. */
+    __Pyx_StructField* member = ctx->head->field;
+    __Pyx_StructField* owner = (ctx->head - 1)->field;
+    PyErr_Format(PyExc_ValueError,
+                 "Buffer dtype mismatch, expected '%s' but got %s in '%s.%s'",
+                 member->type->name, got,
+                 owner->type->name, member->name);
+    return;
+  }
+  {
+    /* Either nothing more was expected, or the root type itself mismatched. */
+    const int at_end = (ctx->head == NULL);
+    const char* wanted = at_end ? "end" : ctx->head->field->type->name;
+    const char* quote = at_end ? "" : "'";
+    PyErr_Format(PyExc_ValueError,
+                 "Buffer dtype mismatch, expected %s%s%s but got %s",
+                 quote, wanted, quote,
+                 got);
+  }
+}
+/* Check the pending run of ctx->enc_count items of type ctx->enc_type
+   against the expected field(s) reachable from ctx->head, advancing
+   ctx->fmt_offset and ctx->head as fields are matched.
+   Returns 0 on success (including when nothing is pending) and -1 with a
+   ValueError set on any mismatch. On success the pending type is cleared. */
+static int __Pyx_BufFmt_ProcessTypeChunk(__Pyx_BufFmt_Context* ctx) {
+  char group;
+  size_t size, offset, arraysize = 1;
+  /* Nothing has been collected yet. */
+  if (ctx->enc_type == 0) return 0;
+  /* The expected field is an array type. */
+  if (ctx->head->field->type->arraysize[0]) {
+    int i, ndim = 0;
+    /* 's'/'p' runs are treated as a one-dimensional array whose length is
+       the repeat count. */
+    if (ctx->enc_type == 's' || ctx->enc_type == 'p') {
+        ctx->is_valid_array = ctx->head->field->type->ndim == 1;
+        ndim = 1;
+        if (ctx->enc_count != ctx->head->field->type->arraysize[0]) {
+            PyErr_Format(PyExc_ValueError,
+                         "Expected a dimension of size %zu, got %zu",
+                         ctx->head->field->type->arraysize[0], ctx->enc_count);
+            return -1;
+        }
+    }
+    if (!ctx->is_valid_array) {
+      PyErr_Format(PyExc_ValueError, "Expected %d dimensions, got %d",
+                   ctx->head->field->type->ndim, ndim);
+      return -1;
+    }
+    /* Total element count is the product of all expected dimensions. */
+    for (i = 0; i < ctx->head->field->type->ndim; i++) {
+      arraysize *= ctx->head->field->type->arraysize[i];
+    }
+    ctx->is_valid_array = 0;
+    ctx->enc_count = 1;
+  }
+  group = __Pyx_BufFmt_TypeCharToGroup(ctx->enc_type, ctx->is_complex);
+  /* One iteration per item of the pending run (or per descent into a
+     complex type's fields via `continue`). */
+  do {
+    __Pyx_StructField* field = ctx->head->field;
+    __Pyx_TypeInfo* type = field->type;
+    /* '@' and '^' use the platform's sizes; other modes use fixed sizes. */
+    if (ctx->enc_packmode == '@' || ctx->enc_packmode == '^') {
+      size = __Pyx_BufFmt_TypeCharToNativeSize(ctx->enc_type, ctx->is_complex);
+    } else {
+      size = __Pyx_BufFmt_TypeCharToStandardSize(ctx->enc_type, ctx->is_complex);
+    }
+    /* Only '@' mode inserts alignment padding before the item. */
+    if (ctx->enc_packmode == '@') {
+      size_t align_at = __Pyx_BufFmt_TypeCharToAlignment(ctx->enc_type, ctx->is_complex);
+      size_t align_mod_offset;
+      if (align_at == 0) return -1;
+      align_mod_offset = ctx->fmt_offset % align_at;
+      if (align_mod_offset > 0) ctx->fmt_offset += align_at - align_mod_offset;
+      /* Record the struct alignment once, from the first item seen. */
+      if (ctx->struct_alignment == 0)
+          ctx->struct_alignment = __Pyx_BufFmt_TypeCharToPadding(ctx->enc_type,
+                                                                 ctx->is_complex);
+    }
+    if (type->size != size || type->typegroup != group) {
+      /* Expected type is in group 'C' with a field list: push a stack
+         level and compare against its first field instead. */
+      if (type->typegroup == 'C' && type->fields != NULL) {
+        size_t parent_offset = ctx->head->parent_offset + field->offset;
+        ++ctx->head;
+        ctx->head->field = type->fields;
+        ctx->head->parent_offset = parent_offset;
+        continue;
+      }
+      /* Group 'H' on either side is accepted when the sizes agree. */
+      if ((type->typegroup == 'H' || group == 'H') && type->size == size) {
+      } else {
+          __Pyx_BufFmt_RaiseExpected(ctx);
+          return -1;
+      }
+    }
+    /* The item must sit exactly where the expected field is declared. */
+    offset = ctx->head->parent_offset + field->offset;
+    if (ctx->fmt_offset != offset) {
+      PyErr_Format(PyExc_ValueError,
+                   "Buffer dtype mismatch; next field is at offset %" CYTHON_FORMAT_SSIZE_T "d but %" CYTHON_FORMAT_SSIZE_T "d expected",
+                   (Py_ssize_t)ctx->fmt_offset, (Py_ssize_t)offset);
+      return -1;
+    }
+    ctx->fmt_offset += size;
+    if (arraysize)
+      ctx->fmt_offset += (arraysize - 1) * size;
+    --ctx->enc_count;
+    /* Advance ctx->head to the next expected field, popping finished
+       levels and pushing into nested 'S' types. */
+    while (1) {
+      if (field == &ctx->root) {
+        /* The root type has been consumed; nothing more may follow. */
+        ctx->head = NULL;
+        if (ctx->enc_count != 0) {
+          __Pyx_BufFmt_RaiseExpected(ctx);
+          return -1;
+        }
+        break;
+      }
+      ctx->head->field = ++field;
+      if (field->type == NULL) {
+        /* A NULL type marks the end of this field list: pop one level. */
+        --ctx->head;
+        field = ctx->head->field;
+        continue;
+      } else if (field->type->typegroup == 'S') {
+        size_t parent_offset = ctx->head->parent_offset + field->offset;
+        /* Skip nested types whose field list is empty. */
+        if (field->type->fields->type == NULL) continue;
+        field = field->type->fields;
+        ++ctx->head;
+        ctx->head->field = field;
+        ctx->head->parent_offset = parent_offset;
+        break;
+      } else {
+        break;
+      }
+    }
+  } while (ctx->enc_count);
+  ctx->enc_type = 0;
+  ctx->is_complex = 0;
+  return 0;
+}
+/* Parse an array shape "(d0,d1,...)" starting at **tsp (which points at
+   the opening parenthesis) and check each dimension against the array type
+   expected at ctx->head.
+   On success sets ctx->is_valid_array, resets ctx->new_count to 1, stores
+   the position just past ')' in *tsp and returns 0. On failure returns -1
+   with a ValueError set (or whatever error __Pyx_BufFmt_ExpectNumber set). */
+static int
+__pyx_buffmt_parse_array(__Pyx_BufFmt_Context* ctx, const char** tsp)
+{
+    const char *ts = *tsp;
+    int i = 0, number, ndim;
+    ++ts;
+    if (ctx->new_count != 1) {
+        PyErr_SetString(PyExc_ValueError,
+                        "Cannot handle repeated arrays in format string");
+        return -1;
+    }
+    if (__Pyx_BufFmt_ProcessTypeChunk(ctx) == -1) return -1;
+    /* Flushing the pending chunk may have consumed the root type, leaving
+       no expected field to compare the shape against. */
+    if (ctx->head == NULL) {
+        PyErr_SetString(PyExc_ValueError,
+                        "Unexpected array shape in format string, expected end");
+        return -1;
+    }
+    ndim = ctx->head->field->type->ndim;
+    while (*ts && *ts != ')') {
+        switch (*ts) {
+            /* Skip whitespace; the pointer must advance or the loop would
+               spin forever on the same character. */
+            case ' ': case '\f': case '\r': case '\n': case '\t': case '\v':
+                ++ts;
+                continue;
+            default:  break;
+        }
+        number = __Pyx_BufFmt_ExpectNumber(&ts);
+        if (number == -1) return -1;
+        if (i < ndim && (size_t) number != ctx->head->field->type->arraysize[i]) {
+            PyErr_Format(PyExc_ValueError,
+                        "Expected a dimension of size %zu, got %d",
+                        ctx->head->field->type->arraysize[i], number);
+            return -1;
+        }
+        if (*ts != ',' && *ts != ')') {
+            PyErr_Format(PyExc_ValueError,
+                                "Expected a comma in format string, got '%c'", *ts);
+            return -1;
+        }
+        if (*ts == ',') ts++;
+        i++;
+    }
+    if (i != ndim) {
+        PyErr_Format(PyExc_ValueError, "Expected %d dimension(s), got %d",
+                            ctx->head->field->type->ndim, i);
+        return -1;
+    }
+    if (!*ts) {
+        PyErr_SetString(PyExc_ValueError,
+                        "Unexpected end of format string, expected ')'");
+        return -1;
+    }
+    ctx->is_valid_array = 1;
+    ctx->new_count = 1;
+    *tsp = ++ts;
+    return 0;
+}
+/* Walk the struct-style buffer format string `ts`, validating it against
+   the expected type tracked in `ctx`. Consecutive identical type characters
+   are accumulated and checked in runs via __Pyx_BufFmt_ProcessTypeChunk.
+   Returns a pointer to the terminating NUL (or just past a closing '}'
+   when called recursively for a "T{...}" struct); returns NULL with a
+   Python exception set on any error. */
+static const char* __Pyx_BufFmt_CheckString(__Pyx_BufFmt_Context* ctx, const char* ts) {
+  int got_Z = 0;
+  while (1) {
+    switch(*ts) {
+      case 0:
+        /* End of string: flush the pending run, then make sure the whole
+           expected type was consumed. */
+        if (ctx->enc_type != 0 && ctx->head == NULL) {
+          __Pyx_BufFmt_RaiseExpected(ctx);
+          return NULL;
+        }
+        if (__Pyx_BufFmt_ProcessTypeChunk(ctx) == -1) return NULL;
+        if (ctx->head != NULL) {
+          __Pyx_BufFmt_RaiseExpected(ctx);
+          return NULL;
+        }
+        return ts;
+      case ' ':
+      case '\r':
+      case '\n':
+        ++ts;
+        break;
+      case '<':
+        if (!__Pyx_Is_Little_Endian()) {
+          PyErr_SetString(PyExc_ValueError, "Little-endian buffer not supported on big-endian compiler");
+          return NULL;
+        }
+        ctx->new_packmode = '=';
+        ++ts;
+        break;
+      case '>':
+      case '!':
+        if (__Pyx_Is_Little_Endian()) {
+          PyErr_SetString(PyExc_ValueError, "Big-endian buffer not supported on little-endian compiler");
+          return NULL;
+        }
+        ctx->new_packmode = '=';
+        ++ts;
+        break;
+      case '=':
+      case '@':
+      case '^':
+        ctx->new_packmode = *ts++;
+        break;
+      case 'T':
+        {
+          /* "T{...}" struct: parse the body once per repeat count, each
+             time restarting from the character after '{'. */
+          const char* ts_after_sub;
+          size_t i, struct_count = ctx->new_count;
+          size_t struct_alignment = ctx->struct_alignment;
+          ctx->new_count = 1;
+          ++ts;
+          if (*ts != '{') {
+            PyErr_SetString(PyExc_ValueError, "Buffer acquisition: Expected '{' after 'T'");
+            return NULL;
+          }
+          if (__Pyx_BufFmt_ProcessTypeChunk(ctx) == -1) return NULL;
+          ctx->enc_type = 0;
+          ctx->enc_count = 0;
+          ctx->struct_alignment = 0;
+          ++ts;
+          ts_after_sub = ts;
+          for (i = 0; i != struct_count; ++i) {
+            ts_after_sub = __Pyx_BufFmt_CheckString(ctx, ts);
+            if (!ts_after_sub) return NULL;
+          }
+          ts = ts_after_sub;
+          /* Restore the enclosing struct's alignment if it had one. */
+          if (struct_alignment) ctx->struct_alignment = struct_alignment;
+        }
+        break;
+      case '}':
+        {
+          /* End of a struct body: flush, then pad the offset up to the
+             struct's recorded alignment. */
+          size_t alignment = ctx->struct_alignment;
+          ++ts;
+          if (__Pyx_BufFmt_ProcessTypeChunk(ctx) == -1) return NULL;
+          ctx->enc_type = 0;
+          if (alignment && ctx->fmt_offset % alignment) {
+            ctx->fmt_offset += alignment - (ctx->fmt_offset % alignment);
+          }
+        }
+        return ts;
+      case 'x':
+        /* Pad bytes: flush the pending run and skip `new_count` bytes. */
+        if (__Pyx_BufFmt_ProcessTypeChunk(ctx) == -1) return NULL;
+        ctx->fmt_offset += ctx->new_count;
+        ctx->new_count = 1;
+        ctx->enc_count = 0;
+        ctx->enc_type = 0;
+        ctx->enc_packmode = ctx->new_packmode;
+        ++ts;
+        break;
+      case 'Z':
+        /* Complex prefix: must be followed by a floating-point type. */
+        got_Z = 1;
+        ++ts;
+        if (*ts != 'f' && *ts != 'd' && *ts != 'g') {
+          __Pyx_BufFmt_RaiseUnexpectedChar('Z');
+          return NULL;
+        }
+        CYTHON_FALLTHROUGH;
+      case '?': case 'c': case 'b': case 'B': case 'h': case 'H': case 'i': case 'I':
+      case 'l': case 'L': case 'q': case 'Q':
+      case 'f': case 'd': case 'g':
+      case 'O': case 'p':
+        /* Same type, complexness and pack mode as the pending run (and not
+           right after an array shape): just extend the run. */
+        if ((ctx->enc_type == *ts) && (got_Z == ctx->is_complex) &&
+            (ctx->enc_packmode == ctx->new_packmode) && (!ctx->is_valid_array)) {
+          ctx->enc_count += ctx->new_count;
+          ctx->new_count = 1;
+          got_Z = 0;
+          ++ts;
+          break;
+        }
+        CYTHON_FALLTHROUGH;
+      case 's':
+        /* Different type: flush the previous run and start a new one. */
+        if (__Pyx_BufFmt_ProcessTypeChunk(ctx) == -1) return NULL;
+        ctx->enc_count = ctx->new_count;
+        ctx->enc_packmode = ctx->new_packmode;
+        ctx->enc_type = *ts;
+        ctx->is_complex = got_Z;
+        ++ts;
+        ctx->new_count = 1;
+        got_Z = 0;
+        break;
+      case ':':
+        /* ":name:" field label: skip to the closing colon. Stop at the
+           terminating NUL so an unterminated label cannot run the
+           pointer past the end of the format string. */
+        ++ts;
+        while (*ts && *ts != ':') ++ts;
+        if (!*ts) {
+          PyErr_SetString(PyExc_ValueError,
+                          "Unexpected end of format string, expected ':'");
+          return NULL;
+        }
+        ++ts;
+        break;
+      case '(':
+        if (__pyx_buffmt_parse_array(ctx, &ts) < 0) return NULL;
+        break;
+      default:
+        {
+          /* Anything else must be a repeat count for the next item. */
+          int number = __Pyx_BufFmt_ExpectNumber(&ts);
+          if (number == -1) return NULL;
+          ctx->new_count = (size_t)number;
+        }
+    }
+  }
+}
+
+/* BufferGetAndValidate */
+  /* Release `info` unless its buf pointer is NULL. A suboffsets pointer
+     equal to the shared __Pyx_minusones placeholder is reset to NULL before
+     the release call. */
+  static CYTHON_INLINE void __Pyx_SafeReleaseBuffer(Py_buffer* info) {
+  if (likely(info->buf != NULL)) {
+    if (info->suboffsets == __Pyx_minusones) {
+      info->suboffsets = NULL;
+    }
+    __Pyx_ReleaseBuffer(info);
+  }
+}
+/* Put `buf` into an empty state: NULL data and owner pointers, with
+   shape/strides pointing at __Pyx_zeros and suboffsets at __Pyx_minusones. */
+static void __Pyx_ZeroBuffer(Py_buffer* buf) {
+  buf->obj = NULL;
+  buf->buf = NULL;
+  buf->shape = __Pyx_zeros;
+  buf->strides = __Pyx_zeros;
+  buf->suboffsets = __Pyx_minusones;
+}
+/* Acquire a buffer from `obj` into `buf` and validate it.
+   Checks, in order: the number of dimensions equals `nd`; unless `cast` is
+   set, the format string matches `dtype` (using `stack` as parser scratch
+   space); the item size equals dtype->size.
+   Returns 0 on success. Returns -1 with an exception set on failure; the
+   buffer is zeroed if acquisition failed, otherwise released. */
+static int __Pyx__GetBufferAndValidate(
+        Py_buffer* buf, PyObject* obj,  __Pyx_TypeInfo* dtype, int flags,
+        int nd, int cast, __Pyx_BufFmt_StackElem* stack)
+{
+  int status;
+  buf->buf = NULL;
+  status = __Pyx_GetBuffer(obj, buf, flags);
+  if (unlikely(status == -1)) {
+    __Pyx_ZeroBuffer(buf);
+    return -1;
+  }
+  if (unlikely(buf->ndim != nd)) {
+    PyErr_Format(PyExc_ValueError,
+                 "Buffer has wrong number of dimensions (expected %d, got %d)",
+                 nd, buf->ndim);
+    __Pyx_SafeReleaseBuffer(buf);
+    return -1;
+  }
+  if (!cast) {
+    __Pyx_BufFmt_Context ctx;
+    __Pyx_BufFmt_Init(&ctx, stack, dtype);
+    if (__Pyx_BufFmt_CheckString(&ctx, buf->format) == NULL) {
+      __Pyx_SafeReleaseBuffer(buf);
+      return -1;
+    }
+  }
+  if (unlikely((size_t)buf->itemsize != dtype->size)) {
+    PyErr_Format(PyExc_ValueError,
+      "Item size of buffer (%" CYTHON_FORMAT_SSIZE_T "d byte%s) does not match size of '%s' (%" CYTHON_FORMAT_SSIZE_T "d byte%s)",
+      buf->itemsize, (buf->itemsize > 1) ? "s" : "",
+      dtype->name, (Py_ssize_t)dtype->size, (dtype->size > 1) ? "s" : "");
+    __Pyx_SafeReleaseBuffer(buf);
+    return -1;
+  }
+  /* Give callers a non-NULL suboffsets pointer to index. */
+  if (buf->suboffsets == NULL) {
+    buf->suboffsets = __Pyx_minusones;
+  }
+  return 0;
+}
+
+/* BufferIndexError */
+  /* Set an IndexError reporting an out-of-bounds buffer access on `axis`. */
+  static void __Pyx_RaiseBufferIndexError(int axis) {
+  PyErr_Format(PyExc_IndexError, "Out of bounds on buffer access (axis %d)", axis);
+}
+
+/* SliceTupleAndList */
+  #if CYTHON_COMPILING_IN_CPYTHON
+/* Normalise slice bounds in place against a sequence of *_length items.
+   A negative start is offset by the length and clamped to 0; a negative
+   stop is offset by the length; a stop beyond the length is clamped to it.
+   On return *_length holds stop - start, which may be zero or negative. */
+static CYTHON_INLINE void __Pyx_crop_slice(Py_ssize_t* _start, Py_ssize_t* _stop, Py_ssize_t* _length) {
+    const Py_ssize_t size = *_length;
+    Py_ssize_t lo = *_start;
+    Py_ssize_t hi = *_stop;
+    if (lo < 0) {
+        lo += size;
+        if (lo < 0) {
+            lo = 0;
+        }
+    }
+    if (hi < 0) {
+        hi += size;
+    } else if (hi > size) {
+        hi = size;
+    }
+    *_length = hi - lo;
+    *_start = lo;
+    *_stop = hi;
+}
+/* Return a new list holding src[start:stop] after cropping the bounds
+   with __Pyx_crop_slice; an empty or inverted range yields an empty list. */
+static CYTHON_INLINE PyObject* __Pyx_PyList_GetSlice(
+            PyObject* src, Py_ssize_t start, Py_ssize_t stop) {
+    Py_ssize_t count = PyList_GET_SIZE(src);
+    __Pyx_crop_slice(&start, &stop, &count);
+    if (count > 0) {
+        return __Pyx_PyList_FromArray(((PyListObject*)src)->ob_item + start, count);
+    }
+    return PyList_New(0);
+}
+/* Return src[start:stop] as built by __Pyx_PyTuple_FromArray, after
+   cropping the bounds with __Pyx_crop_slice. */
+static CYTHON_INLINE PyObject* __Pyx_PyTuple_GetSlice(
+            PyObject* src, Py_ssize_t start, Py_ssize_t stop) {
+    Py_ssize_t count = PyTuple_GET_SIZE(src);
+    PyObject **first;
+    __Pyx_crop_slice(&start, &stop, &count);
+    first = ((PyTupleObject*)src)->ob_item + start;
+    return __Pyx_PyTuple_FromArray(first, count);
+}
+#endif
+
+/* PyIntCompare */
+  /* Test `op1 == intval` with fast paths for exact ints and floats.
+     `op2` is the object compared against in the identity shortcut and in
+     the generic PyObject_RichCompare fallback (presumably the Python object
+     for `intval` -- confirm at call sites). `inplace` is unused.
+     The fast paths return 1 or 0; the fallback returns whatever
+     __Pyx_PyObject_IsTrueAndDecref yields for the comparison result. */
+  static CYTHON_INLINE int __Pyx_PyInt_BoolEqObjC(PyObject *op1, PyObject *op2, long intval, long inplace) {
+    CYTHON_MAYBE_UNUSED_VAR(intval);
+    CYTHON_UNUSED_VAR(inplace);
+    if (op1 == op2) {
+        return 1;
+    }
+    #if PY_MAJOR_VERSION < 3
+    if (likely(PyInt_CheckExact(op1))) {
+        const long b = intval;
+        long a = PyInt_AS_LONG(op1);
+        return (a == b);
+    }
+    #endif
+    #if CYTHON_USE_PYLONG_INTERNALS
+    if (likely(PyLong_CheckExact(op1))) {
+        int unequal;
+        unsigned long uintval;
+        Py_ssize_t size = __Pyx_PyLong_DigitCount(op1);
+        const digit* digits = __Pyx_PyLong_Digits(op1);
+        /* Compare signs first, then compare magnitudes digit by digit. */
+        if (intval == 0) {
+            return (__Pyx_PyLong_IsZero(op1) == 1);
+        } else if (intval < 0) {
+            if (__Pyx_PyLong_IsNonNeg(op1))
+                return 0;
+            /* Take the magnitude in unsigned arithmetic: negating the
+               signed value would overflow for LONG_MIN. */
+            uintval = 0UL - (unsigned long) intval;
+        } else {
+            if (__Pyx_PyLong_IsNeg(op1))
+                return 0;
+            uintval = (unsigned long) intval;
+        }
+        /* Pick the branch matching how many digits the magnitude needs;
+           the size check short-circuits before any digit is read. */
+#if PyLong_SHIFT * 4 < SIZEOF_LONG*8
+        if (uintval >> (PyLong_SHIFT * 4)) {
+            unequal = (size != 5) || (digits[0] != (uintval & (unsigned long) PyLong_MASK))
+                 | (digits[1] != ((uintval >> (1 * PyLong_SHIFT)) & (unsigned long) PyLong_MASK)) | (digits[2] != ((uintval >> (2 * PyLong_SHIFT)) & (unsigned long) PyLong_MASK)) | (digits[3] != ((uintval >> (3 * PyLong_SHIFT)) & (unsigned long) PyLong_MASK)) | (digits[4] != ((uintval >> (4 * PyLong_SHIFT)) & (unsigned long) PyLong_MASK));
+        } else
+#endif
+#if PyLong_SHIFT * 3 < SIZEOF_LONG*8
+        if (uintval >> (PyLong_SHIFT * 3)) {
+            unequal = (size != 4) || (digits[0] != (uintval & (unsigned long) PyLong_MASK))
+                 | (digits[1] != ((uintval >> (1 * PyLong_SHIFT)) & (unsigned long) PyLong_MASK)) | (digits[2] != ((uintval >> (2 * PyLong_SHIFT)) & (unsigned long) PyLong_MASK)) | (digits[3] != ((uintval >> (3 * PyLong_SHIFT)) & (unsigned long) PyLong_MASK));
+        } else
+#endif
+#if PyLong_SHIFT * 2 < SIZEOF_LONG*8
+        if (uintval >> (PyLong_SHIFT * 2)) {
+            unequal = (size != 3) || (digits[0] != (uintval & (unsigned long) PyLong_MASK))
+                 | (digits[1] != ((uintval >> (1 * PyLong_SHIFT)) & (unsigned long) PyLong_MASK)) | (digits[2] != ((uintval >> (2 * PyLong_SHIFT)) & (unsigned long) PyLong_MASK));
+        } else
+#endif
+#if PyLong_SHIFT * 1 < SIZEOF_LONG*8
+        if (uintval >> (PyLong_SHIFT * 1)) {
+            unequal = (size != 2) || (digits[0] != (uintval & (unsigned long) PyLong_MASK))
+                 | (digits[1] != ((uintval >> (1 * PyLong_SHIFT)) & (unsigned long) PyLong_MASK));
+        } else
+#endif
+            unequal = (size != 1) || (((unsigned long) digits[0]) != (uintval & (unsigned long) PyLong_MASK));
+        return (unequal == 0);
+    }
+    #endif
+    if (PyFloat_CheckExact(op1)) {
+        const long b = intval;
+#if CYTHON_COMPILING_IN_LIMITED_API
+        double a = __pyx_PyFloat_AsDouble(op1);
+#else
+        double a = PyFloat_AS_DOUBLE(op1);
+#endif
+        return ((double)a == (double)b);
+    }
+    return __Pyx_PyObject_IsTrueAndDecref(
+        PyObject_RichCompare(op1, op2, Py_EQ));
+}
+
+/* PyObject_GenericGetAttrNoDict */
+  #if CYTHON_USE_TYPE_SLOTS && CYTHON_USE_PYTYPE_LOOKUP && PY_VERSION_HEX < 0x03070000
+/* Set AttributeError "'<type>' object has no attribute '<name>'" for type
+   `tp` and attribute `attr_name`. Always returns NULL so callers can
+   `return` the call directly. */
+static PyObject *__Pyx_RaiseGenericGetAttributeError(PyTypeObject *tp, PyObject *attr_name) {
+    __Pyx_TypeName owner_name = __Pyx_PyType_GetName(tp);
+#if PY_MAJOR_VERSION >= 3
+    PyErr_Format(PyExc_AttributeError,
+                 "'" __Pyx_FMT_TYPENAME "' object has no attribute '%U'",
+                 owner_name, attr_name);
+#else
+    PyErr_Format(PyExc_AttributeError,
+                 "'" __Pyx_FMT_TYPENAME "' object has no attribute '%.400s'",
+                 owner_name, PyString_AS_STRING(attr_name));
+#endif
+    __Pyx_DECREF_TypeName(owner_name);
+    return NULL;
+}
+/* Attribute lookup for objects whose type has no instance dict (asserted).
+   Looks `attr_name` up on the type only; if the found object has a
+   tp_descr_get slot the result of calling it is returned, otherwise the
+   found object itself (new reference). Non-string names are delegated to
+   PyObject_GenericGetAttr. Returns NULL with AttributeError set when the
+   name is not found on the type. */
+static CYTHON_INLINE PyObject* __Pyx_PyObject_GenericGetAttrNoDict(PyObject* obj, PyObject* attr_name) {
+    PyTypeObject *tp = Py_TYPE(obj);
+    PyObject *descr;
+    descrgetfunc getter = NULL;
+    if (unlikely(!PyString_Check(attr_name))) {
+        return PyObject_GenericGetAttr(obj, attr_name);
+    }
+    assert(!tp->tp_dictoffset);
+    descr = _PyType_Lookup(tp, attr_name);
+    if (unlikely(!descr)) {
+        return __Pyx_RaiseGenericGetAttributeError(tp, attr_name);
+    }
+    /* _PyType_Lookup hands back a borrowed reference; own it. */
+    Py_INCREF(descr);
+    #if PY_MAJOR_VERSION < 3
+    if (likely(PyType_HasFeature(Py_TYPE(descr), Py_TPFLAGS_HAVE_CLASS)))
+    #endif
+        getter = Py_TYPE(descr)->tp_descr_get;
+    if (unlikely(getter)) {
+        PyObject *bound = getter(descr, obj, (PyObject *)tp);
+        Py_DECREF(descr);
+        return bound;
+    }
+    return descr;
+}
+#endif
+
+/* PyObject_GenericGetAttr */
+  #if CYTHON_USE_TYPE_SLOTS && CYTHON_USE_PYTYPE_LOOKUP && PY_VERSION_HEX < 0x03070000
+/* Attribute lookup that uses the dict-free fast path when the object's
+   type has no tp_dictoffset, and PyObject_GenericGetAttr otherwise. */
+static PyObject* __Pyx_PyObject_GenericGetAttr(PyObject* obj, PyObject* attr_name) {
+    if (likely(!Py_TYPE(obj)->tp_dictoffset)) {
+        return __Pyx_PyObject_GenericGetAttrNoDict(obj, attr_name);
+    }
+    return PyObject_GenericGetAttr(obj, attr_name);
+}
+#endif
+
+/* FixUpExtensionType */
+  #if CYTHON_USE_TYPE_SPECS
+/* Apply special "__xxx__" entries of the spec's Py_tp_members slot to an
+   already-created `type`. Compiled to a no-op when
+   PY_VERSION_HEX > 0x030900B1 or under the limited API. Otherwise, depending
+   on the version guards below, it copies the offsets for
+   "__weaklistoffset__", "__dictoffset__" and "__vectorcalloffset__" into the
+   type object and installs a member descriptor for "__module__" in tp_dict.
+   Returns 0 on success, -1 if creating or storing the "__module__"
+   descriptor fails. */
+static int __Pyx_fix_up_extension_type_from_spec(PyType_Spec *spec, PyTypeObject *type) {
+#if PY_VERSION_HEX > 0x030900B1 || CYTHON_COMPILING_IN_LIMITED_API
+    CYTHON_UNUSED_VAR(spec);
+    CYTHON_UNUSED_VAR(type);
+#else
+    /* Find the Py_tp_members slot; the list ends at a zero slot id. */
+    const PyType_Slot *slot = spec->slots;
+    while (slot && slot->slot && slot->slot != Py_tp_members)
+        slot++;
+    if (slot && slot->slot == Py_tp_members) {
+        int changed = 0;
+#if !(PY_VERSION_HEX <= 0x030900b1 && CYTHON_COMPILING_IN_CPYTHON)
+        const
+#endif
+            PyMemberDef *memb = (PyMemberDef*) slot->pfunc;
+        /* Walk the member table (terminated by a NULL name), looking only
+           at names that start with two underscores. */
+        while (memb && memb->name) {
+            if (memb->name[0] == '_' && memb->name[1] == '_') {
+#if PY_VERSION_HEX < 0x030900b1
+                if (strcmp(memb->name, "__weaklistoffset__") == 0) {
+                    assert(memb->type == T_PYSSIZET);
+                    assert(memb->flags == READONLY);
+                    type->tp_weaklistoffset = memb->offset;
+                    changed = 1;
+                }
+                else if (strcmp(memb->name, "__dictoffset__") == 0) {
+                    assert(memb->type == T_PYSSIZET);
+                    assert(memb->flags == READONLY);
+                    type->tp_dictoffset = memb->offset;
+                    changed = 1;
+                }
+#if CYTHON_METH_FASTCALL
+                else if (strcmp(memb->name, "__vectorcalloffset__") == 0) {
+                    assert(memb->type == T_PYSSIZET);
+                    assert(memb->flags == READONLY);
+#if PY_VERSION_HEX >= 0x030800b4
+                    type->tp_vectorcall_offset = memb->offset;
+#else
+                    type->tp_print = (printfunc) memb->offset;
+#endif
+                    changed = 1;
+                }
+#endif
+#else
+                /* Empty `if` so the `else if` below still parses when the
+                   branches above are compiled out. */
+                if ((0));
+#endif
+#if PY_VERSION_HEX <= 0x030900b1 && CYTHON_COMPILING_IN_CPYTHON
+                else if (strcmp(memb->name, "__module__") == 0) {
+                    PyObject *descr;
+                    assert(memb->type == T_OBJECT);
+                    assert(memb->flags == 0 || memb->flags == READONLY);
+                    descr = PyDescr_NewMember(type, memb);
+                    if (unlikely(!descr))
+                        return -1;
+                    if (unlikely(PyDict_SetItem(type->tp_dict, PyDescr_NAME(descr), descr) < 0)) {
+                        Py_DECREF(descr);
+                        return -1;
+                    }
+                    Py_DECREF(descr);
+                    changed = 1;
+                }
+#endif
+            }
+            memb++;
+        }
+        /* Notify the interpreter that the type was modified. */
+        if (changed)
+            PyType_Modified(type);
+    }
+#endif
+    return 0;
+}
+#endif
+
+/* PyObjectCallNoArg */
+  /* Call `func` with no arguments through __Pyx_PyObject_FastCall. The
+     argument pointer is offset by one into a two-slot array and the
+     __Pyx_PY_VECTORCALL_ARGUMENTS_OFFSET flag is passed with a count of 0. */
+  static CYTHON_INLINE PyObject* __Pyx_PyObject_CallNoArg(PyObject *func) {
+    PyObject *slots[2] = {NULL, NULL};
+    PyObject **argv = slots + 1;
+    return __Pyx_PyObject_FastCall(func, argv, 0 | __Pyx_PY_VECTORCALL_ARGUMENTS_OFFSET);
+}
+
+/* PyObjectGetMethod */
+  /* Look up attribute `name` on `obj` and store the result in *method.
+     Returns 1 when *method is an unbound callable that the caller must
+     invoke with `obj` as its first argument; returns 0 when *method is an
+     ordinary attribute value, or NULL after a failed lookup.
+     The fast path is only compiled when CYTHON_UNPACK_METHODS,
+     CYTHON_COMPILING_IN_CPYTHON and CYTHON_USE_PYTYPE_LOOKUP are all set;
+     otherwise a plain attribute lookup is performed. */
+  static int __Pyx_PyObject_GetMethod(PyObject *obj, PyObject *name, PyObject **method) {
+    PyObject *attr;
+#if CYTHON_UNPACK_METHODS && CYTHON_COMPILING_IN_CPYTHON && CYTHON_USE_PYTYPE_LOOKUP
+    __Pyx_TypeName type_name;
+    PyTypeObject *tp = Py_TYPE(obj);
+    PyObject *descr;
+    descrgetfunc f = NULL;
+    PyObject **dictptr, *dict;
+    int meth_found = 0;
+    assert (*method == NULL);
+    /* Types with a custom tp_getattro go through the normal lookup. */
+    if (unlikely(tp->tp_getattro != PyObject_GenericGetAttr)) {
+        attr = __Pyx_PyObject_GetAttrStr(obj, name);
+        goto try_unpack;
+    }
+    if (unlikely(tp->tp_dict == NULL) && unlikely(PyType_Ready(tp) < 0)) {
+        return 0;
+    }
+    /* Step 1: look on the type. */
+    descr = _PyType_Lookup(tp, name);
+    if (likely(descr != NULL)) {
+        Py_INCREF(descr);
+#if defined(Py_TPFLAGS_METHOD_DESCRIPTOR) && Py_TPFLAGS_METHOD_DESCRIPTOR
+        if (__Pyx_PyType_HasFeature(Py_TYPE(descr), Py_TPFLAGS_METHOD_DESCRIPTOR))
+#elif PY_MAJOR_VERSION >= 3
+        #ifdef __Pyx_CyFunction_USED
+        if (likely(PyFunction_Check(descr) || __Pyx_IS_TYPE(descr, &PyMethodDescr_Type) || __Pyx_CyFunction_Check(descr)))
+        #else
+        if (likely(PyFunction_Check(descr) || __Pyx_IS_TYPE(descr, &PyMethodDescr_Type)))
+        #endif
+#else
+        #ifdef __Pyx_CyFunction_USED
+        if (likely(PyFunction_Check(descr) || __Pyx_CyFunction_Check(descr)))
+        #else
+        if (likely(PyFunction_Check(descr)))
+        #endif
+#endif
+        {
+            /* Method-like object: it can be called unbound with obj. */
+            meth_found = 1;
+        } else {
+            /* Data descriptors are resolved immediately, before the
+               instance dict is consulted. */
+            f = Py_TYPE(descr)->tp_descr_get;
+            if (f != NULL && PyDescr_IsData(descr)) {
+                attr = f(descr, obj, (PyObject *)Py_TYPE(obj));
+                Py_DECREF(descr);
+                goto try_unpack;
+            }
+        }
+    }
+    /* Step 2: an entry in the instance dict wins over what step 1 found. */
+    dictptr = _PyObject_GetDictPtr(obj);
+    if (dictptr != NULL && (dict = *dictptr) != NULL) {
+        Py_INCREF(dict);
+        attr = __Pyx_PyDict_GetItemStr(dict, name);
+        if (attr != NULL) {
+            Py_INCREF(attr);
+            Py_DECREF(dict);
+            Py_XDECREF(descr);
+            goto try_unpack;
+        }
+        Py_DECREF(dict);
+    }
+    /* Step 3: fall back to what was found on the type. */
+    if (meth_found) {
+        *method = descr;
+        return 1;
+    }
+    if (f != NULL) {
+        attr = f(descr, obj, (PyObject *)Py_TYPE(obj));
+        Py_DECREF(descr);
+        goto try_unpack;
+    }
+    if (likely(descr != NULL)) {
+        *method = descr;
+        return 0;
+    }
+    /* Nothing found anywhere: raise AttributeError, leave *method NULL. */
+    type_name = __Pyx_PyType_GetName(tp);
+    PyErr_Format(PyExc_AttributeError,
+#if PY_MAJOR_VERSION >= 3
+                 "'" __Pyx_FMT_TYPENAME "' object has no attribute '%U'",
+                 type_name, name);
+#else
+                 "'" __Pyx_FMT_TYPENAME "' object has no attribute '%.400s'",
+                 type_name, PyString_AS_STRING(name));
+#endif
+    __Pyx_DECREF_TypeName(type_name);
+    return 0;
+#else
+    attr = __Pyx_PyObject_GetAttrStr(obj, name);
+    goto try_unpack;
+#endif
+try_unpack:
+#if CYTHON_UNPACK_METHODS
+    /* A bound method whose self is `obj` is split back into its function
+       so the caller can pass obj explicitly. */
+    if (likely(attr) && PyMethod_Check(attr) && likely(PyMethod_GET_SELF(attr) == obj)) {
+        PyObject *function = PyMethod_GET_FUNCTION(attr);
+        Py_INCREF(function);
+        Py_DECREF(attr);
+        *method = function;
+        return 1;
+    }
+#endif
+    *method = attr;
+    return 0;
+}
+
+/* PyObjectCallMethod0 */
+  /* Call obj.<method_name>() with no arguments. Uses
+     __Pyx_PyObject_GetMethod so that an unbound method is called directly
+     with `obj` as its argument. Returns the call result, or NULL if the
+     lookup or the call failed. */
+  static PyObject* __Pyx_PyObject_CallMethod0(PyObject* obj, PyObject* method_name) {
+    PyObject *callable = NULL;
+    PyObject *outcome;
+    if (likely(__Pyx_PyObject_GetMethod(obj, method_name, &callable))) {
+        /* Unbound: pass obj explicitly as the sole argument. */
+        outcome = __Pyx_PyObject_CallOneArg(callable, obj);
+    } else if (unlikely(!callable)) {
+        /* Lookup failed. */
+        return NULL;
+    } else {
+        outcome = __Pyx_PyObject_CallNoArg(callable);
+    }
+    Py_DECREF(callable);
+    return outcome;
+}
+
+/* ValidateBasesTuple */
+  #if CYTHON_COMPILING_IN_CPYTHON || CYTHON_COMPILING_IN_LIMITED_API || CYTHON_USE_TYPE_SPECS
+/* Validate the entries of the `bases` tuple from index 1 onward (index 0
+   is not examined) for an extension type named `type_name`.
+   Each examined base must be a heap type, and when `dictoffset` is 0 no
+   examined base may have a non-zero dict offset.
+   Returns 0 if all checks pass, -1 with an exception set otherwise. */
+static int __Pyx_validate_bases_tuple(const char *type_name, Py_ssize_t dictoffset, PyObject *bases) {
+    Py_ssize_t i, n;
+#if CYTHON_ASSUME_SAFE_MACROS
+    n = PyTuple_GET_SIZE(bases);
+#else
+    n = PyTuple_Size(bases);
+    if (n < 0) return -1;
+#endif
+    for (i = 1; i < n; i++)
+    {
+        /* b0 is an owned reference only under CYTHON_AVOID_BORROWED_REFS;
+           every exit path below releases it in that configuration. */
+#if CYTHON_AVOID_BORROWED_REFS
+        PyObject *b0 = PySequence_GetItem(bases, i);
+        if (!b0) return -1;
+#elif CYTHON_ASSUME_SAFE_MACROS
+        PyObject *b0 = PyTuple_GET_ITEM(bases, i);
+#else
+        PyObject *b0 = PyTuple_GetItem(bases, i);
+        if (!b0) return -1;
+#endif
+        PyTypeObject *b;
+#if PY_MAJOR_VERSION < 3
+        if (PyClass_Check(b0))
+        {
+            PyErr_Format(PyExc_TypeError, "base class '%.200s' is an old-style class",
+                         PyString_AS_STRING(((PyClassObject*)b0)->cl_name));
+#if CYTHON_AVOID_BORROWED_REFS
+            Py_DECREF(b0);
+#endif
+            return -1;
+        }
+#endif
+        b = (PyTypeObject*) b0;
+        if (!__Pyx_PyType_HasFeature(b, Py_TPFLAGS_HEAPTYPE))
+        {
+            __Pyx_TypeName b_name = __Pyx_PyType_GetName(b);
+            PyErr_Format(PyExc_TypeError,
+                "base class '" __Pyx_FMT_TYPENAME "' is not a heap type", b_name);
+            __Pyx_DECREF_TypeName(b_name);
+#if CYTHON_AVOID_BORROWED_REFS
+            Py_DECREF(b0);
+#endif
+            return -1;
+        }
+        /* The extension type has no __dict__ slot, so the base may not
+           have one either. */
+        if (dictoffset == 0)
+        {
+            Py_ssize_t b_dictoffset = 0;
+#if CYTHON_USE_TYPE_SLOTS || CYTHON_COMPILING_IN_PYPY
+            b_dictoffset = b->tp_dictoffset;
+#else
+            /* Without slot access, read the offset via the attribute; a
+               failure jumps to the shared cleanup-and-fail label below. */
+            PyObject *py_b_dictoffset = PyObject_GetAttrString((PyObject*)b, "__dictoffset__");
+            if (!py_b_dictoffset) goto dictoffset_return;
+            b_dictoffset = PyLong_AsSsize_t(py_b_dictoffset);
+            Py_DECREF(py_b_dictoffset);
+            if (b_dictoffset == -1 && PyErr_Occurred()) goto dictoffset_return;
+#endif
+            if (b_dictoffset) {
+                {
+                    __Pyx_TypeName b_name = __Pyx_PyType_GetName(b);
+                    PyErr_Format(PyExc_TypeError,
+                        "extension type '%.200s' has no __dict__ slot, "
+                        "but base type '" __Pyx_FMT_TYPENAME "' has: "
+                        "either add 'cdef dict __dict__' to the extension type "
+                        "or add '__slots__ = [...]' to the base type",
+                        type_name, b_name);
+                    __Pyx_DECREF_TypeName(b_name);
+                }
+#if !(CYTHON_USE_TYPE_SLOTS || CYTHON_COMPILING_IN_PYPY)
+              dictoffset_return:
+#endif
+#if CYTHON_AVOID_BORROWED_REFS
+                Py_DECREF(b0);
+#endif
+                return -1;
+            }
+        }
+#if CYTHON_AVOID_BORROWED_REFS
+        Py_DECREF(b0);
+#endif
+    }
+    return 0;
+}
+#endif
+
+/* PyType_Ready */
+/*
+ * Finalises the type object `t` by calling PyType_Ready().
+ *
+ * When type specs are used, when not compiling for CPython / the Limited API,
+ * or on Pyston, this is a plain call to PyType_Ready().  Otherwise the bases
+ * tuple is validated first and, on Python >= 3.5, PyType_Ready() is run with
+ * the garbage collector disabled and Py_TPFLAGS_HEAPTYPE temporarily set.
+ *
+ * Returns the result of PyType_Ready(), or -1 if the validation or the GC
+ * bookkeeping around the call fails.
+ */
+  static int __Pyx_PyType_Ready(PyTypeObject *t) {
+#if CYTHON_USE_TYPE_SPECS || !(CYTHON_COMPILING_IN_CPYTHON || CYTHON_COMPILING_IN_LIMITED_API) || defined(PYSTON_MAJOR_VERSION)
+    /* Helpers are only referenced here, not called (presumably to avoid
+     * unused-function warnings in this configuration). */
+    (void)__Pyx_PyObject_CallMethod0;
+#if CYTHON_USE_TYPE_SPECS
+    (void)__Pyx_validate_bases_tuple;
+#endif
+    return PyType_Ready(t);
+#else
+    int r;
+    PyObject *bases = __Pyx_PyType_GetSlot(t, tp_bases, PyObject*);
+    /* Reject unsuitable base classes before touching the type. */
+    if (bases && unlikely(__Pyx_validate_bases_tuple(t->tp_name, t->tp_dictoffset, bases) == -1))
+        return -1;
+#if PY_VERSION_HEX >= 0x03050000 && !defined(PYSTON_MAJOR_VERSION)
+    {
+        /* Remember whether the GC was running and switch it off while the
+         * type flags are temporarily modified below. */
+        int gc_was_enabled;
+    #if PY_VERSION_HEX >= 0x030A00b1
+        gc_was_enabled = PyGC_Disable();
+        (void)__Pyx_PyObject_CallMethod0;
+    #else
+        /* Older Pythons: go through the `gc` module's isenabled()/disable(). */
+        PyObject *ret, *py_status;
+        PyObject *gc = NULL;
+        #if PY_VERSION_HEX >= 0x030700a1 && (!CYTHON_COMPILING_IN_PYPY || PYPY_VERSION_NUM+0 >= 0x07030400)
+        gc = PyImport_GetModule(__pyx_kp_u_gc);
+        #endif
+        if (unlikely(!gc)) gc = PyImport_Import(__pyx_kp_u_gc);
+        if (unlikely(!gc)) return -1;
+        py_status = __Pyx_PyObject_CallMethod0(gc, __pyx_kp_u_isenabled);
+        if (unlikely(!py_status)) {
+            Py_DECREF(gc);
+            return -1;
+        }
+        gc_was_enabled = __Pyx_PyObject_IsTrue(py_status);
+        Py_DECREF(py_status);
+        if (gc_was_enabled > 0) {
+            ret = __Pyx_PyObject_CallMethod0(gc, __pyx_kp_u_disable);
+            if (unlikely(!ret)) {
+                Py_DECREF(gc);
+                return -1;
+            }
+            Py_DECREF(ret);
+        } else if (unlikely(gc_was_enabled == -1)) {
+            /* The truth test of isenabled()'s result itself failed. */
+            Py_DECREF(gc);
+            return -1;
+        }
+    #endif
+        /* NOTE(review): the heap-type flag is set only for the duration of
+         * PyType_Ready() and cleared again right after it; presumably this
+         * satisfies a heap-type check inside PyType_Ready() - confirm against
+         * the CPython sources. */
+        t->tp_flags |= Py_TPFLAGS_HEAPTYPE;
+#if PY_VERSION_HEX >= 0x030A0000
+        t->tp_flags |= Py_TPFLAGS_IMMUTABLETYPE;
+#endif
+#else
+        (void)__Pyx_PyObject_CallMethod0;
+#endif
+    r = PyType_Ready(t);
+#if PY_VERSION_HEX >= 0x03050000 && !defined(PYSTON_MAJOR_VERSION)
+        /* Undo the temporary flag; Py_TPFLAGS_IMMUTABLETYPE stays set. */
+        t->tp_flags &= ~Py_TPFLAGS_HEAPTYPE;
+    #if PY_VERSION_HEX >= 0x030A00b1
+        if (gc_was_enabled)
+            PyGC_Enable();
+    #else
+        if (gc_was_enabled) {
+            /* Save any error from PyType_Ready() so that calling gc.enable()
+             * does not run with an exception set or overwrite it. */
+            PyObject *tp, *v, *tb;
+            PyErr_Fetch(&tp, &v, &tb);
+            ret = __Pyx_PyObject_CallMethod0(gc, __pyx_kp_u_enable);
+            if (likely(ret || r == -1)) {
+                /* enable() worked, or PyType_Ready() had already failed:
+                 * report the saved error state. */
+                Py_XDECREF(ret);
+                PyErr_Restore(tp, v, tb);
+            } else {
+                /* Only enable() failed: keep its exception and fail. */
+                Py_XDECREF(tp);
+                Py_XDECREF(v);
+                Py_XDECREF(tb);
+                r = -1;
+            }
+        }
+        Py_DECREF(gc);
+    #endif
+    }
+#endif
+    return r;
+#endif
+}
+
+/* SetVTable */
+/*
+ * Wraps `vtable` in an unnamed capsule and stores it on `type` under the
+ * __pyx_n_s_pyx_vtable key (as an attribute in the Limited API, otherwise
+ * directly in tp_dict).  Returns 0 on success and -1 on failure.
+ */
+  static int __Pyx_SetVtable(PyTypeObject *type, void *vtable) {
+    int status;
+    PyObject *capsule = PyCapsule_New(vtable, 0, 0);
+    if (unlikely(capsule == NULL))
+        return -1;
+#if CYTHON_COMPILING_IN_LIMITED_API
+    status = PyObject_SetAttr((PyObject *) type, __pyx_n_s_pyx_vtable, capsule);
+#else
+    status = PyDict_SetItem(type->tp_dict, __pyx_n_s_pyx_vtable, capsule);
+#endif
+    /* Our own reference is no longer needed whether or not the store worked. */
+    Py_DECREF(capsule);
+    return (status < 0) ? -1 : 0;
+}
+
+/* GetVTable */
+/*
+ * Fetches the capsule stored under __pyx_n_s_pyx_vtable on `type` and returns
+ * the pointer it wraps.  Returns NULL with an exception set if the entry is
+ * missing or the capsule does not yield a pointer.
+ */
+  static void* __Pyx_GetVtable(PyTypeObject *type) {
+    void *vtable;
+    PyObject *capsule;
+#if CYTHON_COMPILING_IN_LIMITED_API
+    capsule = PyObject_GetAttr((PyObject *)type, __pyx_n_s_pyx_vtable);
+#else
+    capsule = PyObject_GetItem(type->tp_dict, __pyx_n_s_pyx_vtable);
+#endif
+    if (capsule == NULL)
+        return NULL;
+    vtable = PyCapsule_GetPointer(capsule, 0);
+    /* Make sure a NULL result is always accompanied by an exception. */
+    if (vtable == NULL && !PyErr_Occurred())
+        PyErr_SetString(PyExc_RuntimeError, "invalid vtable found for imported type");
+    Py_DECREF(capsule);
+    return vtable;
+}
+
+/* MergeVTables */
+  #if !CYTHON_COMPILING_IN_LIMITED_API
+/*
+ * Checks that the vtables of the secondary bases of `type` (entries 1..n of
+ * tp_bases) are compatible with the primary tp_base chain.
+ *
+ * For every secondary base that has a vtable, the tp_base chain of `type` is
+ * walked from the nearest ancestor upwards until an ancestor with the same
+ * vtable is found.  Ancestor vtables are looked up lazily and cached in
+ * `base_vtables`, where the sentinel `unknown` marks "not looked up yet".
+ * Reaching an ancestor without a vtable before a match is a conflict.
+ *
+ * Returns 0 on success.  Returns -1 with TypeError set on a vtable conflict,
+ * or -1 with MemoryError set if the scratch array cannot be allocated.
+ */
+static int __Pyx_MergeVtables(PyTypeObject *type) {
+    int i;
+    void** base_vtables;
+    __Pyx_TypeName tp_base_name;
+    __Pyx_TypeName base_name;
+    void* unknown = (void*)-1;
+    PyObject* bases = type->tp_bases;
+    int base_depth = 0;
+    {
+        /* Count the ancestors on the primary inheritance chain. */
+        PyTypeObject* base = type->tp_base;
+        while (base) {
+            base_depth += 1;
+            base = base->tp_base;
+        }
+    }
+    /* One extra slot so that the "unknown" sentinel can always be written. */
+    base_vtables = (void**) malloc(sizeof(void*) * (size_t)(base_depth + 1));
+    if (unlikely(!base_vtables)) {
+        /* Previously the NULL result was dereferenced on the next line. */
+        PyErr_NoMemory();
+        return -1;
+    }
+    base_vtables[0] = unknown;
+    for (i = 1; i < PyTuple_GET_SIZE(bases); i++) {
+        void* base_vtable = __Pyx_GetVtable(((PyTypeObject*)PyTuple_GET_ITEM(bases, i)));
+        if (base_vtable != NULL) {
+            int j;
+            PyTypeObject* base = type->tp_base;
+            for (j = 0; j < base_depth; j++) {
+                if (base_vtables[j] == unknown) {
+                    /* First visit of this ancestor: cache its vtable and
+                     * move the sentinel one slot further. */
+                    base_vtables[j] = __Pyx_GetVtable(base);
+                    base_vtables[j + 1] = unknown;
+                }
+                if (base_vtables[j] == base_vtable) {
+                    break;
+                } else if (base_vtables[j] == NULL) {
+                    goto bad;
+                }
+                base = base->tp_base;
+            }
+        }
+    }
+    /* Failed vtable lookups above may have left an exception behind. */
+    PyErr_Clear();
+    free(base_vtables);
+    return 0;
+bad:
+    /* `i` still indexes the secondary base that caused the conflict. */
+    tp_base_name = __Pyx_PyType_GetName(type->tp_base);
+    base_name = __Pyx_PyType_GetName((PyTypeObject*)PyTuple_GET_ITEM(bases, i));
+    PyErr_Format(PyExc_TypeError,
+        "multiple bases have vtable conflict: '" __Pyx_FMT_TYPENAME "' and '" __Pyx_FMT_TYPENAME "'", tp_base_name, base_name);
+    __Pyx_DECREF_TypeName(tp_base_name);
+    __Pyx_DECREF_TypeName(base_name);
+    free(base_vtables);
+    return -1;
+}
+#endif
+
+/* SetupReduce */
+  #if !CYTHON_COMPILING_IN_LIMITED_API
+/*
+ * Returns 1 if the __pyx_n_s_name_2 attribute of `meth` compares equal to
+ * `name`, otherwise 0.  Errors from the lookup or the comparison are cleared
+ * and reported as "not equal".
+ */
+static int __Pyx_setup_reduce_is_named(PyObject* meth, PyObject* name) {
+  int is_match;
+  PyObject *meth_name = __Pyx_PyObject_GetAttrStrNoError(meth, __pyx_n_s_name_2);
+  if (unlikely(!meth_name)) {
+      PyErr_Clear();
+      return 0;
+  }
+  is_match = PyObject_RichCompareBool(meth_name, name, Py_EQ);
+  if (unlikely(is_match < 0)) {
+      PyErr_Clear();
+      is_match = 0;
+  }
+  Py_DECREF(meth_name);
+  return is_match;
+}
+/*
+ * Installs the generated pickling helpers on the type `type_obj`.
+ *
+ * If the type still uses object's __reduce_ex__, and its __reduce__ is either
+ * object's or the generated one, the attribute named by
+ * __pyx_n_s_reduce_cython is stored in the type dict under __pyx_n_s_reduce
+ * and the original entry is deleted; the same is then done for the
+ * __pyx_n_s_setstate_cython / __pyx_n_s_setstate pair.  A type that provides
+ * its own getstate (different from object's) is left untouched.
+ *
+ * Returns 0 on success and -1 with an exception set on failure.
+ */
+static int __Pyx_setup_reduce(PyObject* type_obj) {
+    int ret = 0;
+    PyObject *object_reduce = NULL;
+    PyObject *object_getstate = NULL;
+    PyObject *object_reduce_ex = NULL;
+    PyObject *reduce = NULL;
+    PyObject *reduce_ex = NULL;
+    PyObject *reduce_cython = NULL;
+    PyObject *setstate = NULL;
+    PyObject *setstate_cython = NULL;
+    PyObject *getstate = NULL;
+    /* With CYTHON_USE_PYTYPE_LOOKUP the looked-up objects are not released at
+     * the end (see the conditional cleanup below); the attribute-based
+     * fallback returns references that are released there. */
+#if CYTHON_USE_PYTYPE_LOOKUP
+    getstate = _PyType_Lookup((PyTypeObject*)type_obj, __pyx_n_s_getstate);
+#else
+    getstate = __Pyx_PyObject_GetAttrStrNoError(type_obj, __pyx_n_s_getstate);
+    if (!getstate && PyErr_Occurred()) {
+        goto __PYX_BAD;
+    }
+#endif
+    if (getstate) {
+#if CYTHON_USE_PYTYPE_LOOKUP
+        object_getstate = _PyType_Lookup(&PyBaseObject_Type, __pyx_n_s_getstate);
+#else
+        object_getstate = __Pyx_PyObject_GetAttrStrNoError((PyObject*)&PyBaseObject_Type, __pyx_n_s_getstate);
+        if (!object_getstate && PyErr_Occurred()) {
+            goto __PYX_BAD;
+        }
+#endif
+        /* A getstate that is not the one found on object: nothing to set up. */
+        if (object_getstate != getstate) {
+            goto __PYX_GOOD;
+        }
+    }
+#if CYTHON_USE_PYTYPE_LOOKUP
+    object_reduce_ex = _PyType_Lookup(&PyBaseObject_Type, __pyx_n_s_reduce_ex); if (!object_reduce_ex) goto __PYX_BAD;
+#else
+    object_reduce_ex = __Pyx_PyObject_GetAttrStr((PyObject*)&PyBaseObject_Type, __pyx_n_s_reduce_ex); if (!object_reduce_ex) goto __PYX_BAD;
+#endif
+    reduce_ex = __Pyx_PyObject_GetAttrStr(type_obj, __pyx_n_s_reduce_ex); if (unlikely(!reduce_ex)) goto __PYX_BAD;
+    /* Only proceed when the type has not overridden reduce_ex. */
+    if (reduce_ex == object_reduce_ex) {
+#if CYTHON_USE_PYTYPE_LOOKUP
+        object_reduce = _PyType_Lookup(&PyBaseObject_Type, __pyx_n_s_reduce); if (!object_reduce) goto __PYX_BAD;
+#else
+        object_reduce = __Pyx_PyObject_GetAttrStr((PyObject*)&PyBaseObject_Type, __pyx_n_s_reduce); if (!object_reduce) goto __PYX_BAD;
+#endif
+        reduce = __Pyx_PyObject_GetAttrStr(type_obj, __pyx_n_s_reduce); if (unlikely(!reduce)) goto __PYX_BAD;
+        if (reduce == object_reduce || __Pyx_setup_reduce_is_named(reduce, __pyx_n_s_reduce_cython)) {
+            reduce_cython = __Pyx_PyObject_GetAttrStrNoError(type_obj, __pyx_n_s_reduce_cython);
+            if (likely(reduce_cython)) {
+                /* Rename the generated helper to the reduce slot in tp_dict. */
+                ret = PyDict_SetItem(((PyTypeObject*)type_obj)->tp_dict, __pyx_n_s_reduce, reduce_cython); if (unlikely(ret < 0)) goto __PYX_BAD;
+                ret = PyDict_DelItem(((PyTypeObject*)type_obj)->tp_dict, __pyx_n_s_reduce_cython); if (unlikely(ret < 0)) goto __PYX_BAD;
+            } else if (reduce == object_reduce || PyErr_Occurred()) {
+                /* A missing helper is only tolerated when `reduce` already
+                 * carries the helper's name and no lookup error occurred. */
+                goto __PYX_BAD;
+            }
+            setstate = __Pyx_PyObject_GetAttrStrNoError(type_obj, __pyx_n_s_setstate);
+            if (!setstate) PyErr_Clear();
+            if (!setstate || __Pyx_setup_reduce_is_named(setstate, __pyx_n_s_setstate_cython)) {
+                setstate_cython = __Pyx_PyObject_GetAttrStrNoError(type_obj, __pyx_n_s_setstate_cython);
+                if (likely(setstate_cython)) {
+                    /* Same renaming for the setstate helper. */
+                    ret = PyDict_SetItem(((PyTypeObject*)type_obj)->tp_dict, __pyx_n_s_setstate, setstate_cython); if (unlikely(ret < 0)) goto __PYX_BAD;
+                    ret = PyDict_DelItem(((PyTypeObject*)type_obj)->tp_dict, __pyx_n_s_setstate_cython); if (unlikely(ret < 0)) goto __PYX_BAD;
+                } else if (!setstate || PyErr_Occurred()) {
+                    goto __PYX_BAD;
+                }
+            }
+            /* tp_dict was edited directly, so notify the type machinery. */
+            PyType_Modified((PyTypeObject*)type_obj);
+        }
+    }
+    goto __PYX_GOOD;
+__PYX_BAD:
+    /* Guarantee that a failure always comes with an exception. */
+    if (!PyErr_Occurred()) {
+        __Pyx_TypeName type_obj_name =
+            __Pyx_PyType_GetName((PyTypeObject*)type_obj);
+        PyErr_Format(PyExc_RuntimeError,
+            "Unable to initialize pickling for " __Pyx_FMT_TYPENAME, type_obj_name);
+        __Pyx_DECREF_TypeName(type_obj_name);
+    }
+    ret = -1;
+__PYX_GOOD:
+#if !CYTHON_USE_PYTYPE_LOOKUP
+    Py_XDECREF(object_reduce);
+    Py_XDECREF(object_reduce_ex);
+    Py_XDECREF(object_getstate);
+    Py_XDECREF(getstate);
+#endif
+    Py_XDECREF(reduce);
+    Py_XDECREF(reduce_ex);
+    Py_XDECREF(reduce_cython);
+    Py_XDECREF(setstate);
+    Py_XDECREF(setstate_cython);
+    return ret;
+}
+#endif
+
+/* TypeImport */
+  #ifndef __PYX_HAVE_RT_ImportType_3_0_12
+#define __PYX_HAVE_RT_ImportType_3_0_12
+/*
+ * Looks up `class_name` on `module`, checks that it is a type object and
+ * compares its runtime size with `size`, the size expected by the caller.
+ *
+ * module       module object to fetch the attribute from
+ * module_name  module name, used only in error and warning messages
+ * class_name   attribute name of the type
+ * size         expected object size
+ * alignment    alignment used to compute the tolerated slack for types with
+ *              a non-zero itemsize
+ * check_size   selects whether a larger-than-expected runtime type is an
+ *              error, a warning, or accepted
+ *
+ * Returns a new reference to the type, or NULL with an exception set.
+ */
+static PyTypeObject *__Pyx_ImportType_3_0_12(PyObject *module, const char *module_name, const char *class_name,
+    size_t size, size_t alignment, enum __Pyx_ImportType_CheckSize_3_0_12 check_size)
+{
+    PyObject *result = 0;
+    char warning[200];
+    Py_ssize_t basicsize;
+    Py_ssize_t itemsize;
+#if CYTHON_COMPILING_IN_LIMITED_API
+    PyObject *py_basicsize;
+    PyObject *py_itemsize;
+#endif
+    result = PyObject_GetAttrString(module, class_name);
+    if (!result)
+        goto bad;
+    if (!PyType_Check(result)) {
+        PyErr_Format(PyExc_TypeError,
+            "%.200s.%.200s is not a type object",
+            module_name, class_name);
+        goto bad;
+    }
+#if !CYTHON_COMPILING_IN_LIMITED_API
+    basicsize = ((PyTypeObject *)result)->tp_basicsize;
+    itemsize = ((PyTypeObject *)result)->tp_itemsize;
+#else
+    /* The struct fields are not accessible here; read the sizes through the
+     * type's __basicsize__ / __itemsize__ attributes instead. */
+    py_basicsize = PyObject_GetAttrString(result, "__basicsize__");
+    if (!py_basicsize)
+        goto bad;
+    basicsize = PyLong_AsSsize_t(py_basicsize);
+    Py_DECREF(py_basicsize);
+    py_basicsize = 0;
+    if (basicsize == (Py_ssize_t)-1 && PyErr_Occurred())
+        goto bad;
+    py_itemsize = PyObject_GetAttrString(result, "__itemsize__");
+    if (!py_itemsize)
+        goto bad;
+    itemsize = PyLong_AsSsize_t(py_itemsize);
+    Py_DECREF(py_itemsize);
+    py_itemsize = 0;
+    if (itemsize == (Py_ssize_t)-1 && PyErr_Occurred())
+        goto bad;
+#endif
+    if (itemsize) {
+        /* For types with a non-zero itemsize, raise `itemsize` to at least
+         * the (possibly reduced) alignment so that the comparisons below
+         * tolerate that much slack on top of basicsize. */
+        if (size % alignment) {
+            alignment = size % alignment;
+        }
+        if (itemsize < (Py_ssize_t)alignment)
+            itemsize = (Py_ssize_t)alignment;
+    }
+    /* The runtime type is too small for the expected size: always an error. */
+    if ((size_t)(basicsize + itemsize) < size) {
+        PyErr_Format(PyExc_ValueError,
+            "%.200s.%.200s size changed, may indicate binary incompatibility. "
+            "Expected %zd from C header, got %zd from PyObject",
+            module_name, class_name, size, basicsize+itemsize);
+        goto bad;
+    }
+    /* A larger runtime type is an error or a warning, depending on check_size. */
+    if (check_size == __Pyx_ImportType_CheckSize_Error_3_0_12 &&
+            ((size_t)basicsize > size || (size_t)(basicsize + itemsize) < size)) {
+        PyErr_Format(PyExc_ValueError,
+            "%.200s.%.200s size changed, may indicate binary incompatibility. "
+            "Expected %zd from C header, got %zd-%zd from PyObject",
+            module_name, class_name, size, basicsize, basicsize+itemsize);
+        goto bad;
+    }
+    else if (check_size == __Pyx_ImportType_CheckSize_Warn_3_0_12 && (size_t)basicsize > size) {
+        PyOS_snprintf(warning, sizeof(warning),
+            "%s.%s size changed, may indicate binary incompatibility. "
+            "Expected %zd from C header, got %zd from PyObject",
+            module_name, class_name, size, basicsize);
+        /* The warning may have been turned into an exception. */
+        if (PyErr_WarnEx(NULL, warning, 0) < 0) goto bad;
+    }
+    return (PyTypeObject *)result;
+bad:
+    Py_XDECREF(result);
+    return NULL;
+}
+#endif
+
+/* FetchSharedCythonModule */
+/* Returns the module named __PYX_ABI_MODULE_NAME via
+ * __Pyx_PyImport_AddModuleRef(), passing its result through unchanged. */
+  static PyObject *__Pyx_FetchSharedCythonABIModule(void) {
+    char *abi_module_name = (char*) __PYX_ABI_MODULE_NAME;
+    return __Pyx_PyImport_AddModuleRef(abi_module_name);
+}
+
+/* FetchCommonType */
+/*
+ * Validates a type found in the shared ABI module: it must be a type object
+ * and `basicsize` must equal `expected_basicsize`.  `name` is used only for
+ * the error message.  Returns 0 if acceptable, -1 with TypeError set if not.
+ */
+  static int __Pyx_VerifyCachedType(PyObject *cached_type,
+                               const char *name,
+                               Py_ssize_t basicsize,
+                               Py_ssize_t expected_basicsize) {
+    int status = 0;
+    if (!PyType_Check(cached_type)) {
+        PyErr_Format(PyExc_TypeError,
+            "Shared Cython type %.200s is not a type object", name);
+        status = -1;
+    } else if (basicsize != expected_basicsize) {
+        PyErr_Format(PyExc_TypeError,
+            "Shared Cython type %.200s has the wrong size, try recompiling",
+            name);
+        status = -1;
+    }
+    return status;
+}
+#if !CYTHON_USE_TYPE_SPECS
+/*
+ * Returns the shared copy of `type` from the Cython ABI module, keyed by the
+ * last dotted component of type->tp_name.  If no such attribute exists yet,
+ * `type` itself is readied and registered.  Returns a new reference, or NULL
+ * with an exception set.
+ */
+static PyTypeObject* __Pyx_FetchCommonType(PyTypeObject* type) {
+    const char *last_dot;
+    const char *short_name;
+    PyTypeObject *shared_type;
+    PyObject *shared_module = __Pyx_FetchSharedCythonABIModule();
+    if (!shared_module) return NULL;
+    last_dot = strrchr(type->tp_name, '.');
+    short_name = last_dot ? last_dot + 1 : type->tp_name;
+    shared_type = (PyTypeObject*) PyObject_GetAttrString(shared_module, short_name);
+    if (shared_type) {
+        /* Already registered: accept it only if it is layout compatible. */
+        if (__Pyx_VerifyCachedType(
+              (PyObject *)shared_type,
+              short_name,
+              shared_type->tp_basicsize,
+              type->tp_basicsize) < 0) {
+            Py_DECREF(shared_type);
+            shared_type = NULL;
+        }
+    } else if (PyErr_ExceptionMatches(PyExc_AttributeError)) {
+        /* Not registered yet: publish our own type under that name. */
+        PyErr_Clear();
+        if (PyType_Ready(type) >= 0 &&
+                PyObject_SetAttrString(shared_module, short_name, (PyObject *)type) >= 0) {
+            Py_INCREF(type);
+            shared_type = type;
+        }
+    }
+    /* Any other lookup error falls through with shared_type == NULL. */
+    Py_DECREF(shared_module);
+    return shared_type;
+}
+#else
+/*
+ * Type-spec variant of the shared type lookup: returns the type registered in
+ * the Cython ABI module under the last dotted component of spec->name, or
+ * creates it from `spec` and `bases` and registers it if it does not exist.
+ * `module` is unused.  Returns a new reference, or NULL with an exception set.
+ */
+static PyTypeObject *__Pyx_FetchCommonTypeFromSpec(PyObject *module, PyType_Spec *spec, PyObject *bases) {
+    PyObject *abi_module, *cached_type = NULL;
+    const char* object_name = strrchr(spec->name, '.');
+    object_name = object_name ? object_name+1 : spec->name;
+    abi_module = __Pyx_FetchSharedCythonABIModule();
+    if (!abi_module) return NULL;
+    cached_type = PyObject_GetAttrString(abi_module, object_name);
+    if (cached_type) {
+        /* Already registered: verify that its size matches the spec. */
+        Py_ssize_t basicsize;
+#if CYTHON_COMPILING_IN_LIMITED_API
+        PyObject *py_basicsize;
+        py_basicsize = PyObject_GetAttrString(cached_type, "__basicsize__");
+        if (unlikely(!py_basicsize)) goto bad;
+        basicsize = PyLong_AsSsize_t(py_basicsize);
+        Py_DECREF(py_basicsize);
+        py_basicsize = 0;
+        if (unlikely(basicsize == (Py_ssize_t)-1) && PyErr_Occurred()) goto bad;
+#else
+        /* -1 for non-types; the verification below rejects those first. */
+        basicsize = likely(PyType_Check(cached_type)) ? ((PyTypeObject*) cached_type)->tp_basicsize : -1;
+#endif
+        if (__Pyx_VerifyCachedType(
+              cached_type,
+              object_name,
+              basicsize,
+              spec->basicsize) < 0) {
+            goto bad;
+        }
+        goto done;
+    }
+    /* Only a missing attribute means "not registered yet". */
+    if (!PyErr_ExceptionMatches(PyExc_AttributeError)) goto bad;
+    PyErr_Clear();
+    CYTHON_UNUSED_VAR(module);
+    cached_type = __Pyx_PyType_FromModuleAndSpec(abi_module, spec, bases);
+    if (unlikely(!cached_type)) goto bad;
+    if (unlikely(__Pyx_fix_up_extension_type_from_spec(spec, (PyTypeObject *) cached_type) < 0)) goto bad;
+    if (PyObject_SetAttrString(abi_module, object_name, cached_type) < 0) goto bad;
+done:
+    Py_DECREF(abi_module);
+    assert(cached_type == NULL || PyType_Check(cached_type));
+    return (PyTypeObject *) cached_type;
+bad:
+    /* Drop whatever was obtained and leave through the common exit. */
+    Py_XDECREF(cached_type);
+    cached_type = NULL;
+    goto done;
+}
+#endif
+
+/* PyVectorcallFastCallDict */
+  #if CYTHON_METH_FASTCALL
+/*
+ * Calls the vectorcall function `vc` of `func` with the positional arguments
+ * `args[0..nargs)` and the keyword arguments of the dict `kw`.  The dict is
+ * unpacked into a tuple of names plus a values array placed right after the
+ * positional arguments.  Returns the call result, or NULL with an exception
+ * set (TypeError if any key is not a string).
+ */
+static PyObject *__Pyx_PyVectorcall_FastCallDict_kw(PyObject *func, __pyx_vectorcallfunc vc, PyObject *const *args, size_t nargs, PyObject *kw)
+{
+    PyObject *result = NULL;
+    PyObject *names;
+    PyObject **call_args;
+    PyObject **kw_slots;
+    PyObject *k, *v;
+    size_t arg_idx;
+    Py_ssize_t kw_idx = 0;
+    Py_ssize_t dict_pos = 0;
+    /* Stays non-zero only while every key type has the unicode flag set. */
+    unsigned long all_keys_str = Py_TPFLAGS_UNICODE_SUBCLASS;
+    Py_ssize_t kw_count = PyDict_GET_SIZE(kw);
+
+    call_args = (PyObject **)PyMem_Malloc((nargs + (size_t)kw_count) * sizeof(args[0]));
+    if (unlikely(call_args == NULL)) {
+        PyErr_NoMemory();
+        return NULL;
+    }
+    for (arg_idx = 0; arg_idx < nargs; arg_idx++) {
+        call_args[arg_idx] = args[arg_idx];
+    }
+    names = PyTuple_New(kw_count);
+    if (unlikely(names == NULL)) {
+        PyMem_Free(call_args);
+        return NULL;
+    }
+    kw_slots = call_args + nargs;
+    while (PyDict_Next(kw, &dict_pos, &k, &v)) {
+        all_keys_str &= Py_TYPE(k)->tp_flags;
+        Py_INCREF(k);
+        Py_INCREF(v);
+        PyTuple_SET_ITEM(names, kw_idx, k);
+        kw_slots[kw_idx] = v;
+        kw_idx++;
+    }
+    if (unlikely(!all_keys_str)) {
+        PyErr_SetString(PyExc_TypeError, "keywords must be strings");
+    } else {
+        result = vc(func, call_args, nargs, names);
+    }
+    /* Release the names tuple, the value references and the scratch array. */
+    Py_DECREF(names);
+    for (kw_idx = 0; kw_idx < kw_count; kw_idx++) {
+        Py_DECREF(kw_slots[kw_idx]);
+    }
+    PyMem_Free(call_args);
+    return result;
+}
+/*
+ * Calls `vc` directly when `kw` is NULL or an empty dict; otherwise defers to
+ * __Pyx_PyVectorcall_FastCallDict_kw() to unpack the keyword dict.
+ */
+static CYTHON_INLINE PyObject *__Pyx_PyVectorcall_FastCallDict(PyObject *func, __pyx_vectorcallfunc vc, PyObject *const *args, size_t nargs, PyObject *kw)
+{
+    if (unlikely(kw != NULL) && PyDict_GET_SIZE(kw) != 0) {
+        return __Pyx_PyVectorcall_FastCallDict_kw(func, vc, args, nargs, kw);
+    }
+    return vc(func, args, nargs, NULL);
+}
+#endif
+
+/* CythonFunctionShared */
+/*
+ * Returns non-zero if `func` is a Cython function or a builtin C function
+ * whose underlying C function pointer equals `cfunc`, otherwise 0.
+ */
+  #if CYTHON_COMPILING_IN_LIMITED_API
+static CYTHON_INLINE int __Pyx__IsSameCyOrCFunction(PyObject *func, void *cfunc) {
+    /* A Cython function wraps a PyCFunction in its `func` member. */
+    if (__Pyx_CyFunction_Check(func))
+        func = ((__pyx_CyFunctionObject*)func)->func;
+    else if (!PyCFunction_Check(func))
+        return 0;
+    return PyCFunction_GetFunction(func) == (PyCFunction) cfunc;
+}
+#else
+static CYTHON_INLINE int __Pyx__IsSameCyOrCFunction(PyObject *func, void *cfunc) {
+    if (!__Pyx_CyOrPyCFunction_Check(func))
+        return 0;
+    return __Pyx_CyOrPyCFunction_GET_FUNCTION(func) == (PyCFunction) cfunc;
+}
+#endif
+/*
+ * Replaces the class object stored on the Cython function `f` with a new
+ * reference to `classobj` (or NULL), releasing the previous value.
+ */
+static CYTHON_INLINE void __Pyx__CyFunction_SetClassObj(__pyx_CyFunctionObject* f, PyObject* classobj) {
+#if PY_VERSION_HEX < 0x030900B1 || CYTHON_COMPILING_IN_LIMITED_API
+    /* Stored wherever __Pyx_CyFunction_GetClassObj() points to. */
+    __Pyx_Py_XDECREF_SET(
+        __Pyx_CyFunction_GetClassObj(f),
+            ((classobj) ? __Pyx_NewRef(classobj) : NULL));
+#else
+    /* Stored in the mm_class field of the PyCMethodObject header. */
+    __Pyx_Py_XDECREF_SET(
+        ((PyCMethodObject *) (f))->mm_class,
+        (PyTypeObject*)((classobj) ? __Pyx_NewRef(classobj) : NULL));
+#endif
+}
+/*
+ * Getter for `__doc__` / `func_doc`.  The docstring object is created on
+ * first access and cached in op->func_doc.  Returns a new reference, or NULL
+ * with an exception set.
+ */
+static PyObject *
+__Pyx_CyFunction_get_doc(__pyx_CyFunctionObject *op, void *closure)
+{
+    CYTHON_UNUSED_VAR(closure);
+    if (unlikely(op->func_doc == NULL)) {
+#if CYTHON_COMPILING_IN_LIMITED_API
+        op->func_doc = PyObject_GetAttrString(op->func, "__doc__");
+        if (unlikely(!op->func_doc)) return NULL;
+#else
+        /* Build the string from the C docstring of the method definition. */
+        if (((PyCFunctionObject*)op)->m_ml->ml_doc) {
+#if PY_MAJOR_VERSION >= 3
+            op->func_doc = PyUnicode_FromString(((PyCFunctionObject*)op)->m_ml->ml_doc);
+#else
+            op->func_doc = PyString_FromString(((PyCFunctionObject*)op)->m_ml->ml_doc);
+#endif
+            if (unlikely(op->func_doc == NULL))
+                return NULL;
+        } else {
+            /* No docstring: return None without caching it. */
+            Py_INCREF(Py_None);
+            return Py_None;
+        }
+#endif
+    }
+    Py_INCREF(op->func_doc);
+    return op->func_doc;
+}
+/*
+ * Setter for `__doc__` / `func_doc`.  Deleting the attribute (value == NULL)
+ * stores None.  Always returns 0.
+ */
+static int
+__Pyx_CyFunction_set_doc(__pyx_CyFunctionObject *op, PyObject *value, void *context)
+{
+    PyObject *new_doc;
+    CYTHON_UNUSED_VAR(context);
+    new_doc = (value != NULL) ? value : Py_None;
+    Py_INCREF(new_doc);
+    __Pyx_Py_XDECREF_SET(op->func_doc, new_doc);
+    return 0;
+}
+/*
+ * Getter for `__name__` / `func_name`.  The name object is created on first
+ * access and cached in op->func_name.  Returns a new reference, or NULL with
+ * an exception set.
+ */
+static PyObject *
+__Pyx_CyFunction_get_name(__pyx_CyFunctionObject *op, void *context)
+{
+    CYTHON_UNUSED_VAR(context);
+    if (unlikely(op->func_name == NULL)) {
+#if CYTHON_COMPILING_IN_LIMITED_API
+        op->func_name = PyObject_GetAttrString(op->func, "__name__");
+#elif PY_MAJOR_VERSION >= 3
+        /* Interned string built from the C name of the method definition. */
+        op->func_name = PyUnicode_InternFromString(((PyCFunctionObject*)op)->m_ml->ml_name);
+#else
+        op->func_name = PyString_InternFromString(((PyCFunctionObject*)op)->m_ml->ml_name);
+#endif
+        if (unlikely(op->func_name == NULL))
+            return NULL;
+    }
+    Py_INCREF(op->func_name);
+    return op->func_name;
+}
+/*
+ * Setter for `__name__` / `func_name`.  Only string objects are accepted and
+ * the attribute cannot be deleted.  Returns 0 on success, -1 with TypeError
+ * set otherwise.
+ */
+static int
+__Pyx_CyFunction_set_name(__pyx_CyFunctionObject *op, PyObject *value, void *context)
+{
+    int is_string;
+    CYTHON_UNUSED_VAR(context);
+#if PY_MAJOR_VERSION >= 3
+    is_string = (value != NULL) && PyUnicode_Check(value);
+#else
+    is_string = (value != NULL) && PyString_Check(value);
+#endif
+    if (unlikely(!is_string)) {
+        PyErr_SetString(PyExc_TypeError,
+                        "__name__ must be set to a string object");
+        return -1;
+    }
+    Py_INCREF(value);
+    __Pyx_Py_XDECREF_SET(op->func_name, value);
+    return 0;
+}
+/* Getter for `__qualname__`: returns a new reference to op->func_qualname. */
+static PyObject *
+__Pyx_CyFunction_get_qualname(__pyx_CyFunctionObject *op, void *context)
+{
+    PyObject *qualname;
+    CYTHON_UNUSED_VAR(context);
+    qualname = op->func_qualname;
+    Py_INCREF(qualname);
+    return qualname;
+}
+/*
+ * Setter for `__qualname__`.  Only string objects are accepted and the
+ * attribute cannot be deleted.  Returns 0 on success, -1 with TypeError set
+ * otherwise.
+ */
+static int
+__Pyx_CyFunction_set_qualname(__pyx_CyFunctionObject *op, PyObject *value, void *context)
+{
+    int is_string;
+    CYTHON_UNUSED_VAR(context);
+#if PY_MAJOR_VERSION >= 3
+    is_string = (value != NULL) && PyUnicode_Check(value);
+#else
+    is_string = (value != NULL) && PyString_Check(value);
+#endif
+    if (unlikely(!is_string)) {
+        PyErr_SetString(PyExc_TypeError,
+                        "__qualname__ must be set to a string object");
+        return -1;
+    }
+    Py_INCREF(value);
+    __Pyx_Py_XDECREF_SET(op->func_qualname, value);
+    return 0;
+}
+/*
+ * Getter for `__dict__` / `func_dict`.  The dict is created on first access
+ * and cached in op->func_dict.  Returns a new reference, or NULL on failure.
+ */
+static PyObject *
+__Pyx_CyFunction_get_dict(__pyx_CyFunctionObject *op, void *context)
+{
+    PyObject *dict;
+    CYTHON_UNUSED_VAR(context);
+    dict = op->func_dict;
+    if (unlikely(dict == NULL)) {
+        dict = PyDict_New();
+        if (unlikely(dict == NULL))
+            return NULL;
+        op->func_dict = dict;
+    }
+    Py_INCREF(dict);
+    return dict;
+}
+/*
+ * Setter for `__dict__` / `func_dict`.  The value must be a dict and the
+ * attribute cannot be deleted.  Returns 0 on success, -1 with TypeError set
+ * otherwise.
+ */
+static int
+__Pyx_CyFunction_set_dict(__pyx_CyFunctionObject *op, PyObject *value, void *context)
+{
+    const char *problem = NULL;
+    CYTHON_UNUSED_VAR(context);
+    if (unlikely(value == NULL)) {
+        problem = "function's dictionary may not be deleted";
+    } else if (unlikely(!PyDict_Check(value))) {
+        problem = "setting function's dictionary to a non-dict";
+    }
+    if (problem != NULL) {
+        PyErr_SetString(PyExc_TypeError, problem);
+        return -1;
+    }
+    Py_INCREF(value);
+    __Pyx_Py_XDECREF_SET(op->func_dict, value);
+    return 0;
+}
+/* Getter for `__globals__` / `func_globals`: returns a new reference to
+ * op->func_globals. */
+static PyObject *
+__Pyx_CyFunction_get_globals(__pyx_CyFunctionObject *op, void *context)
+{
+    PyObject *globals;
+    CYTHON_UNUSED_VAR(context);
+    globals = op->func_globals;
+    Py_INCREF(globals);
+    return globals;
+}
+/* Getter for `__closure__` / `func_closure`: always returns None. */
+static PyObject *
+__Pyx_CyFunction_get_closure(__pyx_CyFunctionObject *op, void *context)
+{
+    CYTHON_UNUSED_VAR(op);
+    CYTHON_UNUSED_VAR(context);
+    Py_RETURN_NONE;
+}
+/* Getter for `__code__` / `func_code`: returns a new reference to
+ * op->func_code, or None if no code object is attached. */
+static PyObject *
+__Pyx_CyFunction_get_code(__pyx_CyFunctionObject *op, void *context)
+{
+    PyObject *code;
+    CYTHON_UNUSED_VAR(context);
+    code = op->func_code;
+    if (code == NULL) {
+        code = Py_None;
+    }
+    Py_INCREF(code);
+    return code;
+}
+/*
+ * Calls op->defaults_getter and stores items 0 and 1 of its result in
+ * op->defaults_tuple and op->defaults_kwdict respectively (each as an owned
+ * reference).  Returns 0 on success, -1 on failure.
+ */
+static int
+__Pyx_CyFunction_init_defaults(__pyx_CyFunctionObject *op) {
+    int result = 0;
+    PyObject *res = op->defaults_getter((PyObject *) op);
+    if (unlikely(!res))
+        return -1;
+    #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS
+    /* Borrowed items from the result tuple, turned into owned references. */
+    op->defaults_tuple = PyTuple_GET_ITEM(res, 0);
+    Py_INCREF(op->defaults_tuple);
+    op->defaults_kwdict = PyTuple_GET_ITEM(res, 1);
+    Py_INCREF(op->defaults_kwdict);
+    #else
+    /* Checked item access; if item 1 fails, defaults_tuple remains set. */
+    op->defaults_tuple = __Pyx_PySequence_ITEM(res, 0);
+    if (unlikely(!op->defaults_tuple)) result = -1;
+    else {
+        op->defaults_kwdict = __Pyx_PySequence_ITEM(res, 1);
+        if (unlikely(!op->defaults_kwdict)) result = -1;
+    }
+    #endif
+    Py_DECREF(res);
+    return result;
+}
+/*
+ * Setter for `__defaults__` / `func_defaults`.
+ *
+ * Accepts a tuple or None; deleting the attribute (value == NULL) stores
+ * None.  A RuntimeWarning is issued because the stored value is not used for
+ * actual calls.  Returns 0 on success.  Returns -1 with TypeError set for an
+ * invalid value, or -1 with the warning's exception set if the warning was
+ * turned into an error; in both failure cases the attribute is unchanged.
+ */
+static int
+__Pyx_CyFunction_set_defaults(__pyx_CyFunctionObject *op, PyObject* value, void *context) {
+    CYTHON_UNUSED_VAR(context);
+    if (!value) {
+        value = Py_None;
+    } else if (unlikely(value != Py_None && !PyTuple_Check(value))) {
+        PyErr_SetString(PyExc_TypeError,
+                        "__defaults__ must be set to a tuple object");
+        return -1;
+    }
+    /* PyErr_WarnEx() returns -1 when the warning is raised as an exception;
+     * returning 0 in that case would leave an exception pending. */
+    if (unlikely(PyErr_WarnEx(PyExc_RuntimeWarning, "changes to cyfunction.__defaults__ will not "
+                 "currently affect the values used in function calls", 1) < 0))
+        return -1;
+    Py_INCREF(value);
+    __Pyx_Py_XDECREF_SET(op->defaults_tuple, value);
+    return 0;
+}
+/*
+ * Getter for `__defaults__` / `func_defaults`.  If no value is cached yet, it
+ * is computed through op->defaults_getter when one is set; without a getter
+ * the result is None.  Returns a new reference, or NULL on failure.
+ */
+static PyObject *
+__Pyx_CyFunction_get_defaults(__pyx_CyFunctionObject *op, void *context) {
+    PyObject *defaults;
+    CYTHON_UNUSED_VAR(context);
+    defaults = op->defaults_tuple;
+    if (likely(defaults)) {
+        Py_INCREF(defaults);
+        return defaults;
+    }
+    if (!op->defaults_getter) {
+        Py_RETURN_NONE;
+    }
+    if (unlikely(__Pyx_CyFunction_init_defaults(op) < 0))
+        return NULL;
+    defaults = op->defaults_tuple;
+    Py_INCREF(defaults);
+    return defaults;
+}
+/*
+ * Setter for `__kwdefaults__`.
+ *
+ * Accepts a dict or None; deleting the attribute (value == NULL) stores None.
+ * A RuntimeWarning is issued because the stored value is not used for actual
+ * calls.  Returns 0 on success.  Returns -1 with TypeError set for an invalid
+ * value, or -1 with the warning's exception set if the warning was turned
+ * into an error; in both failure cases the attribute is unchanged.
+ */
+static int
+__Pyx_CyFunction_set_kwdefaults(__pyx_CyFunctionObject *op, PyObject* value, void *context) {
+    CYTHON_UNUSED_VAR(context);
+    if (!value) {
+        value = Py_None;
+    } else if (unlikely(value != Py_None && !PyDict_Check(value))) {
+        PyErr_SetString(PyExc_TypeError,
+                        "__kwdefaults__ must be set to a dict object");
+        return -1;
+    }
+    /* PyErr_WarnEx() returns -1 when the warning is raised as an exception;
+     * returning 0 in that case would leave an exception pending. */
+    if (unlikely(PyErr_WarnEx(PyExc_RuntimeWarning, "changes to cyfunction.__kwdefaults__ will not "
+                 "currently affect the values used in function calls", 1) < 0))
+        return -1;
+    Py_INCREF(value);
+    __Pyx_Py_XDECREF_SET(op->defaults_kwdict, value);
+    return 0;
+}
+/*
+ * Getter for `__kwdefaults__`.  If no value is cached yet, it is computed
+ * through op->defaults_getter when one is set; without a getter the result is
+ * None.  Returns a new reference, or NULL on failure.
+ */
+static PyObject *
+__Pyx_CyFunction_get_kwdefaults(__pyx_CyFunctionObject *op, void *context) {
+    PyObject *kwdefaults;
+    CYTHON_UNUSED_VAR(context);
+    kwdefaults = op->defaults_kwdict;
+    if (likely(kwdefaults)) {
+        Py_INCREF(kwdefaults);
+        return kwdefaults;
+    }
+    if (!op->defaults_getter) {
+        Py_RETURN_NONE;
+    }
+    if (unlikely(__Pyx_CyFunction_init_defaults(op) < 0))
+        return NULL;
+    kwdefaults = op->defaults_kwdict;
+    Py_INCREF(kwdefaults);
+    return kwdefaults;
+}
+/*
+ * Setter for `__annotations__`.  A dict is stored as given; None or deletion
+ * resets the stored value to NULL.  Returns 0 on success, -1 with TypeError
+ * set for any other value.
+ */
+static int
+__Pyx_CyFunction_set_annotations(__pyx_CyFunctionObject *op, PyObject* value, void *context) {
+    PyObject *annotations = NULL;
+    CYTHON_UNUSED_VAR(context);
+    if (value != NULL && value != Py_None) {
+        if (unlikely(!PyDict_Check(value))) {
+            PyErr_SetString(PyExc_TypeError,
+                            "__annotations__ must be set to a dict object");
+            return -1;
+        }
+        annotations = value;
+        Py_INCREF(annotations);
+    }
+    __Pyx_Py_XDECREF_SET(op->func_annotations, annotations);
+    return 0;
+}
+/*
+ * Getter for `__annotations__`.  An empty dict is created on first access and
+ * cached in op->func_annotations.  Returns a new reference, or NULL on
+ * failure.
+ */
+static PyObject *
+__Pyx_CyFunction_get_annotations(__pyx_CyFunctionObject *op, void *context) {
+    PyObject *annotations;
+    CYTHON_UNUSED_VAR(context);
+    annotations = op->func_annotations;
+    if (unlikely(annotations == NULL)) {
+        annotations = PyDict_New();
+        if (unlikely(annotations == NULL))
+            return NULL;
+        op->func_annotations = annotations;
+    }
+    Py_INCREF(annotations);
+    return annotations;
+}
+/*
+ * Getter for `_is_coroutine`.
+ *
+ * Returns the value cached in op->func_is_coroutine if present.  Otherwise,
+ * for functions flagged __Pyx_CYFUNCTION_COROUTINE on Python >= 3.5, the
+ * marker attribute named by __pyx_n_s_is_coroutine is imported from the
+ * module named by __pyx_n_s_asyncio_coroutines and cached.  If that import or
+ * lookup fails, the error is cleared and a plain bool reflecting the flag is
+ * cached instead.  Returns a new reference, or NULL on failure.
+ */
+static PyObject *
+__Pyx_CyFunction_get_is_coroutine(__pyx_CyFunctionObject *op, void *context) {
+    int is_coroutine;
+    CYTHON_UNUSED_VAR(context);
+    if (op->func_is_coroutine) {
+        return __Pyx_NewRef(op->func_is_coroutine);
+    }
+    is_coroutine = op->flags & __Pyx_CYFUNCTION_COROUTINE;
+#if PY_VERSION_HEX >= 0x03050000
+    if (is_coroutine) {
+        PyObject *module, *fromlist, *marker = __pyx_n_s_is_coroutine;
+        fromlist = PyList_New(1);
+        if (unlikely(!fromlist)) return NULL;
+        /* The list item slot takes over this reference. */
+        Py_INCREF(marker);
+#if CYTHON_ASSUME_SAFE_MACROS
+        PyList_SET_ITEM(fromlist, 0, marker);
+#else
+        if (unlikely(PyList_SetItem(fromlist, 0, marker) < 0)) {
+            /* PyList_SetItem() steals the item reference even on failure, so
+             * `marker` must not be released a second time here. */
+            Py_DECREF(fromlist);
+            return NULL;
+        }
+#endif
+        module = PyImport_ImportModuleLevelObject(__pyx_n_s_asyncio_coroutines, NULL, NULL, fromlist, 0);
+        Py_DECREF(fromlist);
+        if (unlikely(!module)) goto ignore;
+        op->func_is_coroutine = __Pyx_PyObject_GetAttrStr(module, marker);
+        Py_DECREF(module);
+        if (likely(op->func_is_coroutine)) {
+            return __Pyx_NewRef(op->func_is_coroutine);
+        }
+ignore:
+        /* Import or lookup failed: fall back to the bool below. */
+        PyErr_Clear();
+    }
+#endif
+    op->func_is_coroutine = __Pyx_PyBool_FromLong(is_coroutine);
+    return __Pyx_NewRef(op->func_is_coroutine);
+}
+#if CYTHON_COMPILING_IN_LIMITED_API
+/* Getter for `__module__` (Limited API only): forwards to the `__module__`
+ * attribute of the wrapped op->func object. */
+static PyObject *
+__Pyx_CyFunction_get_module(__pyx_CyFunctionObject *op, void *context) {
+    CYTHON_UNUSED_VAR(context);
+    return PyObject_GetAttrString(op->func, "__module__");
+}
+/* Setter for `__module__` (Limited API only): forwards the assignment to the
+ * wrapped op->func object and returns its status. */
+static int
+__Pyx_CyFunction_set_module(__pyx_CyFunctionObject *op, PyObject* value, void *context) {
+    CYTHON_UNUSED_VAR(context);
+    return PyObject_SetAttrString(op->func, "__module__", value);
+}
+#endif
+/*
+ * Attribute table of the Cython function type: maps each attribute name to
+ * its getter and setter.  Most attributes are listed under both a `func_*`
+ * name and a dunder name sharing the same accessors; entries with a 0 setter
+ * are read-only.  The table ends with an all-zero sentinel entry.
+ */
+static PyGetSetDef __pyx_CyFunction_getsets[] = {
+    {(char *) "func_doc", (getter)__Pyx_CyFunction_get_doc, (setter)__Pyx_CyFunction_set_doc, 0, 0},
+    {(char *) "__doc__",  (getter)__Pyx_CyFunction_get_doc, (setter)__Pyx_CyFunction_set_doc, 0, 0},
+    {(char *) "func_name", (getter)__Pyx_CyFunction_get_name, (setter)__Pyx_CyFunction_set_name, 0, 0},
+    {(char *) "__name__", (getter)__Pyx_CyFunction_get_name, (setter)__Pyx_CyFunction_set_name, 0, 0},
+    {(char *) "__qualname__", (getter)__Pyx_CyFunction_get_qualname, (setter)__Pyx_CyFunction_set_qualname, 0, 0},
+    {(char *) "func_dict", (getter)__Pyx_CyFunction_get_dict, (setter)__Pyx_CyFunction_set_dict, 0, 0},
+    {(char *) "__dict__", (getter)__Pyx_CyFunction_get_dict, (setter)__Pyx_CyFunction_set_dict, 0, 0},
+    {(char *) "func_globals", (getter)__Pyx_CyFunction_get_globals, 0, 0, 0},
+    {(char *) "__globals__", (getter)__Pyx_CyFunction_get_globals, 0, 0, 0},
+    {(char *) "func_closure", (getter)__Pyx_CyFunction_get_closure, 0, 0, 0},
+    {(char *) "__closure__", (getter)__Pyx_CyFunction_get_closure, 0, 0, 0},
+    {(char *) "func_code", (getter)__Pyx_CyFunction_get_code, 0, 0, 0},
+    {(char *) "__code__", (getter)__Pyx_CyFunction_get_code, 0, 0, 0},
+    {(char *) "func_defaults", (getter)__Pyx_CyFunction_get_defaults, (setter)__Pyx_CyFunction_set_defaults, 0, 0},
+    {(char *) "__defaults__", (getter)__Pyx_CyFunction_get_defaults, (setter)__Pyx_CyFunction_set_defaults, 0, 0},
+    {(char *) "__kwdefaults__", (getter)__Pyx_CyFunction_get_kwdefaults, (setter)__Pyx_CyFunction_set_kwdefaults, 0, 0},
+    {(char *) "__annotations__", (getter)__Pyx_CyFunction_get_annotations, (setter)__Pyx_CyFunction_set_annotations, 0, 0},
+    {(char *) "_is_coroutine", (getter)__Pyx_CyFunction_get_is_coroutine, 0, 0, 0},
+#if CYTHON_COMPILING_IN_LIMITED_API
+    /* In the Limited API `__module__` is a computed attribute rather than a
+     * struct member (see the members table). */
+    {"__module__", (getter)__Pyx_CyFunction_get_module, (setter)__Pyx_CyFunction_set_module, 0, 0},
+#endif
+    {0, 0, 0, 0, 0}
+};
+/* Member table of the CyFunction type.
+ * Besides `__module__`, the type-spec build uses the special
+ * `__dictoffset__`, `__vectorcalloffset__` and `__weaklistoffset__` entries to
+ * tell the type machinery where the corresponding fields live inside the
+ * object struct.  Which struct is used for offsetof() depends on the build
+ * configuration selected by the preprocessor branches below. */
+static PyMemberDef __pyx_CyFunction_members[] = {
+#if !CYTHON_COMPILING_IN_LIMITED_API
+    {(char *) "__module__", T_OBJECT, offsetof(PyCFunctionObject, m_module), 0, 0},
+#endif
+#if CYTHON_USE_TYPE_SPECS
+    {(char *) "__dictoffset__", T_PYSSIZET, offsetof(__pyx_CyFunctionObject, func_dict), READONLY, 0},
+#if CYTHON_METH_FASTCALL
+#if CYTHON_BACKPORT_VECTORCALL
+    {(char *) "__vectorcalloffset__", T_PYSSIZET, offsetof(__pyx_CyFunctionObject, func_vectorcall), READONLY, 0},
+#else
+#if !CYTHON_COMPILING_IN_LIMITED_API
+    {(char *) "__vectorcalloffset__", T_PYSSIZET, offsetof(PyCFunctionObject, vectorcall), READONLY, 0},
+#endif
+#endif
+#endif
+#if PY_VERSION_HEX < 0x030500A0 || CYTHON_COMPILING_IN_LIMITED_API
+    {(char *) "__weaklistoffset__", T_PYSSIZET, offsetof(__pyx_CyFunctionObject, func_weakreflist), READONLY, 0},
+#else
+    {(char *) "__weaklistoffset__", T_PYSSIZET, offsetof(PyCFunctionObject, m_weakreflist), READONLY, 0},
+#endif
+#endif
+    /* Sentinel entry terminating the table. */
+    {0, 0, 0,  0, 0}
+};
+/* `__reduce__` implementation: returns the function's name as a string.
+ * On Python 3 this is a new reference to the stored qualified name; on
+ * Python 2 it is a fresh string built from the method definition's ml_name. */
+static PyObject *
+__Pyx_CyFunction_reduce(__pyx_CyFunctionObject *m, PyObject *args)
+{
+#if PY_MAJOR_VERSION >= 3
+    PyObject *qualname = m->func_qualname;
+    CYTHON_UNUSED_VAR(args);
+    Py_INCREF(qualname);
+    return qualname;
+#else
+    const char *ml_name = ((PyCFunctionObject*)m)->m_ml->ml_name;
+    CYTHON_UNUSED_VAR(args);
+    return PyString_FromString(ml_name);
+#endif
+}
+/* Method table of the CyFunction type: only `__reduce__` is provided. */
+static PyMethodDef __pyx_CyFunction_methods[] = {
+    {"__reduce__", (PyCFunction)__Pyx_CyFunction_reduce, METH_VARARGS, 0},
+    /* Sentinel entry terminating the table. */
+    {0, 0, 0, 0}
+};
+/* Lvalue accessor for the weak-reference list head of a CyFunction.
+ * Old Python versions and Limited API builds keep it in the CyFunction's own
+ * `func_weakreflist` field; otherwise the `m_weakreflist` field of the
+ * PyCFunctionObject view of the object is used. */
+#if PY_VERSION_HEX < 0x030500A0 || CYTHON_COMPILING_IN_LIMITED_API
+#define __Pyx_CyFunction_weakreflist(cyfunc) ((cyfunc)->func_weakreflist)
+#else
+#define __Pyx_CyFunction_weakreflist(cyfunc) (((PyCFunctionObject*)cyfunc)->m_weakreflist)
+#endif
+/* Initialise a freshly allocated CyFunction object.
+ *
+ * op       - the object to initialise; NULL is passed through as NULL so the
+ *            allocation call can be nested directly in the argument list.
+ * ml       - method definition providing the C entry point and call flags.
+ * flags    - __Pyx_CYFUNCTION_* flags stored on the object.
+ * qualname - qualified name (required; a reference is taken).
+ * closure, module, code - optional objects (may be NULL; references taken).
+ * globals  - globals object (required; a reference is taken).
+ *
+ * Returns `op` cast to PyObject* on success, or NULL on failure.
+ */
+static PyObject *__Pyx_CyFunction_Init(__pyx_CyFunctionObject *op, PyMethodDef *ml, int flags, PyObject* qualname,
+                                       PyObject *closure, PyObject *module, PyObject* globals, PyObject* code) {
+#if !CYTHON_COMPILING_IN_LIMITED_API
+    PyCFunctionObject *cf = (PyCFunctionObject*) op;
+#endif
+    if (unlikely(op == NULL))
+        return NULL;
+#if CYTHON_COMPILING_IN_LIMITED_API
+    /* Limited API: the struct layout of PyCFunctionObject is not accessible,
+     * so a real builtin function object is created and kept in op->func. */
+    op->func = PyCFunction_NewEx(ml, (PyObject*)op, module);
+    if (unlikely(!op->func)) return NULL;
+#endif
+    op->flags = flags;
+    __Pyx_CyFunction_weakreflist(op) = NULL;
+#if !CYTHON_COMPILING_IN_LIMITED_API
+    cf->m_ml = ml;
+    /* m_self points back at the function object itself (not reference-counted here). */
+    cf->m_self = (PyObject *) op;
+#endif
+    Py_XINCREF(closure);
+    op->func_closure = closure;
+#if !CYTHON_COMPILING_IN_LIMITED_API
+    Py_XINCREF(module);
+    cf->m_module = module;
+#endif
+    op->func_dict = NULL;
+    op->func_name = NULL;
+    Py_INCREF(qualname);
+    op->func_qualname = qualname;
+    op->func_doc = NULL;
+#if PY_VERSION_HEX < 0x030900B1 || CYTHON_COMPILING_IN_LIMITED_API
+    op->func_classobj = NULL;
+#else
+    ((PyCMethodObject*)op)->mm_class = NULL;
+#endif
+    op->func_globals = globals;
+    Py_INCREF(op->func_globals);
+    Py_XINCREF(code);
+    op->func_code = code;
+    /* No default-argument storage yet; see __Pyx_CyFunction_InitDefaults(). */
+    op->defaults_pyobjects = 0;
+    op->defaults_size = 0;
+    op->defaults = NULL;
+    op->defaults_tuple = NULL;
+    op->defaults_kwdict = NULL;
+    op->defaults_getter = NULL;
+    op->func_annotations = NULL;
+    op->func_is_coroutine = NULL;
+#if CYTHON_METH_FASTCALL
+    /* Pick the vectorcall entry point matching the calling convention in
+     * ml->ml_flags.  METH_VARARGS|METH_KEYWORDS gets no vectorcall function
+     * (NULL), which makes __Pyx_CyFunction_CallAsMethod use the tuple/dict path. */
+    switch (ml->ml_flags & (METH_VARARGS | METH_FASTCALL | METH_NOARGS | METH_O | METH_KEYWORDS | METH_METHOD)) {
+    case METH_NOARGS:
+        __Pyx_CyFunction_func_vectorcall(op) = __Pyx_CyFunction_Vectorcall_NOARGS;
+        break;
+    case METH_O:
+        __Pyx_CyFunction_func_vectorcall(op) = __Pyx_CyFunction_Vectorcall_O;
+        break;
+    case METH_METHOD | METH_FASTCALL | METH_KEYWORDS:
+        __Pyx_CyFunction_func_vectorcall(op) = __Pyx_CyFunction_Vectorcall_FASTCALL_KEYWORDS_METHOD;
+        break;
+    case METH_FASTCALL | METH_KEYWORDS:
+        __Pyx_CyFunction_func_vectorcall(op) = __Pyx_CyFunction_Vectorcall_FASTCALL_KEYWORDS;
+        break;
+    case METH_VARARGS | METH_KEYWORDS:
+        __Pyx_CyFunction_func_vectorcall(op) = NULL;
+        break;
+    default:
+        /* Unsupported flag combination: all fields are initialised at this
+         * point, so the object can be released through the normal path. */
+        PyErr_SetString(PyExc_SystemError, "Bad call flags for CyFunction");
+        Py_DECREF(op);
+        return NULL;
+    }
+#endif
+    return (PyObject *) op;
+}
+/* tp_clear implementation for CyFunction objects (also called from dealloc).
+ * Releases every object reference held by the function and frees the
+ * default-argument storage.  Always returns 0. */
+static int
+__Pyx_CyFunction_clear(__pyx_CyFunctionObject *m)
+{
+    Py_CLEAR(m->func_closure);
+#if CYTHON_COMPILING_IN_LIMITED_API
+    Py_CLEAR(m->func);
+#else
+    Py_CLEAR(((PyCFunctionObject*)m)->m_module);
+#endif
+    Py_CLEAR(m->func_dict);
+    Py_CLEAR(m->func_name);
+    Py_CLEAR(m->func_qualname);
+    Py_CLEAR(m->func_doc);
+    Py_CLEAR(m->func_globals);
+    Py_CLEAR(m->func_code);
+#if !CYTHON_COMPILING_IN_LIMITED_API
+#if PY_VERSION_HEX < 0x030900B1
+    Py_CLEAR(__Pyx_CyFunction_GetClassObj(m));
+#else
+    {
+        /* Manual equivalent of Py_CLEAR for mm_class: the field is reset
+         * before the reference is dropped. */
+        PyObject *cls = (PyObject*) ((PyCMethodObject *) (m))->mm_class;
+        ((PyCMethodObject *) (m))->mm_class = NULL;
+        Py_XDECREF(cls);
+    }
+#endif
+#endif
+    Py_CLEAR(m->defaults_tuple);
+    Py_CLEAR(m->defaults_kwdict);
+    Py_CLEAR(m->func_annotations);
+    Py_CLEAR(m->func_is_coroutine);
+    if (m->defaults) {
+        /* The first `defaults_pyobjects` slots of the defaults block are
+         * treated as PyObject pointers and released before freeing the block. */
+        PyObject **pydefaults = __Pyx_CyFunction_Defaults(PyObject *, m);
+        int i;
+        for (i = 0; i < m->defaults_pyobjects; i++)
+            Py_XDECREF(pydefaults[i]);
+        PyObject_Free(m->defaults);
+        m->defaults = NULL;
+    }
+    return 0;
+}
+/* Deallocation helper: clears outstanding weak references (if any), drops
+ * all owned references via __Pyx_CyFunction_clear() and frees the object. */
+static void __Pyx__CyFunction_dealloc(__pyx_CyFunctionObject *m)
+{
+    PyObject *self_obj = (PyObject *) m;
+    if (__Pyx_CyFunction_weakreflist(m) != NULL) {
+        PyObject_ClearWeakRefs(self_obj);
+    }
+    (void) __Pyx_CyFunction_clear(m);
+    __Pyx_PyHeapTypeObject_GC_Del(m);
+}
+/* tp_dealloc implementation: removes the object from GC tracking first and
+ * then delegates the actual teardown to __Pyx__CyFunction_dealloc(). */
+static void __Pyx_CyFunction_dealloc(__pyx_CyFunctionObject *m) {
+    PyObject_GC_UnTrack(m);
+    __Pyx__CyFunction_dealloc(m);
+    return;
+}
+/* tp_traverse implementation for CyFunction objects.
+ * Reports every object reference owned by the function to the cyclic garbage
+ * collector.  The set of visited fields mirrors what __Pyx_CyFunction_clear()
+ * releases, including `func_annotations`: that dict is owned by the function
+ * (see __Pyx_CyFunction_SetAnnotationsDict) and must be visited so reference
+ * cycles running through it can be detected.
+ * Returns 0, or the first non-zero value returned by `visit`. */
+static int __Pyx_CyFunction_traverse(__pyx_CyFunctionObject *m, visitproc visit, void *arg)
+{
+    Py_VISIT(m->func_closure);
+#if CYTHON_COMPILING_IN_LIMITED_API
+    Py_VISIT(m->func);
+#else
+    Py_VISIT(((PyCFunctionObject*)m)->m_module);
+#endif
+    Py_VISIT(m->func_dict);
+    Py_VISIT(m->func_name);
+    Py_VISIT(m->func_qualname);
+    Py_VISIT(m->func_doc);
+    Py_VISIT(m->func_globals);
+    Py_VISIT(m->func_code);
+#if !CYTHON_COMPILING_IN_LIMITED_API
+    Py_VISIT(__Pyx_CyFunction_GetClassObj(m));
+#endif
+    Py_VISIT(m->defaults_tuple);
+    Py_VISIT(m->defaults_kwdict);
+    Py_VISIT(m->func_annotations);
+    Py_VISIT(m->func_is_coroutine);
+    if (m->defaults) {
+        /* The leading `defaults_pyobjects` slots of the defaults block hold
+         * object pointers. */
+        PyObject **pydefaults = __Pyx_CyFunction_Defaults(PyObject *, m);
+        int i;
+        for (i = 0; i < m->defaults_pyobjects; i++)
+            Py_VISIT(pydefaults[i]);
+    }
+    return 0;
+}
+/* tp_repr implementation: formats the function as
+ * "<cyfunction QUALNAME at ADDRESS>". */
+static PyObject*
+__Pyx_CyFunction_repr(__pyx_CyFunctionObject *op)
+{
+    void *address = (void *)op;
+#if PY_MAJOR_VERSION >= 3
+    return PyUnicode_FromFormat("<cyfunction %U at %p>", op->func_qualname, address);
+#else
+    const char *qualname = PyString_AsString(op->func_qualname);
+    return PyString_FromFormat("<cyfunction %s at %p>", qualname, address);
+#endif
+}
+/* Call the C function behind a CyFunction using the tuple/dict protocol.
+ *
+ * func - the CyFunction object (supplies the C entry point and call flags).
+ * self - object passed as first argument to the C function.
+ * arg  - positional arguments tuple.
+ * kw   - keyword arguments dict, or NULL.
+ *
+ * Dispatches on the METH_* calling convention.  For conventions that do not
+ * accept keywords, a non-empty `kw` leaves the switch via `break` and ends in
+ * the "takes no keyword arguments" TypeError at the bottom.
+ * Returns the call result, or NULL with an exception set.
+ */
+static PyObject * __Pyx_CyFunction_CallMethod(PyObject *func, PyObject *self, PyObject *arg, PyObject *kw) {
+#if CYTHON_COMPILING_IN_LIMITED_API
+    PyObject *f = ((__pyx_CyFunctionObject*)func)->func;
+    PyObject *py_name = NULL;
+    PyCFunction meth;
+    int flags;
+    meth = PyCFunction_GetFunction(f);
+    if (unlikely(!meth)) return NULL;
+    flags = PyCFunction_GetFlags(f);
+    if (unlikely(flags < 0)) return NULL;
+#else
+    PyCFunctionObject* f = (PyCFunctionObject*)func;
+    PyCFunction meth = f->m_ml->ml_meth;
+    int flags = f->m_ml->ml_flags;
+#endif
+    Py_ssize_t size;
+    switch (flags & (METH_VARARGS | METH_KEYWORDS | METH_NOARGS | METH_O)) {
+    case METH_VARARGS:
+        if (likely(kw == NULL || PyDict_Size(kw) == 0))
+            return (*meth)(self, arg);
+        break;
+    case METH_VARARGS | METH_KEYWORDS:
+        return (*(PyCFunctionWithKeywords)(void*)meth)(self, arg, kw);
+    case METH_NOARGS:
+        if (likely(kw == NULL || PyDict_Size(kw) == 0)) {
+#if CYTHON_ASSUME_SAFE_MACROS
+            size = PyTuple_GET_SIZE(arg);
+#else
+            size = PyTuple_Size(arg);
+            if (unlikely(size < 0)) return NULL;
+#endif
+            if (likely(size == 0))
+                return (*meth)(self, NULL);
+            /* Positional arguments were passed to a no-argument function. */
+#if CYTHON_COMPILING_IN_LIMITED_API
+            py_name = __Pyx_CyFunction_get_name((__pyx_CyFunctionObject*)func, NULL);
+            if (!py_name) return NULL;
+            PyErr_Format(PyExc_TypeError,
+                "%.200S() takes no arguments (%" CYTHON_FORMAT_SSIZE_T "d given)",
+                py_name, size);
+            Py_DECREF(py_name);
+#else
+            PyErr_Format(PyExc_TypeError,
+                "%.200s() takes no arguments (%" CYTHON_FORMAT_SSIZE_T "d given)",
+                f->m_ml->ml_name, size);
+#endif
+            return NULL;
+        }
+        break;
+    case METH_O:
+        if (likely(kw == NULL || PyDict_Size(kw) == 0)) {
+#if CYTHON_ASSUME_SAFE_MACROS
+            size = PyTuple_GET_SIZE(arg);
+#else
+            size = PyTuple_Size(arg);
+            if (unlikely(size < 0)) return NULL;
+#endif
+            if (likely(size == 1)) {
+                PyObject *result, *arg0;
+                /* arg0 is borrowed on the fast-macro path and owned (hence
+                 * released after the call) on the generic path. */
+                #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS
+                arg0 = PyTuple_GET_ITEM(arg, 0);
+                #else
+                arg0 = __Pyx_PySequence_ITEM(arg, 0); if (unlikely(!arg0)) return NULL;
+                #endif
+                result = (*meth)(self, arg0);
+                #if !(CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS)
+                Py_DECREF(arg0);
+                #endif
+                return result;
+            }
+            /* Wrong number of positional arguments for a one-argument function. */
+#if CYTHON_COMPILING_IN_LIMITED_API
+            py_name = __Pyx_CyFunction_get_name((__pyx_CyFunctionObject*)func, NULL);
+            if (!py_name) return NULL;
+            PyErr_Format(PyExc_TypeError,
+                "%.200S() takes exactly one argument (%" CYTHON_FORMAT_SSIZE_T "d given)",
+                py_name, size);
+            Py_DECREF(py_name);
+#else
+            PyErr_Format(PyExc_TypeError,
+                "%.200s() takes exactly one argument (%" CYTHON_FORMAT_SSIZE_T "d given)",
+                f->m_ml->ml_name, size);
+#endif
+            return NULL;
+        }
+        break;
+    default:
+        PyErr_SetString(PyExc_SystemError, "Bad call flags for CyFunction");
+        return NULL;
+    }
+    /* Reached only via `break`: keywords were given to a convention that
+     * does not accept them. */
+#if CYTHON_COMPILING_IN_LIMITED_API
+    py_name = __Pyx_CyFunction_get_name((__pyx_CyFunctionObject*)func, NULL);
+    if (!py_name) return NULL;
+    PyErr_Format(PyExc_TypeError, "%.200S() takes no keyword arguments",
+                 py_name);
+    Py_DECREF(py_name);
+#else
+    PyErr_Format(PyExc_TypeError, "%.200s() takes no keyword arguments",
+                 f->m_ml->ml_name);
+#endif
+    return NULL;
+}
+/* Call a CyFunction with the `self` object stored on the function itself.
+ * Looks up the stored self and forwards to __Pyx_CyFunction_CallMethod().
+ * Returns the call result, or NULL with an exception set. */
+static CYTHON_INLINE PyObject *__Pyx_CyFunction_Call(PyObject *func, PyObject *arg, PyObject *kw) {
+    PyObject *bound_self;
+#if CYTHON_COMPILING_IN_LIMITED_API
+    bound_self = PyCFunction_GetSelf(((__pyx_CyFunctionObject*)func)->func);
+    /* A NULL self is only a failure if an exception is actually pending. */
+    if (unlikely(!bound_self) && PyErr_Occurred()) {
+        return NULL;
+    }
+#else
+    bound_self = ((PyCFunctionObject*)func)->m_self;
+#endif
+    return __Pyx_CyFunction_CallMethod(func, bound_self, arg, kw);
+}
+/* tp_call implementation for CyFunction objects.
+ *
+ * func - the CyFunction being called.
+ * args - positional arguments tuple.
+ * kw   - keyword arguments dict, or NULL.
+ *
+ * If a vectorcall entry point is set on the function, the call is routed
+ * through it.  Otherwise, for non-static methods of cdef classes the first
+ * positional argument is split off and passed as `self`; all other functions
+ * are called with the self object stored on the function.
+ * Returns the call result, or NULL with an exception set.
+ */
+static PyObject *__Pyx_CyFunction_CallAsMethod(PyObject *func, PyObject *args, PyObject *kw) {
+    PyObject *result;
+    __pyx_CyFunctionObject *cyfunc = (__pyx_CyFunctionObject *) func;
+#if CYTHON_METH_FASTCALL
+    __pyx_vectorcallfunc vc = __Pyx_CyFunction_func_vectorcall(cyfunc);
+    if (vc) {
+#if CYTHON_ASSUME_SAFE_MACROS
+        return __Pyx_PyVectorcall_FastCallDict(func, vc, &PyTuple_GET_ITEM(args, 0), (size_t)PyTuple_GET_SIZE(args), kw);
+#else
+        /* Reference the helper so it is not reported as unused in this configuration. */
+        (void) &__Pyx_PyVectorcall_FastCallDict;
+        return PyVectorcall_Call(func, args, kw);
+#endif
+    }
+#endif
+    if ((cyfunc->flags & __Pyx_CYFUNCTION_CCLASS) && !(cyfunc->flags & __Pyx_CYFUNCTION_STATICMETHOD)) {
+        Py_ssize_t argc;
+        PyObject *new_args;
+        PyObject *self;
+#if CYTHON_ASSUME_SAFE_MACROS
+        argc = PyTuple_GET_SIZE(args);
+#else
+        argc = PyTuple_Size(args);
+        /* PyTuple_Size() signals failure with -1.  The previous test,
+         * `unlikely(!argc) < 0`, compared a 0/1 value against 0 and could
+         * therefore never detect that failure. */
+        if (unlikely(argc < 0)) return NULL;
+#endif
+        /* Everything after the first positional argument is forwarded. */
+        new_args = PyTuple_GetSlice(args, 1, argc);
+        if (unlikely(!new_args))
+            return NULL;
+        /* Borrowed reference; fails when `args` is empty. */
+        self = PyTuple_GetItem(args, 0);
+        if (unlikely(!self)) {
+            Py_DECREF(new_args);
+#if PY_MAJOR_VERSION > 2
+            PyErr_Format(PyExc_TypeError,
+                         "unbound method %.200S() needs an argument",
+                         cyfunc->func_qualname);
+#else
+            PyErr_SetString(PyExc_TypeError,
+                            "unbound method needs an argument");
+#endif
+            return NULL;
+        }
+        result = __Pyx_CyFunction_CallMethod(func, self, new_args, kw);
+        Py_DECREF(new_args);
+    } else {
+        result = __Pyx_CyFunction_Call(func, args, kw);
+    }
+    return result;
+}
+#if CYTHON_METH_FASTCALL
+/* Shared argument validation for the vectorcall entry points.
+ *
+ * Returns 1 when the function is a non-static cdef-class method, meaning the
+ * caller must take `self` from the first positional argument; 0 when the
+ * stored self is to be used; and -1 with a TypeError set when a required self
+ * argument is missing or when `kwnames` is a non-empty tuple.
+ */
+static CYTHON_INLINE int __Pyx_CyFunction_Vectorcall_CheckArgs(__pyx_CyFunctionObject *cyfunc, Py_ssize_t nargs, PyObject *kwnames)
+{
+    const int is_cclass = (cyfunc->flags & __Pyx_CYFUNCTION_CCLASS) != 0;
+    const int is_static = (cyfunc->flags & __Pyx_CYFUNCTION_STATICMETHOD) != 0;
+    int self_from_args = 0;
+
+    if (is_cclass && !is_static) {
+        if (unlikely(nargs < 1)) {
+            PyErr_Format(PyExc_TypeError, "%.200s() needs an argument",
+                         ((PyCFunctionObject*)cyfunc)->m_ml->ml_name);
+            return -1;
+        }
+        self_from_args = 1;
+    }
+
+    if (unlikely(kwnames) && unlikely(PyTuple_GET_SIZE(kwnames))) {
+        PyErr_Format(PyExc_TypeError,
+                     "%.200s() takes no keyword arguments", ((PyCFunctionObject*)cyfunc)->m_ml->ml_name);
+        return -1;
+    }
+
+    return self_from_args;
+}
+/* Vectorcall entry point for METH_NOARGS functions.
+ * After resolving `self` (first argument or stored self, as decided by
+ * __Pyx_CyFunction_Vectorcall_CheckArgs), any remaining positional argument
+ * is rejected with a TypeError; otherwise the C function is called with a
+ * NULL argument. */
+static PyObject * __Pyx_CyFunction_Vectorcall_NOARGS(PyObject *func, PyObject *const *args, size_t nargsf, PyObject *kwnames)
+{
+    __pyx_CyFunctionObject *cyfunc = (__pyx_CyFunctionObject *)func;
+    PyMethodDef* method_def = ((PyCFunctionObject*)cyfunc)->m_ml;
+#if CYTHON_BACKPORT_VECTORCALL
+    Py_ssize_t nargs = (Py_ssize_t)nargsf;
+#else
+    Py_ssize_t nargs = PyVectorcall_NARGS(nargsf);
+#endif
+    PyObject *self;
+    int self_from_args = __Pyx_CyFunction_Vectorcall_CheckArgs(cyfunc, nargs, kwnames);
+
+    if (self_from_args == 1) {
+        self = args[0];
+        args += 1;
+        nargs -= 1;
+    } else if (self_from_args == 0) {
+        self = ((PyCFunctionObject*)cyfunc)->m_self;
+    } else {
+        return NULL;
+    }
+
+    if (likely(nargs == 0)) {
+        return method_def->ml_meth(self, NULL);
+    }
+    PyErr_Format(PyExc_TypeError,
+        "%.200s() takes no arguments (%" CYTHON_FORMAT_SSIZE_T "d given)",
+        method_def->ml_name, nargs);
+    return NULL;
+}
+/* Vectorcall entry point for METH_O functions.
+ * After resolving `self` (first argument or stored self, as decided by
+ * __Pyx_CyFunction_Vectorcall_CheckArgs), exactly one positional argument
+ * must remain; it is passed to the C function.  Any other count raises a
+ * TypeError. */
+static PyObject * __Pyx_CyFunction_Vectorcall_O(PyObject *func, PyObject *const *args, size_t nargsf, PyObject *kwnames)
+{
+    __pyx_CyFunctionObject *cyfunc = (__pyx_CyFunctionObject *)func;
+    PyMethodDef* method_def = ((PyCFunctionObject*)cyfunc)->m_ml;
+#if CYTHON_BACKPORT_VECTORCALL
+    Py_ssize_t nargs = (Py_ssize_t)nargsf;
+#else
+    Py_ssize_t nargs = PyVectorcall_NARGS(nargsf);
+#endif
+    PyObject *self;
+    int self_from_args = __Pyx_CyFunction_Vectorcall_CheckArgs(cyfunc, nargs, kwnames);
+
+    if (self_from_args == 1) {
+        self = args[0];
+        args += 1;
+        nargs -= 1;
+    } else if (self_from_args == 0) {
+        self = ((PyCFunctionObject*)cyfunc)->m_self;
+    } else {
+        return NULL;
+    }
+
+    if (likely(nargs == 1)) {
+        return method_def->ml_meth(self, args[0]);
+    }
+    PyErr_Format(PyExc_TypeError,
+        "%.200s() takes exactly one argument (%" CYTHON_FORMAT_SSIZE_T "d given)",
+        method_def->ml_name, nargs);
+    return NULL;
+}
+/* Vectorcall entry point for METH_FASTCALL|METH_KEYWORDS functions.
+ * Resolves `self` and forwards the remaining argument array together with
+ * `kwnames` to the C function.  Keyword names are not validated here: NULL is
+ * passed to the checker in place of `kwnames`. */
+static PyObject * __Pyx_CyFunction_Vectorcall_FASTCALL_KEYWORDS(PyObject *func, PyObject *const *args, size_t nargsf, PyObject *kwnames)
+{
+    __pyx_CyFunctionObject *cyfunc = (__pyx_CyFunctionObject *)func;
+    PyMethodDef* method_def = ((PyCFunctionObject*)cyfunc)->m_ml;
+#if CYTHON_BACKPORT_VECTORCALL
+    Py_ssize_t nargs = (Py_ssize_t)nargsf;
+#else
+    Py_ssize_t nargs = PyVectorcall_NARGS(nargsf);
+#endif
+    PyObject *self;
+    int self_from_args = __Pyx_CyFunction_Vectorcall_CheckArgs(cyfunc, nargs, NULL);
+
+    if (self_from_args == 1) {
+        self = args[0];
+        args += 1;
+        nargs -= 1;
+    } else if (self_from_args == 0) {
+        self = ((PyCFunctionObject*)cyfunc)->m_self;
+    } else {
+        return NULL;
+    }
+
+    return ((__Pyx_PyCFunctionFastWithKeywords)(void(*)(void))method_def->ml_meth)(self, args, nargs, kwnames);
+}
+/* Vectorcall entry point for METH_METHOD|METH_FASTCALL|METH_KEYWORDS
+ * functions.  Same as the FASTCALL_KEYWORDS variant, but additionally passes
+ * the class object obtained from __Pyx_CyFunction_GetClassObj() to the C
+ * function. */
+static PyObject * __Pyx_CyFunction_Vectorcall_FASTCALL_KEYWORDS_METHOD(PyObject *func, PyObject *const *args, size_t nargsf, PyObject *kwnames)
+{
+    __pyx_CyFunctionObject *cyfunc = (__pyx_CyFunctionObject *)func;
+    PyMethodDef* method_def = ((PyCFunctionObject*)cyfunc)->m_ml;
+    PyTypeObject *defining_class = (PyTypeObject *) __Pyx_CyFunction_GetClassObj(cyfunc);
+#if CYTHON_BACKPORT_VECTORCALL
+    Py_ssize_t nargs = (Py_ssize_t)nargsf;
+#else
+    Py_ssize_t nargs = PyVectorcall_NARGS(nargsf);
+#endif
+    PyObject *self;
+    int self_from_args = __Pyx_CyFunction_Vectorcall_CheckArgs(cyfunc, nargs, NULL);
+
+    if (self_from_args == 1) {
+        self = args[0];
+        args += 1;
+        nargs -= 1;
+    } else if (self_from_args == 0) {
+        self = ((PyCFunctionObject*)cyfunc)->m_self;
+    } else {
+        return NULL;
+    }
+
+    return ((__Pyx_PyCMethod)(void(*)(void))method_def->ml_meth)(self, defining_class, args, (size_t)nargs, kwnames);
+}
+#endif
+#if CYTHON_USE_TYPE_SPECS
+/* Slot table used when the CyFunction type is created from a PyType_Spec.
+ * It wires up the lifecycle, repr, call, GC and descriptor hooks together
+ * with the method/member/getset tables defined for the type. */
+static PyType_Slot __pyx_CyFunctionType_slots[] = {
+    {Py_tp_dealloc, (void *)__Pyx_CyFunction_dealloc},
+    {Py_tp_repr, (void *)__Pyx_CyFunction_repr},
+    {Py_tp_call, (void *)__Pyx_CyFunction_CallAsMethod},
+    {Py_tp_traverse, (void *)__Pyx_CyFunction_traverse},
+    {Py_tp_clear, (void *)__Pyx_CyFunction_clear},
+    {Py_tp_methods, (void *)__pyx_CyFunction_methods},
+    {Py_tp_members, (void *)__pyx_CyFunction_members},
+    {Py_tp_getset, (void *)__pyx_CyFunction_getsets},
+    {Py_tp_descr_get, (void *)__Pyx_PyMethod_New},
+    /* Sentinel entry terminating the table. */
+    {0, 0},
+};
+/* Type specification for the CyFunction type (type-spec build).
+ * Fields in order: name, basicsize, itemsize, flags, slots.  The optional
+ * method-descriptor and vectorcall flags are only added when the
+ * corresponding macros are available. */
+static PyType_Spec __pyx_CyFunctionType_spec = {
+    __PYX_TYPE_MODULE_PREFIX "cython_function_or_method",
+    sizeof(__pyx_CyFunctionObject),
+    0,
+#ifdef Py_TPFLAGS_METHOD_DESCRIPTOR
+    Py_TPFLAGS_METHOD_DESCRIPTOR |
+#endif
+#if (defined(_Py_TPFLAGS_HAVE_VECTORCALL) && CYTHON_METH_FASTCALL)
+    _Py_TPFLAGS_HAVE_VECTORCALL |
+#endif
+    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC | Py_TPFLAGS_BASETYPE,
+    __pyx_CyFunctionType_slots
+};
+#else
+/* Statically initialised type object for CyFunction (non-type-spec build).
+ * The initializer is positional, so each entry is annotated with the
+ * PyTypeObject slot it fills; version-dependent trailing slots are guarded by
+ * the preprocessor conditions below. */
+static PyTypeObject __pyx_CyFunctionType_type = {
+    PyVarObject_HEAD_INIT(0, 0)
+    __PYX_TYPE_MODULE_PREFIX "cython_function_or_method",  /* tp_name */
+    sizeof(__pyx_CyFunctionObject),  /* tp_basicsize */
+    0,  /* tp_itemsize */
+    (destructor) __Pyx_CyFunction_dealloc,  /* tp_dealloc */
+    /* tp_print on old versions, tp_vectorcall_offset on newer ones */
+#if !CYTHON_METH_FASTCALL
+    0,
+#elif CYTHON_BACKPORT_VECTORCALL
+    (printfunc)offsetof(__pyx_CyFunctionObject, func_vectorcall),
+#else
+    offsetof(PyCFunctionObject, vectorcall),
+#endif
+    0,  /* tp_getattr */
+    0,  /* tp_setattr */
+#if PY_MAJOR_VERSION < 3
+    0,  /* tp_compare */
+#else
+    0,  /* tp_as_async */
+#endif
+    (reprfunc) __Pyx_CyFunction_repr,  /* tp_repr */
+    0,  /* tp_as_number */
+    0,  /* tp_as_sequence */
+    0,  /* tp_as_mapping */
+    0,  /* tp_hash */
+    __Pyx_CyFunction_CallAsMethod,  /* tp_call */
+    0,  /* tp_str */
+    0,  /* tp_getattro */
+    0,  /* tp_setattro */
+    0,  /* tp_as_buffer */
+    /* tp_flags */
+#ifdef Py_TPFLAGS_METHOD_DESCRIPTOR
+    Py_TPFLAGS_METHOD_DESCRIPTOR |
+#endif
+#if defined(_Py_TPFLAGS_HAVE_VECTORCALL) && CYTHON_METH_FASTCALL
+    _Py_TPFLAGS_HAVE_VECTORCALL |
+#endif
+    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC | Py_TPFLAGS_BASETYPE,
+    0,  /* tp_doc */
+    (traverseproc) __Pyx_CyFunction_traverse,  /* tp_traverse */
+    (inquiry) __Pyx_CyFunction_clear,  /* tp_clear */
+    0,  /* tp_richcompare */
+    /* tp_weaklistoffset */
+#if PY_VERSION_HEX < 0x030500A0
+    offsetof(__pyx_CyFunctionObject, func_weakreflist),
+#else
+    offsetof(PyCFunctionObject, m_weakreflist),
+#endif
+    0,  /* tp_iter */
+    0,  /* tp_iternext */
+    __pyx_CyFunction_methods,  /* tp_methods */
+    __pyx_CyFunction_members,  /* tp_members */
+    __pyx_CyFunction_getsets,  /* tp_getset */
+    0,  /* tp_base */
+    0,  /* tp_dict */
+    __Pyx_PyMethod_New,  /* tp_descr_get */
+    0,  /* tp_descr_set */
+    offsetof(__pyx_CyFunctionObject, func_dict),  /* tp_dictoffset */
+    0,  /* tp_init */
+    0,  /* tp_alloc */
+    0,  /* tp_new */
+    0,  /* tp_free */
+    0,  /* tp_is_gc */
+    0,  /* tp_bases */
+    0,  /* tp_mro */
+    0,  /* tp_cache */
+    0,  /* tp_subclasses */
+    0,  /* tp_weaklist */
+    0,  /* tp_del */
+    0,  /* tp_version_tag */
+#if PY_VERSION_HEX >= 0x030400a1
+    0,  /* tp_finalize */
+#endif
+#if PY_VERSION_HEX >= 0x030800b1 && (!CYTHON_COMPILING_IN_PYPY || PYPY_VERSION_NUM >= 0x07030800)
+    0,  /* tp_vectorcall */
+#endif
+#if __PYX_NEED_TP_PRINT_SLOT
+    0,  /* tp_print */
+#endif
+#if PY_VERSION_HEX >= 0x030C0000
+    0,  /* tp_watched */
+#endif
+#if PY_VERSION_HEX >= 0x030d00A4
+    0,  /* tp_versions_used */
+#endif
+#if CYTHON_COMPILING_IN_PYPY && PY_VERSION_HEX >= 0x03090000 && PY_VERSION_HEX < 0x030a0000
+    0,  /* tp_pypy_flags */
+#endif
+};
+#endif
+/* Create or fetch the shared CyFunction type and store it in
+ * __pyx_CyFunctionType.  Returns 0 on success and -1 if the type could not
+ * be obtained. */
+static int __pyx_CyFunction_init(PyObject *module) {
+#if CYTHON_USE_TYPE_SPECS
+    __pyx_CyFunctionType = __Pyx_FetchCommonTypeFromSpec(module, &__pyx_CyFunctionType_spec, NULL);
+#else
+    CYTHON_UNUSED_VAR(module);
+    __pyx_CyFunctionType = __Pyx_FetchCommonType(&__pyx_CyFunctionType_type);
+#endif
+    return unlikely(__pyx_CyFunctionType == NULL) ? -1 : 0;
+}
+/* Allocate zero-filled storage for a function's default argument values.
+ *
+ * func      - the CyFunction receiving the storage.
+ * size      - number of bytes to allocate.
+ * pyobjects - number of leading PyObject* slots inside the block.
+ *
+ * Returns the new block, or NULL with MemoryError set on allocation failure.
+ */
+static CYTHON_INLINE void *__Pyx_CyFunction_InitDefaults(PyObject *func, size_t size, int pyobjects) {
+    __pyx_CyFunctionObject *cyfunc = (__pyx_CyFunctionObject *) func;
+    void *storage = PyObject_Malloc(size);
+    /* Stored unconditionally, so the field is NULL after a failed allocation. */
+    cyfunc->defaults = storage;
+    if (unlikely(storage == NULL)) {
+        return PyErr_NoMemory();
+    }
+    memset(storage, 0, size);
+    cyfunc->defaults_size = size;
+    cyfunc->defaults_pyobjects = pyobjects;
+    return cyfunc->defaults;
+}
+/* Store `tuple` as the function's positional-defaults tuple, taking a new
+ * reference to it.  A previously stored value is not released here. */
+static CYTHON_INLINE void __Pyx_CyFunction_SetDefaultsTuple(PyObject *func, PyObject *tuple) {
+    __pyx_CyFunctionObject *cyfunc = (__pyx_CyFunctionObject *) func;
+    Py_INCREF(tuple);
+    cyfunc->defaults_tuple = tuple;
+}
+/* Store `dict` as the function's keyword-defaults dict, taking a new
+ * reference to it.  A previously stored value is not released here. */
+static CYTHON_INLINE void __Pyx_CyFunction_SetDefaultsKwDict(PyObject *func, PyObject *dict) {
+    __pyx_CyFunctionObject *cyfunc = (__pyx_CyFunctionObject *) func;
+    Py_INCREF(dict);
+    cyfunc->defaults_kwdict = dict;
+}
+/* Store `dict` as the function's annotations dict, taking a new reference
+ * to it.  A previously stored value is not released here. */
+static CYTHON_INLINE void __Pyx_CyFunction_SetAnnotationsDict(PyObject *func, PyObject *dict) {
+    __pyx_CyFunctionObject *cyfunc = (__pyx_CyFunctionObject *) func;
+    Py_INCREF(dict);
+    cyfunc->func_annotations = dict;
+}
+
+/* CythonFunction */
+  /* Allocate, initialise and GC-track a new CyFunction object.
+   * The arguments are forwarded unchanged to __Pyx_CyFunction_Init().
+   * Returns the new function object, or NULL on failure. */
+  static PyObject *__Pyx_CyFunction_New(PyMethodDef *ml, int flags, PyObject* qualname,
+                                      PyObject *closure, PyObject *module, PyObject* globals, PyObject* code) {
+    __pyx_CyFunctionObject *raw = PyObject_GC_New(__pyx_CyFunctionObject, __pyx_CyFunctionType);
+    /* Init tolerates a NULL `raw` and simply returns NULL in that case. */
+    PyObject *op = __Pyx_CyFunction_Init(raw, ml, flags, qualname, closure, module, globals, code);
+    if (unlikely(!op)) {
+        return op;
+    }
+    PyObject_GC_Track(op);
+    return op;
+}
+
+/* CLineInTraceback */
+  #ifndef CYTHON_CLINE_IN_TRACEBACK
+/* Decide whether the C source line should be reported in tracebacks.
+ *
+ * Looks up the `cline_in_traceback` attribute on the cython_runtime module.
+ * Returns `c_line` unchanged when that setting is truthy (or when the runtime
+ * module is not available), and 0 otherwise.  If the attribute is missing it
+ * is set to False on the runtime module.  The currently pending exception is
+ * saved on entry and restored before returning.
+ */
+static int __Pyx_CLineForTraceback(PyThreadState *tstate, int c_line) {
+    PyObject *use_cline;
+    PyObject *ptype, *pvalue, *ptraceback;
+#if CYTHON_COMPILING_IN_CPYTHON
+    PyObject **cython_runtime_dict;
+#endif
+    CYTHON_MAYBE_UNUSED_VAR(tstate);
+    if (unlikely(!__pyx_cython_runtime)) {
+        return c_line;
+    }
+    __Pyx_ErrFetchInState(tstate, &ptype, &pvalue, &ptraceback);
+#if CYTHON_COMPILING_IN_CPYTHON
+    /* Fast path: read the setting straight from the module's __dict__. */
+    cython_runtime_dict = _PyObject_GetDictPtr(__pyx_cython_runtime);
+    if (likely(cython_runtime_dict)) {
+        __PYX_PY_DICT_LOOKUP_IF_MODIFIED(
+            use_cline, *cython_runtime_dict,
+            __Pyx_PyDict_GetItemStr(*cython_runtime_dict, __pyx_n_s_cline_in_traceback))
+    } else
+#endif
+    /* Generic path (also the `else` branch of the fast path above):
+     * regular attribute lookup, normalised to Py_True / Py_False / NULL. */
+    {
+      PyObject *use_cline_obj = __Pyx_PyObject_GetAttrStrNoError(__pyx_cython_runtime, __pyx_n_s_cline_in_traceback);
+      if (use_cline_obj) {
+        use_cline = PyObject_Not(use_cline_obj) ? Py_False : Py_True;
+        Py_DECREF(use_cline_obj);
+      } else {
+        PyErr_Clear();
+        use_cline = NULL;
+      }
+    }
+    if (!use_cline) {
+        /* Setting absent: suppress the C line and record False for next time
+         * (a failure to store the attribute is deliberately ignored). */
+        c_line = 0;
+        (void) PyObject_SetAttr(__pyx_cython_runtime, __pyx_n_s_cline_in_traceback, Py_False);
+    }
+    else if (use_cline == Py_False || (use_cline != Py_True && PyObject_Not(use_cline) != 0)) {
+        c_line = 0;
+    }
+    __Pyx_ErrRestoreInState(tstate, ptype, pvalue, ptraceback);
+    return c_line;
+}
+#endif
+
+/* CodeObjectCache */
+  #if !CYTHON_COMPILING_IN_LIMITED_API
+/* Binary search in the code-object cache, which is sorted by code_line.
+ * Returns the index of the entry whose code_line equals `code_line` if one is
+ * found, otherwise the index at which such an entry would be inserted
+ * (`count` when `code_line` is greater than the last entry's line). */
+static int __pyx_bisect_code_objects(__Pyx_CodeObjectCacheEntry* entries, int count, int code_line) {
+    int lo = 0, probe = 0, hi = count - 1;
+    if (hi >= 0 && code_line > entries[hi].code_line) {
+        return count;
+    }
+    while (lo < hi) {
+        int probe_line;
+        probe = lo + (hi - lo) / 2;
+        probe_line = entries[probe].code_line;
+        if (code_line == probe_line) {
+            return probe;
+        }
+        if (code_line < probe_line) {
+            hi = probe;
+        } else {
+            lo = probe + 1;
+        }
+    }
+    return (code_line <= entries[probe].code_line) ? probe : probe + 1;
+}
+/* Look up the cached code object for `code_line`.
+ * Returns a new reference to the cached object, or NULL (without setting an
+ * exception) when `code_line` is 0, the cache is empty, or no entry matches. */
+static PyCodeObject *__pyx_find_code_object(int code_line) {
+    __Pyx_CodeObjectCacheEntry *entry;
+    int idx;
+    if (unlikely(!code_line)) {
+        return NULL;
+    }
+    if (unlikely(!__pyx_code_cache.entries)) {
+        return NULL;
+    }
+    idx = __pyx_bisect_code_objects(__pyx_code_cache.entries, __pyx_code_cache.count, code_line);
+    if (unlikely(idx >= __pyx_code_cache.count)) {
+        return NULL;
+    }
+    entry = &__pyx_code_cache.entries[idx];
+    if (unlikely(entry->code_line != code_line)) {
+        return NULL;
+    }
+    Py_INCREF(entry->code_object);
+    return entry->code_object;
+}
+/* Insert `code_object` into the code-object cache under key `code_line`,
+ * keeping the entries sorted by line.  An existing entry for the same line is
+ * replaced.  The cache holds its own reference to every stored object.
+ * Allocation failures are silently ignored: the object is then simply not
+ * cached.  A `code_line` of 0 is never cached. */
+static void __pyx_insert_code_object(int code_line, PyCodeObject* code_object) {
+    int pos, i;
+    __Pyx_CodeObjectCacheEntry* entries = __pyx_code_cache.entries;
+    if (unlikely(!code_line)) {
+        return;
+    }
+    if (unlikely(!entries)) {
+        /* First insertion: allocate the initial table of 64 entries. */
+        entries = (__Pyx_CodeObjectCacheEntry*)PyMem_Malloc(64*sizeof(__Pyx_CodeObjectCacheEntry));
+        if (likely(entries)) {
+            __pyx_code_cache.entries = entries;
+            __pyx_code_cache.max_count = 64;
+            __pyx_code_cache.count = 1;
+            entries[0].code_line = code_line;
+            entries[0].code_object = code_object;
+            Py_INCREF(code_object);
+        }
+        return;
+    }
+    pos = __pyx_bisect_code_objects(__pyx_code_cache.entries, __pyx_code_cache.count, code_line);
+    if ((pos < __pyx_code_cache.count) && unlikely(__pyx_code_cache.entries[pos].code_line == code_line)) {
+        /* Same line already cached: swap in the new object and release the old one. */
+        PyCodeObject* tmp = entries[pos].code_object;
+        entries[pos].code_object = code_object;
+        Py_DECREF(tmp);
+        return;
+    }
+    if (__pyx_code_cache.count == __pyx_code_cache.max_count) {
+        /* Table full: grow by another 64 entries. */
+        int new_max = __pyx_code_cache.max_count + 64;
+        entries = (__Pyx_CodeObjectCacheEntry*)PyMem_Realloc(
+            __pyx_code_cache.entries, ((size_t)new_max) * sizeof(__Pyx_CodeObjectCacheEntry));
+        if (unlikely(!entries)) {
+            return;
+        }
+        __pyx_code_cache.entries = entries;
+        __pyx_code_cache.max_count = new_max;
+    }
+    /* Shift the tail up by one slot to open a gap at `pos`. */
+    for (i=__pyx_code_cache.count; i>pos; i--) {
+        entries[i] = entries[i-1];
+    }
+    entries[pos].code_line = code_line;
+    entries[pos].code_object = code_object;
+    __pyx_code_cache.count++;
+    Py_INCREF(code_object);
+}
+#endif
+
+/* AddTraceback */
+  #include "compile.h"
+#include "frameobject.h"
+#include "traceback.h"
+#if PY_VERSION_HEX >= 0x030b00a6 && !CYTHON_COMPILING_IN_LIMITED_API && !defined(PYPY_VERSION)
+  #ifndef Py_BUILD_CORE
+    #define Py_BUILD_CORE 1
+  #endif
+  #include "internal/pycore_frame.h"
+#endif
+#if CYTHON_COMPILING_IN_LIMITED_API
+/* Build a copy of `code` with a different first line number and name
+ * (Limited API build).
+ *
+ * code         - code object to copy.
+ * scratch_dict - dict used for the keyword arguments to `code.replace()`
+ *                and, in the fallback, as the namespace of a small script;
+ *                it is modified by this function.
+ * firstlineno  - new value for co_firstlineno.
+ * name         - new value for co_name.
+ *
+ * Prefers `code.replace(co_firstlineno=..., co_name=...)`.  When the code
+ * object has no `replace` attribute, the fallback constructs a new code
+ * object by evaluating a small script.
+ * Returns a new reference, or NULL on failure.
+ */
+static PyObject *__Pyx_PyCode_Replace_For_AddTraceback(PyObject *code, PyObject *scratch_dict,
+                                                       PyObject *firstlineno, PyObject *name) {
+    PyObject *replace = NULL;
+    if (unlikely(PyDict_SetItemString(scratch_dict, "co_firstlineno", firstlineno))) return NULL;
+    if (unlikely(PyDict_SetItemString(scratch_dict, "co_name", name))) return NULL;
+    replace = PyObject_GetAttrString(code, "replace");
+    if (likely(replace)) {
+        PyObject *result;
+        result = PyObject_Call(replace, __pyx_empty_tuple, scratch_dict);
+        Py_DECREF(replace);
+        return result;
+    }
+    /* No `replace` attribute: discard the AttributeError and try the fallback. */
+    PyErr_Clear();
+    /* NOTE(review): this constant has nine hex digits and looks like a typo
+     * for a Python version hex such as 0x03080000 -- confirm before changing;
+     * it is left untouched here. */
+    #if __PYX_LIMITED_VERSION_HEX < 0x030780000
+    {
+        PyObject *compiled = NULL, *result = NULL;
+        if (unlikely(PyDict_SetItemString(scratch_dict, "code", code))) return NULL;
+        if (unlikely(PyDict_SetItemString(scratch_dict, "type", (PyObject*)(&PyType_Type)))) return NULL;
+        compiled = Py_CompileString(
+            "out = type(code)(\n"
+            "  code.co_argcount, code.co_kwonlyargcount, code.co_nlocals, code.co_stacksize,\n"
+            "  code.co_flags, code.co_code, code.co_consts, code.co_names,\n"
+            "  code.co_varnames, code.co_filename, co_name, co_firstlineno,\n"
+            "  code.co_lnotab)\n", "<dummy>", Py_file_input);
+        if (!compiled) return NULL;
+        result = PyEval_EvalCode(compiled, scratch_dict, scratch_dict);
+        Py_DECREF(compiled);
+        if (!result) PyErr_Print();
+        /* `result` is NULL when evaluation failed, so it must be released
+         * with Py_XDECREF; Py_DECREF would dereference NULL. */
+        Py_XDECREF(result);
+        /* Borrowed reference; absent (NULL) if the script did not run to completion. */
+        result = PyDict_GetItemString(scratch_dict, "out");
+        if (result) Py_INCREF(result);
+        return result;
+    }
+    #else
+    return NULL;
+    #endif
+}
+/* Add a traceback entry for `funcname` at `filename`:`py_line`
+ * (Limited API build).
+ *
+ * A tiny expression calling `_getframe()` is compiled with `filename`, its
+ * first line number and name are replaced with `py_line` and `funcname`, and
+ * evaluating it yields a frame object that is handed to PyTraceBack_Here().
+ * The pending exception is saved on entry and restored before the traceback
+ * entry is added.  Any failure along the way is silent: no traceback entry is
+ * added in that case.
+ */
+static void __Pyx_AddTraceback(const char *funcname, int c_line,
+                               int py_line, const char *filename) {
+    PyObject *code_object = NULL, *py_py_line = NULL, *py_funcname = NULL, *dict = NULL;
+    PyObject *getframe = NULL, *frame = NULL;
+    PyObject *exc_type, *exc_value, *exc_traceback;
+    int success = 0;
+    if (c_line) {
+        /* The C line is not reported in this build; the references below
+         * only keep the helpers from being flagged as unused. */
+        (void) __pyx_cfilenm;
+        (void) __Pyx_CLineForTraceback(__Pyx_PyThreadState_Current, c_line);
+    }
+    PyErr_Fetch(&exc_type, &exc_value, &exc_traceback);
+    code_object = Py_CompileString("_getframe()", filename, Py_eval_input);
+    if (unlikely(!code_object)) goto bad;
+    py_py_line = PyLong_FromLong(py_line);
+    if (unlikely(!py_py_line)) goto bad;
+    py_funcname = PyUnicode_FromString(funcname);
+    if (unlikely(!py_funcname)) goto bad;
+    dict = PyDict_New();
+    if (unlikely(!dict)) goto bad;
+    {
+        /* Swap the compiled code for the renamed/renumbered copy. */
+        PyObject *old_code_object = code_object;
+        code_object = __Pyx_PyCode_Replace_For_AddTraceback(code_object, dict, py_py_line, py_funcname);
+        Py_DECREF(old_code_object);
+    }
+    if (unlikely(!code_object)) goto bad;
+    /* Borrowed reference. */
+    getframe = PySys_GetObject("_getframe");
+    if (unlikely(!getframe)) goto bad;
+    if (unlikely(PyDict_SetItemString(dict, "_getframe", getframe))) goto bad;
+    frame = PyEval_EvalCode(code_object, dict, dict);
+    if (unlikely(!frame) || frame == Py_None) goto bad;
+    success = 1;
+  bad:
+    /* Restoring the saved exception also discards any error raised above. */
+    PyErr_Restore(exc_type, exc_value, exc_traceback);
+    Py_XDECREF(code_object);
+    Py_XDECREF(py_py_line);
+    Py_XDECREF(py_funcname);
+    Py_XDECREF(dict);
+    if (success) {
+        PyTraceBack_Here(
+            (struct _frame*)frame);
+    }
+    Py_XDECREF(frame);
+}
+#else
+/* Create an empty code object that represents `funcname` in a traceback.
+ *
+ * funcname - function name to show.
+ * c_line   - C source line; when non-zero the displayed name becomes
+ *            "funcname (cfile:c_line)".
+ * py_line  - line number recorded as the code object's first line.
+ * filename - source file name recorded on the code object.
+ *
+ * Returns a new code object, or NULL on failure.
+ */
+static PyCodeObject* __Pyx_CreateCodeObjectForTraceback(
+            const char *funcname, int c_line,
+            int py_line, const char *filename) {
+    PyCodeObject *py_code = NULL;
+    PyObject *py_funcname = NULL;
+    #if PY_MAJOR_VERSION < 3
+    PyObject *py_srcfile = NULL;
+    py_srcfile = PyString_FromString(filename);
+    if (!py_srcfile) goto bad;
+    #endif
+    if (c_line) {
+        #if PY_MAJOR_VERSION < 3
+        py_funcname = PyString_FromFormat( "%s (%s:%d)", funcname, __pyx_cfilenm, c_line);
+        if (!py_funcname) goto bad;
+        #else
+        py_funcname = PyUnicode_FromFormat( "%s (%s:%d)", funcname, __pyx_cfilenm, c_line);
+        if (!py_funcname) goto bad;
+        /* `funcname` now points into py_funcname's UTF-8 buffer, so
+         * py_funcname must stay alive until the code object has been created. */
+        funcname = PyUnicode_AsUTF8(py_funcname);
+        if (!funcname) goto bad;
+        #endif
+    }
+    else {
+        #if PY_MAJOR_VERSION < 3
+        py_funcname = PyString_FromString(funcname);
+        if (!py_funcname) goto bad;
+        #endif
+    }
+    #if PY_MAJOR_VERSION < 3
+    py_code = __Pyx_PyCode_New(
+        0,
+        0,
+        0,
+        0,
+        0,
+        0,
+        __pyx_empty_bytes, /*PyObject *code,*/
+        __pyx_empty_tuple, /*PyObject *consts,*/
+        __pyx_empty_tuple, /*PyObject *names,*/
+        __pyx_empty_tuple, /*PyObject *varnames,*/
+        __pyx_empty_tuple, /*PyObject *freevars,*/
+        __pyx_empty_tuple, /*PyObject *cellvars,*/
+        py_srcfile,   /*PyObject *filename,*/
+        py_funcname,  /*PyObject *name,*/
+        py_line,
+        __pyx_empty_bytes  /*PyObject *lnotab*/
+    );
+    Py_DECREF(py_srcfile);
+    #else
+    py_code = PyCode_NewEmpty(filename, funcname, py_line);
+    #endif
+    /* py_funcname is NULL on Python 3 when c_line is 0, hence XDECREF. */
+    Py_XDECREF(py_funcname);
+    return py_code;
+bad:
+    Py_XDECREF(py_funcname);
+    #if PY_MAJOR_VERSION < 3
+    Py_XDECREF(py_srcfile);
+    #endif
+    return NULL;
+}
+static void __Pyx_AddTraceback(const char *funcname, int c_line,
+                               int py_line, const char *filename) {
+    PyCodeObject *py_code = 0;
+    PyFrameObject *py_frame = 0;
+    PyThreadState *tstate = __Pyx_PyThreadState_Current;
+    PyObject *ptype, *pvalue, *ptraceback;
+    if (c_line) {
+        c_line = __Pyx_CLineForTraceback(tstate, c_line);
+    }
+    py_code = __pyx_find_code_object(c_line ? -c_line : py_line);
+    if (!py_code) {
+        __Pyx_ErrFetchInState(tstate, &ptype, &pvalue, &ptraceback);
+        py_code = __Pyx_CreateCodeObjectForTraceback(
+            funcname, c_line, py_line, filename);
+        if (!py_code) {
+            /* If the code object creation fails, then we should clear the
+               fetched exception references and propagate the new exception */
+            Py_XDECREF(ptype);
+            Py_XDECREF(pvalue);
+            Py_XDECREF(ptraceback);
+            goto bad;
+        }
+        __Pyx_ErrRestoreInState(tstate, ptype, pvalue, ptraceback);
+        __pyx_insert_code_object(c_line ? -c_line : py_line, py_code);
+    }
+    py_frame = PyFrame_New(
+        tstate,            /*PyThreadState *tstate,*/
+        py_code,           /*PyCodeObject *code,*/
+        __pyx_d,    /*PyObject *globals,*/
+        0                  /*PyObject *locals*/
+    );
+    if (!py_frame) goto bad;
+    __Pyx_PyFrame_SetLineNumber(py_frame, py_line);
+    PyTraceBack_Here(py_frame);
+bad:
+    Py_XDECREF(py_code);
+    Py_XDECREF(py_frame);
+}
+#endif
+
+#if PY_MAJOR_VERSION < 3
+static int __Pyx_GetBuffer(PyObject *obj, Py_buffer *view, int flags) {
+    __Pyx_TypeName obj_type_name;
+    if (PyObject_CheckBuffer(obj)) return PyObject_GetBuffer(obj, view, flags);
+        if (__Pyx_TypeCheck(obj, __pyx_array_type)) return __pyx_array_getbuffer(obj, view, flags);
+        if (__Pyx_TypeCheck(obj, __pyx_memoryview_type)) return __pyx_memoryview_getbuffer(obj, view, flags);
+    obj_type_name = __Pyx_PyType_GetName(Py_TYPE(obj));
+    PyErr_Format(PyExc_TypeError,
+                 "'" __Pyx_FMT_TYPENAME "' does not have the buffer interface",
+                 obj_type_name);
+    __Pyx_DECREF_TypeName(obj_type_name);
+    return -1;
+}
+static void __Pyx_ReleaseBuffer(Py_buffer *view) {
+    PyObject *obj = view->obj;
+    if (!obj) return;
+    if (PyObject_CheckBuffer(obj)) {
+        PyBuffer_Release(view);
+        return;
+    }
+    if ((0)) {}
+    view->obj = NULL;
+    Py_DECREF(obj);
+}
+#endif
+
+
+  /* MemviewSliceIsContig */
+  static int
+__pyx_memviewslice_is_contig(const __Pyx_memviewslice mvs, char order, int ndim)
+{
+    int i, index, step, start;
+    Py_ssize_t itemsize = mvs.memview->view.itemsize;
+    if (order == 'F') {
+        step = 1;
+        start = 0;
+    } else {
+        step = -1;
+        start = ndim - 1;
+    }
+    for (i = 0; i < ndim; i++) {
+        index = start + step * i;
+        if (mvs.suboffsets[index] >= 0 || mvs.strides[index] != itemsize)
+            return 0;
+        itemsize *= mvs.shape[index];
+    }
+    return 1;
+}
+
+/* OverlappingSlices */
+  static void
+__pyx_get_array_memory_extents(__Pyx_memviewslice *slice,
+                               void **out_start, void **out_end,
+                               int ndim, size_t itemsize)
+{
+    char *start, *end;
+    int i;
+    start = end = slice->data;
+    for (i = 0; i < ndim; i++) {
+        Py_ssize_t stride = slice->strides[i];
+        Py_ssize_t extent = slice->shape[i];
+        if (extent == 0) {
+            *out_start = *out_end = start;
+            return;
+        } else {
+            if (stride > 0)
+                end += stride * (extent - 1);
+            else
+                start += stride * (extent - 1);
+        }
+    }
+    *out_start = start;
+    *out_end = end + itemsize;
+}
+static int
+__pyx_slices_overlap(__Pyx_memviewslice *slice1,
+                     __Pyx_memviewslice *slice2,
+                     int ndim, size_t itemsize)
+{
+    void *start1, *end1, *start2, *end2;
+    __pyx_get_array_memory_extents(slice1, &start1, &end1, ndim, itemsize);
+    __pyx_get_array_memory_extents(slice2, &start2, &end2, ndim, itemsize);
+    return (start1 < end2) && (start2 < end1);
+}
+
+/* CIntFromPyVerify */
+  #define __PYX_VERIFY_RETURN_INT(target_type, func_type, func_value)\
+    __PYX__VERIFY_RETURN_INT(target_type, func_type, func_value, 0)
+#define __PYX_VERIFY_RETURN_INT_EXC(target_type, func_type, func_value)\
+    __PYX__VERIFY_RETURN_INT(target_type, func_type, func_value, 1)
+#define __PYX__VERIFY_RETURN_INT(target_type, func_type, func_value, exc)\
+    {\
+        func_type value = func_value;\
+        if (sizeof(target_type) < sizeof(func_type)) {\
+            if (unlikely(value != (func_type) (target_type) value)) {\
+                func_type zero = 0;\
+                if (exc && unlikely(value == (func_type)-1 && PyErr_Occurred()))\
+                    return (target_type) -1;\
+                if (is_unsigned && unlikely(value < zero))\
+                    goto raise_neg_overflow;\
+                else\
+                    goto raise_overflow;\
+            }\
+        }\
+        return (target_type) value;\
+    }
+
+/* TypeInfoCompare */
+  static int
+__pyx_typeinfo_cmp(__Pyx_TypeInfo *a, __Pyx_TypeInfo *b)
+{
+    int i;
+    if (!a || !b)
+        return 0;
+    if (a == b)
+        return 1;
+    if (a->size != b->size || a->typegroup != b->typegroup ||
+            a->is_unsigned != b->is_unsigned || a->ndim != b->ndim) {
+        if (a->typegroup == 'H' || b->typegroup == 'H') {
+            return a->size == b->size;
+        } else {
+            return 0;
+        }
+    }
+    if (a->ndim) {
+        for (i = 0; i < a->ndim; i++)
+            if (a->arraysize[i] != b->arraysize[i])
+                return 0;
+    }
+    if (a->typegroup == 'S') {
+        if (a->flags != b->flags)
+            return 0;
+        if (a->fields || b->fields) {
+            if (!(a->fields && b->fields))
+                return 0;
+            for (i = 0; a->fields[i].type && b->fields[i].type; i++) {
+                __Pyx_StructField *field_a = a->fields + i;
+                __Pyx_StructField *field_b = b->fields + i;
+                if (field_a->offset != field_b->offset ||
+                    !__pyx_typeinfo_cmp(field_a->type, field_b->type))
+                    return 0;
+            }
+            return !a->fields[i].type && !b->fields[i].type;
+        }
+    }
+    return 1;
+}
+
+/* MemviewSliceValidateAndInit */
+  static int
+__pyx_check_strides(Py_buffer *buf, int dim, int ndim, int spec)
+{
+    if (buf->shape[dim] <= 1)
+        return 1;
+    if (buf->strides) {
+        if (spec & __Pyx_MEMVIEW_CONTIG) {
+            if (spec & (__Pyx_MEMVIEW_PTR|__Pyx_MEMVIEW_FULL)) {
+                if (unlikely(buf->strides[dim] != sizeof(void *))) {
+                    PyErr_Format(PyExc_ValueError,
+                                 "Buffer is not indirectly contiguous "
+                                 "in dimension %d.", dim);
+                    goto fail;
+                }
+            } else if (unlikely(buf->strides[dim] != buf->itemsize)) {
+                PyErr_SetString(PyExc_ValueError,
+                                "Buffer and memoryview are not contiguous "
+                                "in the same dimension.");
+                goto fail;
+            }
+        }
+        if (spec & __Pyx_MEMVIEW_FOLLOW) {
+            Py_ssize_t stride = buf->strides[dim];
+            if (stride < 0)
+                stride = -stride;
+            if (unlikely(stride < buf->itemsize)) {
+                PyErr_SetString(PyExc_ValueError,
+                                "Buffer and memoryview are not contiguous "
+                                "in the same dimension.");
+                goto fail;
+            }
+        }
+    } else {
+        if (unlikely(spec & __Pyx_MEMVIEW_CONTIG && dim != ndim - 1)) {
+            PyErr_Format(PyExc_ValueError,
+                         "C-contiguous buffer is not contiguous in "
+                         "dimension %d", dim);
+            goto fail;
+        } else if (unlikely(spec & (__Pyx_MEMVIEW_PTR))) {
+            PyErr_Format(PyExc_ValueError,
+                         "C-contiguous buffer is not indirect in "
+                         "dimension %d", dim);
+            goto fail;
+        } else if (unlikely(buf->suboffsets)) {
+            PyErr_SetString(PyExc_ValueError,
+                            "Buffer exposes suboffsets but no strides");
+            goto fail;
+        }
+    }
+    return 1;
+fail:
+    return 0;
+}
+static int
+__pyx_check_suboffsets(Py_buffer *buf, int dim, int ndim, int spec)
+{
+    CYTHON_UNUSED_VAR(ndim);
+    if (spec & __Pyx_MEMVIEW_DIRECT) {
+        if (unlikely(buf->suboffsets && buf->suboffsets[dim] >= 0)) {
+            PyErr_Format(PyExc_ValueError,
+                         "Buffer not compatible with direct access "
+                         "in dimension %d.", dim);
+            goto fail;
+        }
+    }
+    if (spec & __Pyx_MEMVIEW_PTR) {
+        if (unlikely(!buf->suboffsets || (buf->suboffsets[dim] < 0))) {
+            PyErr_Format(PyExc_ValueError,
+                         "Buffer is not indirectly accessible "
+                         "in dimension %d.", dim);
+            goto fail;
+        }
+    }
+    return 1;
+fail:
+    return 0;
+}
+static int
+__pyx_verify_contig(Py_buffer *buf, int ndim, int c_or_f_flag)
+{
+    int i;
+    if (c_or_f_flag & __Pyx_IS_F_CONTIG) {
+        Py_ssize_t stride = 1;
+        for (i = 0; i < ndim; i++) {
+            if (unlikely(stride * buf->itemsize != buf->strides[i]  &&  buf->shape[i] > 1)) {
+                PyErr_SetString(PyExc_ValueError,
+                    "Buffer not fortran contiguous.");
+                goto fail;
+            }
+            stride = stride * buf->shape[i];
+        }
+    } else if (c_or_f_flag & __Pyx_IS_C_CONTIG) {
+        Py_ssize_t stride = 1;
+        for (i = ndim - 1; i >- 1; i--) {
+            if (unlikely(stride * buf->itemsize != buf->strides[i]  &&  buf->shape[i] > 1)) {
+                PyErr_SetString(PyExc_ValueError,
+                    "Buffer not C contiguous.");
+                goto fail;
+            }
+            stride = stride * buf->shape[i];
+        }
+    }
+    return 1;
+fail:
+    return 0;
+}
+static int __Pyx_ValidateAndInit_memviewslice(
+                int *axes_specs,
+                int c_or_f_flag,
+                int buf_flags,
+                int ndim,
+                __Pyx_TypeInfo *dtype,
+                __Pyx_BufFmt_StackElem stack[],
+                __Pyx_memviewslice *memviewslice,
+                PyObject *original_obj)
+{
+    struct __pyx_memoryview_obj *memview, *new_memview;
+    __Pyx_RefNannyDeclarations
+    Py_buffer *buf;
+    int i, spec = 0, retval = -1;
+    __Pyx_BufFmt_Context ctx;
+    int from_memoryview = __pyx_memoryview_check(original_obj);
+    __Pyx_RefNannySetupContext("ValidateAndInit_memviewslice", 0);
+    if (from_memoryview && __pyx_typeinfo_cmp(dtype, ((struct __pyx_memoryview_obj *)
+                                                            original_obj)->typeinfo)) {
+        memview = (struct __pyx_memoryview_obj *) original_obj;
+        new_memview = NULL;
+    } else {
+        memview = (struct __pyx_memoryview_obj *) __pyx_memoryview_new(
+                                            original_obj, buf_flags, 0, dtype);
+        new_memview = memview;
+        if (unlikely(!memview))
+            goto fail;
+    }
+    buf = &memview->view;
+    if (unlikely(buf->ndim != ndim)) {
+        PyErr_Format(PyExc_ValueError,
+                "Buffer has wrong number of dimensions (expected %d, got %d)",
+                ndim, buf->ndim);
+        goto fail;
+    }
+    if (new_memview) {
+        __Pyx_BufFmt_Init(&ctx, stack, dtype);
+        if (unlikely(!__Pyx_BufFmt_CheckString(&ctx, buf->format))) goto fail;
+    }
+    if (unlikely((unsigned) buf->itemsize != dtype->size)) {
+        PyErr_Format(PyExc_ValueError,
+                     "Item size of buffer (%" CYTHON_FORMAT_SSIZE_T "u byte%s) "
+                     "does not match size of '%s' (%" CYTHON_FORMAT_SSIZE_T "u byte%s)",
+                     buf->itemsize,
+                     (buf->itemsize > 1) ? "s" : "",
+                     dtype->name,
+                     dtype->size,
+                     (dtype->size > 1) ? "s" : "");
+        goto fail;
+    }
+    if (buf->len > 0) {
+        for (i = 0; i < ndim; i++) {
+            spec = axes_specs[i];
+            if (unlikely(!__pyx_check_strides(buf, i, ndim, spec)))
+                goto fail;
+            if (unlikely(!__pyx_check_suboffsets(buf, i, ndim, spec)))
+                goto fail;
+        }
+        if (unlikely(buf->strides && !__pyx_verify_contig(buf, ndim, c_or_f_flag)))
+            goto fail;
+    }
+    if (unlikely(__Pyx_init_memviewslice(memview, ndim, memviewslice,
+                                         new_memview != NULL) == -1)) {
+        goto fail;
+    }
+    retval = 0;
+    goto no_fail;
+fail:
+    Py_XDECREF(new_memview);
+    retval = -1;
+no_fail:
+    __Pyx_RefNannyFinishContext();
+    return retval;
+}
+
+/* ObjectToMemviewSlice */
+  static CYTHON_INLINE __Pyx_memviewslice __Pyx_PyObject_to_MemoryviewSlice_ds_nn___pyx_t_7fairseq_4data_15data_utils_fast_DTYPE_t(PyObject *obj, int writable_flag) {
+    __Pyx_memviewslice result = { 0, 0, { 0 }, { 0 }, { 0 } };
+    __Pyx_BufFmt_StackElem stack[1];
+    int axes_specs[] = { (__Pyx_MEMVIEW_DIRECT | __Pyx_MEMVIEW_STRIDED) };
+    int retcode;
+    if (obj == Py_None) {
+        result.memview = (struct __pyx_memoryview_obj *) Py_None;
+        return result;
+    }
+    retcode = __Pyx_ValidateAndInit_memviewslice(axes_specs, 0,
+                                                 PyBUF_RECORDS_RO | writable_flag, 1,
+                                                 &__Pyx_TypeInfo_nn___pyx_t_7fairseq_4data_15data_utils_fast_DTYPE_t, stack,
+                                                 &result, obj);
+    if (unlikely(retcode == -1))
+        goto __pyx_fail;
+    return result;
+__pyx_fail:
+    result.memview = NULL;
+    result.data = NULL;
+    return result;
+}
+
+/* MemviewDtypeToObject */
+  static CYTHON_INLINE PyObject *__pyx_memview_get_nn___pyx_t_7fairseq_4data_15data_utils_fast_DTYPE_t(const char *itemp) {
+    return (PyObject *) __Pyx_PyInt_From_npy_int64(*(__pyx_t_7fairseq_4data_15data_utils_fast_DTYPE_t *) itemp);
+}
+static CYTHON_INLINE int __pyx_memview_set_nn___pyx_t_7fairseq_4data_15data_utils_fast_DTYPE_t(const char *itemp, PyObject *obj) {
+    __pyx_t_7fairseq_4data_15data_utils_fast_DTYPE_t value = __Pyx_PyInt_As_npy_int64(obj);
+    if (unlikely((value == ((npy_int64)-1)) && PyErr_Occurred()))
+        return 0;
+    *(__pyx_t_7fairseq_4data_15data_utils_fast_DTYPE_t *) itemp = value;
+    return 1;
+}
+
+/* ObjectToMemviewSlice */
+  static CYTHON_INLINE __Pyx_memviewslice __Pyx_PyObject_to_MemoryviewSlice_dsds_nn___pyx_t_7fairseq_4data_15data_utils_fast_DTYPE_t(PyObject *obj, int writable_flag) {
+    __Pyx_memviewslice result = { 0, 0, { 0 }, { 0 }, { 0 } };
+    __Pyx_BufFmt_StackElem stack[1];
+    int axes_specs[] = { (__Pyx_MEMVIEW_DIRECT | __Pyx_MEMVIEW_STRIDED), (__Pyx_MEMVIEW_DIRECT | __Pyx_MEMVIEW_STRIDED) };
+    int retcode;
+    if (obj == Py_None) {
+        result.memview = (struct __pyx_memoryview_obj *) Py_None;
+        return result;
+    }
+    retcode = __Pyx_ValidateAndInit_memviewslice(axes_specs, 0,
+                                                 PyBUF_RECORDS_RO | writable_flag, 2,
+                                                 &__Pyx_TypeInfo_nn___pyx_t_7fairseq_4data_15data_utils_fast_DTYPE_t, stack,
+                                                 &result, obj);
+    if (unlikely(retcode == -1))
+        goto __pyx_fail;
+    return result;
+__pyx_fail:
+    result.memview = NULL;
+    result.data = NULL;
+    return result;
+}
+
+/* Declarations */
+  #if CYTHON_CCOMPLEX && (1) && (!0 || __cplusplus)
+  #ifdef __cplusplus
+    static CYTHON_INLINE __pyx_t_float_complex __pyx_t_float_complex_from_parts(float x, float y) {
+      return ::std::complex< float >(x, y);
+    }
+  #else
+    static CYTHON_INLINE __pyx_t_float_complex __pyx_t_float_complex_from_parts(float x, float y) {
+      return x + y*(__pyx_t_float_complex)_Complex_I;
+    }
+  #endif
+#else
+    static CYTHON_INLINE __pyx_t_float_complex __pyx_t_float_complex_from_parts(float x, float y) {
+      __pyx_t_float_complex z;
+      z.real = x;
+      z.imag = y;
+      return z;
+    }
+#endif
+
+/* Arithmetic */
+  #if CYTHON_CCOMPLEX && (1) && (!0 || __cplusplus)
+#else
+    static CYTHON_INLINE int __Pyx_c_eq_float(__pyx_t_float_complex a, __pyx_t_float_complex b) {
+       return (a.real == b.real) && (a.imag == b.imag);
+    }
+    static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_sum_float(__pyx_t_float_complex a, __pyx_t_float_complex b) {
+        __pyx_t_float_complex z;
+        z.real = a.real + b.real;
+        z.imag = a.imag + b.imag;
+        return z;
+    }
+    static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_diff_float(__pyx_t_float_complex a, __pyx_t_float_complex b) {
+        __pyx_t_float_complex z;
+        z.real = a.real - b.real;
+        z.imag = a.imag - b.imag;
+        return z;
+    }
+    static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_prod_float(__pyx_t_float_complex a, __pyx_t_float_complex b) {
+        __pyx_t_float_complex z;
+        z.real = a.real * b.real - a.imag * b.imag;
+        z.imag = a.real * b.imag + a.imag * b.real;
+        return z;
+    }
+    #if 1
+    static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_quot_float(__pyx_t_float_complex a, __pyx_t_float_complex b) {
+        if (b.imag == 0) {
+            return __pyx_t_float_complex_from_parts(a.real / b.real, a.imag / b.real);
+        } else if (fabsf(b.real) >= fabsf(b.imag)) {
+            if (b.real == 0 && b.imag == 0) {
+                return __pyx_t_float_complex_from_parts(a.real / b.real, a.imag / b.imag);
+            } else {
+                float r = b.imag / b.real;
+                float s = (float)(1.0) / (b.real + b.imag * r);
+                return __pyx_t_float_complex_from_parts(
+                    (a.real + a.imag * r) * s, (a.imag - a.real * r) * s);
+            }
+        } else {
+            float r = b.real / b.imag;
+            float s = (float)(1.0) / (b.imag + b.real * r);
+            return __pyx_t_float_complex_from_parts(
+                (a.real * r + a.imag) * s, (a.imag * r - a.real) * s);
+        }
+    }
+    #else
+    static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_quot_float(__pyx_t_float_complex a, __pyx_t_float_complex b) {
+        if (b.imag == 0) {
+            return __pyx_t_float_complex_from_parts(a.real / b.real, a.imag / b.real);
+        } else {
+            float denom = b.real * b.real + b.imag * b.imag;
+            return __pyx_t_float_complex_from_parts(
+                (a.real * b.real + a.imag * b.imag) / denom,
+                (a.imag * b.real - a.real * b.imag) / denom);
+        }
+    }
+    #endif
+    static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_neg_float(__pyx_t_float_complex a) {
+        __pyx_t_float_complex z;
+        z.real = -a.real;
+        z.imag = -a.imag;
+        return z;
+    }
+    static CYTHON_INLINE int __Pyx_c_is_zero_float(__pyx_t_float_complex a) {
+       return (a.real == 0) && (a.imag == 0);
+    }
+    static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_conj_float(__pyx_t_float_complex a) {
+        __pyx_t_float_complex z;
+        z.real =  a.real;
+        z.imag = -a.imag;
+        return z;
+    }
+    #if 1
+        static CYTHON_INLINE float __Pyx_c_abs_float(__pyx_t_float_complex z) {
+          #if !defined(HAVE_HYPOT) || defined(_MSC_VER)
+            return sqrtf(z.real*z.real + z.imag*z.imag);
+          #else
+            return hypotf(z.real, z.imag);
+          #endif
+        }
+        static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_pow_float(__pyx_t_float_complex a, __pyx_t_float_complex b) {
+            __pyx_t_float_complex z;
+            float r, lnr, theta, z_r, z_theta;
+            if (b.imag == 0 && b.real == (int)b.real) {
+                if (b.real < 0) {
+                    float denom = a.real * a.real + a.imag * a.imag;
+                    a.real = a.real / denom;
+                    a.imag = -a.imag / denom;
+                    b.real = -b.real;
+                }
+                switch ((int)b.real) {
+                    case 0:
+                        z.real = 1;
+                        z.imag = 0;
+                        return z;
+                    case 1:
+                        return a;
+                    case 2:
+                        return __Pyx_c_prod_float(a, a);
+                    case 3:
+                        z = __Pyx_c_prod_float(a, a);
+                        return __Pyx_c_prod_float(z, a);
+                    case 4:
+                        z = __Pyx_c_prod_float(a, a);
+                        return __Pyx_c_prod_float(z, z);
+                }
+            }
+            if (a.imag == 0) {
+                if (a.real == 0) {
+                    return a;
+                } else if ((b.imag == 0) && (a.real >= 0)) {
+                    z.real = powf(a.real, b.real);
+                    z.imag = 0;
+                    return z;
+                } else if (a.real > 0) {
+                    r = a.real;
+                    theta = 0;
+                } else {
+                    r = -a.real;
+                    theta = atan2f(0.0, -1.0);
+                }
+            } else {
+                r = __Pyx_c_abs_float(a);
+                theta = atan2f(a.imag, a.real);
+            }
+            lnr = logf(r);
+            z_r = expf(lnr * b.real - theta * b.imag);
+            z_theta = theta * b.real + lnr * b.imag;
+            z.real = z_r * cosf(z_theta);
+            z.imag = z_r * sinf(z_theta);
+            return z;
+        }
+    #endif
+#endif
+
+/* Declarations */
+  #if CYTHON_CCOMPLEX && (1) && (!0 || __cplusplus)
+  #ifdef __cplusplus
+    static CYTHON_INLINE __pyx_t_double_complex __pyx_t_double_complex_from_parts(double x, double y) {
+      return ::std::complex< double >(x, y);
+    }
+  #else
+    static CYTHON_INLINE __pyx_t_double_complex __pyx_t_double_complex_from_parts(double x, double y) {
+      return x + y*(__pyx_t_double_complex)_Complex_I;
+    }
+  #endif
+#else
+    static CYTHON_INLINE __pyx_t_double_complex __pyx_t_double_complex_from_parts(double x, double y) {
+      __pyx_t_double_complex z;
+      z.real = x;
+      z.imag = y;
+      return z;
+    }
+#endif
+
+/* Arithmetic */
+  #if CYTHON_CCOMPLEX && (1) && (!0 || __cplusplus)
+#else
+    static CYTHON_INLINE int __Pyx_c_eq_double(__pyx_t_double_complex a, __pyx_t_double_complex b) {
+       return (a.real == b.real) && (a.imag == b.imag);
+    }
+    static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_sum_double(__pyx_t_double_complex a, __pyx_t_double_complex b) {
+        __pyx_t_double_complex z;
+        z.real = a.real + b.real;
+        z.imag = a.imag + b.imag;
+        return z;
+    }
+    static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_diff_double(__pyx_t_double_complex a, __pyx_t_double_complex b) {
+        __pyx_t_double_complex z;
+        z.real = a.real - b.real;
+        z.imag = a.imag - b.imag;
+        return z;
+    }
+    static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_prod_double(__pyx_t_double_complex a, __pyx_t_double_complex b) {
+        __pyx_t_double_complex z;
+        z.real = a.real * b.real - a.imag * b.imag;
+        z.imag = a.real * b.imag + a.imag * b.real;
+        return z;
+    }
+    #if 1
+    static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_quot_double(__pyx_t_double_complex a, __pyx_t_double_complex b) {
+        if (b.imag == 0) {
+            return __pyx_t_double_complex_from_parts(a.real / b.real, a.imag / b.real);
+        } else if (fabs(b.real) >= fabs(b.imag)) {
+            if (b.real == 0 && b.imag == 0) {
+                return __pyx_t_double_complex_from_parts(a.real / b.real, a.imag / b.imag);
+            } else {
+                double r = b.imag / b.real;
+                double s = (double)(1.0) / (b.real + b.imag * r);
+                return __pyx_t_double_complex_from_parts(
+                    (a.real + a.imag * r) * s, (a.imag - a.real * r) * s);
+            }
+        } else {
+            double r = b.real / b.imag;
+            double s = (double)(1.0) / (b.imag + b.real * r);
+            return __pyx_t_double_complex_from_parts(
+                (a.real * r + a.imag) * s, (a.imag * r - a.real) * s);
+        }
+    }
+    #else
+    static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_quot_double(__pyx_t_double_complex a, __pyx_t_double_complex b) {
+        if (b.imag == 0) {
+            return __pyx_t_double_complex_from_parts(a.real / b.real, a.imag / b.real);
+        } else {
+            double denom = b.real * b.real + b.imag * b.imag;
+            return __pyx_t_double_complex_from_parts(
+                (a.real * b.real + a.imag * b.imag) / denom,
+                (a.imag * b.real - a.real * b.imag) / denom);
+        }
+    }
+    #endif
+    static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_neg_double(__pyx_t_double_complex a) {
+        __pyx_t_double_complex z;
+        z.real = -a.real;
+        z.imag = -a.imag;
+        return z;
+    }
+    static CYTHON_INLINE int __Pyx_c_is_zero_double(__pyx_t_double_complex a) {
+       return (a.real == 0) && (a.imag == 0);
+    }
+    static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_conj_double(__pyx_t_double_complex a) {
+        __pyx_t_double_complex z;
+        z.real =  a.real;
+        z.imag = -a.imag;
+        return z;
+    }
+    #if 1
+        static CYTHON_INLINE double __Pyx_c_abs_double(__pyx_t_double_complex z) {
+          #if !defined(HAVE_HYPOT) || defined(_MSC_VER)
+            return sqrt(z.real*z.real + z.imag*z.imag);
+          #else
+            return hypot(z.real, z.imag);
+          #endif
+        }
+        static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_pow_double(__pyx_t_double_complex a, __pyx_t_double_complex b) {
+            __pyx_t_double_complex z;
+            double r, lnr, theta, z_r, z_theta;
+            if (b.imag == 0 && b.real == (int)b.real) {
+                if (b.real < 0) {
+                    double denom = a.real * a.real + a.imag * a.imag;
+                    a.real = a.real / denom;
+                    a.imag = -a.imag / denom;
+                    b.real = -b.real;
+                }
+                switch ((int)b.real) {
+                    case 0:
+                        z.real = 1;
+                        z.imag = 0;
+                        return z;
+                    case 1:
+                        return a;
+                    case 2:
+                        return __Pyx_c_prod_double(a, a);
+                    case 3:
+                        z = __Pyx_c_prod_double(a, a);
+                        return __Pyx_c_prod_double(z, a);
+                    case 4:
+                        z = __Pyx_c_prod_double(a, a);
+                        return __Pyx_c_prod_double(z, z);
+                }
+            }
+            if (a.imag == 0) {
+                if (a.real == 0) {
+                    return a;
+                } else if ((b.imag == 0) && (a.real >= 0)) {
+                    z.real = pow(a.real, b.real);
+                    z.imag = 0;
+                    return z;
+                } else if (a.real > 0) {
+                    r = a.real;
+                    theta = 0;
+                } else {
+                    r = -a.real;
+                    theta = atan2(0.0, -1.0);
+                }
+            } else {
+                r = __Pyx_c_abs_double(a);
+                theta = atan2(a.imag, a.real);
+            }
+            lnr = log(r);
+            z_r = exp(lnr * b.real - theta * b.imag);
+            z_theta = theta * b.real + lnr * b.imag;
+            z.real = z_r * cos(z_theta);
+            z.imag = z_r * sin(z_theta);
+            return z;
+        }
+    #endif
+#endif
+
+/* Declarations */
+  #if CYTHON_CCOMPLEX && (1) && (!0 || __cplusplus)
+  #ifdef __cplusplus
+    static CYTHON_INLINE __pyx_t_long_double_complex __pyx_t_long_double_complex_from_parts(long double x, long double y) {
+      return ::std::complex< long double >(x, y);
+    }
+  #else
+    static CYTHON_INLINE __pyx_t_long_double_complex __pyx_t_long_double_complex_from_parts(long double x, long double y) {
+      return x + y*(__pyx_t_long_double_complex)_Complex_I;
+    }
+  #endif
+#else
+    static CYTHON_INLINE __pyx_t_long_double_complex __pyx_t_long_double_complex_from_parts(long double x, long double y) {
+      __pyx_t_long_double_complex z;
+      z.real = x;
+      z.imag = y;
+      return z;
+    }
+#endif
+
+/* Arithmetic */
+  #if CYTHON_CCOMPLEX && (1) && (!0 || __cplusplus)
+#else
+    static CYTHON_INLINE int __Pyx_c_eq_long__double(__pyx_t_long_double_complex a, __pyx_t_long_double_complex b) {
+       return (a.real == b.real) && (a.imag == b.imag);
+    }
+    static CYTHON_INLINE __pyx_t_long_double_complex __Pyx_c_sum_long__double(__pyx_t_long_double_complex a, __pyx_t_long_double_complex b) {
+        __pyx_t_long_double_complex z;
+        z.real = a.real + b.real;
+        z.imag = a.imag + b.imag;
+        return z;
+    }
+    static CYTHON_INLINE __pyx_t_long_double_complex __Pyx_c_diff_long__double(__pyx_t_long_double_complex a, __pyx_t_long_double_complex b) {
+        __pyx_t_long_double_complex z;
+        z.real = a.real - b.real;
+        z.imag = a.imag - b.imag;
+        return z;
+    }
+    static CYTHON_INLINE __pyx_t_long_double_complex __Pyx_c_prod_long__double(__pyx_t_long_double_complex a, __pyx_t_long_double_complex b) {
+        __pyx_t_long_double_complex z;
+        z.real = a.real * b.real - a.imag * b.imag;
+        z.imag = a.real * b.imag + a.imag * b.real;
+        return z;
+    }
+    #if 1 /* active variant: complex quotient a / b that divides through by the larger-magnitude component of b instead of forming b.real^2 + b.imag^2 */
+    static CYTHON_INLINE __pyx_t_long_double_complex __Pyx_c_quot_long__double(__pyx_t_long_double_complex a, __pyx_t_long_double_complex b) {
+        if (b.imag == 0) { /* purely real divisor: divide both components by b.real (no zero check is made here) */
+            return __pyx_t_long_double_complex_from_parts(a.real / b.real, a.imag / b.real);
+        } else if (fabsl(b.real) >= fabsl(b.imag)) {
+            if (b.real == 0 && b.imag == 0) { /* NOTE(review): b.imag == 0 already returned above, so this branch looks unreachable - confirm */
+                return __pyx_t_long_double_complex_from_parts(a.real / b.real, a.imag / b.imag);
+            } else {
+                long double r = b.imag / b.real; /* ratio with magnitude <= 1 because |b.real| >= |b.imag| */
+                long double s = (long double)(1.0) / (b.real + b.imag * r); /* reciprocal of the scaled denominator */
+                return __pyx_t_long_double_complex_from_parts(
+                    (a.real + a.imag * r) * s, (a.imag - a.real * r) * s);
+            }
+        } else { /* |b.imag| is the larger component (or a comparison above was false): scale by b.imag instead */
+            long double r = b.real / b.imag;
+            long double s = (long double)(1.0) / (b.imag + b.real * r);
+            return __pyx_t_long_double_complex_from_parts(
+                (a.real * r + a.imag) * s, (a.imag * r - a.real) * s);
+        }
+    }
+    #else /* disabled variant (never compiled because of the #if 1 above): direct formula using denom = b.real^2 + b.imag^2 */
+    static CYTHON_INLINE __pyx_t_long_double_complex __Pyx_c_quot_long__double(__pyx_t_long_double_complex a, __pyx_t_long_double_complex b) {
+        if (b.imag == 0) {
+            return __pyx_t_long_double_complex_from_parts(a.real / b.real, a.imag / b.real);
+        } else {
+            long double denom = b.real * b.real + b.imag * b.imag;
+            return __pyx_t_long_double_complex_from_parts(
+                (a.real * b.real + a.imag * b.imag) / denom,
+                (a.imag * b.real - a.real * b.imag) / denom);
+        }
+    }
+    #endif
+    static CYTHON_INLINE __pyx_t_long_double_complex __Pyx_c_neg_long__double(__pyx_t_long_double_complex a) { /* returns -a: both components negated */
+        __pyx_t_long_double_complex z;
+        z.real = -a.real;
+        z.imag = -a.imag;
+        return z;
+    }
+    static CYTHON_INLINE int __Pyx_c_is_zero_long__double(__pyx_t_long_double_complex a) { /* returns nonzero iff both the real and imaginary parts compare equal to 0 */
+       return (a.real == 0) && (a.imag == 0);
+    }
+    static CYTHON_INLINE __pyx_t_long_double_complex __Pyx_c_conj_long__double(__pyx_t_long_double_complex a) { /* returns the complex conjugate of a: same real part, negated imaginary part */
+        __pyx_t_long_double_complex z;
+        z.real =  a.real;
+        z.imag = -a.imag;
+        return z;
+    }
+    #if 1
+        static CYTHON_INLINE long double __Pyx_c_abs_long__double(__pyx_t_long_double_complex z) { /* returns the magnitude of z */
+          #if !defined(HAVE_HYPOT) || defined(_MSC_VER) /* no HAVE_HYPOT, or MSVC: compute sqrt(re^2 + im^2) directly */
+            return sqrtl(z.real*z.real + z.imag*z.imag);
+          #else /* otherwise delegate to hypotl() */
+            return hypotl(z.real, z.imag);
+          #endif
+        }
+        static CYTHON_INLINE __pyx_t_long_double_complex __Pyx_c_pow_long__double(__pyx_t_long_double_complex a, __pyx_t_long_double_complex b) { /* returns a raised to the power b; small integer exponents and real-axis bases are special-cased before the general polar-form computation */
+            __pyx_t_long_double_complex z;
+            long double r, lnr, theta, z_r, z_theta;
+            if (b.imag == 0 && b.real == (int)b.real) { /* exponent is a real value that survives a round trip through int */
+                if (b.real < 0) { /* negative exponent: replace a by conj(a)/|a|^2 (its reciprocal) and negate the exponent */
+                    long double denom = a.real * a.real + a.imag * a.imag;
+                    a.real = a.real / denom;
+                    a.imag = -a.imag / denom;
+                    b.real = -b.real;
+                }
+                switch ((int)b.real) { /* exponents 0..4 use repeated multiplication; any other value falls through to the code below (with a and b as modified above) */
+                    case 0:
+                        z.real = 1;
+                        z.imag = 0;
+                        return z;
+                    case 1:
+                        return a;
+                    case 2:
+                        return __Pyx_c_prod_long__double(a, a);
+                    case 3:
+                        z = __Pyx_c_prod_long__double(a, a);
+                        return __Pyx_c_prod_long__double(z, a);
+                    case 4:
+                        z = __Pyx_c_prod_long__double(a, a); /* a^4 computed as (a^2)^2 */
+                        return __Pyx_c_prod_long__double(z, z);
+                }
+            }
+            if (a.imag == 0) { /* base lies on the real axis */
+                if (a.real == 0) { /* zero base: a (i.e. zero) is returned whatever b is */
+                    return a;
+                } else if ((b.imag == 0) && (a.real >= 0)) { /* positive real base with real exponent: plain powl() */
+                    z.real = powl(a.real, b.real);
+                    z.imag = 0;
+                    return z;
+                } else if (a.real > 0) { /* positive real base, complex exponent: modulus a.real, argument 0 */
+                    r = a.real;
+                    theta = 0;
+                } else { /* negative real base: modulus -a.real, argument pi */
+                    r = -a.real;
+                    theta = atan2l(0.0, -1.0); /* atan2(0, -1) evaluates to pi */
+                }
+            } else { /* general base: polar form via magnitude and atan2l */
+                r = __Pyx_c_abs_long__double(a);
+                theta = atan2l(a.imag, a.real);
+            }
+            lnr = logl(r);
+            z_r = expl(lnr * b.real - theta * b.imag); /* modulus of the result */
+            z_theta = theta * b.real + lnr * b.imag; /* argument of the result */
+            z.real = z_r * cosl(z_theta);
+            z.imag = z_r * sinl(z_theta);
+            return z;
+        }
+    #endif
+#endif
+
+/* MemviewSliceCopyTemplate: copy the slice *from_mvs into a newly created array object (built with the same shape, the source buffer's format and the given mode) and return a slice over that new object; on failure the returned slice has memview == NULL and data == NULL. */
+  static __Pyx_memviewslice
+__pyx_memoryview_copy_new_contig(const __Pyx_memviewslice *from_mvs,
+                                 const char *mode, int ndim,
+                                 size_t sizeof_dtype, int contig_flag,
+                                 int dtype_is_object)
+{
+    __Pyx_RefNannyDeclarations
+    int i;
+    __Pyx_memviewslice new_mvs = { 0, 0, { 0 }, { 0 }, { 0 } }; /* zero-initialised result; stays in this state if an early check fails */
+    struct __pyx_memoryview_obj *from_memview = from_mvs->memview;
+    Py_buffer *buf = &from_memview->view;
+    PyObject *shape_tuple = NULL;
+    PyObject *temp_int = NULL;
+    struct __pyx_array_obj *array_obj = NULL;
+    struct __pyx_memoryview_obj *memview_obj = NULL;
+    __Pyx_RefNannySetupContext("__pyx_memoryview_copy_new_contig", 0);
+    for (i = 0; i < ndim; i++) { /* any dimension with a non-negative suboffset is rejected with ValueError */
+        if (unlikely(from_mvs->suboffsets[i] >= 0)) {
+            PyErr_Format(PyExc_ValueError, "Cannot copy memoryview slice with "
+                                           "indirect dimensions (axis %d)", i);
+            goto fail;
+        }
+    }
+    shape_tuple = PyTuple_New(ndim); /* Python tuple holding the source shape, passed to the array constructor below */
+    if (unlikely(!shape_tuple)) {
+        goto fail;
+    }
+    __Pyx_GOTREF(shape_tuple);
+    for(i = 0; i < ndim; i++) {
+        temp_int = PyInt_FromSsize_t(from_mvs->shape[i]);
+        if(unlikely(!temp_int)) {
+            goto fail;
+        } else {
+            PyTuple_SET_ITEM(shape_tuple, i, temp_int); /* PyTuple_SET_ITEM steals the reference to temp_int */
+            temp_int = NULL; /* cleared so the cleanup code below does not release it a second time */
+        }
+    }
+    array_obj = __pyx_array_new(shape_tuple, sizeof_dtype, buf->format, (char *) mode, NULL); /* destination storage for the copy */
+    if (unlikely(!array_obj)) {
+        goto fail;
+    }
+    __Pyx_GOTREF(array_obj);
+    memview_obj = (struct __pyx_memoryview_obj *) __pyx_memoryview_new(
+                                    (PyObject *) array_obj, contig_flag,
+                                    dtype_is_object,
+                                    from_mvs->memview->typeinfo);
+    if (unlikely(!memview_obj))
+        goto fail;
+    if (unlikely(__Pyx_init_memviewslice(memview_obj, ndim, &new_mvs, 1) < 0)) /* last argument 1: memview_obj is passed as a new reference */
+        goto fail;
+    if (unlikely(__pyx_memoryview_copy_contents(*from_mvs, new_mvs, ndim, ndim,
+                                                dtype_is_object) < 0))
+        goto fail;
+    goto no_fail;
+fail: /* error path: release whatever memoryview the result holds and hand back an empty slice */
+    __Pyx_XDECREF(new_mvs.memview);
+    new_mvs.memview = NULL;
+    new_mvs.data = NULL;
+no_fail: /* cleanup shared by success and failure: release the local temporaries */
+    __Pyx_XDECREF(shape_tuple);
+    __Pyx_XDECREF(temp_int);
+    __Pyx_XDECREF(array_obj);
+    __Pyx_RefNannyFinishContext();
+    return new_mvs;
+}
+
+/* MemviewSliceInit: fill *memviewslice (shape, strides, suboffsets, data, memview) from the Py_buffer held by memview and bump the memview's acquisition count; returns 0 on success, or -1 with ValueError set if the slice is already initialized. */
+  static int
+__Pyx_init_memviewslice(struct __pyx_memoryview_obj *memview,
+                        int ndim,
+                        __Pyx_memviewslice *memviewslice,
+                        int memview_is_new_reference)
+{
+    __Pyx_RefNannyDeclarations
+    int i, retval=-1;
+    Py_buffer *buf = &memview->view;
+    __Pyx_RefNannySetupContext("init_memviewslice", 0);
+    if (unlikely(memviewslice->memview || memviewslice->data)) { /* refuse to overwrite a slice that already holds a memview or a data pointer */
+        PyErr_SetString(PyExc_ValueError,
+            "memviewslice is already initialized!");
+        goto fail;
+    }
+    if (buf->strides) { /* the buffer supplies explicit strides: copy them */
+        for (i = 0; i < ndim; i++) {
+            memviewslice->strides[i] = buf->strides[i];
+        }
+    } else { /* no strides supplied: derive them from itemsize and shape, starting at the last dimension */
+        Py_ssize_t stride = buf->itemsize;
+        for (i = ndim - 1; i >= 0; i--) {
+            memviewslice->strides[i] = stride;
+            stride *= buf->shape[i];
+        }
+    }
+    for (i = 0; i < ndim; i++) {
+        memviewslice->shape[i]   = buf->shape[i];
+        if (buf->suboffsets) {
+            memviewslice->suboffsets[i] = buf->suboffsets[i];
+        } else {
+            memviewslice->suboffsets[i] = -1; /* -1 is stored when the buffer has no suboffsets array */
+        }
+    }
+    memviewslice->memview = memview;
+    memviewslice->data = (char *)buf->buf;
+    if (__pyx_add_acquisition_count(memview) == 0 && !memview_is_new_reference) { /* previous count was 0 and the caller did not pass in a new reference: take one for the slice */
+        Py_INCREF(memview);
+    }
+    retval = 0;
+    goto no_fail;
+fail: /* error path: leave the slice marked as uninitialised */
+    memviewslice->memview = 0;
+    memviewslice->data = 0;
+    retval = -1;
+no_fail:
+    __Pyx_RefNannyFinishContext();
+    return retval;
+}
+#ifndef Py_NO_RETURN /* define Py_NO_RETURN as empty when no definition of it is in scope */
+#define Py_NO_RETURN
+#endif
+static void __pyx_fatalerror(const char *fmt, ...) Py_NO_RETURN { /* formats a printf-style message and hands it to Py_FatalError() */
+    va_list vargs;
+    char msg[200]; /* fixed-size buffer; vsnprintf below is limited to 200 bytes */
+#if PY_VERSION_HEX >= 0x030A0000 || defined(HAVE_STDARG_PROTOTYPES)
+    va_start(vargs, fmt);
+#else
+    va_start(vargs);
+#endif
+    vsnprintf(msg, 200, fmt, vargs);
+    va_end(vargs);
+    Py_FatalError(msg);
+}
+static CYTHON_INLINE int /* increments *acquisition_count while holding lock and returns the value it had before the increment */
+__pyx_add_acquisition_count_locked(__pyx_atomic_int_type *acquisition_count,
+                                   PyThread_type_lock lock)
+{
+    int result;
+    PyThread_acquire_lock(lock, 1); /* second argument 1: wait until the lock is available */
+    result = (*acquisition_count)++; /* post-increment: result holds the old count */
+    PyThread_release_lock(lock);
+    return result;
+}
+static CYTHON_INLINE int /* decrements *acquisition_count while holding lock and returns the value it had before the decrement */
+__pyx_sub_acquisition_count_locked(__pyx_atomic_int_type *acquisition_count,
+                                   PyThread_type_lock lock)
+{
+    int result;
+    PyThread_acquire_lock(lock, 1); /* second argument 1: wait until the lock is available */
+    result = (*acquisition_count)--; /* post-decrement: result holds the old count */
+    PyThread_release_lock(lock);
+    return result;
+}
+static CYTHON_INLINE void /* registers one more acquisition of memslice->memview; takes a Python reference only when the previous acquisition count was 0 */
+__Pyx_INC_MEMVIEW(__Pyx_memviewslice *memslice, int have_gil, int lineno)
+{
+    __pyx_nonatomic_int_type old_acquisition_count;
+    struct __pyx_memoryview_obj *memview = memslice->memview;
+    if (unlikely(!memview || (PyObject *) memview == Py_None)) { /* nothing to do for a NULL or None memview */
+        return;
+    }
+    old_acquisition_count = __pyx_add_acquisition_count(memview);
+    if (unlikely(old_acquisition_count <= 0)) {
+        if (likely(old_acquisition_count == 0)) { /* first acquisition: the slice takes its own reference to the memview object */
+            if (have_gil) {
+                Py_INCREF((PyObject *) memview);
+            } else { /* caller does not hold the GIL: acquire it around the Py_INCREF */
+                PyGILState_STATE _gilstate = PyGILState_Ensure();
+                Py_INCREF((PyObject *) memview);
+                PyGILState_Release(_gilstate);
+            }
+        } else { /* previous count was negative: abort via __pyx_fatalerror, reporting the count and lineno */
+            __pyx_fatalerror("Acquisition count is %d (line %d)",
+                             old_acquisition_count+1, lineno);
+        }
+    }
+}
+static CYTHON_INLINE void __Pyx_XCLEAR_MEMVIEW(__Pyx_memviewslice *memslice, /* drops one acquisition of memslice->memview and clears the slice's memview/data fields; tolerates a NULL or None memview */
+                                             int have_gil, int lineno) {
+    __pyx_nonatomic_int_type old_acquisition_count;
+    struct __pyx_memoryview_obj *memview = memslice->memview;
+    if (unlikely(!memview || (PyObject *) memview == Py_None)) { /* NULL or None: just reset the pointer, no counts are touched */
+        memslice->memview = NULL;
+        return;
+    }
+    old_acquisition_count = __pyx_sub_acquisition_count(memview);
+    memslice->data = NULL;
+    if (likely(old_acquisition_count > 1)) { /* other acquisitions remain: forget the pointer without releasing a Python reference */
+        memslice->memview = NULL;
+    } else if (likely(old_acquisition_count == 1)) { /* this was the last acquisition: release the Python reference via Py_CLEAR */
+        if (have_gil) {
+            Py_CLEAR(memslice->memview);
+        } else { /* caller does not hold the GIL: acquire it around the Py_CLEAR */
+            PyGILState_STATE _gilstate = PyGILState_Ensure();
+            Py_CLEAR(memslice->memview);
+            PyGILState_Release(_gilstate);
+        }
+    } else { /* previous count was below 1: abort via __pyx_fatalerror, reporting the count and lineno */
+        __pyx_fatalerror("Acquisition count is %d (line %d)",
+                         old_acquisition_count-1, lineno);
+    }
+}
+
+/* CIntFromPy: convert the Python object x to a C long. Returns (long) -1 on failure; the raise_overflow / raise_neg_overflow labels at the end set OverflowError. The body is a generic template, so several branches (e.g. the unsigned path) are dead for the long instantiation. */
+  static CYTHON_INLINE long __Pyx_PyInt_As_long(PyObject *x) {
+#ifdef __Pyx_HAS_GCC_DIAGNOSTIC
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wconversion"
+#endif
+    const long neg_one = (long) -1, const_zero = (long) 0;
+#ifdef __Pyx_HAS_GCC_DIAGNOSTIC
+#pragma GCC diagnostic pop
+#endif
+    const int is_unsigned = neg_one > const_zero; /* signedness probe for the target type; false for long since (long) -1 < 0 */
+#if PY_MAJOR_VERSION < 3
+    if (likely(PyInt_Check(x))) { /* Python 2 only: PyInt objects are read directly with PyInt_AS_LONG */
+        if ((sizeof(long) < sizeof(long))) { /* always false for this instantiation */
+            __PYX_VERIFY_RETURN_INT(long, long, PyInt_AS_LONG(x))
+        } else {
+            long val = PyInt_AS_LONG(x);
+            if (is_unsigned && unlikely(val < 0)) {
+                goto raise_neg_overflow;
+            }
+            return (long) val;
+        }
+    }
+#endif
+    if (unlikely(!PyLong_Check(x))) { /* not a PyLong: convert through __Pyx_PyNumber_IntOrLong, then recurse on the result */
+        long val;
+        PyObject *tmp = __Pyx_PyNumber_IntOrLong(x);
+        if (!tmp) return (long) -1;
+        val = __Pyx_PyInt_As_long(tmp);
+        Py_DECREF(tmp);
+        return val;
+    }
+    if (is_unsigned) { /* unsigned-target path: negative inputs are rejected via raise_neg_overflow */
+#if CYTHON_USE_PYLONG_INTERNALS
+        if (unlikely(__Pyx_PyLong_IsNeg(x))) {
+            goto raise_neg_overflow;
+        } else if (__Pyx_PyLong_IsCompact(x)) {
+            __PYX_VERIFY_RETURN_INT(long, __Pyx_compact_upylong, __Pyx_PyLong_CompactValueUnsigned(x))
+        } else {
+            const digit* digits = __Pyx_PyLong_Digits(x);
+            assert(__Pyx_PyLong_DigitCount(x) > 1);
+            switch (__Pyx_PyLong_DigitCount(x)) { /* 2 to 4 digits: assemble the value from PyLong_SHIFT-bit digits when the C type is wide enough; otherwise break to the generic code below */
+                case 2:
+                    if ((8 * sizeof(long) > 1 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 2 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(long, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(long) >= 2 * PyLong_SHIFT)) {
+                            return (long) (((((long)digits[1]) << PyLong_SHIFT) | (long)digits[0]));
+                        }
+                    }
+                    break;
+                case 3:
+                    if ((8 * sizeof(long) > 2 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 3 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(long, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(long) >= 3 * PyLong_SHIFT)) {
+                            return (long) (((((((long)digits[2]) << PyLong_SHIFT) | (long)digits[1]) << PyLong_SHIFT) | (long)digits[0]));
+                        }
+                    }
+                    break;
+                case 4:
+                    if ((8 * sizeof(long) > 3 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 4 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(long, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(long) >= 4 * PyLong_SHIFT)) {
+                            return (long) (((((((((long)digits[3]) << PyLong_SHIFT) | (long)digits[2]) << PyLong_SHIFT) | (long)digits[1]) << PyLong_SHIFT) | (long)digits[0]));
+                        }
+                    }
+                    break;
+            }
+        }
+#endif
+#if CYTHON_COMPILING_IN_CPYTHON && PY_VERSION_HEX < 0x030C00A7
+        if (unlikely(Py_SIZE(x) < 0)) { /* sign test through Py_SIZE on this CPython version range */
+            goto raise_neg_overflow;
+        }
+#else
+        { /* sign test through a rich comparison: x < False */
+            int result = PyObject_RichCompareBool(x, Py_False, Py_LT);
+            if (unlikely(result < 0))
+                return (long) -1;
+            if (unlikely(result == 1))
+                goto raise_neg_overflow;
+        }
+#endif
+        if ((sizeof(long) <= sizeof(unsigned long))) {
+            __PYX_VERIFY_RETURN_INT_EXC(long, unsigned long, PyLong_AsUnsignedLong(x))
+#ifdef HAVE_LONG_LONG
+        } else if ((sizeof(long) <= sizeof(unsigned PY_LONG_LONG))) {
+            __PYX_VERIFY_RETURN_INT_EXC(long, unsigned PY_LONG_LONG, PyLong_AsUnsignedLongLong(x))
+#endif
+        }
+    } else { /* signed-target path: the one taken for long */
+#if CYTHON_USE_PYLONG_INTERNALS
+        if (__Pyx_PyLong_IsCompact(x)) {
+            __PYX_VERIFY_RETURN_INT(long, __Pyx_compact_pylong, __Pyx_PyLong_CompactValue(x))
+        } else {
+            const digit* digits = __Pyx_PyLong_Digits(x);
+            assert(__Pyx_PyLong_DigitCount(x) > 1);
+            switch (__Pyx_PyLong_SignedDigitCount(x)) { /* negative case labels handle negative values with that many digits */
+                case -2:
+                    if ((8 * sizeof(long) - 1 > 1 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 2 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(long, long, -(long) (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(long) - 1 > 2 * PyLong_SHIFT)) {
+                            return (long) (((long)-1)*(((((long)digits[1]) << PyLong_SHIFT) | (long)digits[0])));
+                        }
+                    }
+                    break;
+                case 2:
+                    if ((8 * sizeof(long) > 1 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 2 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(long, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(long) - 1 > 2 * PyLong_SHIFT)) {
+                            return (long) ((((((long)digits[1]) << PyLong_SHIFT) | (long)digits[0])));
+                        }
+                    }
+                    break;
+                case -3:
+                    if ((8 * sizeof(long) - 1 > 2 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 3 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(long, long, -(long) (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(long) - 1 > 3 * PyLong_SHIFT)) {
+                            return (long) (((long)-1)*(((((((long)digits[2]) << PyLong_SHIFT) | (long)digits[1]) << PyLong_SHIFT) | (long)digits[0])));
+                        }
+                    }
+                    break;
+                case 3:
+                    if ((8 * sizeof(long) > 2 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 3 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(long, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(long) - 1 > 3 * PyLong_SHIFT)) {
+                            return (long) ((((((((long)digits[2]) << PyLong_SHIFT) | (long)digits[1]) << PyLong_SHIFT) | (long)digits[0])));
+                        }
+                    }
+                    break;
+                case -4:
+                    if ((8 * sizeof(long) - 1 > 3 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 4 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(long, long, -(long) (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(long) - 1 > 4 * PyLong_SHIFT)) {
+                            return (long) (((long)-1)*(((((((((long)digits[3]) << PyLong_SHIFT) | (long)digits[2]) << PyLong_SHIFT) | (long)digits[1]) << PyLong_SHIFT) | (long)digits[0])));
+                        }
+                    }
+                    break;
+                case 4:
+                    if ((8 * sizeof(long) > 3 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 4 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(long, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(long) - 1 > 4 * PyLong_SHIFT)) {
+                            return (long) ((((((((((long)digits[3]) << PyLong_SHIFT) | (long)digits[2]) << PyLong_SHIFT) | (long)digits[1]) << PyLong_SHIFT) | (long)digits[0])));
+                        }
+                    }
+                    break;
+            }
+        }
+#endif
+        if ((sizeof(long) <= sizeof(long))) { /* always true here, so PyLong_AsLong is the generic fallback for long; presumably the macro returns, making the block below unreachable for this type - confirm against the macro definition */
+            __PYX_VERIFY_RETURN_INT_EXC(long, long, PyLong_AsLong(x))
+#ifdef HAVE_LONG_LONG
+        } else if ((sizeof(long) <= sizeof(PY_LONG_LONG))) {
+            __PYX_VERIFY_RETURN_INT_EXC(long, PY_LONG_LONG, PyLong_AsLongLong(x))
+#endif
+        }
+    }
+    { /* last-resort conversion; one of three strategies is compiled depending on the Python version and target runtime */
+        long val;
+        int ret = -1;
+#if PY_VERSION_HEX >= 0x030d00A6 && !CYTHON_COMPILING_IN_LIMITED_API
+        Py_ssize_t bytes_copied = PyLong_AsNativeBytes(
+            x, &val, sizeof(val), Py_ASNATIVEBYTES_NATIVE_ENDIAN | (is_unsigned ? Py_ASNATIVEBYTES_UNSIGNED_BUFFER | Py_ASNATIVEBYTES_REJECT_NEGATIVE : 0));
+        if (unlikely(bytes_copied == -1)) { /* -1 from PyLong_AsNativeBytes: ret stays -1 and (long) -1 is returned below */
+        } else if (unlikely(bytes_copied > (Py_ssize_t) sizeof(val))) { /* value needs more bytes than val provides */
+            goto raise_overflow;
+        } else {
+            ret = 0;
+        }
+#elif PY_VERSION_HEX < 0x030d0000 && !(CYTHON_COMPILING_IN_PYPY || CYTHON_COMPILING_IN_LIMITED_API) || defined(_PyLong_AsByteArray)
+        int one = 1; int is_little = (int)*(unsigned char *)&one; /* runtime endianness probe: first byte of the int 1 */
+        unsigned char *bytes = (unsigned char *)&val;
+        ret = _PyLong_AsByteArray((PyLongObject *)x,
+                                    bytes, sizeof(val),
+                                    is_little, !is_unsigned);
+#else /* portable strategy: peel the value off in chunk_size-bit pieces using Python-level & and >> */
+        PyObject *v;
+        PyObject *stepval = NULL, *mask = NULL, *shift = NULL;
+        int bits, remaining_bits, is_negative = 0;
+        int chunk_size = (sizeof(long) < 8) ? 30 : 62;
+        if (likely(PyLong_CheckExact(x))) {
+            v = __Pyx_NewRef(x);
+        } else {
+            v = PyNumber_Long(x);
+            if (unlikely(!v)) return (long) -1;
+            assert(PyLong_CheckExact(v));
+        }
+        {
+            int result = PyObject_RichCompareBool(v, Py_False, Py_LT);
+            if (unlikely(result < 0)) {
+                Py_DECREF(v);
+                return (long) -1;
+            }
+            is_negative = result == 1;
+        }
+        if (is_unsigned && unlikely(is_negative)) {
+            Py_DECREF(v);
+            goto raise_neg_overflow;
+        } else if (is_negative) { /* negative input: work on ~v (non-negative) and invert val again at the end */
+            stepval = PyNumber_Invert(v);
+            Py_DECREF(v);
+            if (unlikely(!stepval))
+                return (long) -1;
+        } else {
+            stepval = v; /* ownership of v moves to stepval */
+        }
+        v = NULL;
+        val = (long) 0;
+        mask = PyLong_FromLong((1L << chunk_size) - 1); if (unlikely(!mask)) goto done;
+        shift = PyLong_FromLong(chunk_size); if (unlikely(!shift)) goto done;
+        for (bits = 0; bits < (int) sizeof(long) * 8 - chunk_size; bits += chunk_size) { /* each pass ORs the low chunk_size bits into val, then shifts stepval right */
+            PyObject *tmp, *digit;
+            long idigit;
+            digit = PyNumber_And(stepval, mask);
+            if (unlikely(!digit)) goto done;
+            idigit = PyLong_AsLong(digit);
+            Py_DECREF(digit);
+            if (unlikely(idigit < 0)) goto done;
+            val |= ((long) idigit) << bits;
+            tmp = PyNumber_Rshift(stepval, shift);
+            if (unlikely(!tmp)) goto done;
+            Py_DECREF(stepval); stepval = tmp;
+        }
+        Py_DECREF(shift); shift = NULL;
+        Py_DECREF(mask); mask = NULL;
+        { /* whatever is left in stepval must fit in the remaining bits (one bit fewer for signed targets) */
+            long idigit = PyLong_AsLong(stepval);
+            if (unlikely(idigit < 0)) goto done;
+            remaining_bits = ((int) sizeof(long) * 8) - bits - (is_unsigned ? 0 : 1);
+            if (unlikely(idigit >= (1L << remaining_bits)))
+                goto raise_overflow;
+            val |= ((long) idigit) << bits;
+        }
+        if (!is_unsigned) {
+            if (unlikely(val & (((long) 1) << (sizeof(long) * 8 - 1)))) /* top bit set: treated as overflow for a signed target */
+                goto raise_overflow;
+            if (is_negative)
+                val = ~val; /* undo the PyNumber_Invert applied above */
+        }
+        ret = 0;
+    done: /* releases the temporaries of the portable strategy; ret is still -1 if a step failed */
+        Py_XDECREF(shift);
+        Py_XDECREF(mask);
+        Py_XDECREF(stepval);
+#endif
+        if (unlikely(ret))
+            return (long) -1;
+        return val;
+    }
+raise_overflow: /* value does not fit in the target type */
+    PyErr_SetString(PyExc_OverflowError,
+        "value too large to convert to long");
+    return (long) -1;
+raise_neg_overflow: /* negative value for an unsigned target */
+    PyErr_SetString(PyExc_OverflowError,
+        "can't convert negative value to long");
+    return (long) -1;
+}
+
+/* CIntFromPy */
+  static CYTHON_INLINE int __Pyx_PyInt_As_int(PyObject *x) {
+#ifdef __Pyx_HAS_GCC_DIAGNOSTIC
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wconversion"
+#endif
+    const int neg_one = (int) -1, const_zero = (int) 0;
+#ifdef __Pyx_HAS_GCC_DIAGNOSTIC
+#pragma GCC diagnostic pop
+#endif
+    const int is_unsigned = neg_one > const_zero;
+#if PY_MAJOR_VERSION < 3
+    if (likely(PyInt_Check(x))) {
+        if ((sizeof(int) < sizeof(long))) {
+            __PYX_VERIFY_RETURN_INT(int, long, PyInt_AS_LONG(x))
+        } else {
+            long val = PyInt_AS_LONG(x);
+            if (is_unsigned && unlikely(val < 0)) {
+                goto raise_neg_overflow;
+            }
+            return (int) val;
+        }
+    }
+#endif
+    if (unlikely(!PyLong_Check(x))) {
+        int val;
+        PyObject *tmp = __Pyx_PyNumber_IntOrLong(x);
+        if (!tmp) return (int) -1;
+        val = __Pyx_PyInt_As_int(tmp);
+        Py_DECREF(tmp);
+        return val;
+    }
+    if (is_unsigned) {
+#if CYTHON_USE_PYLONG_INTERNALS
+        if (unlikely(__Pyx_PyLong_IsNeg(x))) {
+            goto raise_neg_overflow;
+        } else if (__Pyx_PyLong_IsCompact(x)) {
+            __PYX_VERIFY_RETURN_INT(int, __Pyx_compact_upylong, __Pyx_PyLong_CompactValueUnsigned(x))
+        } else {
+            const digit* digits = __Pyx_PyLong_Digits(x);
+            assert(__Pyx_PyLong_DigitCount(x) > 1);
+            switch (__Pyx_PyLong_DigitCount(x)) {
+                case 2:
+                    if ((8 * sizeof(int) > 1 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 2 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(int) >= 2 * PyLong_SHIFT)) {
+                            return (int) (((((int)digits[1]) << PyLong_SHIFT) | (int)digits[0]));
+                        }
+                    }
+                    break;
+                case 3:
+                    if ((8 * sizeof(int) > 2 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 3 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(int) >= 3 * PyLong_SHIFT)) {
+                            return (int) (((((((int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0]));
+                        }
+                    }
+                    break;
+                case 4:
+                    if ((8 * sizeof(int) > 3 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 4 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(int) >= 4 * PyLong_SHIFT)) {
+                            return (int) (((((((((int)digits[3]) << PyLong_SHIFT) | (int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0]));
+                        }
+                    }
+                    break;
+            }
+        }
+#endif
+#if CYTHON_COMPILING_IN_CPYTHON && PY_VERSION_HEX < 0x030C00A7
+        if (unlikely(Py_SIZE(x) < 0)) {
+            goto raise_neg_overflow;
+        }
+#else
+        {
+            int result = PyObject_RichCompareBool(x, Py_False, Py_LT);
+            if (unlikely(result < 0))
+                return (int) -1;
+            if (unlikely(result == 1))
+                goto raise_neg_overflow;
+        }
+#endif
+        if ((sizeof(int) <= sizeof(unsigned long))) {
+            __PYX_VERIFY_RETURN_INT_EXC(int, unsigned long, PyLong_AsUnsignedLong(x))
+#ifdef HAVE_LONG_LONG
+        } else if ((sizeof(int) <= sizeof(unsigned PY_LONG_LONG))) {
+            __PYX_VERIFY_RETURN_INT_EXC(int, unsigned PY_LONG_LONG, PyLong_AsUnsignedLongLong(x))
+#endif
+        }
+    } else {
+#if CYTHON_USE_PYLONG_INTERNALS
+        if (__Pyx_PyLong_IsCompact(x)) {
+            __PYX_VERIFY_RETURN_INT(int, __Pyx_compact_pylong, __Pyx_PyLong_CompactValue(x))
+        } else {
+            const digit* digits = __Pyx_PyLong_Digits(x);
+            assert(__Pyx_PyLong_DigitCount(x) > 1);
+            switch (__Pyx_PyLong_SignedDigitCount(x)) {
+                case -2:
+                    if ((8 * sizeof(int) - 1 > 1 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 2 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(int, long, -(long) (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(int) - 1 > 2 * PyLong_SHIFT)) {
+                            return (int) (((int)-1)*(((((int)digits[1]) << PyLong_SHIFT) | (int)digits[0])));
+                        }
+                    }
+                    break;
+                case 2:
+                    if ((8 * sizeof(int) > 1 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 2 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(int) - 1 > 2 * PyLong_SHIFT)) {
+                            return (int) ((((((int)digits[1]) << PyLong_SHIFT) | (int)digits[0])));
+                        }
+                    }
+                    break;
+                case -3:
+                    if ((8 * sizeof(int) - 1 > 2 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 3 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(int, long, -(long) (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(int) - 1 > 3 * PyLong_SHIFT)) {
+                            return (int) (((int)-1)*(((((((int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0])));
+                        }
+                    }
+                    break;
+                case 3:
+                    if ((8 * sizeof(int) > 2 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 3 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(int) - 1 > 3 * PyLong_SHIFT)) {
+                            return (int) ((((((((int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0])));
+                        }
+                    }
+                    break;
+                case -4:
+                    if ((8 * sizeof(int) - 1 > 3 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 4 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(int, long, -(long) (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(int) - 1 > 4 * PyLong_SHIFT)) {
+                            return (int) (((int)-1)*(((((((((int)digits[3]) << PyLong_SHIFT) | (int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0])));
+                        }
+                    }
+                    break;
+                case 4:
+                    if ((8 * sizeof(int) > 3 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 4 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(int) - 1 > 4 * PyLong_SHIFT)) {
+                            return (int) ((((((((((int)digits[3]) << PyLong_SHIFT) | (int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0])));
+                        }
+                    }
+                    break;
+            }
+        }
+#endif
+        if ((sizeof(int) <= sizeof(long))) {
+            __PYX_VERIFY_RETURN_INT_EXC(int, long, PyLong_AsLong(x))
+#ifdef HAVE_LONG_LONG
+        } else if ((sizeof(int) <= sizeof(PY_LONG_LONG))) {
+            __PYX_VERIFY_RETURN_INT_EXC(int, PY_LONG_LONG, PyLong_AsLongLong(x))
+#endif
+        }
+    }
+    {
+        int val;
+        int ret = -1;
+#if PY_VERSION_HEX >= 0x030d00A6 && !CYTHON_COMPILING_IN_LIMITED_API
+        Py_ssize_t bytes_copied = PyLong_AsNativeBytes(
+            x, &val, sizeof(val), Py_ASNATIVEBYTES_NATIVE_ENDIAN | (is_unsigned ? Py_ASNATIVEBYTES_UNSIGNED_BUFFER | Py_ASNATIVEBYTES_REJECT_NEGATIVE : 0));
+        if (unlikely(bytes_copied == -1)) {
+        } else if (unlikely(bytes_copied > (Py_ssize_t) sizeof(val))) {
+            goto raise_overflow;
+        } else {
+            ret = 0;
+        }
+#elif PY_VERSION_HEX < 0x030d0000 && !(CYTHON_COMPILING_IN_PYPY || CYTHON_COMPILING_IN_LIMITED_API) || defined(_PyLong_AsByteArray)
+        int one = 1; int is_little = (int)*(unsigned char *)&one;
+        unsigned char *bytes = (unsigned char *)&val;
+        ret = _PyLong_AsByteArray((PyLongObject *)x,
+                                    bytes, sizeof(val),
+                                    is_little, !is_unsigned);
+#else
+        PyObject *v;
+        PyObject *stepval = NULL, *mask = NULL, *shift = NULL;
+        int bits, remaining_bits, is_negative = 0;
+        int chunk_size = (sizeof(long) < 8) ? 30 : 62;
+        if (likely(PyLong_CheckExact(x))) {
+            v = __Pyx_NewRef(x);
+        } else {
+            v = PyNumber_Long(x);
+            if (unlikely(!v)) return (int) -1;
+            assert(PyLong_CheckExact(v));
+        }
+        {
+            int result = PyObject_RichCompareBool(v, Py_False, Py_LT);
+            if (unlikely(result < 0)) {
+                Py_DECREF(v);
+                return (int) -1;
+            }
+            is_negative = result == 1;
+        }
+        if (is_unsigned && unlikely(is_negative)) {
+            Py_DECREF(v);
+            goto raise_neg_overflow;
+        } else if (is_negative) {
+            stepval = PyNumber_Invert(v);
+            Py_DECREF(v);
+            if (unlikely(!stepval))
+                return (int) -1;
+        } else {
+            stepval = v;
+        }
+        v = NULL;
+        val = (int) 0;
+        mask = PyLong_FromLong((1L << chunk_size) - 1); if (unlikely(!mask)) goto done;
+        shift = PyLong_FromLong(chunk_size); if (unlikely(!shift)) goto done;
+        for (bits = 0; bits < (int) sizeof(int) * 8 - chunk_size; bits += chunk_size) {
+            PyObject *tmp, *digit;
+            long idigit;
+            digit = PyNumber_And(stepval, mask);
+            if (unlikely(!digit)) goto done;
+            idigit = PyLong_AsLong(digit);
+            Py_DECREF(digit);
+            if (unlikely(idigit < 0)) goto done;
+            val |= ((int) idigit) << bits;
+            tmp = PyNumber_Rshift(stepval, shift);
+            if (unlikely(!tmp)) goto done;
+            Py_DECREF(stepval); stepval = tmp;
+        }
+        Py_DECREF(shift); shift = NULL;
+        Py_DECREF(mask); mask = NULL;
+        {
+            long idigit = PyLong_AsLong(stepval);
+            if (unlikely(idigit < 0)) goto done;
+            remaining_bits = ((int) sizeof(int) * 8) - bits - (is_unsigned ? 0 : 1);
+            if (unlikely(idigit >= (1L << remaining_bits)))
+                goto raise_overflow;
+            val |= ((int) idigit) << bits;
+        }
+        if (!is_unsigned) {
+            if (unlikely(val & (((int) 1) << (sizeof(int) * 8 - 1))))
+                goto raise_overflow;
+            if (is_negative)
+                val = ~val;
+        }
+        ret = 0;
+    done:
+        Py_XDECREF(shift);
+        Py_XDECREF(mask);
+        Py_XDECREF(stepval);
+#endif
+        if (unlikely(ret))
+            return (int) -1;
+        return val;
+    }
+raise_overflow:
+    PyErr_SetString(PyExc_OverflowError,
+        "value too large to convert to int");
+    return (int) -1;
+raise_neg_overflow:
+    PyErr_SetString(PyExc_OverflowError,
+        "can't convert negative value to int");
+    return (int) -1;
+}
+
+/* CIntToPy: create a Python int from a C npy_int64; yields a new reference, or NULL when the underlying C-API call fails. */
+  static CYTHON_INLINE PyObject* __Pyx_PyInt_From_npy_int64(npy_int64 value) {
+#ifdef __Pyx_HAS_GCC_DIAGNOSTIC
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wconversion"
+#endif
+    const npy_int64 neg_one = (npy_int64) -1, const_zero = (npy_int64) 0;
+#ifdef __Pyx_HAS_GCC_DIAGNOSTIC
+#pragma GCC diagnostic pop
+#endif
+    const int is_unsigned = neg_one > const_zero; /* (T)-1 > 0 is true only when T is an unsigned type */
+    if (is_unsigned) { /* unsigned T: use the first constructor whose C type is wide enough */
+        if (sizeof(npy_int64) < sizeof(long)) { /* strictly narrower than long, so every unsigned value fits a signed long */
+            return PyInt_FromLong((long) value);
+        } else if (sizeof(npy_int64) <= sizeof(unsigned long)) {
+            return PyLong_FromUnsignedLong((unsigned long) value);
+#ifdef HAVE_LONG_LONG
+        } else if (sizeof(npy_int64) <= sizeof(unsigned PY_LONG_LONG)) {
+            return PyLong_FromUnsignedLongLong((unsigned PY_LONG_LONG) value);
+#endif
+        }
+    } else { /* signed T: same selection using the signed constructors */
+        if (sizeof(npy_int64) <= sizeof(long)) {
+            return PyInt_FromLong((long) value);
+#ifdef HAVE_LONG_LONG
+        } else if (sizeof(npy_int64) <= sizeof(PY_LONG_LONG)) {
+            return PyLong_FromLongLong((PY_LONG_LONG) value);
+#endif
+        }
+    }
+    { /* reached only when none of the size-based branches above returned */
+        unsigned char *bytes = (unsigned char *)&value; /* raw in-memory bytes of value */
+#if !CYTHON_COMPILING_IN_LIMITED_API && PY_VERSION_HEX >= 0x030d00A4
+        if (is_unsigned) {
+            return PyLong_FromUnsignedNativeBytes(bytes, sizeof(value), -1);
+        } else {
+            return PyLong_FromNativeBytes(bytes, sizeof(value), -1);
+        }
+#elif !CYTHON_COMPILING_IN_LIMITED_API && PY_VERSION_HEX < 0x030d0000
+        int one = 1; int little = (int)*(unsigned char *)&one; /* endianness probe: lowest-address byte of 1 is 1 on little-endian */
+        return _PyLong_FromByteArray(bytes, sizeof(npy_int64),
+                                     little, !is_unsigned);
+#else
+        int one = 1; int little = (int)*(unsigned char *)&one; /* endianness probe, as above */
+        PyObject *from_bytes, *result = NULL; /* result stays NULL on any failure below */
+        PyObject *py_bytes = NULL, *arg_tuple = NULL, *kwds = NULL, *order_str = NULL;
+        from_bytes = PyObject_GetAttrString((PyObject*)&PyLong_Type, "from_bytes"); /* look up int.from_bytes */
+        if (!from_bytes) return NULL;
+        py_bytes = PyBytes_FromStringAndSize((char*)bytes, sizeof(npy_int64));
+        if (!py_bytes) goto limited_bad;
+        order_str = PyUnicode_FromString(little ? "little" : "big");
+        if (!order_str) goto limited_bad;
+        arg_tuple = PyTuple_Pack(2, py_bytes, order_str);
+        if (!arg_tuple) goto limited_bad;
+        if (!is_unsigned) { /* kwds stays NULL for unsigned T, i.e. no keyword arguments are passed */
+            kwds = PyDict_New();
+            if (!kwds) goto limited_bad;
+            if (PyDict_SetItemString(kwds, "signed", Py_True)) goto limited_bad; /* the dict takes its own reference; passing a fresh one here would leak it */
+        }
+        result = PyObject_Call(from_bytes, arg_tuple, kwds);
+        limited_bad: /* shared cleanup for both the success and the failure path */
+        Py_XDECREF(kwds);
+        Py_XDECREF(arg_tuple);
+        Py_XDECREF(order_str);
+        Py_XDECREF(py_bytes);
+        Py_XDECREF(from_bytes);
+        return result;
+#endif
+    }
+}
+
+/* CIntFromPy */
+  static CYTHON_INLINE npy_int64 __Pyx_PyInt_As_npy_int64(PyObject *x) {
+#ifdef __Pyx_HAS_GCC_DIAGNOSTIC
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wconversion"
+#endif
+    const npy_int64 neg_one = (npy_int64) -1, const_zero = (npy_int64) 0;
+#ifdef __Pyx_HAS_GCC_DIAGNOSTIC
+#pragma GCC diagnostic pop
+#endif
+    const int is_unsigned = neg_one > const_zero;
+#if PY_MAJOR_VERSION < 3
+    if (likely(PyInt_Check(x))) {
+        if ((sizeof(npy_int64) < sizeof(long))) {
+            __PYX_VERIFY_RETURN_INT(npy_int64, long, PyInt_AS_LONG(x))
+        } else {
+            long val = PyInt_AS_LONG(x);
+            if (is_unsigned && unlikely(val < 0)) {
+                goto raise_neg_overflow;
+            }
+            return (npy_int64) val;
+        }
+    }
+#endif
+    if (unlikely(!PyLong_Check(x))) {
+        npy_int64 val;
+        PyObject *tmp = __Pyx_PyNumber_IntOrLong(x);
+        if (!tmp) return (npy_int64) -1;
+        val = __Pyx_PyInt_As_npy_int64(tmp);
+        Py_DECREF(tmp);
+        return val;
+    }
+    if (is_unsigned) {
+#if CYTHON_USE_PYLONG_INTERNALS
+        if (unlikely(__Pyx_PyLong_IsNeg(x))) {
+            goto raise_neg_overflow;
+        } else if (__Pyx_PyLong_IsCompact(x)) {
+            __PYX_VERIFY_RETURN_INT(npy_int64, __Pyx_compact_upylong, __Pyx_PyLong_CompactValueUnsigned(x))
+        } else {
+            const digit* digits = __Pyx_PyLong_Digits(x);
+            assert(__Pyx_PyLong_DigitCount(x) > 1);
+            switch (__Pyx_PyLong_DigitCount(x)) {
+                case 2:
+                    if ((8 * sizeof(npy_int64) > 1 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 2 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(npy_int64, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(npy_int64) >= 2 * PyLong_SHIFT)) {
+                            return (npy_int64) (((((npy_int64)digits[1]) << PyLong_SHIFT) | (npy_int64)digits[0]));
+                        }
+                    }
+                    break;
+                case 3:
+                    if ((8 * sizeof(npy_int64) > 2 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 3 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(npy_int64, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(npy_int64) >= 3 * PyLong_SHIFT)) {
+                            return (npy_int64) (((((((npy_int64)digits[2]) << PyLong_SHIFT) | (npy_int64)digits[1]) << PyLong_SHIFT) | (npy_int64)digits[0]));
+                        }
+                    }
+                    break;
+                case 4:
+                    if ((8 * sizeof(npy_int64) > 3 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 4 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(npy_int64, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(npy_int64) >= 4 * PyLong_SHIFT)) {
+                            return (npy_int64) (((((((((npy_int64)digits[3]) << PyLong_SHIFT) | (npy_int64)digits[2]) << PyLong_SHIFT) | (npy_int64)digits[1]) << PyLong_SHIFT) | (npy_int64)digits[0]));
+                        }
+                    }
+                    break;
+            }
+        }
+#endif
+#if CYTHON_COMPILING_IN_CPYTHON && PY_VERSION_HEX < 0x030C00A7
+        if (unlikely(Py_SIZE(x) < 0)) {
+            goto raise_neg_overflow;
+        }
+#else
+        {
+            int result = PyObject_RichCompareBool(x, Py_False, Py_LT);
+            if (unlikely(result < 0))
+                return (npy_int64) -1;
+            if (unlikely(result == 1))
+                goto raise_neg_overflow;
+        }
+#endif
+        if ((sizeof(npy_int64) <= sizeof(unsigned long))) {
+            __PYX_VERIFY_RETURN_INT_EXC(npy_int64, unsigned long, PyLong_AsUnsignedLong(x))
+#ifdef HAVE_LONG_LONG
+        } else if ((sizeof(npy_int64) <= sizeof(unsigned PY_LONG_LONG))) {
+            __PYX_VERIFY_RETURN_INT_EXC(npy_int64, unsigned PY_LONG_LONG, PyLong_AsUnsignedLongLong(x))
+#endif
+        }
+    } else {
+#if CYTHON_USE_PYLONG_INTERNALS
+        if (__Pyx_PyLong_IsCompact(x)) {
+            __PYX_VERIFY_RETURN_INT(npy_int64, __Pyx_compact_pylong, __Pyx_PyLong_CompactValue(x))
+        } else {
+            const digit* digits = __Pyx_PyLong_Digits(x);
+            assert(__Pyx_PyLong_DigitCount(x) > 1);
+            switch (__Pyx_PyLong_SignedDigitCount(x)) {
+                case -2:
+                    if ((8 * sizeof(npy_int64) - 1 > 1 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 2 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(npy_int64, long, -(long) (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(npy_int64) - 1 > 2 * PyLong_SHIFT)) {
+                            return (npy_int64) (((npy_int64)-1)*(((((npy_int64)digits[1]) << PyLong_SHIFT) | (npy_int64)digits[0])));
+                        }
+                    }
+                    break;
+                case 2:
+                    if ((8 * sizeof(npy_int64) > 1 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 2 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(npy_int64, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(npy_int64) - 1 > 2 * PyLong_SHIFT)) {
+                            return (npy_int64) ((((((npy_int64)digits[1]) << PyLong_SHIFT) | (npy_int64)digits[0])));
+                        }
+                    }
+                    break;
+                case -3:
+                    if ((8 * sizeof(npy_int64) - 1 > 2 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 3 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(npy_int64, long, -(long) (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(npy_int64) - 1 > 3 * PyLong_SHIFT)) {
+                            return (npy_int64) (((npy_int64)-1)*(((((((npy_int64)digits[2]) << PyLong_SHIFT) | (npy_int64)digits[1]) << PyLong_SHIFT) | (npy_int64)digits[0])));
+                        }
+                    }
+                    break;
+                case 3:
+                    if ((8 * sizeof(npy_int64) > 2 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 3 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(npy_int64, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(npy_int64) - 1 > 3 * PyLong_SHIFT)) {
+                            return (npy_int64) ((((((((npy_int64)digits[2]) << PyLong_SHIFT) | (npy_int64)digits[1]) << PyLong_SHIFT) | (npy_int64)digits[0])));
+                        }
+                    }
+                    break;
+                case -4:
+                    if ((8 * sizeof(npy_int64) - 1 > 3 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 4 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(npy_int64, long, -(long) (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(npy_int64) - 1 > 4 * PyLong_SHIFT)) {
+                            return (npy_int64) (((npy_int64)-1)*(((((((((npy_int64)digits[3]) << PyLong_SHIFT) | (npy_int64)digits[2]) << PyLong_SHIFT) | (npy_int64)digits[1]) << PyLong_SHIFT) | (npy_int64)digits[0])));
+                        }
+                    }
+                    break;
+                case 4:
+                    if ((8 * sizeof(npy_int64) > 3 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 4 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(npy_int64, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(npy_int64) - 1 > 4 * PyLong_SHIFT)) {
+                            return (npy_int64) ((((((((((npy_int64)digits[3]) << PyLong_SHIFT) | (npy_int64)digits[2]) << PyLong_SHIFT) | (npy_int64)digits[1]) << PyLong_SHIFT) | (npy_int64)digits[0])));
+                        }
+                    }
+                    break;
+            }
+        }
+#endif
+        if ((sizeof(npy_int64) <= sizeof(long))) {
+            __PYX_VERIFY_RETURN_INT_EXC(npy_int64, long, PyLong_AsLong(x))
+#ifdef HAVE_LONG_LONG
+        } else if ((sizeof(npy_int64) <= sizeof(PY_LONG_LONG))) {
+            __PYX_VERIFY_RETURN_INT_EXC(npy_int64, PY_LONG_LONG, PyLong_AsLongLong(x))
+#endif
+        }
+    }
+    {
+        npy_int64 val;
+        int ret = -1;
+#if PY_VERSION_HEX >= 0x030d00A6 && !CYTHON_COMPILING_IN_LIMITED_API
+        Py_ssize_t bytes_copied = PyLong_AsNativeBytes(
+            x, &val, sizeof(val), Py_ASNATIVEBYTES_NATIVE_ENDIAN | (is_unsigned ? Py_ASNATIVEBYTES_UNSIGNED_BUFFER | Py_ASNATIVEBYTES_REJECT_NEGATIVE : 0));
+        if (unlikely(bytes_copied == -1)) {
+        } else if (unlikely(bytes_copied > (Py_ssize_t) sizeof(val))) {
+            goto raise_overflow;
+        } else {
+            ret = 0;
+        }
+#elif PY_VERSION_HEX < 0x030d0000 && !(CYTHON_COMPILING_IN_PYPY || CYTHON_COMPILING_IN_LIMITED_API) || defined(_PyLong_AsByteArray)
+        int one = 1; int is_little = (int)*(unsigned char *)&one;
+        unsigned char *bytes = (unsigned char *)&val;
+        ret = _PyLong_AsByteArray((PyLongObject *)x,
+                                    bytes, sizeof(val),
+                                    is_little, !is_unsigned);
+#else
+        PyObject *v;
+        PyObject *stepval = NULL, *mask = NULL, *shift = NULL;
+        int bits, remaining_bits, is_negative = 0;
+        int chunk_size = (sizeof(long) < 8) ? 30 : 62;
+        if (likely(PyLong_CheckExact(x))) {
+            v = __Pyx_NewRef(x);
+        } else {
+            v = PyNumber_Long(x);
+            if (unlikely(!v)) return (npy_int64) -1;
+            assert(PyLong_CheckExact(v));
+        }
+        {
+            int result = PyObject_RichCompareBool(v, Py_False, Py_LT);
+            if (unlikely(result < 0)) {
+                Py_DECREF(v);
+                return (npy_int64) -1;
+            }
+            is_negative = result == 1;
+        }
+        if (is_unsigned && unlikely(is_negative)) {
+            Py_DECREF(v);
+            goto raise_neg_overflow;
+        } else if (is_negative) {
+            stepval = PyNumber_Invert(v);
+            Py_DECREF(v);
+            if (unlikely(!stepval))
+                return (npy_int64) -1;
+        } else {
+            stepval = v;
+        }
+        v = NULL;
+        val = (npy_int64) 0;
+        mask = PyLong_FromLong((1L << chunk_size) - 1); if (unlikely(!mask)) goto done;
+        shift = PyLong_FromLong(chunk_size); if (unlikely(!shift)) goto done;
+        for (bits = 0; bits < (int) sizeof(npy_int64) * 8 - chunk_size; bits += chunk_size) {
+            PyObject *tmp, *digit;
+            long idigit;
+            digit = PyNumber_And(stepval, mask);
+            if (unlikely(!digit)) goto done;
+            idigit = PyLong_AsLong(digit);
+            Py_DECREF(digit);
+            if (unlikely(idigit < 0)) goto done;
+            val |= ((npy_int64) idigit) << bits;
+            tmp = PyNumber_Rshift(stepval, shift);
+            if (unlikely(!tmp)) goto done;
+            Py_DECREF(stepval); stepval = tmp;
+        }
+        Py_DECREF(shift); shift = NULL;
+        Py_DECREF(mask); mask = NULL;
+        {
+            long idigit = PyLong_AsLong(stepval);
+            if (unlikely(idigit < 0)) goto done;
+            remaining_bits = ((int) sizeof(npy_int64) * 8) - bits - (is_unsigned ? 0 : 1);
+            if (unlikely(idigit >= (1L << remaining_bits)))
+                goto raise_overflow;
+            val |= ((npy_int64) idigit) << bits;
+        }
+        if (!is_unsigned) {
+            if (unlikely(val & (((npy_int64) 1) << (sizeof(npy_int64) * 8 - 1))))
+                goto raise_overflow;
+            if (is_negative)
+                val = ~val;
+        }
+        ret = 0;
+    done:
+        Py_XDECREF(shift);
+        Py_XDECREF(mask);
+        Py_XDECREF(stepval);
+#endif
+        if (unlikely(ret))
+            return (npy_int64) -1;
+        return val;
+    }
+raise_overflow:
+    PyErr_SetString(PyExc_OverflowError,
+        "value too large to convert to npy_int64");
+    return (npy_int64) -1;
+raise_neg_overflow:
+    PyErr_SetString(PyExc_OverflowError,
+        "can't convert negative value to npy_int64");
+    return (npy_int64) -1;
+}
+
+/* CIntToPy: create a Python int from a C long; yields a new reference, or NULL when the underlying C-API call fails. */
+  static CYTHON_INLINE PyObject* __Pyx_PyInt_From_long(long value) {
+#ifdef __Pyx_HAS_GCC_DIAGNOSTIC
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wconversion"
+#endif
+    const long neg_one = (long) -1, const_zero = (long) 0;
+#ifdef __Pyx_HAS_GCC_DIAGNOSTIC
+#pragma GCC diagnostic pop
+#endif
+    const int is_unsigned = neg_one > const_zero; /* (T)-1 > 0 is true only when T is an unsigned type; false for long */
+    if (is_unsigned) { /* unsigned T: use the first constructor whose C type is wide enough */
+        if (sizeof(long) < sizeof(long)) { /* never true for T = long; presumably an artifact of a type-generic template */
+            return PyInt_FromLong((long) value);
+        } else if (sizeof(long) <= sizeof(unsigned long)) {
+            return PyLong_FromUnsignedLong((unsigned long) value);
+#ifdef HAVE_LONG_LONG
+        } else if (sizeof(long) <= sizeof(unsigned PY_LONG_LONG)) {
+            return PyLong_FromUnsignedLongLong((unsigned PY_LONG_LONG) value);
+#endif
+        }
+    } else { /* signed T: same selection using the signed constructors */
+        if (sizeof(long) <= sizeof(long)) { /* always true for T = long, so this branch returns */
+            return PyInt_FromLong((long) value);
+#ifdef HAVE_LONG_LONG
+        } else if (sizeof(long) <= sizeof(PY_LONG_LONG)) {
+            return PyLong_FromLongLong((PY_LONG_LONG) value);
+#endif
+        }
+    }
+    { /* reached only when none of the size-based branches above returned */
+        unsigned char *bytes = (unsigned char *)&value; /* raw in-memory bytes of value */
+#if !CYTHON_COMPILING_IN_LIMITED_API && PY_VERSION_HEX >= 0x030d00A4
+        if (is_unsigned) {
+            return PyLong_FromUnsignedNativeBytes(bytes, sizeof(value), -1);
+        } else {
+            return PyLong_FromNativeBytes(bytes, sizeof(value), -1);
+        }
+#elif !CYTHON_COMPILING_IN_LIMITED_API && PY_VERSION_HEX < 0x030d0000
+        int one = 1; int little = (int)*(unsigned char *)&one; /* endianness probe: lowest-address byte of 1 is 1 on little-endian */
+        return _PyLong_FromByteArray(bytes, sizeof(long),
+                                     little, !is_unsigned);
+#else
+        int one = 1; int little = (int)*(unsigned char *)&one; /* endianness probe, as above */
+        PyObject *from_bytes, *result = NULL; /* result stays NULL on any failure below */
+        PyObject *py_bytes = NULL, *arg_tuple = NULL, *kwds = NULL, *order_str = NULL;
+        from_bytes = PyObject_GetAttrString((PyObject*)&PyLong_Type, "from_bytes"); /* look up int.from_bytes */
+        if (!from_bytes) return NULL;
+        py_bytes = PyBytes_FromStringAndSize((char*)bytes, sizeof(long));
+        if (!py_bytes) goto limited_bad;
+        order_str = PyUnicode_FromString(little ? "little" : "big");
+        if (!order_str) goto limited_bad;
+        arg_tuple = PyTuple_Pack(2, py_bytes, order_str);
+        if (!arg_tuple) goto limited_bad;
+        if (!is_unsigned) { /* kwds stays NULL for unsigned T, i.e. no keyword arguments are passed */
+            kwds = PyDict_New();
+            if (!kwds) goto limited_bad;
+            if (PyDict_SetItemString(kwds, "signed", Py_True)) goto limited_bad; /* the dict takes its own reference; passing a fresh one here would leak it */
+        }
+        result = PyObject_Call(from_bytes, arg_tuple, kwds);
+        limited_bad: /* shared cleanup for both the success and the failure path */
+        Py_XDECREF(kwds);
+        Py_XDECREF(arg_tuple);
+        Py_XDECREF(order_str);
+        Py_XDECREF(py_bytes);
+        Py_XDECREF(from_bytes);
+        return result;
+#endif
+    }
+}
+
+/* CIntToPy: create a Python int from a C int; yields a new reference, or NULL when the underlying C-API call fails. */
+  static CYTHON_INLINE PyObject* __Pyx_PyInt_From_int(int value) {
+#ifdef __Pyx_HAS_GCC_DIAGNOSTIC
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wconversion"
+#endif
+    const int neg_one = (int) -1, const_zero = (int) 0;
+#ifdef __Pyx_HAS_GCC_DIAGNOSTIC
+#pragma GCC diagnostic pop
+#endif
+    const int is_unsigned = neg_one > const_zero; /* (T)-1 > 0 is true only when T is an unsigned type; false for int */
+    if (is_unsigned) { /* unsigned T: use the first constructor whose C type is wide enough */
+        if (sizeof(int) < sizeof(long)) { /* strictly narrower than long, so every unsigned value fits a signed long */
+            return PyInt_FromLong((long) value);
+        } else if (sizeof(int) <= sizeof(unsigned long)) {
+            return PyLong_FromUnsignedLong((unsigned long) value);
+#ifdef HAVE_LONG_LONG
+        } else if (sizeof(int) <= sizeof(unsigned PY_LONG_LONG)) {
+            return PyLong_FromUnsignedLongLong((unsigned PY_LONG_LONG) value);
+#endif
+        }
+    } else { /* signed T: same selection using the signed constructors */
+        if (sizeof(int) <= sizeof(long)) {
+            return PyInt_FromLong((long) value);
+#ifdef HAVE_LONG_LONG
+        } else if (sizeof(int) <= sizeof(PY_LONG_LONG)) {
+            return PyLong_FromLongLong((PY_LONG_LONG) value);
+#endif
+        }
+    }
+    { /* reached only when none of the size-based branches above returned */
+        unsigned char *bytes = (unsigned char *)&value; /* raw in-memory bytes of value */
+#if !CYTHON_COMPILING_IN_LIMITED_API && PY_VERSION_HEX >= 0x030d00A4
+        if (is_unsigned) {
+            return PyLong_FromUnsignedNativeBytes(bytes, sizeof(value), -1);
+        } else {
+            return PyLong_FromNativeBytes(bytes, sizeof(value), -1);
+        }
+#elif !CYTHON_COMPILING_IN_LIMITED_API && PY_VERSION_HEX < 0x030d0000
+        int one = 1; int little = (int)*(unsigned char *)&one; /* endianness probe: lowest-address byte of 1 is 1 on little-endian */
+        return _PyLong_FromByteArray(bytes, sizeof(int),
+                                     little, !is_unsigned);
+#else
+        int one = 1; int little = (int)*(unsigned char *)&one; /* endianness probe, as above */
+        PyObject *from_bytes, *result = NULL; /* result stays NULL on any failure below */
+        PyObject *py_bytes = NULL, *arg_tuple = NULL, *kwds = NULL, *order_str = NULL;
+        from_bytes = PyObject_GetAttrString((PyObject*)&PyLong_Type, "from_bytes"); /* look up int.from_bytes */
+        if (!from_bytes) return NULL;
+        py_bytes = PyBytes_FromStringAndSize((char*)bytes, sizeof(int));
+        if (!py_bytes) goto limited_bad;
+        order_str = PyUnicode_FromString(little ? "little" : "big");
+        if (!order_str) goto limited_bad;
+        arg_tuple = PyTuple_Pack(2, py_bytes, order_str);
+        if (!arg_tuple) goto limited_bad;
+        if (!is_unsigned) { /* kwds stays NULL for unsigned T, i.e. no keyword arguments are passed */
+            kwds = PyDict_New();
+            if (!kwds) goto limited_bad;
+            if (PyDict_SetItemString(kwds, "signed", Py_True)) goto limited_bad; /* the dict takes its own reference; passing a fresh one here would leak it */
+        }
+        result = PyObject_Call(from_bytes, arg_tuple, kwds);
+        limited_bad: /* shared cleanup for both the success and the failure path */
+        Py_XDECREF(kwds);
+        Py_XDECREF(arg_tuple);
+        Py_XDECREF(order_str);
+        Py_XDECREF(py_bytes);
+        Py_XDECREF(from_bytes);
+        return result;
+#endif
+    }
+}
+
+/* CIntFromPy */
+  static CYTHON_INLINE char __Pyx_PyInt_As_char(PyObject *x) {
+#ifdef __Pyx_HAS_GCC_DIAGNOSTIC
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wconversion"
+#endif
+    const char neg_one = (char) -1, const_zero = (char) 0;
+#ifdef __Pyx_HAS_GCC_DIAGNOSTIC
+#pragma GCC diagnostic pop
+#endif
+    const int is_unsigned = neg_one > const_zero;
+#if PY_MAJOR_VERSION < 3
+    if (likely(PyInt_Check(x))) {
+        if ((sizeof(char) < sizeof(long))) {
+            __PYX_VERIFY_RETURN_INT(char, long, PyInt_AS_LONG(x))
+        } else {
+            long val = PyInt_AS_LONG(x);
+            if (is_unsigned && unlikely(val < 0)) {
+                goto raise_neg_overflow;
+            }
+            return (char) val;
+        }
+    }
+#endif
+    if (unlikely(!PyLong_Check(x))) {
+        char val;
+        PyObject *tmp = __Pyx_PyNumber_IntOrLong(x);
+        if (!tmp) return (char) -1;
+        val = __Pyx_PyInt_As_char(tmp);
+        Py_DECREF(tmp);
+        return val;
+    }
+    if (is_unsigned) {
+#if CYTHON_USE_PYLONG_INTERNALS
+        if (unlikely(__Pyx_PyLong_IsNeg(x))) {
+            goto raise_neg_overflow;
+        } else if (__Pyx_PyLong_IsCompact(x)) {
+            __PYX_VERIFY_RETURN_INT(char, __Pyx_compact_upylong, __Pyx_PyLong_CompactValueUnsigned(x))
+        } else {
+            const digit* digits = __Pyx_PyLong_Digits(x);
+            assert(__Pyx_PyLong_DigitCount(x) > 1);
+            switch (__Pyx_PyLong_DigitCount(x)) {
+                case 2:
+                    if ((8 * sizeof(char) > 1 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 2 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(char, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(char) >= 2 * PyLong_SHIFT)) {
+                            return (char) (((((char)digits[1]) << PyLong_SHIFT) | (char)digits[0]));
+                        }
+                    }
+                    break;
+                case 3:
+                    if ((8 * sizeof(char) > 2 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 3 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(char, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(char) >= 3 * PyLong_SHIFT)) {
+                            return (char) (((((((char)digits[2]) << PyLong_SHIFT) | (char)digits[1]) << PyLong_SHIFT) | (char)digits[0]));
+                        }
+                    }
+                    break;
+                case 4:
+                    if ((8 * sizeof(char) > 3 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 4 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(char, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(char) >= 4 * PyLong_SHIFT)) {
+                            return (char) (((((((((char)digits[3]) << PyLong_SHIFT) | (char)digits[2]) << PyLong_SHIFT) | (char)digits[1]) << PyLong_SHIFT) | (char)digits[0]));
+                        }
+                    }
+                    break;
+            }
+        }
+#endif
+#if CYTHON_COMPILING_IN_CPYTHON && PY_VERSION_HEX < 0x030C00A7
+        if (unlikely(Py_SIZE(x) < 0)) {
+            goto raise_neg_overflow;
+        }
+#else
+        {
+            int result = PyObject_RichCompareBool(x, Py_False, Py_LT);
+            if (unlikely(result < 0))
+                return (char) -1;
+            if (unlikely(result == 1))
+                goto raise_neg_overflow;
+        }
+#endif
+        if ((sizeof(char) <= sizeof(unsigned long))) {
+            __PYX_VERIFY_RETURN_INT_EXC(char, unsigned long, PyLong_AsUnsignedLong(x))
+#ifdef HAVE_LONG_LONG
+        } else if ((sizeof(char) <= sizeof(unsigned PY_LONG_LONG))) {
+            __PYX_VERIFY_RETURN_INT_EXC(char, unsigned PY_LONG_LONG, PyLong_AsUnsignedLongLong(x))
+#endif
+        }
+    } else {
+#if CYTHON_USE_PYLONG_INTERNALS
+        if (__Pyx_PyLong_IsCompact(x)) {
+            __PYX_VERIFY_RETURN_INT(char, __Pyx_compact_pylong, __Pyx_PyLong_CompactValue(x))
+        } else {
+            const digit* digits = __Pyx_PyLong_Digits(x);
+            assert(__Pyx_PyLong_DigitCount(x) > 1);
+            switch (__Pyx_PyLong_SignedDigitCount(x)) {
+                case -2:
+                    if ((8 * sizeof(char) - 1 > 1 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 2 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(char, long, -(long) (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(char) - 1 > 2 * PyLong_SHIFT)) {
+                            return (char) (((char)-1)*(((((char)digits[1]) << PyLong_SHIFT) | (char)digits[0])));
+                        }
+                    }
+                    break;
+                case 2:
+                    if ((8 * sizeof(char) > 1 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 2 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(char, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(char) - 1 > 2 * PyLong_SHIFT)) {
+                            return (char) ((((((char)digits[1]) << PyLong_SHIFT) | (char)digits[0])));
+                        }
+                    }
+                    break;
+                case -3:
+                    if ((8 * sizeof(char) - 1 > 2 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 3 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(char, long, -(long) (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(char) - 1 > 3 * PyLong_SHIFT)) {
+                            return (char) (((char)-1)*(((((((char)digits[2]) << PyLong_SHIFT) | (char)digits[1]) << PyLong_SHIFT) | (char)digits[0])));
+                        }
+                    }
+                    break;
+                case 3:
+                    if ((8 * sizeof(char) > 2 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 3 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(char, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(char) - 1 > 3 * PyLong_SHIFT)) {
+                            return (char) ((((((((char)digits[2]) << PyLong_SHIFT) | (char)digits[1]) << PyLong_SHIFT) | (char)digits[0])));
+                        }
+                    }
+                    break;
+                case -4:
+                    if ((8 * sizeof(char) - 1 > 3 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 4 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(char, long, -(long) (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(char) - 1 > 4 * PyLong_SHIFT)) {
+                            return (char) (((char)-1)*(((((((((char)digits[3]) << PyLong_SHIFT) | (char)digits[2]) << PyLong_SHIFT) | (char)digits[1]) << PyLong_SHIFT) | (char)digits[0])));
+                        }
+                    }
+                    break;
+                case 4:
+                    if ((8 * sizeof(char) > 3 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 4 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(char, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(char) - 1 > 4 * PyLong_SHIFT)) {
+                            return (char) ((((((((((char)digits[3]) << PyLong_SHIFT) | (char)digits[2]) << PyLong_SHIFT) | (char)digits[1]) << PyLong_SHIFT) | (char)digits[0])));
+                        }
+                    }
+                    break;
+            }
+        }
+#endif
+        if ((sizeof(char) <= sizeof(long))) {
+            __PYX_VERIFY_RETURN_INT_EXC(char, long, PyLong_AsLong(x))
+#ifdef HAVE_LONG_LONG
+        } else if ((sizeof(char) <= sizeof(PY_LONG_LONG))) {
+            __PYX_VERIFY_RETURN_INT_EXC(char, PY_LONG_LONG, PyLong_AsLongLong(x))
+#endif
+        }
+    }
+    {
+        char val;
+        int ret = -1;
+#if PY_VERSION_HEX >= 0x030d00A6 && !CYTHON_COMPILING_IN_LIMITED_API
+        Py_ssize_t bytes_copied = PyLong_AsNativeBytes(
+            x, &val, sizeof(val), Py_ASNATIVEBYTES_NATIVE_ENDIAN | (is_unsigned ? Py_ASNATIVEBYTES_UNSIGNED_BUFFER | Py_ASNATIVEBYTES_REJECT_NEGATIVE : 0));
+        if (unlikely(bytes_copied == -1)) {
+        } else if (unlikely(bytes_copied > (Py_ssize_t) sizeof(val))) {
+            goto raise_overflow;
+        } else {
+            ret = 0;
+        }
+#elif PY_VERSION_HEX < 0x030d0000 && !(CYTHON_COMPILING_IN_PYPY || CYTHON_COMPILING_IN_LIMITED_API) || defined(_PyLong_AsByteArray)
+        int one = 1; int is_little = (int)*(unsigned char *)&one;
+        unsigned char *bytes = (unsigned char *)&val;
+        ret = _PyLong_AsByteArray((PyLongObject *)x,
+                                    bytes, sizeof(val),
+                                    is_little, !is_unsigned);
+#else
+        PyObject *v;
+        PyObject *stepval = NULL, *mask = NULL, *shift = NULL;
+        int bits, remaining_bits, is_negative = 0;
+        int chunk_size = (sizeof(long) < 8) ? 30 : 62;
+        if (likely(PyLong_CheckExact(x))) {
+            v = __Pyx_NewRef(x);
+        } else {
+            v = PyNumber_Long(x);
+            if (unlikely(!v)) return (char) -1;
+            assert(PyLong_CheckExact(v));
+        }
+        {
+            int result = PyObject_RichCompareBool(v, Py_False, Py_LT);
+            if (unlikely(result < 0)) {
+                Py_DECREF(v);
+                return (char) -1;
+            }
+            is_negative = result == 1;
+        }
+        if (is_unsigned && unlikely(is_negative)) {
+            Py_DECREF(v);
+            goto raise_neg_overflow;
+        } else if (is_negative) {
+            stepval = PyNumber_Invert(v);
+            Py_DECREF(v);
+            if (unlikely(!stepval))
+                return (char) -1;
+        } else {
+            stepval = v;
+        }
+        v = NULL;
+        val = (char) 0;
+        mask = PyLong_FromLong((1L << chunk_size) - 1); if (unlikely(!mask)) goto done;
+        shift = PyLong_FromLong(chunk_size); if (unlikely(!shift)) goto done;
+        for (bits = 0; bits < (int) sizeof(char) * 8 - chunk_size; bits += chunk_size) {
+            PyObject *tmp, *digit;
+            long idigit;
+            digit = PyNumber_And(stepval, mask);
+            if (unlikely(!digit)) goto done;
+            idigit = PyLong_AsLong(digit);
+            Py_DECREF(digit);
+            if (unlikely(idigit < 0)) goto done;
+            val |= ((char) idigit) << bits;
+            tmp = PyNumber_Rshift(stepval, shift);
+            if (unlikely(!tmp)) goto done;
+            Py_DECREF(stepval); stepval = tmp;
+        }
+        Py_DECREF(shift); shift = NULL;
+        Py_DECREF(mask); mask = NULL;
+        {
+            long idigit = PyLong_AsLong(stepval);
+            if (unlikely(idigit < 0)) goto done;
+            remaining_bits = ((int) sizeof(char) * 8) - bits - (is_unsigned ? 0 : 1);
+            if (unlikely(idigit >= (1L << remaining_bits)))
+                goto raise_overflow;
+            val |= ((char) idigit) << bits;
+        }
+        if (!is_unsigned) {
+            if (unlikely(val & (((char) 1) << (sizeof(char) * 8 - 1))))
+                goto raise_overflow;
+            if (is_negative)
+                val = ~val;
+        }
+        ret = 0;
+    done:
+        Py_XDECREF(shift);
+        Py_XDECREF(mask);
+        Py_XDECREF(stepval);
+#endif
+        if (unlikely(ret))
+            return (char) -1;
+        return val;
+    }
+raise_overflow:
+    PyErr_SetString(PyExc_OverflowError,
+        "value too large to convert to char");
+    return (char) -1;
+raise_neg_overflow:
+    PyErr_SetString(PyExc_OverflowError,
+        "can't convert negative value to char");
+    return (char) -1;
+}
+
+/* FormatTypeName */
+  #if CYTHON_COMPILING_IN_LIMITED_API
+/* Limited-API helper: fetch the attribute named by __pyx_n_s_name_2 from the
+ * type object and return it. If the lookup fails or the result is not a
+ * unicode object, any pending error is cleared and a reference to the
+ * fallback string constant __pyx_n_s__26 is returned instead. */
+static __Pyx_TypeName
+__Pyx_PyType_GetName(PyTypeObject* tp)
+{
+    PyObject *name = __Pyx_PyObject_GetAttrStr((PyObject *)tp,
+                                               __pyx_n_s_name_2);
+    if (unlikely(name == NULL) || unlikely(!PyUnicode_Check(name))) {
+        /* Best effort: never fail, fall back to the placeholder name. */
+        PyErr_Clear();
+        Py_XDECREF(name);
+        name = __Pyx_NewRef(__pyx_n_s__26);
+    }
+    return name;
+}
+#endif
+
+/* CheckBinaryVersion */
+  /* Return the version of the running interpreter as a packed number with one
+   * version component per byte (first component in bits 24-31).
+   * Newer builds take Py_Version with its low byte masked off; otherwise the
+   * string from Py_GetVersion() is parsed component by component. */
+  static unsigned long __Pyx_get_runtime_version(void) {
+#if __PYX_LIMITED_VERSION_HEX >= 0x030B00A4
+    return Py_Version & ~0xFFUL;
+#else
+    const char* rt_version = Py_GetVersion();
+    unsigned long version = 0;
+    unsigned long factor = 0x01000000UL;
+    unsigned int digit = 0;
+    int i = 0;
+    /* factor shifts right by one byte per component and reaches 0 after the
+     * fourth component, which ends the loop. */
+    while (factor) {
+        /* Accumulate one decimal component. */
+        while ('0' <= rt_version[i] && rt_version[i] <= '9') {
+            digit = digit * 10 + (unsigned int) (rt_version[i] - '0');
+            ++i;
+        }
+        version += factor * digit;
+        /* Anything other than '.' terminates the numeric part of the string. */
+        if (rt_version[i] != '.')
+            break;
+        digit = 0;
+        factor >>= 8;
+        ++i;
+    }
+    return version;
+#endif
+}
+/* Compare the compile-time (ct_version) and runtime (rt_version) Python
+ * versions on their major.minor bytes only.
+ * Returns 0 when they match and 1 when allow_newer is set and the runtime is
+ * newer; otherwise a warning is issued through PyErr_WarnEx and its result is
+ * returned. */
+static int __Pyx_check_binary_version(unsigned long ct_version, unsigned long rt_version, int allow_newer) {
+    const unsigned long MAJOR_MINOR = 0xFFFF0000UL;
+    if ((rt_version & MAJOR_MINOR) == (ct_version & MAJOR_MINOR))
+        return 0;
+    if (likely(allow_newer && (rt_version & MAJOR_MINOR) > (ct_version & MAJOR_MINOR)))
+        return 1;
+    {
+        /* Mismatch: build a message naming both versions and the module. */
+        char message[200];
+        PyOS_snprintf(message, sizeof(message),
+                      "compile time Python version %d.%d "
+                      "of module '%.100s' "
+                      "%s "
+                      "runtime version %d.%d",
+                       (int) (ct_version >> 24), (int) ((ct_version >> 16) & 0xFF),
+                       __Pyx_MODULE_NAME,
+                       (allow_newer) ? "was newer than" : "does not match",
+                       (int) (rt_version >> 24), (int) ((rt_version >> 16) & 0xFF)
+       );
+        return PyErr_WarnEx(NULL, message, 1);
+    }
+}
+
+/* InitStrings */
+  #if PY_MAJOR_VERSION >= 3
+/* Create the Python object described by string-table entry `t` and store it
+ * in *str. Entries flagged is_unicode or is_str become unicode objects
+ * (interned, decoded with t.encoding, or built directly from the bytes); all
+ * other entries become bytes objects. The length passed is t.n - 1
+ * (presumably t.n counts a trailing NUL -- confirm against the table
+ * generator). Returns 0 on success, -1 if creation or hashing fails. */
+static int __Pyx_InitString(__Pyx_StringTabEntry t, PyObject **str) {
+    if (t.is_unicode | t.is_str) {
+        if (t.intern) {
+            *str = PyUnicode_InternFromString(t.s);
+        } else if (t.encoding) {
+            *str = PyUnicode_Decode(t.s, t.n - 1, t.encoding, NULL);
+        } else {
+            *str = PyUnicode_FromStringAndSize(t.s, t.n - 1);
+        }
+    } else {
+        *str = PyBytes_FromStringAndSize(t.s, t.n - 1);
+    }
+    if (!*str)
+        return -1;
+    /* Compute the hash eagerly so a hashing failure is reported here. */
+    if (PyObject_Hash(*str) == -1)
+        return -1;
+    return 0;
+}
+#endif
+/* Initialise every entry of the string table `t`; the table is terminated by
+ * an entry whose `p` pointer is NULL. Each created object is stored through
+ * the entry's `p` pointer.
+ * Returns 0 on success, or -1 as soon as any string cannot be created or
+ * hashed. */
+static int __Pyx_InitStrings(__Pyx_StringTabEntry *t) {
+    while (t->p) {
+        #if PY_MAJOR_VERSION >= 3
+        /* Propagate the failure instead of silently leaving a NULL entry in
+         * the table and reporting success. */
+        if (__Pyx_InitString(*t, t->p) < 0)
+            return -1;
+        #else
+        if (t->is_unicode) {
+            *t->p = PyUnicode_DecodeUTF8(t->s, t->n - 1, NULL);
+        } else if (t->intern) {
+            *t->p = PyString_InternFromString(t->s);
+        } else {
+            *t->p = PyString_FromStringAndSize(t->s, t->n - 1);
+        }
+        if (!*t->p)
+            return -1;
+        if (PyObject_Hash(*t->p) == -1)
+            return -1;
+        #endif
+        ++t;
+    }
+    return 0;
+}
+
+#include <string.h>
+/* strlen() that returns a Py_ssize_t. If the length does not fit in
+ * PY_SSIZE_T_MAX, an OverflowError is set and -1 is returned. */
+static CYTHON_INLINE Py_ssize_t __Pyx_ssize_strlen(const char *s) {
+    size_t len = strlen(s);
+    if (unlikely(len > (size_t) PY_SSIZE_T_MAX)) {
+        PyErr_SetString(PyExc_OverflowError, "byte string is too long");
+        return -1;
+    }
+    return (Py_ssize_t) len;
+}
+/* Build a unicode object from a NUL-terminated C string via
+ * __Pyx_PyUnicode_FromStringAndSize. Returns NULL if the length check in
+ * __Pyx_ssize_strlen fails. */
+static CYTHON_INLINE PyObject* __Pyx_PyUnicode_FromString(const char* c_str) {
+    Py_ssize_t len = __Pyx_ssize_strlen(c_str);
+    if (unlikely(len < 0)) return NULL;
+    return __Pyx_PyUnicode_FromStringAndSize(c_str, len);
+}
+/* Build a bytearray object from a NUL-terminated C string. Returns NULL if
+ * the length check in __Pyx_ssize_strlen fails. */
+static CYTHON_INLINE PyObject* __Pyx_PyByteArray_FromString(const char* c_str) {
+    Py_ssize_t len = __Pyx_ssize_strlen(c_str);
+    if (unlikely(len < 0)) return NULL;
+    return PyByteArray_FromStringAndSize(c_str, len);
+}
+/* Convenience wrapper around __Pyx_PyObject_AsStringAndSize for callers that
+ * do not need the length; the length is written to a discarded local. */
+static CYTHON_INLINE const char* __Pyx_PyObject_AsString(PyObject* o) {
+    Py_ssize_t ignore;
+    return __Pyx_PyObject_AsStringAndSize(o, &ignore);
+}
+#if __PYX_DEFAULT_STRING_ENCODING_IS_ASCII || __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT
+#if !CYTHON_PEP393_ENABLED
+/* Non-PEP393 variant: return the char buffer of the default-encoded form of
+ * unicode object `o` and store its byte length in *length. Returns NULL when
+ * the default encoding fails or, in ASCII mode, when a byte >= 128 is found. */
+static const char* __Pyx_PyUnicode_AsStringAndSize(PyObject* o, Py_ssize_t *length) {
+    char* defenc_c;
+    PyObject* defenc = _PyUnicode_AsDefaultEncodedString(o, NULL);
+    if (!defenc) return NULL;
+    defenc_c = PyBytes_AS_STRING(defenc);
+#if __PYX_DEFAULT_STRING_ENCODING_IS_ASCII
+    {
+        char* end = defenc_c + PyBytes_GET_SIZE(defenc);
+        char* c;
+        for (c = defenc_c; c < end; c++) {
+            if ((unsigned char) (*c) >= 128) {
+                /* The result is discarded; the call is presumably made only
+                 * so that the ASCII encoding error gets raised. */
+                PyUnicode_AsASCIIString(o);
+                return NULL;
+            }
+        }
+    }
+#endif
+    *length = PyBytes_GET_SIZE(defenc);
+    return defenc_c;
+}
+#else
+/* PEP393 variant: return the UTF-8 buffer of unicode object `o` and store its
+ * length in *length. In ASCII mode only pure-ASCII strings are accepted;
+ * anything else yields NULL. Also returns NULL if __Pyx_PyUnicode_READY
+ * fails. */
+static CYTHON_INLINE const char* __Pyx_PyUnicode_AsStringAndSize(PyObject* o, Py_ssize_t *length) {
+    if (unlikely(__Pyx_PyUnicode_READY(o) == -1)) return NULL;
+#if __PYX_DEFAULT_STRING_ENCODING_IS_ASCII
+    if (likely(PyUnicode_IS_ASCII(o))) {
+        *length = PyUnicode_GET_LENGTH(o);
+        return PyUnicode_AsUTF8(o);
+    } else {
+        /* The result is discarded; the call is presumably made only so that
+         * the ASCII encoding error gets raised. */
+        PyUnicode_AsASCIIString(o);
+        return NULL;
+    }
+#else
+    return PyUnicode_AsUTF8AndSize(o, length);
+#endif
+}
+#endif
+#endif
+/* Return a char buffer for `o` and store its length in *length.
+ * Depending on build configuration, unicode objects are routed to
+ * __Pyx_PyUnicode_AsStringAndSize and bytearray objects are read directly;
+ * everything else goes through PyBytes_AsStringAndSize.
+ * Returns NULL on failure. */
+static CYTHON_INLINE const char* __Pyx_PyObject_AsStringAndSize(PyObject* o, Py_ssize_t *length) {
+#if __PYX_DEFAULT_STRING_ENCODING_IS_ASCII || __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT
+    if (
+#if PY_MAJOR_VERSION < 3 && __PYX_DEFAULT_STRING_ENCODING_IS_ASCII
+            __Pyx_sys_getdefaultencoding_not_ascii &&
+#endif
+            PyUnicode_Check(o)) {
+        return __Pyx_PyUnicode_AsStringAndSize(o, length);
+    } else
+#endif
+#if (!CYTHON_COMPILING_IN_PYPY && !CYTHON_COMPILING_IN_LIMITED_API) || (defined(PyByteArray_AS_STRING) && defined(PyByteArray_GET_SIZE))
+    if (PyByteArray_Check(o)) {
+        *length = PyByteArray_GET_SIZE(o);
+        return PyByteArray_AS_STRING(o);
+    } else
+#endif
+    /* Final fallback; the dangling `else` branches above chain into it. */
+    {
+        char* result;
+        int r = PyBytes_AsStringAndSize(o, &result, length);
+        if (unlikely(r < 0)) {
+            return NULL;
+        } else {
+            return result;
+        }
+    }
+}
+/* Truth test with a fast path: the singletons True, False and None are
+ * answered by pointer comparison; every other object is passed to
+ * PyObject_IsTrue. */
+static CYTHON_INLINE int __Pyx_PyObject_IsTrue(PyObject* x) {
+   int is_true = x == Py_True;
+   if (is_true | (x == Py_False) | (x == Py_None)) return is_true;
+   else return PyObject_IsTrue(x);
+}
+/* Truth-test `x` and then drop one reference to it. A NULL `x` returns -1
+ * without touching any reference count. */
+static CYTHON_INLINE int __Pyx_PyObject_IsTrueAndDecref(PyObject* x) {
+    int retval;
+    if (unlikely(!x)) return -1;
+    retval = __Pyx_PyObject_IsTrue(x);
+    Py_DECREF(x);
+    return retval;
+}
+/* Handle a conversion result whose type is not the expected exact integer
+ * type. On Python 3, an int subclass instance is accepted after a
+ * DeprecationWarning is issued; if issuing the warning reports an error,
+ * `result` is released and NULL is returned. Any other type raises TypeError,
+ * releases `result` and returns NULL. `type_name` is interpolated into the
+ * TypeError message. */
+static PyObject* __Pyx_PyNumber_IntOrLongWrongResultType(PyObject* result, const char* type_name) {
+    __Pyx_TypeName result_type_name = __Pyx_PyType_GetName(Py_TYPE(result));
+#if PY_MAJOR_VERSION >= 3
+    if (PyLong_Check(result)) {
+        if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
+                "__int__ returned non-int (type " __Pyx_FMT_TYPENAME ").  "
+                "The ability to return an instance of a strict subclass of int is deprecated, "
+                "and may be removed in a future version of Python.",
+                result_type_name)) {
+            __Pyx_DECREF_TypeName(result_type_name);
+            Py_DECREF(result);
+            return NULL;
+        }
+        __Pyx_DECREF_TypeName(result_type_name);
+        return result;
+    }
+#endif
+    PyErr_Format(PyExc_TypeError,
+                 "__%.4s__ returned non-%.4s (type " __Pyx_FMT_TYPENAME ")",
+                 type_name, type_name, result_type_name);
+    __Pyx_DECREF_TypeName(result_type_name);
+    Py_DECREF(result);
+    return NULL;
+}
+/* Convert `x` to an integer object.
+ * Integer instances are returned directly via __Pyx_NewRef. Otherwise the
+ * type's nb_int (or nb_long on Python 2) slot is called when type slots are
+ * enabled; without type slots PyNumber_Int is used, except for exact bytes and
+ * unicode objects. A result of an unexpected type is passed to
+ * __Pyx_PyNumber_IntOrLongWrongResultType. If no conversion was possible and
+ * no error is pending, a TypeError is set. Returns NULL on failure. */
+static CYTHON_INLINE PyObject* __Pyx_PyNumber_IntOrLong(PyObject* x) {
+#if CYTHON_USE_TYPE_SLOTS
+  PyNumberMethods *m;
+#endif
+  const char *name = NULL;
+  PyObject *res = NULL;
+#if PY_MAJOR_VERSION < 3
+  if (likely(PyInt_Check(x) || PyLong_Check(x)))
+#else
+  if (likely(PyLong_Check(x)))
+#endif
+    return __Pyx_NewRef(x);
+#if CYTHON_USE_TYPE_SLOTS
+  m = Py_TYPE(x)->tp_as_number;
+  #if PY_MAJOR_VERSION < 3
+  if (m && m->nb_int) {
+    name = "int";
+    res = m->nb_int(x);
+  }
+  else if (m && m->nb_long) {
+    name = "long";
+    res = m->nb_long(x);
+  }
+  #else
+  if (likely(m && m->nb_int)) {
+    name = "int";
+    res = m->nb_int(x);
+  }
+  #endif
+#else
+  if (!PyBytes_CheckExact(x) && !PyUnicode_CheckExact(x)) {
+    res = PyNumber_Int(x);
+  }
+#endif
+  if (likely(res)) {
+#if PY_MAJOR_VERSION < 3
+    if (unlikely(!PyInt_Check(res) && !PyLong_Check(res))) {
+#else
+    if (unlikely(!PyLong_CheckExact(res))) {
+#endif
+        return __Pyx_PyNumber_IntOrLongWrongResultType(res, name);
+    }
+  }
+  else if (!PyErr_Occurred()) {
+    /* No slot was available and nothing raised: report a generic error. */
+    PyErr_SetString(PyExc_TypeError,
+                    "an integer is required");
+  }
+  return res;
+}
+/* Convert an index-like object to Py_ssize_t.
+ * Exact int objects take fast paths (compact values, or 2-4 digit values
+ * assembled by hand when Py_ssize_t is wide enough) before falling back to
+ * PyLong_AsSsize_t. Other objects go through PyNumber_Index first.
+ * Returns -1 if PyNumber_Index fails. */
+static CYTHON_INLINE Py_ssize_t __Pyx_PyIndex_AsSsize_t(PyObject* b) {
+  Py_ssize_t ival;
+  PyObject *x;
+#if PY_MAJOR_VERSION < 3
+  if (likely(PyInt_CheckExact(b))) {
+    if (sizeof(Py_ssize_t) >= sizeof(long))
+        return PyInt_AS_LONG(b);
+    else
+        return PyInt_AsSsize_t(b);
+  }
+#endif
+  if (likely(PyLong_CheckExact(b))) {
+    #if CYTHON_USE_PYLONG_INTERNALS
+    if (likely(__Pyx_PyLong_IsCompact(b))) {
+        return __Pyx_PyLong_CompactValue(b);
+    } else {
+      const digit* digits = __Pyx_PyLong_Digits(b);
+      const Py_ssize_t size = __Pyx_PyLong_SignedDigitCount(b);
+      /* Each case combines the digits only if the result is guaranteed to
+       * fit; otherwise it breaks out to the generic conversion below. */
+      switch (size) {
+         case 2:
+           if (8 * sizeof(Py_ssize_t) > 2 * PyLong_SHIFT) {
+             return (Py_ssize_t) (((((size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0]));
+           }
+           break;
+         case -2:
+           if (8 * sizeof(Py_ssize_t) > 2 * PyLong_SHIFT) {
+             return -(Py_ssize_t) (((((size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0]));
+           }
+           break;
+         case 3:
+           if (8 * sizeof(Py_ssize_t) > 3 * PyLong_SHIFT) {
+             return (Py_ssize_t) (((((((size_t)digits[2]) << PyLong_SHIFT) | (size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0]));
+           }
+           break;
+         case -3:
+           if (8 * sizeof(Py_ssize_t) > 3 * PyLong_SHIFT) {
+             return -(Py_ssize_t) (((((((size_t)digits[2]) << PyLong_SHIFT) | (size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0]));
+           }
+           break;
+         case 4:
+           if (8 * sizeof(Py_ssize_t) > 4 * PyLong_SHIFT) {
+             return (Py_ssize_t) (((((((((size_t)digits[3]) << PyLong_SHIFT) | (size_t)digits[2]) << PyLong_SHIFT) | (size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0]));
+           }
+           break;
+         case -4:
+           if (8 * sizeof(Py_ssize_t) > 4 * PyLong_SHIFT) {
+             return -(Py_ssize_t) (((((((((size_t)digits[3]) << PyLong_SHIFT) | (size_t)digits[2]) << PyLong_SHIFT) | (size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0]));
+           }
+           break;
+      }
+    }
+    #endif
+    return PyLong_AsSsize_t(b);
+  }
+  x = PyNumber_Index(b);
+  if (!x) return -1;
+  ival = PyInt_AsSsize_t(x);
+  Py_DECREF(x);
+  return ival;
+}
+/* Convert an index-like object to Py_hash_t. When Py_hash_t and Py_ssize_t
+ * have the same size, the work is delegated to __Pyx_PyIndex_AsSsize_t;
+ * otherwise the object is converted with PyNumber_Index and PyInt_AsLong.
+ * Returns -1 if PyNumber_Index fails. */
+static CYTHON_INLINE Py_hash_t __Pyx_PyIndex_AsHash_t(PyObject* o) {
+  if (sizeof(Py_hash_t) == sizeof(Py_ssize_t)) {
+    return (Py_hash_t) __Pyx_PyIndex_AsSsize_t(o);
+#if PY_MAJOR_VERSION < 3
+  } else if (likely(PyInt_CheckExact(o))) {
+    return PyInt_AS_LONG(o);
+#endif
+  } else {
+    Py_ssize_t ival;
+    PyObject *x;
+    x = PyNumber_Index(o);
+    if (!x) return -1;
+    ival = PyInt_AsLong(x);
+    Py_DECREF(x);
+    return ival;
+  }
+}
+/* Map a C truth value to Py_True (non-zero) or Py_False (zero), returned
+ * through __Pyx_NewRef. */
+static CYTHON_INLINE PyObject * __Pyx_PyBool_FromLong(long b) {
+  return b ? __Pyx_NewRef(Py_True) : __Pyx_NewRef(Py_False);
+}
+/* Thin wrapper: build a Python integer object from a size_t value via
+ * PyInt_FromSize_t. */
+static CYTHON_INLINE PyObject * __Pyx_PyInt_FromSize_t(size_t ival) {
+    return PyInt_FromSize_t(ival);
+}
+
+
+/* #### Code section: utility_code_pragmas_end ### */
+#ifdef _MSC_VER
+#pragma warning( pop )
+#endif
+
+
+
+/* #### Code section: end ### */
+#endif /* Py_PYTHON_H */
diff --git a/fairseq/data/data_utils_fast.cpython-310-x86_64-linux-gnu.so b/fairseq/data/data_utils_fast.cpython-310-x86_64-linux-gnu.so
new file mode 100644
index 0000000000000000000000000000000000000000..3f9f034218ccae9ed274144e459e40d10b0d43bf
--- /dev/null
+++ b/fairseq/data/data_utils_fast.cpython-310-x86_64-linux-gnu.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9c75ab2d7aba9471ec9d32a4c76dc68e51a9fd8ee6d3553637c68f70fceb2451
+size 258392
diff --git a/fairseq/data/data_utils_fast.pyx b/fairseq/data/data_utils_fast.pyx
new file mode 100644
index 0000000000000000000000000000000000000000..c1f97bf5b661e1b597a447075f282ea56ccca796
--- /dev/null
+++ b/fairseq/data/data_utils_fast.pyx
@@ -0,0 +1,122 @@
+# cython: language_level=3
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+
+cimport cython
+cimport numpy as np
+
+DTYPE = np.int64
+ctypedef np.int64_t DTYPE_t
+
+
+cdef _is_batch_full(long num_sentences, long num_tokens, long max_tokens, long max_sentences):
+    """Return 1 if a batch has hit one of its size limits, else 0.
+
+    An empty batch (``num_sentences == 0``) is never full. Otherwise the
+    batch is full when ``max_sentences`` is positive and equals
+    ``num_sentences``, or when ``max_tokens`` is positive and ``num_tokens``
+    exceeds it. Non-positive limits are ignored.
+    """
+    if num_sentences == 0:
+        return 0
+    if max_sentences > 0 and num_sentences == max_sentences:
+        return 1
+    if max_tokens > 0 and num_tokens > max_tokens:
+        return 1
+    return 0
+
+
+@cython.cdivision(True)
+cpdef list batch_by_size_fast(
+    np.ndarray[DTYPE_t, ndim=1] indices,
+    num_tokens_fn,
+    long max_tokens,
+    long max_sentences,
+    int bsz_mult,
+):
+    """Group ``indices`` into consecutive batches bounded by size limits.
+
+    Args:
+        indices: 1-D int64 array of sample indices, visited in order.
+        num_tokens_fn: callable returning the token count for an index.
+        max_tokens: cap on ``batch size * longest sample in the batch``;
+            ignored when not positive.
+        max_sentences: cap on the number of samples per batch; ignored when
+            not positive.
+        bsz_mult: when a full batch holds at least ``bsz_mult`` samples, it is
+            trimmed to a multiple of ``bsz_mult`` and the remainder is carried
+            over into the next batch.
+
+    Returns:
+        A list of batches, each a list of indices.
+
+    Raises:
+        AssertionError: if ``max_tokens`` is positive and the running maximum
+            sample length exceeds it.
+    """
+    cdef long sample_len = 0
+    cdef list sample_lens = []
+    cdef list batch = []
+    cdef list batches = []
+    cdef long mod_len
+    cdef long i
+    cdef long idx
+    cdef long num_tokens
+    cdef DTYPE_t[:] indices_view = indices
+
+    for i in range(len(indices_view)):
+        idx = indices_view[i]
+        num_tokens = num_tokens_fn(idx)
+        sample_lens.append(num_tokens)
+        # Longest sample among the current batch plus the candidate.
+        sample_len = max(sample_len, num_tokens)
+
+        assert max_tokens <= 0 or sample_len <= max_tokens, (
+            "sentence at index {} of size {} exceeds max_tokens "
+            "limit of {}!".format(idx, sample_len, max_tokens)
+        )
+        # Cost of the batch if the candidate were added, with every sample
+        # counted at the longest length.
+        num_tokens = (len(batch) + 1) * sample_len
+
+        if _is_batch_full(len(batch), num_tokens, max_tokens, max_sentences):
+            # Emit the largest multiple of bsz_mult, or the whole batch when
+            # it holds fewer than bsz_mult samples.
+            mod_len = max(
+                bsz_mult * (len(batch) // bsz_mult),
+                len(batch) % bsz_mult,
+            )
+            batches.append(batch[:mod_len])
+            batch = batch[mod_len:]
+            # sample_lens still includes the candidate, so the recomputed
+            # maximum covers the carried-over samples and the candidate.
+            sample_lens = sample_lens[mod_len:]
+            sample_len = max(sample_lens) if len(sample_lens) > 0 else 0
+        batch.append(idx)
+    if len(batch) > 0:
+        batches.append(batch)
+    return batches
+
+
+cdef _find_valid_shape(
+    DTYPE_t[:, :] shapes_view,
+    long num_sentences,
+    long num_tokens,
+):
+    """Return index of first valid shape or -1 if none is found.
+
+    A row of ``shapes_view`` is valid when its first column is at least
+    ``num_sentences`` and its second column is at least ``num_tokens``.
+    """
+    for i in range(shapes_view.shape[0]):
+        if num_sentences <= shapes_view[i][0] and num_tokens <= shapes_view[i][1]:
+            return i
+    return -1
+
+
+@cython.cdivision(True)
+cpdef list batch_fixed_shapes_fast(
+    np.ndarray[DTYPE_t, ndim=1] indices,
+    num_tokens_fn,
+    np.ndarray[DTYPE_t, ndim=2] fixed_shapes_sorted,
+):
+    """Group ``indices`` into consecutive batches that fit a fixed shape.
+
+    Args:
+        indices: 1-D int64 array of sample indices, visited in order.
+        num_tokens_fn: callable returning the token count for an index.
+        fixed_shapes_sorted: 2-D int64 array whose rows are
+            ``(max sentences, max tokens per sample)`` pairs. The shape search
+            is narrowed as a batch grows, which assumes the rows are ordered
+            so that a row rejected for a smaller batch never becomes valid for
+            a larger one.
+
+    Returns:
+        A list of non-empty batches, each a list of indices. A sample that
+        fits no shape is still emitted, in a batch of its own.
+    """
+    cdef long sample_len = 0
+    cdef list batch = []
+    cdef list batches = []
+    cdef long i
+    cdef long idx
+    cdef long num_tokens
+    cdef DTYPE_t[:] indices_view = indices
+    cdef DTYPE_t[:, :] shapes_view = fixed_shapes_sorted
+
+    for i in range(len(indices_view)):
+        idx = indices_view[i]
+        num_tokens = num_tokens_fn(idx)
+        # Longest sample among the current batch plus the candidate.
+        sample_len = max(sample_len, num_tokens)
+
+        shape_idx = _find_valid_shape(shapes_view, len(batch) + 1, sample_len)
+        if shape_idx == -1:
+            # The candidate does not fit: flush the current batch (never an
+            # empty one) and start a new batch with the candidate.
+            if len(batch) > 0:
+                batches.append(batch)
+            batch = []
+            # The candidate opens the new batch, so its length must be carried
+            # over; resetting to 0 would ignore it in later shape checks.
+            sample_len = num_tokens
+            shapes_view = fixed_shapes_sorted
+        elif shape_idx > 0:
+            # small optimization for the next call to _find_valid_shape
+            shapes_view = shapes_view[shape_idx:]
+
+        batch.append(idx)
+
+    if len(batch) > 0:
+        batches.append(batch)
+
+    return batches
diff --git a/fairseq/data/denoising_dataset.py b/fairseq/data/denoising_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..8dc240c1ebebeba2cec15dcba9d8b2f27d8fa050
--- /dev/null
+++ b/fairseq/data/denoising_dataset.py
@@ -0,0 +1,407 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+import torch
+import math
+
+from . import data_utils, FairseqDataset
+
+
+def collate(
+    samples,
+    pad_idx,
+    eos_idx,
+    vocab,
+    left_pad_source=False,
+    left_pad_target=False,
+    input_feeding=True,
+    pad_to_length=None,
+):
+    assert input_feeding
+    if len(samples) == 0:
+        return {}
+
+    def merge(key, left_pad, move_eos_to_beginning=False, pad_to_length=None):
+        return data_utils.collate_tokens(
+            [s[key] for s in samples],
+            pad_idx,
+            eos_idx=None,  # use eos_idx of each sample instead of vocab.eos()
+            left_pad=left_pad,
+            move_eos_to_beginning=move_eos_to_beginning,
+            pad_to_length=pad_to_length,
+        )
+
+    id = torch.LongTensor([s['id'] for s in samples])
+    src_tokens = merge(
+        'source', left_pad=left_pad_source,
+        pad_to_length=pad_to_length['source'] if pad_to_length is not None else None,
+    )
+    # sort by descending source length
+    src_lengths = torch.LongTensor([s['source'].numel() for s in samples])
+    src_lengths, sort_order = src_lengths.sort(descending=True)
+    id = id.index_select(0, sort_order)
+    src_tokens = src_tokens.index_select(0, sort_order)
+
+    prev_output_tokens = None
+    target = None
+    if samples[0].get('target', None) is not None:
+        target = merge(
+            'target', left_pad=left_pad_target,
+            pad_to_length=pad_to_length['target'] if pad_to_length is not None else None,
+        )
+        target = target.index_select(0, sort_order)
+        ntokens = sum(len(s['target']) for s in samples)
+
+        if input_feeding:
+            # we create a shifted version of targets for feeding the
+            # previous output token(s) into the next decoder step
+            prev_output_tokens = merge(
+                'target',
+                left_pad=left_pad_target,
+                move_eos_to_beginning=True,
+                pad_to_length=pad_to_length['target'] if pad_to_length is not None else None,
+            )
+            prev_output_tokens = prev_output_tokens.index_select(0, sort_order)
+    else:
+        ntokens = sum(len(s['source']) for s in samples)
+
+    batch = {
+        'id': id,
+        'ntokens': ntokens,
+        'net_input': {
+            'src_tokens': src_tokens,
+            'src_lengths': src_lengths,
+        },
+        'target': target,
+        'nsentences': samples[0]['source'].size(0),
+        'sort_order': sort_order,
+    }
+    if prev_output_tokens is not None:
+        batch['net_input']['prev_output_tokens'] = prev_output_tokens
+
+    return batch
+
+
+class DenoisingDataset(FairseqDataset):
+    """
+    A wrapper around TokenBlockDataset for BART dataset.
+
+    Args:
+        dataset (TokenBlockDataset): dataset to wrap
+        sizes (List[int]): sentence lengths
+        vocab (~fairseq.data.Dictionary): vocabulary
+        mask_idx (int): dictionary index used for masked token
+        mask_whole_words: only mask whole words. This should be a byte mask
+            over vocab indices, indicating whether it is the beginning of a
+            word. We will extend any mask to encompass the whole word.
+        shuffle (bool, optional): shuffle the elements before batching.
+          Default: ``True``
+        seed: Seed for random number generator for reproducibility.
+        args: argparse arguments.
+    """
+
+    def __init__(
+        self,
+        dataset,
+        sizes,
+        vocab,
+        mask_idx,
+        mask_whole_words,
+        shuffle,
+        seed,
+        args,
+        eos=None,
+        item_transform_func=None,
+    ):
+        self.dataset = dataset
+
+        self.sizes = sizes
+
+        self.vocab = vocab
+        self.shuffle = shuffle
+        self.seed = seed
+        self.mask_idx = mask_idx
+        self.mask_whole_word = mask_whole_words
+        self.mask_ratio = args.mask
+        self.random_ratio = args.mask_random
+        self.insert_ratio = args.insert
+        self.rotate_ratio = args.rotate
+        self.permute_sentence_ratio = args.permute_sentences
+        self.eos = (eos if eos is not None else vocab.eos())
+        self.item_transform_func = item_transform_func
+
+        if args.bpe != 'gpt2':
+            self.full_stop_index = self.vocab.eos()
+        else:
+            assert args.bpe == 'gpt2'
+            self.full_stop_index = self.vocab.index('13')
+
+        self.replace_length = args.replace_length
+        if self.replace_length not in [-1, 0, 1]:
+            raise ValueError(f'invalid arg: replace_length={self.replace_length}')
+        if args.mask_length not in ['subword', 'word', 'span-poisson']:
+            raise ValueError(f'invalid arg: mask-length={args.mask_length}')
+        if args.mask_length == 'subword' and args.replace_length not in [0, 1]:
+            raise ValueError(f'if using subwords, use replace-length=1 or 0')
+
+        self.mask_span_distribution = None
+        if args.mask_length == 'span-poisson':
+            _lambda = args.poisson_lambda
+
+            lambda_to_the_k = 1
+            e_to_the_minus_lambda = math.exp(-_lambda)
+            k_factorial = 1
+            ps = []
+            for k in range(0, 128):
+                ps.append(e_to_the_minus_lambda * lambda_to_the_k / k_factorial)
+                lambda_to_the_k *= _lambda
+                k_factorial *= (k + 1)
+                if ps[-1] < 0.0000001:
+                    break
+            ps = torch.FloatTensor(ps)
+            self.mask_span_distribution = torch.distributions.Categorical(ps)
+
+        self.epoch = 0
+
+    def set_epoch(self, epoch, **unused):
+        """Record the current epoch; it is part of the seed used in ``__getitem__``."""
+        self.epoch = epoch
+
+    def __getitem__(self, index):
+        """Return one noised example.
+
+        The clean tokens become ``target``; ``source`` goes through the
+        enabled noise functions (sentence permutation, word masking,
+        insertion, rolling) in that order.
+
+        Returns:
+            dict with keys ``'id'``, ``'source'`` and ``'target'``.
+        """
+        # presumably makes the noise reproducible per (seed, epoch, index);
+        # the exact effect depends on data_utils.numpy_seed -- TODO confirm
+        with data_utils.numpy_seed(self.seed, self.epoch, index):
+            tokens = self.dataset[index]
+            assert tokens[-1] == self.eos
+            # source aliases ``tokens``; only target is a copy
+            source, target = tokens, tokens.clone()
+
+            if self.permute_sentence_ratio > 0.0:
+                source = self.permute_sentences(source, self.permute_sentence_ratio)
+
+            if self.mask_ratio > 0:
+                source = self.add_whole_word_mask(source, self.mask_ratio)
+
+            if self.insert_ratio > 0:
+                source = self.add_insertion_noise(source, self.insert_ratio)
+
+            # rolling is applied to the whole example with probability rotate_ratio
+            if self.rotate_ratio > 0.0 and np.random.random() < self.rotate_ratio:
+                source = self.add_rolling_noise(source)
+        # there can be additional changes to make:
+        if self.item_transform_func is not None:
+            source, target = self.item_transform_func(source, target)
+
+        # sanity checks on the noised source: valid ids, <bos> first, <eos> last
+        assert (source >= 0).all()
+        assert (source[1:-1] >= 1).all()
+        assert (source <= len(self.vocab)).all()
+        assert source[0] == self.vocab.bos()
+        assert source[-1] == self.eos
+        return {
+            'id': index,
+            'source': source,
+            'target': target,
+        }
+
+    def __len__(self):
+        """Return the number of examples in the wrapped dataset."""
+        return len(self.dataset)
+
+    def permute_sentences(self, source, p=1.0):
+        """Shuffle a fraction ``p`` of the sentences in ``source``.
+
+        Sentences are delimited by ``self.full_stop_index``. The leading
+        token (``<bos>``) stays in place; a copy of ``source`` with the
+        sentence spans rewritten in the new order is returned.
+        """
+        is_stop = (source == self.full_stop_index)
+        # Pretend it ends with a full stop so last span is a sentence
+        is_stop[-2] = 1
+
+        # Tokens that are full stops, where the previous token is not
+        ends = (is_stop[1:] * ~is_stop[:-1]).nonzero() + 2
+        result = source.clone()
+
+        sentence_count = ends.size(0)
+        permute_count = math.ceil((sentence_count * 2 * p) / 2.0)
+        chosen = torch.randperm(sentence_count)[:permute_count]
+        order = torch.arange(0, sentence_count)
+        # only the chosen sentence slots are shuffled among themselves
+        order[chosen] = chosen[torch.randperm(permute_count)]
+
+        # Ignore <bos> at start
+        cursor = 1
+        for sent_id in order:
+            if sent_id > 0:
+                start = ends[sent_id - 1]
+            else:
+                start = 1
+            piece = source[start:ends[sent_id]]
+            width = piece.size(0)
+            result[cursor:cursor + width] = piece
+            cursor += width
+        return result
+
+    def word_starts(self, source):
+        """Return a per-token indicator of word starts for ``source``.
+
+        Without ``self.mask_whole_word`` every token counts as a word start.
+        The first and last positions are always cleared.
+        """
+        if self.mask_whole_word is None:
+            starts = torch.ones(source.size())
+        else:
+            # presumably a per-vocabulary table looked up by token id -- TODO confirm
+            starts = self.mask_whole_word.gather(0, source)
+        for boundary in (0, -1):
+            starts[boundary] = 0
+        return starts
+
+    def add_whole_word_mask(self, source, p):
+        """Mask, replace or delete a fraction ``p`` of the words in ``source``.
+
+        Word starts come from ``self.word_starts``. When a span-length
+        distribution is configured, span lengths are sampled from it and
+        zero-length spans are turned into insertions through
+        ``add_insertion_noise``; otherwise every span covers one word.
+
+        ``self.replace_length`` selects what happens to a span:
+        ``0`` drops every token of the span, ``1`` keeps one replacement
+        token at the span start and drops the rest, ``-1`` overwrites every
+        token of the span. A replacement token is ``self.mask_idx`` or, for
+        spans drawn with probability ``self.random_ratio``, a random index.
+
+        Note: overwriting happens on ``source`` in place before the filtered
+        result is returned.
+        """
+        is_word_start = self.word_starts(source)
+        num_to_mask = int(math.ceil(is_word_start.float().sum() * p))
+        num_inserts = 0
+        if num_to_mask == 0:
+            return source
+
+        if self.mask_span_distribution is not None:
+            lengths = self.mask_span_distribution.sample(sample_shape=(num_to_mask,))
+
+            # Make sure we have enough to mask
+            cum_length = torch.cumsum(lengths, 0)
+            while cum_length[-1] < num_to_mask:
+                lengths = torch.cat([lengths, self.mask_span_distribution.sample(sample_shape=(num_to_mask,))], dim=0)
+                cum_length = torch.cumsum(lengths, 0)
+
+            # Trim to masking budget
+            i = 0
+            while cum_length[i] < num_to_mask:
+                i += 1
+            # shorten the span that crosses the budget so the total is exact
+            lengths[i] = num_to_mask - (0 if i == 0 else cum_length[i - 1])
+            # from here on num_to_mask counts spans rather than words
+            num_to_mask = i + 1
+            lengths = lengths[:num_to_mask]
+
+            # Handle 0-length mask (inserts) separately
+            lengths = lengths[lengths > 0]
+            num_inserts = num_to_mask - lengths.size(0)
+            num_to_mask -= num_inserts
+            if num_to_mask == 0:
+                return self.add_insertion_noise(source, num_inserts / source.size(0))
+
+            assert (lengths > 0).all()
+        else:
+            lengths = torch.ones((num_to_mask,)).long()
+        assert is_word_start[-1] == 0
+        word_starts = is_word_start.nonzero()
+        # pick num_to_mask distinct word-start positions at random
+        indices = word_starts[torch.randperm(word_starts.size(0))[:num_to_mask]].squeeze(1)
+        # per-span flag: use a random token instead of the mask token
+        mask_random = torch.FloatTensor(num_to_mask).uniform_() < self.random_ratio
+
+        source_length = source.size(0)
+        assert source_length - 1 not in indices
+        to_keep = torch.ones(source_length, dtype=torch.bool)
+        is_word_start[-1] = 255 # acts as a long length, so spans don't go over the end of doc
+        if self.replace_length == 0:
+            to_keep[indices] = 0
+        else:
+            # keep index, but replace it with [MASK]
+            source[indices] = self.mask_idx
+            source[indices[mask_random]] = torch.randint(1, len(self.vocab), size=(mask_random.sum(),))
+
+        if self.mask_span_distribution is not None:
+            assert len(lengths.size()) == 1
+            assert lengths.size() == indices.size()
+            # the span's first word start was already handled above
+            lengths -= 1
+            while indices.size(0) > 0:
+                assert lengths.size() == indices.size()
+                # stepping onto the next word start uses up one word of the span
+                lengths -= is_word_start[indices + 1].long()
+                uncompleted = lengths >= 0
+                indices = indices[uncompleted] + 1
+                mask_random = mask_random[uncompleted]
+                lengths = lengths[uncompleted]
+                if self.replace_length != -1:
+                    # delete token
+                    to_keep[indices] = 0
+                else:
+                    # keep index, but replace it with [MASK]
+                    source[indices] = self.mask_idx
+                    source[indices[mask_random]] = torch.randint(1, len(self.vocab), size=(mask_random.sum(),))
+        else:
+            # A bit faster when all lengths are 1
+            while indices.size(0) > 0:
+                # keep extending only while the next token continues the same word
+                uncompleted = is_word_start[indices + 1] == 0
+                indices = indices[uncompleted] + 1
+                mask_random = mask_random[uncompleted]
+                if self.replace_length != -1:
+                    # delete token
+                    to_keep[indices] = 0
+                else:
+                    # keep index, but replace it with [MASK]
+                    source[indices] = self.mask_idx
+                    source[indices[mask_random]] = torch.randint(1, len(self.vocab), size=(mask_random.sum(),))
+
+                assert source_length - 1 not in indices
+
+        # drop every position marked for deletion
+        source = source[to_keep]
+
+        if num_inserts > 0:
+            source = self.add_insertion_noise(source, num_inserts / source.size(0))
+
+        return source
+
+    def add_permuted_noise(self, tokens, p):
+        """Shuffle a random subset of the interior tokens among themselves.
+
+        The first and last positions are never selected. ``tokens`` is
+        modified in place and also returned.
+
+        Args:
+            tokens: 1-D tensor of token ids.
+            p: fraction of ``len(tokens)`` to permute.
+        """
+        num_words = len(tokens)
+        # Only the interior positions are eligible. Cap the request at that
+        # count, otherwise the second randperm indexes past the end of
+        # ``substitutions`` (e.g. 4 tokens with p=1.0).
+        num_candidates = max(num_words - 2, 0)
+        num_to_permute = min(math.ceil(((num_words * 2) * p) / 2.0), num_candidates)
+        substitutions = torch.randperm(num_candidates)[:num_to_permute] + 1
+        tokens[substitutions] = tokens[substitutions[torch.randperm(num_to_permute)]]
+        return tokens
+
+    def add_rolling_noise(self, tokens):
+        """Rotate the interior of ``tokens`` by a random offset.
+
+        The first and last tokens keep their positions; the tokens between
+        them are cut at a random point and the two pieces are swapped.
+        """
+        upper = max(1, tokens.size(-1) - 1) + 1
+        offset = np.random.randint(1, upper)
+        head, tail = tokens[0:1], tokens[-1:]
+        rotated = (head, tokens[offset:-1], tokens[1:offset], tail)
+        return torch.cat(rotated, dim=0)
+
+    def add_insertion_noise(self, tokens, p):
+        """Insert ``ceil(len(tokens) * p)`` noise tokens at random positions.
+
+        A ``self.random_ratio`` share of the inserted tokens are random
+        indices; the rest are ``self.mask_idx``. The first and last output
+        positions are never chosen for insertion. ``tokens`` is returned
+        unchanged when ``p`` is 0.
+        """
+        if p == 0.0:
+            return tokens
+
+        original_len = len(tokens)
+        num_new = int(math.ceil(original_len * p))
+        total_len = original_len + num_new
+
+        # candidate slots exclude the first and last output positions
+        slots = torch.randperm(total_len - 2)[:num_new] + 1
+        is_noise = torch.zeros(size=(total_len,), dtype=torch.bool)
+        is_noise[slots] = 1
+
+        # -1 marks slots that have not been filled yet
+        result = torch.LongTensor(total_len).fill_(-1)
+        random_count = int(math.ceil(num_new * self.random_ratio))
+        result[slots[random_count:]] = self.mask_idx
+        result[slots[:random_count]] = torch.randint(low=1, high=len(self.vocab), size=(random_count,))
+
+        # the original tokens fill every remaining slot, in order
+        result[~is_noise] = tokens
+
+        assert (result >= 0).all()
+        return result
+
+    def collater(self, samples, pad_to_length=None):
+        """Merge a list of samples to form a mini-batch.
+        Args:
+            samples (List[dict]): samples to collate
+            pad_to_length: forwarded unchanged to ``collate``
+        Returns:
+            dict: a mini-batch of data
+        """
+        # batching is delegated to ``collate`` with this dataset's pad/eos ids and vocab
+        return collate(
+            samples, self.vocab.pad(), self.eos, self.vocab,
+            pad_to_length=pad_to_length)
+
+    def num_tokens(self, index):
+        """Return the number of tokens in a sample. This value is used to
+        enforce ``--max-tokens`` during batching."""
+        # read from self.sizes, so the length is that of the un-noised example
+        # (assuming sizes was built from the wrapped dataset -- TODO confirm)
+        return self.sizes[index]
+
+    def size(self, index):
+        """Return an example's size as a float or tuple. This value is used when
+        filtering a dataset with ``--max-positions``."""
+        # same lookup as num_tokens: one entry of self.sizes per example
+        return self.sizes[index]
+
+    def ordered_indices(self):
+        """Return example indices sorted by size.
+
+        Batches are built in this order. With ``self.shuffle`` the indices
+        are permuted first; the stable sort then keeps that random order
+        among examples of equal size.
+        """
+        count = len(self)
+        indices = np.random.permutation(count) if self.shuffle else np.arange(count)
+        by_size = np.argsort(self.sizes[indices], kind='mergesort')
+        return indices[by_size]
+
+    def prefetch(self, indices):
+        """Ask the wrapped dataset to prefetch the given example indices."""
+        # The other methods of this class read from ``self.dataset``; no
+        # ``src``/``tgt`` pair is used anywhere else, so the previous
+        # ``self.src``/``self.tgt`` calls raised AttributeError.
+        self.dataset.prefetch(indices)
+
+    @property
+    def supports_prefetch(self):
+        """Whether the wrapped dataset advertises prefetch support."""
+        # Check ``self.dataset`` (the member the other methods read from)
+        # instead of the ``src``/``tgt`` pair, which is not used elsewhere.
+        return (
+            hasattr(self.dataset, 'supports_prefetch')
+            and self.dataset.supports_prefetch
+        )
diff --git a/fairseq/data/dictionary.py b/fairseq/data/dictionary.py
new file mode 100644
index 0000000000000000000000000000000000000000..01a6a8148688173d991017b39338068711e90b23
--- /dev/null
+++ b/fairseq/data/dictionary.py
@@ -0,0 +1,388 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import os
+from collections import Counter
+from multiprocessing import Pool
+
+import torch
+from fairseq import utils
+from fairseq.binarizer import safe_readline
+from fairseq.data import data_utils
+from fairseq.file_io import PathManager
+from fairseq.tokenizer import tokenize_line
+
+
+class Dictionary(object):
+    """A mapping from symbols to consecutive integers"""
+
+    def __init__(
+        self,
+        *,  # begin keyword-only arguments
+        pad="<pad>",
+        eos="</s>",
+        unk="<unk>",
+        bos="<s>",
+        extra_special_symbols=None,
+    ):
+        """Create a dictionary holding only the special symbols.
+
+        The symbols are registered in the order bos, pad, eos, unk, followed
+        by *extra_special_symbols*; ``nspecial`` records how many there are.
+        """
+        self.unk_word, self.pad_word, self.eos_word = unk, pad, eos
+        self.symbols = []
+        self.count = []
+        self.indices = {}
+        self.bos_index = self.add_symbol(bos)
+        self.pad_index = self.add_symbol(pad)
+        self.eos_index = self.add_symbol(eos)
+        self.unk_index = self.add_symbol(unk)
+        if extra_special_symbols:
+            for s in extra_special_symbols:
+                self.add_symbol(s)
+        self.nspecial = len(self.symbols)
+
+    def __eq__(self, other):
+        """Two dictionaries are equal when their symbol -> index maps are equal."""
+        return self.indices == other.indices
+
+    def __getitem__(self, idx):
+        """Return the symbol at *idx*, or the unk word when *idx* is past the end."""
+        if idx < len(self.symbols):
+            return self.symbols[idx]
+        return self.unk_word
+
+    def __len__(self):
+        """Returns the number of symbols in the dictionary"""
+        return len(self.symbols)
+
+    def __contains__(self, sym):
+        """Return True when *sym* has an index in this dictionary."""
+        return sym in self.indices
+
+    def index(self, sym):
+        """Returns the index of the specified symbol"""
+        assert isinstance(sym, str)
+        if sym in self.indices:
+            return self.indices[sym]
+        return self.unk_index
+
+    def string(
+        self,
+        tensor,
+        bpe_symbol=None,
+        escape_unk=False,
+        extra_symbols_to_ignore=None,
+        unk_string=None,
+    ):
+        """Helper for converting a tensor of token indices to a string.
+
+        Can optionally remove BPE symbols or escape <unk> words.
+        A 2-D tensor is rendered row by row, joined with newlines. The eos
+        and bos indices are always skipped, along with any index listed in
+        *extra_symbols_to_ignore*. When *unk_string* is given it is used in
+        place of the unk word.
+        """
+        if torch.is_tensor(tensor) and tensor.dim() == 2:
+            # forward unk_string too, so every row is rendered the same way
+            # as a single 1-D call would be
+            return "\n".join(
+                self.string(
+                    t, bpe_symbol, escape_unk, extra_symbols_to_ignore, unk_string
+                )
+                for t in tensor
+            )
+
+        extra_symbols_to_ignore = set(extra_symbols_to_ignore or [])
+        extra_symbols_to_ignore.add(self.eos())
+
+        def token_string(i):
+            if i == self.unk():
+                if unk_string is not None:
+                    return unk_string
+                else:
+                    return self.unk_string(escape_unk)
+            else:
+                return self[i]
+
+        if hasattr(self, "bos_index"):
+            extra_symbols_to_ignore.add(self.bos())
+
+        sent = " ".join(
+            token_string(i)
+            for i in tensor
+            if utils.item(i) not in extra_symbols_to_ignore
+        )
+
+        return data_utils.post_process(sent, bpe_symbol)
+
+    def unk_string(self, escape=False):
+        """Return unknown string, optionally escaped as: <<unk>>"""
+        if escape:
+            return "<{}>".format(self.unk_word)
+        else:
+            return self.unk_word
+
+    def add_symbol(self, word, n=1, overwrite=False):
+        """Adds a word to the dictionary
+
+        An existing word only has its count increased by *n*, unless
+        *overwrite* is set: then a new entry is appended and the word is
+        re-pointed at it (the old slot stays in ``symbols``). Returns the
+        index of the word.
+        """
+        if word in self.indices and not overwrite:
+            idx = self.indices[word]
+            self.count[idx] = self.count[idx] + n
+            return idx
+        else:
+            idx = len(self.symbols)
+            self.indices[word] = idx
+            self.symbols.append(word)
+            self.count.append(n)
+            return idx
+
+    def update(self, new_dict):
+        """Updates counts from new dictionary."""
+        for word in new_dict.symbols:
+            idx2 = new_dict.indices[word]
+            if word in self.indices:
+                idx = self.indices[word]
+                self.count[idx] = self.count[idx] + new_dict.count[idx2]
+            else:
+                idx = len(self.symbols)
+                self.indices[word] = idx
+                self.symbols.append(word)
+                self.count.append(new_dict.count[idx2])
+
+    def finalize(self, threshold=-1, nwords=-1, padding_factor=8):
+        """Sort symbols by frequency in descending order, ignoring special ones.
+
+        Args:
+            - threshold defines the minimum word count
+            - nwords defines the total number of words in the final dictionary,
+                including special symbols
+            - padding_factor can be used to pad the dictionary size to be a
+                multiple of 8, which is important on some hardware (e.g., Nvidia
+                Tensor Cores).
+        """
+        if nwords <= 0:
+            nwords = len(self)
+
+        new_indices = dict(zip(self.symbols[: self.nspecial], range(self.nspecial)))
+        new_symbols = self.symbols[: self.nspecial]
+        new_count = self.count[: self.nspecial]
+
+        # symbols are inserted into the Counter in sorted order
+        c = Counter(
+            dict(
+                sorted(zip(self.symbols[self.nspecial :], self.count[self.nspecial :]))
+            )
+        )
+        for symbol, count in c.most_common(nwords - self.nspecial):
+            if count >= threshold:
+                new_indices[symbol] = len(new_symbols)
+                new_symbols.append(symbol)
+                new_count.append(count)
+            else:
+                # most_common is descending, so nothing later can pass either
+                break
+
+        assert len(new_symbols) == len(new_indices)
+
+        self.count = list(new_count)
+        self.symbols = list(new_symbols)
+        self.indices = new_indices
+
+        self.pad_to_multiple_(padding_factor)
+
+    def pad_to_multiple_(self, padding_factor):
+        """Pad Dictionary size to be a multiple of *padding_factor*."""
+        if padding_factor > 1:
+            i = 0
+            while len(self) % padding_factor != 0:
+                symbol = "madeupword{:04d}".format(i)
+                self.add_symbol(symbol, n=0)
+                i += 1
+
+    def bos(self):
+        """Helper to get index of beginning-of-sentence symbol"""
+        return self.bos_index
+
+    def pad(self):
+        """Helper to get index of pad symbol"""
+        return self.pad_index
+
+    def eos(self):
+        """Helper to get index of end-of-sentence symbol"""
+        return self.eos_index
+
+    def unk(self):
+        """Helper to get index of unk symbol"""
+        return self.unk_index
+
+    @classmethod
+    def load(cls, f):
+        """Loads the dictionary from a text file with the format:
+
+        ```
+        <symbol0> <count0>
+        <symbol1> <count1>
+        ...
+        ```
+        """
+        d = cls()
+        d.add_from_file(f)
+        return d
+
+    def add_from_file(self, f):
+        """
+        Loads a pre-existing dictionary from a text file and adds its symbols
+        to this instance.
+
+        *f* is either a path (opened as UTF-8) or an open text file.
+
+        Raises:
+            Exception: the file at path *f* is not valid UTF-8.
+            RuntimeError: a word is repeated without the overwrite flag.
+            ValueError: a row is not of the form '<token> <cnt> [flags]'.
+        """
+        if isinstance(f, str):
+            # FileNotFoundError is left to propagate unchanged
+            try:
+                with PathManager.open(f, "r", encoding="utf-8") as fd:
+                    self.add_from_file(fd)
+            except UnicodeError as err:
+                raise Exception(
+                    "Incorrect encoding detected in {}, please "
+                    "rebuild the dataset".format(f)
+                ) from err
+            return
+
+        lines = f.readlines()
+        indices_start_line = self._load_meta(lines)
+
+        for line in lines[indices_start_line:]:
+            try:
+                # the count is the last space-separated field, so the token
+                # itself may contain spaces
+                line, field = line.rstrip().rsplit(" ", 1)
+                if field == "#fairseq:overwrite":
+                    overwrite = True
+                    line, field = line.rsplit(" ", 1)
+                else:
+                    overwrite = False
+                count = int(field)
+                word = line
+                if word in self and not overwrite:
+                    raise RuntimeError(
+                        "Duplicate word found when loading Dictionary: '{}'. "
+                        "Duplicate words can overwrite earlier ones by adding the "
+                        "#fairseq:overwrite flag at the end of the corresponding row "
+                        "in the dictionary file. If using the Camembert model, please "
+                        "download an updated copy of the model file."
+                        .format(word)
+                    )
+                self.add_symbol(word, n=count, overwrite=overwrite)
+            except ValueError:
+                raise ValueError(
+                    "Incorrect dictionary format, expected '<token> <cnt> [flags]'"
+                )
+
+    def _save(self, f, kv_iterator):
+        """Write ``key value`` rows to *f*.
+
+        When *f* is a path, its directory is created, the file is opened as
+        UTF-8 and writing is restarted through ``save`` on the open file.
+        """
+        if isinstance(f, str):
+            PathManager.mkdirs(os.path.dirname(f))
+            with PathManager.open(f, "w", encoding="utf-8") as fd:
+                return self.save(fd)
+        for k, v in kv_iterator:
+            print("{} {}".format(k, v), file=f)
+
+    def _get_meta(self):
+        """Return extra (keys, values) rows written before the symbols; none here."""
+        return [], []
+
+    def _load_meta(self, lines):
+        """Return the line index at which symbol rows start; 0 here."""
+        return 0
+
+    def save(self, f):
+        """Stores dictionary into a text file"""
+        ex_keys, ex_vals = self._get_meta()
+        # the special symbols are not written out
+        self._save(
+            f,
+            zip(
+                ex_keys + self.symbols[self.nspecial :],
+                ex_vals + self.count[self.nspecial :],
+            ),
+        )
+
+    def dummy_sentence(self, length):
+        """Return *length* random token ids ending with eos.
+
+        Ids are drawn uniformly from ``[nspecial + 1, len(self))``.
+        """
+        t = torch.Tensor(length).uniform_(self.nspecial + 1, len(self)).long()
+        t[-1] = self.eos()
+        return t
+
+    def encode_line(
+        self,
+        line,
+        line_tokenizer=tokenize_line,
+        add_if_not_exist=True,
+        consumer=None,
+        append_eos=True,
+        reverse_order=False,
+    ):
+        """Convert a line of text into an ``IntTensor`` of token indices.
+
+        Args:
+            line: text to encode.
+            line_tokenizer: callable splitting *line* into words.
+            add_if_not_exist: add unseen words to the dictionary (and count
+                seen ones) instead of mapping them to unk.
+            consumer: optional callable invoked as ``consumer(word, idx)``.
+            append_eos: append the eos index after the words.
+            reverse_order: encode the words in reverse order.
+        """
+        words = line_tokenizer(line)
+        if reverse_order:
+            words = list(reversed(words))
+        nwords = len(words)
+        ids = torch.IntTensor(nwords + 1 if append_eos else nwords)
+
+        for i, word in enumerate(words):
+            if add_if_not_exist:
+                idx = self.add_symbol(word)
+            else:
+                idx = self.index(word)
+            if consumer is not None:
+                consumer(word, idx)
+            ids[i] = idx
+        if append_eos:
+            ids[nwords] = self.eos_index
+        return ids
+
+    @staticmethod
+    def _add_file_to_dictionary_single_worker(
+        filename, tokenize, eos_word, worker_id=0, num_workers=1
+    ):
+        """Count the words in one worker's share of *filename*.
+
+        The file is split into ``num_workers`` equal chunks by size. A worker
+        starts at the first full line after its offset and stops after the
+        line that crosses the end of its chunk. One *eos_word* is counted
+        per line. Returns a ``Counter``.
+        """
+        counter = Counter()
+        with open(PathManager.get_local_path(filename), "r", encoding="utf-8") as f:
+            size = os.fstat(f.fileno()).st_size
+            chunk_size = size // num_workers
+            offset = worker_id * chunk_size
+            # The last worker also takes the ``size % num_workers`` remainder;
+            # stopping at offset + chunk_size could drop a final line that
+            # starts inside that remainder.
+            if worker_id == num_workers - 1:
+                end = size
+            else:
+                end = offset + chunk_size
+            f.seek(offset)
+            if offset > 0:
+                safe_readline(f)  # drop first incomplete line
+            line = f.readline()
+            while line:
+                for word in tokenize(line):
+                    counter.update([word])
+                counter.update([eos_word])
+                if f.tell() > end:
+                    break
+                line = f.readline()
+        return counter
+
+    @staticmethod
+    def add_file_to_dictionary(filename, dict, tokenize, num_workers):
+        """Add the word counts of *filename* to the dictionary *dict*.
+
+        With ``num_workers > 1`` the file is counted in parallel chunks by a
+        process pool; the per-chunk counters are merged in worker order.
+        """
+        def merge_result(counter):
+            # sorted so that new symbols are added in the same order every run
+            for w, c in sorted(counter.items()):
+                dict.add_symbol(w, c)
+
+        if num_workers > 1:
+            pool = Pool(processes=num_workers)
+            results = []
+            for worker_id in range(num_workers):
+                results.append(
+                    pool.apply_async(
+                        Dictionary._add_file_to_dictionary_single_worker,
+                        (filename, tokenize, dict.eos_word, worker_id, num_workers),
+                    )
+                )
+            pool.close()
+            pool.join()
+            for r in results:
+                merge_result(r.get())
+        else:
+            merge_result(
+                Dictionary._add_file_to_dictionary_single_worker(
+                    filename, tokenize, dict.eos_word
+                )
+            )
+
+
+class TruncatedDictionary(object):
+    """View of a dictionary limited to its first *length* entries.
+
+    The instance takes on the wrapped dictionary's class (as an extra base)
+    and shares its ``__dict__``, so every attribute and method of the wrapped
+    dictionary stays reachable; only ``__len__`` and ``__getitem__`` differ.
+    """
+
+    def __init__(self, wrapped_dict, length):
+        # build a class named like the wrapped one that puts this class
+        # first in the MRO, then adopt it
+        self.__class__ = type(
+            wrapped_dict.__class__.__name__,
+            (self.__class__, wrapped_dict.__class__),
+            {},
+        )
+        # share state with the wrapped dictionary; the two attributes set
+        # below therefore become visible on it as well
+        self.__dict__ = wrapped_dict.__dict__
+        self.wrapped_dict = wrapped_dict
+        self.length = min(len(self.wrapped_dict), length)
+
+    def __len__(self):
+        """Return the truncated size."""
+        return self.length
+
+    def __getitem__(self, i):
+        """Return the symbol at *i*, or the unk symbol past the truncation."""
+        if i < self.length:
+            return self.wrapped_dict[i]
+        # Return the unk *symbol*, matching what in-range lookups return;
+        # the bare ``unk()`` index would hand callers an int.
+        return self.wrapped_dict[self.wrapped_dict.unk()]
diff --git a/fairseq/data/encoders/__init__.py b/fairseq/data/encoders/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c0909b669757e4e5731be8ac28cf9c221a1583ea
--- /dev/null
+++ b/fairseq/data/encoders/__init__.py
@@ -0,0 +1,29 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+import importlib
+import os
+
+from fairseq import registry
+
+
+# Registry for the '--tokenizer' option. setup_registry returns a build
+# function, a registration decorator and the registry mapping (semantics
+# inferred from the names bound here -- see fairseq.registry to confirm).
+build_tokenizer, register_tokenizer, TOKENIZER_REGISTRY = registry.setup_registry(
+    '--tokenizer',
+    default=None,
+)
+
+
+# Same triple for the '--bpe' option; the encoder modules in this package
+# register themselves through ``register_bpe``.
+build_bpe, register_bpe, BPE_REGISTRY = registry.setup_registry(
+    '--bpe',
+    default=None,
+)
+
+
+# automatically import any Python files in the encoders/ directory
+for file in os.listdir(os.path.dirname(__file__)):
+    if file.endswith('.py') and not file.startswith('_'):
+        # Strip only the trailing extension; ``file.find('.py')`` would cut
+        # the name at the first '.py' it contains.
+        module = file[:-len('.py')]
+        importlib.import_module('fairseq.data.encoders.' + module)
diff --git a/fairseq/data/encoders/__pycache__/__init__.cpython-310.pyc b/fairseq/data/encoders/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c8307bad166f28075f5e2b15b8d69c245c9df306
Binary files /dev/null and b/fairseq/data/encoders/__pycache__/__init__.cpython-310.pyc differ
diff --git a/fairseq/data/encoders/__pycache__/byte_bpe.cpython-310.pyc b/fairseq/data/encoders/__pycache__/byte_bpe.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1ecf22e48c8e24bc874d96b5cdc766cf2b7397ef
Binary files /dev/null and b/fairseq/data/encoders/__pycache__/byte_bpe.cpython-310.pyc differ
diff --git a/fairseq/data/encoders/__pycache__/byte_utils.cpython-310.pyc b/fairseq/data/encoders/__pycache__/byte_utils.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..66590e961ff2266dc568ba039186ee13b7b0c375
Binary files /dev/null and b/fairseq/data/encoders/__pycache__/byte_utils.cpython-310.pyc differ
diff --git a/fairseq/data/encoders/__pycache__/bytes.cpython-310.pyc b/fairseq/data/encoders/__pycache__/bytes.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ca307a60e60e93ad624f4f1db8517d4d4c9ffc11
Binary files /dev/null and b/fairseq/data/encoders/__pycache__/bytes.cpython-310.pyc differ
diff --git a/fairseq/data/encoders/__pycache__/characters.cpython-310.pyc b/fairseq/data/encoders/__pycache__/characters.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6a732cb9648b5625fbf05071cb927163c66da9db
Binary files /dev/null and b/fairseq/data/encoders/__pycache__/characters.cpython-310.pyc differ
diff --git a/fairseq/data/encoders/__pycache__/fastbpe.cpython-310.pyc b/fairseq/data/encoders/__pycache__/fastbpe.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..10a3272ef6da700340d12792ed52a426ff22e943
Binary files /dev/null and b/fairseq/data/encoders/__pycache__/fastbpe.cpython-310.pyc differ
diff --git a/fairseq/data/encoders/__pycache__/gpt2_bpe.cpython-310.pyc b/fairseq/data/encoders/__pycache__/gpt2_bpe.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5db4a1da60595bb4efb19e0b0b8acaff5a992ec5
Binary files /dev/null and b/fairseq/data/encoders/__pycache__/gpt2_bpe.cpython-310.pyc differ
diff --git a/fairseq/data/encoders/__pycache__/gpt2_bpe_utils.cpython-310.pyc b/fairseq/data/encoders/__pycache__/gpt2_bpe_utils.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..81b680034357b9d785a811627b1cb96b22ca4bfb
Binary files /dev/null and b/fairseq/data/encoders/__pycache__/gpt2_bpe_utils.cpython-310.pyc differ
diff --git a/fairseq/data/encoders/__pycache__/hf_bert_bpe.cpython-310.pyc b/fairseq/data/encoders/__pycache__/hf_bert_bpe.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..88c5567a8611dc856426f576154c24b67f88b9d0
Binary files /dev/null and b/fairseq/data/encoders/__pycache__/hf_bert_bpe.cpython-310.pyc differ
diff --git a/fairseq/data/encoders/__pycache__/hf_byte_bpe.cpython-310.pyc b/fairseq/data/encoders/__pycache__/hf_byte_bpe.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ddc7736205b6914ebb6589133a914cdb1ee0d810
Binary files /dev/null and b/fairseq/data/encoders/__pycache__/hf_byte_bpe.cpython-310.pyc differ
diff --git a/fairseq/data/encoders/__pycache__/moses_tokenizer.cpython-310.pyc b/fairseq/data/encoders/__pycache__/moses_tokenizer.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c7b725560f01c3f168086307af2c876c4a592d9d
Binary files /dev/null and b/fairseq/data/encoders/__pycache__/moses_tokenizer.cpython-310.pyc differ
diff --git a/fairseq/data/encoders/__pycache__/nltk_tokenizer.cpython-310.pyc b/fairseq/data/encoders/__pycache__/nltk_tokenizer.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..98a6e22dd0a6c31887af105374af7190a0a66cbf
Binary files /dev/null and b/fairseq/data/encoders/__pycache__/nltk_tokenizer.cpython-310.pyc differ
diff --git a/fairseq/data/encoders/__pycache__/sentencepiece_bpe.cpython-310.pyc b/fairseq/data/encoders/__pycache__/sentencepiece_bpe.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5c67c3c92ba28a3f527e7c5543168341d79f481a
Binary files /dev/null and b/fairseq/data/encoders/__pycache__/sentencepiece_bpe.cpython-310.pyc differ
diff --git a/fairseq/data/encoders/__pycache__/space_tokenizer.cpython-310.pyc b/fairseq/data/encoders/__pycache__/space_tokenizer.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f210725b0ccf738576e3a13b36e1e1e56872ec0e
Binary files /dev/null and b/fairseq/data/encoders/__pycache__/space_tokenizer.cpython-310.pyc differ
diff --git a/fairseq/data/encoders/__pycache__/subword_nmt_bpe.cpython-310.pyc b/fairseq/data/encoders/__pycache__/subword_nmt_bpe.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a5bd7d27f9f6d10186aea7729112e78cfb5571cc
Binary files /dev/null and b/fairseq/data/encoders/__pycache__/subword_nmt_bpe.cpython-310.pyc differ
diff --git a/fairseq/data/encoders/__pycache__/utils.cpython-310.pyc b/fairseq/data/encoders/__pycache__/utils.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..91d23e5dc4c756aa47b63049f9cfec3cd400a8bd
Binary files /dev/null and b/fairseq/data/encoders/__pycache__/utils.cpython-310.pyc differ
diff --git a/fairseq/data/encoders/byte_bpe.py b/fairseq/data/encoders/byte_bpe.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d78ff91504d5af66a958e3fc7422b89df95d3dc
--- /dev/null
+++ b/fairseq/data/encoders/byte_bpe.py
@@ -0,0 +1,38 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from fairseq import file_utils
+from fairseq.data.encoders import register_bpe
+from fairseq.data.encoders.byte_utils import (byte_encode, smart_byte_decode,
+                                              SPACE, SPACE_ESCAPE)
+
+
+@register_bpe('byte_bpe')
+class ByteBPE(object):
+    """BPE over byte-encoded text, backed by a sentencepiece model."""
+
+    @staticmethod
+    def add_args(parser):
+        """Register the ``--sentencepiece-model-path`` option on *parser*."""
+        # fmt: off
+        parser.add_argument(
+            '--sentencepiece-model-path',
+            type=str,
+            help='path to sentencepiece model',
+        )
+        # fmt: on
+
+    def __init__(self, args):
+        """Load the sentencepiece model named by ``args.sentencepiece_model_path``."""
+        model_file = file_utils.cached_path(args.sentencepiece_model_path)
+        try:
+            import sentencepiece as spm
+            self.sp = spm.SentencePieceProcessor()
+            self.sp.Load(model_file)
+        except ImportError:
+            raise ImportError('Please install sentencepiece with: pip install sentencepiece')
+
+    def encode(self, x: str) -> str:
+        """Byte-encode *x* and return its sentencepiece pieces joined by spaces."""
+        pieces = self.sp.EncodeAsPieces(byte_encode(x))
+        return SPACE.join(pieces)
+
+    @staticmethod
+    def decode(x: str) -> str:
+        """Invert ``encode``: drop piece separators, restore spaces, decode bytes."""
+        merged = x.replace(SPACE, '')
+        return smart_byte_decode(merged.replace(SPACE_ESCAPE, SPACE))
diff --git a/fairseq/data/encoders/byte_utils.py b/fairseq/data/encoders/byte_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c4bb7471359f54414b83166617475cf58375fca
--- /dev/null
+++ b/fairseq/data/encoders/byte_utils.py
@@ -0,0 +1,51 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import re
+
+# any run of whitespace; byte_encode collapses each run to a single space
+WHITESPACE_NORMALIZER = re.compile(r'\s+')
+SPACE = chr(32)
+# U+2581, the character the encoders use as a stand-in for a literal space
+SPACE_ESCAPE = chr(9601)
+# excluding non-breaking space (160) here
+PRINTABLE_LATIN = set(
+    list(range(32, 126 + 1)) + list(range(161, 172 + 1)) +
+    list(range(174, 255 + 1))
+)
+# Each byte value maps to one character: printable bytes map to themselves,
+# every other byte b to chr(256 + b).
+BYTE_TO_BCHAR = {
+    b: chr(b) if b in PRINTABLE_LATIN else chr(256 + b) for b in range(256)
+}
+# inverse mapping, used when decoding
+BCHAR_TO_BYTE = {bc: b for b, bc in BYTE_TO_BCHAR.items()}
+
+
+def byte_encode(x: str) -> str:
+    """Encode *x* as one character per UTF-8 byte.
+
+    Whitespace runs are first collapsed to a single space; each byte is then
+    mapped through ``BYTE_TO_BCHAR``.
+    """
+    collapsed = WHITESPACE_NORMALIZER.sub(SPACE, x)
+    utf8_bytes = collapsed.encode('utf-8')
+    return ''.join(BYTE_TO_BCHAR[byte] for byte in utf8_bytes)
+
+
+def byte_decode(x: str) -> str:
+    """Decode a byte-character string back to text.
+
+    Returns:
+        The decoded text, or ``''`` when *x* is not a valid encoding: either
+        the bytes are not valid UTF-8 or *x* contains a character that is not
+        a byte character.
+    """
+    try:
+        return bytes([BCHAR_TO_BYTE[bc] for bc in x]).decode('utf-8')
+    except (KeyError, ValueError):
+        # KeyError: a character outside BCHAR_TO_BYTE. Treat it like invalid
+        # UTF-8 so smart_byte_decode can fall back to its recovery path
+        # instead of crashing.
+        return ''
+
+
+def smart_byte_decode(x: str) -> str:
+    """Decode *x*, recovering as much as possible when it is corrupt.
+
+    When ``byte_decode`` yields nothing for the whole string, a dynamic
+    program selects windows of up to 4 characters that decode on their own,
+    maximising the number of such windows, and concatenates their decodings.
+    """
+    output = byte_decode(x)
+    if output != '':
+        return output
+
+    # DP the best recovery (max valid chars) if it's broken
+    length = len(x)
+    best = [0] * (length + 1)   # best[i]: most decodable windows within x[:i]
+    back = [0] * (length + 1)   # back[i]: where the step ending at i started
+    for end in range(1, length + 1):
+        # default step: skip the single character at end - 1
+        best[end], back[end] = best[end - 1], end - 1
+        for width in range(1, min(4, end) + 1):
+            start = end - width
+            if best[start] + 1 > best[end] and len(byte_decode(x[start:end])) > 0:
+                best[end], back[end] = best[start] + 1, start
+
+    # walk the back-pointers from the end, prepending each decoded window
+    pos = length
+    while pos > 0:
+        prev = back[pos]
+        if best[pos] == best[prev] + 1:
+            output = byte_decode(x[prev:pos]) + output
+        pos = prev
+    return output
diff --git a/fairseq/data/encoders/bytes.py b/fairseq/data/encoders/bytes.py
new file mode 100644
index 0000000000000000000000000000000000000000..8bace19c53542bf872c3a02b322c0f39842d423f
--- /dev/null
+++ b/fairseq/data/encoders/bytes.py
@@ -0,0 +1,30 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from fairseq.data.encoders import register_bpe
+from fairseq.data.encoders.byte_utils import (byte_encode, smart_byte_decode,
+                                              SPACE, SPACE_ESCAPE)
+
+
+@register_bpe('bytes')
+class Bytes(object):
+    def __init__(self, args):
+        """Nothing to configure; *args* is ignored."""
+
+    @staticmethod
+    def add_args(parser):
+        """This scheme registers no command-line options."""
+
+    @staticmethod
+    def encode(x: str) -> str:
+        """Byte-encode *x*, escape spaces, and put a space between symbols."""
+        symbols = byte_encode(x).replace(SPACE, SPACE_ESCAPE)
+        return SPACE.join(symbols)
+
+    @staticmethod
+    def decode(x: str) -> str:
+        """Drop separator spaces, restore escaped ones, then byte-decode."""
+        return smart_byte_decode(x.replace(SPACE, '').replace(SPACE_ESCAPE, SPACE))
diff --git a/fairseq/data/encoders/characters.py b/fairseq/data/encoders/characters.py
new file mode 100644
index 0000000000000000000000000000000000000000..db6a58a6502ace4709838ad574757074e893e1a3
--- /dev/null
+++ b/fairseq/data/encoders/characters.py
@@ -0,0 +1,29 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from fairseq.data.encoders import register_bpe
+
+SPACE = chr(32)
+SPACE_ESCAPE = chr(9601)
+
+
+@register_bpe('characters')
+class Characters(object):
+    def __init__(self, args):
+        """Nothing to configure; *args* is ignored."""
+
+    @staticmethod
+    def add_args(parser):
+        """This scheme registers no command-line options."""
+
+    @staticmethod
+    def encode(x: str) -> str:
+        # escape real spaces first so they survive the space-separated output
+        return SPACE.join(x.replace(SPACE, SPACE_ESCAPE))
+
+    @staticmethod
+    def decode(x: str) -> str:
+        return x.translate(str.maketrans({SPACE: None, SPACE_ESCAPE: SPACE}))
diff --git a/fairseq/data/encoders/fastbpe.py b/fairseq/data/encoders/fastbpe.py
new file mode 100644
index 0000000000000000000000000000000000000000..ea0badd544d506bd773b39693c02179ed54ca0b1
--- /dev/null
+++ b/fairseq/data/encoders/fastbpe.py
@@ -0,0 +1,35 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from fairseq import file_utils
+from fairseq.data.encoders import register_bpe
+
+
+@register_bpe('fastbpe')
+class fastBPE(object):
+
+    @staticmethod
+    def add_args(parser):
+        # fmt: off
+        parser.add_argument('--bpe-codes', type=str,
+                            help='path to fastBPE BPE')
+        # fmt: on
+
+    def __init__(self, args):
+        if args.bpe_codes is None:
+            raise ValueError('--bpe-codes is required for --bpe=fastbpe')
+        codes = file_utils.cached_path(args.bpe_codes)
+        try:
+            import fastBPE  # imported here, so the package is only needed when this scheme is built
+            self.bpe = fastBPE.fastBPE(codes)
+            self.bpe_symbol = "@@ "  # continuation marker plus the space that follows it
+        except ImportError:
+            raise ImportError('Please install fastBPE with: pip install fastBPE')
+
+    def encode(self, x: str) -> str:
+        return self.bpe.apply([x])[0]  # apply() is given a one-element list; unwrap its single result
+
+    def decode(self, x: str) -> str:
+        return (x + ' ').replace(self.bpe_symbol, '').rstrip()  # the added space lets a marker ending x match too
diff --git a/fairseq/data/encoders/gpt2_bpe.py b/fairseq/data/encoders/gpt2_bpe.py
new file mode 100644
index 0000000000000000000000000000000000000000..54e0593d00e23bc235e557df29909325f2a17056
--- /dev/null
+++ b/fairseq/data/encoders/gpt2_bpe.py
@@ -0,0 +1,49 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from fairseq import file_utils
+from fairseq.data.encoders import register_bpe
+
+from .gpt2_bpe_utils import get_encoder
+
+
+DEFAULT_ENCODER_JSON = 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json'
+DEFAULT_VOCAB_BPE = 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe'
+
+
+@register_bpe('gpt2')
+class GPT2BPE(object):
+
+    @staticmethod
+    def add_args(parser):
+        # fmt: off
+        parser.add_argument('--gpt2-encoder-json', type=str,
+                            default=DEFAULT_ENCODER_JSON,
+                            help='path to encoder.json')
+        parser.add_argument('--gpt2-vocab-bpe', type=str,
+                            default=DEFAULT_VOCAB_BPE,
+                            help='path to vocab.bpe')
+        # fmt: on
+
+    def __init__(self, args):
+        """Resolve both GPT-2 files via cached_path and build the encoder."""
+        # fall back to the module-level defaults when args lacks either option
+        json_source = getattr(args, 'gpt2_encoder_json', DEFAULT_ENCODER_JSON)
+        merges_source = getattr(args, 'gpt2_vocab_bpe', DEFAULT_VOCAB_BPE)
+        encoder_json = file_utils.cached_path(json_source)
+        vocab_bpe = file_utils.cached_path(merges_source)
+        self.bpe = get_encoder(encoder_json, vocab_bpe)
+
+    def encode(self, x: str) -> str:
+        ids = self.bpe.encode(x)
+        return ' '.join(str(i) for i in ids)
+
+    def decode(self, x: str) -> str:
+        passthrough = {'<unk>', '<mask>'}  # symbols that are not numeric ids
+        pieces = [tok if tok in passthrough else int(tok) for tok in x.split()]
+        return self.bpe.decode(pieces)
+
+    def is_beginning_of_word(self, x: str) -> bool:
+        return self.decode(x).startswith(' ')
diff --git a/fairseq/data/encoders/gpt2_bpe_utils.py b/fairseq/data/encoders/gpt2_bpe_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..1917f823141306e2e7f5a4a19c59d35a370a8711
--- /dev/null
+++ b/fairseq/data/encoders/gpt2_bpe_utils.py
@@ -0,0 +1,127 @@
+"""
+Byte pair encoding utilities from GPT-2.
+
+Original source: https://github.com/openai/gpt-2/blob/master/src/encoder.py
+Original license: MIT
+"""
+
+from functools import lru_cache
+import json
+
+
+@lru_cache()
+def bytes_to_unicode():
+    """
+    Returns a dict mapping every utf-8 byte value (0-255) to a unicode string.
+    The reversible bpe codes work on unicode strings.
+    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
+    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
+    This is a significant percentage of your normal, say, 32K bpe vocab.
+    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
+    And avoids mapping to whitespace/control characters the bpe code barfs on.
+    """
+    printable = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
+    printable_set = set(printable)
+    # bytes outside the printable ranges, in ascending order
+    others = [b for b in range(2**8) if b not in printable_set]
+    # printable bytes keep their own code point; the k-th other byte gets 2**8 + k
+    byte_values = printable + others
+    code_points = printable + [2**8 + k for k in range(len(others))]
+    # keys are inserted printable-first, then the remaining bytes
+    chars = [chr(c) for c in code_points]
+    return dict(zip(byte_values, chars))
+
+def get_pairs(word):
+    """Return set of symbol pairs in a word.
+    Word is represented as tuple of symbols (symbols being variable-length strings).
+
+    Adjacent symbols are paired left to right; a word with fewer than two
+    symbols (including an empty one) yields an empty set.
+    """
+    # zip stops at the shorter argument, so empty and single-symbol words
+    # need no special casing
+    return set(zip(word, word[1:]))
+
+class Encoder:
+    """Byte-level BPE: text <-> tokens via a merge table and a byte/unicode map."""
+    def __init__(self, encoder, bpe_merges, errors='replace'):
+        self.encoder = encoder
+        self.decoder = {v:k for k,v in self.encoder.items()}
+        self.errors = errors # how to handle errors in decoding
+        self.byte_encoder = bytes_to_unicode()
+        self.byte_decoder = {v:k for k, v in self.byte_encoder.items()}
+        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))  # earlier merge = lower rank
+        self.cache = {}  # memo of bpe() results, keyed by token
+
+        try:
+            import regex as re
+            self.re = re
+        except ImportError:
+            raise ImportError('Please install regex with: pip install regex')
+
+        # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
+        self.pat = self.re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
+
+    def bpe(self, token):
+        if token in self.cache:
+            return self.cache[token]
+        word = tuple(token)
+        pairs = get_pairs(word)
+
+        if not pairs:  # nothing to merge; note this result is not cached
+            return token
+
+        while True:
+            bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))  # best-ranked pair present
+            if bigram not in self.bpe_ranks:  # no remaining pair is a known merge
+                break
+            first, second = bigram
+            new_word = []
+            i = 0
+            while i < len(word):
+                try:
+                    j = word.index(first, i)
+                    new_word.extend(word[i:j])
+                    i = j
+                except ValueError:  # tuple.index: `first` does not occur again
+                    new_word.extend(word[i:])
+                    break
+
+                if word[i] == first and i < len(word)-1 and word[i+1] == second:
+                    new_word.append(first+second)
+                    i += 2
+                else:
+                    new_word.append(word[i])
+                    i += 1
+            new_word = tuple(new_word)
+            word = new_word
+            if len(word) == 1:
+                break
+            else:
+                pairs = get_pairs(word)
+        word = ' '.join(word)  # merged symbols are returned space-separated
+        self.cache[token] = word
+        return word
+
+    def encode(self, text):
+        bpe_tokens = []
+        for token in self.re.findall(self.pat, text):
+            token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))  # bytes -> stand-in chars
+            bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
+        return bpe_tokens
+
+    def decode(self, tokens):
+        text = ''.join([self.decoder.get(token, token) for token in tokens])  # unknown tokens pass through as-is
+        text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors)
+        return text
+
+def get_encoder(encoder_json_path, vocab_bpe_path):
+    """Load the encoder JSON and the merges file and build an Encoder."""
+    # read both files as UTF-8 rather than the platform's default encoding
+    with open(encoder_json_path, 'r', encoding="utf-8") as f:
+        encoder = json.load(f)
+    with open(vocab_bpe_path, 'r', encoding="utf-8") as f:
+        bpe_data = f.read()
+    # [1:-1] drops the first line and whatever follows the last newline
+    bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data.split('\n')[1:-1]]
+    return Encoder(encoder=encoder, bpe_merges=bpe_merges)
diff --git a/fairseq/data/encoders/hf_bert_bpe.py b/fairseq/data/encoders/hf_bert_bpe.py
new file mode 100644
index 0000000000000000000000000000000000000000..16adc45aeea4e2c61ed5d99a3561113c7db6411b
--- /dev/null
+++ b/fairseq/data/encoders/hf_bert_bpe.py
@@ -0,0 +1,48 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from fairseq.data.encoders import register_bpe
+
+
+@register_bpe('bert')
+class BertBPE(object):
+    """BPE scheme that delegates to transformers.BertTokenizer."""
+    @staticmethod
+    def add_args(parser):
+        # fmt: off
+        parser.add_argument('--bpe-cased', action='store_true',
+                            help='set for cased BPE',
+                            default=False)
+        parser.add_argument('--bpe-vocab-file', type=str,
+                            help='bpe vocab file.')
+        # fmt: on
+
+    def __init__(self, args):
+        try:
+            from transformers import BertTokenizer
+        except ImportError:
+            raise ImportError(
+                'Please install transformers with: pip install transformers'
+            )
+        # --bpe-vocab-file defaults to None, so test the value, not mere presence
+        if getattr(args, 'bpe_vocab_file', None) is not None:
+            self.bert_tokenizer = BertTokenizer(
+                args.bpe_vocab_file,
+                do_lower_case=not args.bpe_cased
+            )
+        else:
+            vocab_file_name = 'bert-base-cased' if args.bpe_cased else 'bert-base-uncased'
+            self.bert_tokenizer = BertTokenizer.from_pretrained(vocab_file_name)
+
+    def encode(self, x: str) -> str:
+        return ' '.join(self.bert_tokenizer.tokenize(x))
+
+    def decode(self, x: str) -> str:
+        return self.bert_tokenizer.clean_up_tokenization(
+            self.bert_tokenizer.convert_tokens_to_string(x.split(' '))
+        )
+
+    def is_beginning_of_word(self, x: str) -> bool:
+        return not x.startswith('##')  # pieces prefixed with '##' are treated as continuations
diff --git a/fairseq/data/encoders/hf_byte_bpe.py b/fairseq/data/encoders/hf_byte_bpe.py
new file mode 100644
index 0000000000000000000000000000000000000000..2767df044e473ebde9e69536a923e8fa5d9ff500
--- /dev/null
+++ b/fairseq/data/encoders/hf_byte_bpe.py
@@ -0,0 +1,46 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from fairseq.data.encoders import register_bpe
+
+
+@register_bpe('hf_byte_bpe')
+class HuggingFaceByteLevelBPE(object):
+
+    @staticmethod
+    def add_args(parser):
+        # fmt: off
+        parser.add_argument('--bpe-merges', help='path to merges.txt')
+        parser.add_argument('--bpe-vocab', help='path to vocab.json')
+        parser.add_argument('--bpe-add-prefix-space', action='store_true',
+                            help='add prefix space before encoding')
+        # fmt: on
+
+    def __init__(self, args):
+        """Build a ByteLevelBPETokenizer from args.bpe_vocab and args.bpe_merges."""
+        try:
+            from tokenizers import ByteLevelBPETokenizer
+        except ImportError:
+            raise ImportError(
+                'Please install huggingface/tokenizers with: '
+                'pip install tokenizers'
+            )
+
+        # the prefix-space option may be absent from args; default to off
+        prefix_space = getattr(args, 'bpe_add_prefix_space', False)
+        self.bpe = ByteLevelBPETokenizer(
+            args.bpe_vocab, args.bpe_merges, add_prefix_space=prefix_space)
+
+    def encode(self, x: str) -> str:
+        ids = self.bpe.encode(x).ids
+        return ' '.join(str(i) for i in ids)
+
+    def decode(self, x: str) -> str:
+        passthrough = {'<unk>', '<mask>'}  # symbols that are not numeric ids
+        pieces = [tok if tok in passthrough else int(tok) for tok in x.split()]
+        return self.bpe.decode(pieces)
+
+    def is_beginning_of_word(self, x: str) -> bool:
+        return self.decode(x).startswith(' ')
diff --git a/fairseq/data/encoders/moses_tokenizer.py b/fairseq/data/encoders/moses_tokenizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1e7478b9deecc12aaf2bfac066572fca109d0cd
--- /dev/null
+++ b/fairseq/data/encoders/moses_tokenizer.py
@@ -0,0 +1,49 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from fairseq.data.encoders import register_tokenizer
+
+
+@register_tokenizer('moses')
+class MosesTokenizer(object):
+    """Tokenizer/detokenizer pair backed by the sacremoses package."""
+    @staticmethod
+    def add_args(parser):
+        # fmt: off
+        parser.add_argument('--moses-source-lang', metavar='SRC',
+                            help='source language')
+        parser.add_argument('--moses-target-lang', metavar='TARGET',
+                            help='target language')
+        parser.add_argument('--moses-no-dash-splits', action='store_true', default=False,
+                            help='don\'t apply dash split rules')
+        parser.add_argument('--moses-no-escape', action='store_true', default=False,
+                            help='don\'t perform HTML escaping on apostrophy, quotes, etc.')
+        # fmt: on
+
+    def __init__(self, args):
+        self.args = args
+        # NOTE: the fallbacks below are written back onto the caller's args object
+        if getattr(args, 'moses_source_lang', None) is None:
+            args.moses_source_lang = getattr(args, 'source_lang', 'en')
+        if getattr(args, 'moses_target_lang', None) is None:
+            args.moses_target_lang = getattr(args, 'target_lang', 'en')
+        # the names imported below are locals; they shadow this class only inside __init__
+        try:
+            from sacremoses import MosesTokenizer, MosesDetokenizer
+            self.tok = MosesTokenizer(args.moses_source_lang)
+            self.detok = MosesDetokenizer(args.moses_target_lang)
+        except ImportError:
+            raise ImportError('Please install Moses tokenizer with: pip install sacremoses')
+
+    def encode(self, x: str) -> str:
+        return self.tok.tokenize(
+            x,
+            aggressive_dash_splits=(not self.args.moses_no_dash_splits),  # on unless --moses-no-dash-splits
+            return_str=True,
+            escape=(not self.args.moses_no_escape),  # on unless --moses-no-escape
+        )
+
+    def decode(self, x: str) -> str:
+        return self.detok.detokenize(x.split())  # the detokenizer is handed whitespace-split tokens
diff --git a/fairseq/data/encoders/nltk_tokenizer.py b/fairseq/data/encoders/nltk_tokenizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..3db8ee56525196a68383dee5c04a22835e45625b
--- /dev/null
+++ b/fairseq/data/encoders/nltk_tokenizer.py
@@ -0,0 +1,23 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from fairseq.data.encoders import register_tokenizer
+
+
+@register_tokenizer('nltk')
+class NLTKTokenizer(object):
+    """Tokenizer that joins the output of nltk's word_tokenize with spaces."""
+    def __init__(self, source_lang=None, target_lang=None):  # both parameters are accepted but unused
+        try:
+            from nltk.tokenize import word_tokenize
+            self.word_tokenize = word_tokenize
+        except ImportError:
+            raise ImportError('Please install nltk with: pip install nltk')
+
+    def encode(self, x: str) -> str:
+        return ' '.join(self.word_tokenize(x))
+
+    def decode(self, x: str) -> str:
+        return x  # the input is returned unchanged
diff --git a/fairseq/data/encoders/sentencepiece_bpe.py b/fairseq/data/encoders/sentencepiece_bpe.py
new file mode 100644
index 0000000000000000000000000000000000000000..e5ff5db389a4718b028eef75b80616e1f0702b0b
--- /dev/null
+++ b/fairseq/data/encoders/sentencepiece_bpe.py
@@ -0,0 +1,43 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from fairseq import file_utils
+from fairseq.data.encoders import register_bpe
+
+
+@register_bpe('sentencepiece')
+class SentencepieceBPE(object):
+
+    @staticmethod
+    def add_args(parser):
+        # fmt: off
+        parser.add_argument('--sentencepiece-model', type=str,
+                            help='path to sentencepiece model')
+        # fmt: on
+
+    def __init__(self, args):
+        sentencepiece_model = file_utils.cached_path(args.sentencepiece_model)
+        try:
+            import sentencepiece as spm
+            self.sp = spm.SentencePieceProcessor()
+            self.sp.Load(sentencepiece_model)
+        except ImportError:
+            raise ImportError('Please install sentencepiece with: pip install sentencepiece')
+
+    def encode(self, x: str) -> str:
+        pieces = self.sp.EncodeAsPieces(x)
+        return ' '.join(pieces)
+
+    def decode(self, x: str) -> str:
+        # remove the joining spaces, then turn each U+2581 marker back into a space
+        return x.translate(str.maketrans({' ': None, '\u2581': ' '})).strip()
+
+    def is_beginning_of_word(self, x: str) -> bool:
+        # special elements are always considered beginnings
+        # HACK: this logic is already present in fairseq/tasks/masked_lm.py
+        # but these special tokens are also contained in the sentencepiece
+        # vocabulary which causes duplicate special tokens. This hack makes
+        # sure that they are all taken into account.
+        return x in ('<unk>', '<s>', '</s>', '<pad>') or x.startswith('\u2581')
diff --git a/fairseq/data/encoders/space_tokenizer.py b/fairseq/data/encoders/space_tokenizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..670001a8e8514e910c91281a88876261514f9374
--- /dev/null
+++ b/fairseq/data/encoders/space_tokenizer.py
@@ -0,0 +1,21 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import re
+
+from fairseq.data.encoders import register_tokenizer
+
+
+@register_tokenizer('space')
+class SpaceTokenizer(object):
+    """Tokenizer that collapses every run of whitespace into a single space."""
+    def __init__(self, source_lang=None, target_lang=None):  # both parameters are accepted but unused
+        self.space_tok = re.compile(r"\s+")
+
+    def encode(self, x: str) -> str:
+        return self.space_tok.sub(' ', x)
+
+    def decode(self, x: str) -> str:
+        return x  # the input is returned unchanged
diff --git a/fairseq/data/encoders/subword_nmt_bpe.py b/fairseq/data/encoders/subword_nmt_bpe.py
new file mode 100644
index 0000000000000000000000000000000000000000..78f19b43eaec7745ef86f693359946c7b60bd14c
--- /dev/null
+++ b/fairseq/data/encoders/subword_nmt_bpe.py
@@ -0,0 +1,48 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from fairseq import file_utils
+from fairseq.data.encoders import register_bpe
+
+
+@register_bpe('subword_nmt')
+class SubwordNMTBPE(object):
+    """BPE scheme that delegates to subword_nmt's apply_bpe.BPE."""
+    @staticmethod
+    def add_args(parser):
+        # fmt: off
+        parser.add_argument('--bpe-codes', type=str,
+                            help='path to subword NMT BPE')
+        parser.add_argument('--bpe-separator', default='@@',
+                            help='BPE separator')
+        # fmt: on
+
+    def __init__(self, args):
+        if args.bpe_codes is None:
+            raise ValueError('--bpe-codes is required for --bpe=subword_nmt')
+        codes = file_utils.cached_path(args.bpe_codes)
+        try:
+            from subword_nmt import apply_bpe
+            bpe_parser = apply_bpe.create_parser()  # reuse the library's own argument parser
+            bpe_args = bpe_parser.parse_args([
+                '--codes', codes,
+                '--separator', args.bpe_separator,
+            ])
+            self.bpe = apply_bpe.BPE(
+                bpe_args.codes,
+                bpe_args.merges,
+                bpe_args.separator,
+                None,  # NOTE(review): presumably the vocabulary argument, left unset; confirm against apply_bpe.BPE
+                bpe_args.glossaries,
+            )
+            self.bpe_symbol = bpe_args.separator + ' '  # separator plus the space that follows it
+        except ImportError:
+            raise ImportError('Please install subword_nmt with: pip install subword-nmt')
+
+    def encode(self, x: str) -> str:
+        return self.bpe.process_line(x)
+
+    def decode(self, x: str) -> str:
+        return (x + ' ').replace(self.bpe_symbol, '').rstrip()  # the added space lets a separator ending x match too
diff --git a/fairseq/data/encoders/utils.py b/fairseq/data/encoders/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..a0e491c143c17b8364cb1d4a730c7415fb116907
--- /dev/null
+++ b/fairseq/data/encoders/utils.py
@@ -0,0 +1,28 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from fairseq.data import encoders
+
+
+def get_whole_word_mask(args, dictionary):
+    """Build a ByteTensor marking which dictionary entries begin a word (None without BPE)."""
+    bpe = encoders.build_bpe(args)
+    if bpe is None:
+        return None
+
+    def starts_word(idx):
+        if idx < dictionary.nspecial:
+            return True  # special elements are always considered beginnings
+        symbol = dictionary[idx]
+        if symbol.startswith('madeupword'):
+            return True
+        try:
+            return bpe.is_beginning_of_word(symbol)
+        except ValueError:
+            return True
+
+    flags = [starts_word(idx) for idx in range(len(dictionary))]
+    return torch.ByteTensor(flags)
diff --git a/fairseq/data/fairseq_dataset.py b/fairseq/data/fairseq_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..900bfaff101d9b22bcb60fab78546517ed910a8d
--- /dev/null
+++ b/fairseq/data/fairseq_dataset.py
@@ -0,0 +1,162 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+import torch.utils.data
+
+from fairseq.data import data_utils
+
+
+class EpochListening:
+    """Mixin giving a class a hook that receives each new epoch number."""
+
+    def set_epoch(self, epoch):
+        """Receive the updated epoch number at the beginning of the epoch; a no-op by default."""
+        return None
+
+
+class FairseqDataset(torch.utils.data.Dataset, EpochListening):
+    """A dataset that provides helpers for batching."""
+
+    def __getitem__(self, index):
+        raise NotImplementedError  # must be provided by subclasses
+
+    def __len__(self):
+        raise NotImplementedError  # must be provided by subclasses
+
+    def collater(self, samples):
+        """Merge a list of samples to form a mini-batch.
+
+        Args:
+            samples (List[dict]): samples to collate
+
+        Returns:
+            dict: a mini-batch suitable for forwarding with a Model
+        """
+        raise NotImplementedError
+
+    def num_tokens(self, index):
+        """Return the number of tokens in a sample. This value is used to
+        enforce ``--max-tokens`` during batching."""
+        raise NotImplementedError
+
+    def size(self, index):
+        """Return an example's size as a float or tuple. This value is used when
+        filtering a dataset with ``--max-positions``."""
+        raise NotImplementedError
+
+    def ordered_indices(self):
+        """Return an ordered list of indices. Batches will be constructed based
+        on this order."""
+        return np.arange(len(self))  # default order: 0 .. len(self) - 1
+
+    @property
+    def supports_prefetch(self):
+        """Whether this dataset supports prefetching."""
+        return False
+
+    def attr(self, attr: str, index: int):  # index is accepted but not used here
+        return getattr(self, attr, None)  # None when the attribute does not exist
+
+    def prefetch(self, indices):
+        """Prefetch the data required for this epoch."""
+        raise NotImplementedError
+
+    def get_batch_shapes(self):
+        """
+        Return a list of valid batch shapes, for example::
+
+            [(8, 512), (16, 256), (32, 128)]
+
+        The first dimension of each tuple is the batch size and can be ``None``
+        to automatically infer the max batch size based on ``--max-tokens``.
+        The second dimension of each tuple is the max supported length as given
+        by :func:`fairseq.data.FairseqDataset.num_tokens`.
+
+        This will be used by :func:`fairseq.data.FairseqDataset.batch_by_size`
+        to restrict batch shapes. This is useful on TPUs to avoid too many
+        dynamic shapes (and recompilations).
+        """
+        return None  # default: no restriction on batch shapes
+
+    def batch_by_size(
+        self,
+        indices,
+        max_tokens=None,
+        max_sentences=None,
+        required_batch_size_multiple=1,
+    ):
+        """
+        Given an ordered set of indices, return batches according to
+        *max_tokens*, *max_sentences* and *required_batch_size_multiple*.
+        """
+        from fairseq.data import data_utils  # function-local import of the module also imported at file top
+
+        fixed_shapes = self.get_batch_shapes()
+        if fixed_shapes is not None:
+
+            def adjust_bsz(bsz, num_tokens):
+                if bsz is None:  # derive the batch size from the token budget
+                    assert max_tokens is not None, 'Must specify --max-tokens'
+                    bsz = max_tokens // num_tokens
+                if max_sentences is not None:
+                    bsz = min(bsz, max_sentences)
+                elif (  # rounding down to the multiple only happens when max_sentences is unset
+                    bsz >= required_batch_size_multiple
+                    and bsz % required_batch_size_multiple != 0
+                ):
+                    bsz -= (bsz % required_batch_size_multiple)
+                return bsz
+
+            fixed_shapes = np.array([  # one [batch size, length] row per declared shape
+                [adjust_bsz(bsz, num_tokens), num_tokens]
+                for (bsz, num_tokens) in fixed_shapes
+            ])
+
+        return data_utils.batch_by_size(
+            indices,
+            num_tokens_fn=self.num_tokens,
+            max_tokens=max_tokens,
+            max_sentences=max_sentences,
+            required_batch_size_multiple=required_batch_size_multiple,
+            fixed_shapes=fixed_shapes,
+        )
+
+    def filter_indices_by_size(self, indices, max_sizes):
+        """ Filter a list of sample indices. Remove those that are longer
+            than specified in max_sizes.
+
+            WARNING: don't update, override method in child classes
+
+        Args:
+            indices (np.array): original array of sample indices
+            max_sizes (int or list[int] or tuple[int]): max sample size,
+                can be defined separately for src and tgt (then list or tuple)
+
+        Returns:
+            np.array: filtered sample array
+            list: list of removed indices
+        """
+        if isinstance(max_sizes, float) or isinstance(max_sizes, int):
+            if hasattr(self, 'sizes') and isinstance(self.sizes, np.ndarray):  # sizes held in one array
+                ignored = indices[self.sizes[indices] > max_sizes].tolist()
+                indices = indices[self.sizes[indices] <= max_sizes]
+            elif hasattr(self, 'sizes') and isinstance(self.sizes, list) and len(self.sizes) == 1:  # one array wrapped in a list
+                ignored = indices[self.sizes[0][indices] > max_sizes].tolist()
+                indices = indices[self.sizes[0][indices] <= max_sizes]
+            else:  # no usable sizes attribute: fall back to calling self.size per index
+                indices, ignored = data_utils._filter_by_size_dynamic(indices, self.size, max_sizes)
+        else:  # non-scalar max_sizes always takes the dynamic path
+            indices, ignored = data_utils._filter_by_size_dynamic(indices, self.size, max_sizes)
+        return indices, ignored
+
+
+class FairseqIterableDataset(torch.utils.data.IterableDataset, EpochListening):
+    """For datasets that need to be read sequentially, usually because the data
+    is being streamed or otherwise can't be manipulated on a single machine.
+    """
+
+    def __iter__(self):
+        raise NotImplementedError  # must be provided by subclasses
diff --git a/fairseq/data/id_dataset.py b/fairseq/data/id_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..6a73ba1ff74e53d8789581c24880c0b6485cffa7
--- /dev/null
+++ b/fairseq/data/id_dataset.py
@@ -0,0 +1,20 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+
+from . import FairseqDataset
+
+
+class IdDataset(FairseqDataset):
+    """Dataset whose item at a given index is that index itself."""
+    def __getitem__(self, index):
+        return index
+
+    def __len__(self):
+        return 0  # NOTE(review): always reports length 0; presumably intentional, confirm against callers
+
+    def collater(self, samples):
+        return torch.tensor(samples)  # pack the collected samples into a tensor
diff --git a/fairseq/data/indexed_dataset.py b/fairseq/data/indexed_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..12497989bbe3a52bd70dc222538cb4fb5eb70b43
--- /dev/null
+++ b/fairseq/data/indexed_dataset.py
@@ -0,0 +1,523 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from functools import lru_cache
+import os
+import shutil
+import struct
+
+import numpy as np
+import torch
+
+from . import FairseqDataset
+
+
+def __best_fitting_dtype(vocab_size=None):
+    if vocab_size is not None and vocab_size < 65500:
+        return np.uint16
+    else:
+        return np.int32
+
+
+def get_available_dataset_impl():
+    return ['raw', 'lazy', 'cached', 'mmap']
+
+
+def infer_dataset_impl(path):
+    if IndexedRawTextDataset.exists(path):
+        return 'raw'
+    elif IndexedDataset.exists(path):
+        with open(index_file_path(path), 'rb') as f:
+            magic = f.read(8)
+            if magic == IndexedDataset._HDR_MAGIC:
+                return 'cached'
+            elif magic == MMapIndexedDataset.Index._HDR_MAGIC[:8]:
+                return 'mmap'
+            else:
+                return None
+    else:
+        return None
+
+
+def make_builder(out_file, impl, vocab_size=None):
+    if impl == 'mmap':
+        return MMapIndexedDatasetBuilder(out_file, dtype=__best_fitting_dtype(vocab_size))
+    else:
+        return IndexedDatasetBuilder(out_file)
+
+
+def make_dataset(path, impl, fix_lua_indexing=False, dictionary=None):
+    if impl == 'raw' and IndexedRawTextDataset.exists(path):
+        assert dictionary is not None
+        return IndexedRawTextDataset(path, dictionary)
+    elif impl == 'lazy' and IndexedDataset.exists(path):
+        return IndexedDataset(path, fix_lua_indexing=fix_lua_indexing)
+    elif impl == 'cached' and IndexedDataset.exists(path):
+        return IndexedCachedDataset(path, fix_lua_indexing=fix_lua_indexing)
+    elif impl == 'mmap' and MMapIndexedDataset.exists(path):
+        return MMapIndexedDataset(path)
+    return None
+
+
+def dataset_exists(path, impl):
+    if impl == 'raw':
+        return IndexedRawTextDataset.exists(path)
+    elif impl == 'mmap':
+        return MMapIndexedDataset.exists(path)
+    else:
+        return IndexedDataset.exists(path)
+
+
+def read_longs(f, n):
+    a = np.empty(n, dtype=np.int64)
+    f.readinto(a)
+    return a
+
+
+def write_longs(f, a):
+    f.write(np.array(a, dtype=np.int64))
+
+
+dtypes = {
+    1: np.uint8,
+    2: np.int8,
+    3: np.int16,
+    4: np.int32,
+    5: np.int64,
+    6: float,  # formerly np.float (alias of the builtin, removed in NumPy 1.24)
+    7: np.double,
+    8: np.uint16
+}
+
+
+def code(dtype):
+    for k in dtypes.keys():
+        if dtypes[k] == dtype:
+            return k
+    raise ValueError(dtype)
+
+
+def index_file_path(prefix_path):
+    return prefix_path + '.idx'
+
+
+def data_file_path(prefix_path):
+    return prefix_path + '.bin'
+
+
+class IndexedDataset(FairseqDataset):
+    """Loader for TorchNet IndexedDataset"""
+    _HDR_MAGIC = b'TNTIDX\x00\x00'
+
+    def __init__(self, path, fix_lua_indexing=False):
+        super().__init__()
+        self.path = path
+        self.fix_lua_indexing = fix_lua_indexing
+        self.data_file = None
+        self.read_index(path)
+
+    def read_index(self, path):
+        with open(index_file_path(path), 'rb') as f:
+            magic = f.read(8)
+            assert magic == self._HDR_MAGIC, (
+                'Index file doesn\'t match expected format. '
+                'Make sure that --dataset-impl is configured properly.'
+            )
+            version = f.read(8)
+            assert struct.unpack('<Q', version) == (1,)
+            code, self.element_size = struct.unpack('<QQ', f.read(16))
+            self.dtype = dtypes[code]
+            self._len, self.s = struct.unpack('<QQ', f.read(16))
+            self.dim_offsets = read_longs(f, self._len + 1)
+            self.data_offsets = read_longs(f, self._len + 1)
+            self.sizes = read_longs(f, self.s)
+
+    def read_data(self, path):
+        self.data_file = open(data_file_path(path), 'rb', buffering=0)
+
+    def check_index(self, i):
+        if i < 0 or i >= self._len:
+            raise IndexError('index out of range')
+
+    def __del__(self):
+        if self.data_file:
+            self.data_file.close()
+
+    @lru_cache(maxsize=8)
+    def __getitem__(self, i):
+        if not self.data_file:
+            self.read_data(self.path)
+        self.check_index(i)
+        tensor_size = self.sizes[self.dim_offsets[i]:self.dim_offsets[i + 1]]
+        a = np.empty(tensor_size, dtype=self.dtype)
+        self.data_file.seek(self.data_offsets[i] * self.element_size)
+        self.data_file.readinto(a)
+        item = torch.from_numpy(a).long()
+        if self.fix_lua_indexing:
+            item -= 1  # subtract 1 for 0-based indexing
+        return item
+
+    def __len__(self):
+        return self._len
+
+    def num_tokens(self, index):
+        return self.sizes[index]
+
+    def size(self, index):
+        return self.sizes[index]
+
+    @staticmethod
+    def exists(path):
+        return (
+            os.path.exists(index_file_path(path)) and os.path.exists(data_file_path(path))
+        )
+
+    @property
+    def supports_prefetch(self):
+        return False  # avoid prefetching to save memory
+
+
+class IndexedCachedDataset(IndexedDataset):
+
+    def __init__(self, path, fix_lua_indexing=False):
+        super().__init__(path, fix_lua_indexing=fix_lua_indexing)
+        self.cache = None
+        self.cache_index = {}
+
+    @property
+    def supports_prefetch(self):
+        return True
+
+    def prefetch(self, indices):
+        if all(i in self.cache_index for i in indices):
+            return
+        if not self.data_file:
+            self.read_data(self.path)
+        indices = sorted(set(indices))
+        total_size = 0
+        for i in indices:
+            total_size += self.data_offsets[i + 1] - self.data_offsets[i]
+        self.cache = np.empty(total_size, dtype=self.dtype)
+        ptx = 0
+        self.cache_index.clear()
+        for i in indices:
+            self.cache_index[i] = ptx
+            size = self.data_offsets[i + 1] - self.data_offsets[i]
+            a = self.cache[ptx: ptx + size]
+            self.data_file.seek(self.data_offsets[i] * self.element_size)
+            self.data_file.readinto(a)
+            ptx += size
+        if self.data_file:
+            # close and delete data file after prefetch so we can pickle
+            self.data_file.close()
+            self.data_file = None
+
+    @lru_cache(maxsize=8)
+    def __getitem__(self, i):
+        self.check_index(i)
+        tensor_size = self.sizes[self.dim_offsets[i]:self.dim_offsets[i + 1]]
+        a = np.empty(tensor_size, dtype=self.dtype)
+        ptx = self.cache_index[i]
+        np.copyto(a, self.cache[ptx: ptx + a.size])
+        item = torch.from_numpy(a).long()
+        if self.fix_lua_indexing:
+            item -= 1  # subtract 1 for 0-based indexing
+        return item
+
+
+class IndexedRawTextDataset(FairseqDataset):
+    """Takes a text file as input and binarizes it in memory at instantiation.
+    Original lines are also kept in memory"""
+
+    def __init__(self, path, dictionary, append_eos=True, reverse_order=False):
+        self.tokens_list = []
+        self.lines = []
+        self.sizes = []
+        self.append_eos = append_eos
+        self.reverse_order = reverse_order
+        self.read_data(path, dictionary)
+        self.size = len(self.tokens_list)
+
+    def read_data(self, path, dictionary):
+        with open(path, 'r', encoding='utf-8') as f:
+            for line in f:
+                self.lines.append(line.strip('\n'))
+                tokens = dictionary.encode_line(
+                    line, add_if_not_exist=False,
+                    append_eos=self.append_eos, reverse_order=self.reverse_order,
+                ).long()
+                self.tokens_list.append(tokens)
+                self.sizes.append(len(tokens))
+        self.sizes = np.array(self.sizes)
+
+    def check_index(self, i):
+        if i < 0 or i >= self.size:
+            raise IndexError('index out of range')
+
+    @lru_cache(maxsize=8)
+    def __getitem__(self, i):
+        self.check_index(i)
+        return self.tokens_list[i]
+
+    def get_original_text(self, i):
+        self.check_index(i)
+        return self.lines[i]
+
+    def __del__(self):
+        pass
+
+    def __len__(self):
+        return self.size
+
+    def num_tokens(self, index):
+        return self.sizes[index]
+
+    def size(self, index):
+        return self.sizes[index]
+
+    @staticmethod
+    def exists(path):
+        return os.path.exists(path)
+
+
+class IndexedDatasetBuilder(object):
+    element_sizes = {
+        np.uint8: 1,
+        np.int8: 1,
+        np.int16: 2,
+        np.int32: 4,
+        np.int64: 8,
+        float: 4,  # formerly np.float (alias of the builtin, removed in NumPy 1.24)
+        np.double: 8
+    }
+
+    def __init__(self, out_file, dtype=np.int32):
+        self.out_file = open(out_file, 'wb')
+        self.dtype = dtype
+        self.data_offsets = [0]
+        self.dim_offsets = [0]
+        self.sizes = []
+        self.element_size = self.element_sizes[self.dtype]
+
+    def add_item(self, tensor):
+        # +1 for Lua compatibility; write() returns the number of bytes written
+        nbytes = self.out_file.write(np.array(tensor.numpy() + 1, dtype=self.dtype))
+        self.data_offsets.append(self.data_offsets[-1] + nbytes // self.element_size)
+        for s in tensor.size():
+            self.sizes.append(s)
+        self.dim_offsets.append(self.dim_offsets[-1] + len(tensor.size()))
+
+    def merge_file_(self, another_file):
+        index = IndexedDataset(another_file)
+        assert index.dtype == self.dtype
+
+        begin = self.data_offsets[-1]
+        for offset in index.data_offsets[1:]:
+            self.data_offsets.append(begin + offset)
+        self.sizes.extend(index.sizes)
+        begin = self.dim_offsets[-1]
+        for dim_offset in index.dim_offsets[1:]:
+            self.dim_offsets.append(begin + dim_offset)
+
+        with open(data_file_path(another_file), 'rb') as f:
+            while True:
+                data = f.read(1024)
+                if data:
+                    self.out_file.write(data)
+                else:
+                    break
+
+    def finalize(self, index_file):
+        self.out_file.close()
+        # Context manager closes the index file even if one of the writes fails.
+        with open(index_file, 'wb') as index:
+            index.write(b'TNTIDX\x00\x00')
+            index.write(struct.pack('<Q', 1))
+            index.write(struct.pack('<QQ', code(self.dtype), self.element_size))
+            index.write(struct.pack('<QQ', len(self.data_offsets) - 1, len(self.sizes)))
+            write_longs(index, self.dim_offsets)
+            write_longs(index, self.data_offsets)
+            write_longs(index, self.sizes)
+
+
+def _warmup_mmap_file(path):
+    with open(path, 'rb') as stream:
+        while stream.read(100 * 1024 * 1024):
+            pass
+
+
+class MMapIndexedDataset(torch.utils.data.Dataset):
+    class Index(object):
+        _HDR_MAGIC = b'MMIDIDX\x00\x00'
+
+        @classmethod
+        def writer(cls, path, dtype):
+            class _Writer(object):
+                def __enter__(self):
+                    self._file = open(path, 'wb')
+
+                    self._file.write(cls._HDR_MAGIC)
+                    self._file.write(struct.pack('<Q', 1))
+                    self._file.write(struct.pack('<B', code(dtype)))
+
+                    return self
+
+                @staticmethod
+                def _get_pointers(sizes):
+                    dtype_size = dtype().itemsize
+                    address = 0
+                    pointers = []
+
+                    for size in sizes:
+                        pointers.append(address)
+                        address += size * dtype_size
+
+                    return pointers
+
+                def write(self, sizes):
+                    pointers = self._get_pointers(sizes)
+
+                    self._file.write(struct.pack('<Q', len(sizes)))
+
+                    sizes = np.array(sizes, dtype=np.int32)
+                    self._file.write(sizes.tobytes(order='C'))
+                    del sizes
+
+                    pointers = np.array(pointers, dtype=np.int64)
+                    self._file.write(pointers.tobytes(order='C'))
+                    del pointers
+
+                def __exit__(self, exc_type, exc_val, exc_tb):
+                    self._file.close()
+
+            return _Writer()
+
+        def __init__(self, path):
+            with open(path, 'rb') as stream:
+                magic_test = stream.read(9)
+                assert self._HDR_MAGIC == magic_test, (
+                    'Index file doesn\'t match expected format. '
+                    'Make sure that --dataset-impl is configured properly.'
+                )
+                version = struct.unpack('<Q', stream.read(8))
+                assert (1,) == version
+
+                dtype_code, = struct.unpack('<B', stream.read(1))
+                self._dtype = dtypes[dtype_code]
+                self._dtype_size = self._dtype().itemsize
+
+                self._len = struct.unpack('<Q', stream.read(8))[0]
+                offset = stream.tell()
+
+            _warmup_mmap_file(path)
+
+            self._bin_buffer_mmap = np.memmap(path, mode='r', order='C')
+            self._bin_buffer = memoryview(self._bin_buffer_mmap)
+            self._sizes = np.frombuffer(self._bin_buffer, dtype=np.int32, count=self._len, offset=offset)
+            self._pointers = np.frombuffer(self._bin_buffer, dtype=np.int64, count=self._len,
+                                           offset=offset + self._sizes.nbytes)
+
+        def __del__(self):
+            self._bin_buffer_mmap._mmap.close()
+            del self._bin_buffer_mmap
+
+        @property
+        def dtype(self):
+            return self._dtype
+
+        @property
+        def sizes(self):
+            return self._sizes
+
+        @lru_cache(maxsize=8)
+        def __getitem__(self, i):
+            return self._pointers[i], self._sizes[i]
+
+        def __len__(self):
+            return self._len
+
+    def __init__(self, path):
+        super().__init__()
+
+        self._path = None
+        self._index = None
+        self._bin_buffer = None
+
+        self._do_init(path)
+
+    def __getstate__(self):
+        return self._path
+
+    def __setstate__(self, state):
+        self._do_init(state)
+
+    def _do_init(self, path):
+        self._path = path
+        self._index = self.Index(index_file_path(self._path))
+
+        _warmup_mmap_file(data_file_path(self._path))
+        self._bin_buffer_mmap = np.memmap(data_file_path(self._path), mode='r', order='C')
+        self._bin_buffer = memoryview(self._bin_buffer_mmap)
+
+    def __del__(self):
+        self._bin_buffer_mmap._mmap.close()
+        del self._bin_buffer_mmap
+        del self._index
+
+    def __len__(self):
+        return len(self._index)
+
+    @lru_cache(maxsize=8)
+    def __getitem__(self, i):
+        ptr, size = self._index[i]
+        np_array = np.frombuffer(self._bin_buffer, dtype=self._index.dtype, count=size, offset=ptr)
+        if self._index.dtype != np.int64:
+            np_array = np_array.astype(np.int64)
+
+        return torch.from_numpy(np_array)
+
+    @property
+    def sizes(self):
+        return self._index.sizes
+
+    @property
+    def supports_prefetch(self):
+        return False
+
+    @staticmethod
+    def exists(path):
+        return (
+            os.path.exists(index_file_path(path)) and os.path.exists(data_file_path(path))
+        )
+
+
+class MMapIndexedDatasetBuilder(object):
+    def __init__(self, out_file, dtype=np.int64):
+        self._data_file = open(out_file, 'wb')
+        self._dtype = dtype
+        self._sizes = []
+
+    def add_item(self, tensor):
+        np_array = np.array(tensor.numpy(), dtype=self._dtype)
+        self._data_file.write(np_array.tobytes(order='C'))
+        self._sizes.append(np_array.size)
+
+    def merge_file_(self, another_file):
+        # Concatenate index
+        index = MMapIndexedDataset.Index(index_file_path(another_file))
+        assert index.dtype == self._dtype
+
+        for size in index.sizes:
+            self._sizes.append(size)
+
+        # Concatenate data
+        with open(data_file_path(another_file), 'rb') as f:
+            shutil.copyfileobj(f, self._data_file)
+
+    def finalize(self, index_file):
+        self._data_file.close()
+
+        with MMapIndexedDataset.Index.writer(index_file, self._dtype) as index:
+            index.write(self._sizes)
diff --git a/fairseq/data/iterators.py b/fairseq/data/iterators.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f4a616c6541235855f7683a3394e1de05be3004
--- /dev/null
+++ b/fairseq/data/iterators.py
@@ -0,0 +1,520 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import itertools
+import logging
+import math
+import operator
+import os
+import queue
+import time
+from threading import Thread
+
+import numpy as np
+import torch
+
+from fairseq.data import data_utils
+
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.DEBUG)
+
+# Object used by BackgroundConsumer to signal to the main thread that the
+# source is exhausted.
+_sentinel = object()
+
+
+class CountingIterator(object):
+    """Wrapper around an iterable that maintains the iteration count.
+
+    Args:
+        iterable (iterable): iterable to wrap
+        start (int): starting iteration count. Note that this doesn't
+            actually advance the iterator.
+        total (int): override the iterator length returned by
+            ``__len__``. This can be used to truncate *iterator*.
+
+    Attributes:
+        n (int): number of elements consumed from this iterator
+    """
+
+    def __init__(self, iterable, start=None, total=None):
+        self.iterable = iterable
+        self.itr = iter(self)
+
+        if start is None:
+            self.n = getattr(iterable, 'n', 0)
+        else:
+            self.n = start
+
+        if total is None:
+            self.total = self.n + len(iterable)
+        else:
+            self.total = total
+
+    def __len__(self):
+        return self.total
+
+    def __iter__(self):
+        for x in self.iterable:
+            if self.n >= self.total:
+                return
+            self.n += 1
+            yield x
+
+    def __next__(self):
+        return next(self.itr)
+
+    def has_next(self):
+        """Whether the iterator still has elements left (i.e. is not exhausted)."""
+        return self.n < len(self)
+
+    def skip(self, num_to_skip):
+        """Fast-forward the iterator by skipping *num_to_skip* elements."""
+        next(itertools.islice(self.itr, num_to_skip, num_to_skip), None)
+        return self
+
+    def take(self, n):
+        """
+        Truncates the iterator to n elements at most.
+        """
+        self.total = min(self.total, n)
+
+        # Propagate this change to the underlying iterator
+        if hasattr(self.iterable, "take"):
+            self.iterable.take(n)
+
+
+class EpochBatchIterating(object):
+    def __len__(self) -> int:
+        raise NotImplementedError
+
+    @property
+    def next_epoch_idx(self):
+        raise NotImplementedError
+
+    def next_epoch_itr(self, shuffle=True, fix_batches_to_gpus=False):
+        """Return a new iterator over the dataset.
+
+        Args:
+            shuffle (bool, optional): shuffle batches before returning the
+                iterator (default: True).
+            fix_batches_to_gpus: ensure that batches are always
+                allocated to the same shards across epochs. Requires
+                that :attr:`dataset` supports prefetching (default: False).
+        """
+        raise NotImplementedError
+
+    def end_of_epoch(self) -> bool:
+        """Returns whether the most recent epoch iterator has been exhausted"""
+        raise NotImplementedError
+
+    @property
+    def iterations_in_epoch(self) -> int:
+        """The number of consumed batches in the current epoch."""
+        raise NotImplementedError
+
+    def state_dict(self):
+        """Returns a dictionary containing a whole state of the iterator."""
+        raise NotImplementedError
+
+    def load_state_dict(self, state_dict):
+        """Copies the state of the iterator from the given *state_dict*."""
+        raise NotImplementedError
+
+
+class StreamingEpochBatchIterator(EpochBatchIterating):
+    def __init__(
+        self, dataset, epoch=1, num_shards=1, shard_id=0,
+    ):
+        assert isinstance(dataset, torch.utils.data.IterableDataset)
+        self.dataset = dataset
+        self.epoch = max(epoch, 1)  # we use 1-based indexing for epochs
+        self._current_epoch_iterator = None
+        self.num_shards = num_shards
+        self.shard_id = shard_id
+
+    @property
+    def next_epoch_idx(self):
+        """Return the epoch index after *next_epoch_itr* is called."""
+        if self._current_epoch_iterator is not None and self.end_of_epoch():
+            return self.epoch + 1
+        else:
+            return self.epoch
+
+    def next_epoch_itr(self, shuffle=True, fix_batches_to_gpus=False):
+        self.epoch = self.next_epoch_idx
+        self.dataset.set_epoch(self.epoch)
+        self._current_epoch_iterator = CountingIterator(
+            iterable=ShardedIterator(
+                iterable=self.dataset,
+                num_shards=self.num_shards,
+                shard_id=self.shard_id,
+            ),
+        )
+        return self._current_epoch_iterator
+
+    def end_of_epoch(self) -> bool:
+        return not self._current_epoch_iterator.has_next()
+
+    @property
+    def iterations_in_epoch(self) -> int:
+        if self._current_epoch_iterator is not None:
+            return self._current_epoch_iterator.n
+        return 0
+
+    def state_dict(self):
+        return {
+            'epoch': self.epoch,
+        }
+
+    def load_state_dict(self, state_dict):
+        self.epoch = state_dict['epoch']
+
+
+class EpochBatchIterator(EpochBatchIterating):
+    """A multi-epoch iterator over a :class:`torch.utils.data.Dataset`.
+
+    Compared to :class:`torch.utils.data.DataLoader`, this iterator:
+
+    - can be reused across multiple epochs with the :func:`next_epoch_itr`
+      method (optionally shuffled between epochs)
+    - can be serialized/deserialized with the :func:`state_dict` and
+      :func:`load_state_dict` methods
+    - supports sharding with the *num_shards* and *shard_id* arguments
+
+    Args:
+        dataset (~torch.utils.data.Dataset): dataset from which to load the data
+        collate_fn (callable): merges a list of samples to form a mini-batch
+        batch_sampler (~torch.utils.data.Sampler or a callable): an iterator over batches of
+            indices, or a callable to create such an iterator (~torch.utils.data.Sampler).
+            A callable batch_sampler will be called for each epoch to enable per epoch dynamic
+            batch iterators defined by this callable batch_sampler.
+        seed (int, optional): seed for random number generator for
+            reproducibility (default: 1).
+        num_shards (int, optional): shard the data iterator into N
+            shards (default: 1).
+        shard_id (int, optional): which shard of the data iterator to
+            return (default: 0).
+        num_workers (int, optional): how many subprocesses to use for data
+            loading. 0 means the data will be loaded in the main process
+            (default: 0).
+        epoch (int, optional): the epoch to start the iterator from
+            (default: 1).
+        buffer_size (int, optional): the number of batches to keep ready in the
+            queue. Helps speeding up dataloading. When buffer_size is zero, the
+            default torch.utils.data.DataLoader preloading is used.
+        timeout (int, optional): if positive, the timeout value for collecting a batch
+            from workers. Should always be non-negative. (default: ``0``)
+    """
+
+    def __init__(
+        self, dataset, collate_fn, batch_sampler, seed=1, num_shards=1, shard_id=0,
+        num_workers=0, epoch=1, buffer_size=0, timeout=0,
+    ):
+        assert isinstance(dataset, torch.utils.data.Dataset)
+        self.dataset = dataset
+        self.collate_fn = collate_fn
+        self.batch_sampler = batch_sampler
+        self._frozen_batches = tuple(batch_sampler) if not callable(batch_sampler) else None
+        self.seed = seed
+        self.num_shards = num_shards
+        self.shard_id = shard_id
+        self.num_workers = num_workers
+        # This upper limit here is to prevent people from abusing this feature
+        # in a shared computing environment.
+        self.buffer_size = min(buffer_size, 20)
+        self.timeout = timeout
+
+        self.epoch = max(epoch, 1)  # we use 1-based indexing for epochs
+        self.shuffle = True
+        self._cur_epoch_itr = None
+        self._next_epoch_itr = None
+        self._supports_prefetch = getattr(dataset, 'supports_prefetch', False)
+
+    @property
+    def frozen_batches(self):
+        if self._frozen_batches is None:
+            self._frozen_batches = tuple(self.batch_sampler(self.dataset, self.epoch))
+        return self._frozen_batches
+
+    def __len__(self):
+        return int(math.ceil(len(self.frozen_batches) / float(self.num_shards)))
+
+    @property
+    def n(self):
+        return self.iterations_in_epoch
+
+    @property
+    def next_epoch_idx(self):
+        """Return the epoch index after *next_epoch_itr* is called."""
+        if self._next_epoch_itr is not None:
+            return self.epoch
+        elif self._cur_epoch_itr is not None and self.end_of_epoch():
+            return self.epoch + 1
+        else:
+            return self.epoch
+
+    def next_epoch_itr(self, shuffle=True, fix_batches_to_gpus=False):
+        """Return a new iterator over the dataset.
+
+        Args:
+            shuffle (bool, optional): shuffle batches before returning the
+                iterator (default: True).
+            fix_batches_to_gpus: ensure that batches are always
+                allocated to the same shards across epochs. Requires
+                that :attr:`dataset` supports prefetching (default: False).
+        """
+        self.epoch = self.next_epoch_idx
+        self.dataset.set_epoch(self.epoch)
+        if self._next_epoch_itr is not None:
+            self._cur_epoch_itr = self._next_epoch_itr
+            self._next_epoch_itr = None
+        else:
+            if callable(self.batch_sampler):
+                # reset _frozen_batches to refresh the next epoch
+                self._frozen_batches = None
+            self._cur_epoch_itr = self._get_iterator_for_epoch(
+                self.epoch, shuffle, fix_batches_to_gpus=fix_batches_to_gpus,
+            )
+        self.shuffle = shuffle
+        return self._cur_epoch_itr
+
+    def end_of_epoch(self) -> bool:
+        """Returns whether the most recent epoch iterator has been exhausted"""
+        return not self._cur_epoch_itr.has_next()
+
+    @property
+    def iterations_in_epoch(self):
+        """The number of consumed batches in the current epoch."""
+        if self._cur_epoch_itr is not None:
+            return self._cur_epoch_itr.n
+        elif self._next_epoch_itr is not None:
+            return self._next_epoch_itr.n
+        return 0
+
+    def state_dict(self):
+        """Returns a dictionary containing a whole state of the iterator."""
+        return {
+            'epoch': self.epoch,
+            'iterations_in_epoch': self.iterations_in_epoch,
+            'shuffle': self.shuffle,
+        }
+
+    def load_state_dict(self, state_dict):
+        """Copies the state of the iterator from the given *state_dict*."""
+        self.epoch = state_dict['epoch']
+        itr_pos = state_dict.get('iterations_in_epoch', 0)
+        if itr_pos > 0:
+            # fast-forward epoch iterator
+            self._next_epoch_itr = self._get_iterator_for_epoch(
+                self.epoch,
+                shuffle=state_dict.get('shuffle', True),
+                offset=itr_pos,
+            )
+            if self._next_epoch_itr is None:
+                # we finished the epoch, increment epoch counter
+                self.epoch += 1
+        else:
+            self._next_epoch_itr = None
+
+    def _get_iterator_for_epoch(self, epoch, shuffle, fix_batches_to_gpus=False, offset=0):
+
+        def shuffle_batches(batches, seed):
+            with data_utils.numpy_seed(seed):
+                np.random.shuffle(batches)
+            return batches
+
+        if self._supports_prefetch:
+            batches = self.frozen_batches
+
+            if shuffle and not fix_batches_to_gpus:
+                batches = shuffle_batches(list(batches), self.seed + epoch)
+
+            batches = list(ShardedIterator(
+                batches, self.num_shards, self.shard_id, fill_value=[]
+            ))
+            self.dataset.prefetch([i for s in batches for i in s])
+
+            if shuffle and fix_batches_to_gpus:
+                batches = shuffle_batches(batches, self.seed + epoch + self.shard_id)
+        else:
+            if shuffle:
+                batches = shuffle_batches(list(self.frozen_batches), self.seed + epoch)
+            else:
+                batches = self.frozen_batches
+            batches = list(ShardedIterator(
+                batches, self.num_shards, self.shard_id, fill_value=[]
+            ))
+
+        if offset > 0 and offset >= len(batches):
+            return None
+
+        if self.num_workers > 0:
+            os.environ['PYTHONWARNINGS'] = 'ignore:semaphore_tracker:UserWarning'
+
+        # Create data loader
+        itr = torch.utils.data.DataLoader(
+            self.dataset,
+            collate_fn=self.collate_fn,
+            batch_sampler=batches[offset:],
+            num_workers=self.num_workers,
+            timeout=self.timeout,
+        )
+
+        # Wrap with a BufferedIterator if needed
+        if self.buffer_size > 0:
+            itr = BufferedIterator(self.buffer_size, itr)
+
+        # Wrap with CountingIterator
+        itr = CountingIterator(itr, start=offset)
+        return itr
+
+
+class GroupedIterator(CountingIterator):
+    """Wrapper around an iterable that returns groups (chunks) of items.
+
+    Args:
+        iterable (iterable): iterable to wrap
+        chunk_size (int): size of each chunk
+
+    Attributes:
+        n (int): number of elements consumed from this iterator
+    """
+
+    def __init__(self, iterable, chunk_size):
+        itr = _chunk_iterator(iterable, chunk_size)
+        super().__init__(
+            itr,
+            start=int(math.ceil(getattr(iterable, 'n', 0) / float(chunk_size))),
+            total=int(math.ceil(len(iterable) / float(chunk_size))),
+        )
+        self.chunk_size = chunk_size
+
+
+def _chunk_iterator(itr, chunk_size):
+    chunk = []
+    for x in itr:
+        chunk.append(x)
+        if len(chunk) == chunk_size:
+            yield chunk
+            chunk = []
+    if len(chunk) > 0:
+        yield chunk
+
+
+class ShardedIterator(CountingIterator):
+    """A sharded wrapper around an iterable, padded to length.
+
+    Args:
+        iterable (iterable): iterable to wrap
+        num_shards (int): number of shards to split the iterable into
+        shard_id (int): which shard to iterate over
+        fill_value (Any, optional): padding value when the iterable doesn't
+            evenly divide *num_shards* (default: None).
+
+    Attributes:
+        n (int): number of elements consumed from this iterator
+    """
+
+    def __init__(self, iterable, num_shards, shard_id, fill_value=None):
+        if shard_id < 0 or shard_id >= num_shards:
+            raise ValueError('shard_id must be between 0 and num_shards')
+        sharded_len = int(math.ceil(len(iterable) / float(num_shards)))
+        itr = map(
+            operator.itemgetter(1),
+            itertools.zip_longest(
+                range(sharded_len),
+                itertools.islice(iterable, shard_id, len(iterable), num_shards),
+                fillvalue=fill_value,
+            ),
+        )
+        super().__init__(
+            itr,
+            start=int(math.ceil(getattr(iterable, 'n', 0) / float(num_shards))),
+            total=sharded_len,
+        )
+
+
+class BackgroundConsumer(Thread):
+    def __init__(self, queue, source, max_len):
+        Thread.__init__(self)
+
+        self._queue = queue
+        self._source = source
+        self._max_len = max_len
+        self.count = 0
+
+    def run(self):
+        try:
+            self._source_iter = iter(self._source)
+            for _ in range(len(self._source)):
+                item = next(self._source_iter)
+                self._queue.put(item)
+
+                # Stop if we reached the maximum length
+                self.count += 1
+                if self._max_len is not None and self.count >= self._max_len:
+                    break
+
+            # Signal the consumer we are done.
+            self._queue.put(_sentinel)
+        except Exception as e:
+            self._queue.put(e)
+
+        del self._source_iter
+
+
+class BufferedIterator(object):
+    def __init__(self, size, iterable):
+        self._queue = queue.Queue(size)
+        self._iterable = iterable
+        self.max_len = None
+        self._consumer = None
+
+        self.start_time = time.time()
+        self.warning_time = None
+
+    def _create_consumer(self):
+        self._consumer = BackgroundConsumer(
+            self._queue,
+            self._iterable,
+            self.max_len
+        )
+        self._consumer.daemon = True
+        self._consumer.start()
+
+    def __iter__(self):
+        return self
+
+    def __len__(self):
+        return len(self._iterable)
+
+    def take(self, n):
+        self.max_len = n
+
+    def __next__(self):
+        # Create consumer if not created yet
+        if self._consumer is None:
+            self._create_consumer()
+
+        # Notify the user if there is a data loading bottleneck
+        if self._queue.qsize() < min(2, max(1, self._queue.maxsize // 2)):
+            if time.time() - self.start_time > 5 * 60:
+                if self.warning_time is None or time.time() - self.warning_time > 15 * 60:
+                    logger.info(
+                        "Data loading buffer is empty or nearly empty. This may "
+                        "indicate a data loading bottleneck, and increasing the "
+                        "number of workers (--num-workers) may help."
+                    )
+                    self.warning_time = time.time()
+
+        # Get next example
+        item = self._queue.get(True)
+        if isinstance(item, Exception):
+            raise item
+        if item is _sentinel:
+            raise StopIteration()
+        return item
diff --git a/fairseq/data/language_pair_dataset.py b/fairseq/data/language_pair_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..7576e07d346352b673039d651ecf0b13104b7adc
--- /dev/null
+++ b/fairseq/data/language_pair_dataset.py
@@ -0,0 +1,420 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+
+import numpy as np
+import torch
+
+from fairseq.data import data_utils, FairseqDataset
+
+
+logger = logging.getLogger(__name__)
+
+
+def collate(  # merge sample dicts into one padded mini-batch dict, rows sorted by descending source length
+    samples,
+    pad_idx,
+    eos_idx,
+    left_pad_source=True,
+    left_pad_target=False,
+    input_feeding=True,
+    pad_to_length=None,
+):
+    if len(samples) == 0:  # nothing to collate
+        return {}
+
+    def merge(key, left_pad, move_eos_to_beginning=False, pad_to_length=None):  # collate field `key` of all samples
+        return data_utils.collate_tokens(
+            [s[key] for s in samples],
+            pad_idx, eos_idx, left_pad, move_eos_to_beginning,
+            pad_to_length=pad_to_length,
+        )
+
+    def check_alignment(alignment, src_len, tgt_len):  # True only for a non-empty alignment within bounds
+        if alignment is None or len(alignment) == 0:
+            return False
+        if alignment[:, 0].max().item() >= src_len - 1 or alignment[:, 1].max().item() >= tgt_len - 1:
+            logger.warning("alignment size mismatch found, skipping alignment!")
+            return False
+        return True
+
+    def compute_alignment_weights(alignments):
+        """
+        Given a tensor of shape [:, 2] containing the source-target indices
+        corresponding to the alignments, a weight vector containing the
+        inverse frequency of each target index is computed.
+        For e.g. if alignments = [[5, 7], [2, 3], [1, 3], [4, 2]], then
+        a tensor containing [1., 0.5, 0.5, 1] should be returned (since target
+        index 3 is repeated twice)
+        """
+        align_tgt = alignments[:, 1]
+        _, align_tgt_i, align_tgt_c = torch.unique(align_tgt, return_inverse=True, return_counts=True)
+        align_weights = align_tgt_c[align_tgt_i[np.arange(len(align_tgt))]]  # count of each pair's target index
+        return 1. / align_weights.float()
+
+    id = torch.LongTensor([s['id'] for s in samples])  # NOTE(review): shadows the builtin ``id`` locally
+    src_tokens = merge(
+        'source', left_pad=left_pad_source,
+        pad_to_length=pad_to_length['source'] if pad_to_length is not None else None
+    )
+    # sort by descending source length
+    src_lengths = torch.LongTensor([
+        s['source'].ne(pad_idx).long().sum() for s in samples  # number of non-pad source tokens
+    ])
+    src_lengths, sort_order = src_lengths.sort(descending=True)
+    id = id.index_select(0, sort_order)
+    src_tokens = src_tokens.index_select(0, sort_order)
+
+    prev_output_tokens = None
+    target = None
+    if samples[0].get('target', None) is not None:  # only the first sample is inspected
+        target = merge(
+            'target', left_pad=left_pad_target,
+            pad_to_length=pad_to_length['target'] if pad_to_length is not None else None,
+        )
+        target = target.index_select(0, sort_order)
+        tgt_lengths = torch.LongTensor([
+            s['target'].ne(pad_idx).long().sum() for s in samples  # number of non-pad target tokens
+        ]).index_select(0, sort_order)
+        ntokens = tgt_lengths.sum().item()  # with targets, ntokens counts target tokens
+
+        if samples[0].get('prev_output_tokens', None) is not None:  # NOTE(review): no pad_to_length passed below
+            prev_output_tokens = merge('prev_output_tokens', left_pad=left_pad_target)
+        elif input_feeding:
+            # we create a shifted version of targets for feeding the
+            # previous output token(s) into the next decoder step
+            prev_output_tokens = merge(
+                'target',
+                left_pad=left_pad_target,
+                move_eos_to_beginning=True,
+                pad_to_length=pad_to_length['target'] if pad_to_length is not None else None,
+            )
+    else:
+        ntokens = src_lengths.sum().item()  # without targets, ntokens counts source tokens
+
+    batch = {
+        'id': id,
+        'nsentences': len(samples),
+        'ntokens': ntokens,
+        'net_input': {
+            'src_tokens': src_tokens,
+            'src_lengths': src_lengths,
+        },
+        'target': target,
+    }
+    if prev_output_tokens is not None:  # reordered here because merge() returned rows in sample order
+        batch['net_input']['prev_output_tokens'] = prev_output_tokens.index_select(0, sort_order)
+
+    if samples[0].get('alignment', None) is not None:
+        bsz, tgt_sz = batch['target'].shape  # needs targets: 'target' is None (and tgt_lengths unset) otherwise
+        src_sz = batch['net_input']['src_tokens'].shape[1]
+
+        offsets = torch.zeros((len(sort_order), 2), dtype=torch.long)  # per-row (src, tgt) shift for alignment pairs
+        offsets[:, 1] += (torch.arange(len(sort_order), dtype=torch.long) * tgt_sz)  # row i shifts tgt by i * tgt_sz
+        if left_pad_source:
+            offsets[:, 0] += (src_sz - src_lengths)  # skip the left padding of the source row
+        if left_pad_target:
+            offsets[:, 1] += (tgt_sz - tgt_lengths)  # skip the left padding of the target row
+
+        alignments = [
+            alignment + offset
+            for align_idx, offset, src_len, tgt_len in zip(sort_order, offsets, src_lengths, tgt_lengths)
+            for alignment in [samples[align_idx]['alignment'].view(-1, 2)]  # bind the reshaped (n, 2) pairs once
+            if check_alignment(alignment, src_len, tgt_len)
+        ]
+
+        if len(alignments) > 0:  # keys are added only when at least one alignment passed the check
+            alignments = torch.cat(alignments, dim=0)
+            align_weights = compute_alignment_weights(alignments)
+
+            batch['alignments'] = alignments
+            batch['align_weights'] = align_weights
+
+    return batch
+
+
+class LanguagePairDataset(FairseqDataset):
+    """
+    A pair of torch.utils.data.Datasets.
+
+    Args:
+        src (torch.utils.data.Dataset): source dataset to wrap
+        src_sizes (List[int]): source sentence lengths
+        src_dict (~fairseq.data.Dictionary): source vocabulary
+        tgt (torch.utils.data.Dataset, optional): target dataset to wrap
+        tgt_sizes (List[int], optional): target sentence lengths
+        tgt_dict (~fairseq.data.Dictionary, optional): target vocabulary
+        left_pad_source (bool, optional): pad source tensors on the left side
+            (default: True).
+        left_pad_target (bool, optional): pad target tensors on the left side
+            (default: False).
+        shuffle (bool, optional): shuffle dataset elements before batching
+            (default: True).
+        input_feeding (bool, optional): create a shifted version of the targets
+            to be passed into the model for teacher forcing (default: True).
+        remove_eos_from_source (bool, optional): if set, removes eos from end
+            of source if it's present (default: False).
+        append_eos_to_target (bool, optional): if set, appends eos to end of
+            target if it's absent (default: False).
+        align_dataset (torch.utils.data.Dataset, optional): dataset
+            containing alignments.
+        append_bos (bool, optional): if set, prepends bos to source/target.
+        eos (int, optional): eos index used when collating (default: src_dict.eos()).
+        num_buckets (int, optional): if set to a value greater than 0, then
+            batches will be bucketed into the given number of batch shapes.
+        src_lang_id (int, optional): source language ID, if set, the collated batch
+            will contain a field 'src_lang_id' in 'net_input' which indicates the
+            source language of the samples.
+        tgt_lang_id (int, optional): target language ID, if set, the collated batch
+            will contain a field 'tgt_lang_id' which indicates the target language
+             of the samples.
+    """
+
+    def __init__(
+        self, src, src_sizes, src_dict,
+        tgt=None, tgt_sizes=None, tgt_dict=None,
+        left_pad_source=True, left_pad_target=False,
+        shuffle=True, input_feeding=True,
+        remove_eos_from_source=False, append_eos_to_target=False,
+        align_dataset=None,
+        append_bos=False, eos=None,
+        num_buckets=0,
+        src_lang_id=None,
+        tgt_lang_id=None,
+    ):
+        if tgt_dict is not None:  # both dictionaries must agree on the special symbol indices
+            assert src_dict.pad() == tgt_dict.pad()
+            assert src_dict.eos() == tgt_dict.eos()
+            assert src_dict.unk() == tgt_dict.unk()
+        if tgt is not None:
+            assert len(src) == len(tgt), "Source and target must contain the same number of examples"
+        self.src = src
+        self.tgt = tgt
+        self.src_sizes = np.array(src_sizes)
+        self.tgt_sizes = np.array(tgt_sizes) if tgt_sizes is not None else None
+        self.src_dict = src_dict
+        self.tgt_dict = tgt_dict
+        self.left_pad_source = left_pad_source
+        self.left_pad_target = left_pad_target
+        self.shuffle = shuffle
+        self.input_feeding = input_feeding
+        self.remove_eos_from_source = remove_eos_from_source
+        self.append_eos_to_target = append_eos_to_target
+        self.align_dataset = align_dataset
+        if self.align_dataset is not None:
+            assert self.tgt_sizes is not None, "Both source and target needed when alignments are provided"
+        self.append_bos = append_bos
+        self.eos = (eos if eos is not None else src_dict.eos())
+        self.src_lang_id = src_lang_id
+        self.tgt_lang_id = tgt_lang_id
+        if num_buckets > 0:
+            from fairseq.data import BucketPadLengthDataset
+            self.src = BucketPadLengthDataset(
+                self.src,
+                sizes=self.src_sizes,
+                num_buckets=num_buckets,
+                pad_idx=self.src_dict.pad(),
+                left_pad=self.left_pad_source,
+            )
+            self.src_sizes = self.src.sizes
+            logger.info('bucketing source lengths: {}'.format(list(self.src.buckets)))
+            if self.tgt is not None:
+                self.tgt = BucketPadLengthDataset(
+                    self.tgt,
+                    sizes=self.tgt_sizes,
+                    num_buckets=num_buckets,
+                    pad_idx=self.tgt_dict.pad(),
+                    left_pad=self.left_pad_target,
+                )
+                self.tgt_sizes = self.tgt.sizes
+                logger.info('bucketing target lengths: {}'.format(list(self.tgt.buckets)))
+
+            # determine bucket sizes using self.num_tokens, which will return
+            # the padded lengths (thanks to BucketPadLengthDataset)
+            num_tokens = np.vectorize(self.num_tokens, otypes=[np.int64])  # np.long was removed in NumPy 1.24
+            self.bucketed_num_tokens = num_tokens(np.arange(len(self.src)))
+            self.buckets = [
+                (None, num_tokens)
+                for num_tokens in np.unique(self.bucketed_num_tokens)
+            ]
+        else:
+            self.buckets = None
+
+    def get_batch_shapes(self):
+        return self.buckets
+
+    def __getitem__(self, index):
+        tgt_item = self.tgt[index] if self.tgt is not None else None
+        src_item = self.src[index]
+        # Optionally append EOS to the target and/or strip EOS from the source,
+        # which is useful when reusing an existing dataset in the opposite
+        # direction (tgt_dataset as src_dataset and vice versa). Each step edits
+        # the already-fetched item, so enabling several options composes.
+        if self.append_eos_to_target:
+            eos = self.tgt_dict.eos() if self.tgt_dict else self.src_dict.eos()
+            if tgt_item is not None and tgt_item[-1] != eos:
+                tgt_item = torch.cat([tgt_item, torch.LongTensor([eos])])
+
+        if self.append_bos:
+            bos = self.tgt_dict.bos() if self.tgt_dict else self.src_dict.bos()
+            if tgt_item is not None and tgt_item[0] != bos:
+                tgt_item = torch.cat([torch.LongTensor([bos]), tgt_item])
+
+            bos = self.src_dict.bos()
+            if src_item[0] != bos:
+                src_item = torch.cat([torch.LongTensor([bos]), src_item])
+
+        if self.remove_eos_from_source:
+            eos = self.src_dict.eos()
+            if src_item[-1] == eos:
+                src_item = src_item[:-1]
+
+        example = {
+            'id': index,
+            'source': src_item,
+            'target': tgt_item,
+        }
+        if self.align_dataset is not None:
+            example['alignment'] = self.align_dataset[index]
+        return example
+
+    def __len__(self):
+        return len(self.src)
+
+    def collater(self, samples, pad_to_length=None):
+        """Merge a list of samples to form a mini-batch.
+
+        Args:
+            samples (List[dict]): samples to collate
+            pad_to_length (dict, optional): a dictionary of
+                {'source': source_pad_to_length, 'target': target_pad_to_length}
+                to indicate the max length to pad to in source and target respectively.
+
+        Returns:
+            dict: a mini-batch with the following keys:
+
+                - `id` (LongTensor): example IDs in the original input order
+                - `ntokens` (int): total number of tokens in the batch
+                - `net_input` (dict): the input to the Model, containing keys:
+
+                  - `src_tokens` (LongTensor): a padded 2D Tensor of tokens in
+                    the source sentence of shape `(bsz, src_len)`. Padding will
+                    appear on the left if *left_pad_source* is ``True``.
+                  - `src_lengths` (LongTensor): 1D Tensor of the unpadded
+                    lengths of each source sentence of shape `(bsz)`
+                  - `prev_output_tokens` (LongTensor): a padded 2D Tensor of
+                    tokens in the target sentence, shifted right by one
+                    position for teacher forcing, of shape `(bsz, tgt_len)`.
+                    This key will not be present if *input_feeding* is
+                    ``False``.  Padding will appear on the left if
+                    *left_pad_target* is ``True``.
+                  - `src_lang_id` (LongTensor): a long Tensor which contains source
+                    language IDs of each sample in the batch
+
+                - `target` (LongTensor): a padded 2D Tensor of tokens in the
+                  target sentence of shape `(bsz, tgt_len)`. Padding will appear
+                  on the left if *left_pad_target* is ``True``.
+                - `tgt_lang_id` (LongTensor): a long Tensor which contains target language
+                   IDs of each sample in the batch
+        """
+        res = collate(
+            samples,
+            pad_idx=self.src_dict.pad(),
+            eos_idx=self.eos,
+            left_pad_source=self.left_pad_source,
+            left_pad_target=self.left_pad_target,
+            input_feeding=self.input_feeding,
+            pad_to_length=pad_to_length,
+        )
+        if self.src_lang_id is not None or self.tgt_lang_id is not None:
+            src_tokens = res['net_input']['src_tokens']
+            bsz = src_tokens.size(0)
+            if self.src_lang_id is not None:  # one language id per row, shape (bsz, 1)
+                res['net_input']['src_lang_id'] = torch.LongTensor(
+                            [[self.src_lang_id]]
+                            ).expand(bsz, 1).to(src_tokens)
+            if self.tgt_lang_id is not None:  # stored at the top level of the batch, not in net_input
+                res['tgt_lang_id'] = torch.LongTensor(
+                            [[self.tgt_lang_id]]
+                            ).expand(bsz, 1).to(src_tokens)
+        return res
+
+    def num_tokens(self, index):
+        """Return the number of tokens in a sample. This value is used to
+        enforce ``--max-tokens`` during batching."""
+        return max(self.src_sizes[index], self.tgt_sizes[index] if self.tgt_sizes is not None else 0)
+
+    def size(self, index):
+        """Return an example's size as a float or tuple. This value is used when
+        filtering a dataset with ``--max-positions``."""
+        return (self.src_sizes[index], self.tgt_sizes[index] if self.tgt_sizes is not None else 0)
+
+    def ordered_indices(self):
+        """Return an ordered list of indices. Batches will be constructed based
+        on this order."""
+        if self.shuffle:
+            indices = np.random.permutation(len(self))
+        else:
+            indices = np.arange(len(self))
+        if self.buckets is None:
+            # sort by target length, then source length
+            if self.tgt_sizes is not None:
+                indices = indices[
+                    np.argsort(self.tgt_sizes[indices], kind='mergesort')
+                ]
+            return indices[np.argsort(self.src_sizes[indices], kind='mergesort')]
+        else:
+            # sort by bucketed_num_tokens, which is:
+            #   max(padded_src_len, padded_tgt_len)
+            return indices[
+                np.argsort(self.bucketed_num_tokens[indices], kind='mergesort')
+            ]
+
+    @property
+    def supports_prefetch(self):
+        return (
+            getattr(self.src, 'supports_prefetch', False)
+            and (getattr(self.tgt, 'supports_prefetch', False) or self.tgt is None)
+        )
+
+    def prefetch(self, indices):
+        self.src.prefetch(indices)
+        if self.tgt is not None:
+            self.tgt.prefetch(indices)
+        if self.align_dataset is not None:
+            self.align_dataset.prefetch(indices)
+
+    def filter_indices_by_size(self, indices, max_sizes):
+        """ Filter a list of sample indices. Remove those that are longer
+            than specified in max_sizes.
+
+        Args:
+            indices (np.array): original array of sample indices
+            max_sizes (int or list[int] or tuple[int]): max sample size,
+                can be defined separately for src and tgt (then list or tuple)
+
+        Returns:
+            np.array: filtered sample array
+            list: list of removed indices
+        """
+        if max_sizes is None:
+            return indices, []
+        if isinstance(max_sizes, (int, float, np.integer)):  # a single scalar limit applies to both sides
+            max_src_size, max_tgt_size = max_sizes, max_sizes
+        else:
+            max_src_size, max_tgt_size = max_sizes
+        if self.tgt_sizes is None:
+            ignored = indices[self.src_sizes[indices] > max_src_size]
+        else:
+            ignored = indices[(self.src_sizes[indices] > max_src_size) |
+                              (self.tgt_sizes[indices] > max_tgt_size)]
+        if len(ignored) > 0:
+            if self.tgt_sizes is None:
+                indices = indices[self.src_sizes[indices] <= max_src_size]
+            else:
+                indices = indices[(self.src_sizes[indices] <= max_src_size) &
+                                  (self.tgt_sizes[indices] <= max_tgt_size)]
+        return indices, ignored.tolist()
diff --git a/fairseq/data/legacy/__init__.py b/fairseq/data/legacy/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1acaafeb09176dfda2a8bb30bab7e0ea764faa23
--- /dev/null
+++ b/fairseq/data/legacy/__init__.py
@@ -0,0 +1,15 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .masked_lm_dictionary import BertDictionary, MaskedLMDictionary
+from .block_pair_dataset import BlockPairDataset
+from .masked_lm_dataset import MaskedLMDataset
+
+__all__ = [
+    'BertDictionary',
+    'BlockPairDataset',
+    'MaskedLMDataset',
+    'MaskedLMDictionary',
+]
diff --git a/fairseq/data/legacy/__pycache__/__init__.cpython-310.pyc b/fairseq/data/legacy/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3e082e96f212ee24f1a2b14ef14d019cc67655fc
Binary files /dev/null and b/fairseq/data/legacy/__pycache__/__init__.cpython-310.pyc differ
diff --git a/fairseq/data/legacy/__pycache__/block_pair_dataset.cpython-310.pyc b/fairseq/data/legacy/__pycache__/block_pair_dataset.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..de1e6d9c4ed07e64b0f03d3e32a5695d22429d8a
Binary files /dev/null and b/fairseq/data/legacy/__pycache__/block_pair_dataset.cpython-310.pyc differ
diff --git a/fairseq/data/legacy/__pycache__/masked_lm_dataset.cpython-310.pyc b/fairseq/data/legacy/__pycache__/masked_lm_dataset.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7f1f03ce979359de67ab2f27bf8aa51576132639
Binary files /dev/null and b/fairseq/data/legacy/__pycache__/masked_lm_dataset.cpython-310.pyc differ
diff --git a/fairseq/data/legacy/__pycache__/masked_lm_dictionary.cpython-310.pyc b/fairseq/data/legacy/__pycache__/masked_lm_dictionary.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f233d68f91b112a112a9195c39af770e7d5a1563
Binary files /dev/null and b/fairseq/data/legacy/__pycache__/masked_lm_dictionary.cpython-310.pyc differ
diff --git a/fairseq/data/legacy/block_pair_dataset.py b/fairseq/data/legacy/block_pair_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..b9fc8141471984f644edbc97915647897f1de617
--- /dev/null
+++ b/fairseq/data/legacy/block_pair_dataset.py
@@ -0,0 +1,312 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+
+import numpy as np
+import torch
+
+from fairseq.data import FairseqDataset
+
+
+class BlockPairDataset(FairseqDataset):
+    """Break a Dataset of tokens into sentence pair blocks for next sentence
+       prediction as well as masked language model.
+
+       High-level logics are:
+       1. break input tensor to tensor blocks
+       2. pair the blocks with 50% next sentence and 50% random sentence
+       3. return paired blocks as well as related segment labels
+
+    Args:
+        dataset (~torch.utils.data.Dataset): dataset to break into blocks
+        sizes: array of sentence lengths
+        dictionary: dictionary for the task
+        block_size: maximum block size
+        break_mode: mode for breaking corpus into block pairs. currently we support
+            2 modes
+            doc: respect document boundaries and each part of the pair should belong to one document
+            none: don't respect any boundary and cut tokens evenly
+        short_seq_prob: probability for generating shorter block pairs
+        doc_break_size: Size for empty line separating documents. Typically 1 if
+                        the sentences have eos, 0 otherwise.
+    """
+
+    def __init__(
+        self,
+        dataset,
+        dictionary,
+        sizes,
+        block_size,
+        break_mode="doc",
+        short_seq_prob=0.1,
+        doc_break_size=1,
+    ):
+        super().__init__()
+        self.dataset = dataset
+        self.pad = dictionary.pad()
+        self.eos = dictionary.eos()
+        self.cls = dictionary.cls()
+        self.mask = dictionary.mask()
+        self.sep = dictionary.sep()
+        self.break_mode = break_mode
+        self.dictionary = dictionary
+        self.short_seq_prob = short_seq_prob
+        self.block_indices, self.sent_pairs, self.sizes = [], [], []
+
+        assert len(dataset) == len(sizes)
+
+        if break_mode == "doc":
+            cur_doc = []
+            for sent_id, sz in enumerate(sizes):
+                assert doc_break_size == 0 or sz != 0, (
+                    "when doc_break_size is non-zero, we expect documents to be "
+                    "separated by a blank line with a single eos."
+                )
+                # empty line as document separator
+                if sz == doc_break_size:
+                    if len(cur_doc) == 0:
+                        continue
+                    self.block_indices.append(cur_doc)
+                    cur_doc = []
+                else:
+                    cur_doc.append(sent_id)
+            max_num_tokens = block_size - 3  # Account for [CLS], [SEP], [SEP]
+            # self.sent_pairs / self.sizes are created above so that both break
+            # modes can append to them; _generate_sentence_pair fills them here.
+            for doc_id, doc in enumerate(self.block_indices):
+                self._generate_sentence_pair(doc, doc_id, max_num_tokens, sizes)
+        elif break_mode is None or break_mode == "none":
+            # each block should have half of the block size since we are constructing block pair
+            sent_length = (block_size - 3) // 2
+            total_len = sum(dataset.sizes)
+            length = math.ceil(total_len / sent_length)
+
+            def block_at(i):
+                start = i * sent_length
+                end = min(start + sent_length, total_len)
+                return (start, end)
+
+            sent_indices = np.array([block_at(i) for i in range(length)])
+            sent_sizes = np.array([e - s for s, e in sent_indices])
+            dataset_index = self._sent_to_dataset_index(sent_sizes)
+
+            # pair sentences
+            self._pair_sentences(dataset_index)
+        else:
+            raise ValueError("Invalid break_mode: " + str(break_mode))
+
+    def _pair_sentences(self, dataset_index):
+        """
+        Give a list of evenly cut blocks/sentences, pair these sentences with 50%
+        consecutive sentences and 50% random sentences.
+        This is used for none break mode
+        """
+        # pair sentences
+        for sent_id, sent in enumerate(dataset_index):
+            next_sent_label = (
+                1 if np.random.rand() > 0.5 and sent_id != len(dataset_index) - 1 else 0
+            )
+            if next_sent_label:
+                next_sent = dataset_index[sent_id + 1]
+            else:
+                next_sent = dataset_index[
+                    self._skip_sampling(len(dataset_index), [sent_id, sent_id + 1])
+                ]
+            self.sent_pairs.append((sent, next_sent, next_sent_label))
+
+            # The current blocks don't include the special tokens but the
+            # sizes already account for this
+            self.sizes.append(3 + sent[3] + next_sent[3])
+
+    def _sent_to_dataset_index(self, sent_sizes):
+        """
+        Build index mapping block indices to the underlying dataset indices
+        """
+        dataset_index = []
+        ds_idx, ds_remaining = -1, 0
+        for to_consume in sent_sizes:
+            sent_size = to_consume
+            if ds_remaining == 0:  # current dataset item is used up: move on to the next one
+                ds_idx += 1
+                ds_remaining = self.dataset.sizes[ds_idx]
+            start_ds_idx = ds_idx
+            start_offset = self.dataset.sizes[ds_idx] - ds_remaining
+            while to_consume > ds_remaining:  # the block spills over into following dataset items
+                to_consume -= ds_remaining
+                ds_idx += 1
+                ds_remaining = self.dataset.sizes[ds_idx]
+            ds_remaining -= to_consume
+            dataset_index.append(
+                (
+                    start_ds_idx,  # starting index in dataset
+                    start_offset,  # starting offset within starting index
+                    ds_idx,  # ending index in dataset
+                    sent_size,  # sentence length
+                )
+            )
+        assert ds_remaining == 0
+        assert ds_idx == len(self.dataset) - 1
+        return dataset_index
+
+    def _generate_sentence_pair(self, doc, doc_id, max_num_tokens, sizes):
+        """
+        Go through a single document and generate sentence pairs from it
+        """
+        current_chunk = []
+        current_length = 0
+        curr = 0
+        # To provide more randomness, we decrease target seq length for parts of
+        # samples (10% by default). Note that max_num_tokens is the hard threshold
+        # for batching and will never be changed.
+        target_seq_length = max_num_tokens
+        if np.random.random() < self.short_seq_prob:
+            target_seq_length = np.random.randint(2, max_num_tokens)
+        # loop through all sentences in document
+        while curr < len(doc):
+            sent_id = doc[curr]
+            current_chunk.append(sent_id)
+            current_length = sum(sizes[current_chunk])  # assumes sizes supports list indexing (numpy array)
+            # split chunk and generate pair when exceed target_seq_length or
+            # finish the loop
+            if curr == len(doc) - 1 or current_length >= target_seq_length:
+                # split the chunk into 2 parts
+                a_end = 1
+                if len(current_chunk) > 2:  # NOTE(review): randint excludes the upper bound below
+                    a_end = np.random.randint(1, len(current_chunk) - 1)
+                sent_a = current_chunk[:a_end]
+                len_a = sum(sizes[sent_a])
+                # generate next sentence label, note that if there is only 1 sentence
+                # in current chunk, label is always 0
+                next_sent_label = (
+                    1 if np.random.rand() > 0.5 and len(current_chunk) != 1 else 0
+                )
+                if not next_sent_label:
+                    # if next sentence label is 0, sample sent_b from a random doc
+                    target_b_length = target_seq_length - len_a
+                    rand_doc_id = self._skip_sampling(len(self.block_indices), [doc_id])  # NOTE(review): needs >1 doc
+                    random_doc = self.block_indices[rand_doc_id]
+                    random_start = np.random.randint(0, len(random_doc))
+                    sent_b = []
+                    len_b = 0
+                    for j in range(random_start, len(random_doc)):
+                        sent_b.append(random_doc[j])
+                        len_b = sum(sizes[sent_b])
+                        if len_b >= target_b_length:
+                            break
+                    # return the second part of the chunk since it's not used
+                    num_unused_segments = len(current_chunk) - a_end
+                    curr -= num_unused_segments
+                else:
+                    # if next sentence label is 1, use the second part of chunk as sent_B
+                    sent_b = current_chunk[a_end:]
+                    len_b = sum(sizes[sent_b])
+                # currently sent_a and sent_B may be longer than max_num_tokens,
+                # truncate them and return block idx and offsets for them
+                sent_a, sent_b = self._truncate_sentences(
+                    sent_a, sent_b, max_num_tokens
+                )
+                self.sent_pairs.append((sent_a, sent_b, next_sent_label))
+                self.sizes.append(3 + sent_a[3] + sent_b[3])
+                current_chunk = []
+            curr += 1
+
+    def _skip_sampling(self, total, skip_ids):
+        """
+        Generate a random integer which is not in skip_ids. Sample range is [0, total)
+        TODO: ids in skip_ids should be consecutive, we can extend it to more generic version later
+        """
+        rand_id = np.random.randint(total - len(skip_ids))
+        return rand_id if rand_id < min(skip_ids) else rand_id + len(skip_ids)
+
+    def _truncate_sentences(self, sent_a, sent_b, max_num_tokens):
+        """
+        Truncate a pair of sentences to limit total length under max_num_tokens
+        Logics:
+            1. Truncate longer sentence
+            2. Tokens to be truncated could be at the beginning or the end of the sentence
+        Returns:
+            Truncated sentences represented by dataset idx
+        """
+        len_a, len_b = sum(self.dataset.sizes[sent_a]), sum(self.dataset.sizes[sent_b])
+        front_cut_a = front_cut_b = end_cut_a = end_cut_b = 0
+
+        while True:
+            total_length = (
+                len_a + len_b - front_cut_a - front_cut_b - end_cut_a - end_cut_b
+            )
+            if total_length <= max_num_tokens:
+                break
+
+            if len_a - front_cut_a - end_cut_a > len_b - front_cut_b - end_cut_b:
+                if np.random.rand() < 0.5:
+                    front_cut_a += 1
+                else:
+                    end_cut_a += 1
+            else:
+                if np.random.rand() < 0.5:
+                    front_cut_b += 1
+                else:
+                    end_cut_b += 1
+
+        # calculate ds indices as well as offsets and return
+        truncated_sent_a = self._cut_sentence(sent_a, front_cut_a, end_cut_a)
+        truncated_sent_b = self._cut_sentence(sent_b, front_cut_b, end_cut_b)
+        return truncated_sent_a, truncated_sent_b
+
+    def _cut_sentence(self, sent, front_cut, end_cut):
+        """
+        Cut a sentence based on the numbers of tokens to be cut from beginning and end
+        Represent the sentence as dataset idx and return
+        """
+        start_ds_idx, end_ds_idx, offset = sent[0], sent[-1], 0
+        target_len = sum(self.dataset.sizes[sent]) - front_cut - end_cut
+        while front_cut > 0:
+            if self.dataset.sizes[start_ds_idx] > front_cut:  # cut ends inside this item: keep it, record the offset
+                offset += front_cut
+                break
+            else:  # the whole item is cut away
+                front_cut -= self.dataset.sizes[start_ds_idx]
+                start_ds_idx += 1
+        while end_cut > 0:
+            if self.dataset.sizes[end_ds_idx] > end_cut:  # cut ends inside this item: target_len trims the tail
+                break
+            else:  # the whole trailing item is cut away
+                end_cut -= self.dataset.sizes[end_ds_idx]
+                end_ds_idx -= 1
+        return start_ds_idx, offset, end_ds_idx, target_len
+
+    def _fetch_block(self, start_ds_idx, offset, end_ds_idx, length):
+        """
+        Fetch a block of tokens based on its dataset idx
+        """
+        buffer = torch.cat(
+            [self.dataset[idx] for idx in range(start_ds_idx, end_ds_idx + 1)]
+        )
+        s, e = offset, offset + length
+        return buffer[s:e]
+
+    def __getitem__(self, index):
+        block1, block2, next_sent_label = self.sent_pairs[index]
+        block1 = self._fetch_block(*block1)
+        block2 = self._fetch_block(*block2)
+        return block1, block2, next_sent_label
+
+    def __len__(self):
+        return len(self.sizes)
+
+    @property
+    def supports_prefetch(self):
+        return getattr(self.dataset, "supports_prefetch", False)
+
+    def prefetch(self, indices):
+        prefetch_idx = set()
+        for index in indices:
+            block1, block2, _ = self.sent_pairs[index]
+            # each block is (start_ds_idx, offset, end_ds_idx, length); collect
+            # every underlying dataset index that the two blocks span
+            prefetch_idx.update(range(block1[0], block1[2] + 1))
+            prefetch_idx.update(range(block2[0], block2[2] + 1))
+        self.dataset.prefetch(prefetch_idx)
diff --git a/fairseq/data/legacy/masked_lm_dataset.py b/fairseq/data/legacy/masked_lm_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..953aa85dd42bcaacd2b0ff2be9309395c212eb13
--- /dev/null
+++ b/fairseq/data/legacy/masked_lm_dataset.py
@@ -0,0 +1,322 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+
+import numpy as np
+import torch
+
+from typing import Dict, List, Tuple
+
+from fairseq.data import FairseqDataset, data_utils
+
+from fairseq.data import Dictionary
+from fairseq.data.legacy.block_pair_dataset import BlockPairDataset
+from fairseq.data.token_block_dataset import TokenBlockDataset
+from fairseq.data.concat_dataset import ConcatDataset
+
+
+class MaskedLMDataset(FairseqDataset):
+    """
+    A wrapper Dataset for masked language modelling. The dataset
+    wraps around TokenBlockDataset or BlockPairDataset and creates a batch
+    where the input blocks are masked according to the specified masking
+    probability. Additionally the batch can also contain sentence level targets
+    if this is specified.
+
+    Args:
+        dataset: Dataset which generates blocks of data. Only BlockPairDataset
+            and TokenBlockDataset are supported.
+        sizes: Sentence lengths
+        vocab: Dictionary with the vocabulary and special tokens.
+        pad_idx: Id of padding token in dictionary
+        mask_idx: Id of mask token in dictionary
+        classif_token_idx: Id of classification token in dictionary. This is the
+            token associated with the sentence embedding (Eg: CLS for BERT)
+        sep_token_idx: Id of separator token in dictionary
+            (Eg: SEP in BERT)
+        seed: Seed for random number generator for reproducibility.
+        shuffle: Shuffle the elements before batching.
+        has_pairs: Specifies whether the underlying dataset
+            generates a pair of blocks along with a sentence_target or not.
+            Setting it to True assumes that the underlying dataset generates a
+            label for the pair of sentences which is surfaced as
+            sentence_target. The default value assumes a single block with no
+            sentence target.
+        segment_id: An optional segment id for filling in the segment labels
+            when we are in the single block setting (Eg: XLM). Default is 0.
+        masking_ratio: specifies what percentage of the blocks should be masked.
+        masking_prob: specifies the probability of a given token being
+            replaced with the "MASK" token.
+        random_token_prob: specifies the probability of a given token being
+            replaced by a random token from the vocabulary.
+    """
+
+    def __init__(
+            self,
+            dataset: FairseqDataset,
+            sizes: np.ndarray,
+            vocab: Dictionary,
+            pad_idx: int,
+            mask_idx: int,
+            classif_token_idx: int,
+            sep_token_idx: int,
+            seed: int = 1,
+            shuffle: bool = True,
+            has_pairs: bool = True,
+            segment_id: int = 0,
+            masking_ratio: float = 0.15,
+            masking_prob: float = 0.8,
+            random_token_prob: float = 0.1
+    ):
+        # Make sure the input datasets are the ones supported
+        assert (
+            isinstance(dataset, TokenBlockDataset) or
+            isinstance(dataset, BlockPairDataset) or
+            isinstance(dataset, ConcatDataset)
+        ), "MaskedLMDataset only wraps TokenBlockDataset or BlockPairDataset or " \
+           "ConcatDataset"
+
+        self.dataset = dataset
+        self.sizes = np.array(sizes)
+        self.vocab = vocab
+        self.pad_idx = pad_idx
+        self.mask_idx = mask_idx
+        self.classif_token_idx = classif_token_idx
+        self.sep_token_idx = sep_token_idx
+        self.shuffle = shuffle
+        self.seed = seed
+        self.has_pairs = has_pairs
+        self.segment_id = segment_id
+        self.masking_ratio = masking_ratio
+        self.masking_prob = masking_prob
+        self.random_token_prob = random_token_prob
+
+        # If we have only one block then sizes needs to be updated to include
+        # the classification token
+        if not has_pairs:
+            self.sizes = self.sizes + 1
+
+    def __getitem__(
+            self,
+            index: int
+    ):
+        """Return the unmasked sample for *index*; pair-only fields are None without pairs."""
+        block_two = sentence_target = None
+        if self.has_pairs:
+            block_one, block_two, sentence_target = self.dataset[index]
+        else:
+            block_one = self.dataset[index]
+        return {
+            "id": index,
+            "block_one": block_one,
+            "block_two": block_two,
+            "sentence_target": sentence_target,
+        }
+
+    def __len__(self):
+        return len(self.dataset)
+
+    def _mask_block(
+            self,
+            sentence: np.ndarray,
+            mask_idx: int,
+            pad_idx: int,
+            dictionary_token_range: Tuple,
+    ):
+        """
+        Mask tokens for Masked Language Model training.
+
+        ``ceil(len(sentence) * self.masking_ratio)`` distinct positions are
+        sampled as prediction targets. Each sampled position is replaced by
+        ``mask_idx`` with probability ``self.masking_prob``, by a random
+        dictionary token with probability ``self.random_token_prob``, and is
+        otherwise left unchanged.
+
+        The global NumPy RNG is used; ``_collate`` seeds it for reproducibility.
+
+        Args:
+            sentence: 1d tensor to be masked
+            mask_idx: index to use for masking the sentence
+            pad_idx: index to use for masking the target for tokens we aren't
+                predicting
+            dictionary_token_range: ``(low, high)`` range of dictionary indices
+                usable for random replacement; ``high`` is exclusive
+        Return:
+            masked_sent: masked sentence
+            target: target with words which we are not predicting replaced
+                by pad_idx
+        """
+        masked_sent = np.copy(sentence)
+        target = np.copy(sentence)
+        sent_length = len(sentence)
+        mask_num = math.ceil(sent_length * self.masking_ratio)
+
+        # Boolean lookup table: the previous ``i in mask`` test scanned the
+        # whole index array for every token, i.e. quadratic in block length.
+        is_selected = np.zeros(sent_length, dtype=bool)
+        is_selected[np.random.choice(sent_length, mask_num, replace=False)] = True
+        random_threshold = self.masking_prob + self.random_token_prob
+
+        # Positions are visited in order so the RNG draw sequence is unchanged.
+        for i in range(sent_length):
+            if not is_selected[i]:
+                target[i] = pad_idx
+                continue
+            rand = np.random.random()
+            if rand < self.masking_prob:
+                # replace with the mask token (Eg: 80% of the time)
+                masked_sent[i] = mask_idx
+            elif rand < random_threshold:
+                # replace with a random token from the dictionary (Eg: 10%)
+                low, high = dictionary_token_range[0], dictionary_token_range[1]
+                masked_sent[i] = np.random.randint(low, high)
+
+        return masked_sent, target
+
+    def _collate(
+            self,
+            samples: List[Dict],
+            pad_idx: int,
+            eos_idx: int
+    ):
+        """
+        Does the heavy lifting for creating a batch from the input list of
+        examples. The logic is as follows:
+            1. Mask the input blocks. In case has_pair is True then we have 2
+               blocks to mask.
+            2. Prepend the first masked block tensor with the special token
+               used as sentence embedding. Eg: CLS in BERT. This happens
+               irrespective of the value of has_pair.
+            3. If has_pair is True, then append the first masked block with the
+               special separator token (eg: SEP for BERT) and compute segment
+               label accordingly. In this case, also append the second masked
+               block with this special separator token and compute its segment
+               label.
+            4. For the targets tensor, prepend and append with padding index
+               accordingly.
+            5. Concatenate all tensors.
+        """
+        if len(samples) == 0:
+            return {}
+        # To ensure determinism, we reset the state of the PRNG after every
+        # batch based on the seed and the first id of the batch. This ensures
+        # that across epochs we get the same mask for the same example. This
+        # is needed for reproducibility and is how BERT does masking
+        # TODO: Can we add determinism without this constraint?
+        with data_utils.numpy_seed(self.seed + samples[0]["id"]):
+            for s in samples:
+
+                # token range is needed for replacing with random token during
+                # masking
+                token_range = (self.vocab.nspecial, len(self.vocab))
+
+                # mask according to specified probabilities.
+                masked_blk_one, masked_tgt_one = self._mask_block(
+                    s["block_one"], self.mask_idx, self.pad_idx, token_range,
+                )
+
+                tokens = np.concatenate([
+                    [self.classif_token_idx], masked_blk_one
+                ])
+                targets = np.concatenate([[self.pad_idx], masked_tgt_one])
+                segments = np.ones(len(tokens)) * self.segment_id
+
+                # if has_pairs is True then we need to add the SEP token to both
+                # the blocks after masking and re-compute segments based on the new
+                # lengths.
+                if self.has_pairs:
+                    tokens_one = np.concatenate([tokens, [self.sep_token_idx]])
+                    targets_one = np.concatenate([targets, [self.pad_idx]])
+
+                    masked_blk_two, masked_tgt_two = self._mask_block(
+                        s["block_two"], self.mask_idx, self.pad_idx, token_range)
+                    tokens_two = np.concatenate(
+                        [masked_blk_two, [self.sep_token_idx]])
+                    targets_two = np.concatenate([masked_tgt_two, [self.pad_idx]])
+
+                    # block + 1 sep + 1 special (CLS)
+                    segments_one = np.zeros(len(tokens_one))
+                    # block + 1 sep
+                    segments_two = np.ones(len(tokens_two))
+
+                    tokens = np.concatenate([tokens_one, tokens_two])
+                    targets = np.concatenate([targets_one, targets_two])
+                    segments = np.concatenate([segments_one, segments_two])
+
+                s["source"] = torch.LongTensor(tokens)
+                s["segment_labels"] = torch.LongTensor(segments)
+                s["lm_target"] = torch.LongTensor(targets)
+
+        def merge(key):
+            return data_utils.collate_tokens(
+                [s[key] for s in samples], pad_idx, eos_idx, left_pad=False
+            )
+        return {
+            "id": torch.LongTensor([s["id"] for s in samples]),
+            "ntokens": sum(len(s["source"]) for s in samples),
+            "net_input": {
+                "src_tokens": merge("source"),
+                "segment_labels": merge("segment_labels"),
+            },
+            "lm_target": merge("lm_target"),
+            "sentence_target": torch.LongTensor(
+                [s["sentence_target"] for s in samples]
+            ) if self.has_pairs else None,
+            "nsentences": len(samples),
+        }
+
+    def collater(
+            self,
+            samples: List[Dict]
+    ):
+        """Merge a list of samples to form a mini-batch.
+
+        Args:
+            samples (List[dict]): samples to collate
+
+        Returns:
+            dict: a mini-batch of data
+        """
+        return self._collate(samples, self.vocab.pad(), self.vocab.eos())
+
+    def num_tokens(
+            self,
+            index: int
+    ):
+        """
+        Return the number of tokens in a sample. This value is used to
+        enforce max-tokens during batching.
+        """
+        return self.sizes[index]
+
+    def size(
+            self,
+            index: int
+    ):
+        """
+        Return an example's size as a float or tuple. This value is used when
+        filtering a dataset with max-positions.
+        """
+        return self.sizes[index]
+
+    def ordered_indices(self):
+        """
+        Return the index order used to build batches: a random permutation
+        when shuffling, otherwise indices sorted by size (index breaks ties).
+        """
+        num_examples = len(self)
+        if self.shuffle:
+            return np.random.permutation(num_examples)
+        # lexsort uses the LAST key as primary, so sizes dominate and the
+        # ascending index only breaks ties.
+        return np.lexsort([np.arange(num_examples), self.sizes])
+
+    @property
+    def supports_prefetch(self):
+        return getattr(self.dataset, "supports_prefetch", False)
+
+    def prefetch(self, indices):
+        self.dataset.prefetch(indices)
diff --git a/fairseq/data/legacy/masked_lm_dictionary.py b/fairseq/data/legacy/masked_lm_dictionary.py
new file mode 100644
index 0000000000000000000000000000000000000000..bff4bcb5ec085ced94aa198dff9c9a3bfe61b1ca
--- /dev/null
+++ b/fairseq/data/legacy/masked_lm_dictionary.py
@@ -0,0 +1,58 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from fairseq.data import Dictionary
+
+
+class MaskedLMDictionary(Dictionary):
+    """
+    Dictionary for Masked Language Modelling tasks. Extends Dictionary with a
+    mask symbol and counts every symbol present after construction as special.
+    """
+    def __init__(
+        self,
+        pad='<pad>',
+        eos='</s>',
+        unk='<unk>',
+        mask='<mask>',
+    ):
+        super().__init__(pad=pad, eos=eos, unk=unk)
+        self.mask_word = mask
+        self.mask_index = self.add_symbol(mask)
+        self.nspecial = len(self.symbols)
+
+    def mask(self):
+        """Return the index that ``add_symbol`` assigned to the mask symbol."""
+        return self.mask_index
+
+
+class BertDictionary(MaskedLMDictionary):
+    """
+    Dictionary for BERT task. Extends MaskedLMDictionary with cls and sep
+    symbols (added in that order) and re-counts the special symbols.
+    """
+    def __init__(
+        self,
+        pad='<pad>',
+        eos='</s>',
+        unk='<unk>',
+        mask='<mask>',
+        cls='<cls>',
+        sep='<sep>'
+    ):
+        super().__init__(pad=pad, eos=eos, unk=unk, mask=mask)
+        self.cls_word = cls
+        self.sep_word = sep
+        self.cls_index = self.add_symbol(cls)
+        self.sep_index = self.add_symbol(sep)
+        self.nspecial = len(self.symbols)
+
+    def cls(self):
+        """Return the index that ``add_symbol`` assigned to the cls symbol."""
+        return self.cls_index
+
+    def sep(self):
+        """Return the index that ``add_symbol`` assigned to the sep symbol."""
+        return self.sep_index
diff --git a/fairseq/data/list_dataset.py b/fairseq/data/list_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..b96bba343747280e1867d8368449458de1c91b1d
--- /dev/null
+++ b/fairseq/data/list_dataset.py
@@ -0,0 +1,33 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from . import BaseWrapperDataset
+
+
+class ListDataset(BaseWrapperDataset):
+    """Wraps an iterable *dataset*; batches are returned as uncollated lists."""
+    def __init__(self, dataset, sizes=None):
+        super().__init__(dataset)
+        self._sizes = sizes
+
+    def __iter__(self):
+        # Delegate iteration straight to the wrapped dataset.
+        yield from self.dataset
+
+    def collater(self, samples):
+        return samples
+
+    @property
+    def sizes(self):
+        return self._sizes
+
+    def num_tokens(self, index):
+        return self.sizes[index]
+
+    def size(self, index):
+        return self.sizes[index]
+
+    def set_epoch(self, epoch):
+        """Epoch changes are ignored (not forwarded to the wrapped dataset)."""
diff --git a/fairseq/data/lm_context_window_dataset.py b/fairseq/data/lm_context_window_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..17ba08bc7f5a7026e234967b20cde4e9d53a0069
--- /dev/null
+++ b/fairseq/data/lm_context_window_dataset.py
@@ -0,0 +1,78 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+import torch
+
+from fairseq.data.monolingual_dataset import MonolingualDataset
+
+from . import FairseqDataset
+
+
+class LMContextWindowDataset(FairseqDataset):
+    """Wraps a MonolingualDataset and provides more context for evaluation."""
+
+    def __init__(self, dataset, tokens_per_sample, context_window, pad_idx):
+        assert isinstance(dataset, MonolingualDataset)
+        assert context_window > 0
+        self.dataset = dataset
+        self.tokens_per_sample = tokens_per_sample
+        self.context_window = context_window
+        self.pad_idx = pad_idx
+        self.prev_tokens = np.empty([0])
+
+    def __getitem__(self, index):
+        return self.dataset[index]
+
+    def __len__(self):
+        return len(self.dataset)
+
+    def collater(self, samples):
+        sample = self.dataset.collater(samples)
+        # Widen every row by context_window and prepend tokens carried over from earlier rows.
+        pad = self.pad_idx
+        max_sample_len = self.tokens_per_sample + self.context_window
+        # NOTE: self.prev_tokens persists across calls, so batches must arrive in dataset order.
+        bsz, tsz = sample['net_input']['src_tokens'].shape
+        start_idxs = [0] * bsz
+        toks = sample['net_input']['src_tokens']
+        lengths = sample['net_input']['src_lengths']
+        tgt = sample['target']
+        new_toks = np.empty([bsz, tsz + self.context_window], dtype=np.int64)
+        new_tgt = np.full([bsz, tsz + self.context_window], pad, dtype=np.int64)
+        sample_lens = toks.ne(pad).long().sum(dim=1).cpu()
+        for i in range(bsz):
+            sample_len = sample_lens[i]
+            extra = len(self.prev_tokens) + sample_len - max_sample_len
+            if extra > 0:
+                self.prev_tokens = self.prev_tokens[extra:]
+            pads = np.full(self.context_window - len(self.prev_tokens), pad)
+            new_toks[i] = np.concatenate([self.prev_tokens, toks[i].numpy(), pads])
+            new_tgt[i, len(self.prev_tokens):len(self.prev_tokens) + len(tgt[i])] = tgt[i]
+            start_idxs[i] = len(self.prev_tokens)
+            lengths[i] += len(self.prev_tokens)
+            self.prev_tokens = new_toks[i][new_toks[i] != pad][-self.context_window:]
+        sample['net_input']['src_tokens'] = torch.from_numpy(new_toks)
+        sample['target'] = torch.from_numpy(new_tgt)
+        sample['start_indices'] = start_idxs
+        # start_indices[i] is the number of context tokens prepended to row i.
+        return sample
+
+    def num_tokens(self, index):
+        return self.dataset.num_tokens(index)
+
+    def size(self, index):
+        return self.dataset.size(index)
+
+    def ordered_indices(self):
+        # NOTE we don't shuffle the data to retain access to the previous dataset elements
+        return np.arange(len(self.dataset))
+
+    @property
+    def supports_prefetch(self):
+        return getattr(self.dataset, 'supports_prefetch', False)
+
+    def prefetch(self, indices):
+        return self.dataset.prefetch(indices)
diff --git a/fairseq/data/lru_cache_dataset.py b/fairseq/data/lru_cache_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..833a2c75cb7c2280dd63a80c020f16b385f9317d
--- /dev/null
+++ b/fairseq/data/lru_cache_dataset.py
@@ -0,0 +1,22 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from functools import lru_cache
+
+from . import BaseWrapperDataset
+
+
+class LRUCacheDataset(BaseWrapperDataset):
+    """Wrapper that memoizes the 8 most recently fetched items; *token* is unused."""
+    def __init__(self, dataset, token=None):
+        super().__init__(dataset)
+
+    @lru_cache(maxsize=8)
+    def __getitem__(self, index):
+        return self.dataset[index]
+
+    # Not cached: *samples* is a list (unhashable), so lru_cache raised TypeError.
+    def collater(self, samples):
+        return self.dataset.collater(samples)
diff --git a/fairseq/data/mask_tokens_dataset.py b/fairseq/data/mask_tokens_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..28bc3bc9cf0bf32c22560f3c5d4c26b41fae8684
--- /dev/null
+++ b/fairseq/data/mask_tokens_dataset.py
@@ -0,0 +1,173 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from functools import lru_cache
+
+import numpy as np
+import torch
+
+from fairseq.data import data_utils, Dictionary
+
+from . import BaseWrapperDataset, LRUCacheDataset
+
+
+class MaskTokensDataset(BaseWrapperDataset):
+    """
+    A wrapper Dataset for masked language modeling.
+
+    Input items are masked according to the specified masking probability.
+
+    Args:
+        dataset: Dataset to wrap.
+        sizes: Sentence lengths
+        vocab: Dictionary with the vocabulary and special tokens.
+        pad_idx: Id of pad token in vocab
+        mask_idx: Id of mask token in vocab
+        return_masked_tokens: controls whether to return the non-masked tokens
+            (the default) or to return a tensor with the original masked token
+            IDs (and *pad_idx* elsewhere). The latter is useful as targets for
+            masked LM training.
+        seed: Seed for random number generator for reproducibility.
+        mask_prob: probability of replacing a token with *mask_idx*.
+        leave_unmasked_prob: probability that a masked token is unmasked.
+        random_token_prob: probability of replacing a masked token with a
+            random token from the vocabulary.
+        freq_weighted_replacement: sample random replacement words based on
+            word frequencies in the vocab.
+        mask_whole_words: only mask whole words. This should be a byte mask
+            over vocab indices, indicating whether it is the beginning of a
+            word. We will extend any mask to encompass the whole word.
+        bpe: BPE to use for whole-word masking.
+    """
+
+    @classmethod
+    def apply_mask(cls, dataset: torch.utils.data.Dataset, *args, **kwargs):
+        """Return the source and target datasets for masked LM training."""
+        dataset = LRUCacheDataset(dataset)
+        return (
+            LRUCacheDataset(cls(dataset, *args, **kwargs, return_masked_tokens=False)),
+            LRUCacheDataset(cls(dataset, *args, **kwargs, return_masked_tokens=True)),
+        )
+
+    def __init__(
+        self,
+        dataset: torch.utils.data.Dataset,
+        vocab: Dictionary,
+        pad_idx: int,
+        mask_idx: int,
+        return_masked_tokens: bool = False,
+        seed: int = 1,
+        mask_prob: float = 0.15,
+        leave_unmasked_prob: float = 0.1,
+        random_token_prob: float = 0.1,
+        freq_weighted_replacement: bool = False,
+        mask_whole_words: torch.Tensor = None,
+    ):
+        assert 0.0 < mask_prob < 1.0
+        assert 0.0 <= random_token_prob <= 1.0
+        assert 0.0 <= leave_unmasked_prob <= 1.0
+        assert random_token_prob + leave_unmasked_prob <= 1.0
+
+        self.dataset = dataset
+        self.vocab = vocab
+        self.pad_idx = pad_idx
+        self.mask_idx = mask_idx
+        self.return_masked_tokens = return_masked_tokens
+        self.seed = seed
+        self.mask_prob = mask_prob
+        self.leave_unmasked_prob = leave_unmasked_prob
+        self.random_token_prob = random_token_prob
+        self.mask_whole_words = mask_whole_words
+
+        if random_token_prob > 0.0:
+            if freq_weighted_replacement:
+                weights = np.array(self.vocab.count)
+            else:
+                weights = np.ones(len(self.vocab))
+            weights[:self.vocab.nspecial] = 0
+            self.weights = weights / weights.sum()
+
+        self.epoch = 0
+
+    def set_epoch(self, epoch, **unused):
+        super().set_epoch(epoch)
+        self.epoch = epoch
+
+    def __getitem__(self, index: int):
+        """Return the masked item (or its targets) for *index* in the current epoch."""
+        # seed and epoch are part of the cache key so an item cached in a
+        # previous epoch (different mask) is never served after set_epoch().
+        return self._getitem_cached(self.seed, self.epoch, index)
+
+    @lru_cache(maxsize=8)
+    def _getitem_cached(self, seed: int, epoch: int, index: int):
+        """Build the item for *index*; the arguments double as the RNG seed."""
+        with data_utils.numpy_seed(seed, epoch, index):
+            item = self.dataset[index]
+            sz = len(item)
+
+            assert self.mask_idx not in item, \
+                'Dataset contains mask_idx (={}), this is not expected!'.format(self.mask_idx)
+
+            if self.mask_whole_words is not None:
+                # mask at word granularity: sz becomes the number of word starts
+                word_begins_mask = self.mask_whole_words.gather(0, item)
+                word_begins_idx = word_begins_mask.nonzero().view(-1)
+                sz = len(word_begins_idx)
+                words = np.split(word_begins_mask, word_begins_idx)[1:]
+                assert len(words) == sz
+                word_lens = list(map(len, words))
+
+            # decide elements to mask; the random term gives probabilistic rounding
+            mask = np.full(sz, False)
+            num_mask = int(self.mask_prob * sz + np.random.rand())
+            mask[np.random.choice(sz, num_mask, replace=False)] = True
+
+            if self.return_masked_tokens:
+                # exit early if we're just returning the masked tokens
+                # (i.e., the targets for masked LM training)
+                if self.mask_whole_words is not None:
+                    mask = np.repeat(mask, word_lens)
+                new_item = np.full(len(mask), self.pad_idx)
+                new_item[mask] = item[torch.from_numpy(mask.astype(np.uint8)) == 1]
+                return torch.from_numpy(new_item)
+
+            # decide unmasking and random replacement
+            rand_or_unmask_prob = self.random_token_prob + self.leave_unmasked_prob
+            if rand_or_unmask_prob > 0.0:
+                rand_or_unmask = mask & (np.random.rand(sz) < rand_or_unmask_prob)
+                if self.random_token_prob == 0.0:
+                    unmask = rand_or_unmask
+                    rand_mask = None
+                elif self.leave_unmasked_prob == 0.0:
+                    unmask = None
+                    rand_mask = rand_or_unmask
+                else:
+                    unmask_prob = self.leave_unmasked_prob / rand_or_unmask_prob
+                    decision = np.random.rand(sz) < unmask_prob
+                    unmask = rand_or_unmask & decision
+                    rand_mask = rand_or_unmask & (~decision)
+            else:
+                unmask = rand_mask = None
+
+            if unmask is not None:
+                mask = mask ^ unmask
+
+            if self.mask_whole_words is not None:
+                mask = np.repeat(mask, word_lens)
+
+            new_item = np.copy(item)
+            new_item[mask] = self.mask_idx
+            if rand_mask is not None:
+                num_rand = rand_mask.sum()
+                if num_rand > 0:
+                    if self.mask_whole_words is not None:
+                        rand_mask = np.repeat(rand_mask, word_lens)
+                        num_rand = rand_mask.sum()
+
+                    new_item[rand_mask] = np.random.choice(
+                        len(self.vocab), num_rand, p=self.weights)
+
+            return torch.from_numpy(new_item)
diff --git a/fairseq/data/monolingual_dataset.py b/fairseq/data/monolingual_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..76c377237470fe01331bddc1a5d9f059641e817e
--- /dev/null
+++ b/fairseq/data/monolingual_dataset.py
@@ -0,0 +1,200 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+import torch
+
+from . import data_utils, FairseqDataset
+
+
+def collate(samples, pad_idx, eos_idx):
+    """Right-pad and stack monolingual samples into a mini-batch dict ({} if empty)."""
+    if len(samples) == 0:
+        return {}
+
+    # Every tensor list is right-padded with pad_idx (left_pad=False).
+    def pad_and_stack(tensors):
+        return data_utils.collate_tokens(tensors, pad_idx, eos_idx, left_pad=False)
+
+    sources = [s['source'] for s in samples]
+    src_tokens = pad_and_stack(sources)
+
+    first_target = samples[0]['target']
+    if first_target is None:
+        # No explicit targets: the padded source doubles as the target.
+        target = src_tokens
+    elif isinstance(first_target, list):
+        # Several target streams per sample: collate each stream separately.
+        target = [
+            pad_and_stack([s['target'][i] for s in samples])
+            for i in range(len(first_target))
+        ]
+    else:
+        target = pad_and_stack([s['target'] for s in samples])
+
+    return {
+        'id': torch.LongTensor([s['id'] for s in samples]),
+        'nsentences': len(samples),
+        'ntokens': sum(len(src) for src in sources),
+        'net_input': {
+            'src_tokens': src_tokens,
+            # Lengths are the unpadded token counts.
+            'src_lengths': torch.LongTensor([src.numel() for src in sources]),
+        },
+        'target': target,
+    }
+
+
+class MonolingualDataset(FairseqDataset):
+    """
+    A wrapper around torch.utils.data.Dataset for monolingual data.
+
+    Args:
+        dataset (torch.utils.data.Dataset): dataset to wrap
+        sizes (List[int]): sentence lengths
+        vocab (~fairseq.data.Dictionary): vocabulary
+        shuffle (bool, optional): shuffle the elements before batching
+            (default: True).
+    """
+
+    def __init__(self, dataset, sizes, src_vocab, tgt_vocab, add_eos_for_other_targets, shuffle,
+                 targets=None, add_bos_token=False):
+        self.dataset = dataset
+        self.sizes = np.array(sizes)
+        self.vocab = src_vocab
+        self.tgt_vocab = tgt_vocab
+        self.add_eos_for_other_targets = add_eos_for_other_targets
+        self.shuffle = shuffle
+        self.add_bos_token = add_bos_token
+
+        assert targets is None or all(t in {'self', 'future', 'past'} for t in targets), \
+            "targets must be none or one of 'self', 'future', 'past'"
+        if targets is not None and len(targets) == 0:
+            targets = None
+        self.targets = targets
+
+    def __getitem__(self, index):
+        if self.targets is not None:
+            # *future_target* is the original sentence
+            # *source* is shifted right by 1 (maybe left-padded with eos)
+            # *past_target* is shifted right by 2 (left-padded as needed)
+            #
+            # Left-to-right language models should condition on *source* and
+            # predict *future_target*.
+            # Right-to-left language models should condition on *source* and
+            # predict *past_target*.
+            source, future_target, past_target = self.dataset[index]
+            source, target = self._make_source_target(source, future_target, past_target)
+        else:
+            source = self.dataset[index]
+            target = None
+        source, target = self._maybe_add_bos(source, target)
+        return {'id': index, 'source': source, 'target': target}
+
+    def __len__(self):
+        return len(self.dataset)
+
+    def _make_source_target(self, source, future_target, past_target):
+        if self.targets is not None:
+            target = []
+
+            if self.add_eos_for_other_targets and (('self' in self.targets) or ('past' in self.targets)) \
+                    and source[-1] != self.vocab.eos():
+                # append eos at the end of source
+                source = torch.cat([source, source.new([self.vocab.eos()])])
+
+                if 'future' in self.targets:
+                    future_target = torch.cat([future_target, future_target.new([self.vocab.pad()])])
+                if 'past' in self.targets:
+                    # first token is before the start of sentence which is only used in "none" break mode when
+                    # add_eos_for_other_targets is False
+                    past_target = torch.cat([past_target.new([self.vocab.pad()]), past_target[1:], source[-2, None]])
+
+            for t in self.targets:
+                if t == 'self':
+                    target.append(source)
+                elif t == 'future':
+                    target.append(future_target)
+                elif t == 'past':
+                    target.append(past_target)
+                else:
+                    raise Exception('invalid target ' + t)
+
+            if len(target) == 1:
+                target = target[0]
+        else:
+            target = future_target
+
+        return source, self._filter_vocab(target)
+
+    def _maybe_add_bos(self, source, target):
+        if self.add_bos_token:
+            source = torch.cat([source.new([self.vocab.bos()]), source])
+            if target is not None:
+                target = torch.cat([target.new([self.tgt_vocab.bos()]), target])
+        return source, target
+
+    def _filter_vocab(self, target):
+        """Map target ids outside ``tgt_vocab`` to its unk id (in place)."""
+        if len(self.tgt_vocab) == len(self.vocab):
+            return target
+
+        def to_unk(tensor):
+            # Ids >= len(tgt_vocab) have no entry in the target vocabulary.
+            out_of_vocab = tensor.ge(len(self.tgt_vocab))
+            if out_of_vocab.any():
+                tensor[out_of_vocab] = self.tgt_vocab.unk()
+            return tensor
+        return [to_unk(t) for t in target] if isinstance(target, list) else to_unk(target)
+
+    def collater(self, samples):
+        """Merge a list of samples to form a mini-batch.
+
+        Args:
+            samples (List[dict]): samples to collate
+
+        Returns:
+            dict: a mini-batch with the following keys:
+
+                - `id` (LongTensor): example IDs in the original input order
+                - `ntokens` (int): total number of tokens in the batch
+                - `net_input` (dict): the input to the Model, containing keys:
+
+                  - `src_tokens` (LongTensor): a padded 2D Tensor of tokens in
+                    the source sentence of shape `(bsz, src_len)`. Padding will
+                    appear on the right.
+
+                - `target` (LongTensor): a padded 2D Tensor of tokens in the
+                  target sentence of shape `(bsz, tgt_len)`. Padding will appear
+                  on the right.
+        """
+        return collate(samples, self.vocab.pad(), self.vocab.eos())
+
+    def num_tokens(self, index):
+        """Return the number of tokens in a sample. This value is used to
+        enforce ``--max-tokens`` during batching."""
+        return self.sizes[index]
+
+    def size(self, index):
+        """Return an example's size as a float or tuple. This value is used when
+        filtering a dataset with ``--max-positions``."""
+        return self.sizes[index]
+
+    def ordered_indices(self):
+        """Return indices sorted by example size; ties are broken randomly when
+        shuffling and by ascending index otherwise."""
+        count = len(self)
+        # lexsort treats the LAST key as primary, so sizes dominate the order.
+        tie_breaker = (
+            np.random.permutation(count) if self.shuffle else np.arange(count)
+        )
+        return np.lexsort([tie_breaker, self.sizes])
+
+    @property
+    def supports_prefetch(self):
+        return getattr(self.dataset, 'supports_prefetch', False)
+
+    def prefetch(self, indices):
+        self.dataset.prefetch(indices)
diff --git a/fairseq/data/multi_corpus_dataset.py b/fairseq/data/multi_corpus_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..02d269a17c176187fea30680c33b22a0850326d4
--- /dev/null
+++ b/fairseq/data/multi_corpus_dataset.py
@@ -0,0 +1,149 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+from collections import OrderedDict
+from typing import Dict, List
+
+import numpy as np
+from fairseq.data import data_utils
+
+from . import FairseqDataset
+
+
+logger = logging.getLogger(__name__)
+
+
+class MultiCorpusDataset(FairseqDataset):
+    """
+    Stores multiple instances of FairseqDataset together. Requires each instance
+    to be the same dataset type, as the collate method needs to work on batches
+    with samples from each dataset.
+
+    Allows specifying a distribution over the datasets to use. Note that unlike
+    MultiCorpusSampledDataset, this distribution allows sampling for each item,
+    rather than on a batch level.
+
+    Each time ordered_indices() is called, a new sample is generated with
+    the specified distribution.
+
+    Args:
+        datasets: an OrderedDict of FairseqDataset instances.
+        distribution: a List containing the probability of getting an utterance from
+                        corresponding dataset
+        seed: base seed; it is combined with the current epoch to seed numpy
+            while ordered_indices() draws its sample.
+    """
+
+    def __init__(
+        self, datasets: Dict[str, FairseqDataset], distribution: List[float], seed: int
+    ):
+        super().__init__()
+        assert isinstance(datasets, OrderedDict)
+        assert len(datasets) == len(distribution)
+        self.datasets = datasets
+        self.distribution = distribution
+        self.seed = seed
+        # ordered_indices() reads self.epoch. Give it a value up front so the
+        # dataset does not raise AttributeError when set_epoch() has not been
+        # called yet (1 is assumed to be the first epoch number -- TODO confirm).
+        self.epoch = 1
+
+        # Avoid repeated conversions to list later
+        self.dataset_list = list(datasets.values())
+        self.total_num_instances = 0
+
+        first_dataset = self.dataset_list[0]
+
+        # dataset_offsets[i] is the global index of the first item of dataset i.
+        self.dataset_offsets = []
+        for dataset in self.dataset_list:
+            assert isinstance(dataset, FairseqDataset)
+            assert type(dataset) is type(first_dataset)
+            self.dataset_offsets.append(self.total_num_instances)
+            self.total_num_instances += len(dataset)
+
+    def ordered_indices(self):
+        """
+        Draw ``len(self)`` global indices, picking the source dataset of each
+        one according to ``self.distribution``. The draw is seeded with
+        ``(self.seed, self.epoch)``.
+        """
+        with data_utils.numpy_seed(self.seed, self.epoch):
+            # Used to store the order of indices of each dataset to use
+            indices = [
+                np.random.permutation(len(dataset)) for dataset in self.dataset_list
+            ]
+            # Keep track of which samples we've used for each dataset
+            counters = [0] * len(self.dataset_list)
+
+            return np.array(
+                [
+                    self._sample(indices, counters)
+                    for _ in range(self.total_num_instances)
+                ],
+                dtype=np.int64,
+            )
+
+    def _sample(self, indices, counters):
+        """
+        Return one global index. ``indices`` (per-dataset permutations) and
+        ``counters`` (per-dataset read positions) are updated in place.
+        """
+        # First pick dataset
+        dataset_idx = np.random.choice(len(self.distribution), p=self.distribution)
+
+        # Then get dataset internal index and convert to multi-datasets index
+        idx = (
+            indices[dataset_idx][counters[dataset_idx]]
+            + self.dataset_offsets[dataset_idx]
+        )
+
+        counters[dataset_idx] += 1
+
+        # Reset if we reach end: start over with a fresh permutation
+        dataset_len = len(self.dataset_list[dataset_idx])
+        if counters[dataset_idx] == dataset_len:
+            counters[dataset_idx] = 0
+            indices[dataset_idx] = np.random.permutation(dataset_len)
+
+        return idx
+
+    def _map_index(self, index: int):
+        """
+        If dataset A has length N and dataset B has length M
+        then index 1 maps to index 1 of dataset A, and index N + 1
+        maps to index 1 of B.
+
+        Returns:
+            a ``(local_index, dataset_key)`` tuple
+
+        Raises:
+            ValueError: if ``index`` is not smaller than ``len(self)``
+        """
+        counter = 0
+        for key, dataset in self.datasets.items():
+            if index < counter + len(dataset):
+                return index - counter, key
+            counter += len(dataset)
+        raise ValueError(
+            "Invalid index: {}, max: {}".format(index, self.total_num_instances)
+        )
+
+    def __len__(self):
+        """
+        Length of this dataset is the sum of individual datasets
+        """
+        return self.total_num_instances
+
+    def __getitem__(self, index):
+        index, key = self._map_index(index)
+        return self.datasets[key][index]
+
+    def collater(self, samples):
+        """
+        Since we enforce all datasets to be the same, collating is just
+        picking the first one and doing collate.
+        """
+        if len(samples) == 0:
+            return None
+
+        return self.dataset_list[0].collater(samples)
+
+    def num_tokens(self, index: int):
+        index, key = self._map_index(index)
+        return self.datasets[key].num_tokens(index)
+
+    def size(self, index: int):
+        index, key = self._map_index(index)
+        return self.datasets[key].size(index)
+
+    def set_epoch(self, epoch, **unused):
+        super().set_epoch(epoch)
+        # Remembered so that ordered_indices() re-seeds differently per epoch.
+        self.epoch = epoch
+
+    @property
+    def supports_prefetch(self):
+        return False
diff --git a/fairseq/data/multi_corpus_sampled_dataset.py b/fairseq/data/multi_corpus_sampled_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..ad8e951cc905a73fea28b4fac449e307cadfa52f
--- /dev/null
+++ b/fairseq/data/multi_corpus_sampled_dataset.py
@@ -0,0 +1,145 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from collections import OrderedDict
+from typing import Callable, Dict, List
+
+import numpy as np
+
+from . import FairseqDataset
+
+
+def uniform_sampler(x):
+    """Pick one element of ``x`` uniformly at random via ``np.random.choice``."""
+    drawn = np.random.choice(x, size=1)
+    return drawn.item()
+
+
+class MultiCorpusSampledDataset(FairseqDataset):
+    """
+    Stores multiple instances of FairseqDataset together and in every iteration
+    creates a batch by first sampling a dataset according to a specified
+    probability distribution and then getting instances from that dataset.
+
+    Args:
+        datasets: an OrderedDict of FairseqDataset instances.
+        sampling_func: A function for sampling over list of dataset keys;
+            it receives the list of keys and returns the selected key.
+            The default strategy is to sample uniformly.
+    """
+
+    def __init__(
+        self,
+        datasets: Dict[str, FairseqDataset],
+        sampling_func: Callable[[List], int] = None,
+    ):
+        super().__init__()
+        assert isinstance(datasets, OrderedDict)
+        self.datasets = datasets
+        if sampling_func is None:
+            sampling_func = uniform_sampler
+        self.sampling_func = sampling_func
+
+        self.total_num_instances = 0
+        for dataset in datasets.values():
+            assert isinstance(dataset, FairseqDataset)
+            self.total_num_instances += len(dataset)
+
+        # Per-dataset orderings; filled lazily by ordered_indices().
+        self._ordered_indices = None
+
+    def __len__(self):
+        """
+        Length of this dataset is the sum of individual datasets
+        """
+        return self.total_num_instances
+
+    def ordered_indices(self):
+        """
+        Ordered indices for batching. Here we call the underlying
+        dataset's ordered_indices() so that we get the same random ordering
+        as we would have from using the underlying dataset directly.
+
+        The per-dataset orderings are computed once and cached; the value
+        returned is always ``np.arange(len(self))``.
+        """
+        if self._ordered_indices is None:
+            self._ordered_indices = OrderedDict(
+                (key, dataset.ordered_indices())
+                for key, dataset in self.datasets.items()
+            )
+        return np.arange(len(self))
+
+    def _map_index_to_dataset(self, key: str, index: int):
+        """
+        Different underlying datasets have different lengths. In order to ensure
+        we are not accessing an index outside the range of the current dataset
+        size, we wrap around. This function should be called after we have
+        created an ordering for this and all underlying datasets.
+
+        Args:
+            key: key of the underlying dataset in ``self.datasets``
+            index: index into this (combined) dataset
+        """
+        assert (
+            self._ordered_indices is not None
+        ), "Must call MultiCorpusSampledDataset.ordered_indices() first"
+        mapped_index = index % len(self.datasets[key])
+        return self._ordered_indices[key][mapped_index]
+
+    def __getitem__(self, index: int):
+        """
+        Get the item associated with index from each underlying dataset.
+        Since index is in the range of [0, TotalNumInstances), we need to
+        map the index to the dataset before retrieving the item.
+
+        Returns:
+            an OrderedDict mapping each dataset key to that dataset's item
+        """
+        return OrderedDict(
+            (key, dataset[self._map_index_to_dataset(key, index)])
+            for key, dataset in self.datasets.items()
+        )
+
+    def collater(self, samples: List[Dict]):
+        """
+        Generate a mini-batch for this dataset.
+        To convert this into a regular mini-batch we use the following
+        logic:
+            1. Select a dataset using the specified probability distribution.
+            2. Call the collater function of the selected dataset.
+        """
+        if len(samples) == 0:
+            return None
+
+        selected_key = self.sampling_func(list(self.datasets.keys()))
+        selected_samples = [sample[selected_key] for sample in samples]
+        return self.datasets[selected_key].collater(selected_samples)
+
+    def num_tokens(self, index: int):
+        """
+        Return an example's length (number of tokens), used for batching. Here
+        we return the max across all examples at index across all underlying
+        datasets.
+        """
+        return max(
+            dataset.num_tokens(self._map_index_to_dataset(key, index))
+            for key, dataset in self.datasets.items()
+        )
+
+    def size(self, index: int):
+        """
+        Return an example's size as a float or tuple. Here we return the max
+        across all underlying datasets. This value is used when filtering a
+        dataset with max-positions.
+        """
+        return max(
+            dataset.size(self._map_index_to_dataset(key, index))
+            for key, dataset in self.datasets.items()
+        )
+
+    @property
+    def supports_prefetch(self):
+        """True only when every underlying dataset supports prefetching."""
+        return all(
+            getattr(dataset, "supports_prefetch", False)
+            for dataset in self.datasets.values()
+        )
+
+    def prefetch(self, indices):
+        """Prefetch the mapped counterpart of ``indices`` in every underlying dataset."""
+        for key, dataset in self.datasets.items():
+            dataset.prefetch(
+                [self._map_index_to_dataset(key, index) for index in indices]
+            )
diff --git a/fairseq/data/multilingual/__init__.py b/fairseq/data/multilingual/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..6264236915a7269a4d920ee8213004374dd86a9a
--- /dev/null
+++ b/fairseq/data/multilingual/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
diff --git a/fairseq/data/multilingual/__pycache__/__init__.cpython-310.pyc b/fairseq/data/multilingual/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..54cb89885e6765ef5d75a07cc0955d6781d67cee
Binary files /dev/null and b/fairseq/data/multilingual/__pycache__/__init__.cpython-310.pyc differ
diff --git a/fairseq/data/multilingual/__pycache__/multilingual_data_manager.cpython-310.pyc b/fairseq/data/multilingual/__pycache__/multilingual_data_manager.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..890640754681ebbcf3deb6e47d6292469995e302
Binary files /dev/null and b/fairseq/data/multilingual/__pycache__/multilingual_data_manager.cpython-310.pyc differ
diff --git a/fairseq/data/multilingual/__pycache__/sampled_multi_dataset.cpython-310.pyc b/fairseq/data/multilingual/__pycache__/sampled_multi_dataset.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e9246b8078356a443c46a02773e990a6ba6b6305
Binary files /dev/null and b/fairseq/data/multilingual/__pycache__/sampled_multi_dataset.cpython-310.pyc differ
diff --git a/fairseq/data/multilingual/__pycache__/sampled_multi_epoch_dataset.cpython-310.pyc b/fairseq/data/multilingual/__pycache__/sampled_multi_epoch_dataset.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3686a68c049f73511ba8b5ea0b253bddbe733fcf
Binary files /dev/null and b/fairseq/data/multilingual/__pycache__/sampled_multi_epoch_dataset.cpython-310.pyc differ
diff --git a/fairseq/data/multilingual/__pycache__/sampling_method.cpython-310.pyc b/fairseq/data/multilingual/__pycache__/sampling_method.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..821bed25db6dc5330c3623a2fd0b8d53fca12869
Binary files /dev/null and b/fairseq/data/multilingual/__pycache__/sampling_method.cpython-310.pyc differ
diff --git a/fairseq/data/multilingual/multilingual_data_manager.py b/fairseq/data/multilingual/multilingual_data_manager.py
new file mode 100644
index 0000000000000000000000000000000000000000..b31028c1912e3f88131609b4094f08f20ab3625a
--- /dev/null
+++ b/fairseq/data/multilingual/multilingual_data_manager.py
@@ -0,0 +1,836 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import itertools
+import logging
+import os
+
+import numpy as np
+from collections import OrderedDict
+
+import json
+from fairseq import options, utils
+from fairseq.options import eval_str_dict, csv_str_list
+
+from fairseq.data import (
+    Dictionary,
+    AppendTokenDataset,
+    ConcatDataset,
+    data_utils,
+    indexed_dataset,
+    LanguagePairDataset,
+    PrependTokenDataset,
+    StripTokenDataset,
+    TruncateDataset,
+    SampledMultiDataset,
+    TransformEosLangPairDataset,
+    SampledMultiEpochDataset,
+)
+from fairseq.data.multilingual.sampled_multi_dataset import CollateFormat
+from fairseq.file_io import PathManager
+
+logger = logging.getLogger(__name__)
+
+
+def _lang_token(lang: str, style='__{}__'):
+    """Render the language token for ``lang`` by filling the ``style`` template."""
+    token = style.format(lang)
+    return token
+
+
+def _lang_token_index(dic: Dictionary, lang: str, style='__{}__'):
+    """Look up the index of the language token for ``lang`` in ``dic``.
+
+    Fails an assertion when the lookup yields ``dic.unk_index``.
+    """
+    token = _lang_token(lang, style)
+    position = dic.index(token)
+    assert position != dic.unk_index, \
+        'cannot find language token for lang {}'.format(lang)
+    return position
+
+
+def _lang_id(dic: Dictionary, lang: str):
+    """Look up the index of the bare language name ``lang`` in ``dic``.
+
+    Fails an assertion when the lookup yields ``dic.unk_index``.
+    """
+    position = dic.index(lang)
+    assert position != dic.unk_index, \
+        'cannot find language ID for lang {}'.format(lang)
+    return position
+
+
+def load_sampling_weights(from_file):
+    """Read sampling weights from a JSON file.
+
+    Args:
+        from_file: path of the JSON file to read
+
+    Returns:
+        the decoded JSON document
+    """
+    # JSON text is UTF-8 by specification; do not depend on the locale's
+    # default encoding when decoding it.
+    with open(from_file, encoding='utf-8') as f:
+        return json.load(f)
+
+
+class MultilingualDatasetManager(object):
+    def __init__(self, args, lang_pairs, langs, dicts, sampling_method):
+        """Store the configuration and build the language-ID dictionary.
+
+        ``args.seed`` is kept as ``self.seed``; ``self.lang_dict`` is built from
+        ``langs`` with ``create_lang_dictionary``.
+        """
+        super().__init__()
+        self.args = args
+        self.seed = args.seed
+        self.lang_pairs, self.langs, self.dicts = lang_pairs, langs, dicts
+        self.sampling_method = sampling_method
+        self.sampling_scheduler = None
+        # Shard bookkeeping starts out empty.
+        self._num_shards_dict = {}
+        self._has_sharded_data = False
+        self.lang_dict = self.create_lang_dictionary(self.langs)
+
+    @classmethod
+    def setup_data_manager(cls, args, lang_pairs, langs, dicts, sampling_method):
+        """Alternate constructor: build a data manager from the given arguments.
+
+        Instantiates ``cls`` rather than a hard-coded class, so subclasses that
+        call this factory receive an instance of themselves.
+        """
+        return cls(args, lang_pairs, langs, dicts, sampling_method)
+
+    @staticmethod
+    def add_args(parser):
+        """Register the multilingual data options on ``parser``."""
+        parser.add_argument('data', help='colon separated path to data directories list, \
+                            will be iterated upon during epochs in round-robin manner')
+        parser.add_argument('--langs', default=None, type=csv_str_list,
+                            help='a comma separated list of languages which can appear in lang-pairs; '
+                                 'note that the ordering determines language token IDs',
+                            )
+        parser.add_argument('--lang-dict', default=None, type=str,
+                            help='an external file which contains a list of '
+                                 'languages which can appear in lang-pairs; '
+                                 'note that the ordering determines language token IDs; '
+                                 '--langs and --lang-dict are two exclusive options')
+        parser.add_argument('--lang-tok-style', default='multilingual',
+                            type=str, choices=['multilingual', 'mbart'],
+                            help='language token styles')
+
+        parser.add_argument('--load-alignments', action='store_true',
+                            help='load the binarized alignments')
+        parser.add_argument('--left-pad-source', default='True', type=str, metavar='BOOL',
+                            help='pad the source on the left')
+        parser.add_argument('--left-pad-target', default='False', type=str, metavar='BOOL',
+                            help='pad the target on the left')
+        parser.add_argument('--max-source-positions', default=1024, type=int, metavar='N',
+                            help='max number of tokens in the source sequence')
+        parser.add_argument('--max-target-positions', default=1024, type=int, metavar='N',
+                            help='max number of tokens in the target sequence')
+        parser.add_argument('--upsample-primary', default=1, type=int,
+                            help='amount to upsample primary dataset')
+        parser.add_argument('--truncate-source', action='store_true', default=False,
+                            help='truncate source to max-source-positions')
+        parser.add_argument('--encoder-langtok', default=None, type=str, choices=['src', 'tgt'],
+                            metavar='SRCTGT',
+                            help='prepend to the beginning of source sentence the source or target '
+                                 'language token. (src/tgt)')
+        parser.add_argument('--decoder-langtok', action='store_true',
+                            help='prepend to the beginning of target sentence the target language token')
+        parser.add_argument('--lang-tok-replacing-bos-eos', action='store_true', default=False)
+        parser.add_argument('--enable-lang-ids', default=False, action='store_true',
+                            help='whether to include language IDs in samples')
+        # NOTE: the option name below is misspelled ("reservsed"); it is kept
+        # as-is because renaming it would break existing command lines.
+        parser.add_argument('--enable-reservsed-directions-shared-datasets', default=False, action='store_true',
+                            help='whether to allow datasets be used in reversed directions')
+
+        parser.add_argument('--extra-data', help='a dictionary of data name to this path, \
+                            e.g. {"mined": path_to_mined_data, "denoised": path_to_denoised_data}',
+                            type=lambda uf: eval_str_dict(uf, type=str),
+                            default=None)
+        parser.add_argument('--extra-lang-pairs', help='a dictionary of data name to the language pairs they serve, \
+                            e.g. {"mined": comma-separated-lang-pairs, "denoised":  comma-separated-lang-pairs}',
+                            type=lambda uf: eval_str_dict(uf, type=str),
+                            default=None)
+        parser.add_argument('--langtoks-specs',
+                            help='a list of comma separated data types that a set of language tokens to be specialized for, \
+                            e.g. "main,dae,mined". There will be a set of language tokens added to the vocab to \
+                            distinguish languages in different training data types. If not specified, default language \
+                            tokens per languages will be added',
+                            default='main',
+                            type=csv_str_list,
+                            )
+        parser.add_argument('--langtoks', help='a dictionary of how to add language tokens, \
+                            e.g. {"mined": (None, "tgt"), "mono_dae": ("src.dae", "tgt"), "main": \
+                            ("src", "tgt")}, or {"mined": ("src.mined", "tgt")}',
+                            default=None,
+                            type=lambda uf: eval_str_dict(uf, type=str),
+                            )
+        parser.add_argument('--sampling-weights-from-file',
+                            help='a file containing a python dictionary of how to sample data sets, \
+                                e.g. { "main:en_XX-es_XX": 0.2, "mined:en_XX-pt_XX": 0.5, \
+                                    "mono_dae:es_XX-es_XX": 0.3, "main:en_xx-fr_XX": 0.8 }',
+                            default=None, type=str,
+                            )
+        parser.add_argument('--sampling-weights', help='a dictionary of how to sample data sets, \
+                            e.g. { "main:en_XX-es_XX": 0.2, "mined:en_XX-pt_XX": 0.5, \
+                                   "mono_dae:es_XX-es_XX": 0.3, "main:en_xx-fr_XX": 0.8 }',
+                            default=None,
+                            type=lambda uf: eval_str_dict(uf, type=str),
+                            )
+        parser.add_argument('--virtual-epoch-size', default=1000000, type=int,
+                            help='virtual epoch size to speed up data loading')
+        parser.add_argument('--virtual-data-size', default=None, type=int,
+                            help='virtual data size of the whole joint dataset to speed '
+                                 'up data loading and have specific dynamic sampling strategy interval')
+
+    @classmethod
+    def load_langs(cls, args, **kwargs):
+        """Determine the list of supported languages.
+
+        The list comes from ``args.lang_dict`` (one language per non-blank
+        line, file order kept), from ``args.langs`` (option order kept), or,
+        when neither is given, from the sorted set of languages appearing in
+        ``args.lang_pairs``.
+
+        Raises:
+            ValueError: if both ``--langs`` and ``--lang-dict`` are specified
+        """
+        if args.lang_dict and args.langs:
+            raise ValueError('--langs and --lang-dict can not both be specified')
+        if args.lang_dict:
+            with PathManager.open(args.lang_dict, "r", encoding="utf-8") as f:
+                langs = [lang.strip() for lang in f.readlines() if lang.strip()]
+                logger.info(f'loaded language list from {args.lang_dict} as they are ordered in file')
+            return langs
+        if args.langs:
+            langs = args.langs
+            logger.info(f'parsed the language list as they are ordered in the option: {langs}')
+            return langs
+        # Neither option carries a value. Treating "empty" like "missing" keeps
+        # `langs` from being left unbound for e.g. an empty --lang-dict.
+        logger.warning(
+            'External language dictionary is not provided; '
+            'use lang-pairs to infer the set of supported languages. '
+            'The language ordering is not stable which might cause '
+            'misalignment in pretraining and finetuning.')
+        # infer from lang_pairs as it is
+        langs = sorted({x for lang_pair in args.lang_pairs for x in lang_pair.split('-')})
+        logger.info(f'inferred language list: {langs}')
+        return langs
+
+    def has_sharded_data(self, split):
+        """Tell whether ``split`` is the training subset and sharded data was seen."""
+        if not self._has_sharded_data:
+            return self._has_sharded_data
+        train_split = getattr(self.args, "train_subset", None)
+        return split == train_split
+
+    def _shared_collater(self):
+        """Return True unless 'mono_dae' extra data is configured or
+        ``lang_tok_replacing_bos_eos`` is set."""
+        extra = self.args.extra_data
+        if extra and 'mono_dae' in extra:
+            return False
+        return not self.args.lang_tok_replacing_bos_eos
+
+    @classmethod
+    def prepare(cls, load_dictionary, args, **kargs):
+        """Normalise ``args`` and load one dictionary per language.
+
+        ``args`` is updated in place: the left-pad options are converted with
+        ``options.eval_bool``, ``shuffle_instance`` and ``langtoks`` receive
+        defaults, and a string ``lang_pairs`` is split on commas. The mode is
+        "training" when neither ``source_lang`` nor ``target_lang`` is set.
+
+        Args:
+            load_dictionary: callable that loads a dictionary from a file path
+            args: parsed command-line namespace
+            **kargs: forwarded to ``load_langs``
+
+        Returns:
+            a ``(sorted_langs, dicts, training)`` tuple, where ``dicts`` is an
+            OrderedDict from language to its dictionary
+
+        Raises:
+            ValueError: if ``--lang-pairs`` is missing or a language pair uses
+                a language that is not in the language list
+        """
+        args.left_pad_source = options.eval_bool(args.left_pad_source)
+        args.left_pad_target = options.eval_bool(args.left_pad_target)
+
+        if not hasattr(args, 'shuffle_instance'):
+            args.shuffle_instance = False
+        if args.langtoks is None:
+            args.langtoks = {}
+        if 'main' not in args.langtoks:
+            src_langtok_spec = args.encoder_langtok if args.encoder_langtok else None
+            tgt_langtok_spec = 'tgt' if args.decoder_langtok else None
+            args.langtoks['main'] = (src_langtok_spec, tgt_langtok_spec)
+
+        def check_langs(langs, pairs):
+            messages = []
+            for src, tgt in pairs:
+                if src not in langs or tgt not in langs:
+                    messages.append(f'language pair {src}-{tgt} contains languages '
+                                    'that are not in the language dictionary')
+            if len(messages) > 0:
+                raise ValueError(' '.join(messages) + f"; langs: {langs}")
+
+        if args.lang_pairs is None:
+            raise ValueError('--lang-pairs is required. List all the language pairs in the training objective.')
+        if isinstance(args.lang_pairs, str):
+            args.lang_pairs = args.lang_pairs.split(',')
+        # An explicit source or target language means a single direction is
+        # requested, which is treated as the non-training mode.
+        training = args.source_lang is None and args.target_lang is None
+        sorted_langs = cls.load_langs(args, **kargs)
+        check_langs(
+            sorted_langs,
+            ([p.split('-') for p in args.lang_pairs] if training
+             else [(args.source_lang, args.target_lang)])
+        )
+
+        # load dictionaries
+        if training:
+            extra_lang_pairs = (
+                list({p for v in args.extra_lang_pairs.values() for p in v.split(',')})
+                if args.extra_lang_pairs else []
+            )
+            langs_to_load_dicts = sorted({x for p in args.lang_pairs + extra_lang_pairs for x in p.split('-')})
+        else:
+            langs_to_load_dicts = sorted([args.source_lang, args.target_lang])
+
+        # The values below do not depend on the language being loaded, so they
+        # are computed once instead of once per language.
+        paths = utils.split_paths(args.data)
+        assert len(paths) > 0
+        supported_langtok_specs = args.langtoks_specs
+        # keep the langs consistent for all experiments with the same lang dict
+        # for finetuning regardless of whether lang_tok is required or not just add the tokens to the dicts
+        lang_toks_to_add = [
+            MultilingualDatasetManager.get_lang_tok(lang_to_add, args, spec)
+            for spec in supported_langtok_specs
+            for lang_to_add in sorted_langs
+        ]
+        needs_mask = args.lang_tok_style == 'mbart' or (args.extra_data and 'mono_dae' in args.extra_data)
+
+        dicts = OrderedDict()
+        for lang in langs_to_load_dicts:
+            dicts[lang] = load_dictionary(os.path.join(paths[0], 'dict.{}.txt'.format(lang)))
+            # Every dictionary must agree with the first one on the special symbols.
+            first_dict = dicts[langs_to_load_dicts[0]]
+            assert dicts[lang].pad() == first_dict.pad()
+            assert dicts[lang].eos() == first_dict.eos()
+            assert dicts[lang].unk() == first_dict.unk()
+
+            for lang_tok in lang_toks_to_add:
+                dicts[lang].add_symbol(lang_tok)
+            if needs_mask:
+                dicts[lang].add_symbol('<mask>')
+            logger.info('[{}] dictionary: {} types'.format(lang, len(dicts[lang])))
+        return sorted_langs, dicts, training
+
+    # Format template of a language token for each supported value of
+    # ``--lang-tok-style``; ``{}`` is filled with the language name.
+    TOKEN_STYLES = {
+        'mbart': '[{}]',
+        'multilingual': '__{}__'
+    }
+
+    @classmethod
+    def create_lang_dictionary(cls, langs):
+        """Build a Dictionary holding one symbol per entry of ``langs``, in order."""
+        # hack to remove symbols other than unk as they are not needed by lang
+        # dict: every special symbol is given the same '<unk>' string
+        placeholder = '<unk>'
+        specials = dict(pad=placeholder, eos=placeholder, unk=placeholder, bos=placeholder)
+        lang_dict = Dictionary(**specials)
+        for name in langs:
+            lang_dict.add_symbol(name)
+        return lang_dict
+
+    @classmethod
+    def get_lang_tok_style(cls, args):
+        """Fetch the token template registered for ``args.lang_tok_style``."""
+        style_name = args.lang_tok_style
+        return cls.TOKEN_STYLES[style_name]
+
+    @classmethod
+    def get_lang_tok(cls, lang, args, spec=''):
+        """Build the language token string for ``lang``.
+
+        Returns ``None`` when ``spec`` is ``None``. A ``spec`` ending in 'dae'
+        or 'mined' appends ``_dae`` / ``_mined`` to the language name before it
+        is formatted with the configured token style.
+        """
+        if spec is None:
+            return None
+        for suffix in ('dae', 'mined'):
+            if spec.endswith(suffix):
+                lang = f'{lang}_{suffix}'
+                break
+        template = cls.get_lang_tok_style(args)
+        return _lang_token(lang, template)
+
+    @classmethod
+    def get_langtok_index(cls, lang_tok, dic):
+        """Look up ``lang_tok`` in ``dic``.
+
+        Fails an assertion when the lookup yields ``dic.unk_index``.
+        """
+        position = dic.index(lang_tok)
+        assert position != dic.unk_index, \
+            'cannot find language token {} in the dictionary'.format(lang_tok)
+        return position
+
+    def get_encoder_langtok(self, src_lang, tgt_lang, spec=None):
+        """Resolve the index of the language token used on the encoder side.
+
+        A ``spec`` starting with 'src' selects the source language, any other
+        spec the target language. Returns ``None`` when ``spec`` or the
+        selected language is ``None``. The index is looked up in the source
+        language's dictionary, or the target's when ``src_lang`` is falsy.
+        """
+        if spec is None:
+            return None
+        use_source = bool(spec) and spec.startswith('src')
+        chosen_lang = src_lang if use_source else tgt_lang
+        if chosen_lang is None:
+            return None
+        langtok = self.get_lang_tok(chosen_lang, self.args, spec)
+        lookup_dict = self.dicts[src_lang or tgt_lang]
+        return self.get_langtok_index(langtok, lookup_dict)
+
+    def get_decoder_langtok(self, tgt_lang, spec=None):
+        """Resolve the index of the target-language token in the target
+        dictionary, or ``None`` when ``spec`` is ``None``."""
+        if spec is not None:
+            langtok = self.get_lang_tok(tgt_lang, self.args, spec)
+            return self.get_langtok_index(langtok, self.dicts[tgt_lang])
+        return None
+
+    @classmethod
+    def load_data(cls, path, vdict, impl):
+        """Load the indexed dataset at ``path`` via ``data_utils.load_indexed_dataset``."""
+        return data_utils.load_indexed_dataset(path, vdict, impl)
+
+    @classmethod
+    def split_exists(cls, split, src, tgt, lang, data_path, dataset_impl):
+        """Check for the dataset ``<split>.<src>-<tgt>.<lang>`` under ``data_path``."""
+        basename = '{}.{}-{}.{}'.format(split, src, tgt, lang)
+        candidate = os.path.join(data_path, basename)
+        return indexed_dataset.dataset_exists(candidate, impl=dataset_impl)
+
+    @classmethod
+    def mono_split_exists(cls, split, lang, data_path, dataset_impl):
+        """Check for the monolingual dataset ``<split>.<lang>`` under ``data_path``."""
+        basename = '{}.{}'.format(split, lang)
+        candidate = os.path.join(data_path, basename)
+        return indexed_dataset.dataset_exists(candidate, impl=dataset_impl)
+
+    @classmethod
+    def bitext_split_exists(cls, split, src, tgt, data_path, dataset_impl):
+        """Check that both sides of a bitext split exist under ``data_path``.
+
+        Each side may be stored under either the ``src-tgt`` or the ``tgt-src``
+        file naming.
+        """
+        def side_exists(lang):
+            # Try the src-tgt naming first, then the reversed one.
+            return (
+                cls.split_exists(split, src, tgt, lang=lang, data_path=data_path, dataset_impl=dataset_impl)
+                or cls.split_exists(split, tgt, src, lang=lang, data_path=data_path, dataset_impl=dataset_impl)
+            )
+
+        src_exists = side_exists(src)
+        tgt_exists = side_exists(tgt)
+        return src_exists and tgt_exists
+
+    @classmethod
+    def get_split_num_shards(cls, split, src, tgt, data_paths, dataset_impl):
+        """Count the entries of ``data_paths`` that hold the bitext split."""
+        shards = [
+            path for path in data_paths
+            if cls.bitext_split_exists(split, src, tgt, path, dataset_impl)
+        ]
+        return len(shards)
+
+    @classmethod
+    def get_mono_split_num_shards(cls, split, lang, data_paths, dataset_impl):
+        """Count the entries of ``data_paths`` that hold the monolingual split."""
+        shards = [
+            path for path in data_paths
+            if cls.mono_split_exists(split, lang, path, dataset_impl)
+        ]
+        return len(shards)
+
+    def load_lang_dataset(
+            self,
+            data_path, split,
+            src, src_dict,
+            tgt, tgt_dict,
+            combine, dataset_impl, upsample_primary,
+            max_source_positions,
+            prepend_bos=False, load_alignments=False,
+            truncate_source=False,
+    ):
+        """Load the source/target (and optional alignment) datasets of a split.
+
+        Looks under ``data_path`` for ``<split>.<src>-<tgt>.*`` or, failing
+        that, ``<split>.<tgt>-<src>.*``. With ``combine`` set, the numbered
+        shards ``<split>1``, ``<split>2``, ... are loaded as well until one is
+        missing, and all shards are concatenated with the first one weighted
+        by ``upsample_primary``.
+
+        Returns:
+            a ``(src_dataset, tgt_dataset, align_dataset)`` tuple;
+            ``align_dataset`` is ``None`` unless ``load_alignments`` is set and
+            the alignment file exists
+
+        Raises:
+            FileNotFoundError: if not even the first shard can be found
+        """
+
+        src_datasets = []
+        tgt_datasets = []
+
+        for k in itertools.count():
+            # shard 0 carries no numeric suffix: train, train1, train2, ...
+            split_k = split + (str(k) if k > 0 else '')
+
+            # infer langcode
+            if self.split_exists(split_k, src, tgt, src, data_path, dataset_impl):
+                prefix = os.path.join(data_path, '{}.{}-{}.'.format(split_k, src, tgt))
+            elif self.split_exists(split_k, tgt, src, src, data_path, dataset_impl):
+                prefix = os.path.join(data_path, '{}.{}-{}.'.format(split_k, tgt, src))
+            else:
+                if k > 0:
+                    # ran out of numbered shards: stop looking
+                    break
+                else:
+                    logger.error(f"Dataset not found: {data_path}, {split_k}, {src}, {tgt}")
+                    raise FileNotFoundError('Dataset not found: {} ({})'.format(split, data_path))
+
+            src_dataset = self.load_data(prefix + src, src_dict, dataset_impl)
+            if truncate_source:
+                # presumably: strip EOS, cut to max_source_positions - 1 tokens,
+                # then re-append EOS -- confirm against the wrapper classes
+                src_dataset = AppendTokenDataset(
+                    TruncateDataset(
+                        StripTokenDataset(src_dataset, src_dict.eos()),
+                        max_source_positions - 1,
+                    ),
+                    src_dict.eos(),
+                )
+            src_datasets.append(src_dataset)
+            tgt_datasets.append(
+                self.load_data(prefix + tgt, tgt_dict, dataset_impl)
+            )
+
+            logger.info('{} {} {}-{} {} examples'.format(
+                data_path, split_k, src, tgt, len(src_datasets[-1])
+            ))
+
+            if not combine:
+                # only the first shard is wanted
+                break
+
+        assert len(src_datasets) == len(tgt_datasets)
+
+        if len(src_datasets) == 1:
+            src_dataset, tgt_dataset = src_datasets[0], tgt_datasets[0]
+        else:
+            # every shard has ratio 1 except the first (primary) one
+            sample_ratios = [1] * len(src_datasets)
+            sample_ratios[0] = upsample_primary
+            src_dataset = ConcatDataset(src_datasets, sample_ratios)
+            tgt_dataset = ConcatDataset(tgt_datasets, sample_ratios)
+
+        if prepend_bos:
+            assert hasattr(src_dict, "bos_index") and hasattr(tgt_dict, "bos_index")
+            src_dataset = PrependTokenDataset(src_dataset, src_dict.bos())
+            tgt_dataset = PrependTokenDataset(tgt_dataset, tgt_dict.bos())
+
+        align_dataset = None
+        if load_alignments:
+            # the alignment file is looked up under the unnumbered split name
+            align_path = os.path.join(data_path, '{}.align.{}-{}'.format(split, src, tgt))
+            if indexed_dataset.dataset_exists(align_path, impl=dataset_impl):
+                align_dataset = data_utils.load_indexed_dataset(align_path, None, dataset_impl)
+
+        return src_dataset, tgt_dataset, align_dataset
+
+    def load_langpair_dataset(
+        self,
+        data_path, split,
+        src, src_dict,
+        tgt, tgt_dict,
+        combine, dataset_impl, upsample_primary,
+        left_pad_source, left_pad_target, max_source_positions,
+        max_target_positions, prepend_bos=False, load_alignments=False,
+        truncate_source=False,
+        src_dataset_transform_func=lambda dataset: dataset,
+        tgt_dataset_transform_func=lambda dataset: dataset,
+        src_lang_id=None,
+        tgt_lang_id=None,
+        langpairs_sharing_datasets=None,
+    ):
+        """Build the ``LanguagePairDataset`` of one ``src``-``tgt`` direction.
+
+        The source, target and alignment datasets are loaded with
+        ``self.load_lang_dataset`` and the two transform functions are applied
+        to the source and target side respectively.
+
+        Args:
+            langpairs_sharing_datasets: optional dict used as a cache. Source
+                and target datasets are stored under
+                ``(data_path, split, norm_direction, lang)`` and the alignment
+                dataset under ``(data_path, split, norm_direction, src, tgt)``,
+                where ``norm_direction`` joins the sorted language pair, so
+                both directions of a pair map to the same entries. The cache is
+                only read when ``split`` equals ``self.args.train_subset``; it
+                is written whenever the datasets are (re)loaded.
+
+        Returns:
+            A ``LanguagePairDataset`` over the (possibly cached) datasets.
+        """
+        norm_direction = "-".join(sorted([src, tgt]))
+        if langpairs_sharing_datasets is not None:
+            # 'NotInCache' is the miss sentinel: None is a valid cached alignment value
+            src_dataset = langpairs_sharing_datasets.get((data_path, split, norm_direction, src), 'NotInCache')
+            tgt_dataset = langpairs_sharing_datasets.get((data_path, split, norm_direction, tgt), 'NotInCache')
+            align_dataset = langpairs_sharing_datasets.get((data_path, split, norm_direction, src, tgt), 'NotInCache')
+
+        # a hack: if any one of them is not in the cache, all of them are reloaded
+        if (
+            langpairs_sharing_datasets is None
+            or src_dataset == 'NotInCache'
+            or tgt_dataset == 'NotInCache'
+            or align_dataset == 'NotInCache'
+            or split != getattr(self.args, "train_subset", None)
+        ):
+            # source and target datasets can be reused in reversed directions to save memory
+            # reversed directions of valid and test data will not share source and target datasets
+            src_dataset, tgt_dataset, align_dataset = self.load_lang_dataset(
+                data_path, split,
+                src, src_dict,
+                tgt, tgt_dict,
+                combine, dataset_impl, upsample_primary,
+                max_source_positions=max_source_positions,
+                prepend_bos=prepend_bos, load_alignments=load_alignments,
+                truncate_source=truncate_source,
+            )
+            src_dataset = src_dataset_transform_func(src_dataset)
+            tgt_dataset = tgt_dataset_transform_func(tgt_dataset)
+            if langpairs_sharing_datasets is not None:
+                # the transformed datasets are what gets cached
+                langpairs_sharing_datasets[(data_path, split, norm_direction, src)] = src_dataset
+                langpairs_sharing_datasets[(data_path, split, norm_direction, tgt)] = tgt_dataset
+                langpairs_sharing_datasets[(data_path, split, norm_direction, src, tgt)] = align_dataset
+                if align_dataset is None:
+                    # no align data so flag the reverse direction as well in sharing
+                    langpairs_sharing_datasets[(data_path, split, norm_direction, tgt, src)] = align_dataset
+        else:
+            logger.info(f"Reusing source and target datasets of [{split}] {tgt}-{src} for reversed direction: "
+                        f"[{split}] {src}-{tgt}: src length={len(src_dataset)}; tgt length={len(tgt_dataset)}")
+
+        return LanguagePairDataset(
+            src_dataset, src_dataset.sizes, src_dict,
+            tgt_dataset, tgt_dataset.sizes, tgt_dict,
+            left_pad_source=left_pad_source,
+            left_pad_target=left_pad_target,
+            align_dataset=align_dataset,
+            src_lang_id=src_lang_id,
+            tgt_lang_id=tgt_lang_id,
+        )
+
+    def src_dataset_tranform_func(self, src_lang, tgt_lang, dataset, spec=None):
+        if self.args.lang_tok_replacing_bos_eos:
+            # it is handled by self.alter_dataset_langtok
+            # TODO: Unifiy with alter_dataset_langtok
+            return dataset
+        if spec is None:
+            return dataset
+        tok = self.get_encoder_langtok(src_lang, tgt_lang, spec)
+        if tok:
+            return PrependTokenDataset(dataset, tok)
+        return dataset
+
+    def tgt_dataset_tranform_func(self, source_lang, target_lang, dataset, spec=None):
+        if self.args.lang_tok_replacing_bos_eos:
+            # TODO: Unifiy with alter_dataset_langtok
+            # It is handled by self.alter_dataset_langtok.
+            # The complication in self.alter_dataset_langtok
+            # makes a unified framework difficult.
+            return dataset
+        # if not self.args.decoder_langtok:
+        if not spec:
+            return dataset
+        tok = self.get_decoder_langtok(target_lang, spec)
+        if tok:
+            return PrependTokenDataset(dataset, tok)
+        return dataset
+
+    def alter_dataset_langtok(self, lang_pair_dataset,
+                              src_eos=None, src_lang=None,
+                              tgt_eos=None, tgt_lang=None,
+                              src_langtok_spec=None, tgt_langtok_spec=None,
+                              ):
+        if src_langtok_spec is None and tgt_langtok_spec is None:
+            return lang_pair_dataset
+
+        new_src_eos = None
+        if src_langtok_spec is not None and src_eos is not None \
+           and (src_lang is not None or tgt_lang is not None):
+            new_src_eos = self.get_encoder_langtok(src_lang, tgt_lang, src_langtok_spec)
+        else:
+            src_eos = None
+
+        new_tgt_bos = None
+        if tgt_langtok_spec and tgt_eos is not None and tgt_lang is not None:
+            new_tgt_bos = self.get_decoder_langtok(tgt_lang, tgt_langtok_spec)
+        else:
+            tgt_eos = None
+
+        return TransformEosLangPairDataset(
+            lang_pair_dataset,
+            src_eos=src_eos,
+            new_src_eos=new_src_eos,
+            tgt_bos=tgt_eos,
+            new_tgt_bos=new_tgt_bos,
+        )
+
+    def load_a_dataset(
+            self,
+            split,
+            data_path,
+            src, src_dict,
+            tgt, tgt_dict,
+            combine,
+            prepend_bos=False,
+            langpairs_sharing_datasets=None,
+            data_category=None,
+            **extra_kwargs,
+    ):
+        """Load the dataset of one ``src``-``tgt`` direction of a data category.
+
+        Loading options (dataset implementation, padding sides, maximum
+        positions, alignment loading, source truncation, language ids) are read
+        from ``self.args``.
+
+        Args:
+            langpairs_sharing_datasets: optional cache dict forwarded to
+                ``self.load_langpair_dataset``.
+            data_category: only used in the log message here.
+            **extra_kwargs: must contain ``'langtok_spec'``, a
+                ``(src_langtok_spec, tgt_langtok_spec)`` pair; other entries
+                are ignored by this method.
+
+        Returns:
+            The language pair dataset, wrapped by ``self.alter_dataset_langtok``
+            when ``self.args.lang_tok_replacing_bos_eos`` is set.
+        """
+        dataset_impl = self.args.dataset_impl
+        upsample_primary = self.args.upsample_primary
+        left_pad_source = self.args.left_pad_source
+        left_pad_target = self.args.left_pad_target
+        max_source_positions = self.args.max_source_positions
+        max_target_positions = self.args.max_target_positions
+        load_alignments = self.args.load_alignments
+        truncate_source = self.args.truncate_source
+        src_dataset_transform_func = self.src_dataset_tranform_func
+        tgt_dataset_transform_func = self.tgt_dataset_tranform_func
+        enable_lang_ids = self.args.enable_lang_ids
+        lang_dictionary = self.lang_dict
+        src_langtok_spec, tgt_langtok_spec = extra_kwargs['langtok_spec']
+
+        # the language tokens are resolved here for logging only
+        src_langtok = self.get_encoder_langtok(src, tgt, src_langtok_spec)
+        tgt_langtok = self.get_decoder_langtok(tgt, tgt_langtok_spec)
+        logger.info(f'{data_category}:{src}-{tgt} src_langtok: {src_langtok}; tgt_langtok: {tgt_langtok}')
+
+        langpair_ds = self.load_langpair_dataset(
+            data_path, split,
+            src, src_dict,
+            tgt, tgt_dict,
+            combine, dataset_impl, upsample_primary,
+            left_pad_source, left_pad_target, max_source_positions,
+            max_target_positions, prepend_bos, load_alignments,
+            truncate_source,
+            src_dataset_transform_func=lambda dataset: src_dataset_transform_func(src, tgt, dataset, src_langtok_spec),
+            tgt_dataset_transform_func=lambda dataset: tgt_dataset_transform_func(src, tgt, dataset, tgt_langtok_spec),
+            src_lang_id=_lang_id(lang_dictionary, src) if enable_lang_ids and lang_dictionary is not None else None,
+            tgt_lang_id=_lang_id(lang_dictionary, tgt) if enable_lang_ids and lang_dictionary is not None else None,
+            langpairs_sharing_datasets=langpairs_sharing_datasets,
+        )
+        if langpair_ds.tgt_sizes is None:
+            # hack to use src_sizes as the sizes for the whole pair dataset for ConcatDataset
+            langpair_ds.sizes = langpair_ds.src_sizes
+        else:
+            # use the max of two sides to define the size to help max positions filtering
+            langpair_ds.sizes = np.vstack([langpair_ds.src_sizes, langpair_ds.tgt_sizes]).max(axis=0)
+        assert langpair_ds.sizes.shape == langpair_ds.src_sizes.shape
+        # TODO: handle modified lang toks for mined data and dae data
+        if self.args.lang_tok_replacing_bos_eos:
+            ds = self.alter_dataset_langtok(
+                langpair_ds,
+                # fall back to the target dictionary when src is empty
+                src_eos=self.dicts[src if src else tgt].eos(),
+                src_lang=src,
+                tgt_eos=self.dicts[tgt].eos(),
+                tgt_lang=tgt,
+                src_langtok_spec=src_langtok_spec,
+                tgt_langtok_spec=tgt_langtok_spec,
+            )
+        else:
+            ds = langpair_ds
+        return ds
+
+    def load_split_langpair_datasets(
+            self,
+            split,
+            data_param_list,
+    ):
+        datasets = []
+        langpairs_sharing_datasets = {} if self.args.enable_reservsed_directions_shared_datasets else None
+        for param in data_param_list:
+            ds = self.load_a_dataset(split=split, langpairs_sharing_datasets=langpairs_sharing_datasets, **param)
+            datasets.append(ds)
+        return datasets
+
+    def get_data_paths_and_lang_pairs(self, split):
+        datapaths = {
+            'main': self.args.data,
+        }
+        lang_pairs = {
+            'main': self.lang_pairs
+        }
+        if split == getattr(self.args, "train_subset", None):
+            # only training data can have extra data and extra language pairs
+            if self.args.extra_data:
+                extra_datapaths = self.args.extra_data
+                datapaths.update(extra_datapaths)
+            if self.args.extra_lang_pairs:
+                extra_lang_pairs = {k: v.split(',') for k, v in self.args.extra_lang_pairs.items()}
+                lang_pairs.update(extra_lang_pairs)
+        return datapaths, lang_pairs
+
+    @classmethod
+    def get_dataset_key(cls, data_category, src, tgt):
+        return f'{data_category}:{src}-{tgt}'
+
+    def get_split_num_data_shards(self, split):
+        if split in self._num_shards_dict:
+            return self._num_shards_dict[split]
+        num_shards_dict = {}
+        data_paths, lang_pairs = self.get_data_paths_and_lang_pairs(split)
+
+        for data_category, paths in data_paths.items():
+            if data_category not in lang_pairs:
+                continue
+            paths = utils.split_paths(paths)
+            lang_dirs = [lang_pair.split('-') for lang_pair in lang_pairs[data_category]]
+            lang_dirs = [x if len(x) > 1 else (x[0], x[0]) for x in lang_dirs]
+            for src, tgt in lang_dirs:
+                # monolingual data ruqires tgt only
+                assert src is not None or 'mono_' in data_category, (f'error: src={src}, '
+                                                                     'tgt={tgt} for data_category={data_category}')
+                key = self.get_dataset_key(data_category, src, tgt)
+                if 'mono_' in data_category:
+                    num_shards_dict[key] = self.get_mono_split_num_shards(
+                        split, tgt, paths, self.args.dataset_impl)
+                else:
+                    num_shards_dict[key] = self.get_split_num_shards(
+                        split, src, tgt, paths, self.args.dataset_impl)
+        self._num_shards_dict[split] = num_shards_dict
+        logger.info(f"[{split}] num of shards: {num_shards_dict}")
+        return num_shards_dict
+
+    def get_split_data_path(self, paths, epoch, shard_epoch, num_shards):
+        shard = epoch if shard_epoch is None else shard_epoch
+        shard = (shard - 1) % num_shards
+        path = paths[shard]
+        return path
+
+    def get_split_data_param_list(self, split, epoch, shard_epoch=None):
+        # TODO: to extend with extra datasets and keys and loop over different shard data paths
+        param_list = []
+        data_paths, lang_pairs = self.get_data_paths_and_lang_pairs(split)
+        logger.info(f'langtoks settings: {self.args.langtoks}')
+        split_num_shards_dict = self.get_split_num_data_shards(split)
+        for data_category, paths in data_paths.items():
+            if data_category not in lang_pairs:
+                continue
+            paths = utils.split_paths(paths)
+            assert len(paths) > 0
+            if len(paths) > 1:
+                self._has_sharded_data = True
+
+            if data_category in self.args.langtoks:
+                lang_tok_spec = self.args.langtoks[data_category]
+            else:
+                # default to None
+                lang_tok_spec = (None, None)
+
+            # infer langcode
+            lang_dirs = [lang_pair.split('-') for lang_pair in lang_pairs[data_category]]
+            lang_dirs = [x if len(x) > 1 else (x[0], x[0]) for x in lang_dirs]
+            for src, tgt in lang_dirs:
+                assert src is not None or data_category == 'mono_dae', (f'error: src={src}, '
+                                                                        'tgt={tgt} for data_category={data_category}')
+                # logger.info(f"preparing param for {data_category}: {src} - {tgt}")
+                key = self.get_dataset_key(data_category, src, tgt)
+                data_path = self.get_split_data_path(
+                    paths, epoch, shard_epoch, split_num_shards_dict[key])
+                param_list.append(
+                        {
+                            'key': key,
+                            'data_path': data_path,
+                            'split': split,
+                            'src': src,
+                            'src_dict': self.dicts[src] if src and data_category != 'mono_dae' else None,
+                            'tgt': tgt,
+                            'tgt_dict': self.dicts[tgt],
+                            'data_category': data_category,
+                            'langtok_spec': lang_tok_spec,
+                        }
+                )
+        return param_list
+
+    def get_train_dataset_sizes(self, data_param_list, datasets):
+        num_shards = [
+            self.get_split_num_data_shards(param['split'])[param['key']] for param in data_param_list]
+        data_sizes = [(key, len(d) * num_shard) for (key, d), num_shard in zip(datasets, num_shards)]
+        logger.info(f'data sizes multiplied by num_shards used in sampling ratios: {data_sizes}')
+        return [s for _, s in data_sizes]
+
+    def get_train_sampling_ratios(self, data_param_list, datasets, epoch=1):
+        data_sizes = self.get_train_dataset_sizes(data_param_list, datasets)
+        sampling_func = self.sampling_method.sampling_method_selector()
+        sample_ratios = sampling_func(data_sizes) if sampling_func is not None else None
+        return sample_ratios
+
+    def get_sampling_ratios(self, data_param_list, datasets, epoch):
+        if self.args.sampling_weights_from_file:
+            weights = load_sampling_weights(self.args.sampling_weights_from_file)
+            sample_ratios = [weights[k] for k, _ in datasets]
+            logger.info('| ignoring --sampling-weights when loadding sampling weights '
+                        f'from file {self.args.sampling_weights_from_file}')
+        elif self.args.sampling_weights:
+            sample_ratios = [self.args.sampling_weights[k] for k, _ in datasets]
+        else:
+            sample_ratios = self.get_train_sampling_ratios(data_param_list, datasets, epoch)
+
+        if sample_ratios is not None:
+            logger.info('| Upsample ratios: {}'.format(
+                list(zip(map(lambda x: x['key'], data_param_list), sample_ratios))
+            ))
+            assert len(sample_ratios) == len(datasets)
+        return sample_ratios
+
+    def load_split_datasets(
+            self,
+            split,
+            training,
+            epoch=1, combine=False, shard_epoch=None, **kwargs,
+    ):
+        data_param_list = self.get_split_data_param_list(
+            split, epoch, shard_epoch=shard_epoch,
+        )
+        langpairs_sharing_datasets = {} if self.args.enable_reservsed_directions_shared_datasets else None
+        datasets = [
+            (
+                param['key'],
+                self.load_a_dataset(
+                    combine=combine,
+                    langpairs_sharing_datasets=langpairs_sharing_datasets,
+                    **param
+                ),
+            )
+            for param in data_param_list
+        ]
+        return datasets, data_param_list
+
+    def load_into_sampled_multi_epoch_dataset(
+            self, split, datasets, data_param_list,
+            epoch, shard_epoch=None
+    ):
+        sample_ratios = self.get_sampling_ratios(data_param_list, datasets, epoch)
+        return SampledMultiEpochDataset(
+                OrderedDict(datasets),
+                epoch=epoch,
+                shard_epoch=shard_epoch,
+                # valid and test datasets will be degerate to concating datasets:
+                sampling_ratios=sample_ratios,
+                eval_key=None,
+                batch_by_size=True,
+                collate_format=CollateFormat.single,
+                virtual_size=self.args.virtual_data_size,
+                split=split,
+                virtual_epoch_size=self.args.virtual_epoch_size,
+                # if not using lang_tok altering, simplified to use the same collater
+                shared_collater=self._shared_collater(),
+        )
+
+    def load_into_concat_dataset(self, split, datasets, data_param_list):
+        if self.args.lang_tok_replacing_bos_eos:
+            # TODO: to investigate why TransformEosLangPairDataset doesn't work with ConcatDataset
+            return SampledMultiDataset(
+                        OrderedDict(datasets),
+                        sampling_ratios=None,
+                        eval_key=None,
+                        batch_by_size=True,
+                        collate_format=CollateFormat.single,
+                        virtual_size=None,
+                        split=split,
+                    )
+        return ConcatDataset([d for _, d in datasets])
+
+    def load_sampled_multi_epoch_dataset(
+            self,
+            split,
+            training,
+            epoch=0, combine=False, shard_epoch=None, **kwargs
+    ):
+        datasets, data_param_list = self.load_split_datasets(
+                split, training,
+                epoch, combine, shard_epoch=shard_epoch, **kwargs
+         )
+        if training and split == getattr(self.args, "train_subset", None):
+            return self.load_into_sampled_multi_epoch_dataset(
+                split, datasets, data_param_list, epoch, shard_epoch=shard_epoch)
+        else:
+            return self.load_into_concat_dataset(split, datasets, data_param_list)
diff --git a/fairseq/data/multilingual/sampled_multi_dataset.py b/fairseq/data/multilingual/sampled_multi_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..95eab280f030bfbd8ced892f4e0c59c4ffed8cfe
--- /dev/null
+++ b/fairseq/data/multilingual/sampled_multi_dataset.py
@@ -0,0 +1,396 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import List
+from enum import Enum
+from collections import OrderedDict
+from collections import defaultdict
+from bisect import bisect_right
+import hashlib
+import logging
+import datetime
+import time
+
+import numpy as np
+import torch
+
+from fairseq import distributed_utils
+from fairseq.data import plasma_utils, FairseqDataset
+
+
+def get_time_gap(s, e):
+    return (datetime.datetime.fromtimestamp(e) - datetime.datetime.fromtimestamp(s)).__str__()
+
+
+# Module-level logger named after this module.
+logger = logging.getLogger(__name__)
+
+
+def default_virtual_size_func(datasets, ratios, max_scale_up=1.5):
+    sizes = [len(d) for d in datasets]
+    if ratios is None:
+        return sum(sizes)
+    largest_idx = np.argmax(sizes)
+    largest_r = ratios[largest_idx]
+    largest_s = sizes[largest_idx]
+    # set virtual sizes relative to the largest dataset
+    virtual_sizes = [(r / largest_r) * largest_s for r in ratios]
+    vsize = sum(virtual_sizes)
+    max_size = sum(sizes) * max_scale_up
+    return int(vsize if vsize < max_size else max_size)
+
+
+class CollateFormat(Enum):
+    """Output formats of the ``SampledMultiDataset`` collater."""
+    # one batch mixing the samples of all sub-datasets
+    single = 1
+    # a dictionary of batches indexed by the keys of the sub-datasets
+    ordered_dict = 2
+
+
+class SampledMultiDataset(FairseqDataset):
+    """Samples from multiple sub-datasets according to given sampling ratios.
+    Args:
+        datasets (
+            List[~torch.utils.data.Dataset]
+            or OrderedDict[str, ~torch.utils.data.Dataset]
+        ): datasets
+        sampling_ratios (List[float]): list of the probabilities of each dataset being sampled
+            (default: None, which corresponds to concatenating all datasets together).
+        batch_by_size (bool): whether or not to batch by sequence length
+            (default: False).
+        seed (int): RNG seed to use (default: 2).
+        epoch (int): starting epoch number (default: 1).
+        eval_key (str, optional): a key used at evaluation time that causes
+            this instance to pass-through batches from *datasets[eval_key]*.
+        collate_format (CollateFormat):  collater output format, either CollateFormat.ordered_dict or
+            CollateFormat.single (default: CollateFormat.single) where CollateFormat.single configures
+            the collater to output batches of data mixed from all sub-datasets,
+            and CollateFormat.ordered_dict configures the collater to output a dictionary of batches indexed by keys
+            of sub-datasets.
+            Note that, in both formats, not all sub-datasets will be present in a single batch.
+        virtual_size (int, or callable): the expected virtual size of the dataset (default: default_virtual_size_func).
+        split (str): the split of the data, e.g. 'train', 'valid' or 'test'.
+        shared_collater (bool): whether or not all sub-datasets share the same collater.
+    """
+
+    def __init__(
+            self,
+            datasets,
+            sampling_ratios=None,
+            batch_by_size=False,
+            seed=2,
+            epoch=1,
+            eval_key=None,
+            collate_format=CollateFormat.single,
+            virtual_size=default_virtual_size_func,
+            split='',
+            shared_collater=False,
+    ):
+        super().__init__()
+        self.batch_by_size = batch_by_size
+        self.shared_collater = shared_collater
+
+        if isinstance(datasets, OrderedDict):
+            self.keys = list(datasets.keys())
+            datasets = list(datasets.values())
+        elif isinstance(datasets, List):
+            self.keys = list(range(len(datasets)))
+        else:
+            raise AssertionError()
+        self.datasets = datasets
+        self.split = split
+
+        self.eval_key = eval_key
+        if self.eval_key is not None:
+            self.collate_format = CollateFormat.single
+        else:
+            self.collate_format = collate_format
+
+        self.seed = seed
+        self._cur_epoch = None
+        self._cur_indices = None
+        self._sizes = None
+        self._ordered_indices = None
+        self.virtual_size_per_dataset = None
+        # caching properties
+        self._reset_cached_properties()
+        self.setup_sampling(sampling_ratios, virtual_size)
+        self.cumulated_sizes = None
+        self.virtual_size_per_dataset = None
+        self._size_cache = {}
+        self.set_epoch(epoch)
+
+    def _clean_if_not_none(self, var_list):
+        """Unbind the loop variable for every non-None entry of ``var_list``.
+
+        NOTE(review): ``del v`` only removes the local name ``v``; ``var_list``
+        and the caller keep their own references, so this does not release the
+        objects by itself.
+        """
+        for v in var_list:
+            if v is not None:
+                del v
+
+    def _reset_cached_properties(self):
+        self._clean_if_not_none([
+            self._sizes, self._ordered_indices, self._cur_indices
+        ])
+        self._sizes = None
+        self._ordered_indices = None
+        self._cur_indices = None
+
+    def setup_sampling(self, sample_ratios, virtual_size):
+        sizes = [len(d) for d in self.datasets]
+        if sample_ratios is None:
+            # default back to concating datasets
+            self.sample_ratios = None
+            self.virtual_size = sum(sizes)
+        else:
+            if not isinstance(sample_ratios, np.ndarray):
+                sample_ratios = np.array(sample_ratios)
+            self.sample_ratios = plasma_utils.PlasmaArray(sample_ratios)
+            virtual_size = default_virtual_size_func if virtual_size is None else virtual_size
+            self.virtual_size = (
+                virtual_size(self.datasets, self.sample_ratios.array) if callable(virtual_size)
+                else virtual_size)
+
+    def adjust_sampling(self, epoch, sampling_ratios, virtual_size):
+        if sampling_ratios is not None:
+            sampling_ratios = self._sync_sample_ratios(sampling_ratios)
+            self.setup_sampling(sampling_ratios, virtual_size)
+
+    def _sync_sample_ratios(self, ratios):
+        # in case the ratios are not precisely the same across processes
+        # also to ensure every procresses update the ratios in the same pace
+        ratios = torch.DoubleTensor(ratios)
+        if torch.distributed.is_initialized():
+            if torch.cuda.is_available():
+                distributed_utils.all_reduce(ratios.cuda())
+            else:
+                distributed_utils.all_reduce(ratios)
+            ret = ratios.cpu()
+            ret = ret.numpy()
+        return ret
+
+    def random_choice_in_dataset(self, rng, dataset, choice_size):
+        if hasattr(dataset, 'random_choice_in_dataset'):
+            return dataset.random_choice_in_dataset(rng, choice_size)
+        dataset_size = len(dataset)
+        return rng.choice(dataset_size, choice_size, replace=(choice_size > dataset_size))
+
+    def get_virtual_indices(self, rng, datasets, sample_ratios, virtual_size):
+        """Choose which example of which sub-dataset fills every virtual slot.
+
+        Args:
+            rng: random generator providing ``choice``.
+            datasets: the sub-datasets.
+            sample_ratios: None to simply enumerate every example of every
+                dataset, otherwise an object whose ``.array`` holds the
+                (unnormalized) sampling ratios.
+            virtual_size: total number of virtual examples; must equal the
+                number of indices produced.
+
+        Returns:
+            A tuple ``(in_dataset_indices, cumulative_sizes,
+            virtual_sizes_per_dataset)``: the per-dataset example indices
+            stacked into one array, the cumulative number of virtual examples
+            up to and including each dataset, and the number of virtual
+            examples per dataset.
+        """
+        def get_counts(sample_ratios):
+            # truncating to int64 can only lose examples, hence diff >= 0
+            counts = np.array([virtual_size * r for r in sample_ratios], dtype=np.int64)
+            diff = virtual_size - counts.sum()
+            assert diff >= 0
+            # due to round-offs, the size might not match the desired sizes
+            if diff > 0:
+                # hand out the missing examples to datasets drawn according to the ratios
+                dataset_indices = rng.choice(len(sample_ratios), size=diff, p=sample_ratios)
+                for i in dataset_indices:
+                    counts[i] += 1
+            return counts
+
+        def get_in_dataset_indices(datasets, sizes, sample_ratios):
+            counts = get_counts(sample_ratios)
+            # uniformly sample the desired counts for each dataset;
+            # if the desired counts are large, sample with replacement:
+            indices = [
+                self.random_choice_in_dataset(rng, d, c)
+                for c, d in zip(counts, datasets)]
+            return indices
+
+        sizes = [len(d) for d in datasets]
+        if sample_ratios is None:
+            # default back to concatenating the datasets
+            in_dataset_indices = [list(range(s)) for s in sizes]
+            virtual_sizes_per_dataset = sizes
+        else:
+            sample_ratios = sample_ratios.array
+            # normalize so the ratios sum to one
+            ratios = sample_ratios / sample_ratios.sum()
+            in_dataset_indices = get_in_dataset_indices(datasets, sizes, ratios)
+            virtual_sizes_per_dataset = [len(d) for d in in_dataset_indices]
+        virtual_sizes_per_dataset = np.array(virtual_sizes_per_dataset, np.int64)
+        cumulative_sizes = np.cumsum(virtual_sizes_per_dataset)
+        assert sum(virtual_sizes_per_dataset) == virtual_size
+        assert cumulative_sizes[-1] == virtual_size
+        if virtual_size < sum(sizes):
+            logger.warning(
+                f'virtual data size ({virtual_size}) is less than real data size ({sum(sizes)}).'
+                ' If virtual size << real data size, there could be data coverage issue.'
+            )
+        in_dataset_indices = np.hstack(in_dataset_indices)
+        return in_dataset_indices, cumulative_sizes, virtual_sizes_per_dataset
+
+    def _get_dataset_and_index(self, index):
+        i = bisect_right(self.cumulated_sizes.array, index)
+        return i, self._cur_indices.array[index]
+
+    def __getitem__(self, index):
+        ds_idx, ds_sample_idx = self._get_dataset_and_index(index)
+        ret = (ds_idx, self.datasets[ds_idx][ds_sample_idx])
+        return ret
+
+    def num_tokens(self, index):
+        ds_idx, ds_sample_idx = self._get_dataset_and_index(index)
+        return self.datasets[ds_idx].num_tokens(ds_sample_idx)
+
+    def size(self, index):
+        if self._sizes is not None:
+            return self._sizes[index]
+        ds_idx, ds_sample_idx = self._get_dataset_and_index(index)
+        return self.datasets[ds_idx].size(ds_sample_idx)
+
+    def __len__(self):
+        """Return the virtual size of the dataset."""
+        return self.virtual_size
+
+    def collater(self, samples, **extra_args):
+        """Merge a list of samples to form a mini-batch.
+
+        Args:
+            samples: list of ``(dataset_index, sample)`` pairs, i.e. the
+                values produced by ``__getitem__``.
+            **extra_args: only ``pad_to_length`` is read here.
+
+        Returns:
+            ``None`` for an empty list; an ``OrderedDict`` of per-dataset
+            batches keyed by ``self.keys`` in the ``'ordered_dict'`` format;
+            the result of ``self.datasets[0].collater`` when
+            ``self.shared_collater`` is set; otherwise a single dict that
+            concatenates the per-dataset batches, sorted by descending
+            source length.
+        """
+        if len(samples) == 0:
+            return None
+        # NOTE(review): this compares against the plain string 'ordered_dict';
+        # confirm that the configured collate_format value compares equal to it.
+        if self.collate_format == 'ordered_dict':
+            # bucket the samples by the sub-dataset they came from
+            collect_samples = [[] for _ in range(len(self.datasets))]
+            for (i, sample) in samples:
+                collect_samples[i].append(sample)
+            # sub-datasets that contributed no sample are left out of the result
+            return OrderedDict([
+                (self.keys[i], dataset.collater(collect_samples[i]))
+                for i, (key, dataset) in enumerate(zip(self.keys, self.datasets))
+                if len(collect_samples[i]) > 0
+            ])
+        elif self.shared_collater:
+            # every sample goes through the first sub-dataset's collater
+            return self.datasets[0].collater(
+                [s for _, s in samples]
+            )
+        else:
+            samples_dict = defaultdict(list)
+            # NOTE(review): a caller-supplied pad_to_length is updated in place
+            # below and must already answer ['source'] / ['target'] lookups.
+            pad_to_length = defaultdict(int) if 'pad_to_length' not in extra_args else extra_args['pad_to_length']
+            for ds_idx, s in samples:
+                # track the longest source/target so all sub-batches pad alike
+                pad_to_length['source'] = max(pad_to_length['source'], s['source'].size(0))
+                if s['target'] is not None:
+                    pad_to_length['target'] = max(pad_to_length['target'], s['target'].size(0))
+                samples_dict[ds_idx].append(s)
+            # one batch per sub-dataset that has at least one sample
+            batches = [
+                self.datasets[i].collater(samples_dict[i], pad_to_length=pad_to_length)
+                for i in range(len(self.datasets))
+                if len(samples_dict[i]) > 0
+            ]
+
+            def straight_data(tensors):
+                # concatenate the per-dataset tensors along the first dimension
+                batch = torch.cat(tensors, dim=0)
+                return batch
+
+            src_lengths = straight_data([b['net_input']['src_lengths'] for b in batches])
+            # sort_order is reused below so every field follows the same permutation
+            src_lengths, sort_order = src_lengths.sort(descending=True)
+
+            def straight_order(tensors):
+                # concatenate, then reorder rows by descending source length
+                batch = straight_data(tensors)
+                return batch.index_select(0, sort_order)
+
+            batch = {
+                'id': straight_order([b['id'] for b in batches]),
+                'nsentences': sum(b['nsentences'] for b in batches),
+                'ntokens': sum(b['ntokens'] for b in batches),
+                'net_input': {
+                    'src_tokens': straight_order([b['net_input']['src_tokens'] for b in batches]),
+                    'src_lengths': src_lengths,
+                },
+                'target': straight_order([b['target'] for b in batches]) if batches[0]['target'] is not None else None,
+            }
+            # optional fields: presence is decided by the first batch only
+            if 'prev_output_tokens' in batches[0]['net_input']:
+                batch['net_input']['prev_output_tokens'] = straight_order(
+                    [b['net_input']['prev_output_tokens'] for b in batches])
+            if 'src_lang_id' in batches[0]['net_input']:
+                batch['net_input']['src_lang_id'] = straight_order([b['net_input']['src_lang_id'] for b in batches])
+            if 'tgt_lang_id' in batches[0]:
+                batch['tgt_lang_id'] = straight_order([b['tgt_lang_id'] for b in batches])
+            return batch
+
+    @property
+    def sizes(self):
+        """Sizes of every virtual sample as an ``np.int64`` array.
+
+        The array is built on first access and kept in ``self._sizes``;
+        individual lookups are memoised in ``self._size_cache`` under the key
+        ``(dataset index, sample index)``.
+        """
+        if self._sizes is not None:
+            return self._sizes
+        began_at = time.time()
+        cache = self._size_cache
+        collected = []
+        for virtual_index in range(len(self)):
+            which_dataset, local_index = self._get_dataset_and_index(virtual_index)
+            cache_key = (which_dataset, local_index)
+            if cache_key not in cache:
+                cache[cache_key] = self.datasets[which_dataset].size(local_index)
+            collected.append(cache[cache_key])
+        logger.debug(f'sizes() calling time: {get_time_gap(began_at, time.time())}')
+        self._sizes = np.array(collected, np.int64)
+        return self._sizes
+
+    def ordered_indices(self):
+        """Return the order in which samples should be visited for batching.
+
+        With ``self.batch_by_size`` the indices are sorted by target length and
+        then (stably) by source length; otherwise the natural order is used.
+        The result is cached in ``self._ordered_indices``.
+
+        Returns:
+            np.ndarray: a permutation of ``range(len(self))``.
+        """
+        if self._ordered_indices is not None:
+            return self._ordered_indices
+
+        if self.batch_by_size:
+            # No need to do shuffle as the data items are already randomized
+            indices = np.arange(len(self))
+            sizes = self.sizes
+            # Only a 2-D array carries per-side columns. Check the rank before
+            # touching shape[1], which raised IndexError for 1-D sizes.
+            is_2d = len(sizes.shape) > 1
+            tgt_sizes = sizes[:, 1] if is_2d and sizes.shape[1] > 1 else None
+            src_sizes = sizes[:, 0] if is_2d else sizes
+
+            # sort by target length, then source length
+            if tgt_sizes is not None:
+                indices = indices[
+                    np.argsort(tgt_sizes[indices], kind='mergesort')
+                ]
+            # mergesort is stable, so the target ordering survives among equal sources
+            sort_indices = indices[np.argsort(src_sizes[indices], kind='mergesort')]
+        else:
+            sort_indices = np.arange(len(self))
+        self._ordered_indices = sort_indices
+        return sort_indices
+
+    def prefetch(self, indices):
+        """Forward a prefetch request to the sub-datasets.
+
+        Each virtual index is translated to ``(dataset, sample)`` coordinates
+        and every sub-dataset is asked to prefetch its own share (possibly an
+        empty list).
+        """
+        per_dataset = [[] for _ in self.datasets]
+        for virtual_index in indices:
+            which_dataset, local_index = self._get_dataset_and_index(virtual_index)
+            per_dataset[which_dataset].append(local_index)
+        for dataset, wanted in zip(self.datasets, per_dataset):
+            dataset.prefetch(wanted)
+
+    def set_epoch(self, epoch):
+        """Move the dataset, and every sub-dataset that supports it, to ``epoch``.
+
+        A repeated call with the current epoch only runs the parent hook; the
+        virtual datasets are re-established solely when the epoch changes.
+        """
+        super().set_epoch(epoch)
+        if epoch != self._cur_epoch:
+            for dataset in self.datasets:
+                if hasattr(dataset, 'set_epoch'):
+                    dataset.set_epoch(epoch)
+            self._cur_epoch = epoch
+            self._establish_virtual_datasets()
+
+    def _establish_virtual_datasets(self):
+        """(Re)build the virtual index arrays for the current epoch.
+
+        Draws indices through ``self.get_virtual_indices`` with an RNG seeded
+        from the class name, ``self.seed`` and ``self._cur_epoch``, stores the
+        results as ``PlasmaArray`` objects and logs the raw and resampled
+        sizes. Nothing is done when no sampling ratios are configured and the
+        indices already exist.
+        """
+        if self.sample_ratios is None and self._cur_indices is not None:
+            # not a sampling dataset, no need to resample if indices are already established
+            return
+        self._reset_cached_properties()
+
+        start_time = time.time()
+        # Generate a weighted sample of indices as a function of the
+        # random seed and the current epoch.
+        rng = np.random.RandomState(
+           [
+               int(hashlib.sha1(str(self.__class__.__name__).encode('utf-8')).hexdigest(), 16) % (2 ** 32),
+               self.seed % (2 ** 32),  # global seed
+               self._cur_epoch,  # epoch index,
+           ]
+        )
+        indices, cumulated_sizes, virtual_size_per_dataset = self.get_virtual_indices(
+            rng, self.datasets, self.sample_ratios, self.virtual_size)
+
+        # clean up the previous arrays (if any) before replacing them
+        self._clean_if_not_none([
+            self.cumulated_sizes, self.virtual_size_per_dataset
+        ])
+        self._cur_indices = plasma_utils.PlasmaArray(indices)
+        self.cumulated_sizes = plasma_utils.PlasmaArray(cumulated_sizes)
+        self.virtual_size_per_dataset = plasma_utils.PlasmaArray(virtual_size_per_dataset)
+
+        # the remainder only reports what was established
+        raw_sizes = [len(d) for d in self.datasets]
+        sampled_sizes = self.virtual_size_per_dataset.array
+        logger.info(f'[{self.split}] Raw sizes: {str(dict(zip(self.keys, raw_sizes)))}; '
+                    f'raw total size: {sum(raw_sizes)}')
+        logger.info(f'[{self.split}] Resampled sizes: {str(dict(zip(self.keys, sampled_sizes)))}; '
+                    f'resampled total size: {sum(sampled_sizes)}')
+        if self.sample_ratios is not None:
+            logger.info(f'[{self.split}] Upsampling ratios: {str(dict(zip(self.keys, self.sample_ratios.array)))}')
+        else:
+            logger.info(f'[{self.split}] A concat dataset')
+        logger.debug(f'[{self.split}] virtual dataset established time: {get_time_gap(start_time, time.time())}')
diff --git a/fairseq/data/multilingual/sampled_multi_epoch_dataset.py b/fairseq/data/multilingual/sampled_multi_epoch_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..fdd47e50910d8bc9a37f6936264591e8ae8413ae
--- /dev/null
+++ b/fairseq/data/multilingual/sampled_multi_epoch_dataset.py
@@ -0,0 +1,258 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import hashlib
+import math
+import logging
+import time
+
+import numpy as np
+import torch
+from fairseq import distributed_utils
+from fairseq.data import plasma_utils, SampledMultiDataset
+from .sampled_multi_dataset import default_virtual_size_func, get_time_gap, CollateFormat
+
+
+logger = logging.getLogger(__name__)
+
+
+class SampledMultiEpochDataset(SampledMultiDataset):
+    """Samples from multiple sub-datasets according to sampling ratios
+       using virtual epoch sizes to speed up dataloading.
+    Args:
+        datasets (
+            List[~torch.utils.data.Dataset]
+            or OrderedDict[str, ~torch.utils.data.Dataset]
+        ): datasets
+        sampling_ratios (List[float]): list of probability of each dataset to be sampled
+            (default: None, which corresponds to concatenating all datasets together).
+        batch_by_size (bool): whether or not to batch by sequence length
+            (default: False).
+        seed (int): RNG seed to use (default: 2).
+        epoch (int): starting epoch number (default: 1).
+        eval_key (str, optional): a key used at evaluation time that causes
+            this instance to pass-through batches from *datasets[eval_key]*.
+        collate_format (CollateFormat):  collater output format, either CollateFormat.ordered_dict or
+            CollateFormat.single (default: CollateFormat.single) where CollateFormat.single configures
+            the collater to output batches of data mixed from all sub-datasets,
+            and CollateFormat.ordered_dict configures the collater to output a dictionary of batches indexed by keys
+            of sub-datasets.
+            Note that not all sub-datasets will be present in a single batch in both formats.
+        virtual_size (int, or callable): the expected virtual size of the dataset (default: default_virtual_size_func).
+        split (str): the split of the data, e.g. 'train', 'valid' or 'test'.
+        virtual_epoch_size (int): virtual epoch size, the dataset will go through the data by
+            this virtual epoch size one by one to speed up data loading, e.g. indexing and filtering
+            can be performed whenever a virtual epoch is loaded without waiting for the whole dataset to be loaded.
+        shared_collater (bool): whether or not all sub-datasets share the same collater.
+        shard_epoch (int): the real epoch number for shard selection.
+    """
+    def __init__(
+        self,
+        datasets,
+        sampling_ratios=None,
+        batch_by_size=False,
+        seed=2,
+        epoch=1,
+        eval_key=None,
+        collate_format=CollateFormat.single,
+        virtual_size=default_virtual_size_func,
+        split='',
+        virtual_epoch_size=None,
+        shared_collater=False,
+        shard_epoch=1,
+    ):
+        # Epoch-local state is initialised before the parent constructor runs;
+        # presumably the parent triggers set_epoch(), which reads it (verify).
+        self.virtual_epoch_size = virtual_epoch_size
+        self._current_epoch_start_index = None
+        self._epoch_sizes = None
+        self._epoch_ordered_indices = None
+        self._random_globa_indices = None
+        self.shard_epoch = shard_epoch if shard_epoch is not None else 1
+        # None means that no global permutation has been drawn yet
+        self.load_next_shard = None
+        super().__init__(
+            datasets=datasets,
+            sampling_ratios=sampling_ratios,
+            batch_by_size=batch_by_size,
+            seed=seed,
+            epoch=epoch,
+            eval_key=eval_key,
+            collate_format=collate_format,
+            virtual_size=virtual_size,
+            split=split,
+            shared_collater=shared_collater,
+        )
+
+    def _setup(self, epoch):
+        """Resolve the virtual epoch size (capped at the virtual dataset size)
+        and compute the start offset of ``epoch``."""
+        self.virtual_epoch_size = self.virtual_epoch_size if self.virtual_epoch_size is not None else self.virtual_size
+        if self.virtual_epoch_size > self.virtual_size:
+            logger.warning(f'virtual epoch size {self.virtual_epoch_size} '
+                           f'is greater than virtual dataset size {self.virtual_size}')
+            self.virtual_epoch_size = self.virtual_size
+        self.num_virtual_epochs = math.ceil(self.virtual_size / self.virtual_epoch_size)
+        self._current_epoch_start_index = self._get_epoch_start_index(epoch)
+        logger.info(f'virtual epoch size {self.virtual_epoch_size}; virtual dataset size {self.virtual_size}')
+
+    def _map_epoch_index_to_global(self, index):
+        """Translate an index local to the current virtual epoch into an index
+        of the whole virtual dataset, through the random global permutation."""
+        index = self._current_epoch_start_index + index
+        # add randomness
+        return self._random_globa_indices.array[index]
+
+    def __getitem__(self, index):
+        """Return the parent's item for the epoch-local ``index``."""
+        i = self._map_epoch_index_to_global(index)
+        return super().__getitem__(i)
+
+    def num_tokens(self, index):
+        """Return the parent's token count for the epoch-local ``index``."""
+        i = self._map_epoch_index_to_global(index)
+        return super().num_tokens(i)
+
+    def size(self, index):
+        """Return the size of the sample at epoch-local ``index``, served from
+        the per-epoch cache when it has been built."""
+        if self._epoch_sizes is not None:
+            return self._epoch_sizes.array[index]
+        index = self._map_epoch_index_to_global(index)
+        ds_idx, ds_sample_idx = self._get_dataset_and_index(index)
+        return self.datasets[ds_idx].size(ds_sample_idx)
+
+    def __len__(self):
+        """Length of the current virtual epoch; the last epoch of a pass may be
+        shorter than ``virtual_epoch_size``."""
+        return (
+            self.virtual_epoch_size
+            if self._current_epoch_start_index + self.virtual_epoch_size < self.virtual_size
+            else self.virtual_size - self._current_epoch_start_index
+        )
+
+    @property
+    def sizes(self):
+        """Sizes of the samples of the current virtual epoch as an ``np.int64``
+        array; scalar sizes are duplicated into ``(s, s)`` pairs. The array is
+        built once per virtual epoch and cached in ``self._epoch_sizes``."""
+        if self._epoch_sizes is not None:
+            return self._epoch_sizes.array
+        start_time = time.time()
+
+        size_cache = self._size_cache
+        ret = []
+        for i in range(len(self)):
+            index = self._map_epoch_index_to_global(i)
+            ds_idx, ds_sample_idx = self._get_dataset_and_index(index)
+
+            if (ds_idx, ds_sample_idx) in size_cache:
+                ret.append(size_cache[(ds_idx, ds_sample_idx)])
+            else:
+                s = self.datasets[ds_idx].size(ds_sample_idx)
+                s = (s, s) if not isinstance(s, tuple) else s
+                size_cache[(ds_idx, ds_sample_idx)] = s
+                ret.append(s)
+        self._epoch_sizes = plasma_utils.PlasmaArray(np.array(ret, np.int64))
+        logger.info(f'sizes() calling time: {get_time_gap(start_time, time.time())}')
+        return self._epoch_sizes.array
+
+    def ordered_indices(self):
+        """Return the visiting order for the current virtual epoch: sorted by
+        target then source length when ``batch_by_size`` is set, otherwise the
+        natural order. Cached in ``self._epoch_ordered_indices``."""
+        if self._epoch_ordered_indices is not None:
+            return self._epoch_ordered_indices.array
+
+        if self.batch_by_size:
+            # No need to do shuffle as the data items are already randomized
+            indices = np.arange(len(self))
+            sizes = self.sizes
+            # Check the rank before touching shape[1]: a 1-D array (e.g. an
+            # empty epoch) used to raise IndexError here.
+            is_2d = len(sizes.shape) > 1
+            tgt_sizes = sizes[:, 1] if is_2d and sizes.shape[1] > 1 else None
+            src_sizes = sizes[:, 0] if is_2d else sizes
+
+            # sort by target length, then source length
+            if tgt_sizes is not None:
+                indices = indices[
+                    np.argsort(tgt_sizes[indices], kind='mergesort')
+                ]
+            sort_indices = indices[np.argsort(src_sizes[indices], kind='mergesort')]
+        else:
+            sort_indices = np.arange(len(self))
+        self._epoch_ordered_indices = plasma_utils.PlasmaArray(sort_indices)
+        return self._epoch_ordered_indices.array
+
+    def prefetch(self, indices):
+        """Translate epoch-local ``indices`` to per-dataset sample indices and
+        ask every sub-dataset to prefetch its share."""
+        prefetch_indices = [[] for _ in range(len(self.datasets))]
+        for i in indices:
+            index = self._map_epoch_index_to_global(i)
+            ds_idx, ds_sample_idx = self._get_dataset_and_index(index)
+            prefetch_indices[ds_idx].append(ds_sample_idx)
+        for i in range(len(prefetch_indices)):
+            self.datasets[i].prefetch(prefetch_indices[i])
+
+    def set_epoch(self, epoch):
+        """Move to virtual epoch ``epoch``; the first call also performs the
+        one-off setup, and a repeated call for the current epoch is a no-op."""
+        if self._current_epoch_start_index is None:
+            self._setup(epoch)
+            self._next_virtual_epoch(epoch)
+        if epoch == self._cur_epoch:
+            # re-enter so return
+            return
+        self._next_virtual_epoch(epoch)
+
+    def _get_epoch_start_index(self, epoch):
+        """Offset of ``epoch`` inside the virtual dataset; epochs wrap around
+        after ``num_virtual_epochs``."""
+        assert epoch >= 1  # fairseq is using 1-based epoch everywhere
+        return ((epoch - 1) % self.num_virtual_epochs) * self.virtual_epoch_size
+
+    def _next_global_indices(self, epoch):
+        """Draw a fresh random permutation of the virtual dataset and, from the
+        second call on, advance ``shard_epoch`` and flag that the next shard
+        should be loaded."""
+        rng = np.random.RandomState(
+            [
+                int(hashlib.sha1(str(self.__class__.__name__).encode('utf-8')).hexdigest(), 16) % (2 ** 32),
+                self.seed % (2 ** 32),  # global seed
+                epoch,  # epoch index,
+            ]
+        )
+        del self._random_globa_indices
+        self._random_globa_indices = plasma_utils.PlasmaArray(
+            rng.choice(self.virtual_size, self.virtual_size, replace=False))
+        if self.load_next_shard is None:
+            self.load_next_shard = False
+        else:
+            # increase shard epoch for next loading
+            self.shard_epoch += 1
+            self.load_next_shard = True
+            # a hack to avoid possible out of sync of shard epoch number
+            # TODO: to confirm whether this is needed; without it, CUDA event error is occasionally observed
+            synced_shard_epoch = self._sync_shard_epoch(self.shard_epoch)
+            logger.info('to load next epoch/shard in next load_dataset: '
+                        f'epoch={epoch}/shard_epoch={self.shard_epoch}[synced={synced_shard_epoch}]')
+
+    def _all_reduce_scalar(self, value):
+        """All-reduce ``value`` across workers and return a 1-element numpy array.
+
+        Outside of a distributed run the local value is returned, instead of
+        failing on an unbound result variable as the previous code did.
+        """
+        tensor = torch.DoubleTensor([value])
+        if torch.distributed.is_initialized():
+            if torch.cuda.is_available():
+                # keep the device tensor: reducing a temporary ``.cuda()`` copy
+                # and reading the CPU tensor afterwards discards the result
+                tensor = tensor.cuda()
+            # relies on all_reduce updating its argument in place, as the
+            # original CPU branch already did
+            distributed_utils.all_reduce(tensor)
+        return tensor.cpu().numpy()
+
+    def _sync_shard_epoch(self, shard_epoch):
+        """All-reduce the shard epoch across workers."""
+        # in case the ratios are not precisely the same across processes
+        # also to ensure every process updates the ratios at the same pace
+        return self._all_reduce_scalar(shard_epoch)
+
+    def _sync_epoch(self, epoch):
+        """All-reduce the epoch number across workers."""
+        # in case the ratios are not precisely the same across processes
+        # also to ensure every process updates the ratios at the same pace
+        return self._all_reduce_scalar(epoch)
+
+    def _next_virtual_epoch(self, epoch):
+        """Switch to virtual epoch ``epoch`` and drop all per-epoch caches. A
+        new global permutation is established when the epoch starts a new pass
+        over the virtual dataset or none exists yet."""
+        index = self._get_epoch_start_index(epoch)
+        if index == 0 or self._random_globa_indices is None:
+            # need to start from the beginning,
+            # so call super().set_epoch(epoch) to establish the global virtual indices
+            logger.info('establishing a new set of global virtual indices for '
+                        f'epoch={epoch}/shard_epoch={self.shard_epoch}')
+            super().set_epoch(epoch)
+            self._next_global_indices(epoch)
+        else:
+            self._cur_epoch = epoch
+
+        # reset cache sizes and ordered_indices for the epoch after moving to a new epoch
+        self._clean_if_not_none([
+            self._epoch_sizes, self._epoch_ordered_indices, self._size_cache
+        ])
+        self._epoch_sizes = None
+        self._epoch_ordered_indices = None
+        self._current_epoch_start_index = index
+        self._size_cache = {}
diff --git a/fairseq/data/multilingual/sampling_method.py b/fairseq/data/multilingual/sampling_method.py
new file mode 100644
index 0000000000000000000000000000000000000000..6a9d39f7a6e48104201d9a8019abd9124aa775b8
--- /dev/null
+++ b/fairseq/data/multilingual/sampling_method.py
@@ -0,0 +1,66 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import List
+import logging
+
+
+logger = logging.getLogger(__name__)
+
+
+def uniform(dataset_sizes: List[int]):
+    """Give every dataset the same weight of ``1.0``, whatever its size."""
+    return [1.0 for _ in range(len(dataset_sizes))]
+
+
+def temperature_sampling(dataset_sizes, temp):
+    """Weight each dataset by its share of the total size raised to ``1 / temp``.
+
+    The returned weights are not renormalised.
+    """
+    grand_total = sum(dataset_sizes)
+    weights = []
+    for n_items in dataset_sizes:
+        weights.append((n_items / grand_total) ** (1.0/temp))
+    return weights
+
+
+def make_temperature_sampling(temp=1.0):
+    """Build a ``dataset_sizes -> weights`` function bound to temperature ``temp``."""
+    def sampling_func(dataset_sizes):
+        weights = temperature_sampling(dataset_sizes, temp)
+        return weights
+    return sampling_func
+
+
+def make_ratio_sampling(ratios):
+    """Build a sampling function that ignores the dataset sizes and always
+    returns the given ``ratios`` object."""
+    def sampling_func(dataset_sizes):
+        fixed = ratios
+        return fixed
+    return sampling_func
+
+
+class SamplingMethod:
+    """Chooses the per-dataset sampling function from command-line arguments."""
+
+    @staticmethod
+    def add_arguments(parser):
+        """Register ``--sampling-method`` and ``--sampling-temperature`` on ``parser``."""
+        parser.add_argument(
+            '--sampling-method',
+            type=str,
+            default='concat',
+            choices=['uniform', 'temperature', 'concat', 'RoundRobin'],
+            help='The method to sample data per language pairs',
+        )
+        parser.add_argument(
+            '--sampling-temperature',
+            type=float,
+            default=1.5,
+            help='only work with --sampling-method temperature',
+        )
+
+    @staticmethod
+    def build_sampler(args, task):
+        """Factory returning a ``SamplingMethod`` for ``args`` and ``task``."""
+        return SamplingMethod(args, task)
+
+    def __init__(self, args, task):
+        self.args = args
+        self.task = task
+
+    def is_adaptive(self):
+        """Return ``False``; this sampler is not adaptive."""
+        return False
+
+    def sampling_method_selector(self):
+        """Return the sampling function for the configured method.
+
+        Returns:
+            ``uniform`` for ``'uniform'``; a temperature sampler for
+            ``'temperature'`` or when ``is_adaptive()`` is true; ``None``
+            otherwise.
+        """
+        chosen = self.args.sampling_method
+        logger.info(f'selected sampler: {chosen}')
+        if chosen == 'uniform':
+            return uniform
+        if chosen == 'temperature' or self.is_adaptive():
+            return make_temperature_sampling(float(self.args.sampling_temperature))
+        # default to concating all data set together
+        return None
diff --git a/fairseq/data/nested_dictionary_dataset.py b/fairseq/data/nested_dictionary_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..2795f895ddeeb70b53368c3624d466436810c7f9
--- /dev/null
+++ b/fairseq/data/nested_dictionary_dataset.py
@@ -0,0 +1,116 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from collections import OrderedDict
+
+import torch
+from torch.utils.data.dataloader import default_collate
+
+from . import FairseqDataset
+
+
+def _flatten(dico, prefix=None):
+    """Flatten a nested dictionary.
+
+    Nested keys are joined with ``'.'`` and list positions are written as
+    ``'[i]'`` components. Dictionary entries whose value is ``None`` are
+    dropped.
+
+    Args:
+        dico: a dict, a list or a leaf value.
+        prefix (str, optional): flattened key of ``dico`` itself; ``None`` at
+            the top level.
+
+    Returns:
+        OrderedDict: flattened key -> leaf value.
+    """
+    new_dico = OrderedDict()
+    if isinstance(dico, dict):
+        prefix = prefix + '.' if prefix is not None else ''
+        for k, v in dico.items():
+            if v is None:
+                continue
+            new_dico.update(_flatten(v, prefix + k))
+    elif isinstance(dico, list):
+        # a top-level list has no parent key; concatenating onto ``None``
+        # used to raise TypeError here
+        list_prefix = prefix + '.' if prefix is not None else ''
+        for i, v in enumerate(dico):
+            new_dico.update(_flatten(v, list_prefix + '[' + str(i) + ']'))
+    else:
+        new_dico = OrderedDict({prefix: dico})
+    return new_dico
+
+
+def _unflatten(dico):
+    """Unflatten a flattened dictionary into a nested dictionary.
+
+    Every ``'.'`` in a key opens one nesting level. Intermediate components
+    of the form ``'[i]'`` become the integer key ``i``; the final component
+    is always kept as a string.
+    """
+    nested = OrderedDict()
+    for dotted_key, value in dico.items():
+        *parents, leaf = dotted_key.split('.')
+        cursor = nested
+        for part in parents:
+            if part.startswith('[') and part.endswith(']'):
+                part = int(part[1:-1])
+            cursor = cursor.setdefault(part, OrderedDict())
+        cursor[leaf] = value
+    return nested
+
+
+class NestedDictionaryDataset(FairseqDataset):
+    """Dataset whose samples are nested dictionaries assembled from sub-datasets.
+
+    Args:
+        defn: nested dict/list structure whose leaves are datasets; it is
+            flattened on construction and all non-empty leaves must have the
+            same length.
+        sizes: one indexable collection of sample sizes, or a list/tuple of
+            such collections.
+    """
+
+    def __init__(self, defn, sizes=None):
+        super().__init__()
+        self.defn = _flatten(defn)
+        self.sizes = [sizes] if not isinstance(sizes, (list, tuple)) else sizes
+
+        first = None
+        for v in self.defn.values():
+            if not isinstance(v, (FairseqDataset, torch.utils.data.Dataset, )):
+                raise ValueError('Expected Dataset but found: {}'.format(v.__class__))
+            first = first or v
+            if len(v) > 0:
+                assert len(v) == len(first), 'dataset lengths must match'
+
+        self._len = len(first)
+
+    def __getitem__(self, index):
+        """Return an ``OrderedDict`` mapping every flattened key to that
+        sub-dataset's item at ``index``."""
+        return OrderedDict((k, ds[index]) for k, ds in self.defn.items())
+
+    def __len__(self):
+        return self._len
+
+    def collater(self, samples):
+        """Merge a list of samples to form a mini-batch.
+
+        Args:
+            samples (List[dict]): samples to collate
+
+        Returns:
+            dict: a mini-batch suitable for forwarding with a Model
+        """
+        if len(samples) == 0:
+            return {}
+        sample = OrderedDict()
+        for k, ds in self.defn.items():
+            try:
+                sample[k] = ds.collater([s[k] for s in samples])
+            except NotImplementedError:
+                # sub-datasets without their own collater use torch's default
+                sample[k] = default_collate([s[k] for s in samples])
+        return _unflatten(sample)
+
+    def num_tokens(self, index):
+        """Return the number of tokens in a sample. This value is used to
+        enforce ``--max-tokens`` during batching."""
+        return max(s[index] for s in self.sizes)
+
+    def size(self, index):
+        """Return an example's size as a float or tuple. This value is used when
+        filtering a dataset with ``--max-positions``."""
+        if len(self.sizes) == 1:
+            return self.sizes[0][index]
+        else:
+            # build a real tuple: a bare generator expression cannot be
+            # indexed, compared or iterated twice
+            return tuple(s[index] for s in self.sizes)
+
+    @property
+    def supports_prefetch(self):
+        """Whether this dataset supports prefetching."""
+        # plain torch datasets accepted by __init__ may lack the attribute
+        return any(getattr(ds, 'supports_prefetch', False) for ds in self.defn.values())
+
+    def prefetch(self, indices):
+        """Prefetch the data required for this epoch."""
+        for ds in self.defn.values():
+            if getattr(ds, 'supports_prefetch', False):
+                ds.prefetch(indices)
+
+    def set_epoch(self, epoch):
+        """Propagate the epoch to every sub-dataset that accepts it."""
+        super().set_epoch(epoch)
+        for ds in self.defn.values():
+            # plain torch datasets accepted by __init__ have no set_epoch
+            if hasattr(ds, 'set_epoch'):
+                ds.set_epoch(epoch)
diff --git a/fairseq/data/noising.py b/fairseq/data/noising.py
new file mode 100644
index 0000000000000000000000000000000000000000..5801ae6eac1a0c96750e24bf8d43803f125e03c1
--- /dev/null
+++ b/fairseq/data/noising.py
@@ -0,0 +1,315 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import numpy as np
+
+from fairseq.data import data_utils
+
+
+class WordNoising(object):
+    """Base class for noise functions that perturb a sentence while leaving
+    the individual words untouched."""
+    def __init__(self, dictionary, bpe_cont_marker="@@", bpe_end_marker=None):
+        self.dictionary = dictionary
+        self.bpe_end = None
+        # bpe_end[i] tells whether vocabulary entry i closes a word; the
+        # continuation marker takes precedence when both markers are given.
+        if bpe_cont_marker:
+            closes_word = [
+                not self.dictionary[tok].endswith(bpe_cont_marker)
+                for tok in range(len(self.dictionary))
+            ]
+            self.bpe_end = np.array(closes_word)
+        elif bpe_end_marker:
+            closes_word = [
+                self.dictionary[tok].endswith(bpe_end_marker)
+                for tok in range(len(self.dictionary))
+            ]
+            self.bpe_end = np.array(closes_word)
+
+        # without BPE information every token counts as its own word
+        if self.bpe_end is None:
+            self.get_word_idx = self._get_token_idx
+        else:
+            self.get_word_idx = self._get_bpe_word_idx
+
+    def noising(self, x, lengths, noising_prob=0.0):
+        """Apply the noise; must be implemented by subclasses."""
+        raise NotImplementedError()
+
+    def _get_bpe_word_idx(self, x):
+        """
+        Map every BPE token position to the index of the word it belongs to.
+        For example, for input x corresponding to ["how", "are", "y@@", "ou"],
+        return [[0], [1], [2], [2]].
+        """
+        # x: (T x B)
+        word_ends = self.bpe_end[x]
+
+        single_token = x.size(0) == 1 and x.size(1) == 1
+        if single_token:
+            # If x = [[N]], the lookup above yields a scalar (bool) instead of
+            # a 2-dim array of bools, which makes the sum below fail.
+            return np.array([[0]])
+
+        # suffix sums of the word-end flags, turned into ascending word ids
+        suffix_counts = word_ends[::-1].cumsum(0)[::-1]
+        return suffix_counts.max(0)[None, :] - suffix_counts
+
+    def _get_token_idx(self, x):
+        """
+        Word indices for non-BPE input (e.g. words or characters): each token
+        is its own word, so the index is simply the position.
+        """
+        per_sentence = torch.t(x)
+        positions = np.array([range(len(row)) for row in per_sentence])
+        return np.transpose(positions)
+
+
+class WordDropout(WordNoising):
+    """Randomly drop input words. If not passing blank_idx (default is None),
+    then dropped words will be removed. Otherwise, it will be replaced by the
+    blank_idx."""
+
+    def __init__(self, dictionary, default_dropout_prob=0.1, bpe_cont_marker="@@", bpe_end_marker=None):
+        super().__init__(dictionary, bpe_cont_marker, bpe_end_marker)
+        # used by noising() when no dropout_prob is passed
+        self.default_dropout_prob = default_dropout_prob
+
+    def noising(self, x, lengths, dropout_prob=None, blank_idx=None):
+        """Drop (or blank out) whole words of every sentence in the batch.
+
+        Args:
+            x: token ids, laid out as (T x B).
+            lengths: sentence lengths, one per batch entry.
+            dropout_prob: probability of dropping a word; falls back to
+                ``self.default_dropout_prob`` when ``None``. ``0`` returns the
+                input unchanged, any other value must lie strictly in (0, 1).
+            blank_idx: replacement token for dropped words; with ``None`` the
+                dropped words are removed.
+
+        Returns:
+            tuple: ``(modified_x, modified_lengths)``, re-padded with
+            ``self.dictionary.pad()``.
+        """
+        if dropout_prob is None:
+            dropout_prob = self.default_dropout_prob
+        # x: (T x B), lengths: B
+        if dropout_prob == 0:
+            return x, lengths
+
+        assert 0 < dropout_prob < 1
+
+        # be sure to drop entire words
+        word_idx = self.get_word_idx(x)
+        sentences = []
+        modified_lengths = []
+        for i in range(lengths.size(0)):
+            # Since dropout probabilities need to apply over non-pad tokens,
+            # it is not trivial to generate the keep mask without consider
+            # input lengths; otherwise, this could be done outside the loop
+
+            # We want to drop whole words based on word_idx grouping
+            num_words = max(word_idx[:, i]) + 1
+
+            # ith example: [x0, x1, ..., eos, pad, ..., pad]
+            # We should only generate keep probs for non-EOS tokens. Thus if the
+            # input sentence ends in EOS, the last word idx is not included in
+            # the dropout mask generation and we append True to always keep EOS.
+            # Otherwise, just generate the dropout mask for all word idx
+            # positions.
+            has_eos = x[lengths[i] - 1, i] == self.dictionary.eos()
+            if has_eos:  # has eos?
+                keep = np.random.rand(num_words - 1) >= dropout_prob
+                keep = np.append(keep, [True])  # keep EOS symbol
+            else:
+                keep = np.random.rand(num_words) >= dropout_prob
+
+            # only the non-pad part of the sentence is considered
+            words = x[:lengths[i], i].tolist()
+
+            # TODO: speed up the following loop
+            # drop words from the input according to keep
+            new_s = [
+                w if keep[word_idx[j, i]] else blank_idx
+                for j, w in enumerate(words)
+            ]
+            # with blank_idx=None the dropped positions are None and are removed here
+            new_s = [w for w in new_s if w is not None]
+            # we need to have at least one word in the sentence (more than the
+            # start / end sentence symbols)
+            if len(new_s) <= 1:
+                # insert at beginning in case the only token left is EOS
+                # EOS should be at end of list.
+                new_s.insert(0, words[np.random.randint(0, len(words))])
+            assert len(new_s) >= 1 and (
+                not has_eos  # Either don't have EOS at end or last token is EOS
+                or (len(new_s) >= 2 and new_s[-1] == self.dictionary.eos())
+            ), "New sentence is invalid."
+            sentences.append(new_s)
+            modified_lengths.append(len(new_s))
+        # re-construct input
+        modified_lengths = torch.LongTensor(modified_lengths)
+        # allocate (max length x batch) and fill with padding
+        modified_x = torch.LongTensor(
+            modified_lengths.max(),
+            modified_lengths.size(0)
+        ).fill_(self.dictionary.pad())
+        for i in range(modified_lengths.size(0)):
+            modified_x[:modified_lengths[i], i].copy_(torch.LongTensor(sentences[i]))
+
+        return modified_x, modified_lengths
+
+
+class WordShuffle(WordNoising):
+    """Shuffle words by no more than k positions."""
+
+    def __init__(self, dictionary, default_max_shuffle_distance=3, bpe_cont_marker="@@", bpe_end_marker=None):
+        super().__init__(dictionary, bpe_cont_marker, bpe_end_marker)
+        # honour the constructor argument; it used to be ignored in favour of
+        # a hard-coded 3
+        self.default_max_shuffle_distance = default_max_shuffle_distance
+
+    def noising(self, x, lengths, max_shuffle_distance=None):
+        """Shuffle the words of every sentence in the batch.
+
+        Args:
+            x: token ids, laid out as (T x B).
+            lengths: sentence lengths, one per batch entry.
+            max_shuffle_distance: upper bound of the random offset added to
+                each word position; falls back to
+                ``self.default_max_shuffle_distance`` when ``None``. ``0``
+                returns the input unchanged, any other value must be > 1.
+
+        Returns:
+            tuple: ``(shuffled_x, lengths)``; a trailing EOS token and the
+            first position of each sentence stay in place.
+        """
+        if max_shuffle_distance is None:
+            max_shuffle_distance = self.default_max_shuffle_distance
+        # x: (T x B), lengths: B
+        if max_shuffle_distance == 0:
+            return x, lengths
+
+        # max_shuffle_distance < 1 will return the same sequence
+        assert max_shuffle_distance > 1
+
+        # define noise word scores
+        noise = np.random.uniform(
+            0,
+            max_shuffle_distance,
+            size=(x.size(0), x.size(1)),
+        )
+        noise[0] = -1  # do not move start sentence symbol
+        # be sure to shuffle entire words
+        word_idx = self.get_word_idx(x)
+        x2 = x.clone()
+        for i in range(lengths.size(0)):
+            length_no_eos = lengths[i]
+            if x[lengths[i] - 1, i] == self.dictionary.eos():
+                length_no_eos = lengths[i] - 1
+            # generate a random permutation: all tokens of a word share the
+            # noise drawn for that word's index
+            scores = word_idx[:length_no_eos, i] + noise[word_idx[:length_no_eos, i], i]
+            # ensure no reordering inside a word
+            scores += 1e-6 * np.arange(length_no_eos.item())
+            permutation = scores.argsort()
+            # shuffle words
+            x2[:length_no_eos, i].copy_(
+                x2[:length_no_eos, i][torch.from_numpy(permutation)]
+            )
+        return x2, lengths
+
+
+class UnsupervisedMTNoising(WordNoising):
+    """
+    Default UnsupervisedMT noising: word shuffle, then word dropout, then word
+    blanking with dictionary.unk() (github.com/facebookresearch/UnsupervisedMT)
+    """
+    def __init__(
+        self,
+        dictionary,
+        max_word_shuffle_distance,
+        word_dropout_prob,
+        word_blanking_prob,
+        bpe_cont_marker="@@",
+        bpe_end_marker=None,
+    ):
+        super().__init__(dictionary)  # NOTE(review): the bpe markers are only forwarded to the sub-noisers below
+        self.max_word_shuffle_distance = max_word_shuffle_distance
+        self.word_dropout_prob = word_dropout_prob
+        self.word_blanking_prob = word_blanking_prob
+
+        self.word_dropout = WordDropout(  # used for both the dropout and the blanking step
+            dictionary=dictionary,
+            bpe_cont_marker=bpe_cont_marker,
+            bpe_end_marker=bpe_end_marker,
+        )
+        self.word_shuffle = WordShuffle(  # shuffle distance is passed per call in noising()
+            dictionary=dictionary,
+            bpe_cont_marker=bpe_cont_marker,
+            bpe_end_marker=bpe_end_marker,
+        )
+
+    def noising(self, x, lengths):
+        # 1. Word Shuffle
+        noisy_src_tokens, noisy_src_lengths = self.word_shuffle.noising(
+            x=x,
+            lengths=lengths,
+            max_shuffle_distance=self.max_word_shuffle_distance,
+        )
+        # 2. Word Dropout
+        noisy_src_tokens, noisy_src_lengths = self.word_dropout.noising(
+            x=noisy_src_tokens,
+            lengths=noisy_src_lengths,
+            dropout_prob=self.word_dropout_prob,
+        )
+        # 3. Word Blanking (same WordDropout instance, called with blank_idx set to unk)
+        noisy_src_tokens, noisy_src_lengths = self.word_dropout.noising(
+            x=noisy_src_tokens,
+            lengths=noisy_src_lengths,
+            dropout_prob=self.word_blanking_prob,
+            blank_idx=self.dictionary.unk(),
+        )
+
+        return noisy_src_tokens  # only the tokens are returned; noisy_src_lengths is dropped
+
+
+class NoisingDataset(torch.utils.data.Dataset):
+    def __init__(
+        self,
+        src_dataset,
+        src_dict,
+        seed,
+        noiser=None,
+        noising_class=UnsupervisedMTNoising,
+        **kwargs
+    ):
+        """
+        Wrap a :class:`~torch.utils.data.Dataset` and apply noise to the
+        samples based on the supplied noising configuration.
+
+        Args:
+            src_dataset (~torch.utils.data.Dataset): dataset to wrap.
+                to build self.src_dataset --
+                a LanguagePairDataset with src dataset as the source dataset and
+                None as the target dataset. Should NOT have padding so that
+                src_lengths are accurately calculated by language_pair_dataset
+                collate function.
+                We use language_pair_dataset here to encapsulate the tgt_dataset
+                so we can re-use the LanguagePairDataset collater to format the
+                batches in the structure that SequenceGenerator expects.
+            src_dict (~fairseq.data.Dictionary): source dictionary
+            seed (int): seed to use when generating random noise
+            noiser (WordNoising): a pre-initialized :class:`WordNoising`
+                instance. If this is None, a new instance will be created using
+                *noising_class* and *kwargs*.
+            noising_class (class, optional): class to use to initialize a
+                default :class:`WordNoising` instance.
+            kwargs (dict, optional): arguments to initialize the default
+                :class:`WordNoising` instance given by *noising_class*.
+        """
+        self.src_dataset = src_dataset
+        self.src_dict = src_dict
+        self.seed = seed
+        self.noiser = noiser if noiser is not None else noising_class(
+            dictionary=src_dict, **kwargs,
+        )
+
+    def __getitem__(self, index):
+        """
+        Returns a single noisy sample. Multiple samples are fed to the collater
+        to create a noising dataset batch.
+        """
+        src_tokens = self.src_dataset[index]
+        src_lengths = torch.LongTensor([len(src_tokens)])  # batch of one length
+        src_tokens = src_tokens.unsqueeze(0)
+
+        # Transpose src tokens to fit expected shape of x in noising function
+        # (batch size, sequence length) -> (sequence length, batch size)
+        src_tokens_t = torch.t(src_tokens)
+
+        with data_utils.numpy_seed(self.seed + index):  # the seed passed in depends only on (seed, index)
+            noisy_src_tokens = self.noiser.noising(src_tokens_t, src_lengths)
+
+        # noising() must return a single tensor here (not a (tokens, lengths) tuple);
+        # transpose back: (sequence length, 1) -> (1, sequence length)
+        noisy_src_tokens = torch.t(noisy_src_tokens)
+        return noisy_src_tokens[0]
+
+    def __len__(self):
+        """
+        The length of the noising dataset is the length of src.
+        """
+        return len(self.src_dataset)
+
+    @property
+    def supports_prefetch(self):
+        return self.src_dataset.supports_prefetch  # delegated to the wrapped dataset
+
+    def prefetch(self, indices):
+        if self.src_dataset.supports_prefetch:
+            self.src_dataset.prefetch(indices)  # no-op when the wrapped dataset cannot prefetch
diff --git a/fairseq/data/num_samples_dataset.py b/fairseq/data/num_samples_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d7ea440192d5ba91efcc5fa6dfae781c60c74f0
--- /dev/null
+++ b/fairseq/data/num_samples_dataset.py
@@ -0,0 +1,18 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from . import FairseqDataset
+
+
+class NumSamplesDataset(FairseqDataset):  # every item is 1, so collating a batch yields its sample count
+
+    def __getitem__(self, index):
+        return 1  # each sample contributes one to the batch total
+
+    def __len__(self):
+        return 0  # NOTE(review): reports no length of its own; presumably sized by a sibling dataset -- confirm
+
+    def collater(self, samples):
+        return sum(samples)  # equals len(samples) when fed this dataset's own items
diff --git a/fairseq/data/numel_dataset.py b/fairseq/data/numel_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..50087e5857cb9c988ff4eadba1bf59cf5527a47f
--- /dev/null
+++ b/fairseq/data/numel_dataset.py
@@ -0,0 +1,32 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+import torch
+
+from . import BaseWrapperDataset
+
+
+class NumelDataset(BaseWrapperDataset):  # yields the element count of each wrapped item
+
+    def __init__(self, dataset, reduce=False):
+        super().__init__(dataset)
+        self.reduce = reduce  # True: collater returns one summed total; False: a tensor of counts
+
+    def __getitem__(self, index):
+        item = self.dataset[index]
+        if torch.is_tensor(item):
+            return torch.numel(item)  # total number of elements in the tensor
+        else:
+            return np.size(item)  # non-tensor items are counted with numpy
+
+    def __len__(self):
+        return len(self.dataset)
+
+    def collater(self, samples):
+        if self.reduce:
+            return sum(samples)  # single total for the whole batch
+        else:
+            return torch.tensor(samples)  # one count per sample
diff --git a/fairseq/data/offset_tokens_dataset.py b/fairseq/data/offset_tokens_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..a6fd559a304d2f9ef06704dee7dbec19a3843b10
--- /dev/null
+++ b/fairseq/data/offset_tokens_dataset.py
@@ -0,0 +1,16 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from . import BaseWrapperDataset
+
+
+class OffsetTokensDataset(BaseWrapperDataset):  # adds a constant offset to every wrapped item
+
+    def __init__(self, dataset, offset):
+        super().__init__(dataset)
+        self.offset = offset  # value added to each item in __getitem__
+
+    def __getitem__(self, idx):
+        return self.dataset[idx] + self.offset  # relies on the item type's own `+` operator
diff --git a/fairseq/data/pad_dataset.py b/fairseq/data/pad_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c13b549aab2fbe09feae8b9054934840d89512b
--- /dev/null
+++ b/fairseq/data/pad_dataset.py
@@ -0,0 +1,31 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from fairseq.data import data_utils
+
+from . import BaseWrapperDataset
+
+
+class PadDataset(BaseWrapperDataset):  # collates samples through data_utils.collate_tokens
+
+    def __init__(self, dataset, pad_idx, left_pad):
+        super().__init__(dataset)
+        self.pad_idx = pad_idx  # second positional argument given to collate_tokens
+        self.left_pad = left_pad  # forwarded as collate_tokens(left_pad=...)
+
+    def collater(self, samples):
+        return data_utils.collate_tokens(samples, self.pad_idx, left_pad=self.left_pad)
+
+
+class LeftPadDataset(PadDataset):  # PadDataset with left_pad fixed to True
+
+    def __init__(self, dataset, pad_idx):
+        super().__init__(dataset, pad_idx, left_pad=True)
+
+
+class RightPadDataset(PadDataset):  # PadDataset with left_pad fixed to False
+
+    def __init__(self, dataset, pad_idx):
+        super().__init__(dataset, pad_idx, left_pad=False)
diff --git a/fairseq/data/plasma_utils.py b/fairseq/data/plasma_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..33f250eea943ee492a400f331b2bcbe8d69f9018
--- /dev/null
+++ b/fairseq/data/plasma_utils.py
@@ -0,0 +1,86 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import subprocess
+import tempfile
+
+
+class PlasmaArray(object):
+    """
+    Wrapper around numpy arrays that automatically moves the data to shared
+    memory upon serialization. This is particularly helpful when passing numpy
+    arrays through multiprocessing, so that data is not unnecessarily
+    duplicated or pickled.
+    """
+
+    def __init__(self, array):
+        super().__init__()
+        self.array = array
+        self.disable = array.nbytes < 134217728  # disable for arrays <128MB
+        self.object_id = None  # set on first pickle, once the array is put in the store
+        self.path = None  # socket path of the plasma store, set by start_server()
+
+        # variables with underscores shouldn't be pickled
+        self._client = None
+        self._server = None
+        self._server_tmp = None
+        self._plasma = None
+
+    @property
+    def plasma(self):
+        if self._plasma is None and not self.disable:
+            try:
+                import pyarrow.plasma as plasma  # optional dependency, imported lazily
+                self._plasma = plasma
+            except ImportError:
+                self._plasma = None  # without pyarrow.plasma the array is pickled normally
+        return self._plasma
+
+    def start_server(self):
+        if self.plasma is None or self._server is not None:
+            return  # plasma unavailable/disabled, or the server is already running
+        assert self.object_id is None
+        assert self.path is None
+        self._server_tmp = tempfile.NamedTemporaryFile()  # its name is used as the store socket path
+        self.path = self._server_tmp.name
+        self._server = subprocess.Popen([
+            'plasma_store',
+            '-m', str(int(1.05 * self.array.nbytes)),
+            '-s', self.path,
+        ])
+
+    @property
+    def client(self):
+        if self._client is None:
+            assert self.path is not None
+            self._client = self.plasma.connect(self.path)  # connect lazily and cache the client
+        return self._client
+
+    def __getstate__(self):
+        if self.plasma is None:
+            return self.__dict__  # fallback: pickle everything, including the array itself
+        if self.object_id is None:
+            self.start_server()
+            self.object_id = self.client.put(self.array)  # store the array once, on first pickle
+        state = self.__dict__.copy()
+        del state['array']  # the receiver re-fetches the array by object_id
+        state['_client'] = None
+        state['_server'] = None
+        state['_server_tmp'] = None
+        state['_plasma'] = None
+        return state
+
+    def __setstate__(self, state):
+        self.__dict__.update(state)
+        if self.plasma is None:
+            return  # state came from the fallback path and already holds the array
+        self.array = self.client.get(self.object_id)
+
+    def __del__(self):
+        if self._server is not None:
+            self._server.kill()  # only an instance that started a server has one to stop
+            self._server = None
+            self._server_tmp.close()
+            self._server_tmp = None
diff --git a/fairseq/data/prepend_dataset.py b/fairseq/data/prepend_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..ad74784d2d7920e4a6225282d95543ce16ea50d9
--- /dev/null
+++ b/fairseq/data/prepend_dataset.py
@@ -0,0 +1,28 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+import torch
+
+from . import BaseWrapperDataset
+
+
+class PrependDataset(BaseWrapperDataset):  # overwrites the first token of each item with a per-item value
+    def __init__(self, dataset, prepend_getter, ensure_first_token_is=None):
+        super().__init__(dataset)
+        self.prepend_getter = prepend_getter  # called as prepend_getter(dataset, idx); must return an int
+        self.ensure_first_token = ensure_first_token_is  # if set, the current first token must equal it
+
+    def __getitem__(self, idx):
+        item = self.dataset[idx]
+        is_tuple = isinstance(item, tuple)
+        src = item[0] if is_tuple else item  # for tuple items only the first element is modified
+
+        assert self.ensure_first_token is None or src[0] == self.ensure_first_token
+        prepend_idx = self.prepend_getter(self.dataset, idx)
+        assert isinstance(prepend_idx, int)
+        src[0] = prepend_idx  # NOTE: replaces position 0 in place; the sequence length does not change
+        item = tuple((src,) + item[1:]) if is_tuple else src
+        return item
diff --git a/fairseq/data/prepend_token_dataset.py b/fairseq/data/prepend_token_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..9dac71badf9f182ed5a1afdca99815a63717e214
--- /dev/null
+++ b/fairseq/data/prepend_token_dataset.py
@@ -0,0 +1,42 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+import torch
+
+from . import BaseWrapperDataset
+
+
+class PrependTokenDataset(BaseWrapperDataset):  # inserts `token` in front of every item (no-op when token is None)
+
+    def __init__(self, dataset, token=None):
+        super().__init__(dataset)
+        self.token = token
+        if token is not None:
+            self._sizes = np.array(dataset.sizes) + 1  # every item grows by the one prepended token
+        else:
+            self._sizes = dataset.sizes  # unchanged: nothing is prepended
+
+    def __getitem__(self, idx):
+        item = self.dataset[idx]
+        if self.token is not None:
+            item = torch.cat([item.new([self.token]), item])  # item.new keeps the item's dtype/device
+        return item
+
+    @property
+    def sizes(self):
+        return self._sizes  # computed once in __init__
+
+    def num_tokens(self, index):
+        n = self.dataset.num_tokens(index)
+        if self.token is not None:
+            n += 1  # account for the prepended token
+        return n
+
+    def size(self, index):
+        n = self.dataset.size(index)
+        if self.token is not None:
+            n += 1  # account for the prepended token
+        return n
diff --git a/fairseq/data/raw_label_dataset.py b/fairseq/data/raw_label_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..e67170f1a5e1dc5f861dd651ec126a05a09cb085
--- /dev/null
+++ b/fairseq/data/raw_label_dataset.py
@@ -0,0 +1,24 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+
+from . import FairseqDataset
+
+
+class RawLabelDataset(FairseqDataset):  # serves pre-computed labels from an in-memory sequence
+
+    def __init__(self, labels):
+        super().__init__()
+        self.labels = labels  # indexed directly by __getitem__
+
+    def __getitem__(self, index):
+        return self.labels[index]
+
+    def __len__(self):
+        return len(self.labels)
+
+    def collater(self, samples):
+        return torch.tensor(samples)  # batch of labels as a single tensor
diff --git a/fairseq/data/replace_dataset.py b/fairseq/data/replace_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..3bc52f0fb52b62ce689494973ed49acda462be87
--- /dev/null
+++ b/fairseq/data/replace_dataset.py
@@ -0,0 +1,36 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from . import BaseWrapperDataset
+
+
+class ReplaceDataset(BaseWrapperDataset):
+    """Replaces tokens found in the dataset by a specified replacement token
+
+    Args:
+        dataset (~torch.utils.data.Dataset): dataset to replace tokens in
+        replace_map(Dictionary[int,int]): map of token to replace -> replacement token
+        offsets (List[int]): do not replace tokens before (from left if pos, right if neg) this offset. should be
+            as many as the number of objects returned by the underlying dataset __getitem__ method.
+    """
+
+    def __init__(self, dataset, replace_map, offsets):
+        super().__init__(dataset)
+        assert len(replace_map) > 0
+        self.replace_map = replace_map
+        self.offsets = offsets
+
+    def __getitem__(self, index):
+        item = self.dataset[index]
+        is_tuple = isinstance(item, tuple)
+        srcs = item if is_tuple else [item]  # handle single items and tuples uniformly
+
+        for offset, src in zip(self.offsets, srcs):
+            src_off = src[offset:] if offset >= 0 else src[:offset]  # slice once per source, not per pair
+            for k, v in self.replace_map.items():
+                src_off.masked_fill_(src_off == k, v)  # in place; pairs are applied in map order
+
+        item = srcs if is_tuple else srcs[0]
+        return item
diff --git a/fairseq/data/resampling_dataset.py b/fairseq/data/resampling_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..a2c9b31d7953840c89ce5a898ba66cc12c5f4f1b
--- /dev/null
+++ b/fairseq/data/resampling_dataset.py
@@ -0,0 +1,136 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+
+import numpy as np
+
+from fairseq.data import BaseWrapperDataset, plasma_utils
+
+
+logger = logging.getLogger(__name__)
+
+
+class ResamplingDataset(BaseWrapperDataset):
+    """Randomly samples from a given dataset at each epoch.
+
+    Sampling is done with or without replacement, depending on the "replace"
+    parameter.
+
+    Optionally, the epoch size can be rescaled. This is potentially desirable
+    to increase per-epoch coverage of the base dataset (since sampling with
+    replacement means that many items in the dataset will be left out). In the
+    case of sampling without replacement, size_ratio should be strictly less
+    than 1.
+
+    Args:
+        dataset (~torch.utils.data.Dataset): dataset on which to sample.
+        weights (List[float]): list of probability weights
+            (default: None, which corresponds to uniform sampling).
+        replace (bool): sampling mode; True for "with replacement", or False
+            for "without replacement" (default: True)
+        size_ratio (float): the ratio to subsample to; must be positive
+            (default: 1.0).
+        batch_by_size (bool): whether or not to batch by sequence length
+            (default: True).
+        seed (int): RNG seed to use (default: 0).
+        epoch (int): starting epoch number (default: 1).
+    """
+
+    def __init__(
+        self,
+        dataset,
+        weights=None,
+        replace=True,
+        size_ratio=1.0,
+        batch_by_size=True,
+        seed=0,
+        epoch=1,
+    ):
+        super().__init__(dataset)
+
+        if weights is None:
+            self.weights = None
+
+        else:
+            assert len(weights) == len(dataset)
+            weights_arr = np.array(weights, dtype=np.float64)
+            weights_arr /= weights_arr.sum()  # normalize so the weights sum to 1
+            self.weights = plasma_utils.PlasmaArray(weights_arr)
+
+        self.replace = replace
+
+        assert size_ratio > 0.0
+        if not self.replace:
+            assert size_ratio < 1.0
+        self.size_ratio = float(size_ratio)
+        self.actual_size = np.ceil(len(dataset) * self.size_ratio).astype(int)  # indices drawn per epoch
+
+        self.batch_by_size = batch_by_size
+        self.seed = seed
+
+        self._cur_epoch = None
+        self._cur_indices = None
+
+        self.set_epoch(epoch)  # draws the first sample of indices
+
+    def __getitem__(self, index):
+        return self.dataset[self._cur_indices.array[index]]  # map through this epoch's sample
+
+    def __len__(self):
+        return self.actual_size
+
+    @property
+    def sizes(self):
+        if isinstance(self.dataset.sizes, list):
+            return [s[self._cur_indices.array] for s in self.dataset.sizes]  # one sizes array per list entry
+        return self.dataset.sizes[self._cur_indices.array]
+
+    def num_tokens(self, index):
+        return self.dataset.num_tokens(self._cur_indices.array[index])
+
+    def size(self, index):
+        return self.dataset.size(self._cur_indices.array[index])
+
+    def ordered_indices(self):
+        if self.batch_by_size:
+            order = [
+                np.arange(len(self)),
+                self.sizes,
+            ]  # No need to handle `self.shuffle == True`
+            return np.lexsort(order)  # last key is primary: sort by size, ties by position
+        else:
+            return np.arange(len(self))
+
+    def prefetch(self, indices):
+        self.dataset.prefetch(self._cur_indices.array[indices])
+
+    def set_epoch(self, epoch):
+        logger.debug('ResamplingDataset.set_epoch: {}'.format(epoch))
+        super().set_epoch(epoch)  # runs even when the epoch is unchanged
+
+        if epoch == self._cur_epoch:
+            return  # keep the indices already drawn for this epoch
+
+        self._cur_epoch = epoch
+
+        # Generate a weighted sample of indices as a function of the
+        # random seed and the current epoch.
+
+        rng = np.random.RandomState(
+            [
+                42,  # magic number
+                self.seed % (2 ** 32),  # global seed
+                self._cur_epoch,  # epoch index
+            ]
+        )
+        self._cur_indices = plasma_utils.PlasmaArray(
+            rng.choice(
+                len(self.dataset),
+                self.actual_size,
+                replace=self.replace,
+                p=(None if self.weights is None else self.weights.array),
+            )
+        )
diff --git a/fairseq/data/roll_dataset.py b/fairseq/data/roll_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..d07800d0f6625446b24d031a112204c962ef751c
--- /dev/null
+++ b/fairseq/data/roll_dataset.py
@@ -0,0 +1,19 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+
+from . import BaseWrapperDataset
+
+
+class RollDataset(BaseWrapperDataset):  # applies torch.roll to every wrapped item
+
+    def __init__(self, dataset, shifts):
+        super().__init__(dataset)
+        self.shifts = shifts  # passed unchanged as torch.roll's `shifts` argument
+
+    def __getitem__(self, index):
+        item = self.dataset[index]
+        return torch.roll(item, self.shifts)  # no dims given, so torch.roll's default applies
diff --git a/fairseq/data/round_robin_zip_datasets.py b/fairseq/data/round_robin_zip_datasets.py
new file mode 100644
index 0000000000000000000000000000000000000000..5bfc966ce8fb33080152c6c910969b5350610b71
--- /dev/null
+++ b/fairseq/data/round_robin_zip_datasets.py
@@ -0,0 +1,110 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from collections import OrderedDict
+
+import numpy as np
+
+from . import FairseqDataset
+
+
+class RoundRobinZipDatasets(FairseqDataset):
+    """Zip multiple :class:`~fairseq.data.FairseqDataset` instances together.
+
+    Shorter datasets are repeated in a round-robin fashion to match the length
+    of the longest one.
+
+    Args:
+        datasets (Dict[~fairseq.data.FairseqDataset]): a dictionary of
+            :class:`~fairseq.data.FairseqDataset` instances.
+        eval_key (str, optional): a key used at evaluation time that causes
+            this instance to pass-through batches from *datasets[eval_key]*.
+    """
+
+    def __init__(self, datasets, eval_key=None):
+        super().__init__()
+        assert isinstance(datasets, OrderedDict)  # a plain dict is rejected
+        self.datasets = datasets
+        self.eval_key = eval_key
+
+        self.longest_dataset = None
+        self.longest_dataset_key = None
+        for key, dataset in datasets.items():
+            assert isinstance(dataset, FairseqDataset)
+            if self.longest_dataset is None or len(dataset) > len(self.longest_dataset):
+                self.longest_dataset = dataset  # strict `>`: the first of equally long datasets wins
+                self.longest_dataset_key = key
+
+        self._ordered_indices = None  # filled lazily by ordered_indices()
+
+    def _map_index(self, key, index):
+        assert self._ordered_indices is not None, \
+            'Must call RoundRobinZipDatasets.ordered_indices() first'
+        return self._ordered_indices[key][index % len(self.datasets[key])]  # modulo wraps shorter datasets
+
+    def __getitem__(self, index):
+        if self.eval_key is None:
+            return OrderedDict([
+                (key, dataset[self._map_index(key, index)])
+                for key, dataset in self.datasets.items()
+            ])
+        else:
+            # at evaluation time it's useful to pass-through batches from a single key
+            return self.datasets[self.eval_key][self._map_index(self.eval_key, index)]
+
+    def __len__(self):
+        return len(self.longest_dataset)
+
+    def collater(self, samples):
+        """Merge a list of samples to form a mini-batch."""
+        if len(samples) == 0:
+            return None
+        if self.eval_key is None:
+            return OrderedDict([
+                (key, dataset.collater([sample[key] for sample in samples]))
+                for key, dataset in self.datasets.items()
+            ])
+        else:
+            # at evaluation time it's useful to pass-through batches from a single key
+            return self.datasets[self.eval_key].collater(samples)
+
+    def num_tokens(self, index):
+        """Return an example's length (number of tokens), used for batching."""
+        # TODO make it configurable whether to use max() or sum() here
+        return max(
+            dataset.num_tokens(self._map_index(key, index))
+            for key, dataset in self.datasets.items()
+        )
+
+    def size(self, index):
+        """Return an example's size as a float or tuple. This value is used when
+        filtering a dataset with ``--max-positions``."""
+        return {  # NOTE: this implementation returns a dict keyed by dataset name
+            key: dataset.size(self._map_index(key, index))
+            for key, dataset in self.datasets.items()
+        }
+
+    def ordered_indices(self):
+        """Ordered indices for batching."""
+        if self._ordered_indices is None:
+            # Call the underlying dataset's ordered_indices() here, so that we
+            # get the same random ordering as we would have from using the
+            # underlying dataset directly.
+            self._ordered_indices = OrderedDict([
+                (key, dataset.ordered_indices())
+                for key, dataset in self.datasets.items()
+            ])
+        return np.arange(len(self))  # per-dataset orderings are applied later in _map_index
+
+    @property
+    def supports_prefetch(self):
+        return all(  # True only if every wrapped dataset supports prefetching
+            getattr(dataset, 'supports_prefetch', False)
+            for dataset in self.datasets.values()
+        )
+
+    def prefetch(self, indices):
+        for key, dataset in self.datasets.items():
+            dataset.prefetch([self._map_index(key, index) for index in indices])
diff --git a/fairseq/data/shorten_dataset.py b/fairseq/data/shorten_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..9c84219dc7bccb2562d52a2039616236e05f782a
--- /dev/null
+++ b/fairseq/data/shorten_dataset.py
@@ -0,0 +1,74 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+from fairseq.data import data_utils
+
+from . import BaseWrapperDataset
+
+
+class TruncateDataset(BaseWrapperDataset):
+    """Truncate a sequence by returning the first truncation_length tokens
+    """
+
+    def __init__(self, dataset, truncation_length):
+        super().__init__(dataset)
+        assert truncation_length is not None
+        self.truncation_length = truncation_length
+        self.dataset = dataset  # NOTE(review): presumably already set by super().__init__ -- confirm
+
+    def __getitem__(self, index):
+        item = self.dataset[index]
+        item_len = item.size(0)
+        if item_len > self.truncation_length:
+            item = item[:self.truncation_length]  # shorter items are returned unchanged
+        return item
+
+    @property
+    def sizes(self):
+        return np.minimum(self.dataset.sizes, self.truncation_length)  # sizes capped at the truncation length
+
+    def __len__(self):
+        return len(self.dataset)
+
+
+class RandomCropDataset(TruncateDataset):
+    """Truncate a sequence by returning a random crop of truncation_length tokens
+    (the crop start is drawn under data_utils.numpy_seed(seed, epoch, index))."""
+
+    def __init__(self, dataset, truncation_length, seed=1):
+        super().__init__(dataset, truncation_length)
+        self.seed = seed
+        self.epoch = 0  # updated by set_epoch(); part of the per-item seed
+
+    def set_epoch(self, epoch, **unused):
+        super().set_epoch(epoch)
+        self.epoch = epoch
+
+    def __getitem__(self, index):
+        with data_utils.numpy_seed(self.seed, self.epoch, index):
+            item = self.dataset[index]
+            item_len = item.size(0)
+            excess = item_len - self.truncation_length  # tokens beyond the allowed length
+            if excess > 0:
+                start_idx = np.random.randint(0, excess + 1)  # high is exclusive; +1 lets a crop end at the last token
+                item = item[start_idx:start_idx+self.truncation_length]
+            return item
+
+def maybe_shorten_dataset(
+    dataset,  # dataset to wrap; returned unchanged when no shortening applies
+    split,  # name of the current split, matched against shorten_data_split_list
+    shorten_data_split_list,  # comma-separated split names; empty string means every split
+    shorten_method,  # 'truncate' or 'random_crop'; any other value leaves the dataset as is
+    tokens_per_sample,  # used as the truncation length
+    seed,  # only used by the 'random_crop' method
+):
+    truncate_split = split in shorten_data_split_list.split(',') \
+        or len(shorten_data_split_list) == 0
+    if shorten_method == 'truncate' and truncate_split:
+        dataset = TruncateDataset(dataset, tokens_per_sample)  # keep the first tokens_per_sample tokens
+    elif shorten_method == 'random_crop' and truncate_split:
+        dataset = RandomCropDataset(dataset, tokens_per_sample, seed)  # keep a random window instead
+    return dataset
diff --git a/fairseq/data/sort_dataset.py b/fairseq/data/sort_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..9b510b93a0f22d2c9ba98ce36d5a79921f456a77
--- /dev/null
+++ b/fairseq/data/sort_dataset.py
@@ -0,0 +1,22 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+
+from . import BaseWrapperDataset
+
+
+class SortDataset(BaseWrapperDataset):  # orders the wrapped dataset by one or more sort keys
+
+    def __init__(self, dataset, sort_order):
+        super().__init__(dataset)
+        if not isinstance(sort_order, (list, tuple)):
+            sort_order = [sort_order]  # a single key is wrapped into a one-element list
+        self.sort_order = sort_order
+
+        assert all(len(so) == len(dataset) for so in sort_order)  # each key has one value per item
+
+    def ordered_indices(self):
+        return np.lexsort(self.sort_order)  # np.lexsort treats the LAST key as the primary one
diff --git a/fairseq/data/strip_token_dataset.py b/fairseq/data/strip_token_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..e388db0e5fc6575a8d6fa60b7d5f546de7d4c1fd
--- /dev/null
+++ b/fairseq/data/strip_token_dataset.py
@@ -0,0 +1,21 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from . import BaseWrapperDataset
+
+
+class StripTokenDataset(BaseWrapperDataset):  # removes leading and trailing runs of one token id
+
+    def __init__(self, dataset, id_to_strip):
+        super().__init__(dataset)
+        self.id_to_strip = id_to_strip  # token id removed from both ends of each item
+
+    def __getitem__(self, index):
+        item = self.dataset[index]
+        while len(item) > 0 and item[-1] == self.id_to_strip:
+            item = item[:-1]  # drop trailing occurrences one at a time
+        while len(item) > 0 and item[0] == self.id_to_strip:
+            item = item[1:]  # then drop leading occurrences; interior ones are kept
+        return item
diff --git a/fairseq/data/subsample_dataset.py b/fairseq/data/subsample_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..e395674a55572065662095aada5eb1675edfafbb
--- /dev/null
+++ b/fairseq/data/subsample_dataset.py
@@ -0,0 +1,87 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+
+import numpy as np
+
+from . import BaseWrapperDataset
+
+
+logger = logging.getLogger(__name__)
+
+
+class SubsampleDataset(BaseWrapperDataset):
+    """Subsamples a given dataset by a specified ratio. Subsampling is done on
+    the number of examples.
+
+    Args:
+        dataset (~torch.utils.data.Dataset): dataset to subsample
+        size_ratio (float): the ratio to subsample to. must be between 0 and 1
+            (exclusive)
+        shuffle (bool, optional): randomly permute examples before sorting by
+            size in :func:`ordered_indices` (default: False).
+    """
+
+    def __init__(self, dataset, size_ratio, shuffle=False):
+        super().__init__(dataset)
+        assert size_ratio < 1
+        # ordered_indices() reads this flag, so it must exist on every instance
+        self.shuffle = shuffle
+        self.actual_size = np.ceil(len(dataset) * size_ratio).astype(int)
+        self.indices = np.random.choice(
+            len(self.dataset), self.actual_size, replace=False
+        )
+        logger.info(
+            "subsampled dataset from {} to {} (ratio={})".format(
+                len(self.dataset), self.actual_size, size_ratio
+            )
+        )
+
+    def __getitem__(self, index):
+        """Return the wrapped example stored at subsample position *index*."""
+        return self.dataset[self.indices[index]]
+
+    def __len__(self):
+        """Return the number of examples kept by the subsample."""
+        return self.actual_size
+
+    def collater(self, samples):
+        """Delegate batching to the wrapped dataset."""
+        return self.dataset.collater(samples)
+
+    @property
+    def sizes(self):
+        """Sizes of the wrapped dataset restricted to the sampled examples."""
+        return self.dataset.sizes[self.indices]
+
+    @property
+    def name(self):
+        """Name of the wrapped dataset."""
+        return self.dataset.name
+
+    def num_tokens(self, index):
+        """Return ``num_tokens`` of the wrapped example at position *index*."""
+        return self.dataset.num_tokens(self.indices[index])
+
+    def size(self, index):
+        """Return ``size`` of the wrapped example at position *index*."""
+        return self.dataset.size(self.indices[index])
+
+    def ordered_indices(self):
+        """Return an ordered list of indices. Batches will be constructed based
+        on this order."""
+        if self.shuffle:
+            order = [np.random.permutation(len(self))]
+        else:
+            order = [np.arange(len(self))]
+        # lexsort treats the last key as primary: sort by size, breaking ties
+        # by the (optionally shuffled) position
+        order.append(self.sizes)
+        return np.lexsort(order)
+
+    def prefetch(self, indices):
+        """Prefetch the wrapped examples behind subsample positions *indices*."""
+        self.dataset.prefetch(self.indices[indices])
diff --git a/fairseq/data/token_block_dataset.py b/fairseq/data/token_block_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e2f5cc482870cb85c0bc1b47d0ced5ff1b57c59
--- /dev/null
+++ b/fairseq/data/token_block_dataset.py
@@ -0,0 +1,185 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+import torch
+
+from fairseq.data import FairseqDataset, plasma_utils
+
+
+class TokenBlockDataset(FairseqDataset):
+    """Break a Dataset of tokens into blocks.
+
+    Args:
+        dataset (~torch.utils.data.Dataset): dataset to break into blocks
+        sizes (List[int]): sentence lengths (required for 'complete' and 'eos')
+        block_size (int): maximum block size (ignored in 'eos' break mode)
+        pad: value used to left-pad *past_target* when targets are returned
+        eos: value used to left-pad *source* and *past_target* when targets
+            are returned
+        break_mode (str, optional): Mode used for breaking tokens. Values can
+            be one of:
+            - 'none': break tokens into equally sized blocks (up to block_size)
+            - 'complete': break tokens into blocks (up to block_size) such that
+                blocks contains complete sentences, although block_size may be
+                exceeded if some sentences exceed block_size
+            - 'complete_doc': similar to 'complete' mode, but do not
+                cross document boundaries
+            - 'eos': each block contains one sentence (block_size is ignored)
+        include_targets (bool, optional): return next tokens as targets
+            (default: False).
+        document_sep_len (int, optional): document separator size (required for
+            'complete_doc' break mode). Typically 1 if the sentences have eos
+            and 0 otherwise.
+    """
+    def __init__(
+        self,
+        dataset,
+        sizes,
+        block_size,
+        pad,
+        eos,
+        break_mode=None,
+        include_targets=False,
+        document_sep_len=1,
+    ):
+        try:
+            from fairseq.data.token_block_utils_fast import (
+                _get_slice_indices_fast,
+                _get_block_to_dataset_index_fast,
+            )
+        except ImportError as err:
+            raise ImportError(
+                'Please build Cython components with: `pip install --editable .` '
+                'or `python setup.py build_ext --inplace`'
+            ) from err
+
+        super().__init__()
+        self.dataset = dataset
+        self.pad = pad
+        self.eos = eos
+        self.include_targets = include_targets
+
+        assert len(dataset) == len(sizes)
+        assert len(dataset) > 0
+
+        # the Cython helpers are handed an int64 numpy array of sentence sizes
+        if isinstance(sizes, list):
+            sizes = np.array(sizes, dtype=np.int64)
+        else:
+            if torch.is_tensor(sizes):
+                sizes = sizes.numpy()
+            sizes = sizes.astype(np.int64)
+
+        break_mode = break_mode if break_mode is not None else 'none'
+
+        # For "eos" break-mode, block_size is not a required parameter.
+        if break_mode == "eos" and block_size is None:
+            block_size = 0
+
+        slice_indices = _get_slice_indices_fast(sizes, break_mode, block_size, document_sep_len)
+        self._sizes = slice_indices[:, 1] - slice_indices[:, 0]
+
+        # build index mapping block indices to the underlying dataset indices
+        if break_mode == "eos":
+            # much faster version for eos break mode
+            block_to_dataset_index = np.stack(
+                [
+                    np.arange(len(sizes)),  # starting index in dataset
+                    # np.int64 rather than np.long: the latter was removed in
+                    # NumPy 1.24 and is only a platform-dependent C long in 2.x
+                    np.zeros(
+                        len(sizes), dtype=np.int64
+                    ),  # starting offset within starting index
+                    np.arange(len(sizes)),  # ending index in dataset
+                ],
+                1,
+            )
+        else:
+            block_to_dataset_index = _get_block_to_dataset_index_fast(
+                sizes,
+                slice_indices,
+            )
+        self._slice_indices = plasma_utils.PlasmaArray(slice_indices)
+        self._sizes = plasma_utils.PlasmaArray(self._sizes)
+        self._block_to_dataset_index = plasma_utils.PlasmaArray(block_to_dataset_index)
+
+    @property
+    def slice_indices(self):
+        """Array of (start, end) boundaries, one row per block."""
+        return self._slice_indices.array
+
+    @property
+    def sizes(self):
+        """Length of every block (end minus start of its slice)."""
+        return self._sizes.array
+
+    @property
+    def block_to_dataset_index(self):
+        """Array of (start dataset index, start offset, end dataset index)
+        rows, one per block."""
+        return self._block_to_dataset_index.array
+
+    def attr(self, attr: str, index: int):
+        """Return *attr* of the underlying dataset at the first dataset index
+        covered by block *index*."""
+        start_ds_idx, _, _ = self.block_to_dataset_index[index]
+        return self.dataset.attr(attr, start_ds_idx)
+
+    def __getitem__(self, index):
+        """Return block *index*, or a ``(source, item, past_target)`` tuple
+        when ``include_targets`` is set."""
+        start_ds_idx, start_offset, end_ds_idx = self.block_to_dataset_index[index]
+
+        # concatenate every underlying item the block touches, then cut the
+        # block out of that buffer
+        buffer = torch.cat(
+            [self.dataset[idx] for idx in range(start_ds_idx, end_ds_idx + 1)]
+        )
+
+        slice_s, slice_e = self.slice_indices[index]
+        length = slice_e - slice_s
+        s, e = start_offset, start_offset + length
+        item = buffer[s:e]
+
+        if self.include_targets:
+            # *target* is the original sentence (=item)
+            # *source* is shifted right by 1 (maybe left-padded with eos)
+            # *past_target* is shifted right by 2 (left-padded as needed)
+            if s == 0:
+                source = torch.cat([item.new([self.eos]), buffer[0 : e - 1]])
+                past_target = torch.cat(
+                    [item.new([self.pad, self.eos]), buffer[0 : e - 2]]
+                )
+            else:
+                source = buffer[s - 1 : e - 1]
+                if s == 1:
+                    past_target = torch.cat([item.new([self.eos]), buffer[0 : e - 2]])
+                else:
+                    past_target = buffer[s - 2 : e - 2]
+
+            return source, item, past_target
+
+        return item
+
+    def __len__(self):
+        """Number of blocks."""
+        return len(self.slice_indices)
+
+    @property
+    def supports_prefetch(self):
+        """True when the underlying dataset reports prefetch support."""
+        return getattr(self.dataset, "supports_prefetch", False)
+
+    def prefetch(self, indices):
+        """Prefetch every underlying dataset item covered by blocks *indices*."""
+        self.dataset.prefetch(
+            {
+                ds_idx
+                for index in indices
+                for start_ds_idx, _, end_ds_idx in [self.block_to_dataset_index[index]]
+                for ds_idx in range(start_ds_idx, end_ds_idx + 1)
+            }
+        )
diff --git a/fairseq/data/token_block_utils_fast.cpp b/fairseq/data/token_block_utils_fast.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a99e960fd2d0615707a9de9b2d26bad8b9e3db60
--- /dev/null
+++ b/fairseq/data/token_block_utils_fast.cpp
@@ -0,0 +1,34438 @@
+/* Generated by Cython 3.0.12 */
+
+/* BEGIN: Cython Metadata
+{
+    "distutils": {
+        "depends": [
+            "/tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/_core/include/numpy/arrayobject.h",
+            "/tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/_core/include/numpy/arrayscalars.h",
+            "/tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/_core/include/numpy/ndarrayobject.h",
+            "/tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/_core/include/numpy/ndarraytypes.h",
+            "/tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/_core/include/numpy/ufuncobject.h"
+        ],
+        "extra_compile_args": [
+            "-std=c++11",
+            "-O3"
+        ],
+        "include_dirs": [
+            "/tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/_core/include",
+            "/tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/_core/include"
+        ],
+        "language": "c++",
+        "name": "fairseq.data.token_block_utils_fast",
+        "sources": [
+            "fairseq/data/token_block_utils_fast.pyx"
+        ]
+    },
+    "module_name": "fairseq.data.token_block_utils_fast"
+}
+END: Cython Metadata */
+
+#ifndef PY_SSIZE_T_CLEAN
+#define PY_SSIZE_T_CLEAN
+#endif /* PY_SSIZE_T_CLEAN */
+#if defined(CYTHON_LIMITED_API) && 0
+  #ifndef Py_LIMITED_API
+    #if CYTHON_LIMITED_API+0 > 0x03030000
+      #define Py_LIMITED_API CYTHON_LIMITED_API
+    #else
+      #define Py_LIMITED_API 0x03030000
+    #endif
+  #endif
+#endif
+
+#include "Python.h"
+#ifndef Py_PYTHON_H
+    #error Python headers needed to compile C extensions, please install development version of Python.
+#elif PY_VERSION_HEX < 0x02070000 || (0x03000000 <= PY_VERSION_HEX && PY_VERSION_HEX < 0x03030000)
+    #error Cython requires Python 2.7+ or Python 3.3+.
+#else
+#if defined(CYTHON_LIMITED_API) && CYTHON_LIMITED_API
+#define __PYX_EXTRA_ABI_MODULE_NAME "limited"
+#else
+#define __PYX_EXTRA_ABI_MODULE_NAME ""
+#endif
+#define CYTHON_ABI "3_0_12" __PYX_EXTRA_ABI_MODULE_NAME
+#define __PYX_ABI_MODULE_NAME "_cython_" CYTHON_ABI
+#define __PYX_TYPE_MODULE_PREFIX __PYX_ABI_MODULE_NAME "."
+#define CYTHON_HEX_VERSION 0x03000CF0
+#define CYTHON_FUTURE_DIVISION 1
+#include <stddef.h>
+#ifndef offsetof
+  #define offsetof(type, member) ( (size_t) & ((type*)0) -> member )
+#endif
+#if !defined(_WIN32) && !defined(WIN32) && !defined(MS_WINDOWS)
+  #ifndef __stdcall
+    #define __stdcall
+  #endif
+  #ifndef __cdecl
+    #define __cdecl
+  #endif
+  #ifndef __fastcall
+    #define __fastcall
+  #endif
+#endif
+#ifndef DL_IMPORT
+  #define DL_IMPORT(t) t
+#endif
+#ifndef DL_EXPORT
+  #define DL_EXPORT(t) t
+#endif
+#define __PYX_COMMA ,
+#ifndef HAVE_LONG_LONG
+  #define HAVE_LONG_LONG
+#endif
+#ifndef PY_LONG_LONG
+  #define PY_LONG_LONG LONG_LONG
+#endif
+#ifndef Py_HUGE_VAL
+  #define Py_HUGE_VAL HUGE_VAL
+#endif
+#define __PYX_LIMITED_VERSION_HEX PY_VERSION_HEX
+#if defined(GRAALVM_PYTHON)
+  /* For very preliminary testing purposes. Most variables are set the same as PyPy.
+     The existence of this section does not imply that anything works or is even tested */
+  #define CYTHON_COMPILING_IN_PYPY 0
+  #define CYTHON_COMPILING_IN_CPYTHON 0
+  #define CYTHON_COMPILING_IN_LIMITED_API 0
+  #define CYTHON_COMPILING_IN_GRAAL 1
+  #define CYTHON_COMPILING_IN_NOGIL 0
+  #undef CYTHON_USE_TYPE_SLOTS
+  #define CYTHON_USE_TYPE_SLOTS 0
+  #undef CYTHON_USE_TYPE_SPECS
+  #define CYTHON_USE_TYPE_SPECS 0
+  #undef CYTHON_USE_PYTYPE_LOOKUP
+  #define CYTHON_USE_PYTYPE_LOOKUP 0
+  #if PY_VERSION_HEX < 0x03050000
+    #undef CYTHON_USE_ASYNC_SLOTS
+    #define CYTHON_USE_ASYNC_SLOTS 0
+  #elif !defined(CYTHON_USE_ASYNC_SLOTS)
+    #define CYTHON_USE_ASYNC_SLOTS 1
+  #endif
+  #undef CYTHON_USE_PYLIST_INTERNALS
+  #define CYTHON_USE_PYLIST_INTERNALS 0
+  #undef CYTHON_USE_UNICODE_INTERNALS
+  #define CYTHON_USE_UNICODE_INTERNALS 0
+  #undef CYTHON_USE_UNICODE_WRITER
+  #define CYTHON_USE_UNICODE_WRITER 0
+  #undef CYTHON_USE_PYLONG_INTERNALS
+  #define CYTHON_USE_PYLONG_INTERNALS 0
+  #undef CYTHON_AVOID_BORROWED_REFS
+  #define CYTHON_AVOID_BORROWED_REFS 1
+  #undef CYTHON_ASSUME_SAFE_MACROS
+  #define CYTHON_ASSUME_SAFE_MACROS 0
+  #undef CYTHON_UNPACK_METHODS
+  #define CYTHON_UNPACK_METHODS 0
+  #undef CYTHON_FAST_THREAD_STATE
+  #define CYTHON_FAST_THREAD_STATE 0
+  #undef CYTHON_FAST_GIL
+  #define CYTHON_FAST_GIL 0
+  #undef CYTHON_METH_FASTCALL
+  #define CYTHON_METH_FASTCALL 0
+  #undef CYTHON_FAST_PYCALL
+  #define CYTHON_FAST_PYCALL 0
+  #ifndef CYTHON_PEP487_INIT_SUBCLASS
+    #define CYTHON_PEP487_INIT_SUBCLASS (PY_MAJOR_VERSION >= 3)
+  #endif
+  #undef CYTHON_PEP489_MULTI_PHASE_INIT
+  #define CYTHON_PEP489_MULTI_PHASE_INIT 1
+  #undef CYTHON_USE_MODULE_STATE
+  #define CYTHON_USE_MODULE_STATE 0
+  #undef CYTHON_USE_TP_FINALIZE
+  #define CYTHON_USE_TP_FINALIZE 0
+  #undef CYTHON_USE_DICT_VERSIONS
+  #define CYTHON_USE_DICT_VERSIONS 0
+  #undef CYTHON_USE_EXC_INFO_STACK
+  #define CYTHON_USE_EXC_INFO_STACK 0
+  #ifndef CYTHON_UPDATE_DESCRIPTOR_DOC
+    #define CYTHON_UPDATE_DESCRIPTOR_DOC 0
+  #endif
+  #undef CYTHON_USE_FREELISTS
+  #define CYTHON_USE_FREELISTS 0
+#elif defined(PYPY_VERSION)
+  #define CYTHON_COMPILING_IN_PYPY 1
+  #define CYTHON_COMPILING_IN_CPYTHON 0
+  #define CYTHON_COMPILING_IN_LIMITED_API 0
+  #define CYTHON_COMPILING_IN_GRAAL 0
+  #define CYTHON_COMPILING_IN_NOGIL 0
+  #undef CYTHON_USE_TYPE_SLOTS
+  #define CYTHON_USE_TYPE_SLOTS 0
+  #ifndef CYTHON_USE_TYPE_SPECS
+    #define CYTHON_USE_TYPE_SPECS 0
+  #endif
+  #undef CYTHON_USE_PYTYPE_LOOKUP
+  #define CYTHON_USE_PYTYPE_LOOKUP 0
+  #if PY_VERSION_HEX < 0x03050000
+    #undef CYTHON_USE_ASYNC_SLOTS
+    #define CYTHON_USE_ASYNC_SLOTS 0
+  #elif !defined(CYTHON_USE_ASYNC_SLOTS)
+    #define CYTHON_USE_ASYNC_SLOTS 1
+  #endif
+  #undef CYTHON_USE_PYLIST_INTERNALS
+  #define CYTHON_USE_PYLIST_INTERNALS 0
+  #undef CYTHON_USE_UNICODE_INTERNALS
+  #define CYTHON_USE_UNICODE_INTERNALS 0
+  #undef CYTHON_USE_UNICODE_WRITER
+  #define CYTHON_USE_UNICODE_WRITER 0
+  #undef CYTHON_USE_PYLONG_INTERNALS
+  #define CYTHON_USE_PYLONG_INTERNALS 0
+  #undef CYTHON_AVOID_BORROWED_REFS
+  #define CYTHON_AVOID_BORROWED_REFS 1
+  #undef CYTHON_ASSUME_SAFE_MACROS
+  #define CYTHON_ASSUME_SAFE_MACROS 0
+  #undef CYTHON_UNPACK_METHODS
+  #define CYTHON_UNPACK_METHODS 0
+  #undef CYTHON_FAST_THREAD_STATE
+  #define CYTHON_FAST_THREAD_STATE 0
+  #undef CYTHON_FAST_GIL
+  #define CYTHON_FAST_GIL 0
+  #undef CYTHON_METH_FASTCALL
+  #define CYTHON_METH_FASTCALL 0
+  #undef CYTHON_FAST_PYCALL
+  #define CYTHON_FAST_PYCALL 0
+  #ifndef CYTHON_PEP487_INIT_SUBCLASS
+    #define CYTHON_PEP487_INIT_SUBCLASS (PY_MAJOR_VERSION >= 3)
+  #endif
+  #if PY_VERSION_HEX < 0x03090000
+    #undef CYTHON_PEP489_MULTI_PHASE_INIT
+    #define CYTHON_PEP489_MULTI_PHASE_INIT 0
+  #elif !defined(CYTHON_PEP489_MULTI_PHASE_INIT)
+    #define CYTHON_PEP489_MULTI_PHASE_INIT 1
+  #endif
+  #undef CYTHON_USE_MODULE_STATE
+  #define CYTHON_USE_MODULE_STATE 0
+  #undef CYTHON_USE_TP_FINALIZE
+  #define CYTHON_USE_TP_FINALIZE (PY_VERSION_HEX >= 0x030400a1 && PYPY_VERSION_NUM >= 0x07030C00)
+  #undef CYTHON_USE_DICT_VERSIONS
+  #define CYTHON_USE_DICT_VERSIONS 0
+  #undef CYTHON_USE_EXC_INFO_STACK
+  #define CYTHON_USE_EXC_INFO_STACK 0
+  #ifndef CYTHON_UPDATE_DESCRIPTOR_DOC
+    #define CYTHON_UPDATE_DESCRIPTOR_DOC 0
+  #endif
+  #undef CYTHON_USE_FREELISTS
+  #define CYTHON_USE_FREELISTS 0
+#elif defined(CYTHON_LIMITED_API)
+  #ifdef Py_LIMITED_API
+    #undef __PYX_LIMITED_VERSION_HEX
+    #define __PYX_LIMITED_VERSION_HEX Py_LIMITED_API
+  #endif
+  #define CYTHON_COMPILING_IN_PYPY 0
+  #define CYTHON_COMPILING_IN_CPYTHON 0
+  #define CYTHON_COMPILING_IN_LIMITED_API 1
+  #define CYTHON_COMPILING_IN_GRAAL 0
+  #define CYTHON_COMPILING_IN_NOGIL 0
+  #undef CYTHON_CLINE_IN_TRACEBACK
+  #define CYTHON_CLINE_IN_TRACEBACK 0
+  #undef CYTHON_USE_TYPE_SLOTS
+  #define CYTHON_USE_TYPE_SLOTS 0
+  #undef CYTHON_USE_TYPE_SPECS
+  #define CYTHON_USE_TYPE_SPECS 1
+  #undef CYTHON_USE_PYTYPE_LOOKUP
+  #define CYTHON_USE_PYTYPE_LOOKUP 0
+  #undef CYTHON_USE_ASYNC_SLOTS
+  #define CYTHON_USE_ASYNC_SLOTS 0
+  #undef CYTHON_USE_PYLIST_INTERNALS
+  #define CYTHON_USE_PYLIST_INTERNALS 0
+  #undef CYTHON_USE_UNICODE_INTERNALS
+  #define CYTHON_USE_UNICODE_INTERNALS 0
+  #ifndef CYTHON_USE_UNICODE_WRITER
+    #define CYTHON_USE_UNICODE_WRITER 0
+  #endif
+  #undef CYTHON_USE_PYLONG_INTERNALS
+  #define CYTHON_USE_PYLONG_INTERNALS 0
+  #ifndef CYTHON_AVOID_BORROWED_REFS
+    #define CYTHON_AVOID_BORROWED_REFS 0
+  #endif
+  #undef CYTHON_ASSUME_SAFE_MACROS
+  #define CYTHON_ASSUME_SAFE_MACROS 0
+  #undef CYTHON_UNPACK_METHODS
+  #define CYTHON_UNPACK_METHODS 0
+  #undef CYTHON_FAST_THREAD_STATE
+  #define CYTHON_FAST_THREAD_STATE 0
+  #undef CYTHON_FAST_GIL
+  #define CYTHON_FAST_GIL 0
+  #undef CYTHON_METH_FASTCALL
+  #define CYTHON_METH_FASTCALL 0
+  #undef CYTHON_FAST_PYCALL
+  #define CYTHON_FAST_PYCALL 0
+  #ifndef CYTHON_PEP487_INIT_SUBCLASS
+    #define CYTHON_PEP487_INIT_SUBCLASS 1
+  #endif
+  #undef CYTHON_PEP489_MULTI_PHASE_INIT
+  #define CYTHON_PEP489_MULTI_PHASE_INIT 0
+  #undef CYTHON_USE_MODULE_STATE
+  #define CYTHON_USE_MODULE_STATE 1
+  #ifndef CYTHON_USE_TP_FINALIZE
+    #define CYTHON_USE_TP_FINALIZE 0
+  #endif
+  #undef CYTHON_USE_DICT_VERSIONS
+  #define CYTHON_USE_DICT_VERSIONS 0
+  #undef CYTHON_USE_EXC_INFO_STACK
+  #define CYTHON_USE_EXC_INFO_STACK 0
+  #ifndef CYTHON_UPDATE_DESCRIPTOR_DOC
+    #define CYTHON_UPDATE_DESCRIPTOR_DOC 0
+  #endif
+  #undef CYTHON_USE_FREELISTS
+  #define CYTHON_USE_FREELISTS 0
+#elif defined(Py_GIL_DISABLED) || defined(Py_NOGIL)
+  #define CYTHON_COMPILING_IN_PYPY 0
+  #define CYTHON_COMPILING_IN_CPYTHON 0
+  #define CYTHON_COMPILING_IN_LIMITED_API 0
+  #define CYTHON_COMPILING_IN_GRAAL 0
+  #define CYTHON_COMPILING_IN_NOGIL 1
+  #ifndef CYTHON_USE_TYPE_SLOTS
+    #define CYTHON_USE_TYPE_SLOTS 1
+  #endif
+  #ifndef CYTHON_USE_TYPE_SPECS
+    #define CYTHON_USE_TYPE_SPECS 0
+  #endif
+  #undef CYTHON_USE_PYTYPE_LOOKUP
+  #define CYTHON_USE_PYTYPE_LOOKUP 0
+  #ifndef CYTHON_USE_ASYNC_SLOTS
+    #define CYTHON_USE_ASYNC_SLOTS 1
+  #endif
+  #ifndef CYTHON_USE_PYLONG_INTERNALS
+    #define CYTHON_USE_PYLONG_INTERNALS 0
+  #endif
+  #undef CYTHON_USE_PYLIST_INTERNALS
+  #define CYTHON_USE_PYLIST_INTERNALS 0
+  #ifndef CYTHON_USE_UNICODE_INTERNALS
+    #define CYTHON_USE_UNICODE_INTERNALS 1
+  #endif
+  #undef CYTHON_USE_UNICODE_WRITER
+  #define CYTHON_USE_UNICODE_WRITER 0
+  #ifndef CYTHON_AVOID_BORROWED_REFS
+    #define CYTHON_AVOID_BORROWED_REFS 0
+  #endif
+  #ifndef CYTHON_ASSUME_SAFE_MACROS
+    #define CYTHON_ASSUME_SAFE_MACROS 1
+  #endif
+  #ifndef CYTHON_UNPACK_METHODS
+    #define CYTHON_UNPACK_METHODS 1
+  #endif
+  #undef CYTHON_FAST_THREAD_STATE
+  #define CYTHON_FAST_THREAD_STATE 0
+  #undef CYTHON_FAST_GIL
+  #define CYTHON_FAST_GIL 0
+  #ifndef CYTHON_METH_FASTCALL
+    #define CYTHON_METH_FASTCALL 1
+  #endif
+  #undef CYTHON_FAST_PYCALL
+  #define CYTHON_FAST_PYCALL 0
+  #ifndef CYTHON_PEP487_INIT_SUBCLASS
+    #define CYTHON_PEP487_INIT_SUBCLASS 1
+  #endif
+  #ifndef CYTHON_PEP489_MULTI_PHASE_INIT
+    #define CYTHON_PEP489_MULTI_PHASE_INIT 1
+  #endif
+  #ifndef CYTHON_USE_MODULE_STATE
+    #define CYTHON_USE_MODULE_STATE 0
+  #endif
+  #ifndef CYTHON_USE_TP_FINALIZE
+    #define CYTHON_USE_TP_FINALIZE 1
+  #endif
+  #undef CYTHON_USE_DICT_VERSIONS
+  #define CYTHON_USE_DICT_VERSIONS 0
+  #undef CYTHON_USE_EXC_INFO_STACK
+  #define CYTHON_USE_EXC_INFO_STACK 0
+  #ifndef CYTHON_UPDATE_DESCRIPTOR_DOC
+    #define CYTHON_UPDATE_DESCRIPTOR_DOC 1
+  #endif
+  #ifndef CYTHON_USE_FREELISTS
+    #define CYTHON_USE_FREELISTS 0
+  #endif
+#else
+  #define CYTHON_COMPILING_IN_PYPY 0
+  #define CYTHON_COMPILING_IN_CPYTHON 1
+  #define CYTHON_COMPILING_IN_LIMITED_API 0
+  #define CYTHON_COMPILING_IN_GRAAL 0
+  #define CYTHON_COMPILING_IN_NOGIL 0
+  #ifndef CYTHON_USE_TYPE_SLOTS
+    #define CYTHON_USE_TYPE_SLOTS 1
+  #endif
+  #ifndef CYTHON_USE_TYPE_SPECS
+    #define CYTHON_USE_TYPE_SPECS 0
+  #endif
+  #ifndef CYTHON_USE_PYTYPE_LOOKUP
+    #define CYTHON_USE_PYTYPE_LOOKUP 1
+  #endif
+  #if PY_MAJOR_VERSION < 3
+    #undef CYTHON_USE_ASYNC_SLOTS
+    #define CYTHON_USE_ASYNC_SLOTS 0
+  #elif !defined(CYTHON_USE_ASYNC_SLOTS)
+    #define CYTHON_USE_ASYNC_SLOTS 1
+  #endif
+  #ifndef CYTHON_USE_PYLONG_INTERNALS
+    #define CYTHON_USE_PYLONG_INTERNALS 1
+  #endif
+  #ifndef CYTHON_USE_PYLIST_INTERNALS
+    #define CYTHON_USE_PYLIST_INTERNALS 1
+  #endif
+  #ifndef CYTHON_USE_UNICODE_INTERNALS
+    #define CYTHON_USE_UNICODE_INTERNALS 1
+  #endif
+  #if PY_VERSION_HEX < 0x030300F0 || PY_VERSION_HEX >= 0x030B00A2
+    #undef CYTHON_USE_UNICODE_WRITER
+    #define CYTHON_USE_UNICODE_WRITER 0
+  #elif !defined(CYTHON_USE_UNICODE_WRITER)
+    #define CYTHON_USE_UNICODE_WRITER 1
+  #endif
+  #ifndef CYTHON_AVOID_BORROWED_REFS
+    #define CYTHON_AVOID_BORROWED_REFS 0
+  #endif
+  #ifndef CYTHON_ASSUME_SAFE_MACROS
+    #define CYTHON_ASSUME_SAFE_MACROS 1
+  #endif
+  #ifndef CYTHON_UNPACK_METHODS
+    #define CYTHON_UNPACK_METHODS 1
+  #endif
+  #ifndef CYTHON_FAST_THREAD_STATE
+    #define CYTHON_FAST_THREAD_STATE 1
+  #endif
+  #ifndef CYTHON_FAST_GIL
+    #define CYTHON_FAST_GIL (PY_MAJOR_VERSION < 3 || PY_VERSION_HEX >= 0x03060000 && PY_VERSION_HEX < 0x030C00A6)
+  #endif
+  #ifndef CYTHON_METH_FASTCALL
+    #define CYTHON_METH_FASTCALL (PY_VERSION_HEX >= 0x030700A1)
+  #endif
+  #ifndef CYTHON_FAST_PYCALL
+    #define CYTHON_FAST_PYCALL 1
+  #endif
+  #ifndef CYTHON_PEP487_INIT_SUBCLASS
+    #define CYTHON_PEP487_INIT_SUBCLASS 1
+  #endif
+  #if PY_VERSION_HEX < 0x03050000
+    #undef CYTHON_PEP489_MULTI_PHASE_INIT
+    #define CYTHON_PEP489_MULTI_PHASE_INIT 0
+  #elif !defined(CYTHON_PEP489_MULTI_PHASE_INIT)
+    #define CYTHON_PEP489_MULTI_PHASE_INIT 1
+  #endif
+  #ifndef CYTHON_USE_MODULE_STATE
+    #define CYTHON_USE_MODULE_STATE 0
+  #endif
+  #if PY_VERSION_HEX < 0x030400a1
+    #undef CYTHON_USE_TP_FINALIZE
+    #define CYTHON_USE_TP_FINALIZE 0
+  #elif !defined(CYTHON_USE_TP_FINALIZE)
+    #define CYTHON_USE_TP_FINALIZE 1
+  #endif
+  #if PY_VERSION_HEX < 0x030600B1
+    #undef CYTHON_USE_DICT_VERSIONS
+    #define CYTHON_USE_DICT_VERSIONS 0
+  #elif !defined(CYTHON_USE_DICT_VERSIONS)
+    #define CYTHON_USE_DICT_VERSIONS  (PY_VERSION_HEX < 0x030C00A5)
+  #endif
+  #if PY_VERSION_HEX < 0x030700A3
+    #undef CYTHON_USE_EXC_INFO_STACK
+    #define CYTHON_USE_EXC_INFO_STACK 0
+  #elif !defined(CYTHON_USE_EXC_INFO_STACK)
+    #define CYTHON_USE_EXC_INFO_STACK 1
+  #endif
+  #ifndef CYTHON_UPDATE_DESCRIPTOR_DOC
+    #define CYTHON_UPDATE_DESCRIPTOR_DOC 1
+  #endif
+  #ifndef CYTHON_USE_FREELISTS
+    #define CYTHON_USE_FREELISTS 1
+  #endif
+#endif
+#if !defined(CYTHON_FAST_PYCCALL)
+#define CYTHON_FAST_PYCCALL  (CYTHON_FAST_PYCALL && PY_VERSION_HEX >= 0x030600B1)
+#endif
+#if !defined(CYTHON_VECTORCALL)
+#define CYTHON_VECTORCALL  (CYTHON_FAST_PYCCALL && PY_VERSION_HEX >= 0x030800B1)
+#endif
+#define CYTHON_BACKPORT_VECTORCALL (CYTHON_METH_FASTCALL && PY_VERSION_HEX < 0x030800B1)
+#if CYTHON_USE_PYLONG_INTERNALS
+  #if PY_MAJOR_VERSION < 3
+    #include "longintrepr.h"
+  #endif
+  #undef SHIFT
+  #undef BASE
+  #undef MASK
+  #ifdef SIZEOF_VOID_P
+    enum { __pyx_check_sizeof_voidp = 1 / (int)(SIZEOF_VOID_P == sizeof(void*)) };
+  #endif
+#endif
+#ifndef __has_attribute
+  #define __has_attribute(x) 0
+#endif
+#ifndef __has_cpp_attribute
+  #define __has_cpp_attribute(x) 0
+#endif
+#ifndef CYTHON_RESTRICT
+  #if defined(__GNUC__)
+    #define CYTHON_RESTRICT __restrict__
+  #elif defined(_MSC_VER) && _MSC_VER >= 1400
+    #define CYTHON_RESTRICT __restrict
+  #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+    #define CYTHON_RESTRICT restrict
+  #else
+    #define CYTHON_RESTRICT
+  #endif
+#endif
+#ifndef CYTHON_UNUSED
+  #if defined(__cplusplus)
+    /* for clang __has_cpp_attribute(maybe_unused) is true even before C++17
+     * but leads to warnings with -pedantic, since it is a C++17 feature */
+    #if ((defined(_MSVC_LANG) && _MSVC_LANG >= 201703L) || __cplusplus >= 201703L)
+      #if __has_cpp_attribute(maybe_unused)
+        #define CYTHON_UNUSED [[maybe_unused]]
+      #endif
+    #endif
+  #endif
+#endif
+#ifndef CYTHON_UNUSED
+# if defined(__GNUC__)
+#   if !(defined(__cplusplus)) || (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
+#     define CYTHON_UNUSED __attribute__ ((__unused__))
+#   else
+#     define CYTHON_UNUSED
+#   endif
+# elif defined(__ICC) || (defined(__INTEL_COMPILER) && !defined(_MSC_VER))
+#   define CYTHON_UNUSED __attribute__ ((__unused__))
+# else
+#   define CYTHON_UNUSED
+# endif
+#endif
+#ifndef CYTHON_UNUSED_VAR
+#  if defined(__cplusplus)
+     template<class T> void CYTHON_UNUSED_VAR( const T& ) { }  /* no-op that accepts any value by const reference */
+#  else
+#    define CYTHON_UNUSED_VAR(x) (void)(x)
+#  endif
+#endif
+#ifndef CYTHON_MAYBE_UNUSED_VAR
+  #define CYTHON_MAYBE_UNUSED_VAR(x) CYTHON_UNUSED_VAR(x)
+#endif
+#ifndef CYTHON_NCP_UNUSED
+# if CYTHON_COMPILING_IN_CPYTHON
+#  define CYTHON_NCP_UNUSED
+# else
+#  define CYTHON_NCP_UNUSED CYTHON_UNUSED
+# endif
+#endif
+#ifndef CYTHON_USE_CPP_STD_MOVE
+  #if defined(__cplusplus) && (\
+    __cplusplus >= 201103L || (defined(_MSC_VER) && _MSC_VER >= 1600))
+    #define CYTHON_USE_CPP_STD_MOVE 1
+  #else
+    #define CYTHON_USE_CPP_STD_MOVE 0
+  #endif
+#endif
+#define __Pyx_void_to_None(void_result) ((void)(void_result), Py_INCREF(Py_None), Py_None)
+#ifdef _MSC_VER
+    #ifndef _MSC_STDINT_H_
+        #if _MSC_VER < 1300
+            typedef unsigned char     uint8_t;
+            typedef unsigned short    uint16_t;
+            typedef unsigned int      uint32_t;
+        #else
+            typedef unsigned __int8   uint8_t;
+            typedef unsigned __int16  uint16_t;
+            typedef unsigned __int32  uint32_t;
+        #endif
+    #endif
+    #if _MSC_VER < 1300
+        #ifdef _WIN64
+            typedef unsigned long long  __pyx_uintptr_t;
+        #else
+            typedef unsigned int        __pyx_uintptr_t;
+        #endif
+    #else
+        #ifdef _WIN64
+            typedef unsigned __int64    __pyx_uintptr_t;
+        #else
+            typedef unsigned __int32    __pyx_uintptr_t;
+        #endif
+    #endif
+#else
+    #include <stdint.h>
+    typedef uintptr_t  __pyx_uintptr_t;
+#endif
+#ifndef CYTHON_FALLTHROUGH
+  #if defined(__cplusplus)
+    /* for clang __has_cpp_attribute(fallthrough) is true even before C++17
+     * but leads to warnings with -pedantic, since it is a C++17 feature */
+    #if ((defined(_MSVC_LANG) && _MSVC_LANG >= 201703L) || __cplusplus >= 201703L)
+      #if __has_cpp_attribute(fallthrough)
+        #define CYTHON_FALLTHROUGH [[fallthrough]]
+      #endif
+    #endif
+    #ifndef CYTHON_FALLTHROUGH
+      #if __has_cpp_attribute(clang::fallthrough)
+        #define CYTHON_FALLTHROUGH [[clang::fallthrough]]
+      #elif __has_cpp_attribute(gnu::fallthrough)
+        #define CYTHON_FALLTHROUGH [[gnu::fallthrough]]
+      #endif
+    #endif
+  #endif
+  #ifndef CYTHON_FALLTHROUGH
+    #if __has_attribute(fallthrough)
+      #define CYTHON_FALLTHROUGH __attribute__((fallthrough))
+    #else
+      #define CYTHON_FALLTHROUGH
+    #endif
+  #endif
+  #if defined(__clang__) && defined(__apple_build_version__)
+    #if __apple_build_version__ < 7000000
+      #undef  CYTHON_FALLTHROUGH
+      #define CYTHON_FALLTHROUGH
+    #endif
+  #endif
+#endif
+#ifdef __cplusplus
+  template <typename T>
+  struct __PYX_IS_UNSIGNED_IMPL {static const bool value = T(0) < T(-1);};  /* value is true when T(-1) compares greater than T(0) */
+  #define __PYX_IS_UNSIGNED(type) (__PYX_IS_UNSIGNED_IMPL<type>::value)
+#else
+  #define __PYX_IS_UNSIGNED(type) (((type)-1) > 0)
+#endif
+#if CYTHON_COMPILING_IN_PYPY == 1
+  #define __PYX_NEED_TP_PRINT_SLOT  (PY_VERSION_HEX >= 0x030800b4 && PY_VERSION_HEX < 0x030A0000)
+#else
+  #define __PYX_NEED_TP_PRINT_SLOT  (PY_VERSION_HEX >= 0x030800b4 && PY_VERSION_HEX < 0x03090000)
+#endif
+#define __PYX_REINTERPRET_FUNCION(func_pointer, other_pointer) ((func_pointer)(void(*)(void))(other_pointer))
+
+#ifndef __cplusplus
+  #error "Cython files generated with the C++ option must be compiled with a C++ compiler."
+#endif
+#ifndef CYTHON_INLINE
+  #if defined(__clang__)
+    #define CYTHON_INLINE __inline__ __attribute__ ((__unused__))
+  #else
+    #define CYTHON_INLINE inline
+  #endif
+#endif
+template<typename T>
+void __Pyx_call_destructor(T& x) {  /* explicitly invokes T's destructor on x */
+    x.~T();
+}
+template<typename T>
+class __Pyx_FakeReference {  /* pointer-backed stand-in for a T& that, unlike a real reference, can be default-constructed */
+  public:
+    __Pyx_FakeReference() : ptr(NULL) { }  /* starts unbound (ptr is NULL) */
+    __Pyx_FakeReference(const T& ref) : ptr(const_cast<T*>(&ref)) { }  /* stores the referent's address, casting away const */
+    T *operator->() { return ptr; }  /* member access forwards to the referent */
+    T *operator&() { return ptr; }  /* address-of yields the referent's address, not the wrapper's */
+    operator T&() { return *ptr; }  /* implicit conversion back to a real reference */
+    template<typename U> bool operator ==(const U& other) const { return *ptr == other; }  /* compares the referent with a plain value */
+    template<typename U> bool operator !=(const U& other) const { return *ptr != other; }
+    template<typename U> bool operator==(const __Pyx_FakeReference<U>& other) const { return *ptr == *other.ptr; }  /* compares the two referents */
+    template<typename U> bool operator!=(const __Pyx_FakeReference<U>& other) const { return *ptr != *other.ptr; }
+  private:
+    T *ptr;  /* address of the wrapped object; NULL until bound */
+};
+
+#define __PYX_BUILD_PY_SSIZE_T "n"
+#define CYTHON_FORMAT_SSIZE_T "z"
+#if PY_MAJOR_VERSION < 3
+  #define __Pyx_BUILTIN_MODULE_NAME "__builtin__"
+  #define __Pyx_DefaultClassType PyClass_Type
+  #define __Pyx_PyCode_New(a, p, k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos)\
+          PyCode_New(a+k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos)
+#else
+  #define __Pyx_BUILTIN_MODULE_NAME "builtins"
+  #define __Pyx_DefaultClassType PyType_Type
+#if CYTHON_COMPILING_IN_LIMITED_API
+    static CYTHON_INLINE PyObject* __Pyx_PyCode_New(int a, int p, int k, int l, int s, int f,  /* limited-API variant: builds a code object by calling types.CodeType */
+                                                    PyObject *code, PyObject *c, PyObject* n, PyObject *v,
+                                                    PyObject *fv, PyObject *cell, PyObject* fn,
+                                                    PyObject *name, int fline, PyObject *lnos) {
+        PyObject *exception_table = NULL;
+        PyObject *types_module=NULL, *code_type=NULL, *result=NULL;  /* result stays NULL on any failure path */
+        #if __PYX_LIMITED_VERSION_HEX < 0x030B0000
+        PyObject *version_info;
+        PyObject *py_minor_version = NULL;
+        #endif
+        long minor_version = 0;
+        PyObject *type, *value, *traceback;
+        PyErr_Fetch(&type, &value, &traceback);  /* stash any pending exception before making further API calls */
+        #if __PYX_LIMITED_VERSION_HEX >= 0x030B0000
+        minor_version = 11;  /* build targets 3.11+, so the newest CodeType call below is selected */
+        #else
+        if (!(version_info = PySys_GetObject("version_info"))) goto end;  /* otherwise read the minor version from sys.version_info[1] at run time */
+        if (!(py_minor_version = PySequence_GetItem(version_info, 1))) goto end;
+        minor_version = PyLong_AsLong(py_minor_version);
+        Py_DECREF(py_minor_version);
+        if (minor_version == -1 && PyErr_Occurred()) goto end;
+        #endif
+        if (!(types_module = PyImport_ImportModule("types"))) goto end;
+        if (!(code_type = PyObject_GetAttrString(types_module, "CodeType"))) goto end;
+        if (minor_version <= 7) {  /* minor version 7 and older: p is not passed to CodeType */
+            (void)p;
+            result = PyObject_CallFunction(code_type, "iiiiiOOOOOOiOO", a, k, l, s, f, code,
+                          c, n, v, fn, name, fline, lnos, fv, cell);
+        } else if (minor_version <= 10) {  /* minor versions 8-10: p is passed right after a */
+            result = PyObject_CallFunction(code_type, "iiiiiiOOOOOOiOO", a,p, k, l, s, f, code,
+                          c, n, v, fn, name, fline, lnos, fv, cell);
+        } else {  /* newer: name is passed twice and an empty bytes object serves as the exception table */
+            if (!(exception_table = PyBytes_FromStringAndSize(NULL, 0))) goto end;
+            result = PyObject_CallFunction(code_type, "iiiiiiOOOOOOOiOO", a,p, k, l, s, f, code,
+                          c, n, v, fn, name, name, fline, lnos, exception_table, fv, cell);
+        }
+    end:
+        Py_XDECREF(code_type);  /* XDECREF because any of these may still be NULL when jumping here */
+        Py_XDECREF(exception_table);
+        Py_XDECREF(types_module);
+        if (type) {  /* only when an exception was pending at entry */
+            PyErr_Restore(type, value, traceback);  /* reinstate the stashed exception */
+        }
+        return result;
+    }
+    #ifndef CO_OPTIMIZED
+    #define CO_OPTIMIZED 0x0001
+    #endif
+    #ifndef CO_NEWLOCALS
+    #define CO_NEWLOCALS 0x0002
+    #endif
+    #ifndef CO_VARARGS
+    #define CO_VARARGS 0x0004
+    #endif
+    #ifndef CO_VARKEYWORDS
+    #define CO_VARKEYWORDS 0x0008
+    #endif
+    #ifndef CO_ASYNC_GENERATOR
+    #define CO_ASYNC_GENERATOR 0x0200
+    #endif
+    #ifndef CO_GENERATOR
+    #define CO_GENERATOR 0x0020
+    #endif
+    #ifndef CO_COROUTINE
+    #define CO_COROUTINE 0x0080
+    #endif
+#elif PY_VERSION_HEX >= 0x030B0000
+  static CYTHON_INLINE PyCodeObject* __Pyx_PyCode_New(int a, int p, int k, int l, int s, int f,  /* Py >= 3.11, non-Limited-API: thin wrapper over the C code-object constructor; returns NULL on failure */
+                                                    PyObject *code, PyObject *c, PyObject* n, PyObject *v,
+                                                    PyObject *fv, PyObject *cell, PyObject* fn,
+                                                    PyObject *name, int fline, PyObject *lnos) {
+    PyCodeObject *result;
+    PyObject *empty_bytes = PyBytes_FromStringAndSize("", 0);  /* passed as the final constructor argument (presumably the exception table) */
+    if (!empty_bytes) return NULL;
+    result =
+      #if PY_VERSION_HEX >= 0x030C0000
+        PyUnstable_Code_NewWithPosOnlyArgs
+      #else
+        PyCode_NewWithPosOnlyArgs
+      #endif
+        (a, p, k, l, s, f, code, c, n, v, fv, cell, fn, name, name, fline, lnos, empty_bytes);  /* 'name' is passed twice (presumably as name and qualname) */
+    Py_DECREF(empty_bytes);  /* drop the local reference whether or not construction succeeded */
+    return result;
+  }
+#elif PY_VERSION_HEX >= 0x030800B2 && !CYTHON_COMPILING_IN_PYPY
+  #define __Pyx_PyCode_New(a, p, k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos)\
+          PyCode_NewWithPosOnlyArgs(a, p, k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos)
+#else
+  #define __Pyx_PyCode_New(a, p, k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos)\
+          PyCode_New(a, k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos)
+#endif
+#endif
+#if PY_VERSION_HEX >= 0x030900A4 || defined(Py_IS_TYPE)
+  #define __Pyx_IS_TYPE(ob, type) Py_IS_TYPE(ob, type)
+#else
+  #define __Pyx_IS_TYPE(ob, type) (((const PyObject*)ob)->ob_type == (type))
+#endif
+#if PY_VERSION_HEX >= 0x030A00B1 || defined(Py_Is)
+  #define __Pyx_Py_Is(x, y)  Py_Is(x, y)
+#else
+  #define __Pyx_Py_Is(x, y) ((x) == (y))
+#endif
+#if PY_VERSION_HEX >= 0x030A00B1 || defined(Py_IsNone)
+  #define __Pyx_Py_IsNone(ob) Py_IsNone(ob)
+#else
+  #define __Pyx_Py_IsNone(ob) __Pyx_Py_Is((ob), Py_None)
+#endif
+#if PY_VERSION_HEX >= 0x030A00B1 || defined(Py_IsTrue)
+  #define __Pyx_Py_IsTrue(ob) Py_IsTrue(ob)
+#else
+  #define __Pyx_Py_IsTrue(ob) __Pyx_Py_Is((ob), Py_True)
+#endif
+#if PY_VERSION_HEX >= 0x030A00B1 || defined(Py_IsFalse)
+  #define __Pyx_Py_IsFalse(ob) Py_IsFalse(ob)
+#else
+  #define __Pyx_Py_IsFalse(ob) __Pyx_Py_Is((ob), Py_False)
+#endif
+#define __Pyx_NoneAsNull(obj)  (__Pyx_Py_IsNone(obj) ? NULL : (obj))
+#if PY_VERSION_HEX >= 0x030900F0 && !CYTHON_COMPILING_IN_PYPY
+  #define __Pyx_PyObject_GC_IsFinalized(o) PyObject_GC_IsFinalized(o)
+#else
+  #define __Pyx_PyObject_GC_IsFinalized(o) _PyGC_FINALIZED(o)
+#endif
+#ifndef CO_COROUTINE
+  #define CO_COROUTINE 0x80
+#endif
+#ifndef CO_ASYNC_GENERATOR
+  #define CO_ASYNC_GENERATOR 0x200
+#endif
+#ifndef Py_TPFLAGS_CHECKTYPES
+  #define Py_TPFLAGS_CHECKTYPES 0
+#endif
+#ifndef Py_TPFLAGS_HAVE_INDEX
+  #define Py_TPFLAGS_HAVE_INDEX 0
+#endif
+#ifndef Py_TPFLAGS_HAVE_NEWBUFFER
+  #define Py_TPFLAGS_HAVE_NEWBUFFER 0
+#endif
+#ifndef Py_TPFLAGS_HAVE_FINALIZE
+  #define Py_TPFLAGS_HAVE_FINALIZE 0
+#endif
+#ifndef Py_TPFLAGS_SEQUENCE
+  #define Py_TPFLAGS_SEQUENCE 0
+#endif
+#ifndef Py_TPFLAGS_MAPPING
+  #define Py_TPFLAGS_MAPPING 0
+#endif
+#ifndef METH_STACKLESS
+  #define METH_STACKLESS 0
+#endif
+#if PY_VERSION_HEX <= 0x030700A3 || !defined(METH_FASTCALL)
+  #ifndef METH_FASTCALL
+     #define METH_FASTCALL 0x80
+  #endif
+  typedef PyObject *(*__Pyx_PyCFunctionFast) (PyObject *self, PyObject *const *args, Py_ssize_t nargs);
+  typedef PyObject *(*__Pyx_PyCFunctionFastWithKeywords) (PyObject *self, PyObject *const *args,
+                                                          Py_ssize_t nargs, PyObject *kwnames);
+#else
+  #if PY_VERSION_HEX >= 0x030d00A4
+  #  define __Pyx_PyCFunctionFast PyCFunctionFast
+  #  define __Pyx_PyCFunctionFastWithKeywords PyCFunctionFastWithKeywords
+  #else
+  #  define __Pyx_PyCFunctionFast _PyCFunctionFast
+  #  define __Pyx_PyCFunctionFastWithKeywords _PyCFunctionFastWithKeywords
+  #endif
+#endif
+#if CYTHON_METH_FASTCALL
+  #define __Pyx_METH_FASTCALL METH_FASTCALL
+  #define __Pyx_PyCFunction_FastCall __Pyx_PyCFunctionFast
+  #define __Pyx_PyCFunction_FastCallWithKeywords __Pyx_PyCFunctionFastWithKeywords
+#else
+  #define __Pyx_METH_FASTCALL METH_VARARGS
+  #define __Pyx_PyCFunction_FastCall PyCFunction
+  #define __Pyx_PyCFunction_FastCallWithKeywords PyCFunctionWithKeywords
+#endif
+#if CYTHON_VECTORCALL
+  #define __pyx_vectorcallfunc vectorcallfunc
+  #define __Pyx_PY_VECTORCALL_ARGUMENTS_OFFSET  PY_VECTORCALL_ARGUMENTS_OFFSET
+  #define __Pyx_PyVectorcall_NARGS(n)  PyVectorcall_NARGS((size_t)(n))
+#elif CYTHON_BACKPORT_VECTORCALL
+  typedef PyObject *(*__pyx_vectorcallfunc)(PyObject *callable, PyObject *const *args,
+                                            size_t nargsf, PyObject *kwnames);
+  #define __Pyx_PY_VECTORCALL_ARGUMENTS_OFFSET  ((size_t)1 << (8 * sizeof(size_t) - 1))
+  #define __Pyx_PyVectorcall_NARGS(n)  ((Py_ssize_t)(((size_t)(n)) & ~__Pyx_PY_VECTORCALL_ARGUMENTS_OFFSET))
+#else
+  #define __Pyx_PY_VECTORCALL_ARGUMENTS_OFFSET  0
+  #define __Pyx_PyVectorcall_NARGS(n)  ((Py_ssize_t)(n))
+#endif
+#if PY_VERSION_HEX >= 0x030900B1  /* compare the full hex version; PY_MAJOR_VERSION can never reach 0x030900B1 */
+#define __Pyx_PyCFunction_CheckExact(func)  PyCFunction_CheckExact(func)
+#else
+#define __Pyx_PyCFunction_CheckExact(func)  PyCFunction_Check(func)
+#endif
+#define __Pyx_CyOrPyCFunction_Check(func)  PyCFunction_Check(func)
+#if CYTHON_COMPILING_IN_CPYTHON
+#define __Pyx_CyOrPyCFunction_GET_FUNCTION(func)  (((PyCFunctionObject*)(func))->m_ml->ml_meth)
+#elif !CYTHON_COMPILING_IN_LIMITED_API
+#define __Pyx_CyOrPyCFunction_GET_FUNCTION(func)  PyCFunction_GET_FUNCTION(func)
+#endif
+#if CYTHON_COMPILING_IN_CPYTHON
+#define __Pyx_CyOrPyCFunction_GET_FLAGS(func)  (((PyCFunctionObject*)(func))->m_ml->ml_flags)
+static CYTHON_INLINE PyObject* __Pyx_CyOrPyCFunction_GET_SELF(PyObject *func) {  /* m_self of a PyCFunctionObject, or NULL when its ml_flags include METH_STATIC */
+    return (__Pyx_CyOrPyCFunction_GET_FLAGS(func) & METH_STATIC) ? NULL : ((PyCFunctionObject*)func)->m_self;  /* func is cast to PyCFunctionObject* without a type check */
+}
+#endif
+static CYTHON_INLINE int __Pyx__IsSameCFunction(PyObject *func, void *cfunc) {  /* nonzero iff func passes PyCFunction_Check and its C function pointer equals cfunc */
+#if CYTHON_COMPILING_IN_LIMITED_API
+    return PyCFunction_Check(func) && PyCFunction_GetFunction(func) == (PyCFunction) cfunc;  /* Limited API build uses the function-call accessor */
+#else
+    return PyCFunction_Check(func) && PyCFunction_GET_FUNCTION(func) == (PyCFunction) cfunc;
+#endif
+}
+#define __Pyx_IsSameCFunction(func, cfunc)   __Pyx__IsSameCFunction(func, cfunc)
+#if __PYX_LIMITED_VERSION_HEX < 0x030900B1
+  #define __Pyx_PyType_FromModuleAndSpec(m, s, b)  ((void)m, PyType_FromSpecWithBases(s, b))
+  typedef PyObject *(*__Pyx_PyCMethod)(PyObject *, PyTypeObject *, PyObject *const *, size_t, PyObject *);
+#else
+  #define __Pyx_PyType_FromModuleAndSpec(m, s, b)  PyType_FromModuleAndSpec(m, s, b)
+  #define __Pyx_PyCMethod  PyCMethod
+#endif
+#ifndef METH_METHOD
+  #define METH_METHOD 0x200
+#endif
+#if CYTHON_COMPILING_IN_PYPY && !defined(PyObject_Malloc)  /* PyPy without PyObject_* allocators: map them onto PyMem_* */
+  #define PyObject_Malloc(s)   PyMem_Malloc(s)
+  #define PyObject_Free(p)     PyMem_Free(p)
+  #define PyObject_Realloc(p, s)  PyMem_Realloc(p, s)  /* realloc takes the pointer and the new size */
+#endif
+#if CYTHON_COMPILING_IN_LIMITED_API
+  #define __Pyx_PyCode_HasFreeVars(co)  (PyCode_GetNumFree(co) > 0)
+  #define __Pyx_PyFrame_SetLineNumber(frame, lineno)
+#else
+  #define __Pyx_PyCode_HasFreeVars(co)  (PyCode_GetNumFree(co) > 0)
+  #define __Pyx_PyFrame_SetLineNumber(frame, lineno)  (frame)->f_lineno = (lineno)
+#endif
+#if CYTHON_COMPILING_IN_LIMITED_API
+  #define __Pyx_PyThreadState_Current PyThreadState_Get()
+#elif !CYTHON_FAST_THREAD_STATE
+  #define __Pyx_PyThreadState_Current PyThreadState_GET()
+#elif PY_VERSION_HEX >= 0x030d00A1
+  #define __Pyx_PyThreadState_Current PyThreadState_GetUnchecked()
+#elif PY_VERSION_HEX >= 0x03060000
+  #define __Pyx_PyThreadState_Current _PyThreadState_UncheckedGet()
+#elif PY_VERSION_HEX >= 0x03000000
+  #define __Pyx_PyThreadState_Current PyThreadState_GET()
+#else
+  #define __Pyx_PyThreadState_Current _PyThreadState_Current
+#endif
+#if CYTHON_COMPILING_IN_LIMITED_API
+static CYTHON_INLINE void *__Pyx_PyModule_GetState(PyObject *op)  /* Limited API only: PyModule_GetState() wrapper that treats a NULL state as fatal */
+{
+    void *result;
+    result = PyModule_GetState(op);
+    if (!result)
+        Py_FatalError("Couldn't find the module state");  /* no error return path: a missing state calls Py_FatalError */
+    return result;
+}
+#endif
+#define __Pyx_PyObject_GetSlot(obj, name, func_ctype)  __Pyx_PyType_GetSlot(Py_TYPE(obj), name, func_ctype)
+#if CYTHON_COMPILING_IN_LIMITED_API
+  #define __Pyx_PyType_GetSlot(type, name, func_ctype)  ((func_ctype) PyType_GetSlot((type), Py_##name))
+#else
+  #define __Pyx_PyType_GetSlot(type, name, func_ctype)  ((type)->name)
+#endif
+#if PY_VERSION_HEX < 0x030700A2 && !defined(PyThread_tss_create) && !defined(Py_tss_NEEDS_INIT)
+#include "pythread.h"
+#define Py_tss_NEEDS_INIT 0
+typedef int Py_tss_t;
+static CYTHON_INLINE int PyThread_tss_create(Py_tss_t *key) {  /* Py < 3.7 shim: implements the tss API on top of PyThread_create_key() */
+  *key = PyThread_create_key();  /* result is stored without being checked */
+  return 0;  /* always reports success */
+}
+static CYTHON_INLINE Py_tss_t * PyThread_tss_alloc(void) {  /* Py < 3.7 shim: heap-allocate a key set to Py_tss_NEEDS_INIT; returns NULL if allocation fails */
+  Py_tss_t *key = (Py_tss_t *)PyObject_Malloc(sizeof(Py_tss_t));
+  if (key) *key = Py_tss_NEEDS_INIT;  /* PyObject_Malloc may return NULL: never dereference it */
+  return key;
+}
+static CYTHON_INLINE void PyThread_tss_free(Py_tss_t *key) {  /* releases the key storage with PyObject_Free() */
+  PyObject_Free(key);
+}
+static CYTHON_INLINE int PyThread_tss_is_created(Py_tss_t *key) {  /* nonzero once *key differs from Py_tss_NEEDS_INIT */
+  return *key != Py_tss_NEEDS_INIT;
+}
+static CYTHON_INLINE void PyThread_tss_delete(Py_tss_t *key) {  /* deletes the underlying key and resets *key to the "needs init" value */
+  PyThread_delete_key(*key);
+  *key = Py_tss_NEEDS_INIT;
+}
+static CYTHON_INLINE int PyThread_tss_set(Py_tss_t *key, void *value) {  /* forwards to PyThread_set_key_value() and returns its result */
+  return PyThread_set_key_value(*key, value);
+}
+static CYTHON_INLINE void * PyThread_tss_get(Py_tss_t *key) {  /* forwards to PyThread_get_key_value() and returns its result */
+  return PyThread_get_key_value(*key);
+}
+#endif
+#if PY_MAJOR_VERSION < 3
+    #if CYTHON_COMPILING_IN_PYPY
+        #if PYPY_VERSION_NUM < 0x07030600
+            #if defined(__cplusplus) && __cplusplus >= 201402L
+                [[deprecated("`with nogil:` inside a nogil function will not release the GIL in PyPy2 < 7.3.6")]]
+            #elif defined(__GNUC__) || defined(__clang__)
+                __attribute__ ((__deprecated__("`with nogil:` inside a nogil function will not release the GIL in PyPy2 < 7.3.6")))
+            #elif defined(_MSC_VER)
+                __declspec(deprecated("`with nogil:` inside a nogil function will not release the GIL in PyPy2 < 7.3.6"))
+            #endif
+            static CYTHON_INLINE int PyGILState_Check(void) {  /* PyPy2 < 7.3.6 stub, marked deprecated by the attribute above */
+                return 0;  /* unconditionally 0 */
+            }
+        #else  // PYPY_VERSION_NUM < 0x07030600
+        #endif  // PYPY_VERSION_NUM < 0x07030600
+    #else
+        static CYTHON_INLINE int PyGILState_Check(void) {  /* Python 2, non-PyPy replacement for PyGILState_Check() */
+            PyThreadState * tstate = _PyThreadState_Current;
+            return tstate && (tstate == PyGILState_GetThisThreadState());  /* nonzero iff the current thread state is non-NULL and is this thread's own state */
+        }
+    #endif
+#endif
+#if CYTHON_COMPILING_IN_CPYTHON && PY_VERSION_HEX < 0x030d0000 || defined(_PyDict_NewPresized)
+#define __Pyx_PyDict_NewPresized(n)  (((n) <= 8) ? PyDict_New() : _PyDict_NewPresized(n))  /* n is parenthesized so an expression argument is compared as a whole */
+#else
+#define __Pyx_PyDict_NewPresized(n)  PyDict_New()
+#endif
+#if PY_MAJOR_VERSION >= 3 || CYTHON_FUTURE_DIVISION
+  #define __Pyx_PyNumber_Divide(x,y)         PyNumber_TrueDivide(x,y)
+  #define __Pyx_PyNumber_InPlaceDivide(x,y)  PyNumber_InPlaceTrueDivide(x,y)
+#else
+  #define __Pyx_PyNumber_Divide(x,y)         PyNumber_Divide(x,y)
+  #define __Pyx_PyNumber_InPlaceDivide(x,y)  PyNumber_InPlaceDivide(x,y)
+#endif
+#if CYTHON_COMPILING_IN_CPYTHON && PY_VERSION_HEX > 0x030600B4 && PY_VERSION_HEX < 0x030d0000 && CYTHON_USE_UNICODE_INTERNALS
+#define __Pyx_PyDict_GetItemStrWithError(dict, name)  _PyDict_GetItem_KnownHash(dict, name, ((PyASCIIObject *) name)->hash)
+static CYTHON_INLINE PyObject * __Pyx_PyDict_GetItemStr(PyObject *dict, PyObject *name) {  /* like __Pyx_PyDict_GetItemStrWithError, but a NULL result never leaves an exception set */
+    PyObject *res = __Pyx_PyDict_GetItemStrWithError(dict, name);
+    if (res == NULL) PyErr_Clear();  /* a NULL result always clears the error indicator, so callers cannot tell a miss from a failure */
+    return res;
+}
+#elif PY_MAJOR_VERSION >= 3 && (!CYTHON_COMPILING_IN_PYPY || PYPY_VERSION_NUM >= 0x07020000)
+#define __Pyx_PyDict_GetItemStrWithError  PyDict_GetItemWithError
+#define __Pyx_PyDict_GetItemStr           PyDict_GetItem
+#else
+static CYTHON_INLINE PyObject * __Pyx_PyDict_GetItemStrWithError(PyObject *dict, PyObject *name) {  /* fallback dict lookup by string key; returns the stored value or NULL */
+#if CYTHON_COMPILING_IN_PYPY
+    return PyDict_GetItem(dict, name);
+#else
+    PyDictEntry *ep;
+    PyDictObject *mp = (PyDictObject*) dict;
+    long hash = ((PyStringObject *) name)->ob_shash;  /* reads the hash field stored on the string object */
+    assert(hash != -1);  /* presumably -1 marks a hash that has not been computed yet */
+    ep = (mp->ma_lookup)(mp, name, hash);  /* calls the dict's ma_lookup slot directly */
+    if (ep == NULL) {
+        return NULL;
+    }
+    return ep->me_value;
+#endif
+}
+#define __Pyx_PyDict_GetItemStr           PyDict_GetItem
+#endif
+#if CYTHON_USE_TYPE_SLOTS
+  #define __Pyx_PyType_GetFlags(tp)   (((PyTypeObject *)tp)->tp_flags)
+  #define __Pyx_PyType_HasFeature(type, feature)  ((__Pyx_PyType_GetFlags(type) & (feature)) != 0)
+  #define __Pyx_PyObject_GetIterNextFunc(obj)  (Py_TYPE(obj)->tp_iternext)
+#else
+  #define __Pyx_PyType_GetFlags(tp)   (PyType_GetFlags((PyTypeObject *)tp))
+  #define __Pyx_PyType_HasFeature(type, feature)  PyType_HasFeature(type, feature)
+  #define __Pyx_PyObject_GetIterNextFunc(obj)  PyIter_Next
+#endif
+#if CYTHON_COMPILING_IN_LIMITED_API
+  #define __Pyx_SetItemOnTypeDict(tp, k, v) PyObject_GenericSetAttr((PyObject*)tp, k, v)
+#else
+  #define __Pyx_SetItemOnTypeDict(tp, k, v) PyDict_SetItem(tp->tp_dict, k, v)
+#endif
+#if CYTHON_USE_TYPE_SPECS && PY_VERSION_HEX >= 0x03080000
+#define __Pyx_PyHeapTypeObject_GC_Del(obj)  {\
+    PyTypeObject *type = Py_TYPE((PyObject*)obj);\
+    assert(__Pyx_PyType_HasFeature(type, Py_TPFLAGS_HEAPTYPE));\
+    PyObject_GC_Del(obj);\
+    Py_DECREF(type);\
+}
+#else
+#define __Pyx_PyHeapTypeObject_GC_Del(obj)  PyObject_GC_Del(obj)
+#endif
+#if CYTHON_COMPILING_IN_LIMITED_API
+  #define CYTHON_PEP393_ENABLED 1
+  #define __Pyx_PyUnicode_READY(op)       (0)
+  #define __Pyx_PyUnicode_GET_LENGTH(u)   PyUnicode_GetLength(u)
+  #define __Pyx_PyUnicode_READ_CHAR(u, i) PyUnicode_ReadChar(u, i)
+  #define __Pyx_PyUnicode_MAX_CHAR_VALUE(u)   ((void)u, 1114111U)
+  #define __Pyx_PyUnicode_KIND(u)         ((void)u, (0))
+  #define __Pyx_PyUnicode_DATA(u)         ((void*)u)
+  #define __Pyx_PyUnicode_READ(k, d, i)   ((void)k, PyUnicode_ReadChar((PyObject*)(d), i))
+  #define __Pyx_PyUnicode_IS_TRUE(u)      (0 != PyUnicode_GetLength(u))
+#elif PY_VERSION_HEX > 0x03030000 && defined(PyUnicode_KIND)
+  #define CYTHON_PEP393_ENABLED 1
+  #if PY_VERSION_HEX >= 0x030C0000
+    #define __Pyx_PyUnicode_READY(op)       (0)
+  #else
+    #define __Pyx_PyUnicode_READY(op)       (likely(PyUnicode_IS_READY(op)) ?\
+                                                0 : _PyUnicode_Ready((PyObject *)(op)))
+  #endif
+  #define __Pyx_PyUnicode_GET_LENGTH(u)   PyUnicode_GET_LENGTH(u)
+  #define __Pyx_PyUnicode_READ_CHAR(u, i) PyUnicode_READ_CHAR(u, i)
+  #define __Pyx_PyUnicode_MAX_CHAR_VALUE(u)   PyUnicode_MAX_CHAR_VALUE(u)
+  #define __Pyx_PyUnicode_KIND(u)         ((int)PyUnicode_KIND(u))
+  #define __Pyx_PyUnicode_DATA(u)         PyUnicode_DATA(u)
+  #define __Pyx_PyUnicode_READ(k, d, i)   PyUnicode_READ(k, d, i)
+  #define __Pyx_PyUnicode_WRITE(k, d, i, ch)  PyUnicode_WRITE(k, d, i, (Py_UCS4) ch)
+  #if PY_VERSION_HEX >= 0x030C0000
+    #define __Pyx_PyUnicode_IS_TRUE(u)      (0 != PyUnicode_GET_LENGTH(u))
+  #else
+    #if CYTHON_COMPILING_IN_CPYTHON && PY_VERSION_HEX >= 0x03090000
+    #define __Pyx_PyUnicode_IS_TRUE(u)      (0 != (likely(PyUnicode_IS_READY(u)) ? PyUnicode_GET_LENGTH(u) : ((PyCompactUnicodeObject *)(u))->wstr_length))
+    #else
+    #define __Pyx_PyUnicode_IS_TRUE(u)      (0 != (likely(PyUnicode_IS_READY(u)) ? PyUnicode_GET_LENGTH(u) : PyUnicode_GET_SIZE(u)))
+    #endif
+  #endif
+#else
+  #define CYTHON_PEP393_ENABLED 0
+  #define PyUnicode_1BYTE_KIND  1
+  #define PyUnicode_2BYTE_KIND  2
+  #define PyUnicode_4BYTE_KIND  4
+  #define __Pyx_PyUnicode_READY(op)       (0)
+  #define __Pyx_PyUnicode_GET_LENGTH(u)   PyUnicode_GET_SIZE(u)
+  #define __Pyx_PyUnicode_READ_CHAR(u, i) ((Py_UCS4)(PyUnicode_AS_UNICODE(u)[i]))
+  #define __Pyx_PyUnicode_MAX_CHAR_VALUE(u)   ((sizeof(Py_UNICODE) == 2) ? 65535U : 1114111U)
+  #define __Pyx_PyUnicode_KIND(u)         ((int)sizeof(Py_UNICODE))
+  #define __Pyx_PyUnicode_DATA(u)         ((void*)PyUnicode_AS_UNICODE(u))
+  #define __Pyx_PyUnicode_READ(k, d, i)   ((void)(k), (Py_UCS4)(((Py_UNICODE*)d)[i]))
+  #define __Pyx_PyUnicode_WRITE(k, d, i, ch)  (((void)(k)), ((Py_UNICODE*)d)[i] = (Py_UNICODE) ch)
+  #define __Pyx_PyUnicode_IS_TRUE(u)      (0 != PyUnicode_GET_SIZE(u))
+#endif
+#if CYTHON_COMPILING_IN_PYPY
+  #define __Pyx_PyUnicode_Concat(a, b)      PyNumber_Add(a, b)
+  #define __Pyx_PyUnicode_ConcatSafe(a, b)  PyNumber_Add(a, b)
+#else
+  #define __Pyx_PyUnicode_Concat(a, b)      PyUnicode_Concat(a, b)
+  #define __Pyx_PyUnicode_ConcatSafe(a, b)  ((unlikely((a) == Py_None) || unlikely((b) == Py_None)) ?\
+      PyNumber_Add(a, b) : __Pyx_PyUnicode_Concat(a, b))
+#endif
+#if CYTHON_COMPILING_IN_PYPY
+  #if !defined(PyUnicode_DecodeUnicodeEscape)
+    #define PyUnicode_DecodeUnicodeEscape(s, size, errors)  PyUnicode_Decode(s, size, "unicode_escape", errors)
+  #endif
+  #if !defined(PyUnicode_Contains) || (PY_MAJOR_VERSION == 2 && PYPY_VERSION_NUM < 0x07030500)
+    #undef PyUnicode_Contains
+    #define PyUnicode_Contains(u, s)  PySequence_Contains(u, s)
+  #endif
+  #if !defined(PyByteArray_Check)
+    #define PyByteArray_Check(obj)  PyObject_TypeCheck(obj, &PyByteArray_Type)
+  #endif
+  #if !defined(PyObject_Format)
+    #define PyObject_Format(obj, fmt)  PyObject_CallMethod(obj, "__format__", "O", fmt)
+  #endif
+#endif
+#define __Pyx_PyString_FormatSafe(a, b)   ((unlikely((a) == Py_None || (PyString_Check(b) && !PyString_CheckExact(b)))) ? PyNumber_Remainder(a, b) : __Pyx_PyString_Format(a, b))
+#define __Pyx_PyUnicode_FormatSafe(a, b)  ((unlikely((a) == Py_None || (PyUnicode_Check(b) && !PyUnicode_CheckExact(b)))) ? PyNumber_Remainder(a, b) : PyUnicode_Format(a, b))
+#if PY_MAJOR_VERSION >= 3
+  #define __Pyx_PyString_Format(a, b)  PyUnicode_Format(a, b)
+#else
+  #define __Pyx_PyString_Format(a, b)  PyString_Format(a, b)
+#endif
+#if PY_MAJOR_VERSION < 3 && !defined(PyObject_ASCII)
+  #define PyObject_ASCII(o)            PyObject_Repr(o)
+#endif
+#if PY_MAJOR_VERSION >= 3
+  #define PyBaseString_Type            PyUnicode_Type
+  #define PyStringObject               PyUnicodeObject
+  #define PyString_Type                PyUnicode_Type
+  #define PyString_Check               PyUnicode_Check
+  #define PyString_CheckExact          PyUnicode_CheckExact
+#ifndef PyObject_Unicode
+  #define PyObject_Unicode             PyObject_Str
+#endif
+#endif
+#if PY_MAJOR_VERSION >= 3
+  #define __Pyx_PyBaseString_Check(obj) PyUnicode_Check(obj)
+  #define __Pyx_PyBaseString_CheckExact(obj) PyUnicode_CheckExact(obj)
+#else
+  #define __Pyx_PyBaseString_Check(obj) (PyString_Check(obj) || PyUnicode_Check(obj))
+  #define __Pyx_PyBaseString_CheckExact(obj) (PyString_CheckExact(obj) || PyUnicode_CheckExact(obj))
+#endif
+#if CYTHON_COMPILING_IN_CPYTHON
+  #define __Pyx_PySequence_ListKeepNew(obj)\
+    (likely(PyList_CheckExact(obj) && Py_REFCNT(obj) == 1) ? __Pyx_NewRef(obj) : PySequence_List(obj))
+#else
+  #define __Pyx_PySequence_ListKeepNew(obj)  PySequence_List(obj)
+#endif
+#ifndef PySet_CheckExact
+  #define PySet_CheckExact(obj)        __Pyx_IS_TYPE(obj, &PySet_Type)
+#endif
+#if PY_VERSION_HEX >= 0x030900A4
+  #define __Pyx_SET_REFCNT(obj, refcnt) Py_SET_REFCNT(obj, refcnt)
+  #define __Pyx_SET_SIZE(obj, size) Py_SET_SIZE(obj, size)
+#else
+  #define __Pyx_SET_REFCNT(obj, refcnt) Py_REFCNT(obj) = (refcnt)
+  #define __Pyx_SET_SIZE(obj, size) Py_SIZE(obj) = (size)
+#endif
+#if CYTHON_ASSUME_SAFE_MACROS
+  #define __Pyx_PySequence_ITEM(o, i) PySequence_ITEM(o, i)
+  #define __Pyx_PySequence_SIZE(seq)  Py_SIZE(seq)
+  #define __Pyx_PyTuple_SET_ITEM(o, i, v) (PyTuple_SET_ITEM(o, i, v), (0))
+  #define __Pyx_PyList_SET_ITEM(o, i, v) (PyList_SET_ITEM(o, i, v), (0))
+  #define __Pyx_PyTuple_GET_SIZE(o) PyTuple_GET_SIZE(o)
+  #define __Pyx_PyList_GET_SIZE(o) PyList_GET_SIZE(o)
+  #define __Pyx_PySet_GET_SIZE(o) PySet_GET_SIZE(o)
+  #define __Pyx_PyBytes_GET_SIZE(o) PyBytes_GET_SIZE(o)
+  #define __Pyx_PyByteArray_GET_SIZE(o) PyByteArray_GET_SIZE(o)
+#else
+  #define __Pyx_PySequence_ITEM(o, i) PySequence_GetItem(o, i)
+  #define __Pyx_PySequence_SIZE(seq)  PySequence_Size(seq)
+  #define __Pyx_PyTuple_SET_ITEM(o, i, v) PyTuple_SetItem(o, i, v)
+  #define __Pyx_PyList_SET_ITEM(o, i, v) PyList_SetItem(o, i, v)
+  #define __Pyx_PyTuple_GET_SIZE(o) PyTuple_Size(o)
+  #define __Pyx_PyList_GET_SIZE(o) PyList_Size(o)
+  #define __Pyx_PySet_GET_SIZE(o) PySet_Size(o)
+  #define __Pyx_PyBytes_GET_SIZE(o) PyBytes_Size(o)
+  #define __Pyx_PyByteArray_GET_SIZE(o) PyByteArray_Size(o)
+#endif
+#if __PYX_LIMITED_VERSION_HEX >= 0x030d00A1
+  #define __Pyx_PyImport_AddModuleRef(name) PyImport_AddModuleRef(name)
+#else
+  static CYTHON_INLINE PyObject *__Pyx_PyImport_AddModuleRef(const char *name) {  /* fallback used when PyImport_AddModuleRef is not selected by the version check above */
+      PyObject *module = PyImport_AddModule(name);
+      Py_XINCREF(module);  /* take a reference on the result; NULL-safe, so a failed lookup passes through as NULL */
+      return module;
+  }
+#endif
+#if PY_MAJOR_VERSION >= 3
+  #define PyIntObject                  PyLongObject
+  #define PyInt_Type                   PyLong_Type
+  #define PyInt_Check(op)              PyLong_Check(op)
+  #define PyInt_CheckExact(op)         PyLong_CheckExact(op)
+  #define __Pyx_Py3Int_Check(op)       PyLong_Check(op)
+  #define __Pyx_Py3Int_CheckExact(op)  PyLong_CheckExact(op)
+  #define PyInt_FromString             PyLong_FromString
+  #define PyInt_FromUnicode            PyLong_FromUnicode
+  #define PyInt_FromLong               PyLong_FromLong
+  #define PyInt_FromSize_t             PyLong_FromSize_t
+  #define PyInt_FromSsize_t            PyLong_FromSsize_t
+  #define PyInt_AsLong                 PyLong_AsLong
+  #define PyInt_AS_LONG                PyLong_AS_LONG
+  #define PyInt_AsSsize_t              PyLong_AsSsize_t
+  #define PyInt_AsUnsignedLongMask     PyLong_AsUnsignedLongMask
+  #define PyInt_AsUnsignedLongLongMask PyLong_AsUnsignedLongLongMask
+  #define PyNumber_Int                 PyNumber_Long
+#else
+  #define __Pyx_Py3Int_Check(op)       (PyLong_Check(op) || PyInt_Check(op))
+  #define __Pyx_Py3Int_CheckExact(op)  (PyLong_CheckExact(op) || PyInt_CheckExact(op))
+#endif
+#if PY_MAJOR_VERSION >= 3
+  #define PyBoolObject                 PyLongObject
+#endif
+#if PY_MAJOR_VERSION >= 3 && CYTHON_COMPILING_IN_PYPY
+  #ifndef PyUnicode_InternFromString
+    #define PyUnicode_InternFromString(s) PyUnicode_FromString(s)
+  #endif
+#endif
+#if PY_VERSION_HEX < 0x030200A4
+  typedef long Py_hash_t;
+  #define __Pyx_PyInt_FromHash_t PyInt_FromLong
+  #define __Pyx_PyInt_AsHash_t   __Pyx_PyIndex_AsHash_t
+#else
+  #define __Pyx_PyInt_FromHash_t PyInt_FromSsize_t
+  #define __Pyx_PyInt_AsHash_t   __Pyx_PyIndex_AsSsize_t
+#endif
+#if CYTHON_USE_ASYNC_SLOTS
+  #if PY_VERSION_HEX >= 0x030500B1
+    #define __Pyx_PyAsyncMethodsStruct PyAsyncMethods
+    #define __Pyx_PyType_AsAsync(obj) (Py_TYPE(obj)->tp_as_async)
+  #else
+    #define __Pyx_PyType_AsAsync(obj) ((__Pyx_PyAsyncMethodsStruct*) (Py_TYPE(obj)->tp_reserved))
+  #endif
+#else
+  #define __Pyx_PyType_AsAsync(obj) NULL
+#endif
+#ifndef __Pyx_PyAsyncMethodsStruct  /* only when the name was not already defined as a macro above */
+    typedef struct {  /* local three-slot struct declared under the same name */
+        unaryfunc am_await;
+        unaryfunc am_aiter;
+        unaryfunc am_anext;
+    } __Pyx_PyAsyncMethodsStruct;
+#endif
+
+#if defined(_WIN32) || defined(WIN32) || defined(MS_WINDOWS)
+  #if !defined(_USE_MATH_DEFINES)
+    #define _USE_MATH_DEFINES
+  #endif
+#endif
+#include <math.h>
+#ifdef NAN
+#define __PYX_NAN() ((float) NAN)
+#else
+static CYTHON_INLINE float __PYX_NAN() {  /* fallback used when <math.h> does not define NAN */
+  float value;
+  memset(&value, 0xFF, sizeof(value));  /* set every byte to 0xFF; intended to yield a NaN (assumes an IEEE-754 float representation) */
+  return value;
+}
+#endif
+#if defined(__CYGWIN__) && defined(_LDBL_EQ_DBL)
+#define __Pyx_truncl trunc
+#else
+#define __Pyx_truncl truncl
+#endif
+
+#define __PYX_MARK_ERR_POS(f_index, lineno) \
+    { __pyx_filename = __pyx_f[f_index]; (void)__pyx_filename; __pyx_lineno = lineno; (void)__pyx_lineno; __pyx_clineno = __LINE__;  (void)__pyx_clineno; }
+#define __PYX_ERR(f_index, lineno, Ln_error) \
+    { __PYX_MARK_ERR_POS(f_index, lineno) goto Ln_error; }
+
+#ifdef CYTHON_EXTERN_C
+    #undef __PYX_EXTERN_C
+    #define __PYX_EXTERN_C CYTHON_EXTERN_C
+#elif defined(__PYX_EXTERN_C)
+    #ifdef _MSC_VER
+    #pragma message ("Please do not define the '__PYX_EXTERN_C' macro externally. Use 'CYTHON_EXTERN_C' instead.")
+    #else
+    #warning Please do not define the '__PYX_EXTERN_C' macro externally. Use 'CYTHON_EXTERN_C' instead.
+    #endif
+#else
+    #define __PYX_EXTERN_C extern "C++"
+#endif
+
+#define __PYX_HAVE__fairseq__data__token_block_utils_fast
+#define __PYX_HAVE_API__fairseq__data__token_block_utils_fast
+/* Early includes */
+#include <math.h>
+#include <string.h>
+#include <stdio.h>
+
+    /* Using NumPy API declarations from "numpy/__init__.cython-30.pxd" */
+    
+#include "numpy/arrayobject.h"
+#include "numpy/ndarrayobject.h"
+#include "numpy/ndarraytypes.h"
+#include "numpy/arrayscalars.h"
+#include "numpy/ufuncobject.h"
+#include "pythread.h"
+#include <stdlib.h>
+#ifdef _OPENMP
+#include <omp.h>
+#endif /* _OPENMP */
+
+#if defined(PYREX_WITHOUT_ASSERTIONS) && !defined(CYTHON_WITHOUT_ASSERTIONS)
+#define CYTHON_WITHOUT_ASSERTIONS
+#endif
+
+typedef struct {PyObject **p; const char *s; const Py_ssize_t n; const char* encoding;
+                const char is_unicode; const char is_str; const char intern; } __Pyx_StringTabEntry;
+
+#define __PYX_DEFAULT_STRING_ENCODING_IS_ASCII 0
+#define __PYX_DEFAULT_STRING_ENCODING_IS_UTF8 0
+#define __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT (PY_MAJOR_VERSION >= 3 && __PYX_DEFAULT_STRING_ENCODING_IS_UTF8)
+#define __PYX_DEFAULT_STRING_ENCODING ""
+#define __Pyx_PyObject_FromString __Pyx_PyBytes_FromString
+#define __Pyx_PyObject_FromStringAndSize __Pyx_PyBytes_FromStringAndSize
+#define __Pyx_uchar_cast(c) ((unsigned char)(c))  /* argument parenthesized so the cast applies to the whole expression */
+#define __Pyx_long_cast(x) ((long)(x))  /* argument parenthesized so the cast applies to the whole expression */
+#define __Pyx_fits_Py_ssize_t(v, type, is_signed)  (\
+    (sizeof(type) < sizeof(Py_ssize_t))  ||\
+    (sizeof(type) > sizeof(Py_ssize_t) &&\
+          likely(v < (type)PY_SSIZE_T_MAX ||\
+                 v == (type)PY_SSIZE_T_MAX)  &&\
+          (!is_signed || likely(v > (type)PY_SSIZE_T_MIN ||\
+                                v == (type)PY_SSIZE_T_MIN)))  ||\
+    (sizeof(type) == sizeof(Py_ssize_t) &&\
+          (is_signed || likely(v < (type)PY_SSIZE_T_MAX ||\
+                               v == (type)PY_SSIZE_T_MAX)))  )
+static CYTHON_INLINE int __Pyx_is_valid_index(Py_ssize_t i, Py_ssize_t limit) {  /* bounds check done as a single unsigned comparison */
+    return (size_t) i < (size_t) limit;  /* both operands are cast to size_t, so a negative i wraps to a large value and compares as out of range */
+}
+#if defined (__cplusplus) && __cplusplus >= 201103L  /* pick an absolute-value routine wide enough for a size_t-sized signed value */
+    #include <cstdlib>
+    #define __Pyx_sst_abs(value) std::abs(value)
+#elif SIZEOF_INT >= SIZEOF_SIZE_T
+    #define __Pyx_sst_abs(value) abs(value)
+#elif SIZEOF_LONG >= SIZEOF_SIZE_T
+    #define __Pyx_sst_abs(value) labs(value)
+#elif defined (_MSC_VER)
+    #define __Pyx_sst_abs(value) ((Py_ssize_t)_abs64(value))
+#elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+    #define __Pyx_sst_abs(value) llabs(value)
+#elif defined (__GNUC__)
+    #define __Pyx_sst_abs(value) __builtin_llabs(value)
+#else
+    #define __Pyx_sst_abs(value) (((value)<0) ? -(value) : (value))  /* argument parenthesized so expression arguments negate and compare as a whole */
+#endif
+static CYTHON_INLINE Py_ssize_t __Pyx_ssize_strlen(const char *s);
+static CYTHON_INLINE const char* __Pyx_PyObject_AsString(PyObject*);
+static CYTHON_INLINE const char* __Pyx_PyObject_AsStringAndSize(PyObject*, Py_ssize_t* length);
+static CYTHON_INLINE PyObject* __Pyx_PyByteArray_FromString(const char*);
+#define __Pyx_PyByteArray_FromStringAndSize(s, l) PyByteArray_FromStringAndSize((const char*)s, l)
+#define __Pyx_PyBytes_FromString        PyBytes_FromString
+#define __Pyx_PyBytes_FromStringAndSize PyBytes_FromStringAndSize
+static CYTHON_INLINE PyObject* __Pyx_PyUnicode_FromString(const char*);
+#if PY_MAJOR_VERSION < 3
+    #define __Pyx_PyStr_FromString        __Pyx_PyBytes_FromString
+    #define __Pyx_PyStr_FromStringAndSize __Pyx_PyBytes_FromStringAndSize
+#else
+    #define __Pyx_PyStr_FromString        __Pyx_PyUnicode_FromString
+    #define __Pyx_PyStr_FromStringAndSize __Pyx_PyUnicode_FromStringAndSize
+#endif
+#define __Pyx_PyBytes_AsWritableString(s)     ((char*) PyBytes_AS_STRING(s))
+#define __Pyx_PyBytes_AsWritableSString(s)    ((signed char*) PyBytes_AS_STRING(s))
+#define __Pyx_PyBytes_AsWritableUString(s)    ((unsigned char*) PyBytes_AS_STRING(s))
+#define __Pyx_PyBytes_AsString(s)     ((const char*) PyBytes_AS_STRING(s))
+#define __Pyx_PyBytes_AsSString(s)    ((const signed char*) PyBytes_AS_STRING(s))
+#define __Pyx_PyBytes_AsUString(s)    ((const unsigned char*) PyBytes_AS_STRING(s))
+#define __Pyx_PyObject_AsWritableString(s)    ((char*)(__pyx_uintptr_t) __Pyx_PyObject_AsString(s))
+#define __Pyx_PyObject_AsWritableSString(s)    ((signed char*)(__pyx_uintptr_t) __Pyx_PyObject_AsString(s))
+#define __Pyx_PyObject_AsWritableUString(s)    ((unsigned char*)(__pyx_uintptr_t) __Pyx_PyObject_AsString(s))
+#define __Pyx_PyObject_AsSString(s)    ((const signed char*) __Pyx_PyObject_AsString(s))
+#define __Pyx_PyObject_AsUString(s)    ((const unsigned char*) __Pyx_PyObject_AsString(s))
+#define __Pyx_PyObject_FromCString(s)  __Pyx_PyObject_FromString((const char*)s)
+#define __Pyx_PyBytes_FromCString(s)   __Pyx_PyBytes_FromString((const char*)s)
+#define __Pyx_PyByteArray_FromCString(s)   __Pyx_PyByteArray_FromString((const char*)s)
+#define __Pyx_PyStr_FromCString(s)     __Pyx_PyStr_FromString((const char*)s)
+#define __Pyx_PyUnicode_FromCString(s) __Pyx_PyUnicode_FromString((const char*)s)
+#define __Pyx_PyUnicode_FromOrdinal(o)       PyUnicode_FromOrdinal((int)o)
+#define __Pyx_PyUnicode_AsUnicode            PyUnicode_AsUnicode
+#define __Pyx_NewRef(obj) (Py_INCREF(obj), obj)
+#define __Pyx_Owned_Py_None(b) __Pyx_NewRef(Py_None)
+static CYTHON_INLINE PyObject * __Pyx_PyBool_FromLong(long b);
+static CYTHON_INLINE int __Pyx_PyObject_IsTrue(PyObject*);
+static CYTHON_INLINE int __Pyx_PyObject_IsTrueAndDecref(PyObject*);
+static CYTHON_INLINE PyObject* __Pyx_PyNumber_IntOrLong(PyObject* x);
+#define __Pyx_PySequence_Tuple(obj)\
+    (likely(PyTuple_CheckExact(obj)) ? __Pyx_NewRef(obj) : PySequence_Tuple(obj))
+static CYTHON_INLINE Py_ssize_t __Pyx_PyIndex_AsSsize_t(PyObject*);
+static CYTHON_INLINE PyObject * __Pyx_PyInt_FromSize_t(size_t);
+static CYTHON_INLINE Py_hash_t __Pyx_PyIndex_AsHash_t(PyObject*);
+#if CYTHON_ASSUME_SAFE_MACROS
+#define __pyx_PyFloat_AsDouble(x) (PyFloat_CheckExact(x) ? PyFloat_AS_DOUBLE(x) : PyFloat_AsDouble(x))
+#else
+#define __pyx_PyFloat_AsDouble(x) PyFloat_AsDouble(x)
+#endif
+#define __pyx_PyFloat_AsFloat(x) ((float) __pyx_PyFloat_AsDouble(x))
+#if PY_MAJOR_VERSION >= 3
+#define __Pyx_PyNumber_Int(x) (PyLong_CheckExact(x) ? __Pyx_NewRef(x) : PyNumber_Long(x))
+#else
+#define __Pyx_PyNumber_Int(x) (PyInt_CheckExact(x) ? __Pyx_NewRef(x) : PyNumber_Int(x))
+#endif
+#if CYTHON_USE_PYLONG_INTERNALS
+  #if PY_VERSION_HEX >= 0x030C00A7
+  #ifndef _PyLong_SIGN_MASK
+    #define _PyLong_SIGN_MASK 3
+  #endif
+  #ifndef _PyLong_NON_SIZE_BITS
+    #define _PyLong_NON_SIZE_BITS 3
+  #endif
+  #define __Pyx_PyLong_Sign(x)  (((PyLongObject*)x)->long_value.lv_tag & _PyLong_SIGN_MASK)
+  #define __Pyx_PyLong_IsNeg(x)  ((__Pyx_PyLong_Sign(x) & 2) != 0)
+  #define __Pyx_PyLong_IsNonNeg(x)  (!__Pyx_PyLong_IsNeg(x))
+  #define __Pyx_PyLong_IsZero(x)  (__Pyx_PyLong_Sign(x) & 1)
+  #define __Pyx_PyLong_IsPos(x)  (__Pyx_PyLong_Sign(x) == 0)
+  #define __Pyx_PyLong_CompactValueUnsigned(x)  (__Pyx_PyLong_Digits(x)[0])
+  #define __Pyx_PyLong_DigitCount(x)  ((Py_ssize_t) (((PyLongObject*)x)->long_value.lv_tag >> _PyLong_NON_SIZE_BITS))
+  #define __Pyx_PyLong_SignedDigitCount(x)\
+        ((1 - (Py_ssize_t) __Pyx_PyLong_Sign(x)) * __Pyx_PyLong_DigitCount(x))
+  #if defined(PyUnstable_Long_IsCompact) && defined(PyUnstable_Long_CompactValue)
+    #define __Pyx_PyLong_IsCompact(x)     PyUnstable_Long_IsCompact((PyLongObject*) x)
+    #define __Pyx_PyLong_CompactValue(x)  PyUnstable_Long_CompactValue((PyLongObject*) x)
+  #else
+    #define __Pyx_PyLong_IsCompact(x)     (((PyLongObject*)x)->long_value.lv_tag < (2 << _PyLong_NON_SIZE_BITS))
+    #define __Pyx_PyLong_CompactValue(x)  ((1 - (Py_ssize_t) __Pyx_PyLong_Sign(x)) * (Py_ssize_t) __Pyx_PyLong_Digits(x)[0])
+  #endif
+  typedef Py_ssize_t  __Pyx_compact_pylong;
+  typedef size_t  __Pyx_compact_upylong;
+  #else
+  #define __Pyx_PyLong_IsNeg(x)  (Py_SIZE(x) < 0)
+  #define __Pyx_PyLong_IsNonNeg(x)  (Py_SIZE(x) >= 0)
+  #define __Pyx_PyLong_IsZero(x)  (Py_SIZE(x) == 0)
+  #define __Pyx_PyLong_IsPos(x)  (Py_SIZE(x) > 0)
+  #define __Pyx_PyLong_CompactValueUnsigned(x)  ((Py_SIZE(x) == 0) ? 0 : __Pyx_PyLong_Digits(x)[0])
+  #define __Pyx_PyLong_DigitCount(x)  __Pyx_sst_abs(Py_SIZE(x))
+  #define __Pyx_PyLong_SignedDigitCount(x)  Py_SIZE(x)
+  #define __Pyx_PyLong_IsCompact(x)  (Py_SIZE(x) == 0 || Py_SIZE(x) == 1 || Py_SIZE(x) == -1)
+  #define __Pyx_PyLong_CompactValue(x)\
+        ((Py_SIZE(x) == 0) ? (sdigit) 0 : ((Py_SIZE(x) < 0) ? -(sdigit)__Pyx_PyLong_Digits(x)[0] : (sdigit)__Pyx_PyLong_Digits(x)[0]))
+  typedef sdigit  __Pyx_compact_pylong;
+  typedef digit  __Pyx_compact_upylong;
+  #endif
+  #if PY_VERSION_HEX >= 0x030C00A5
+  #define __Pyx_PyLong_Digits(x)  (((PyLongObject*)x)->long_value.ob_digit)
+  #else
+  #define __Pyx_PyLong_Digits(x)  (((PyLongObject*)x)->ob_digit)
+  #endif
+#endif
+#if PY_MAJOR_VERSION < 3 && __PYX_DEFAULT_STRING_ENCODING_IS_ASCII
+#include <string.h>
+static int __Pyx_sys_getdefaultencoding_not_ascii;
+/* Check that sys.getdefaultencoding() is compatible with ASCII.
+ *
+ * Sets __Pyx_sys_getdefaultencoding_not_ascii to 0 when the default encoding
+ * is literally "ascii", and to 1 otherwise.  For any other encoding, the 128
+ * ASCII code points are round-tripped through it and must come back
+ * byte-for-byte identical; if they do not, ValueError is raised.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int __Pyx_init_sys_getdefaultencoding_params(void) {
+    PyObject *sys_module;
+    PyObject *encoding_obj = NULL;
+    PyObject *probe_unicode = NULL;
+    PyObject *probe_bytes = NULL;
+    const char *encoding_name;
+    char probe[128];
+    int code;
+
+    sys_module = PyImport_ImportModule("sys");
+    if (sys_module == NULL) goto bad;
+    encoding_obj = PyObject_CallMethod(sys_module, (char*) "getdefaultencoding", NULL);
+    /* The module reference is no longer needed, whether or not the call worked. */
+    Py_DECREF(sys_module);
+    if (encoding_obj == NULL) goto bad;
+    encoding_name = PyBytes_AsString(encoding_obj);
+    if (encoding_name == NULL) goto bad;
+
+    if (strcmp(encoding_name, "ascii") != 0) {
+        /* Build the byte sequence 0x00..0x7F used as the round-trip probe. */
+        for (code = 0; code < 128; code++) {
+            probe[code] = (char) code;
+        }
+        __Pyx_sys_getdefaultencoding_not_ascii = 1;
+        probe_unicode = PyUnicode_DecodeASCII(probe, 128, NULL);
+        if (probe_unicode == NULL) goto bad;
+        probe_bytes = PyUnicode_AsEncodedString(probe_unicode, encoding_name, NULL);
+        if (probe_bytes == NULL
+                || !PyBytes_Check(probe_bytes)
+                || memcmp(probe, PyBytes_AS_STRING(probe_bytes), 128) != 0) {
+            PyErr_Format(
+                PyExc_ValueError,
+                "This module compiled with c_string_encoding=ascii, but default encoding '%.200s' is not a superset of ascii.",
+                encoding_name);
+            goto bad;
+        }
+        Py_DECREF(probe_unicode);
+        Py_DECREF(probe_bytes);
+    } else {
+        __Pyx_sys_getdefaultencoding_not_ascii = 0;
+    }
+
+    Py_DECREF(encoding_obj);
+    return 0;
+
+bad:
+    /* Release whatever was acquired before the failure. */
+    Py_XDECREF(encoding_obj);
+    Py_XDECREF(probe_unicode);
+    Py_XDECREF(probe_bytes);
+    return -1;
+}
+#endif
+#if __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT && PY_MAJOR_VERSION >= 3
+#define __Pyx_PyUnicode_FromStringAndSize(c_str, size) PyUnicode_DecodeUTF8(c_str, size, NULL)
+#else
+#define __Pyx_PyUnicode_FromStringAndSize(c_str, size) PyUnicode_Decode(c_str, size, __PYX_DEFAULT_STRING_ENCODING, NULL)
+#if __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT
+#include <string.h>
+static char* __PYX_DEFAULT_STRING_ENCODING;
+/* Capture sys.getdefaultencoding() as a C string.
+ *
+ * On success, stores a malloc()ed, NUL-terminated copy of the encoding name
+ * in __PYX_DEFAULT_STRING_ENCODING and returns 0.  The copy is needed because
+ * the bytes object it is read from is released before returning.
+ *
+ * On failure, returns -1 with a Python exception set (including MemoryError
+ * when the copy cannot be allocated).
+ */
+static int __Pyx_init_sys_getdefaultencoding_params(void) {
+    PyObject* sys;
+    PyObject* default_encoding = NULL;
+    char* default_encoding_c;
+    size_t encoding_size;
+    sys = PyImport_ImportModule("sys");
+    if (!sys) goto bad;
+    default_encoding = PyObject_CallMethod(sys, (char*) (const char*) "getdefaultencoding", NULL);
+    Py_DECREF(sys);
+    if (!default_encoding) goto bad;
+    default_encoding_c = PyBytes_AsString(default_encoding);
+    if (!default_encoding_c) goto bad;
+    /* Length including the terminating NUL, computed once for malloc + copy. */
+    encoding_size = strlen(default_encoding_c) + 1;
+    __PYX_DEFAULT_STRING_ENCODING = (char*) malloc(encoding_size);
+    if (!__PYX_DEFAULT_STRING_ENCODING) {
+        /* malloc() does not set a Python exception; do it here so the caller
+         * never sees an error return without an exception. */
+        PyErr_NoMemory();
+        goto bad;
+    }
+    memcpy(__PYX_DEFAULT_STRING_ENCODING, default_encoding_c, encoding_size);
+    Py_DECREF(default_encoding);
+    return 0;
+bad:
+    Py_XDECREF(default_encoding);
+    return -1;
+}
+#endif
+#endif
+
+
+/* Test for GCC > 2.95 */
+#if defined(__GNUC__)     && (__GNUC__ > 2 || (__GNUC__ == 2 && (__GNUC_MINOR__ > 95)))
+  #define likely(x)   __builtin_expect(!!(x), 1)
+  #define unlikely(x) __builtin_expect(!!(x), 0)
+#else /* !__GNUC__ or GCC <= 2.95 */
+  #define likely(x)   (x)
+  #define unlikely(x) (x)
+#endif /* __GNUC__ */
+/* No-op helper: accepts a pointer and deliberately does nothing with it. */
+static CYTHON_INLINE void __Pyx_pretend_to_initialize(void* ptr) {
+    (void) ptr;  /* explicitly discard the argument */
+}
+
+#if !CYTHON_USE_MODULE_STATE
+static PyObject *__pyx_m = NULL;
+#endif
+static int __pyx_lineno;
+static int __pyx_clineno = 0;
+static const char * __pyx_cfilenm = __FILE__;
+static const char *__pyx_filename;
+
+/* Header.proto */
+#if !defined(CYTHON_CCOMPLEX)
+  #if defined(__cplusplus)
+    #define CYTHON_CCOMPLEX 1
+  #elif (defined(_Complex_I) && !defined(_MSC_VER)) || ((defined (__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) && !defined(__STDC_NO_COMPLEX__) && !defined(_MSC_VER))
+    #define CYTHON_CCOMPLEX 1
+  #else
+    #define CYTHON_CCOMPLEX 0
+  #endif
+#endif
+#if CYTHON_CCOMPLEX
+  #ifdef __cplusplus
+    #include <complex>
+  #else
+    #include <complex.h>
+  #endif
+#endif
+#if CYTHON_CCOMPLEX && !defined(__cplusplus) && defined(__sun__) && defined(__GNUC__)
+  #undef _Complex_I
+  #define _Complex_I 1.0fj
+#endif
+
+/* #### Code section: filename_table ### */
+
+static const char *__pyx_f[] = {
+  "fairseq/data/token_block_utils_fast.pyx",
+  "<stringsource>",
+  "__init__.cython-30.pxd",
+  "type.pxd",
+};
+/* #### Code section: utility_code_proto_before_types ### */
+/* ForceInitThreads.proto */
+#ifndef __PYX_FORCE_INIT_THREADS
+  #define __PYX_FORCE_INIT_THREADS 0
+#endif
+
+/* NoFastGil.proto */
+#define __Pyx_PyGILState_Ensure PyGILState_Ensure
+#define __Pyx_PyGILState_Release PyGILState_Release
+#define __Pyx_FastGIL_Remember()
+#define __Pyx_FastGIL_Forget()
+#define __Pyx_FastGilFuncInit()
+
+/* BufferFormatStructs.proto */
+/* Forward declaration: __Pyx_TypeInfo and __Pyx_StructField_ refer to each other. */
+struct __Pyx_StructField_;
+/* Flag bit 0; presumably stored in __Pyx_TypeInfo.flags -- confirm at the use sites. */
+#define __PYX_BUF_FLAGS_PACKED_STRUCT (1 << 0)
+/* Descriptor of one type: its name, an optional pointer to member-field
+ * descriptors, a size, and array extents (room for 8) with a dimension count. */
+typedef struct {
+  const char* name;
+  struct __Pyx_StructField_* fields;
+  size_t size;
+  size_t arraysize[8];
+  int ndim;
+  char typegroup;
+  char is_unsigned;
+  int flags;
+} __Pyx_TypeInfo;
+/* One named field: the field's type descriptor, its name and its offset. */
+typedef struct __Pyx_StructField_ {
+  __Pyx_TypeInfo* type;
+  const char* name;
+  size_t offset;
+} __Pyx_StructField;
+/* Stack element pairing a field with the offset of its parent. */
+typedef struct {
+  __Pyx_StructField* field;
+  size_t parent_offset;
+} __Pyx_BufFmt_StackElem;
+/* Working state for the buffer-format code (per this section's title): a root
+ * field, a pointer into a stack of __Pyx_BufFmt_StackElem, and assorted
+ * counters and mode characters.  The meaning of each counter/mode is defined
+ * by the code that uses this struct, which is not part of this section. */
+typedef struct {
+  __Pyx_StructField root;
+  __Pyx_BufFmt_StackElem* head;
+  size_t fmt_offset;
+  size_t new_count, enc_count;
+  size_t struct_alignment;
+  int is_complex;
+  char enc_type;
+  char new_packmode;
+  char enc_packmode;
+  char is_valid_array;
+} __Pyx_BufFmt_Context;
+
+/* Atomics.proto */
+#include <pythread.h>
+#ifndef CYTHON_ATOMICS
+    #define CYTHON_ATOMICS 1
+#endif
+#define __PYX_CYTHON_ATOMICS_ENABLED() CYTHON_ATOMICS
+#define __pyx_atomic_int_type int
+#define __pyx_nonatomic_int_type int
+#if CYTHON_ATOMICS && (defined(__STDC_VERSION__) &&\
+                        (__STDC_VERSION__ >= 201112L) &&\
+                        !defined(__STDC_NO_ATOMICS__))
+    #include <stdatomic.h>
+#elif CYTHON_ATOMICS && (defined(__cplusplus) && (\
+                    (__cplusplus >= 201103L) ||\
+                    (defined(_MSC_VER) && _MSC_VER >= 1700)))
+    #include <atomic>
+#endif
+#if CYTHON_ATOMICS && (defined(__STDC_VERSION__) &&\
+                        (__STDC_VERSION__ >= 201112L) &&\
+                        !defined(__STDC_NO_ATOMICS__) &&\
+                       ATOMIC_INT_LOCK_FREE == 2)
+    #undef __pyx_atomic_int_type
+    #define __pyx_atomic_int_type atomic_int
+    #define __pyx_atomic_incr_aligned(value) atomic_fetch_add_explicit(value, 1, memory_order_relaxed)
+    #define __pyx_atomic_decr_aligned(value) atomic_fetch_sub_explicit(value, 1, memory_order_acq_rel)
+    #if defined(__PYX_DEBUG_ATOMICS) && defined(_MSC_VER)
+        #pragma message ("Using standard C atomics")
+    #elif defined(__PYX_DEBUG_ATOMICS)
+        #warning "Using standard C atomics"
+    #endif
+#elif CYTHON_ATOMICS && (defined(__cplusplus) && (\
+                    (__cplusplus >= 201103L) ||\
+\
+                    (defined(_MSC_VER) && _MSC_VER >= 1700)) &&\
+                    ATOMIC_INT_LOCK_FREE == 2)
+    #undef __pyx_atomic_int_type
+    #define __pyx_atomic_int_type std::atomic_int
+    #define __pyx_atomic_incr_aligned(value) std::atomic_fetch_add_explicit(value, 1, std::memory_order_relaxed)
+    #define __pyx_atomic_decr_aligned(value) std::atomic_fetch_sub_explicit(value, 1, std::memory_order_acq_rel)
+    #if defined(__PYX_DEBUG_ATOMICS) && defined(_MSC_VER)
+        #pragma message ("Using standard C++ atomics")
+    #elif defined(__PYX_DEBUG_ATOMICS)
+        #warning "Using standard C++ atomics"
+    #endif
+#elif CYTHON_ATOMICS && (__GNUC__ >= 5 || (__GNUC__ == 4 &&\
+                    (__GNUC_MINOR__ > 1 ||\
+                    (__GNUC_MINOR__ == 1 && __GNUC_PATCHLEVEL__ >= 2))))
+    #define __pyx_atomic_incr_aligned(value) __sync_fetch_and_add(value, 1)
+    #define __pyx_atomic_decr_aligned(value) __sync_fetch_and_sub(value, 1)
+    #ifdef __PYX_DEBUG_ATOMICS
+        #warning "Using GNU atomics"
+    #endif
+#elif CYTHON_ATOMICS && defined(_MSC_VER)
+    #include <intrin.h>
+    #undef __pyx_atomic_int_type
+    #define __pyx_atomic_int_type long
+    #undef __pyx_nonatomic_int_type
+    #define __pyx_nonatomic_int_type long
+    #pragma intrinsic (_InterlockedExchangeAdd)
+    #define __pyx_atomic_incr_aligned(value) _InterlockedExchangeAdd(value, 1)
+    #define __pyx_atomic_decr_aligned(value) _InterlockedExchangeAdd(value, -1)
+    #ifdef __PYX_DEBUG_ATOMICS
+        #pragma message ("Using MSVC atomics")
+    #endif
+#else
+    #undef CYTHON_ATOMICS
+    #define CYTHON_ATOMICS 0
+    #ifdef __PYX_DEBUG_ATOMICS
+        #warning "Not using atomics"
+    #endif
+#endif
+#if CYTHON_ATOMICS
+    #define __pyx_add_acquisition_count(memview)\
+             __pyx_atomic_incr_aligned(__pyx_get_slice_count_pointer(memview))
+    #define __pyx_sub_acquisition_count(memview)\
+            __pyx_atomic_decr_aligned(__pyx_get_slice_count_pointer(memview))
+#else
+    #define __pyx_add_acquisition_count(memview)\
+            __pyx_add_acquisition_count_locked(__pyx_get_slice_count_pointer(memview), memview->lock)
+    #define __pyx_sub_acquisition_count(memview)\
+            __pyx_sub_acquisition_count_locked(__pyx_get_slice_count_pointer(memview), memview->lock)
+#endif
+
+/* MemviewSliceStruct.proto */
+struct __pyx_memoryview_obj;
+/* Plain-C descriptor of a memoryview slice: a pointer to a memoryview object,
+ * a raw data pointer, and shape / strides / suboffsets arrays with room for
+ * 8 entries each. */
+typedef struct {
+  struct __pyx_memoryview_obj *memview;
+  char *data;
+  Py_ssize_t shape[8];
+  Py_ssize_t strides[8];
+  Py_ssize_t suboffsets[8];
+} __Pyx_memviewslice;
+#define __Pyx_MemoryView_Len(m)  (m.shape[0])
+
+/* #### Code section: numeric_typedefs ### */
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":787
+ * # in Cython to enable them only on the right systems.
+ * 
+ * ctypedef npy_int8       int8_t             # <<<<<<<<<<<<<<
+ * ctypedef npy_int16      int16_t
+ * ctypedef npy_int32      int32_t
+ */
+typedef npy_int8 __pyx_t_5numpy_int8_t;
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":788
+ * 
+ * ctypedef npy_int8       int8_t
+ * ctypedef npy_int16      int16_t             # <<<<<<<<<<<<<<
+ * ctypedef npy_int32      int32_t
+ * ctypedef npy_int64      int64_t
+ */
+typedef npy_int16 __pyx_t_5numpy_int16_t;
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":789
+ * ctypedef npy_int8       int8_t
+ * ctypedef npy_int16      int16_t
+ * ctypedef npy_int32      int32_t             # <<<<<<<<<<<<<<
+ * ctypedef npy_int64      int64_t
+ * #ctypedef npy_int96      int96_t
+ */
+typedef npy_int32 __pyx_t_5numpy_int32_t;
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":790
+ * ctypedef npy_int16      int16_t
+ * ctypedef npy_int32      int32_t
+ * ctypedef npy_int64      int64_t             # <<<<<<<<<<<<<<
+ * #ctypedef npy_int96      int96_t
+ * #ctypedef npy_int128     int128_t
+ */
+typedef npy_int64 __pyx_t_5numpy_int64_t;
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":794
+ * #ctypedef npy_int128     int128_t
+ * 
+ * ctypedef npy_uint8      uint8_t             # <<<<<<<<<<<<<<
+ * ctypedef npy_uint16     uint16_t
+ * ctypedef npy_uint32     uint32_t
+ */
+typedef npy_uint8 __pyx_t_5numpy_uint8_t;
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":795
+ * 
+ * ctypedef npy_uint8      uint8_t
+ * ctypedef npy_uint16     uint16_t             # <<<<<<<<<<<<<<
+ * ctypedef npy_uint32     uint32_t
+ * ctypedef npy_uint64     uint64_t
+ */
+typedef npy_uint16 __pyx_t_5numpy_uint16_t;
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":796
+ * ctypedef npy_uint8      uint8_t
+ * ctypedef npy_uint16     uint16_t
+ * ctypedef npy_uint32     uint32_t             # <<<<<<<<<<<<<<
+ * ctypedef npy_uint64     uint64_t
+ * #ctypedef npy_uint96     uint96_t
+ */
+typedef npy_uint32 __pyx_t_5numpy_uint32_t;
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":797
+ * ctypedef npy_uint16     uint16_t
+ * ctypedef npy_uint32     uint32_t
+ * ctypedef npy_uint64     uint64_t             # <<<<<<<<<<<<<<
+ * #ctypedef npy_uint96     uint96_t
+ * #ctypedef npy_uint128    uint128_t
+ */
+typedef npy_uint64 __pyx_t_5numpy_uint64_t;
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":801
+ * #ctypedef npy_uint128    uint128_t
+ * 
+ * ctypedef npy_float32    float32_t             # <<<<<<<<<<<<<<
+ * ctypedef npy_float64    float64_t
+ * #ctypedef npy_float80    float80_t
+ */
+typedef npy_float32 __pyx_t_5numpy_float32_t;
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":802
+ * 
+ * ctypedef npy_float32    float32_t
+ * ctypedef npy_float64    float64_t             # <<<<<<<<<<<<<<
+ * #ctypedef npy_float80    float80_t
+ * #ctypedef npy_float128   float128_t
+ */
+typedef npy_float64 __pyx_t_5numpy_float64_t;
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":809
+ * ctypedef double complex complex128_t
+ * 
+ * ctypedef npy_longlong   longlong_t             # <<<<<<<<<<<<<<
+ * ctypedef npy_ulonglong  ulonglong_t
+ * 
+ */
+typedef npy_longlong __pyx_t_5numpy_longlong_t;
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":810
+ * 
+ * ctypedef npy_longlong   longlong_t
+ * ctypedef npy_ulonglong  ulonglong_t             # <<<<<<<<<<<<<<
+ * 
+ * ctypedef npy_intp       intp_t
+ */
+typedef npy_ulonglong __pyx_t_5numpy_ulonglong_t;
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":812
+ * ctypedef npy_ulonglong  ulonglong_t
+ * 
+ * ctypedef npy_intp       intp_t             # <<<<<<<<<<<<<<
+ * ctypedef npy_uintp      uintp_t
+ * 
+ */
+typedef npy_intp __pyx_t_5numpy_intp_t;
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":813
+ * 
+ * ctypedef npy_intp       intp_t
+ * ctypedef npy_uintp      uintp_t             # <<<<<<<<<<<<<<
+ * 
+ * ctypedef npy_double     float_t
+ */
+typedef npy_uintp __pyx_t_5numpy_uintp_t;
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":815
+ * ctypedef npy_uintp      uintp_t
+ * 
+ * ctypedef npy_double     float_t             # <<<<<<<<<<<<<<
+ * ctypedef npy_double     double_t
+ * ctypedef npy_longdouble longdouble_t
+ */
+typedef npy_double __pyx_t_5numpy_float_t;
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":816
+ * 
+ * ctypedef npy_double     float_t
+ * ctypedef npy_double     double_t             # <<<<<<<<<<<<<<
+ * ctypedef npy_longdouble longdouble_t
+ * 
+ */
+typedef npy_double __pyx_t_5numpy_double_t;
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":817
+ * ctypedef npy_double     float_t
+ * ctypedef npy_double     double_t
+ * ctypedef npy_longdouble longdouble_t             # <<<<<<<<<<<<<<
+ * 
+ * ctypedef float complex       cfloat_t
+ */
+typedef npy_longdouble __pyx_t_5numpy_longdouble_t;
+
+/* "fairseq/data/token_block_utils_fast.pyx":16
+ * 
+ * DTYPE = np.int64
+ * ctypedef np.int64_t DTYPE_t             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+typedef __pyx_t_5numpy_int64_t __pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t;
+/* #### Code section: complex_type_declarations ### */
+/* Declarations.proto */
+#if CYTHON_CCOMPLEX && (1) && (!0 || __cplusplus)
+  #ifdef __cplusplus
+    typedef ::std::complex< float > __pyx_t_float_complex;
+  #else
+    typedef float _Complex __pyx_t_float_complex;
+  #endif
+#else
+    typedef struct { float real, imag; } __pyx_t_float_complex;
+#endif
+static CYTHON_INLINE __pyx_t_float_complex __pyx_t_float_complex_from_parts(float, float);
+
+/* Declarations.proto */
+#if CYTHON_CCOMPLEX && (1) && (!0 || __cplusplus)
+  #ifdef __cplusplus
+    typedef ::std::complex< double > __pyx_t_double_complex;
+  #else
+    typedef double _Complex __pyx_t_double_complex;
+  #endif
+#else
+    typedef struct { double real, imag; } __pyx_t_double_complex;
+#endif
+static CYTHON_INLINE __pyx_t_double_complex __pyx_t_double_complex_from_parts(double, double);
+
+/* Declarations.proto */
+#if CYTHON_CCOMPLEX && (1) && (!0 || __cplusplus)
+  #ifdef __cplusplus
+    typedef ::std::complex< long double > __pyx_t_long_double_complex;
+  #else
+    typedef long double _Complex __pyx_t_long_double_complex;
+  #endif
+#else
+    typedef struct { long double real, imag; } __pyx_t_long_double_complex;
+#endif
+static CYTHON_INLINE __pyx_t_long_double_complex __pyx_t_long_double_complex_from_parts(long double, long double);
+
+/* #### Code section: type_declarations ### */
+
+/*--- Type declarations ---*/
+struct __pyx_obj_7fairseq_4data_22token_block_utils_fast_DatasetSearcher;
+struct __pyx_array_obj;
+struct __pyx_MemviewEnum_obj;
+struct __pyx_memoryview_obj;
+struct __pyx_memoryviewslice_obj;
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1113
+ * 
+ * # Iterator API added in v1.6
+ * ctypedef int (*NpyIter_IterNextFunc)(NpyIter* it) noexcept nogil             # <<<<<<<<<<<<<<
+ * ctypedef void (*NpyIter_GetMultiIndexFunc)(NpyIter* it, npy_intp* outcoords) noexcept nogil
+ * 
+ */
+typedef int (*__pyx_t_5numpy_NpyIter_IterNextFunc)(NpyIter *);
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1114
+ * # Iterator API added in v1.6
+ * ctypedef int (*NpyIter_IterNextFunc)(NpyIter* it) noexcept nogil
+ * ctypedef void (*NpyIter_GetMultiIndexFunc)(NpyIter* it, npy_intp* outcoords) noexcept nogil             # <<<<<<<<<<<<<<
+ * 
+ * cdef extern from "numpy/arrayobject.h":
+ */
+typedef void (*__pyx_t_5numpy_NpyIter_GetMultiIndexFunc)(NpyIter *, npy_intp *);
+
+/* "fairseq/data/token_block_utils_fast.pyx":139
+ * 
+ * 
+ * cdef class DatasetSearcher(object):             # <<<<<<<<<<<<<<
+ *     """Helper for mapping "flat" indices to indices and offsets in an
+ *     underlying dataset."""
+ */
+/* Instance layout of the DatasetSearcher cdef class. */
+struct __pyx_obj_7fairseq_4data_22token_block_utils_fast_DatasetSearcher {
+  PyObject_HEAD
+  /* Pointer to the class's table of C-level method pointers. */
+  struct __pyx_vtabstruct_7fairseq_4data_22token_block_utils_fast_DatasetSearcher *__pyx_vtab;
+  /* Three attributes of type DTYPE_t (a typedef of numpy's int64_t). */
+  __pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t current_i;
+  __pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t current_offset;
+  __pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t current_index;
+  /* Memoryview-slice attribute. */
+  __Pyx_memviewslice sizes;
+};
+
+
+/* "View.MemoryView":114
+ * @cython.collection_type("sequence")
+ * @cname("__pyx_array")
+ * cdef class array:             # <<<<<<<<<<<<<<
+ * 
+ *     cdef:
+ */
+/* Instance layout of the View.MemoryView `array` cdef class. */
+struct __pyx_array_obj {
+  PyObject_HEAD
+  /* Pointer to the class's table of C-level method pointers. */
+  struct __pyx_vtabstruct_array *__pyx_vtab;
+  char *data;
+  Py_ssize_t len;
+  char *format;
+  int ndim;
+  Py_ssize_t *_shape;
+  Py_ssize_t *_strides;
+  Py_ssize_t itemsize;
+  /* Python-object attributes. */
+  PyObject *mode;
+  PyObject *_format;
+  /* Function pointer taking a single void* argument. */
+  void (*callback_free_data)(void *);
+  int free_data;
+  int dtype_is_object;
+};
+
+
+/* "View.MemoryView":302
+ * 
+ * @cname('__pyx_MemviewEnum')
+ * cdef class Enum(object):             # <<<<<<<<<<<<<<
+ *     cdef object name
+ *     def __init__(self, name):
+ */
+/* Instance layout of the View.MemoryView `Enum` cdef class: a single
+ * Python-object attribute, `name`. */
+struct __pyx_MemviewEnum_obj {
+  PyObject_HEAD
+  PyObject *name;
+};
+
+
+/* "View.MemoryView":337
+ * 
+ * @cname('__pyx_memoryview')
+ * cdef class memoryview:             # <<<<<<<<<<<<<<
+ * 
+ *     cdef object obj
+ */
+/* Instance layout of the View.MemoryView `memoryview` cdef class. */
+struct __pyx_memoryview_obj {
+  PyObject_HEAD
+  /* Pointer to the class's table of C-level method pointers. */
+  struct __pyx_vtabstruct_memoryview *__pyx_vtab;
+  /* Python-object attributes. */
+  PyObject *obj;
+  PyObject *_size;
+  PyObject *_array_interface;
+  /* Passed to the *_acquisition_count_locked macros when CYTHON_ATOMICS is 0. */
+  PyThread_type_lock lock;
+  /* Counter whose type is selected by the Atomics section (__pyx_atomic_int_type). */
+  __pyx_atomic_int_type acquisition_count;
+  Py_buffer view;
+  int flags;
+  int dtype_is_object;
+  __Pyx_TypeInfo *typeinfo;
+};
+
+
+/* "View.MemoryView":952
+ * @cython.collection_type("sequence")
+ * @cname('__pyx_memoryviewslice')
+ * cdef class _memoryviewslice(memoryview):             # <<<<<<<<<<<<<<
+ *     "Internal class for passing memoryview slices to Python"
+ * 
+ */
+/* Instance layout of the `_memoryviewslice` cdef class.  The base
+ * `memoryview` layout is embedded as the first member. */
+struct __pyx_memoryviewslice_obj {
+  struct __pyx_memoryview_obj __pyx_base;
+  __Pyx_memviewslice from_slice;
+  PyObject *from_object;
+  /* Function pointer: takes a char* and returns a Python object. */
+  PyObject *(*to_object_func)(char *);
+  /* Function pointer: takes a char* and a Python object, returns an int. */
+  int (*to_dtype_func)(char *, PyObject *);
+};
+
+
+
+/* "fairseq/data/token_block_utils_fast.pyx":139
+ * 
+ * 
+ * cdef class DatasetSearcher(object):             # <<<<<<<<<<<<<<
+ *     """Helper for mapping "flat" indices to indices and offsets in an
+ *     underlying dataset."""
+ */
+
+/* Table of C-level method pointers for DatasetSearcher.  Each entry takes the
+ * instance as its first argument; `step` and `seek` take one extra DTYPE_t. */
+struct __pyx_vtabstruct_7fairseq_4data_22token_block_utils_fast_DatasetSearcher {
+  PyObject *(*reset)(struct __pyx_obj_7fairseq_4data_22token_block_utils_fast_DatasetSearcher *);
+  int (*step)(struct __pyx_obj_7fairseq_4data_22token_block_utils_fast_DatasetSearcher *, __pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t);
+  PyObject *(*seek)(struct __pyx_obj_7fairseq_4data_22token_block_utils_fast_DatasetSearcher *, __pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t);
+};
+/* File-scope pointer to the DatasetSearcher method table. */
+static struct __pyx_vtabstruct_7fairseq_4data_22token_block_utils_fast_DatasetSearcher *__pyx_vtabptr_7fairseq_4data_22token_block_utils_fast_DatasetSearcher;
+
+
+/* "View.MemoryView":114
+ * @cython.collection_type("sequence")
+ * @cname("__pyx_array")
+ * cdef class array:             # <<<<<<<<<<<<<<
+ * 
+ *     cdef:
+ */
+
+/* Table of C-level method pointers for the `array` class: one entry,
+ * `get_memview`, which takes the instance and returns a Python object. */
+struct __pyx_vtabstruct_array {
+  PyObject *(*get_memview)(struct __pyx_array_obj *);
+};
+/* File-scope pointer to the `array` method table. */
+static struct __pyx_vtabstruct_array *__pyx_vtabptr_array;
+
+
+/* "View.MemoryView":337
+ * 
+ * @cname('__pyx_memoryview')
+ * cdef class memoryview:             # <<<<<<<<<<<<<<
+ * 
+ *     cdef object obj
+ */
+
+/* Table of C-level method pointers for the `memoryview` class.  Every entry
+ * takes the memoryview instance as its first argument. */
+struct __pyx_vtabstruct_memoryview {
+  char *(*get_item_pointer)(struct __pyx_memoryview_obj *, PyObject *);
+  PyObject *(*is_slice)(struct __pyx_memoryview_obj *, PyObject *);
+  PyObject *(*setitem_slice_assignment)(struct __pyx_memoryview_obj *, PyObject *, PyObject *);
+  PyObject *(*setitem_slice_assign_scalar)(struct __pyx_memoryview_obj *, struct __pyx_memoryview_obj *, PyObject *);
+  PyObject *(*setitem_indexed)(struct __pyx_memoryview_obj *, PyObject *, PyObject *);
+  PyObject *(*convert_item_to_object)(struct __pyx_memoryview_obj *, char *);
+  PyObject *(*assign_item_from_object)(struct __pyx_memoryview_obj *, char *, PyObject *);
+  PyObject *(*_get_base)(struct __pyx_memoryview_obj *);
+};
+/* File-scope pointer to the `memoryview` method table. */
+static struct __pyx_vtabstruct_memoryview *__pyx_vtabptr_memoryview;
+
+
+/* "View.MemoryView":952
+ * @cython.collection_type("sequence")
+ * @cname('__pyx_memoryviewslice')
+ * cdef class _memoryviewslice(memoryview):             # <<<<<<<<<<<<<<
+ *     "Internal class for passing memoryview slices to Python"
+ * 
+ */
+
+/* Method table for `_memoryviewslice`: embeds the `memoryview` table as its
+ * only member and adds no entries of its own. */
+struct __pyx_vtabstruct__memoryviewslice {
+  struct __pyx_vtabstruct_memoryview __pyx_base;
+};
+/* File-scope pointer to the `_memoryviewslice` method table. */
+static struct __pyx_vtabstruct__memoryviewslice *__pyx_vtabptr__memoryviewslice;
+/* #### Code section: utility_code_proto ### */
+
+/* --- Runtime support code (head) --- */
+/* Refnanny.proto */
+#ifndef CYTHON_REFNANNY
+  #define CYTHON_REFNANNY 0
+#endif
+#if CYTHON_REFNANNY
+  typedef struct {
+    void (*INCREF)(void*, PyObject*, Py_ssize_t);
+    void (*DECREF)(void*, PyObject*, Py_ssize_t);
+    void (*GOTREF)(void*, PyObject*, Py_ssize_t);
+    void (*GIVEREF)(void*, PyObject*, Py_ssize_t);
+    void* (*SetupContext)(const char*, Py_ssize_t, const char*);
+    void (*FinishContext)(void**);
+  } __Pyx_RefNannyAPIStruct;
+  static __Pyx_RefNannyAPIStruct *__Pyx_RefNanny = NULL;
+  static __Pyx_RefNannyAPIStruct *__Pyx_RefNannyImportAPI(const char *modname);
+  #define __Pyx_RefNannyDeclarations void *__pyx_refnanny = NULL;
+#ifdef WITH_THREAD
+  /* Threaded builds: when the caller asks for it (acquire_gil), take the GIL
+   * around the refnanny SetupContext call. */
+  #define __Pyx_RefNannySetupContext(name, acquire_gil)\
+          if (acquire_gil) {\
+              PyGILState_STATE __pyx_gilstate_save = PyGILState_Ensure();\
+              __pyx_refnanny = __Pyx_RefNanny->SetupContext((name), (__LINE__), (__FILE__));\
+              PyGILState_Release(__pyx_gilstate_save);\
+          } else {\
+              __pyx_refnanny = __Pyx_RefNanny->SetupContext((name), (__LINE__), (__FILE__));\
+          }
+  /* Threaded builds: take the GIL around FinishContext. */
+  #define __Pyx_RefNannyFinishContextNogil() {\
+              PyGILState_STATE __pyx_gilstate_save = PyGILState_Ensure();\
+              __Pyx_RefNannyFinishContext();\
+              PyGILState_Release(__pyx_gilstate_save);\
+          }
+#else
+  /* Non-threaded builds: no GIL handling, call straight through. */
+  #define __Pyx_RefNannySetupContext(name, acquire_gil)\
+          __pyx_refnanny = __Pyx_RefNanny->SetupContext((name), (__LINE__), (__FILE__))
+  #define __Pyx_RefNannyFinishContextNogil() __Pyx_RefNannyFinishContext()
+#endif
+  /* __Pyx_RefNannyFinishContextNogil is defined exactly once per branch
+   * above.  It must not be defined again here: a second definition would be an
+   * incompatible macro redefinition in the non-threaded branch. */
+  #define __Pyx_RefNannyFinishContext()\
+          __Pyx_RefNanny->FinishContext(&__pyx_refnanny)
+  #define __Pyx_INCREF(r)  __Pyx_RefNanny->INCREF(__pyx_refnanny, (PyObject *)(r), (__LINE__))
+  #define __Pyx_DECREF(r)  __Pyx_RefNanny->DECREF(__pyx_refnanny, (PyObject *)(r), (__LINE__))
+  #define __Pyx_GOTREF(r)  __Pyx_RefNanny->GOTREF(__pyx_refnanny, (PyObject *)(r), (__LINE__))
+  #define __Pyx_GIVEREF(r) __Pyx_RefNanny->GIVEREF(__pyx_refnanny, (PyObject *)(r), (__LINE__))
+  #define __Pyx_XINCREF(r)  do { if((r) == NULL); else {__Pyx_INCREF(r); }} while(0)
+  #define __Pyx_XDECREF(r)  do { if((r) == NULL); else {__Pyx_DECREF(r); }} while(0)
+  #define __Pyx_XGOTREF(r)  do { if((r) == NULL); else {__Pyx_GOTREF(r); }} while(0)
+  #define __Pyx_XGIVEREF(r) do { if((r) == NULL); else {__Pyx_GIVEREF(r);}} while(0)
+#else
+  #define __Pyx_RefNannyDeclarations
+  #define __Pyx_RefNannySetupContext(name, acquire_gil)
+  #define __Pyx_RefNannyFinishContextNogil()
+  #define __Pyx_RefNannyFinishContext()
+  #define __Pyx_INCREF(r) Py_INCREF(r)
+  #define __Pyx_DECREF(r) Py_DECREF(r)
+  #define __Pyx_GOTREF(r)
+  #define __Pyx_GIVEREF(r)
+  #define __Pyx_XINCREF(r) Py_XINCREF(r)
+  #define __Pyx_XDECREF(r) Py_XDECREF(r)
+  #define __Pyx_XGOTREF(r)
+  #define __Pyx_XGIVEREF(r)
+#endif
+/* Replace-and-release helpers.  Each one saves the old value of `r`, stores
+ * the new value first and only then drops the reference to the old one, so
+ * `r` never holds the old pointer while it is being released.
+ * Note: `r` is evaluated more than once, and each expansion declares a local
+ * named `tmp`. */
+/* NULL-tolerant variant that always uses plain Py_XDECREF. */
+#define __Pyx_Py_XDECREF_SET(r, v) do {\
+        PyObject *tmp = (PyObject *) r;\
+        r = v; Py_XDECREF(tmp);\
+    } while (0)
+/* NULL-tolerant variant using __Pyx_XDECREF. */
+#define __Pyx_XDECREF_SET(r, v) do {\
+        PyObject *tmp = (PyObject *) r;\
+        r = v; __Pyx_XDECREF(tmp);\
+    } while (0)
+/* Variant using __Pyx_DECREF, which does not check the old value for NULL. */
+#define __Pyx_DECREF_SET(r, v) do {\
+        PyObject *tmp = (PyObject *) r;\
+        r = v; __Pyx_DECREF(tmp);\
+    } while (0)
+/* Set `r` to NULL, then release the old value (no NULL check). */
+#define __Pyx_CLEAR(r)    do { PyObject* tmp = ((PyObject*)(r)); r = NULL; __Pyx_DECREF(tmp);} while(0)
+/* As __Pyx_CLEAR, but does nothing when `r` is already NULL. */
+#define __Pyx_XCLEAR(r)   do { if((r) != NULL) {PyObject* tmp = ((PyObject*)(r)); r = NULL; __Pyx_DECREF(tmp);}} while(0)
+
+/* PyErrExceptionMatches.proto */
+#if CYTHON_FAST_THREAD_STATE
+#define __Pyx_PyErr_ExceptionMatches(err) __Pyx_PyErr_ExceptionMatchesInState(__pyx_tstate, err)
+static CYTHON_INLINE int __Pyx_PyErr_ExceptionMatchesInState(PyThreadState* tstate, PyObject* err);
+#else
+#define __Pyx_PyErr_ExceptionMatches(err)  PyErr_ExceptionMatches(err)
+#endif
+
+/* PyThreadStateGet.proto */
+#if CYTHON_FAST_THREAD_STATE
+#define __Pyx_PyThreadState_declare  PyThreadState *__pyx_tstate;
+#define __Pyx_PyThreadState_assign  __pyx_tstate = __Pyx_PyThreadState_Current;
+#if PY_VERSION_HEX >= 0x030C00A6
+#define __Pyx_PyErr_Occurred()  (__pyx_tstate->current_exception != NULL)
+#define __Pyx_PyErr_CurrentExceptionType()  (__pyx_tstate->current_exception ? (PyObject*) Py_TYPE(__pyx_tstate->current_exception) : (PyObject*) NULL)
+#else
+#define __Pyx_PyErr_Occurred()  (__pyx_tstate->curexc_type != NULL)
+#define __Pyx_PyErr_CurrentExceptionType()  (__pyx_tstate->curexc_type)
+#endif
+#else
+#define __Pyx_PyThreadState_declare
+#define __Pyx_PyThreadState_assign
+#define __Pyx_PyErr_Occurred()  (PyErr_Occurred() != NULL)
+#define __Pyx_PyErr_CurrentExceptionType()  PyErr_Occurred()
+#endif
+
+/* PyErrFetchRestore.proto */
+#if CYTHON_FAST_THREAD_STATE
+#define __Pyx_PyErr_Clear() __Pyx_ErrRestore(NULL, NULL, NULL)
+#define __Pyx_ErrRestoreWithState(type, value, tb)  __Pyx_ErrRestoreInState(PyThreadState_GET(), type, value, tb)
+#define __Pyx_ErrFetchWithState(type, value, tb)    __Pyx_ErrFetchInState(PyThreadState_GET(), type, value, tb)
+#define __Pyx_ErrRestore(type, value, tb)  __Pyx_ErrRestoreInState(__pyx_tstate, type, value, tb)
+#define __Pyx_ErrFetch(type, value, tb)    __Pyx_ErrFetchInState(__pyx_tstate, type, value, tb)
+static CYTHON_INLINE void __Pyx_ErrRestoreInState(PyThreadState *tstate, PyObject *type, PyObject *value, PyObject *tb);
+static CYTHON_INLINE void __Pyx_ErrFetchInState(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb);
+#if CYTHON_COMPILING_IN_CPYTHON && PY_VERSION_HEX < 0x030C00A6
+#define __Pyx_PyErr_SetNone(exc) (Py_INCREF(exc), __Pyx_ErrRestore((exc), NULL, NULL))
+#else
+#define __Pyx_PyErr_SetNone(exc) PyErr_SetNone(exc)
+#endif
+#else
+#define __Pyx_PyErr_Clear() PyErr_Clear()
+#define __Pyx_PyErr_SetNone(exc) PyErr_SetNone(exc)
+#define __Pyx_ErrRestoreWithState(type, value, tb)  PyErr_Restore(type, value, tb)
+#define __Pyx_ErrFetchWithState(type, value, tb)  PyErr_Fetch(type, value, tb)
+#define __Pyx_ErrRestoreInState(tstate, type, value, tb)  PyErr_Restore(type, value, tb)
+#define __Pyx_ErrFetchInState(tstate, type, value, tb)  PyErr_Fetch(type, value, tb)
+#define __Pyx_ErrRestore(type, value, tb)  PyErr_Restore(type, value, tb)
+#define __Pyx_ErrFetch(type, value, tb)  PyErr_Fetch(type, value, tb)
+#endif
+
+/* PyObjectGetAttrStr.proto */
+#if CYTHON_USE_TYPE_SLOTS
+static CYTHON_INLINE PyObject* __Pyx_PyObject_GetAttrStr(PyObject* obj, PyObject* attr_name);
+#else
+#define __Pyx_PyObject_GetAttrStr(o,n) PyObject_GetAttr(o,n)
+#endif
+
+/* PyObjectGetAttrStrNoError.proto */
+static CYTHON_INLINE PyObject* __Pyx_PyObject_GetAttrStrNoError(PyObject* obj, PyObject* attr_name);
+
+/* GetBuiltinName.proto */
+static PyObject *__Pyx_GetBuiltinName(PyObject *name);
+
+/* TupleAndListFromArray.proto */
+#if CYTHON_COMPILING_IN_CPYTHON
+static CYTHON_INLINE PyObject* __Pyx_PyList_FromArray(PyObject *const *src, Py_ssize_t n);
+static CYTHON_INLINE PyObject* __Pyx_PyTuple_FromArray(PyObject *const *src, Py_ssize_t n);
+#endif
+
+/* IncludeStringH.proto */
+#include <string.h>
+
+/* BytesEquals.proto */
+static CYTHON_INLINE int __Pyx_PyBytes_Equals(PyObject* s1, PyObject* s2, int equals);
+
+/* UnicodeEquals.proto */
+static CYTHON_INLINE int __Pyx_PyUnicode_Equals(PyObject* s1, PyObject* s2, int equals);
+
+/* fastcall.proto */
+#if CYTHON_AVOID_BORROWED_REFS  /* __Pyx_Arg_VARARGS(args, i): fetch positional argument i from the args tuple */
+    #define __Pyx_Arg_VARARGS(args, i) PySequence_GetItem(args, i)  /* PySequence_GetItem returns a new (owned) reference */
+#elif CYTHON_ASSUME_SAFE_MACROS
+    #define __Pyx_Arg_VARARGS(args, i) PyTuple_GET_ITEM(args, i)  /* unchecked macro access, borrowed reference */
+#else
+    #define __Pyx_Arg_VARARGS(args, i) PyTuple_GetItem(args, i)  /* checked function call, borrowed reference */
+#endif
+#if CYTHON_AVOID_BORROWED_REFS  /* companion macros for holding / releasing a fetched argument */
+    #define __Pyx_Arg_NewRef_VARARGS(arg) __Pyx_NewRef(arg)
+    #define __Pyx_Arg_XDECREF_VARARGS(arg) Py_XDECREF(arg)
+#else
+    #define __Pyx_Arg_NewRef_VARARGS(arg) arg  /* no-op: passes the argument through unchanged */
+    #define __Pyx_Arg_XDECREF_VARARGS(arg)  /* no-op: expands to nothing */
+#endif
+#define __Pyx_NumKwargs_VARARGS(kwds) PyDict_Size(kwds)  /* VARARGS convention: kwds is handled as a dict */
+#define __Pyx_KwValues_VARARGS(args, nargs) NULL  /* VARARGS convention has no separate keyword-value array */
+#define __Pyx_GetKwValue_VARARGS(kw, kwvalues, s) __Pyx_PyDict_GetItemStrWithError(kw, s)  /* kwvalues is ignored; lookup goes through the dict */
+#define __Pyx_KwargsAsDict_VARARGS(kw, kwvalues) PyDict_Copy(kw)  /* kwvalues is ignored; result is a copy of kw */
+#if CYTHON_METH_FASTCALL  /* FASTCALL convention: args is a C array of PyObject*, kwds is sized with PyTuple_GET_SIZE */
+    #define __Pyx_Arg_FASTCALL(args, i) args[i]
+    #define __Pyx_NumKwargs_FASTCALL(kwds) PyTuple_GET_SIZE(kwds)
+    #define __Pyx_KwValues_FASTCALL(args, nargs) ((args) + (nargs))  /* pointer to the array slot just past the nargs positional entries */
+    static CYTHON_INLINE PyObject * __Pyx_GetKwValue_FASTCALL(PyObject *kwnames, PyObject *const *kwvalues, PyObject *s);
+#if CYTHON_COMPILING_IN_CPYTHON && PY_VERSION_HEX >= 0x030d0000  /* CPython 3.13+: a helper function is declared instead of using _PyStack_AsDict */
+    CYTHON_UNUSED static PyObject *__Pyx_KwargsAsDict_FASTCALL(PyObject *kwnames, PyObject *const *kwvalues);
+  #else
+    #define __Pyx_KwargsAsDict_FASTCALL(kw, kwvalues) _PyStack_AsDict(kwvalues, kw)  /* note the swapped argument order in the call */
+  #endif
+    #define __Pyx_Arg_NewRef_FASTCALL(arg) arg  /* no-op, __Pyx_Arg_FASTCALL is direct and this needs
+                                                   to have the same reference counting */
+    #define __Pyx_Arg_XDECREF_FASTCALL(arg)
+#else  /* no METH_FASTCALL support: every *_FASTCALL name is an alias of its *_VARARGS counterpart */
+    #define __Pyx_Arg_FASTCALL __Pyx_Arg_VARARGS
+    #define __Pyx_NumKwargs_FASTCALL __Pyx_NumKwargs_VARARGS
+    #define __Pyx_KwValues_FASTCALL __Pyx_KwValues_VARARGS
+    #define __Pyx_GetKwValue_FASTCALL __Pyx_GetKwValue_VARARGS
+    #define __Pyx_KwargsAsDict_FASTCALL __Pyx_KwargsAsDict_VARARGS
+    #define __Pyx_Arg_NewRef_FASTCALL(arg) __Pyx_Arg_NewRef_VARARGS(arg)
+    #define __Pyx_Arg_XDECREF_FASTCALL(arg) __Pyx_Arg_XDECREF_VARARGS(arg)
+#endif
+#if CYTHON_COMPILING_IN_CPYTHON && CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS  /* args[start:stop] as a tuple, built from the address of item 'start' */
+#define __Pyx_ArgsSlice_VARARGS(args, start, stop) __Pyx_PyTuple_FromArray(&__Pyx_Arg_VARARGS(args, start), stop - start)
+#define __Pyx_ArgsSlice_FASTCALL(args, start, stop) __Pyx_PyTuple_FromArray(&__Pyx_Arg_FASTCALL(args, start), stop - start)
+#else  /* otherwise both variants slice with PyTuple_GetSlice */
+#define __Pyx_ArgsSlice_VARARGS(args, start, stop) PyTuple_GetSlice(args, start, stop)
+#define __Pyx_ArgsSlice_FASTCALL(args, start, stop) PyTuple_GetSlice(args, start, stop)
+#endif
+
+/* RaiseArgTupleInvalid.proto */
+static void __Pyx_RaiseArgtupleInvalid(const char* func_name, int exact,
+    Py_ssize_t num_min, Py_ssize_t num_max, Py_ssize_t num_found);
+
+/* RaiseDoubleKeywords.proto */
+static void __Pyx_RaiseDoubleKeywordsError(const char* func_name, PyObject* kw_name);
+
+/* ParseKeywords.proto */
+static int __Pyx_ParseOptionalKeywords(PyObject *kwds, PyObject *const *kwvalues,
+    PyObject **argnames[],
+    PyObject *kwds2, PyObject *values[], Py_ssize_t num_pos_args,
+    const char* function_name);
+
+/* ArgTypeTest.proto */
+#define __Pyx_ArgTypeTest(obj, type, none_allowed, name, exact)\
+    ((likely(__Pyx_IS_TYPE(obj, type) | (none_allowed && (obj == Py_None)))) ? 1 :\
+        __Pyx__ArgTypeTest(obj, type, name, exact))
+static int __Pyx__ArgTypeTest(PyObject *obj, PyTypeObject *type, const char *name, int exact);
+
+/* RaiseException.proto */
+static void __Pyx_Raise(PyObject *type, PyObject *value, PyObject *tb, PyObject *cause);
+
+/* PyFunctionFastCall.proto */
+#if CYTHON_FAST_PYCALL
+#if !CYTHON_VECTORCALL
+#define __Pyx_PyFunction_FastCall(func, args, nargs)\
+    __Pyx_PyFunction_FastCallDict((func), (args), (nargs), NULL)
+static PyObject *__Pyx_PyFunction_FastCallDict(PyObject *func, PyObject **args, Py_ssize_t nargs, PyObject *kwargs);
+#endif
+#define __Pyx_BUILD_ASSERT_EXPR(cond)\
+    (sizeof(char [1 - 2*!(cond)]) - 1)
+#ifndef Py_MEMBER_SIZE
+#define Py_MEMBER_SIZE(type, member) sizeof(((type *)0)->member)
+#endif
+#if !CYTHON_VECTORCALL
+#if PY_VERSION_HEX >= 0x03080000
+  #include "frameobject.h"
+#if PY_VERSION_HEX >= 0x030b00a6 && !CYTHON_COMPILING_IN_LIMITED_API && !defined(PYPY_VERSION)
+  #ifndef Py_BUILD_CORE
+    #define Py_BUILD_CORE 1
+  #endif
+  #include "internal/pycore_frame.h"
+#endif
+  #define __Pxy_PyFrame_Initialize_Offsets()
+  #define __Pyx_PyFrame_GetLocalsplus(frame)  ((frame)->f_localsplus)
+#else
+  static size_t __pyx_pyframe_localsplus_offset = 0;
+  #include "frameobject.h"
+  #define __Pxy_PyFrame_Initialize_Offsets()\
+    ((void)__Pyx_BUILD_ASSERT_EXPR(sizeof(PyFrameObject) == offsetof(PyFrameObject, f_localsplus) + Py_MEMBER_SIZE(PyFrameObject, f_localsplus)),\
+     (void)(__pyx_pyframe_localsplus_offset = ((size_t)PyFrame_Type.tp_basicsize) - Py_MEMBER_SIZE(PyFrameObject, f_localsplus)))
+  #define __Pyx_PyFrame_GetLocalsplus(frame)\
+    (assert(__pyx_pyframe_localsplus_offset), (PyObject **)(((char *)(frame)) + __pyx_pyframe_localsplus_offset))
+#endif
+#endif
+#endif
+
+/* PyObjectCall.proto */
+#if CYTHON_COMPILING_IN_CPYTHON
+static CYTHON_INLINE PyObject* __Pyx_PyObject_Call(PyObject *func, PyObject *arg, PyObject *kw);
+#else
+#define __Pyx_PyObject_Call(func, arg, kw) PyObject_Call(func, arg, kw)
+#endif
+
+/* PyObjectCallMethO.proto */
+#if CYTHON_COMPILING_IN_CPYTHON
+static CYTHON_INLINE PyObject* __Pyx_PyObject_CallMethO(PyObject *func, PyObject *arg);
+#endif
+
+/* PyObjectFastCall.proto */
+#define __Pyx_PyObject_FastCall(func, args, nargs)  __Pyx_PyObject_FastCallDict(func, args, (size_t)(nargs), NULL)
+static CYTHON_INLINE PyObject* __Pyx_PyObject_FastCallDict(PyObject *func, PyObject **args, size_t nargs, PyObject *kwargs);
+
+/* RaiseUnexpectedTypeError.proto */
+static int __Pyx_RaiseUnexpectedTypeError(const char *expected, PyObject *obj);
+
+/* GCCDiagnostics.proto */
+#if !defined(__INTEL_COMPILER) && defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6))
+#define __Pyx_HAS_GCC_DIAGNOSTIC
+#endif
+
+/* BuildPyUnicode.proto */
+static PyObject* __Pyx_PyUnicode_BuildFromAscii(Py_ssize_t ulength, char* chars, int clength,
+                                                int prepend_sign, char padding_char);
+
+/* CIntToPyUnicode.proto */
+static CYTHON_INLINE PyObject* __Pyx_PyUnicode_From_int(int value, Py_ssize_t width, char padding_char, char format_char);
+
+/* CIntToPyUnicode.proto */
+static CYTHON_INLINE PyObject* __Pyx_PyUnicode_From_Py_ssize_t(Py_ssize_t value, Py_ssize_t width, char padding_char, char format_char);
+
+/* JoinPyUnicode.proto */
+static PyObject* __Pyx_PyUnicode_Join(PyObject* value_tuple, Py_ssize_t value_count, Py_ssize_t result_ulength,
+                                      Py_UCS4 max_char);
+
+/* StrEquals.proto */
+#if PY_MAJOR_VERSION >= 3
+#define __Pyx_PyString_Equals __Pyx_PyUnicode_Equals
+#else
+#define __Pyx_PyString_Equals __Pyx_PyBytes_Equals
+#endif
+
+/* PyObjectFormatSimple.proto */
+#if CYTHON_COMPILING_IN_PYPY
+    #define __Pyx_PyObject_FormatSimple(s, f) (\
+        likely(PyUnicode_CheckExact(s)) ? (Py_INCREF(s), s) :\
+        PyObject_Format(s, f))
+#elif PY_MAJOR_VERSION < 3
+    #define __Pyx_PyObject_FormatSimple(s, f) (\
+        likely(PyUnicode_CheckExact(s)) ? (Py_INCREF(s), s) :\
+        likely(PyString_CheckExact(s)) ? PyUnicode_FromEncodedObject(s, NULL, "strict") :\
+        PyObject_Format(s, f))
+#elif CYTHON_USE_TYPE_SLOTS
+    #define __Pyx_PyObject_FormatSimple(s, f) (\
+        likely(PyUnicode_CheckExact(s)) ? (Py_INCREF(s), s) :\
+        likely(PyLong_CheckExact(s)) ? PyLong_Type.tp_repr(s) :\
+        likely(PyFloat_CheckExact(s)) ? PyFloat_Type.tp_repr(s) :\
+        PyObject_Format(s, f))
+#else
+    #define __Pyx_PyObject_FormatSimple(s, f) (\
+        likely(PyUnicode_CheckExact(s)) ? (Py_INCREF(s), s) :\
+        PyObject_Format(s, f))
+#endif
+
+CYTHON_UNUSED static int __pyx_array_getbuffer(PyObject *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags); /*proto*/
+static PyObject *__pyx_array_get_memview(struct __pyx_array_obj *); /*proto*/
+/* GetAttr.proto */
+static CYTHON_INLINE PyObject *__Pyx_GetAttr(PyObject *, PyObject *);
+
+/* GetItemInt.proto */
+#define __Pyx_GetItemInt(o, i, type, is_signed, to_py_func, is_list, wraparound, boundscheck)\
+    (__Pyx_fits_Py_ssize_t(i, type, is_signed) ?\
+    __Pyx_GetItemInt_Fast(o, (Py_ssize_t)i, is_list, wraparound, boundscheck) :\
+    (is_list ? (PyErr_SetString(PyExc_IndexError, "list index out of range"), (PyObject*)NULL) :\
+               __Pyx_GetItemInt_Generic(o, to_py_func(i))))
+#define __Pyx_GetItemInt_List(o, i, type, is_signed, to_py_func, is_list, wraparound, boundscheck)\
+    (__Pyx_fits_Py_ssize_t(i, type, is_signed) ?\
+    __Pyx_GetItemInt_List_Fast(o, (Py_ssize_t)i, wraparound, boundscheck) :\
+    (PyErr_SetString(PyExc_IndexError, "list index out of range"), (PyObject*)NULL))
+static CYTHON_INLINE PyObject *__Pyx_GetItemInt_List_Fast(PyObject *o, Py_ssize_t i,
+                                                              int wraparound, int boundscheck);
+#define __Pyx_GetItemInt_Tuple(o, i, type, is_signed, to_py_func, is_list, wraparound, boundscheck)\
+    (__Pyx_fits_Py_ssize_t(i, type, is_signed) ?\
+    __Pyx_GetItemInt_Tuple_Fast(o, (Py_ssize_t)i, wraparound, boundscheck) :\
+    (PyErr_SetString(PyExc_IndexError, "tuple index out of range"), (PyObject*)NULL))
+static CYTHON_INLINE PyObject *__Pyx_GetItemInt_Tuple_Fast(PyObject *o, Py_ssize_t i,
+                                                              int wraparound, int boundscheck);
+static PyObject *__Pyx_GetItemInt_Generic(PyObject *o, PyObject* j);
+static CYTHON_INLINE PyObject *__Pyx_GetItemInt_Fast(PyObject *o, Py_ssize_t i,
+                                                     int is_list, int wraparound, int boundscheck);
+
+/* PyObjectCallOneArg.proto */
+static CYTHON_INLINE PyObject* __Pyx_PyObject_CallOneArg(PyObject *func, PyObject *arg);
+
+/* ObjectGetItem.proto */
+#if CYTHON_USE_TYPE_SLOTS
+static CYTHON_INLINE PyObject *__Pyx_PyObject_GetItem(PyObject *obj, PyObject *key);
+#else
+#define __Pyx_PyObject_GetItem(obj, key)  PyObject_GetItem(obj, key)
+#endif
+
+/* KeywordStringCheck.proto */
+static int __Pyx_CheckKeywordStrings(PyObject *kw, const char* function_name, int kw_allowed);
+
+/* DivInt[Py_ssize_t].proto */
+static CYTHON_INLINE Py_ssize_t __Pyx_div_Py_ssize_t(Py_ssize_t, Py_ssize_t);
+
+/* UnaryNegOverflows.proto */
+#define __Pyx_UNARY_NEG_WOULD_OVERFLOW(x)\
+        (((x) < 0) & ((unsigned long)(x) == 0-(unsigned long)(x)))
+
+/* GetAttr3.proto */
+static CYTHON_INLINE PyObject *__Pyx_GetAttr3(PyObject *, PyObject *, PyObject *);
+
+/* PyDictVersioning.proto */
+#if CYTHON_USE_DICT_VERSIONS && CYTHON_USE_TYPE_SLOTS
+#define __PYX_DICT_VERSION_INIT  ((PY_UINT64_T) -1)
+#define __PYX_GET_DICT_VERSION(dict)  (((PyDictObject*)(dict))->ma_version_tag)
+#define __PYX_UPDATE_DICT_CACHE(dict, value, cache_var, version_var)\
+    (version_var) = __PYX_GET_DICT_VERSION(dict);\
+    (cache_var) = (value);
+#define __PYX_PY_DICT_LOOKUP_IF_MODIFIED(VAR, DICT, LOOKUP) {\
+    static PY_UINT64_T __pyx_dict_version = 0;\
+    static PyObject *__pyx_dict_cached_value = NULL;\
+    if (likely(__PYX_GET_DICT_VERSION(DICT) == __pyx_dict_version)) {\
+        (VAR) = __pyx_dict_cached_value;\
+    } else {\
+        (VAR) = __pyx_dict_cached_value = (LOOKUP);\
+        __pyx_dict_version = __PYX_GET_DICT_VERSION(DICT);\
+    }\
+}
+static CYTHON_INLINE PY_UINT64_T __Pyx_get_tp_dict_version(PyObject *obj);
+static CYTHON_INLINE PY_UINT64_T __Pyx_get_object_dict_version(PyObject *obj);
+static CYTHON_INLINE int __Pyx_object_dict_version_matches(PyObject* obj, PY_UINT64_T tp_dict_version, PY_UINT64_T obj_dict_version);
+#else
+#define __PYX_GET_DICT_VERSION(dict)  (0)
+#define __PYX_UPDATE_DICT_CACHE(dict, value, cache_var, version_var)
+#define __PYX_PY_DICT_LOOKUP_IF_MODIFIED(VAR, DICT, LOOKUP)  (VAR) = (LOOKUP);
+#endif
+
+/* GetModuleGlobalName.proto */
+#if CYTHON_USE_DICT_VERSIONS
+#define __Pyx_GetModuleGlobalName(var, name)  do {\
+    static PY_UINT64_T __pyx_dict_version = 0;\
+    static PyObject *__pyx_dict_cached_value = NULL;\
+    (var) = (likely(__pyx_dict_version == __PYX_GET_DICT_VERSION(__pyx_d))) ?\
+        (likely(__pyx_dict_cached_value) ? __Pyx_NewRef(__pyx_dict_cached_value) : __Pyx_GetBuiltinName(name)) :\
+        __Pyx__GetModuleGlobalName(name, &__pyx_dict_version, &__pyx_dict_cached_value);\
+} while(0)
+#define __Pyx_GetModuleGlobalNameUncached(var, name)  do {\
+    PY_UINT64_T __pyx_dict_version;\
+    PyObject *__pyx_dict_cached_value;\
+    (var) = __Pyx__GetModuleGlobalName(name, &__pyx_dict_version, &__pyx_dict_cached_value);\
+} while(0)
+static PyObject *__Pyx__GetModuleGlobalName(PyObject *name, PY_UINT64_T *dict_version, PyObject **dict_cached_value);
+#else
+#define __Pyx_GetModuleGlobalName(var, name)  (var) = __Pyx__GetModuleGlobalName(name)
+#define __Pyx_GetModuleGlobalNameUncached(var, name)  (var) = __Pyx__GetModuleGlobalName(name)
+static CYTHON_INLINE PyObject *__Pyx__GetModuleGlobalName(PyObject *name);
+#endif
+
+/* AssertionsEnabled.proto */
+#if CYTHON_COMPILING_IN_PYPY && PY_VERSION_HEX < 0x02070600 && !defined(Py_OptimizeFlag)  /* PyPy older than 2.7.6 with no Py_OptimizeFlag defined */
+  #define __Pyx_init_assertions_enabled()  (0)  /* nothing to initialise; evaluates to 0 like a successful init */
+  #define __pyx_assertions_enabled()  (1)  /* assertions are unconditionally treated as enabled */
+#elif CYTHON_COMPILING_IN_LIMITED_API  ||  (CYTHON_COMPILING_IN_CPYTHON && PY_VERSION_HEX >= 0x030C0000)  /* Limited API, or CPython 3.12+: look up __debug__ once and cache it */
+  static int __pyx_assertions_enabled_flag;  /* cache written by __Pyx_init_assertions_enabled() */
+  #define __pyx_assertions_enabled() (__pyx_assertions_enabled_flag)
+  static int __Pyx_init_assertions_enabled(void) {  /* Caches the truth value of builtins["__debug__"]; returns 0 on success, -1 on failure. */
+    PyObject *builtins, *debug, *debug_str;
+    int flag;
+    builtins = PyEval_GetBuiltins();  /* borrowed reference per the C API docs, hence never DECREF'd here */
+    if (!builtins) goto bad;
+    debug_str = PyUnicode_FromStringAndSize("__debug__", 9);  /* 9 == strlen("__debug__") */
+    if (!debug_str) goto bad;
+    debug = PyObject_GetItem(builtins, debug_str);
+    Py_DECREF(debug_str);  /* key is released before the lookup result is checked, so it cannot leak on failure */
+    if (!debug) goto bad;
+    flag = PyObject_IsTrue(debug);
+    Py_DECREF(debug);
+    if (flag == -1) goto bad;  /* PyObject_IsTrue reports failure as -1 */
+    __pyx_assertions_enabled_flag = flag;
+    return 0;
+  bad:  /* any failure: leave assertions enabled and report the error to the caller */
+    __pyx_assertions_enabled_flag = 1;
+    return -1;
+  }
+#else  /* remaining configurations: consult Py_OptimizeFlag directly on every check */
+  #define __Pyx_init_assertions_enabled()  (0)
+  #define __pyx_assertions_enabled()  (!Py_OptimizeFlag)
+#endif
+
+/* RaiseTooManyValuesToUnpack.proto */
+static CYTHON_INLINE void __Pyx_RaiseTooManyValuesError(Py_ssize_t expected);
+
+/* RaiseNeedMoreValuesToUnpack.proto */
+static CYTHON_INLINE void __Pyx_RaiseNeedMoreValuesError(Py_ssize_t index);
+
+/* RaiseNoneIterError.proto */
+static CYTHON_INLINE void __Pyx_RaiseNoneNotIterableError(void);
+
+/* ExtTypeTest.proto */
+static CYTHON_INLINE int __Pyx_TypeTest(PyObject *obj, PyTypeObject *type);
+
+/* GetTopmostException.proto */
+#if CYTHON_USE_EXC_INFO_STACK && CYTHON_FAST_THREAD_STATE
+static _PyErr_StackItem * __Pyx_PyErr_GetTopmostException(PyThreadState *tstate);
+#endif
+
+/* SaveResetException.proto */
+#if CYTHON_FAST_THREAD_STATE
+#define __Pyx_ExceptionSave(type, value, tb)  __Pyx__ExceptionSave(__pyx_tstate, type, value, tb)
+static CYTHON_INLINE void __Pyx__ExceptionSave(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb);
+#define __Pyx_ExceptionReset(type, value, tb)  __Pyx__ExceptionReset(__pyx_tstate, type, value, tb)
+static CYTHON_INLINE void __Pyx__ExceptionReset(PyThreadState *tstate, PyObject *type, PyObject *value, PyObject *tb);
+#else
+#define __Pyx_ExceptionSave(type, value, tb)   PyErr_GetExcInfo(type, value, tb)
+#define __Pyx_ExceptionReset(type, value, tb)  PyErr_SetExcInfo(type, value, tb)
+#endif
+
+/* GetException.proto */
+#if CYTHON_FAST_THREAD_STATE
+#define __Pyx_GetException(type, value, tb)  __Pyx__GetException(__pyx_tstate, type, value, tb)
+static int __Pyx__GetException(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb);
+#else
+static int __Pyx_GetException(PyObject **type, PyObject **value, PyObject **tb);
+#endif
+
+/* SwapException.proto */
+#if CYTHON_FAST_THREAD_STATE
+#define __Pyx_ExceptionSwap(type, value, tb)  __Pyx__ExceptionSwap(__pyx_tstate, type, value, tb)
+static CYTHON_INLINE void __Pyx__ExceptionSwap(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb);
+#else
+static CYTHON_INLINE void __Pyx_ExceptionSwap(PyObject **type, PyObject **value, PyObject **tb);
+#endif
+
+/* Import.proto */
+static PyObject *__Pyx_Import(PyObject *name, PyObject *from_list, int level);
+
+/* ImportDottedModule.proto */
+static PyObject *__Pyx_ImportDottedModule(PyObject *name, PyObject *parts_tuple);
+#if PY_MAJOR_VERSION >= 3
+static PyObject *__Pyx_ImportDottedModule_WalkParts(PyObject *module, PyObject *name, PyObject *parts_tuple);
+#endif
+
+/* FastTypeChecks.proto */
+#if CYTHON_COMPILING_IN_CPYTHON
+#define __Pyx_TypeCheck(obj, type) __Pyx_IsSubtype(Py_TYPE(obj), (PyTypeObject *)type)
+#define __Pyx_TypeCheck2(obj, type1, type2) __Pyx_IsAnySubtype2(Py_TYPE(obj), (PyTypeObject *)type1, (PyTypeObject *)type2)
+static CYTHON_INLINE int __Pyx_IsSubtype(PyTypeObject *a, PyTypeObject *b);
+static CYTHON_INLINE int __Pyx_IsAnySubtype2(PyTypeObject *cls, PyTypeObject *a, PyTypeObject *b);
+static CYTHON_INLINE int __Pyx_PyErr_GivenExceptionMatches(PyObject *err, PyObject *type);
+static CYTHON_INLINE int __Pyx_PyErr_GivenExceptionMatches2(PyObject *err, PyObject *type1, PyObject *type2);
+#else
+#define __Pyx_TypeCheck(obj, type) PyObject_TypeCheck(obj, (PyTypeObject *)type)
+#define __Pyx_TypeCheck2(obj, type1, type2) (PyObject_TypeCheck(obj, (PyTypeObject *)type1) || PyObject_TypeCheck(obj, (PyTypeObject *)type2))
+#define __Pyx_PyErr_GivenExceptionMatches(err, type) PyErr_GivenExceptionMatches(err, type)
+#define __Pyx_PyErr_GivenExceptionMatches2(err, type1, type2) (PyErr_GivenExceptionMatches(err, type1) || PyErr_GivenExceptionMatches(err, type2))
+#endif
+#define __Pyx_PyErr_ExceptionMatches2(err1, err2)  __Pyx_PyErr_GivenExceptionMatches2(__Pyx_PyErr_CurrentExceptionType(), err1, err2)
+#define __Pyx_PyException_Check(obj) __Pyx_TypeCheck(obj, PyExc_Exception)
+
+CYTHON_UNUSED static int __pyx_memoryview_getbuffer(PyObject *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags); /*proto*/
+/* ListCompAppend.proto */
+#if CYTHON_USE_PYLIST_INTERNALS && CYTHON_ASSUME_SAFE_MACROS  /* inline fast path that touches PyListObject fields directly */
+static CYTHON_INLINE int __Pyx_ListComp_Append(PyObject* list, PyObject* x) {  /* Appends x to list; returns 0 on the fast path, otherwise whatever PyList_Append returns. */
+    PyListObject* L = (PyListObject*) list;
+    Py_ssize_t len = Py_SIZE(list);
+    if (likely(L->allocated > len)) {  /* a spare slot is already allocated, so no resize is needed */
+        Py_INCREF(x);  /* the list slot must own its reference; PyList_SET_ITEM steals one */
+        #if CYTHON_COMPILING_IN_CPYTHON && PY_VERSION_HEX >= 0x030d0000  /* CPython 3.13+: write the slot directly instead of using the macro */
+        L->ob_item[len] = x;
+        #else
+        PyList_SET_ITEM(list, len, x);
+        #endif
+        __Pyx_SET_SIZE(list, len + 1);  /* publish the new length after the slot has been filled */
+        return 0;
+    }
+    return PyList_Append(list, x);  /* slow path: no spare capacity, let CPython handle the append */
+}
+#else  /* no access to list internals: plain PyList_Append */
+#define __Pyx_ListComp_Append(L,x) PyList_Append(L,x)
+#endif
+
+/* PySequenceMultiply.proto */
+#define __Pyx_PySequence_Multiply_Left(mul, seq)  __Pyx_PySequence_Multiply(seq, mul)
+static CYTHON_INLINE PyObject* __Pyx_PySequence_Multiply(PyObject *seq, Py_ssize_t mul);
+
+/* SetItemInt.proto */
+#define __Pyx_SetItemInt(o, i, v, type, is_signed, to_py_func, is_list, wraparound, boundscheck)\
+    (__Pyx_fits_Py_ssize_t(i, type, is_signed) ?\
+    __Pyx_SetItemInt_Fast(o, (Py_ssize_t)i, v, is_list, wraparound, boundscheck) :\
+    (is_list ? (PyErr_SetString(PyExc_IndexError, "list assignment index out of range"), -1) :\
+               __Pyx_SetItemInt_Generic(o, to_py_func(i), v)))
+static int __Pyx_SetItemInt_Generic(PyObject *o, PyObject *j, PyObject *v);
+static CYTHON_INLINE int __Pyx_SetItemInt_Fast(PyObject *o, Py_ssize_t i, PyObject *v,
+                                               int is_list, int wraparound, int boundscheck);
+
+/* RaiseUnboundLocalError.proto */
+static CYTHON_INLINE void __Pyx_RaiseUnboundLocalError(const char *varname);
+
+/* DivInt[long].proto */
+static CYTHON_INLINE long __Pyx_div_long(long, long);
+
+/* PySequenceContains.proto */
+static CYTHON_INLINE int __Pyx_PySequence_ContainsTF(PyObject* item, PyObject* seq, int eq) {  /* Membership test with selectable sense: eq == Py_EQ asks "item in seq", any other eq asks the negation. */
+    int result = PySequence_Contains(seq, item);  /* note the (seq, item) argument order; negative result means error */
+    return unlikely(result < 0) ? result : (result == (eq == Py_EQ));  /* errors pass through unchanged; otherwise 1 if the outcome matches the requested sense, else 0 */
+}
+
+/* ImportFrom.proto */
+static PyObject* __Pyx_ImportFrom(PyObject* module, PyObject* name);
+
+/* HasAttr.proto */
+static CYTHON_INLINE int __Pyx_HasAttr(PyObject *, PyObject *);
+
+/* IsLittleEndian.proto */
+static CYTHON_INLINE int __Pyx_Is_Little_Endian(void);
+
+/* BufferFormatCheck.proto */
+static const char* __Pyx_BufFmt_CheckString(__Pyx_BufFmt_Context* ctx, const char* ts);
+static void __Pyx_BufFmt_Init(__Pyx_BufFmt_Context* ctx,
+                              __Pyx_BufFmt_StackElem* stack,
+                              __Pyx_TypeInfo* type);
+
+/* BufferGetAndValidate.proto */
+#define __Pyx_GetBufferAndValidate(buf, obj, dtype, flags, nd, cast, stack)\
+    ((obj == Py_None || obj == NULL) ?\
+    (__Pyx_ZeroBuffer(buf), 0) :\
+    __Pyx__GetBufferAndValidate(buf, obj, dtype, flags, nd, cast, stack))
+static int  __Pyx__GetBufferAndValidate(Py_buffer* buf, PyObject* obj,
+    __Pyx_TypeInfo* dtype, int flags, int nd, int cast, __Pyx_BufFmt_StackElem* stack);
+static void __Pyx_ZeroBuffer(Py_buffer* buf);
+static CYTHON_INLINE void __Pyx_SafeReleaseBuffer(Py_buffer* info);
+static Py_ssize_t __Pyx_minusones[] = { -1, -1, -1, -1, -1, -1, -1, -1 };
+static Py_ssize_t __Pyx_zeros[] = { 0, 0, 0, 0, 0, 0, 0, 0 };
+
+/* BufferFallbackError.proto */
+static void __Pyx_RaiseBufferFallbackError(void);
+
+/* ListAppend.proto */
+#if CYTHON_USE_PYLIST_INTERNALS && CYTHON_ASSUME_SAFE_MACROS  /* inline fast path that touches PyListObject fields directly */
+static CYTHON_INLINE int __Pyx_PyList_Append(PyObject* list, PyObject* x) {  /* Appends x to list; returns 0 on the fast path, otherwise whatever PyList_Append returns. */
+    PyListObject* L = (PyListObject*) list;
+    Py_ssize_t len = Py_SIZE(list);
+    if (likely(L->allocated > len) & likely(len > (L->allocated >> 1))) {  /* fast path only when a slot is free AND the list is more than half full; bitwise & evaluates both tests */
+        Py_INCREF(x);  /* the list slot must own its reference; PyList_SET_ITEM steals one */
+        #if CYTHON_COMPILING_IN_CPYTHON && PY_VERSION_HEX >= 0x030d0000  /* CPython 3.13+: write the slot directly instead of using the macro */
+        L->ob_item[len] = x;
+        #else
+        PyList_SET_ITEM(list, len, x);
+        #endif
+        __Pyx_SET_SIZE(list, len + 1);  /* publish the new length after the slot has been filled */
+        return 0;
+    }
+    return PyList_Append(list, x);  /* slow path: full, or at most half full -- NOTE(review): presumably so CPython's own resize policy applies; confirm */
+}
+#else  /* no access to list internals: plain PyList_Append */
+#define __Pyx_PyList_Append(L,x) PyList_Append(L,x)
+#endif
+
+/* PyIntBinop.proto */
+#if !CYTHON_COMPILING_IN_PYPY
+static PyObject* __Pyx_PyInt_SubtractObjC(PyObject *op1, PyObject *op2, long intval, int inplace, int zerodivision_check);
+#else
+#define __Pyx_PyInt_SubtractObjC(op1, op2, intval, inplace, zerodivision_check)\
+    (inplace ? PyNumber_InPlaceSubtract(op1, op2) : PyNumber_Subtract(op1, op2))
+#endif
+
+/* SliceObject.proto */
+static CYTHON_INLINE PyObject* __Pyx_PyObject_GetSlice(
+        PyObject* obj, Py_ssize_t cstart, Py_ssize_t cstop,
+        PyObject** py_start, PyObject** py_stop, PyObject** py_slice,
+        int has_cstart, int has_cstop, int wraparound);
+
+/* PyObject_GenericGetAttrNoDict.proto */
+#if CYTHON_USE_TYPE_SLOTS && CYTHON_USE_PYTYPE_LOOKUP && PY_VERSION_HEX < 0x03070000
+static CYTHON_INLINE PyObject* __Pyx_PyObject_GenericGetAttrNoDict(PyObject* obj, PyObject* attr_name);
+#else
+#define __Pyx_PyObject_GenericGetAttrNoDict PyObject_GenericGetAttr
+#endif
+
+/* PyObject_GenericGetAttr.proto */
+#if CYTHON_USE_TYPE_SLOTS && CYTHON_USE_PYTYPE_LOOKUP && PY_VERSION_HEX < 0x03070000
+static PyObject* __Pyx_PyObject_GenericGetAttr(PyObject* obj, PyObject* attr_name);
+#else
+#define __Pyx_PyObject_GenericGetAttr PyObject_GenericGetAttr
+#endif
+
+/* IncludeStructmemberH.proto */
+#include <structmember.h>
+
+/* FixUpExtensionType.proto */
+#if CYTHON_USE_TYPE_SPECS
+static int __Pyx_fix_up_extension_type_from_spec(PyType_Spec *spec, PyTypeObject *type);
+#endif
+
+/* PyObjectCallNoArg.proto */
+static CYTHON_INLINE PyObject* __Pyx_PyObject_CallNoArg(PyObject *func);
+
+/* PyObjectGetMethod.proto */
+static int __Pyx_PyObject_GetMethod(PyObject *obj, PyObject *name, PyObject **method);
+
+/* PyObjectCallMethod0.proto */
+static PyObject* __Pyx_PyObject_CallMethod0(PyObject* obj, PyObject* method_name);
+
+/* ValidateBasesTuple.proto */
+#if CYTHON_COMPILING_IN_CPYTHON || CYTHON_COMPILING_IN_LIMITED_API || CYTHON_USE_TYPE_SPECS
+static int __Pyx_validate_bases_tuple(const char *type_name, Py_ssize_t dictoffset, PyObject *bases);
+#endif
+
+/* PyType_Ready.proto */
+CYTHON_UNUSED static int __Pyx_PyType_Ready(PyTypeObject *t);
+
+/* SetVTable.proto */
+static int __Pyx_SetVtable(PyTypeObject* typeptr , void* vtable);
+
+/* GetVTable.proto */
+static void* __Pyx_GetVtable(PyTypeObject *type);
+
+/* MergeVTables.proto */
+#if !CYTHON_COMPILING_IN_LIMITED_API
+static int __Pyx_MergeVtables(PyTypeObject *type);
+#endif
+
+/* SetupReduce.proto */
+#if !CYTHON_COMPILING_IN_LIMITED_API
+static int __Pyx_setup_reduce(PyObject* type_obj);
+#endif
+
+/* TypeImport.proto */
+#ifndef __PYX_HAVE_RT_ImportType_proto_3_0_12  /* guard so these declarations are emitted only once per translation unit */
+#define __PYX_HAVE_RT_ImportType_proto_3_0_12
+#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 201112L  /* C11 or later: pull in the alignof macro */
+#include <stdalign.h>
+#endif
+#if (defined (__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) || __cplusplus >= 201103L  /* C11 or C++11: alignof is available */
+#define __PYX_GET_STRUCT_ALIGNMENT_3_0_12(s) alignof(s)
+#else
+#define __PYX_GET_STRUCT_ALIGNMENT_3_0_12(s) sizeof(void*)  /* fallback without alignof: use the size of a pointer */
+#endif
+enum __Pyx_ImportType_CheckSize_3_0_12 {  /* type of the check_size parameter of __Pyx_ImportType_3_0_12() declared below */
+   __Pyx_ImportType_CheckSize_Error_3_0_12 = 0,  /* NOTE(review): names suggest error / warn / ignore handling of a size mismatch; implementation is outside this view */
+   __Pyx_ImportType_CheckSize_Warn_3_0_12 = 1,
+   __Pyx_ImportType_CheckSize_Ignore_3_0_12 = 2
+};
+static PyTypeObject *__Pyx_ImportType_3_0_12(PyObject* module, const char *module_name, const char *class_name, size_t size, size_t alignment, enum __Pyx_ImportType_CheckSize_3_0_12 check_size);
+#endif
+
+/* FetchSharedCythonModule.proto */
+static PyObject *__Pyx_FetchSharedCythonABIModule(void);
+
+/* FetchCommonType.proto */
+#if !CYTHON_USE_TYPE_SPECS
+static PyTypeObject* __Pyx_FetchCommonType(PyTypeObject* type);
+#else
+static PyTypeObject* __Pyx_FetchCommonTypeFromSpec(PyObject *module, PyType_Spec *spec, PyObject *bases);
+#endif
+
+/* PyMethodNew.proto */
+#if CYTHON_COMPILING_IN_LIMITED_API  /* Limited API variant: builds the bound method by calling types.MethodType */
+static PyObject *__Pyx_PyMethod_New(PyObject *func, PyObject *self, PyObject *typ) {  /* Returns a new reference: func itself when self is NULL, else MethodType(func, self); NULL on failure. */
+    PyObject *typesModule=NULL, *methodType=NULL, *result=NULL;
+    CYTHON_UNUSED_VAR(typ);  /* typ is accepted but not used by this variant */
+    if (!self)
+        return __Pyx_NewRef(func);  /* nothing to bind to: hand back func */
+    typesModule = PyImport_ImportModule("types");
+    if (!typesModule) return NULL;
+    methodType = PyObject_GetAttrString(typesModule, "MethodType");
+    Py_DECREF(typesModule);  /* module reference is dropped whether or not the attribute lookup succeeded */
+    if (!methodType) return NULL;
+    result = PyObject_CallFunctionObjArgs(methodType, func, self, NULL);  /* trailing NULL terminates the argument list */
+    Py_DECREF(methodType);
+    return result;  /* NULL here if the call failed */
+}
+#elif PY_MAJOR_VERSION >= 3  /* Python 3 variant: two-argument PyMethod_New */
+static PyObject *__Pyx_PyMethod_New(PyObject *func, PyObject *self, PyObject *typ) {  /* Returns func itself (new reference) when self is NULL, else PyMethod_New(func, self). */
+    CYTHON_UNUSED_VAR(typ);  /* typ is accepted but not used by this variant */
+    if (!self)
+        return __Pyx_NewRef(func);  /* nothing to bind to: hand back func */
+    return PyMethod_New(func, self);
+}
+#else  /* Python 2: PyMethod_New is used directly, including its third (typ) argument */
+    #define __Pyx_PyMethod_New PyMethod_New
+#endif
+
+/* PyVectorcallFastCallDict.proto */
+#if CYTHON_METH_FASTCALL
+static CYTHON_INLINE PyObject *__Pyx_PyVectorcall_FastCallDict(PyObject *func, __pyx_vectorcallfunc vc, PyObject *const *args, size_t nargs, PyObject *kw);
+#endif
+
+/* CythonFunctionShared.proto */
+#define __Pyx_CyFunction_USED
+#define __Pyx_CYFUNCTION_STATICMETHOD  0x01
+#define __Pyx_CYFUNCTION_CLASSMETHOD   0x02
+#define __Pyx_CYFUNCTION_CCLASS        0x04
+#define __Pyx_CYFUNCTION_COROUTINE     0x08
+#define __Pyx_CyFunction_GetClosure(f)\
+    (((__pyx_CyFunctionObject *) (f))->func_closure)
+#if PY_VERSION_HEX < 0x030900B1 || CYTHON_COMPILING_IN_LIMITED_API
+  #define __Pyx_CyFunction_GetClassObj(f)\
+      (((__pyx_CyFunctionObject *) (f))->func_classobj)
+#else
+  #define __Pyx_CyFunction_GetClassObj(f)\
+      ((PyObject*) ((PyCMethodObject *) (f))->mm_class)
+#endif
+#define __Pyx_CyFunction_SetClassObj(f, classobj)\
+    __Pyx__CyFunction_SetClassObj((__pyx_CyFunctionObject *) (f), (classobj))
+#define __Pyx_CyFunction_Defaults(type, f)\
+    ((type *)(((__pyx_CyFunctionObject *) (f))->defaults))
+#define __Pyx_CyFunction_SetDefaultsGetter(f, g)\
+    ((__pyx_CyFunctionObject *) (f))->defaults_getter = (g)
+typedef struct {  /* Instance layout of the Cython function type; the first member differs per build configuration. */
+#if CYTHON_COMPILING_IN_LIMITED_API  /* Limited API: plain object header plus a PyObject* func member */
+    PyObject_HEAD
+    PyObject *func;
+#elif PY_VERSION_HEX < 0x030900B1  /* before 3.9.0b1: embeds a PyCFunctionObject as first member */
+    PyCFunctionObject func;
+#else  /* 3.9.0b1 and later: embeds a PyCMethodObject, whose mm_class is what __Pyx_CyFunction_GetClassObj() reads */
+    PyCMethodObject func;
+#endif
+#if CYTHON_BACKPORT_VECTORCALL  /* slot read by __Pyx_CyFunction_func_vectorcall() in this configuration */
+    __pyx_vectorcallfunc func_vectorcall;
+#endif
+#if PY_VERSION_HEX < 0x030500A0 || CYTHON_COMPILING_IN_LIMITED_API  /* only present on pre-3.5 or Limited API builds */
+    PyObject *func_weakreflist;
+#endif
+    PyObject *func_dict;
+    PyObject *func_name;
+    PyObject *func_qualname;
+    PyObject *func_doc;
+    PyObject *func_globals;
+    PyObject *func_code;
+    PyObject *func_closure;  /* read by __Pyx_CyFunction_GetClosure() */
+#if PY_VERSION_HEX < 0x030900B1 || CYTHON_COMPILING_IN_LIMITED_API  /* same condition under which __Pyx_CyFunction_GetClassObj() reads this field */
+    PyObject *func_classobj;
+#endif
+    void *defaults;  /* cast to a caller-supplied struct type by __Pyx_CyFunction_Defaults() */
+    int defaults_pyobjects;  /* NOTE(review): presumably the 'pyobjects' value given to __Pyx_CyFunction_InitDefaults(); confirm */
+    size_t defaults_size;  /* NOTE(review): presumably the 'size' value given to __Pyx_CyFunction_InitDefaults(); confirm */
+    int flags;  /* NOTE(review): presumably a mask of the __Pyx_CYFUNCTION_* bits defined above; confirm */
+    PyObject *defaults_tuple;
+    PyObject *defaults_kwdict;
+    PyObject *(*defaults_getter)(PyObject *);  /* assigned by __Pyx_CyFunction_SetDefaultsGetter() */
+    PyObject *func_annotations;
+    PyObject *func_is_coroutine;
+} __pyx_CyFunctionObject;
+#undef __Pyx_CyOrPyCFunction_Check  /* Dropped so the definition two lines below can replace it. */
+#define __Pyx_CyFunction_Check(obj)  __Pyx_TypeCheck(obj, __pyx_CyFunctionType)
+#define __Pyx_CyOrPyCFunction_Check(obj)  __Pyx_TypeCheck2(obj, __pyx_CyFunctionType, &PyCFunction_Type)  /* Checks against both __pyx_CyFunctionType and PyCFunction_Type. */
+#define __Pyx_CyFunction_CheckExact(obj)  __Pyx_IS_TYPE(obj, __pyx_CyFunctionType)
+static CYTHON_INLINE int __Pyx__IsSameCyOrCFunction(PyObject *func, void *cfunc);
+#undef __Pyx_IsSameCFunction  /* Re-pointed at __Pyx__IsSameCyOrCFunction() on the next line. */
+#define __Pyx_IsSameCFunction(func, cfunc)   __Pyx__IsSameCyOrCFunction(func, cfunc)
+static PyObject *__Pyx_CyFunction_Init(__pyx_CyFunctionObject* op, PyMethodDef *ml,
+                                      int flags, PyObject* qualname,
+                                      PyObject *closure,
+                                      PyObject *module, PyObject *globals,
+                                      PyObject* code);
+static CYTHON_INLINE void __Pyx__CyFunction_SetClassObj(__pyx_CyFunctionObject* f, PyObject* classobj);
+static CYTHON_INLINE void *__Pyx_CyFunction_InitDefaults(PyObject *m,
+                                                         size_t size,
+                                                         int pyobjects);
+static CYTHON_INLINE void __Pyx_CyFunction_SetDefaultsTuple(PyObject *m,
+                                                            PyObject *tuple);
+static CYTHON_INLINE void __Pyx_CyFunction_SetDefaultsKwDict(PyObject *m,
+                                                             PyObject *dict);
+static CYTHON_INLINE void __Pyx_CyFunction_SetAnnotationsDict(PyObject *m,
+                                                              PyObject *dict);
+static int __pyx_CyFunction_init(PyObject *module);
+#if CYTHON_METH_FASTCALL  /* Vectorcall entry points; each suffix presumably names the calling convention it serves -- TODO confirm. */
+static PyObject * __Pyx_CyFunction_Vectorcall_NOARGS(PyObject *func, PyObject *const *args, size_t nargsf, PyObject *kwnames);
+static PyObject * __Pyx_CyFunction_Vectorcall_O(PyObject *func, PyObject *const *args, size_t nargsf, PyObject *kwnames);
+static PyObject * __Pyx_CyFunction_Vectorcall_FASTCALL_KEYWORDS(PyObject *func, PyObject *const *args, size_t nargsf, PyObject *kwnames);
+static PyObject * __Pyx_CyFunction_Vectorcall_FASTCALL_KEYWORDS_METHOD(PyObject *func, PyObject *const *args, size_t nargsf, PyObject *kwnames);
+#if CYTHON_BACKPORT_VECTORCALL  /* Backport: the pointer lives in the func_vectorcall member of __pyx_CyFunctionObject. */
+#define __Pyx_CyFunction_func_vectorcall(f) (((__pyx_CyFunctionObject*)f)->func_vectorcall)
+#else  /* Otherwise: the pointer is the "vectorcall" member reached through a PyCFunctionObject cast. */
+#define __Pyx_CyFunction_func_vectorcall(f) (((PyCFunctionObject*)f)->vectorcall)
+#endif
+#endif
+
+/* CythonFunction.proto: __Pyx_CyFunction_New takes the argument list of __Pyx_CyFunction_Init without the leading object pointer. */
+static PyObject *__Pyx_CyFunction_New(PyMethodDef *ml,
+                                      int flags, PyObject* qualname,
+                                      PyObject *closure,
+                                      PyObject *module, PyObject *globals,
+                                      PyObject* code);
+
+/* CLineInTraceback.proto: picks the C line number passed on for tracebacks, either at compile time (macro) or at run time (function). */
+#ifdef CYTHON_CLINE_IN_TRACEBACK
+#define __Pyx_CLineForTraceback(tstate, c_line)  (((CYTHON_CLINE_IN_TRACEBACK)) ? c_line : 0)  /* tstate is unused here; yields c_line or 0. */
+#else
+static int __Pyx_CLineForTraceback(PyThreadState *tstate, int c_line);  /* Macro not defined: the decision is made by this function (body not in this section). */
+#endif
+
+/* CodeObjectCache.proto: file-local cache that associates an int code_line with a PyCodeObject. */
+#if !CYTHON_COMPILING_IN_LIMITED_API  /* The whole cache is compiled out under the Limited API. */
+typedef struct {
+    PyCodeObject* code_object;
+    int code_line;  /* Lookup key used by the find/insert helpers declared below. */
+} __Pyx_CodeObjectCacheEntry;
+struct __Pyx_CodeObjectCache {
+    int count;  /* NOTE(review): presumably the number of entries in use -- confirm against the implementation. */
+    int max_count;  /* NOTE(review): presumably the allocated capacity of "entries" -- confirm. */
+    __Pyx_CodeObjectCacheEntry* entries;
+};
+static struct __Pyx_CodeObjectCache __pyx_code_cache = {0,0,NULL};  /* Starts empty: both counters 0, no entry array. */
+static int __pyx_bisect_code_objects(__Pyx_CodeObjectCacheEntry* entries, int count, int code_line);  /* Name suggests a binary search, i.e. entries kept ordered by code_line -- TODO confirm. */
+static PyCodeObject *__pyx_find_code_object(int code_line);
+static void __pyx_insert_code_object(int code_line, PyCodeObject* code_object);
+#endif
+
+/* AddTraceback.proto */
+static void __Pyx_AddTraceback(const char *funcname, int c_line,
+                               int py_line, const char *filename);
+
+#if PY_MAJOR_VERSION < 3  /* Python 2: buffer acquire/release go through module-local functions. */
+    static int __Pyx_GetBuffer(PyObject *obj, Py_buffer *view, int flags);
+    static void __Pyx_ReleaseBuffer(Py_buffer *view);
+#else  /* Python 3: the names are plain aliases for the CPython buffer API. */
+    #define __Pyx_GetBuffer PyObject_GetBuffer
+    #define __Pyx_ReleaseBuffer PyBuffer_Release
+#endif
+
+
+/* BufferStructDeclare.proto: helper structs wrapping a Py_buffer together with per-dimension metadata. */
+typedef struct {
+  Py_ssize_t shape, strides, suboffsets;  /* The three values kept for a single dimension. */
+} __Pyx_Buf_DimInfo;
+typedef struct {
+  size_t refcount;  /* NOTE(review): presumably counts users of the wrapped Py_buffer -- confirm. */
+  Py_buffer pybuffer;
+} __Pyx_Buffer;
+typedef struct {
+  __Pyx_Buffer *rcbuffer;  /* Points at a __Pyx_Buffer; the struct itself holds only the pointer. */
+  char *data;
+  __Pyx_Buf_DimInfo diminfo[8];  /* Fixed capacity: room for at most 8 dimensions. */
+} __Pyx_LocalBuf_ND;
+
+/* MemviewSliceIsContig.proto */
+static int __pyx_memviewslice_is_contig(const __Pyx_memviewslice mvs, char order, int ndim);
+
+/* OverlappingSlices.proto */
+static int __pyx_slices_overlap(__Pyx_memviewslice *slice1,
+                                __Pyx_memviewslice *slice2,
+                                int ndim, size_t itemsize);
+
+/* MemviewDtypeToObject.proto */
+static CYTHON_INLINE PyObject *__pyx_memview_get_nn___pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t(const char *itemp);
+static CYTHON_INLINE int __pyx_memview_set_nn___pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t(const char *itemp, PyObject *obj);
+
+/* TypeInfoCompare.proto */
+static int __pyx_typeinfo_cmp(__Pyx_TypeInfo *a, __Pyx_TypeInfo *b);
+
+/* MemviewSliceValidateAndInit.proto */
+static int __Pyx_ValidateAndInit_memviewslice(
+                int *axes_specs,
+                int c_or_f_flag,
+                int buf_flags,
+                int ndim,
+                __Pyx_TypeInfo *dtype,
+                __Pyx_BufFmt_StackElem stack[],
+                __Pyx_memviewslice *memviewslice,
+                PyObject *original_obj);
+
+/* ObjectToMemviewSlice.proto */
+static CYTHON_INLINE __Pyx_memviewslice __Pyx_PyObject_to_MemoryviewSlice_ds_nn___pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t(PyObject *, int writable_flag);
+
+/* ObjectToMemviewSlice.proto */
+static CYTHON_INLINE __Pyx_memviewslice __Pyx_PyObject_to_MemoryviewSlice_dsds_nn___pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t(PyObject *, int writable_flag);
+
+/* RealImag.proto: accessors and setters for the real/imaginary parts of a complex value, chosen per compiler mode. */
+#if CYTHON_CCOMPLEX
+  #ifdef __cplusplus  /* C++: parts are read through the real()/imag() member functions. */
+    #define __Pyx_CREAL(z) ((z).real())
+    #define __Pyx_CIMAG(z) ((z).imag())
+  #else  /* C: parts are read with the __real__/__imag__ operators. */
+    #define __Pyx_CREAL(z) (__real__(z))
+    #define __Pyx_CIMAG(z) (__imag__(z))
+  #endif
+#else  /* No compiler-provided complex type: parts are plain struct fields. */
+    #define __Pyx_CREAL(z) ((z).real)
+    #define __Pyx_CIMAG(z) ((z).imag)
+#endif
+#if defined(__cplusplus) && CYTHON_CCOMPLEX\
+        && (defined(_WIN32) || defined(__clang__) || (defined(__GNUC__) && (__GNUC__ >= 5 || __GNUC__ == 4 && __GNUC_MINOR__ >= 4 )) || __cplusplus >= 201103)
+    #define __Pyx_SET_CREAL(z,x) ((z).real(x))  /* Setter form: real(x)/imag(y) called with the new value. */
+    #define __Pyx_SET_CIMAG(z,y) ((z).imag(y))
+#else
+    #define __Pyx_SET_CREAL(z,x) __Pyx_CREAL(z) = (x)  /* Fallback: assigns through the accessor macro; note the expansion is an unparenthesised assignment. */
+    #define __Pyx_SET_CIMAG(z,y) __Pyx_CIMAG(z) = (y)
+#endif
+
+/* Arithmetic.proto (float complex): operator macros when CYTHON_CCOMPLEX is set, function prototypes otherwise. */
+#if CYTHON_CCOMPLEX && (1) && (!0 || __cplusplus)  /* "(1) && (!0 || ...)" is always true, so CYTHON_CCOMPLEX alone decides. */
+    #define __Pyx_c_eq_float(a, b)   ((a)==(b))
+    #define __Pyx_c_sum_float(a, b)  ((a)+(b))
+    #define __Pyx_c_diff_float(a, b) ((a)-(b))
+    #define __Pyx_c_prod_float(a, b) ((a)*(b))
+    #define __Pyx_c_quot_float(a, b) ((a)/(b))
+    #define __Pyx_c_neg_float(a)     (-(a))
+  #ifdef __cplusplus  /* C++: conj/abs/pow resolve to the ::std overloads. */
+    #define __Pyx_c_is_zero_float(z) ((z)==(float)0)
+    #define __Pyx_c_conj_float(z)    (::std::conj(z))
+    #if 1
+        #define __Pyx_c_abs_float(z)     (::std::abs(z))
+        #define __Pyx_c_pow_float(a, b)  (::std::pow(a, b))
+    #endif
+  #else  /* C: uses the float-suffixed functions conjf/cabsf/cpowf. */
+    #define __Pyx_c_is_zero_float(z) ((z)==0)
+    #define __Pyx_c_conj_float(z)    (conjf(z))
+    #if 1
+        #define __Pyx_c_abs_float(z)     (cabsf(z))
+        #define __Pyx_c_pow_float(a, b)  (cpowf(a, b))
+    #endif
+ #endif
+#else  /* Same operation set, declared as functions over __pyx_t_float_complex. */
+    static CYTHON_INLINE int __Pyx_c_eq_float(__pyx_t_float_complex, __pyx_t_float_complex);
+    static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_sum_float(__pyx_t_float_complex, __pyx_t_float_complex);
+    static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_diff_float(__pyx_t_float_complex, __pyx_t_float_complex);
+    static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_prod_float(__pyx_t_float_complex, __pyx_t_float_complex);
+    static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_quot_float(__pyx_t_float_complex, __pyx_t_float_complex);
+    static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_neg_float(__pyx_t_float_complex);
+    static CYTHON_INLINE int __Pyx_c_is_zero_float(__pyx_t_float_complex);
+    static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_conj_float(__pyx_t_float_complex);
+    #if 1
+        static CYTHON_INLINE float __Pyx_c_abs_float(__pyx_t_float_complex);
+        static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_pow_float(__pyx_t_float_complex, __pyx_t_float_complex);
+    #endif
+#endif
+
+/* Arithmetic.proto (double complex): operator macros when CYTHON_CCOMPLEX is set, function prototypes otherwise. */
+#if CYTHON_CCOMPLEX && (1) && (!0 || __cplusplus)  /* "(1) && (!0 || ...)" is always true, so CYTHON_CCOMPLEX alone decides. */
+    #define __Pyx_c_eq_double(a, b)   ((a)==(b))
+    #define __Pyx_c_sum_double(a, b)  ((a)+(b))
+    #define __Pyx_c_diff_double(a, b) ((a)-(b))
+    #define __Pyx_c_prod_double(a, b) ((a)*(b))
+    #define __Pyx_c_quot_double(a, b) ((a)/(b))
+    #define __Pyx_c_neg_double(a)     (-(a))
+  #ifdef __cplusplus  /* C++: conj/abs/pow resolve to the ::std overloads. */
+    #define __Pyx_c_is_zero_double(z) ((z)==(double)0)
+    #define __Pyx_c_conj_double(z)    (::std::conj(z))
+    #if 1
+        #define __Pyx_c_abs_double(z)     (::std::abs(z))
+        #define __Pyx_c_pow_double(a, b)  (::std::pow(a, b))
+    #endif
+  #else  /* C: uses the unsuffixed functions conj/cabs/cpow. */
+    #define __Pyx_c_is_zero_double(z) ((z)==0)
+    #define __Pyx_c_conj_double(z)    (conj(z))
+    #if 1
+        #define __Pyx_c_abs_double(z)     (cabs(z))
+        #define __Pyx_c_pow_double(a, b)  (cpow(a, b))
+    #endif
+ #endif
+#else  /* Same operation set, declared as functions over __pyx_t_double_complex. */
+    static CYTHON_INLINE int __Pyx_c_eq_double(__pyx_t_double_complex, __pyx_t_double_complex);
+    static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_sum_double(__pyx_t_double_complex, __pyx_t_double_complex);
+    static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_diff_double(__pyx_t_double_complex, __pyx_t_double_complex);
+    static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_prod_double(__pyx_t_double_complex, __pyx_t_double_complex);
+    static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_quot_double(__pyx_t_double_complex, __pyx_t_double_complex);
+    static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_neg_double(__pyx_t_double_complex);
+    static CYTHON_INLINE int __Pyx_c_is_zero_double(__pyx_t_double_complex);
+    static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_conj_double(__pyx_t_double_complex);
+    #if 1
+        static CYTHON_INLINE double __Pyx_c_abs_double(__pyx_t_double_complex);
+        static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_pow_double(__pyx_t_double_complex, __pyx_t_double_complex);
+    #endif
+#endif
+
+/* Arithmetic.proto (long double complex): operator macros when CYTHON_CCOMPLEX is set, function prototypes otherwise. */
+#if CYTHON_CCOMPLEX && (1) && (!0 || __cplusplus)  /* "(1) && (!0 || ...)" is always true, so CYTHON_CCOMPLEX alone decides. */
+    #define __Pyx_c_eq_long__double(a, b)   ((a)==(b))
+    #define __Pyx_c_sum_long__double(a, b)  ((a)+(b))
+    #define __Pyx_c_diff_long__double(a, b) ((a)-(b))
+    #define __Pyx_c_prod_long__double(a, b) ((a)*(b))
+    #define __Pyx_c_quot_long__double(a, b) ((a)/(b))
+    #define __Pyx_c_neg_long__double(a)     (-(a))
+  #ifdef __cplusplus  /* C++: conj/abs/pow resolve to the ::std overloads. */
+    #define __Pyx_c_is_zero_long__double(z) ((z)==(long double)0)
+    #define __Pyx_c_conj_long__double(z)    (::std::conj(z))
+    #if 1
+        #define __Pyx_c_abs_long__double(z)     (::std::abs(z))
+        #define __Pyx_c_pow_long__double(a, b)  (::std::pow(a, b))
+    #endif
+  #else  /* C: uses the long-double-suffixed functions conjl/cabsl/cpowl. */
+    #define __Pyx_c_is_zero_long__double(z) ((z)==0)
+    #define __Pyx_c_conj_long__double(z)    (conjl(z))
+    #if 1
+        #define __Pyx_c_abs_long__double(z)     (cabsl(z))
+        #define __Pyx_c_pow_long__double(a, b)  (cpowl(a, b))
+    #endif
+ #endif
+#else  /* Same operation set, declared as functions over __pyx_t_long_double_complex. */
+    static CYTHON_INLINE int __Pyx_c_eq_long__double(__pyx_t_long_double_complex, __pyx_t_long_double_complex);
+    static CYTHON_INLINE __pyx_t_long_double_complex __Pyx_c_sum_long__double(__pyx_t_long_double_complex, __pyx_t_long_double_complex);
+    static CYTHON_INLINE __pyx_t_long_double_complex __Pyx_c_diff_long__double(__pyx_t_long_double_complex, __pyx_t_long_double_complex);
+    static CYTHON_INLINE __pyx_t_long_double_complex __Pyx_c_prod_long__double(__pyx_t_long_double_complex, __pyx_t_long_double_complex);
+    static CYTHON_INLINE __pyx_t_long_double_complex __Pyx_c_quot_long__double(__pyx_t_long_double_complex, __pyx_t_long_double_complex);
+    static CYTHON_INLINE __pyx_t_long_double_complex __Pyx_c_neg_long__double(__pyx_t_long_double_complex);
+    static CYTHON_INLINE int __Pyx_c_is_zero_long__double(__pyx_t_long_double_complex);
+    static CYTHON_INLINE __pyx_t_long_double_complex __Pyx_c_conj_long__double(__pyx_t_long_double_complex);
+    #if 1
+        static CYTHON_INLINE long double __Pyx_c_abs_long__double(__pyx_t_long_double_complex);
+        static CYTHON_INLINE __pyx_t_long_double_complex __Pyx_c_pow_long__double(__pyx_t_long_double_complex, __pyx_t_long_double_complex);
+    #endif
+#endif
+
+/* MemviewSliceCopyTemplate.proto */
+static __Pyx_memviewslice
+__pyx_memoryview_copy_new_contig(const __Pyx_memviewslice *from_mvs,
+                                 const char *mode, int ndim,
+                                 size_t sizeof_dtype, int contig_flag,
+                                 int dtype_is_object);
+
+/* MemviewSliceInit.proto: constants, prototypes and acquisition-count wrappers for memoryview slices. */
+#define __Pyx_BUF_MAX_NDIMS %(BUF_MAX_NDIMS)d  /* NOTE(review): the value is an unexpanded "%(BUF_MAX_NDIMS)d" template placeholder, so any expansion of this macro would not compile; presumably unused and emitted this way by the generator -- confirm. */
+#define __Pyx_MEMVIEW_DIRECT   1  /* The __Pyx_MEMVIEW_* values are distinct powers of two; presumably meant to be OR-ed as flags -- confirm. */
+#define __Pyx_MEMVIEW_PTR      2
+#define __Pyx_MEMVIEW_FULL     4
+#define __Pyx_MEMVIEW_CONTIG   8
+#define __Pyx_MEMVIEW_STRIDED  16
+#define __Pyx_MEMVIEW_FOLLOW   32
+#define __Pyx_IS_C_CONTIG 1
+#define __Pyx_IS_F_CONTIG 2
+static int __Pyx_init_memviewslice(
+                struct __pyx_memoryview_obj *memview,
+                int ndim,
+                __Pyx_memviewslice *memviewslice,
+                int memview_is_new_reference);
+static CYTHON_INLINE int __pyx_add_acquisition_count_locked(
+    __pyx_atomic_int_type *acquisition_count, PyThread_type_lock lock);
+static CYTHON_INLINE int __pyx_sub_acquisition_count_locked(
+    __pyx_atomic_int_type *acquisition_count, PyThread_type_lock lock);
+#define __pyx_get_slice_count_pointer(memview) (&memview->acquisition_count)  /* Address of the memoryview's acquisition_count member. */
+#define __PYX_INC_MEMVIEW(slice, have_gil) __Pyx_INC_MEMVIEW(slice, have_gil, __LINE__)  /* Wrapper that supplies the caller's __LINE__ as the third argument. */
+#define __PYX_XCLEAR_MEMVIEW(slice, have_gil) __Pyx_XCLEAR_MEMVIEW(slice, have_gil, __LINE__)  /* Wrapper that supplies the caller's __LINE__ as the third argument. */
+static CYTHON_INLINE void __Pyx_INC_MEMVIEW(__Pyx_memviewslice *, int, int);
+static CYTHON_INLINE void __Pyx_XCLEAR_MEMVIEW(__Pyx_memviewslice *, int, int);
+
+/* CIntToPy.proto */
+static CYTHON_INLINE PyObject* __Pyx_PyInt_From_npy_int64(npy_int64 value);
+
+/* CIntFromPy.proto */
+static CYTHON_INLINE npy_int64 __Pyx_PyInt_As_npy_int64(PyObject *);
+
+/* CIntFromPy.proto */
+static CYTHON_INLINE int __Pyx_PyInt_As_int(PyObject *);
+
+/* CIntFromPy.proto */
+static CYTHON_INLINE long __Pyx_PyInt_As_long(PyObject *);
+
+/* CIntToPy.proto */
+static CYTHON_INLINE PyObject* __Pyx_PyInt_From_long(long value);
+
+/* None.proto */
+#include <new>
+
+/* CIntToPy.proto */
+static CYTHON_INLINE PyObject* __Pyx_PyInt_From_int(int value);
+
+/* CIntFromPy.proto */
+static CYTHON_INLINE char __Pyx_PyInt_As_char(PyObject *);
+
+/* FormatTypeName.proto: how a type's name is obtained, formatted and released, per build mode. */
+#if CYTHON_COMPILING_IN_LIMITED_API
+typedef PyObject *__Pyx_TypeName;  /* Limited API: the name is a Python object. */
+#define __Pyx_FMT_TYPENAME "%U"
+static __Pyx_TypeName __Pyx_PyType_GetName(PyTypeObject* tp);
+#define __Pyx_DECREF_TypeName(obj) Py_XDECREF(obj)  /* The object form must be released after use. */
+#else
+typedef const char *__Pyx_TypeName;  /* Otherwise: the name is the C string read from tp_name. */
+#define __Pyx_FMT_TYPENAME "%.200s"
+#define __Pyx_PyType_GetName(tp) ((tp)->tp_name)
+#define __Pyx_DECREF_TypeName(obj)  /* Expands to nothing: a char pointer carries no reference to drop. */
+#endif
+
+/* CheckBinaryVersion.proto */
+static unsigned long __Pyx_get_runtime_version(void);
+static int __Pyx_check_binary_version(unsigned long ct_version, unsigned long rt_version, int allow_newer);
+
+/* InitStrings.proto */
+static int __Pyx_InitStrings(__Pyx_StringTabEntry *t);
+
+/* #### Code section: module_declarations ### */
+static PyObject *__pyx_array_get_memview(struct __pyx_array_obj *__pyx_v_self); /* proto*/
+static char *__pyx_memoryview_get_item_pointer(struct __pyx_memoryview_obj *__pyx_v_self, PyObject *__pyx_v_index); /* proto*/
+static PyObject *__pyx_memoryview_is_slice(struct __pyx_memoryview_obj *__pyx_v_self, PyObject *__pyx_v_obj); /* proto*/
+static PyObject *__pyx_memoryview_setitem_slice_assignment(struct __pyx_memoryview_obj *__pyx_v_self, PyObject *__pyx_v_dst, PyObject *__pyx_v_src); /* proto*/
+static PyObject *__pyx_memoryview_setitem_slice_assign_scalar(struct __pyx_memoryview_obj *__pyx_v_self, struct __pyx_memoryview_obj *__pyx_v_dst, PyObject *__pyx_v_value); /* proto*/
+static PyObject *__pyx_memoryview_setitem_indexed(struct __pyx_memoryview_obj *__pyx_v_self, PyObject *__pyx_v_index, PyObject *__pyx_v_value); /* proto*/
+static PyObject *__pyx_memoryview_convert_item_to_object(struct __pyx_memoryview_obj *__pyx_v_self, char *__pyx_v_itemp); /* proto*/
+static PyObject *__pyx_memoryview_assign_item_from_object(struct __pyx_memoryview_obj *__pyx_v_self, char *__pyx_v_itemp, PyObject *__pyx_v_value); /* proto*/
+static PyObject *__pyx_memoryview__get_base(struct __pyx_memoryview_obj *__pyx_v_self); /* proto*/
+static PyObject *__pyx_memoryviewslice_convert_item_to_object(struct __pyx_memoryviewslice_obj *__pyx_v_self, char *__pyx_v_itemp); /* proto*/
+static PyObject *__pyx_memoryviewslice_assign_item_from_object(struct __pyx_memoryviewslice_obj *__pyx_v_self, char *__pyx_v_itemp, PyObject *__pyx_v_value); /* proto*/
+static PyObject *__pyx_memoryviewslice__get_base(struct __pyx_memoryviewslice_obj *__pyx_v_self); /* proto*/
+static CYTHON_INLINE npy_intp __pyx_f_5numpy_5dtype_8itemsize_itemsize(PyArray_Descr *__pyx_v_self); /* proto*/
+static CYTHON_INLINE npy_intp __pyx_f_5numpy_5dtype_9alignment_alignment(PyArray_Descr *__pyx_v_self); /* proto*/
+static CYTHON_INLINE PyObject *__pyx_f_5numpy_5dtype_6fields_fields(PyArray_Descr *__pyx_v_self); /* proto*/
+static CYTHON_INLINE PyObject *__pyx_f_5numpy_5dtype_5names_names(PyArray_Descr *__pyx_v_self); /* proto*/
+static CYTHON_INLINE PyArray_ArrayDescr *__pyx_f_5numpy_5dtype_8subarray_subarray(PyArray_Descr *__pyx_v_self); /* proto*/
+static CYTHON_INLINE npy_uint64 __pyx_f_5numpy_5dtype_5flags_flags(PyArray_Descr *__pyx_v_self); /* proto*/
+static CYTHON_INLINE int __pyx_f_5numpy_9broadcast_7numiter_numiter(PyArrayMultiIterObject *__pyx_v_self); /* proto*/
+static CYTHON_INLINE npy_intp __pyx_f_5numpy_9broadcast_4size_size(PyArrayMultiIterObject *__pyx_v_self); /* proto*/
+static CYTHON_INLINE npy_intp __pyx_f_5numpy_9broadcast_5index_index(PyArrayMultiIterObject *__pyx_v_self); /* proto*/
+static CYTHON_INLINE int __pyx_f_5numpy_9broadcast_2nd_nd(PyArrayMultiIterObject *__pyx_v_self); /* proto*/
+static CYTHON_INLINE npy_intp *__pyx_f_5numpy_9broadcast_10dimensions_dimensions(PyArrayMultiIterObject *__pyx_v_self); /* proto*/
+static CYTHON_INLINE void **__pyx_f_5numpy_9broadcast_5iters_iters(PyArrayMultiIterObject *__pyx_v_self); /* proto*/
+static CYTHON_INLINE PyObject *__pyx_f_5numpy_7ndarray_4base_base(PyArrayObject *__pyx_v_self); /* proto*/
+static CYTHON_INLINE PyArray_Descr *__pyx_f_5numpy_7ndarray_5descr_descr(PyArrayObject *__pyx_v_self); /* proto*/
+static CYTHON_INLINE int __pyx_f_5numpy_7ndarray_4ndim_ndim(PyArrayObject *__pyx_v_self); /* proto*/
+static CYTHON_INLINE npy_intp *__pyx_f_5numpy_7ndarray_5shape_shape(PyArrayObject *__pyx_v_self); /* proto*/
+static CYTHON_INLINE npy_intp *__pyx_f_5numpy_7ndarray_7strides_strides(PyArrayObject *__pyx_v_self); /* proto*/
+static CYTHON_INLINE npy_intp __pyx_f_5numpy_7ndarray_4size_size(PyArrayObject *__pyx_v_self); /* proto*/
+static CYTHON_INLINE char *__pyx_f_5numpy_7ndarray_4data_data(PyArrayObject *__pyx_v_self); /* proto*/
+static PyObject *__pyx_f_7fairseq_4data_22token_block_utils_fast_15DatasetSearcher_reset(struct __pyx_obj_7fairseq_4data_22token_block_utils_fast_DatasetSearcher *__pyx_v_self); /* proto*/
+static int __pyx_f_7fairseq_4data_22token_block_utils_fast_15DatasetSearcher_step(struct __pyx_obj_7fairseq_4data_22token_block_utils_fast_DatasetSearcher *__pyx_v_self, __pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t __pyx_v_i); /* proto*/
+static PyObject *__pyx_f_7fairseq_4data_22token_block_utils_fast_15DatasetSearcher_seek(struct __pyx_obj_7fairseq_4data_22token_block_utils_fast_DatasetSearcher *__pyx_v_self, __pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t __pyx_v_i); /* proto*/
+
+/* Module declarations from "libc.math" */
+
+/* Module declarations from "cython.view" */
+
+/* Module declarations from "cython.dataclasses" */
+
+/* Module declarations from "cython" */
+
+/* Module declarations from "libc.string" */
+
+/* Module declarations from "libc.stdio" */
+
+/* Module declarations from "__builtin__" */
+
+/* Module declarations from "cpython.type" */
+
+/* Module declarations from "cpython" */
+
+/* Module declarations from "cpython.object" */
+
+/* Module declarations from "cpython.ref" */
+
+/* Module declarations from "numpy" */
+
+/* Module declarations from "numpy" */
+
+/* Module declarations from "fairseq.data.token_block_utils_fast": file-scope objects, all starting out null/zero. */
+static PyObject *__pyx_collections_abc_Sequence = 0;
+static PyObject *generic = 0;  /* NOTE(review): this and the next four names carry no __pyx_ prefix; presumably the memoryview layout markers -- confirm. */
+static PyObject *strided = 0;
+static PyObject *indirect = 0;
+static PyObject *contiguous = 0;
+static PyObject *indirect_contiguous = 0;
+static int __pyx_memoryview_thread_locks_used;  /* NOTE(review): presumably counts how many entries of the array below are handed out -- confirm. */
+static PyThread_type_lock __pyx_memoryview_thread_locks[8];  /* Fixed-size array of 8 lock handles. */
+static PyArrayObject *__pyx_f_7fairseq_4data_22token_block_utils_fast__get_slice_indices_none_mode(PyArrayObject *, int); /*proto*/
+static PyArrayObject *__pyx_f_7fairseq_4data_22token_block_utils_fast__fast_convert_to_np_array(PyObject *); /*proto*/
+static PyArrayObject *__pyx_f_7fairseq_4data_22token_block_utils_fast__get_slice_indices_fast(PyArrayObject *, PyObject *, int, int, int __pyx_skip_dispatch); /*proto*/
+static PyArrayObject *__pyx_f_7fairseq_4data_22token_block_utils_fast__get_block_to_dataset_index_fast(PyArrayObject *, PyArrayObject *, int __pyx_skip_dispatch); /*proto*/
+static PyObject *__pyx_f_7fairseq_4data_22token_block_utils_fast___pyx_unpickle_DatasetSearcher__set_state(struct __pyx_obj_7fairseq_4data_22token_block_utils_fast_DatasetSearcher *, PyObject *); /*proto*/
+static int __pyx_array_allocate_buffer(struct __pyx_array_obj *); /*proto*/
+static struct __pyx_array_obj *__pyx_array_new(PyObject *, Py_ssize_t, char *, char *, char *); /*proto*/
+static PyObject *__pyx_memoryview_new(PyObject *, int, int, __Pyx_TypeInfo *); /*proto*/
+static CYTHON_INLINE int __pyx_memoryview_check(PyObject *); /*proto*/
+static PyObject *_unellipsify(PyObject *, int); /*proto*/
+static int assert_direct_dimensions(Py_ssize_t *, int); /*proto*/
+static struct __pyx_memoryview_obj *__pyx_memview_slice(struct __pyx_memoryview_obj *, PyObject *); /*proto*/
+static int __pyx_memoryview_slice_memviewslice(__Pyx_memviewslice *, Py_ssize_t, Py_ssize_t, Py_ssize_t, int, int, int *, Py_ssize_t, Py_ssize_t, Py_ssize_t, int, int, int, int); /*proto*/
+static char *__pyx_pybuffer_index(Py_buffer *, char *, Py_ssize_t, Py_ssize_t); /*proto*/
+static int __pyx_memslice_transpose(__Pyx_memviewslice *); /*proto*/
+static PyObject *__pyx_memoryview_fromslice(__Pyx_memviewslice, int, PyObject *(*)(char *), int (*)(char *, PyObject *), int); /*proto*/
+static __Pyx_memviewslice *__pyx_memoryview_get_slice_from_memoryview(struct __pyx_memoryview_obj *, __Pyx_memviewslice *); /*proto*/
+static void __pyx_memoryview_slice_copy(struct __pyx_memoryview_obj *, __Pyx_memviewslice *); /*proto*/
+static PyObject *__pyx_memoryview_copy_object(struct __pyx_memoryview_obj *); /*proto*/
+static PyObject *__pyx_memoryview_copy_object_from_slice(struct __pyx_memoryview_obj *, __Pyx_memviewslice *); /*proto*/
+static Py_ssize_t abs_py_ssize_t(Py_ssize_t); /*proto*/
+static char __pyx_get_best_slice_order(__Pyx_memviewslice *, int); /*proto*/
+static void _copy_strided_to_strided(char *, Py_ssize_t *, char *, Py_ssize_t *, Py_ssize_t *, Py_ssize_t *, int, size_t); /*proto*/
+static void copy_strided_to_strided(__Pyx_memviewslice *, __Pyx_memviewslice *, int, size_t); /*proto*/
+static Py_ssize_t __pyx_memoryview_slice_get_size(__Pyx_memviewslice *, int); /*proto*/
+static Py_ssize_t __pyx_fill_contig_strides_array(Py_ssize_t *, Py_ssize_t *, Py_ssize_t, int, char); /*proto*/
+static void *__pyx_memoryview_copy_data_to_temp(__Pyx_memviewslice *, __Pyx_memviewslice *, char, int); /*proto*/
+static int __pyx_memoryview_err_extents(int, Py_ssize_t, Py_ssize_t); /*proto*/
+static int __pyx_memoryview_err_dim(PyObject *, PyObject *, int); /*proto*/
+static int __pyx_memoryview_err(PyObject *, PyObject *); /*proto*/
+static int __pyx_memoryview_err_no_memory(void); /*proto*/
+static int __pyx_memoryview_copy_contents(__Pyx_memviewslice, __Pyx_memviewslice, int, int, int); /*proto*/
+static void __pyx_memoryview_broadcast_leading(__Pyx_memviewslice *, int, int); /*proto*/
+static void __pyx_memoryview_refcount_copying(__Pyx_memviewslice *, int, int, int); /*proto*/
+static void __pyx_memoryview_refcount_objects_in_slice_with_gil(char *, Py_ssize_t *, Py_ssize_t *, int, int); /*proto*/
+static void __pyx_memoryview_refcount_objects_in_slice(char *, Py_ssize_t *, Py_ssize_t *, int, int); /*proto*/
+static void __pyx_memoryview_slice_assign_scalar(__Pyx_memviewslice *, int, size_t, void *, int); /*proto*/
+static void __pyx_memoryview__slice_assign_scalar(char *, Py_ssize_t *, Py_ssize_t *, int, size_t, void *); /*proto*/
+static PyObject *__pyx_unpickle_Enum__set_state(struct __pyx_MemviewEnum_obj *, PyObject *); /*proto*/
+/* #### Code section: typeinfo ### */
+static __Pyx_TypeInfo __Pyx_TypeInfo_nn___pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t = { "DTYPE_t", NULL, sizeof(__pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t), { 0 }, 0, __PYX_IS_UNSIGNED(__pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t) ? 'U' : 'I', __PYX_IS_UNSIGNED(__pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t), 0 };  /* Descriptor for DTYPE_t: size comes from sizeof; the kind character is 'U' when the type is unsigned, otherwise 'I'. */
+/* #### Code section: before_global_var ### */
+#define __Pyx_MODULE_NAME "fairseq.data.token_block_utils_fast"
+extern int __pyx_module_is_main_fairseq__data__token_block_utils_fast;  /* Declared extern first, then defined on the next line with initial value 0. */
+int __pyx_module_is_main_fairseq__data__token_block_utils_fast = 0;
+
+/* Implementation of "fairseq.data.token_block_utils_fast" */
+/* #### Code section: global_var ### */
+static PyObject *__pyx_builtin_range;  /* File-scope pointers named after Python builtins; no initialiser, so each starts as NULL. Presumably filled in during module initialisation -- not shown in this section. */
+static PyObject *__pyx_builtin_ValueError;
+static PyObject *__pyx_builtin_AssertionError;
+static PyObject *__pyx_builtin___import__;
+static PyObject *__pyx_builtin_MemoryError;
+static PyObject *__pyx_builtin_enumerate;
+static PyObject *__pyx_builtin_TypeError;
+static PyObject *__pyx_builtin_Ellipsis;
+static PyObject *__pyx_builtin_id;
+static PyObject *__pyx_builtin_IndexError;
+static PyObject *__pyx_builtin_ImportError;
+/* #### Code section: string_decls ### */
+static const char __pyx_k_[] = ": ";
+static const char __pyx_k_O[] = "O";
+static const char __pyx_k_c[] = "c";
+static const char __pyx_k__2[] = ".";
+static const char __pyx_k__3[] = "*";
+static const char __pyx_k__6[] = "'";
+static const char __pyx_k__7[] = ")";
+static const char __pyx_k_gc[] = "gc";
+static const char __pyx_k_id[] = "id";
+static const char __pyx_k_np[] = "np";
+static const char __pyx_k__35[] = "?";
+static const char __pyx_k_abc[] = "abc";
+static const char __pyx_k_and[] = " and ";
+static const char __pyx_k_eos[] = "eos";
+static const char __pyx_k_got[] = " (got ";
+static const char __pyx_k_new[] = "__new__";
+static const char __pyx_k_obj[] = "obj";
+static const char __pyx_k_sum[] = "sum";
+static const char __pyx_k_sys[] = "sys";
+static const char __pyx_k_axis[] = "axis";
+static const char __pyx_k_base[] = "base";
+static const char __pyx_k_dict[] = "__dict__";
+static const char __pyx_k_main[] = "__main__";
+static const char __pyx_k_mode[] = "mode";
+static const char __pyx_k_name[] = "name";
+static const char __pyx_k_ndim[] = "ndim";
+static const char __pyx_k_none[] = "none";
+static const char __pyx_k_pack[] = "pack";
+static const char __pyx_k_self[] = "self";
+static const char __pyx_k_size[] = "size";
+static const char __pyx_k_spec[] = "__spec__";
+static const char __pyx_k_step[] = "step";
+static const char __pyx_k_stop[] = "stop";
+static const char __pyx_k_test[] = "__test__";
+static const char __pyx_k_ASCII[] = "ASCII";
+static const char __pyx_k_DTYPE[] = "DTYPE";
+static const char __pyx_k_chain[] = "chain";
+static const char __pyx_k_class[] = "__class__";
+static const char __pyx_k_count[] = "count";
+static const char __pyx_k_dtype[] = "dtype";
+static const char __pyx_k_error[] = "error";
+static const char __pyx_k_flags[] = "flags";
+static const char __pyx_k_index[] = "index";
+static const char __pyx_k_int64[] = "int64";
+static const char __pyx_k_numpy[] = "numpy";
+static const char __pyx_k_range[] = "range";
+static const char __pyx_k_shape[] = "shape";
+static const char __pyx_k_sizes[] = "sizes";
+static const char __pyx_k_start[] = "start";
+static const char __pyx_k_state[] = "state";
+static const char __pyx_k_torch[] = "torch";
+static const char __pyx_k_zeros[] = "zeros";
+static const char __pyx_k_cumsum[] = "cumsum";
+static const char __pyx_k_dict_2[] = "_dict";
+static const char __pyx_k_enable[] = "enable";
+static const char __pyx_k_encode[] = "encode";
+static const char __pyx_k_format[] = "format";
+static const char __pyx_k_import[] = "__import__";
+static const char __pyx_k_name_2[] = "__name__";
+static const char __pyx_k_pickle[] = "pickle";
+static const char __pyx_k_reduce[] = "__reduce__";
+static const char __pyx_k_struct[] = "struct";
+static const char __pyx_k_unpack[] = "unpack";
+static const char __pyx_k_update[] = "update";
+static const char __pyx_k_disable[] = "disable";
+static const char __pyx_k_fortran[] = "fortran";
+static const char __pyx_k_memview[] = "memview";
+static const char __pyx_k_reshape[] = "reshape";
+static const char __pyx_k_Ellipsis[] = "Ellipsis";
+static const char __pyx_k_Sequence[] = "Sequence";
+static const char __pyx_k_complete[] = "complete";
+static const char __pyx_k_fromiter[] = "fromiter";
+static const char __pyx_k_getstate[] = "__getstate__";
+static const char __pyx_k_itemsize[] = "itemsize";
+static const char __pyx_k_pyx_type[] = "__pyx_type";
+static const char __pyx_k_register[] = "register";
+static const char __pyx_k_setstate[] = "__setstate__";
+static const char __pyx_k_TypeError[] = "TypeError";
+static const char __pyx_k_enumerate[] = "enumerate";
+static const char __pyx_k_isenabled[] = "isenabled";
+static const char __pyx_k_itertools[] = "itertools";
+static const char __pyx_k_pyx_state[] = "__pyx_state";
+static const char __pyx_k_reduce_ex[] = "__reduce_ex__";
+static const char __pyx_k_IndexError[] = "IndexError"; /* NOTE(review): this run of __pyx_k_* C string literals continues from above; entries hold identifier names and message texts */
+static const char __pyx_k_ValueError[] = "ValueError";
+static const char __pyx_k_block_size[] = "block_size";
+static const char __pyx_k_break_mode[] = "break_mode";
+static const char __pyx_k_pyx_result[] = "__pyx_result";
+static const char __pyx_k_pyx_vtable[] = "__pyx_vtable__";
+static const char __pyx_k_ImportError[] = "ImportError";
+static const char __pyx_k_MemoryError[] = "MemoryError";
+static const char __pyx_k_PickleError[] = "PickleError";
+static const char __pyx_k_collections[] = "collections";
+static const char __pyx_k_complete_doc[] = "complete_doc";
+static const char __pyx_k_initializing[] = "_initializing";
+static const char __pyx_k_is_coroutine[] = "_is_coroutine";
+static const char __pyx_k_pyx_checksum[] = "__pyx_checksum";
+static const char __pyx_k_stringsource[] = "<stringsource>";
+static const char __pyx_k_use_setstate[] = "use_setstate";
+static const char __pyx_k_version_info[] = "version_info";
+static const char __pyx_k_class_getitem[] = "__class_getitem__";
+static const char __pyx_k_from_iterable[] = "from_iterable";
+static const char __pyx_k_reduce_cython[] = "__reduce_cython__";
+static const char __pyx_k_slice_indices[] = "slice_indices";
+static const char __pyx_k_AssertionError[] = "AssertionError";
+static const char __pyx_k_DatasetSearcher[] = "DatasetSearcher";
+static const char __pyx_k_View_MemoryView[] = "View.MemoryView";
+static const char __pyx_k_allocate_buffer[] = "allocate_buffer";
+static const char __pyx_k_collections_abc[] = "collections.abc";
+static const char __pyx_k_dtype_is_object[] = "dtype_is_object";
+static const char __pyx_k_pyx_PickleError[] = "__pyx_PickleError";
+static const char __pyx_k_setstate_cython[] = "__setstate_cython__";
+static const char __pyx_k_document_sep_len[] = "document_sep_len";
+static const char __pyx_k_pyx_unpickle_Enum[] = "__pyx_unpickle_Enum";
+static const char __pyx_k_Invalid_break_mode[] = "Invalid break_mode: ";
+static const char __pyx_k_asyncio_coroutines[] = "asyncio.coroutines";
+static const char __pyx_k_cline_in_traceback[] = "cline_in_traceback";
+static const char __pyx_k_strided_and_direct[] = "<strided and direct>";
+static const char __pyx_k_strided_and_indirect[] = "<strided and indirect>";
+static const char __pyx_k_Invalid_shape_in_axis[] = "Invalid shape in axis ";
+static const char __pyx_k_contiguous_and_direct[] = "<contiguous and direct>";
+static const char __pyx_k_Cannot_index_with_type[] = "Cannot index with type '";
+static const char __pyx_k_MemoryView_of_r_object[] = "<MemoryView of %r object>";
+static const char __pyx_k_get_slice_indices_fast[] = "_get_slice_indices_fast";
+static const char __pyx_k_MemoryView_of_r_at_0x_x[] = "<MemoryView of %r at 0x%x>";
+static const char __pyx_k_contiguous_and_indirect[] = "<contiguous and indirect>";
+static const char __pyx_k_Dimension_d_is_not_direct[] = "Dimension %d is not direct";
+static const char __pyx_k_Index_out_of_bounds_axis_d[] = "Index out of bounds (axis %d)";
+static const char __pyx_k_Step_may_not_be_zero_axis_d[] = "Step may not be zero (axis %d)";
+static const char __pyx_k_itemsize_0_for_cython_array[] = "itemsize <= 0 for cython.array";
+static const char __pyx_k_pyx_unpickle_DatasetSearcher[] = "__pyx_unpickle_DatasetSearcher";
+static const char __pyx_k_unable_to_allocate_array_data[] = "unable to allocate array data.";
+static const char __pyx_k_strided_and_direct_or_indirect[] = "<strided and direct or indirect>";
+static const char __pyx_k_DatasetSearcher___reduce_cython[] = "DatasetSearcher.__reduce_cython__";
+static const char __pyx_k_get_block_to_dataset_index_fast[] = "_get_block_to_dataset_index_fast";
+static const char __pyx_k_All_dimensions_preceding_dimensi[] = "All dimensions preceding dimension %d must be indexed and not sliced";
+static const char __pyx_k_Buffer_view_does_not_expose_stri[] = "Buffer view does not expose strides";
+static const char __pyx_k_Can_only_create_a_buffer_that_is[] = "Can only create a buffer that is contiguous in memory.";
+static const char __pyx_k_Cannot_assign_to_read_only_memor[] = "Cannot assign to read-only memoryview";
+static const char __pyx_k_Cannot_create_writable_memory_vi[] = "Cannot create writable memory view from read-only memoryview";
+static const char __pyx_k_Cannot_transpose_memoryview_with[] = "Cannot transpose memoryview with indirect dimensions";
+static const char __pyx_k_DatasetSearcher___setstate_cytho[] = "DatasetSearcher.__setstate_cython__";
+static const char __pyx_k_Empty_shape_tuple_for_cython_arr[] = "Empty shape tuple for cython.array";
+static const char __pyx_k_Incompatible_checksums_0x_x_vs_0[] = "Incompatible checksums (0x%x vs (0x82a3537, 0x6ae9995, 0xb068931) = (name))"; /* the three hex values equal 136983863, 112105877 and 184977713, i.e. the __pyx_int_* fields of those names in __pyx_mstate */
+static const char __pyx_k_Indirect_dimensions_not_supporte[] = "Indirect dimensions not supported";
+static const char __pyx_k_Invalid_mode_expected_c_or_fortr[] = "Invalid mode, expected 'c' or 'fortran', got ";
+static const char __pyx_k_Out_of_bounds_on_buffer_access_a[] = "Out of bounds on buffer access (axis ";
+static const char __pyx_k_Unable_to_convert_item_to_object[] = "Unable to convert item to object";
+static const char __pyx_k_fairseq_data_token_block_utils_f[] = "fairseq/data/token_block_utils_fast.pyx";
+static const char __pyx_k_got_differing_extents_in_dimensi[] = "got differing extents in dimension ";
+static const char __pyx_k_no_default___reduce___due_to_non[] = "no default __reduce__ due to non-trivial __cinit__";
+static const char __pyx_k_numpy__core_multiarray_failed_to[] = "numpy._core.multiarray failed to import";
+static const char __pyx_k_numpy__core_umath_failed_to_impo[] = "numpy._core.umath failed to import";
+static const char __pyx_k_unable_to_allocate_shape_and_str[] = "unable to allocate shape and strides.";
+static const char __pyx_k_Incompatible_checksums_0x_x_vs_0_2[] = "Incompatible checksums (0x%x vs (0x8c67b45, 0x2e2dd22, 0x6632805) = (current_i, current_index, current_offset, sizes))"; /* hex values equal 147225413, 48422178 and 107161605 (see the matching __pyx_int_* fields); NOTE(review): "_2" presumably disambiguates from the identically-stemmed name above */
+static const char __pyx_k_fairseq_data_token_block_utils_f_2[] = "fairseq.data.token_block_utils_fast"; /* dotted module name; the un-suffixed sibling above holds the .pyx file path */
+/* #### Code section: decls ###
+ * Forward declarations (each tagged "proto") of static functions in this
+ * translation unit. Groups visible here, in order:
+ *   - functions taking struct __pyx_array_obj * (View.MemoryView array)
+ *   - functions taking struct __pyx_MemviewEnum_obj * (View.MemoryView Enum)
+ *   - functions taking struct __pyx_memoryview_obj * and
+ *     struct __pyx_memoryviewslice_obj *
+ *   - __pyx_unpickle_Enum
+ *   - fairseq.data.token_block_utils_fast functions:
+ *     _get_slice_indices_fast, _get_block_to_dataset_index_fast,
+ *     DatasetSearcher __init__ / __reduce_cython__ / __setstate_cython__,
+ *     and __pyx_unpickle_DatasetSearcher
+ *   - __pyx_tp_new_* functions for DatasetSearcher, array, Enum,
+ *     memoryview and _memoryviewslice
+ * NOTE(review): this file appears to be Cython output; prefer regenerating
+ * from the .pyx over hand-editing these signatures.
+ */
+static int __pyx_array___pyx_pf_15View_dot_MemoryView_5array___cinit__(struct __pyx_array_obj *__pyx_v_self, PyObject *__pyx_v_shape, Py_ssize_t __pyx_v_itemsize, PyObject *__pyx_v_format, PyObject *__pyx_v_mode, int __pyx_v_allocate_buffer); /* proto */
+static int __pyx_array___pyx_pf_15View_dot_MemoryView_5array_2__getbuffer__(struct __pyx_array_obj *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags); /* proto */
+static void __pyx_array___pyx_pf_15View_dot_MemoryView_5array_4__dealloc__(struct __pyx_array_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_pf_15View_dot_MemoryView_5array_7memview___get__(struct __pyx_array_obj *__pyx_v_self); /* proto */
+static Py_ssize_t __pyx_array___pyx_pf_15View_dot_MemoryView_5array_6__len__(struct __pyx_array_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_array___pyx_pf_15View_dot_MemoryView_5array_8__getattr__(struct __pyx_array_obj *__pyx_v_self, PyObject *__pyx_v_attr); /* proto */
+static PyObject *__pyx_array___pyx_pf_15View_dot_MemoryView_5array_10__getitem__(struct __pyx_array_obj *__pyx_v_self, PyObject *__pyx_v_item); /* proto */
+static int __pyx_array___pyx_pf_15View_dot_MemoryView_5array_12__setitem__(struct __pyx_array_obj *__pyx_v_self, PyObject *__pyx_v_item, PyObject *__pyx_v_value); /* proto */
+static PyObject *__pyx_pf___pyx_array___reduce_cython__(CYTHON_UNUSED struct __pyx_array_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_pf___pyx_array_2__setstate_cython__(CYTHON_UNUSED struct __pyx_array_obj *__pyx_v_self, CYTHON_UNUSED PyObject *__pyx_v___pyx_state); /* proto */
+static int __pyx_MemviewEnum___pyx_pf_15View_dot_MemoryView_4Enum___init__(struct __pyx_MemviewEnum_obj *__pyx_v_self, PyObject *__pyx_v_name); /* proto */
+static PyObject *__pyx_MemviewEnum___pyx_pf_15View_dot_MemoryView_4Enum_2__repr__(struct __pyx_MemviewEnum_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_pf___pyx_MemviewEnum___reduce_cython__(struct __pyx_MemviewEnum_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_pf___pyx_MemviewEnum_2__setstate_cython__(struct __pyx_MemviewEnum_obj *__pyx_v_self, PyObject *__pyx_v___pyx_state); /* proto */
+static int __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview___cinit__(struct __pyx_memoryview_obj *__pyx_v_self, PyObject *__pyx_v_obj, int __pyx_v_flags, int __pyx_v_dtype_is_object); /* proto */
+static void __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_2__dealloc__(struct __pyx_memoryview_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_4__getitem__(struct __pyx_memoryview_obj *__pyx_v_self, PyObject *__pyx_v_index); /* proto */
+static int __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_6__setitem__(struct __pyx_memoryview_obj *__pyx_v_self, PyObject *__pyx_v_index, PyObject *__pyx_v_value); /* proto */
+static int __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_8__getbuffer__(struct __pyx_memoryview_obj *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags); /* proto */
+static PyObject *__pyx_pf_15View_dot_MemoryView_10memoryview_1T___get__(struct __pyx_memoryview_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_pf_15View_dot_MemoryView_10memoryview_4base___get__(struct __pyx_memoryview_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_pf_15View_dot_MemoryView_10memoryview_5shape___get__(struct __pyx_memoryview_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_pf_15View_dot_MemoryView_10memoryview_7strides___get__(struct __pyx_memoryview_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_pf_15View_dot_MemoryView_10memoryview_10suboffsets___get__(struct __pyx_memoryview_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_pf_15View_dot_MemoryView_10memoryview_4ndim___get__(struct __pyx_memoryview_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_pf_15View_dot_MemoryView_10memoryview_8itemsize___get__(struct __pyx_memoryview_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_pf_15View_dot_MemoryView_10memoryview_6nbytes___get__(struct __pyx_memoryview_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_pf_15View_dot_MemoryView_10memoryview_4size___get__(struct __pyx_memoryview_obj *__pyx_v_self); /* proto */
+static Py_ssize_t __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_10__len__(struct __pyx_memoryview_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_12__repr__(struct __pyx_memoryview_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_14__str__(struct __pyx_memoryview_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_16is_c_contig(struct __pyx_memoryview_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_18is_f_contig(struct __pyx_memoryview_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_20copy(struct __pyx_memoryview_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_22copy_fortran(struct __pyx_memoryview_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_pf___pyx_memoryview___reduce_cython__(CYTHON_UNUSED struct __pyx_memoryview_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_pf___pyx_memoryview_2__setstate_cython__(CYTHON_UNUSED struct __pyx_memoryview_obj *__pyx_v_self, CYTHON_UNUSED PyObject *__pyx_v___pyx_state); /* proto */
+static void __pyx_memoryviewslice___pyx_pf_15View_dot_MemoryView_16_memoryviewslice___dealloc__(struct __pyx_memoryviewslice_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_pf___pyx_memoryviewslice___reduce_cython__(CYTHON_UNUSED struct __pyx_memoryviewslice_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_pf___pyx_memoryviewslice_2__setstate_cython__(CYTHON_UNUSED struct __pyx_memoryviewslice_obj *__pyx_v_self, CYTHON_UNUSED PyObject *__pyx_v___pyx_state); /* proto */
+static PyObject *__pyx_pf_15View_dot_MemoryView___pyx_unpickle_Enum(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v___pyx_type, long __pyx_v___pyx_checksum, PyObject *__pyx_v___pyx_state); /* proto */
+static PyObject *__pyx_pf_7fairseq_4data_22token_block_utils_fast__get_slice_indices_fast(CYTHON_UNUSED PyObject *__pyx_self, PyArrayObject *__pyx_v_sizes, PyObject *__pyx_v_break_mode, int __pyx_v_block_size, int __pyx_v_document_sep_len); /* proto */
+static PyObject *__pyx_pf_7fairseq_4data_22token_block_utils_fast_2_get_block_to_dataset_index_fast(CYTHON_UNUSED PyObject *__pyx_self, PyArrayObject *__pyx_v_sizes, PyArrayObject *__pyx_v_slice_indices); /* proto */
+static int __pyx_pf_7fairseq_4data_22token_block_utils_fast_15DatasetSearcher___init__(struct __pyx_obj_7fairseq_4data_22token_block_utils_fast_DatasetSearcher *__pyx_v_self, __Pyx_memviewslice __pyx_v_sizes); /* proto */
+static PyObject *__pyx_pf_7fairseq_4data_22token_block_utils_fast_15DatasetSearcher_2__reduce_cython__(struct __pyx_obj_7fairseq_4data_22token_block_utils_fast_DatasetSearcher *__pyx_v_self); /* proto */
+static PyObject *__pyx_pf_7fairseq_4data_22token_block_utils_fast_15DatasetSearcher_4__setstate_cython__(struct __pyx_obj_7fairseq_4data_22token_block_utils_fast_DatasetSearcher *__pyx_v_self, PyObject *__pyx_v___pyx_state); /* proto */
+static PyObject *__pyx_pf_7fairseq_4data_22token_block_utils_fast_4__pyx_unpickle_DatasetSearcher(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v___pyx_type, long __pyx_v___pyx_checksum, PyObject *__pyx_v___pyx_state); /* proto */
+static PyObject *__pyx_tp_new_7fairseq_4data_22token_block_utils_fast_DatasetSearcher(PyTypeObject *t, PyObject *a, PyObject *k); /*proto*/
+static PyObject *__pyx_tp_new_array(PyTypeObject *t, PyObject *a, PyObject *k); /*proto*/
+static PyObject *__pyx_tp_new_Enum(PyTypeObject *t, PyObject *a, PyObject *k); /*proto*/
+static PyObject *__pyx_tp_new_memoryview(PyTypeObject *t, PyObject *a, PyObject *k); /*proto*/
+static PyObject *__pyx_tp_new__memoryviewslice(PyTypeObject *t, PyObject *a, PyObject *k); /*proto*/
+/* #### Code section: late_includes ### */
+/* #### Code section: module_state ###
+ * __pyx_mstate gathers the object and type pointers this module keeps
+ * alive in one struct. Field groups, in declaration order:
+ *   1. six core PyObject* slots (__pyx_d ... __pyx_empty_unicode)
+ *   2. optional PyTypeObject* slots, each compiled in only when the
+ *      corresponding __Pyx_*_USED macro is defined
+ *   3. PyTypeObject* slots for cpython.type and the numpy types
+ *   4. this module's own five types (DatasetSearcher, array, MemviewEnum,
+ *      memoryview, memoryviewslice); the extra PyObject* __pyx_type_*
+ *      slots exist only under CYTHON_USE_MODULE_STATE
+ *   5. __pyx_kp_* / __pyx_n_* string-named slots, then __pyx_int_*,
+ *      __pyx_slice_*, __pyx_tuple_* and __pyx_codeobj_* slots
+ * The empty "#if CYTHON_USE_MODULE_STATE / #endif" pairs contribute no
+ * fields. Do not reorder members: the layout is part of the type.
+ */
+typedef struct {
+  PyObject *__pyx_d;
+  PyObject *__pyx_b;
+  PyObject *__pyx_cython_runtime;
+  PyObject *__pyx_empty_tuple;
+  PyObject *__pyx_empty_bytes;
+  PyObject *__pyx_empty_unicode;
+  #ifdef __Pyx_CyFunction_USED
+  PyTypeObject *__pyx_CyFunctionType;
+  #endif
+  #ifdef __Pyx_FusedFunction_USED
+  PyTypeObject *__pyx_FusedFunctionType;
+  #endif
+  #ifdef __Pyx_Generator_USED
+  PyTypeObject *__pyx_GeneratorType;
+  #endif
+  #ifdef __Pyx_IterableCoroutine_USED
+  PyTypeObject *__pyx_IterableCoroutineType;
+  #endif
+  #ifdef __Pyx_Coroutine_USED
+  PyTypeObject *__pyx_CoroutineAwaitType;
+  #endif
+  #ifdef __Pyx_Coroutine_USED
+  PyTypeObject *__pyx_CoroutineType;
+  #endif
+  #if CYTHON_USE_MODULE_STATE
+  #endif
+  #if CYTHON_USE_MODULE_STATE
+  #endif
+  #if CYTHON_USE_MODULE_STATE
+  #endif
+  #if CYTHON_USE_MODULE_STATE
+  #endif
+  #if CYTHON_USE_MODULE_STATE
+  #endif
+  #if CYTHON_USE_MODULE_STATE
+  #endif
+  #if CYTHON_USE_MODULE_STATE
+  #endif
+  #if CYTHON_USE_MODULE_STATE
+  #endif
+  PyTypeObject *__pyx_ptype_7cpython_4type_type;
+  #if CYTHON_USE_MODULE_STATE
+  #endif
+  #if CYTHON_USE_MODULE_STATE
+  #endif
+  #if CYTHON_USE_MODULE_STATE
+  #endif
+  #if CYTHON_USE_MODULE_STATE
+  #endif
+  #if CYTHON_USE_MODULE_STATE
+  #endif
+  PyTypeObject *__pyx_ptype_5numpy_dtype; /* first of the numpy type slots (dtype ... ufunc) */
+  PyTypeObject *__pyx_ptype_5numpy_flatiter;
+  PyTypeObject *__pyx_ptype_5numpy_broadcast;
+  PyTypeObject *__pyx_ptype_5numpy_ndarray;
+  PyTypeObject *__pyx_ptype_5numpy_generic;
+  PyTypeObject *__pyx_ptype_5numpy_number;
+  PyTypeObject *__pyx_ptype_5numpy_integer;
+  PyTypeObject *__pyx_ptype_5numpy_signedinteger;
+  PyTypeObject *__pyx_ptype_5numpy_unsignedinteger;
+  PyTypeObject *__pyx_ptype_5numpy_inexact;
+  PyTypeObject *__pyx_ptype_5numpy_floating;
+  PyTypeObject *__pyx_ptype_5numpy_complexfloating;
+  PyTypeObject *__pyx_ptype_5numpy_flexible;
+  PyTypeObject *__pyx_ptype_5numpy_character;
+  PyTypeObject *__pyx_ptype_5numpy_ufunc;
+  #if CYTHON_USE_MODULE_STATE
+  PyObject *__pyx_type_7fairseq_4data_22token_block_utils_fast_DatasetSearcher; /* PyObject* view of this module's types; present only in module-state builds */
+  PyObject *__pyx_type___pyx_array;
+  PyObject *__pyx_type___pyx_MemviewEnum;
+  PyObject *__pyx_type___pyx_memoryview;
+  PyObject *__pyx_type___pyx_memoryviewslice;
+  #endif
+  PyTypeObject *__pyx_ptype_7fairseq_4data_22token_block_utils_fast_DatasetSearcher; /* PyTypeObject* slots for the same five types; always present */
+  PyTypeObject *__pyx_array_type;
+  PyTypeObject *__pyx_MemviewEnum_type;
+  PyTypeObject *__pyx_memoryview_type;
+  PyTypeObject *__pyx_memoryviewslice_type;
+  PyObject *__pyx_kp_u_; /* start of string-named slots; NOTE(review): names mirror the __pyx_k_* C literals, presumably one Python object per literal — confirm in the string-table init */
+  PyObject *__pyx_n_s_ASCII;
+  PyObject *__pyx_kp_s_All_dimensions_preceding_dimensi;
+  PyObject *__pyx_n_s_AssertionError;
+  PyObject *__pyx_kp_s_Buffer_view_does_not_expose_stri;
+  PyObject *__pyx_kp_s_Can_only_create_a_buffer_that_is;
+  PyObject *__pyx_kp_s_Cannot_assign_to_read_only_memor;
+  PyObject *__pyx_kp_s_Cannot_create_writable_memory_vi;
+  PyObject *__pyx_kp_u_Cannot_index_with_type;
+  PyObject *__pyx_kp_s_Cannot_transpose_memoryview_with;
+  PyObject *__pyx_n_s_DTYPE;
+  PyObject *__pyx_n_s_DatasetSearcher;
+  PyObject *__pyx_n_s_DatasetSearcher___reduce_cython;
+  PyObject *__pyx_n_s_DatasetSearcher___setstate_cytho;
+  PyObject *__pyx_kp_s_Dimension_d_is_not_direct;
+  PyObject *__pyx_n_s_Ellipsis;
+  PyObject *__pyx_kp_s_Empty_shape_tuple_for_cython_arr;
+  PyObject *__pyx_n_s_ImportError;
+  PyObject *__pyx_kp_s_Incompatible_checksums_0x_x_vs_0;
+  PyObject *__pyx_kp_s_Incompatible_checksums_0x_x_vs_0_2;
+  PyObject *__pyx_n_s_IndexError;
+  PyObject *__pyx_kp_s_Index_out_of_bounds_axis_d;
+  PyObject *__pyx_kp_s_Indirect_dimensions_not_supporte;
+  PyObject *__pyx_kp_u_Invalid_break_mode;
+  PyObject *__pyx_kp_u_Invalid_mode_expected_c_or_fortr;
+  PyObject *__pyx_kp_u_Invalid_shape_in_axis;
+  PyObject *__pyx_n_s_MemoryError;
+  PyObject *__pyx_kp_s_MemoryView_of_r_at_0x_x;
+  PyObject *__pyx_kp_s_MemoryView_of_r_object;
+  PyObject *__pyx_n_b_O;
+  PyObject *__pyx_kp_u_Out_of_bounds_on_buffer_access_a;
+  PyObject *__pyx_n_s_PickleError;
+  PyObject *__pyx_n_s_Sequence;
+  PyObject *__pyx_kp_s_Step_may_not_be_zero_axis_d;
+  PyObject *__pyx_n_s_TypeError;
+  PyObject *__pyx_kp_s_Unable_to_convert_item_to_object;
+  PyObject *__pyx_n_s_ValueError;
+  PyObject *__pyx_n_s_View_MemoryView;
+  PyObject *__pyx_kp_u__2;
+  PyObject *__pyx_n_s__3;
+  PyObject *__pyx_n_s__35;
+  PyObject *__pyx_kp_u__6;
+  PyObject *__pyx_kp_u__7;
+  PyObject *__pyx_n_s_abc;
+  PyObject *__pyx_n_s_allocate_buffer;
+  PyObject *__pyx_kp_u_and;
+  PyObject *__pyx_n_s_asyncio_coroutines;
+  PyObject *__pyx_n_s_axis;
+  PyObject *__pyx_n_s_base;
+  PyObject *__pyx_n_s_block_size;
+  PyObject *__pyx_n_s_break_mode;
+  PyObject *__pyx_n_s_c;
+  PyObject *__pyx_n_u_c;
+  PyObject *__pyx_n_s_chain;
+  PyObject *__pyx_n_s_class;
+  PyObject *__pyx_n_s_class_getitem;
+  PyObject *__pyx_n_s_cline_in_traceback;
+  PyObject *__pyx_n_s_collections;
+  PyObject *__pyx_kp_s_collections_abc;
+  PyObject *__pyx_n_u_complete;
+  PyObject *__pyx_n_u_complete_doc;
+  PyObject *__pyx_kp_s_contiguous_and_direct;
+  PyObject *__pyx_kp_s_contiguous_and_indirect;
+  PyObject *__pyx_n_s_count;
+  PyObject *__pyx_n_s_cumsum;
+  PyObject *__pyx_n_s_dict;
+  PyObject *__pyx_n_s_dict_2;
+  PyObject *__pyx_kp_u_disable;
+  PyObject *__pyx_n_s_document_sep_len;
+  PyObject *__pyx_n_s_dtype;
+  PyObject *__pyx_n_s_dtype_is_object;
+  PyObject *__pyx_kp_u_enable;
+  PyObject *__pyx_n_s_encode;
+  PyObject *__pyx_n_s_enumerate;
+  PyObject *__pyx_n_u_eos;
+  PyObject *__pyx_n_s_error;
+  PyObject *__pyx_kp_s_fairseq_data_token_block_utils_f;
+  PyObject *__pyx_n_s_fairseq_data_token_block_utils_f_2;
+  PyObject *__pyx_n_s_flags;
+  PyObject *__pyx_n_s_format;
+  PyObject *__pyx_n_s_fortran;
+  PyObject *__pyx_n_u_fortran;
+  PyObject *__pyx_n_s_from_iterable;
+  PyObject *__pyx_n_s_fromiter;
+  PyObject *__pyx_kp_u_gc;
+  PyObject *__pyx_n_s_get_block_to_dataset_index_fast;
+  PyObject *__pyx_n_s_get_slice_indices_fast;
+  PyObject *__pyx_n_s_getstate;
+  PyObject *__pyx_kp_u_got;
+  PyObject *__pyx_kp_u_got_differing_extents_in_dimensi;
+  PyObject *__pyx_n_s_id;
+  PyObject *__pyx_n_s_import;
+  PyObject *__pyx_n_s_index;
+  PyObject *__pyx_n_s_initializing;
+  PyObject *__pyx_n_s_int64;
+  PyObject *__pyx_n_s_is_coroutine;
+  PyObject *__pyx_kp_u_isenabled;
+  PyObject *__pyx_n_s_itemsize;
+  PyObject *__pyx_kp_s_itemsize_0_for_cython_array;
+  PyObject *__pyx_n_s_itertools;
+  PyObject *__pyx_n_s_main;
+  PyObject *__pyx_n_s_memview;
+  PyObject *__pyx_n_s_mode;
+  PyObject *__pyx_n_s_name;
+  PyObject *__pyx_n_s_name_2;
+  PyObject *__pyx_n_s_ndim;
+  PyObject *__pyx_n_s_new;
+  PyObject *__pyx_kp_s_no_default___reduce___due_to_non;
+  PyObject *__pyx_n_u_none;
+  PyObject *__pyx_n_s_np;
+  PyObject *__pyx_n_s_numpy;
+  PyObject *__pyx_kp_u_numpy__core_multiarray_failed_to;
+  PyObject *__pyx_kp_u_numpy__core_umath_failed_to_impo;
+  PyObject *__pyx_n_s_obj;
+  PyObject *__pyx_n_s_pack;
+  PyObject *__pyx_n_s_pickle;
+  PyObject *__pyx_n_s_pyx_PickleError;
+  PyObject *__pyx_n_s_pyx_checksum;
+  PyObject *__pyx_n_s_pyx_result;
+  PyObject *__pyx_n_s_pyx_state;
+  PyObject *__pyx_n_s_pyx_type;
+  PyObject *__pyx_n_s_pyx_unpickle_DatasetSearcher;
+  PyObject *__pyx_n_s_pyx_unpickle_Enum;
+  PyObject *__pyx_n_s_pyx_vtable;
+  PyObject *__pyx_n_s_range;
+  PyObject *__pyx_n_s_reduce;
+  PyObject *__pyx_n_s_reduce_cython;
+  PyObject *__pyx_n_s_reduce_ex;
+  PyObject *__pyx_n_s_register;
+  PyObject *__pyx_n_s_reshape;
+  PyObject *__pyx_n_s_self;
+  PyObject *__pyx_n_s_setstate;
+  PyObject *__pyx_n_s_setstate_cython;
+  PyObject *__pyx_n_s_shape;
+  PyObject *__pyx_n_s_size;
+  PyObject *__pyx_n_s_sizes;
+  PyObject *__pyx_n_s_slice_indices;
+  PyObject *__pyx_n_s_spec;
+  PyObject *__pyx_n_s_start;
+  PyObject *__pyx_n_s_state;
+  PyObject *__pyx_n_s_step;
+  PyObject *__pyx_n_s_stop;
+  PyObject *__pyx_kp_s_strided_and_direct;
+  PyObject *__pyx_kp_s_strided_and_direct_or_indirect;
+  PyObject *__pyx_kp_s_strided_and_indirect;
+  PyObject *__pyx_kp_s_stringsource;
+  PyObject *__pyx_n_s_struct;
+  PyObject *__pyx_n_s_sum;
+  PyObject *__pyx_n_s_sys;
+  PyObject *__pyx_n_s_test;
+  PyObject *__pyx_n_s_torch;
+  PyObject *__pyx_kp_s_unable_to_allocate_array_data;
+  PyObject *__pyx_kp_s_unable_to_allocate_shape_and_str;
+  PyObject *__pyx_n_s_unpack;
+  PyObject *__pyx_n_s_update;
+  PyObject *__pyx_n_s_use_setstate;
+  PyObject *__pyx_n_s_version_info;
+  PyObject *__pyx_n_s_zeros;
+  PyObject *__pyx_int_0; /* start of the integer-named slots */
+  PyObject *__pyx_int_1;
+  PyObject *__pyx_int_2;
+  PyObject *__pyx_int_3;
+  PyObject *__pyx_int_48422178; /* 0x2e2dd22; this and the next five values equal the hex checksums quoted in the two "Incompatible checksums" message literals */
+  PyObject *__pyx_int_107161605; /* 0x6632805 */
+  PyObject *__pyx_int_112105877; /* 0x6ae9995 */
+  PyObject *__pyx_int_136983863; /* 0x82a3537 */
+  PyObject *__pyx_int_147225413; /* 0x8c67b45 */
+  PyObject *__pyx_int_184977713; /* 0xb068931 */
+  PyObject *__pyx_int_neg_1;
+  PyObject *__pyx_slice__5; /* slice-, tuple- and codeobj-named slots follow; their contents are set elsewhere in the file */
+  PyObject *__pyx_tuple__4;
+  PyObject *__pyx_tuple__8;
+  PyObject *__pyx_tuple__9;
+  PyObject *__pyx_slice__11;
+  PyObject *__pyx_tuple__10;
+  PyObject *__pyx_tuple__12;
+  PyObject *__pyx_tuple__13;
+  PyObject *__pyx_tuple__14;
+  PyObject *__pyx_tuple__15;
+  PyObject *__pyx_tuple__16;
+  PyObject *__pyx_tuple__17;
+  PyObject *__pyx_tuple__18;
+  PyObject *__pyx_tuple__19;
+  PyObject *__pyx_tuple__20;
+  PyObject *__pyx_tuple__21;
+  PyObject *__pyx_tuple__22;
+  PyObject *__pyx_tuple__23;
+  PyObject *__pyx_tuple__24;
+  PyObject *__pyx_tuple__26;
+  PyObject *__pyx_tuple__28;
+  PyObject *__pyx_tuple__30;
+  PyObject *__pyx_tuple__32;
+  PyObject *__pyx_codeobj__25;
+  PyObject *__pyx_codeobj__27;
+  PyObject *__pyx_codeobj__29;
+  PyObject *__pyx_codeobj__31;
+  PyObject *__pyx_codeobj__33;
+  PyObject *__pyx_codeobj__34;
+} __pyx_mstate;
+
+#if CYTHON_USE_MODULE_STATE /* module-state build: the state struct is reached through the module object */
+#ifdef __cplusplus
+namespace {
+  extern struct PyModuleDef __pyx_moduledef;
+} /* anonymous namespace */
+#else
+static struct PyModuleDef __pyx_moduledef; /* declared ahead of use so the macros below can take its address */
+#endif
+
+#define __pyx_mstate(o) ((__pyx_mstate *)__Pyx_PyModule_GetState(o))
+
+#define __pyx_mstate_global (__pyx_mstate(PyState_FindModule(&__pyx_moduledef)))
+
+#define __pyx_m (PyState_FindModule(&__pyx_moduledef))
+#else /* otherwise: one zero-initialised static __pyx_mstate instance, reached through the __pyx_mstate_global pointer */
+static __pyx_mstate __pyx_mstate_global_static =
+#ifdef __cplusplus
+    {};
+#else
+    {0};
+#endif
+static __pyx_mstate *__pyx_mstate_global = &__pyx_mstate_global_static; /* same name as the macro in the other branch, so users of __pyx_mstate_global compile either way */
+#endif
+/* #### Code section: module_state_clear ### */
+#if CYTHON_USE_MODULE_STATE
+static int __pyx_m_clear(PyObject *m) {
+  __pyx_mstate *clear_module_state = __pyx_mstate(m);
+  if (!clear_module_state) return 0;
+  Py_CLEAR(clear_module_state->__pyx_d);
+  Py_CLEAR(clear_module_state->__pyx_b);
+  Py_CLEAR(clear_module_state->__pyx_cython_runtime);
+  Py_CLEAR(clear_module_state->__pyx_empty_tuple);
+  Py_CLEAR(clear_module_state->__pyx_empty_bytes);
+  Py_CLEAR(clear_module_state->__pyx_empty_unicode);
+  #ifdef __Pyx_CyFunction_USED
+  Py_CLEAR(clear_module_state->__pyx_CyFunctionType);
+  #endif
+  #ifdef __Pyx_FusedFunction_USED
+  Py_CLEAR(clear_module_state->__pyx_FusedFunctionType);
+  #endif
+  Py_CLEAR(clear_module_state->__pyx_ptype_7cpython_4type_type);
+  Py_CLEAR(clear_module_state->__pyx_ptype_5numpy_dtype);
+  Py_CLEAR(clear_module_state->__pyx_ptype_5numpy_flatiter);
+  Py_CLEAR(clear_module_state->__pyx_ptype_5numpy_broadcast);
+  Py_CLEAR(clear_module_state->__pyx_ptype_5numpy_ndarray);
+  Py_CLEAR(clear_module_state->__pyx_ptype_5numpy_generic);
+  Py_CLEAR(clear_module_state->__pyx_ptype_5numpy_number);
+  Py_CLEAR(clear_module_state->__pyx_ptype_5numpy_integer);
+  Py_CLEAR(clear_module_state->__pyx_ptype_5numpy_signedinteger);
+  Py_CLEAR(clear_module_state->__pyx_ptype_5numpy_unsignedinteger);
+  Py_CLEAR(clear_module_state->__pyx_ptype_5numpy_inexact);
+  Py_CLEAR(clear_module_state->__pyx_ptype_5numpy_floating);
+  Py_CLEAR(clear_module_state->__pyx_ptype_5numpy_complexfloating);
+  Py_CLEAR(clear_module_state->__pyx_ptype_5numpy_flexible);
+  Py_CLEAR(clear_module_state->__pyx_ptype_5numpy_character);
+  Py_CLEAR(clear_module_state->__pyx_ptype_5numpy_ufunc);
+  Py_CLEAR(clear_module_state->__pyx_ptype_7fairseq_4data_22token_block_utils_fast_DatasetSearcher);
+  Py_CLEAR(clear_module_state->__pyx_type_7fairseq_4data_22token_block_utils_fast_DatasetSearcher);
+  Py_CLEAR(clear_module_state->__pyx_array_type);
+  Py_CLEAR(clear_module_state->__pyx_type___pyx_array);
+  Py_CLEAR(clear_module_state->__pyx_MemviewEnum_type);
+  Py_CLEAR(clear_module_state->__pyx_type___pyx_MemviewEnum);
+  Py_CLEAR(clear_module_state->__pyx_memoryview_type);
+  Py_CLEAR(clear_module_state->__pyx_type___pyx_memoryview);
+  Py_CLEAR(clear_module_state->__pyx_memoryviewslice_type);
+  Py_CLEAR(clear_module_state->__pyx_type___pyx_memoryviewslice);
+  Py_CLEAR(clear_module_state->__pyx_kp_u_);
+  Py_CLEAR(clear_module_state->__pyx_n_s_ASCII);
+  Py_CLEAR(clear_module_state->__pyx_kp_s_All_dimensions_preceding_dimensi);
+  Py_CLEAR(clear_module_state->__pyx_n_s_AssertionError);
+  Py_CLEAR(clear_module_state->__pyx_kp_s_Buffer_view_does_not_expose_stri);
+  Py_CLEAR(clear_module_state->__pyx_kp_s_Can_only_create_a_buffer_that_is);
+  Py_CLEAR(clear_module_state->__pyx_kp_s_Cannot_assign_to_read_only_memor);
+  Py_CLEAR(clear_module_state->__pyx_kp_s_Cannot_create_writable_memory_vi);
+  Py_CLEAR(clear_module_state->__pyx_kp_u_Cannot_index_with_type);
+  Py_CLEAR(clear_module_state->__pyx_kp_s_Cannot_transpose_memoryview_with);
+  Py_CLEAR(clear_module_state->__pyx_n_s_DTYPE);
+  Py_CLEAR(clear_module_state->__pyx_n_s_DatasetSearcher);
+  Py_CLEAR(clear_module_state->__pyx_n_s_DatasetSearcher___reduce_cython);
+  Py_CLEAR(clear_module_state->__pyx_n_s_DatasetSearcher___setstate_cytho);
+  Py_CLEAR(clear_module_state->__pyx_kp_s_Dimension_d_is_not_direct);
+  Py_CLEAR(clear_module_state->__pyx_n_s_Ellipsis);
+  Py_CLEAR(clear_module_state->__pyx_kp_s_Empty_shape_tuple_for_cython_arr);
+  Py_CLEAR(clear_module_state->__pyx_n_s_ImportError);
+  Py_CLEAR(clear_module_state->__pyx_kp_s_Incompatible_checksums_0x_x_vs_0);
+  Py_CLEAR(clear_module_state->__pyx_kp_s_Incompatible_checksums_0x_x_vs_0_2);
+  Py_CLEAR(clear_module_state->__pyx_n_s_IndexError);
+  Py_CLEAR(clear_module_state->__pyx_kp_s_Index_out_of_bounds_axis_d);
+  Py_CLEAR(clear_module_state->__pyx_kp_s_Indirect_dimensions_not_supporte);
+  Py_CLEAR(clear_module_state->__pyx_kp_u_Invalid_break_mode);
+  Py_CLEAR(clear_module_state->__pyx_kp_u_Invalid_mode_expected_c_or_fortr);
+  Py_CLEAR(clear_module_state->__pyx_kp_u_Invalid_shape_in_axis);
+  Py_CLEAR(clear_module_state->__pyx_n_s_MemoryError);
+  Py_CLEAR(clear_module_state->__pyx_kp_s_MemoryView_of_r_at_0x_x);
+  Py_CLEAR(clear_module_state->__pyx_kp_s_MemoryView_of_r_object);
+  Py_CLEAR(clear_module_state->__pyx_n_b_O);
+  Py_CLEAR(clear_module_state->__pyx_kp_u_Out_of_bounds_on_buffer_access_a);
+  Py_CLEAR(clear_module_state->__pyx_n_s_PickleError);
+  Py_CLEAR(clear_module_state->__pyx_n_s_Sequence);
+  Py_CLEAR(clear_module_state->__pyx_kp_s_Step_may_not_be_zero_axis_d);
+  Py_CLEAR(clear_module_state->__pyx_n_s_TypeError);
+  Py_CLEAR(clear_module_state->__pyx_kp_s_Unable_to_convert_item_to_object);
+  Py_CLEAR(clear_module_state->__pyx_n_s_ValueError);
+  Py_CLEAR(clear_module_state->__pyx_n_s_View_MemoryView);
+  Py_CLEAR(clear_module_state->__pyx_kp_u__2);
+  Py_CLEAR(clear_module_state->__pyx_n_s__3);
+  Py_CLEAR(clear_module_state->__pyx_n_s__35);
+  Py_CLEAR(clear_module_state->__pyx_kp_u__6);
+  Py_CLEAR(clear_module_state->__pyx_kp_u__7);
+  Py_CLEAR(clear_module_state->__pyx_n_s_abc);
+  Py_CLEAR(clear_module_state->__pyx_n_s_allocate_buffer);
+  Py_CLEAR(clear_module_state->__pyx_kp_u_and);
+  Py_CLEAR(clear_module_state->__pyx_n_s_asyncio_coroutines);
+  Py_CLEAR(clear_module_state->__pyx_n_s_axis);
+  Py_CLEAR(clear_module_state->__pyx_n_s_base);
+  Py_CLEAR(clear_module_state->__pyx_n_s_block_size);
+  Py_CLEAR(clear_module_state->__pyx_n_s_break_mode);
+  Py_CLEAR(clear_module_state->__pyx_n_s_c);
+  Py_CLEAR(clear_module_state->__pyx_n_u_c);
+  Py_CLEAR(clear_module_state->__pyx_n_s_chain);
+  Py_CLEAR(clear_module_state->__pyx_n_s_class);
+  Py_CLEAR(clear_module_state->__pyx_n_s_class_getitem);
+  Py_CLEAR(clear_module_state->__pyx_n_s_cline_in_traceback);
+  Py_CLEAR(clear_module_state->__pyx_n_s_collections);
+  Py_CLEAR(clear_module_state->__pyx_kp_s_collections_abc);
+  Py_CLEAR(clear_module_state->__pyx_n_u_complete);
+  Py_CLEAR(clear_module_state->__pyx_n_u_complete_doc);
+  Py_CLEAR(clear_module_state->__pyx_kp_s_contiguous_and_direct);
+  Py_CLEAR(clear_module_state->__pyx_kp_s_contiguous_and_indirect);
+  Py_CLEAR(clear_module_state->__pyx_n_s_count);
+  Py_CLEAR(clear_module_state->__pyx_n_s_cumsum);
+  Py_CLEAR(clear_module_state->__pyx_n_s_dict);
+  Py_CLEAR(clear_module_state->__pyx_n_s_dict_2);
+  Py_CLEAR(clear_module_state->__pyx_kp_u_disable);
+  Py_CLEAR(clear_module_state->__pyx_n_s_document_sep_len);
+  Py_CLEAR(clear_module_state->__pyx_n_s_dtype);
+  Py_CLEAR(clear_module_state->__pyx_n_s_dtype_is_object);
+  Py_CLEAR(clear_module_state->__pyx_kp_u_enable);
+  Py_CLEAR(clear_module_state->__pyx_n_s_encode);
+  Py_CLEAR(clear_module_state->__pyx_n_s_enumerate);
+  Py_CLEAR(clear_module_state->__pyx_n_u_eos);
+  Py_CLEAR(clear_module_state->__pyx_n_s_error);
+  Py_CLEAR(clear_module_state->__pyx_kp_s_fairseq_data_token_block_utils_f);
+  Py_CLEAR(clear_module_state->__pyx_n_s_fairseq_data_token_block_utils_f_2);
+  Py_CLEAR(clear_module_state->__pyx_n_s_flags);
+  Py_CLEAR(clear_module_state->__pyx_n_s_format);
+  Py_CLEAR(clear_module_state->__pyx_n_s_fortran);
+  Py_CLEAR(clear_module_state->__pyx_n_u_fortran);
+  Py_CLEAR(clear_module_state->__pyx_n_s_from_iterable);
+  Py_CLEAR(clear_module_state->__pyx_n_s_fromiter);
+  Py_CLEAR(clear_module_state->__pyx_kp_u_gc);
+  Py_CLEAR(clear_module_state->__pyx_n_s_get_block_to_dataset_index_fast);
+  Py_CLEAR(clear_module_state->__pyx_n_s_get_slice_indices_fast);
+  Py_CLEAR(clear_module_state->__pyx_n_s_getstate);
+  Py_CLEAR(clear_module_state->__pyx_kp_u_got);
+  Py_CLEAR(clear_module_state->__pyx_kp_u_got_differing_extents_in_dimensi);
+  Py_CLEAR(clear_module_state->__pyx_n_s_id);
+  Py_CLEAR(clear_module_state->__pyx_n_s_import);
+  Py_CLEAR(clear_module_state->__pyx_n_s_index);
+  Py_CLEAR(clear_module_state->__pyx_n_s_initializing);
+  Py_CLEAR(clear_module_state->__pyx_n_s_int64);
+  Py_CLEAR(clear_module_state->__pyx_n_s_is_coroutine);
+  Py_CLEAR(clear_module_state->__pyx_kp_u_isenabled);
+  Py_CLEAR(clear_module_state->__pyx_n_s_itemsize);
+  Py_CLEAR(clear_module_state->__pyx_kp_s_itemsize_0_for_cython_array);
+  Py_CLEAR(clear_module_state->__pyx_n_s_itertools);
+  Py_CLEAR(clear_module_state->__pyx_n_s_main);
+  Py_CLEAR(clear_module_state->__pyx_n_s_memview);
+  Py_CLEAR(clear_module_state->__pyx_n_s_mode);
+  Py_CLEAR(clear_module_state->__pyx_n_s_name);
+  Py_CLEAR(clear_module_state->__pyx_n_s_name_2);
+  Py_CLEAR(clear_module_state->__pyx_n_s_ndim);
+  Py_CLEAR(clear_module_state->__pyx_n_s_new);
+  Py_CLEAR(clear_module_state->__pyx_kp_s_no_default___reduce___due_to_non);
+  Py_CLEAR(clear_module_state->__pyx_n_u_none);
+  Py_CLEAR(clear_module_state->__pyx_n_s_np);
+  Py_CLEAR(clear_module_state->__pyx_n_s_numpy);
+  Py_CLEAR(clear_module_state->__pyx_kp_u_numpy__core_multiarray_failed_to);
+  Py_CLEAR(clear_module_state->__pyx_kp_u_numpy__core_umath_failed_to_impo);
+  Py_CLEAR(clear_module_state->__pyx_n_s_obj);
+  Py_CLEAR(clear_module_state->__pyx_n_s_pack);
+  Py_CLEAR(clear_module_state->__pyx_n_s_pickle);
+  Py_CLEAR(clear_module_state->__pyx_n_s_pyx_PickleError);
+  Py_CLEAR(clear_module_state->__pyx_n_s_pyx_checksum);
+  Py_CLEAR(clear_module_state->__pyx_n_s_pyx_result);
+  Py_CLEAR(clear_module_state->__pyx_n_s_pyx_state);
+  Py_CLEAR(clear_module_state->__pyx_n_s_pyx_type);
+  Py_CLEAR(clear_module_state->__pyx_n_s_pyx_unpickle_DatasetSearcher);
+  Py_CLEAR(clear_module_state->__pyx_n_s_pyx_unpickle_Enum);
+  Py_CLEAR(clear_module_state->__pyx_n_s_pyx_vtable);
+  Py_CLEAR(clear_module_state->__pyx_n_s_range);
+  Py_CLEAR(clear_module_state->__pyx_n_s_reduce);
+  Py_CLEAR(clear_module_state->__pyx_n_s_reduce_cython);
+  Py_CLEAR(clear_module_state->__pyx_n_s_reduce_ex);
+  Py_CLEAR(clear_module_state->__pyx_n_s_register);
+  Py_CLEAR(clear_module_state->__pyx_n_s_reshape);
+  Py_CLEAR(clear_module_state->__pyx_n_s_self);
+  Py_CLEAR(clear_module_state->__pyx_n_s_setstate);
+  Py_CLEAR(clear_module_state->__pyx_n_s_setstate_cython);
+  Py_CLEAR(clear_module_state->__pyx_n_s_shape);
+  Py_CLEAR(clear_module_state->__pyx_n_s_size);
+  Py_CLEAR(clear_module_state->__pyx_n_s_sizes);
+  Py_CLEAR(clear_module_state->__pyx_n_s_slice_indices);
+  Py_CLEAR(clear_module_state->__pyx_n_s_spec);
+  Py_CLEAR(clear_module_state->__pyx_n_s_start);
+  Py_CLEAR(clear_module_state->__pyx_n_s_state);
+  Py_CLEAR(clear_module_state->__pyx_n_s_step);
+  Py_CLEAR(clear_module_state->__pyx_n_s_stop);
+  Py_CLEAR(clear_module_state->__pyx_kp_s_strided_and_direct);
+  Py_CLEAR(clear_module_state->__pyx_kp_s_strided_and_direct_or_indirect);
+  Py_CLEAR(clear_module_state->__pyx_kp_s_strided_and_indirect);
+  Py_CLEAR(clear_module_state->__pyx_kp_s_stringsource);
+  Py_CLEAR(clear_module_state->__pyx_n_s_struct);
+  Py_CLEAR(clear_module_state->__pyx_n_s_sum);
+  Py_CLEAR(clear_module_state->__pyx_n_s_sys);
+  Py_CLEAR(clear_module_state->__pyx_n_s_test);
+  Py_CLEAR(clear_module_state->__pyx_n_s_torch);
+  Py_CLEAR(clear_module_state->__pyx_kp_s_unable_to_allocate_array_data);
+  Py_CLEAR(clear_module_state->__pyx_kp_s_unable_to_allocate_shape_and_str);
+  Py_CLEAR(clear_module_state->__pyx_n_s_unpack);
+  Py_CLEAR(clear_module_state->__pyx_n_s_update);
+  Py_CLEAR(clear_module_state->__pyx_n_s_use_setstate);
+  Py_CLEAR(clear_module_state->__pyx_n_s_version_info);
+  Py_CLEAR(clear_module_state->__pyx_n_s_zeros);
+  Py_CLEAR(clear_module_state->__pyx_int_0);
+  Py_CLEAR(clear_module_state->__pyx_int_1);
+  Py_CLEAR(clear_module_state->__pyx_int_2);
+  Py_CLEAR(clear_module_state->__pyx_int_3);
+  Py_CLEAR(clear_module_state->__pyx_int_48422178);
+  Py_CLEAR(clear_module_state->__pyx_int_107161605);
+  Py_CLEAR(clear_module_state->__pyx_int_112105877);
+  Py_CLEAR(clear_module_state->__pyx_int_136983863);
+  Py_CLEAR(clear_module_state->__pyx_int_147225413);
+  Py_CLEAR(clear_module_state->__pyx_int_184977713);
+  Py_CLEAR(clear_module_state->__pyx_int_neg_1);
+  Py_CLEAR(clear_module_state->__pyx_slice__5);
+  Py_CLEAR(clear_module_state->__pyx_tuple__4);
+  Py_CLEAR(clear_module_state->__pyx_tuple__8);
+  Py_CLEAR(clear_module_state->__pyx_tuple__9);
+  Py_CLEAR(clear_module_state->__pyx_slice__11);
+  Py_CLEAR(clear_module_state->__pyx_tuple__10);
+  Py_CLEAR(clear_module_state->__pyx_tuple__12);
+  Py_CLEAR(clear_module_state->__pyx_tuple__13);
+  Py_CLEAR(clear_module_state->__pyx_tuple__14);
+  Py_CLEAR(clear_module_state->__pyx_tuple__15);
+  Py_CLEAR(clear_module_state->__pyx_tuple__16);
+  Py_CLEAR(clear_module_state->__pyx_tuple__17);
+  Py_CLEAR(clear_module_state->__pyx_tuple__18);
+  Py_CLEAR(clear_module_state->__pyx_tuple__19);
+  Py_CLEAR(clear_module_state->__pyx_tuple__20);
+  Py_CLEAR(clear_module_state->__pyx_tuple__21);
+  Py_CLEAR(clear_module_state->__pyx_tuple__22);
+  Py_CLEAR(clear_module_state->__pyx_tuple__23);
+  Py_CLEAR(clear_module_state->__pyx_tuple__24);
+  Py_CLEAR(clear_module_state->__pyx_tuple__26);
+  Py_CLEAR(clear_module_state->__pyx_tuple__28);
+  Py_CLEAR(clear_module_state->__pyx_tuple__30);
+  Py_CLEAR(clear_module_state->__pyx_tuple__32);
+  Py_CLEAR(clear_module_state->__pyx_codeobj__25);
+  Py_CLEAR(clear_module_state->__pyx_codeobj__27);
+  Py_CLEAR(clear_module_state->__pyx_codeobj__29);
+  Py_CLEAR(clear_module_state->__pyx_codeobj__31);
+  Py_CLEAR(clear_module_state->__pyx_codeobj__33);
+  Py_CLEAR(clear_module_state->__pyx_codeobj__34);
+  return 0;
+}
+#endif
+/* #### Code section: module_state_traverse ### */
+#if CYTHON_USE_MODULE_STATE
+static int __pyx_m_traverse(PyObject *m, visitproc visit, void *arg) { /* passes every object reference stored in m's module state to `visit` via Py_VISIT */
+  __pyx_mstate *traverse_module_state = __pyx_mstate(m);
+  if (!traverse_module_state) return 0; /* no module state available: nothing to visit */
+  Py_VISIT(traverse_module_state->__pyx_d); /* Py_VISIT returns from this function early when `visit` reports non-zero */
+  Py_VISIT(traverse_module_state->__pyx_b);
+  Py_VISIT(traverse_module_state->__pyx_cython_runtime);
+  Py_VISIT(traverse_module_state->__pyx_empty_tuple);
+  Py_VISIT(traverse_module_state->__pyx_empty_bytes);
+  Py_VISIT(traverse_module_state->__pyx_empty_unicode);
+  #ifdef __Pyx_CyFunction_USED
+  Py_VISIT(traverse_module_state->__pyx_CyFunctionType);
+  #endif /* __Pyx_CyFunction_USED */
+  #ifdef __Pyx_FusedFunction_USED
+  Py_VISIT(traverse_module_state->__pyx_FusedFunctionType);
+  #endif /* __Pyx_FusedFunction_USED */
+  Py_VISIT(traverse_module_state->__pyx_ptype_7cpython_4type_type); /* type-related fields (named *ptype* / *type*) start here */
+  Py_VISIT(traverse_module_state->__pyx_ptype_5numpy_dtype);
+  Py_VISIT(traverse_module_state->__pyx_ptype_5numpy_flatiter);
+  Py_VISIT(traverse_module_state->__pyx_ptype_5numpy_broadcast);
+  Py_VISIT(traverse_module_state->__pyx_ptype_5numpy_ndarray);
+  Py_VISIT(traverse_module_state->__pyx_ptype_5numpy_generic);
+  Py_VISIT(traverse_module_state->__pyx_ptype_5numpy_number);
+  Py_VISIT(traverse_module_state->__pyx_ptype_5numpy_integer);
+  Py_VISIT(traverse_module_state->__pyx_ptype_5numpy_signedinteger);
+  Py_VISIT(traverse_module_state->__pyx_ptype_5numpy_unsignedinteger);
+  Py_VISIT(traverse_module_state->__pyx_ptype_5numpy_inexact);
+  Py_VISIT(traverse_module_state->__pyx_ptype_5numpy_floating);
+  Py_VISIT(traverse_module_state->__pyx_ptype_5numpy_complexfloating);
+  Py_VISIT(traverse_module_state->__pyx_ptype_5numpy_flexible);
+  Py_VISIT(traverse_module_state->__pyx_ptype_5numpy_character);
+  Py_VISIT(traverse_module_state->__pyx_ptype_5numpy_ufunc);
+  Py_VISIT(traverse_module_state->__pyx_ptype_7fairseq_4data_22token_block_utils_fast_DatasetSearcher);
+  Py_VISIT(traverse_module_state->__pyx_type_7fairseq_4data_22token_block_utils_fast_DatasetSearcher);
+  Py_VISIT(traverse_module_state->__pyx_array_type);
+  Py_VISIT(traverse_module_state->__pyx_type___pyx_array);
+  Py_VISIT(traverse_module_state->__pyx_MemviewEnum_type);
+  Py_VISIT(traverse_module_state->__pyx_type___pyx_MemviewEnum);
+  Py_VISIT(traverse_module_state->__pyx_memoryview_type);
+  Py_VISIT(traverse_module_state->__pyx_type___pyx_memoryview);
+  Py_VISIT(traverse_module_state->__pyx_memoryviewslice_type);
+  Py_VISIT(traverse_module_state->__pyx_type___pyx_memoryviewslice);
+  Py_VISIT(traverse_module_state->__pyx_kp_u_); /* __pyx_kp_* / __pyx_n_* fields start here (presumably cached string constants, judging by the names) */
+  Py_VISIT(traverse_module_state->__pyx_n_s_ASCII);
+  Py_VISIT(traverse_module_state->__pyx_kp_s_All_dimensions_preceding_dimensi);
+  Py_VISIT(traverse_module_state->__pyx_n_s_AssertionError);
+  Py_VISIT(traverse_module_state->__pyx_kp_s_Buffer_view_does_not_expose_stri);
+  Py_VISIT(traverse_module_state->__pyx_kp_s_Can_only_create_a_buffer_that_is);
+  Py_VISIT(traverse_module_state->__pyx_kp_s_Cannot_assign_to_read_only_memor);
+  Py_VISIT(traverse_module_state->__pyx_kp_s_Cannot_create_writable_memory_vi);
+  Py_VISIT(traverse_module_state->__pyx_kp_u_Cannot_index_with_type);
+  Py_VISIT(traverse_module_state->__pyx_kp_s_Cannot_transpose_memoryview_with);
+  Py_VISIT(traverse_module_state->__pyx_n_s_DTYPE);
+  Py_VISIT(traverse_module_state->__pyx_n_s_DatasetSearcher);
+  Py_VISIT(traverse_module_state->__pyx_n_s_DatasetSearcher___reduce_cython);
+  Py_VISIT(traverse_module_state->__pyx_n_s_DatasetSearcher___setstate_cytho);
+  Py_VISIT(traverse_module_state->__pyx_kp_s_Dimension_d_is_not_direct);
+  Py_VISIT(traverse_module_state->__pyx_n_s_Ellipsis);
+  Py_VISIT(traverse_module_state->__pyx_kp_s_Empty_shape_tuple_for_cython_arr);
+  Py_VISIT(traverse_module_state->__pyx_n_s_ImportError);
+  Py_VISIT(traverse_module_state->__pyx_kp_s_Incompatible_checksums_0x_x_vs_0);
+  Py_VISIT(traverse_module_state->__pyx_kp_s_Incompatible_checksums_0x_x_vs_0_2);
+  Py_VISIT(traverse_module_state->__pyx_n_s_IndexError);
+  Py_VISIT(traverse_module_state->__pyx_kp_s_Index_out_of_bounds_axis_d);
+  Py_VISIT(traverse_module_state->__pyx_kp_s_Indirect_dimensions_not_supporte);
+  Py_VISIT(traverse_module_state->__pyx_kp_u_Invalid_break_mode);
+  Py_VISIT(traverse_module_state->__pyx_kp_u_Invalid_mode_expected_c_or_fortr);
+  Py_VISIT(traverse_module_state->__pyx_kp_u_Invalid_shape_in_axis);
+  Py_VISIT(traverse_module_state->__pyx_n_s_MemoryError);
+  Py_VISIT(traverse_module_state->__pyx_kp_s_MemoryView_of_r_at_0x_x);
+  Py_VISIT(traverse_module_state->__pyx_kp_s_MemoryView_of_r_object);
+  Py_VISIT(traverse_module_state->__pyx_n_b_O);
+  Py_VISIT(traverse_module_state->__pyx_kp_u_Out_of_bounds_on_buffer_access_a);
+  Py_VISIT(traverse_module_state->__pyx_n_s_PickleError);
+  Py_VISIT(traverse_module_state->__pyx_n_s_Sequence);
+  Py_VISIT(traverse_module_state->__pyx_kp_s_Step_may_not_be_zero_axis_d);
+  Py_VISIT(traverse_module_state->__pyx_n_s_TypeError);
+  Py_VISIT(traverse_module_state->__pyx_kp_s_Unable_to_convert_item_to_object);
+  Py_VISIT(traverse_module_state->__pyx_n_s_ValueError);
+  Py_VISIT(traverse_module_state->__pyx_n_s_View_MemoryView);
+  Py_VISIT(traverse_module_state->__pyx_kp_u__2);
+  Py_VISIT(traverse_module_state->__pyx_n_s__3);
+  Py_VISIT(traverse_module_state->__pyx_n_s__35);
+  Py_VISIT(traverse_module_state->__pyx_kp_u__6);
+  Py_VISIT(traverse_module_state->__pyx_kp_u__7);
+  Py_VISIT(traverse_module_state->__pyx_n_s_abc);
+  Py_VISIT(traverse_module_state->__pyx_n_s_allocate_buffer);
+  Py_VISIT(traverse_module_state->__pyx_kp_u_and);
+  Py_VISIT(traverse_module_state->__pyx_n_s_asyncio_coroutines);
+  Py_VISIT(traverse_module_state->__pyx_n_s_axis);
+  Py_VISIT(traverse_module_state->__pyx_n_s_base);
+  Py_VISIT(traverse_module_state->__pyx_n_s_block_size);
+  Py_VISIT(traverse_module_state->__pyx_n_s_break_mode);
+  Py_VISIT(traverse_module_state->__pyx_n_s_c);
+  Py_VISIT(traverse_module_state->__pyx_n_u_c);
+  Py_VISIT(traverse_module_state->__pyx_n_s_chain);
+  Py_VISIT(traverse_module_state->__pyx_n_s_class);
+  Py_VISIT(traverse_module_state->__pyx_n_s_class_getitem);
+  Py_VISIT(traverse_module_state->__pyx_n_s_cline_in_traceback);
+  Py_VISIT(traverse_module_state->__pyx_n_s_collections);
+  Py_VISIT(traverse_module_state->__pyx_kp_s_collections_abc);
+  Py_VISIT(traverse_module_state->__pyx_n_u_complete);
+  Py_VISIT(traverse_module_state->__pyx_n_u_complete_doc);
+  Py_VISIT(traverse_module_state->__pyx_kp_s_contiguous_and_direct);
+  Py_VISIT(traverse_module_state->__pyx_kp_s_contiguous_and_indirect);
+  Py_VISIT(traverse_module_state->__pyx_n_s_count);
+  Py_VISIT(traverse_module_state->__pyx_n_s_cumsum);
+  Py_VISIT(traverse_module_state->__pyx_n_s_dict);
+  Py_VISIT(traverse_module_state->__pyx_n_s_dict_2);
+  Py_VISIT(traverse_module_state->__pyx_kp_u_disable);
+  Py_VISIT(traverse_module_state->__pyx_n_s_document_sep_len);
+  Py_VISIT(traverse_module_state->__pyx_n_s_dtype);
+  Py_VISIT(traverse_module_state->__pyx_n_s_dtype_is_object);
+  Py_VISIT(traverse_module_state->__pyx_kp_u_enable);
+  Py_VISIT(traverse_module_state->__pyx_n_s_encode);
+  Py_VISIT(traverse_module_state->__pyx_n_s_enumerate);
+  Py_VISIT(traverse_module_state->__pyx_n_u_eos);
+  Py_VISIT(traverse_module_state->__pyx_n_s_error);
+  Py_VISIT(traverse_module_state->__pyx_kp_s_fairseq_data_token_block_utils_f);
+  Py_VISIT(traverse_module_state->__pyx_n_s_fairseq_data_token_block_utils_f_2);
+  Py_VISIT(traverse_module_state->__pyx_n_s_flags);
+  Py_VISIT(traverse_module_state->__pyx_n_s_format);
+  Py_VISIT(traverse_module_state->__pyx_n_s_fortran);
+  Py_VISIT(traverse_module_state->__pyx_n_u_fortran);
+  Py_VISIT(traverse_module_state->__pyx_n_s_from_iterable);
+  Py_VISIT(traverse_module_state->__pyx_n_s_fromiter);
+  Py_VISIT(traverse_module_state->__pyx_kp_u_gc);
+  Py_VISIT(traverse_module_state->__pyx_n_s_get_block_to_dataset_index_fast);
+  Py_VISIT(traverse_module_state->__pyx_n_s_get_slice_indices_fast);
+  Py_VISIT(traverse_module_state->__pyx_n_s_getstate);
+  Py_VISIT(traverse_module_state->__pyx_kp_u_got);
+  Py_VISIT(traverse_module_state->__pyx_kp_u_got_differing_extents_in_dimensi);
+  Py_VISIT(traverse_module_state->__pyx_n_s_id);
+  Py_VISIT(traverse_module_state->__pyx_n_s_import);
+  Py_VISIT(traverse_module_state->__pyx_n_s_index);
+  Py_VISIT(traverse_module_state->__pyx_n_s_initializing);
+  Py_VISIT(traverse_module_state->__pyx_n_s_int64);
+  Py_VISIT(traverse_module_state->__pyx_n_s_is_coroutine);
+  Py_VISIT(traverse_module_state->__pyx_kp_u_isenabled);
+  Py_VISIT(traverse_module_state->__pyx_n_s_itemsize);
+  Py_VISIT(traverse_module_state->__pyx_kp_s_itemsize_0_for_cython_array);
+  Py_VISIT(traverse_module_state->__pyx_n_s_itertools);
+  Py_VISIT(traverse_module_state->__pyx_n_s_main);
+  Py_VISIT(traverse_module_state->__pyx_n_s_memview);
+  Py_VISIT(traverse_module_state->__pyx_n_s_mode);
+  Py_VISIT(traverse_module_state->__pyx_n_s_name);
+  Py_VISIT(traverse_module_state->__pyx_n_s_name_2);
+  Py_VISIT(traverse_module_state->__pyx_n_s_ndim);
+  Py_VISIT(traverse_module_state->__pyx_n_s_new);
+  Py_VISIT(traverse_module_state->__pyx_kp_s_no_default___reduce___due_to_non);
+  Py_VISIT(traverse_module_state->__pyx_n_u_none);
+  Py_VISIT(traverse_module_state->__pyx_n_s_np);
+  Py_VISIT(traverse_module_state->__pyx_n_s_numpy);
+  Py_VISIT(traverse_module_state->__pyx_kp_u_numpy__core_multiarray_failed_to);
+  Py_VISIT(traverse_module_state->__pyx_kp_u_numpy__core_umath_failed_to_impo);
+  Py_VISIT(traverse_module_state->__pyx_n_s_obj);
+  Py_VISIT(traverse_module_state->__pyx_n_s_pack);
+  Py_VISIT(traverse_module_state->__pyx_n_s_pickle);
+  Py_VISIT(traverse_module_state->__pyx_n_s_pyx_PickleError);
+  Py_VISIT(traverse_module_state->__pyx_n_s_pyx_checksum);
+  Py_VISIT(traverse_module_state->__pyx_n_s_pyx_result);
+  Py_VISIT(traverse_module_state->__pyx_n_s_pyx_state);
+  Py_VISIT(traverse_module_state->__pyx_n_s_pyx_type);
+  Py_VISIT(traverse_module_state->__pyx_n_s_pyx_unpickle_DatasetSearcher);
+  Py_VISIT(traverse_module_state->__pyx_n_s_pyx_unpickle_Enum);
+  Py_VISIT(traverse_module_state->__pyx_n_s_pyx_vtable);
+  Py_VISIT(traverse_module_state->__pyx_n_s_range);
+  Py_VISIT(traverse_module_state->__pyx_n_s_reduce);
+  Py_VISIT(traverse_module_state->__pyx_n_s_reduce_cython);
+  Py_VISIT(traverse_module_state->__pyx_n_s_reduce_ex);
+  Py_VISIT(traverse_module_state->__pyx_n_s_register);
+  Py_VISIT(traverse_module_state->__pyx_n_s_reshape);
+  Py_VISIT(traverse_module_state->__pyx_n_s_self);
+  Py_VISIT(traverse_module_state->__pyx_n_s_setstate);
+  Py_VISIT(traverse_module_state->__pyx_n_s_setstate_cython);
+  Py_VISIT(traverse_module_state->__pyx_n_s_shape);
+  Py_VISIT(traverse_module_state->__pyx_n_s_size);
+  Py_VISIT(traverse_module_state->__pyx_n_s_sizes);
+  Py_VISIT(traverse_module_state->__pyx_n_s_slice_indices);
+  Py_VISIT(traverse_module_state->__pyx_n_s_spec);
+  Py_VISIT(traverse_module_state->__pyx_n_s_start);
+  Py_VISIT(traverse_module_state->__pyx_n_s_state);
+  Py_VISIT(traverse_module_state->__pyx_n_s_step);
+  Py_VISIT(traverse_module_state->__pyx_n_s_stop);
+  Py_VISIT(traverse_module_state->__pyx_kp_s_strided_and_direct);
+  Py_VISIT(traverse_module_state->__pyx_kp_s_strided_and_direct_or_indirect);
+  Py_VISIT(traverse_module_state->__pyx_kp_s_strided_and_indirect);
+  Py_VISIT(traverse_module_state->__pyx_kp_s_stringsource);
+  Py_VISIT(traverse_module_state->__pyx_n_s_struct);
+  Py_VISIT(traverse_module_state->__pyx_n_s_sum);
+  Py_VISIT(traverse_module_state->__pyx_n_s_sys);
+  Py_VISIT(traverse_module_state->__pyx_n_s_test);
+  Py_VISIT(traverse_module_state->__pyx_n_s_torch);
+  Py_VISIT(traverse_module_state->__pyx_kp_s_unable_to_allocate_array_data);
+  Py_VISIT(traverse_module_state->__pyx_kp_s_unable_to_allocate_shape_and_str);
+  Py_VISIT(traverse_module_state->__pyx_n_s_unpack);
+  Py_VISIT(traverse_module_state->__pyx_n_s_update);
+  Py_VISIT(traverse_module_state->__pyx_n_s_use_setstate);
+  Py_VISIT(traverse_module_state->__pyx_n_s_version_info);
+  Py_VISIT(traverse_module_state->__pyx_n_s_zeros);
+  Py_VISIT(traverse_module_state->__pyx_int_0); /* __pyx_int_* fields start here */
+  Py_VISIT(traverse_module_state->__pyx_int_1);
+  Py_VISIT(traverse_module_state->__pyx_int_2);
+  Py_VISIT(traverse_module_state->__pyx_int_3);
+  Py_VISIT(traverse_module_state->__pyx_int_48422178);
+  Py_VISIT(traverse_module_state->__pyx_int_107161605);
+  Py_VISIT(traverse_module_state->__pyx_int_112105877);
+  Py_VISIT(traverse_module_state->__pyx_int_136983863);
+  Py_VISIT(traverse_module_state->__pyx_int_147225413);
+  Py_VISIT(traverse_module_state->__pyx_int_184977713);
+  Py_VISIT(traverse_module_state->__pyx_int_neg_1);
+  Py_VISIT(traverse_module_state->__pyx_slice__5); /* __pyx_slice__* / __pyx_tuple__* fields start here */
+  Py_VISIT(traverse_module_state->__pyx_tuple__4);
+  Py_VISIT(traverse_module_state->__pyx_tuple__8);
+  Py_VISIT(traverse_module_state->__pyx_tuple__9);
+  Py_VISIT(traverse_module_state->__pyx_slice__11);
+  Py_VISIT(traverse_module_state->__pyx_tuple__10);
+  Py_VISIT(traverse_module_state->__pyx_tuple__12);
+  Py_VISIT(traverse_module_state->__pyx_tuple__13);
+  Py_VISIT(traverse_module_state->__pyx_tuple__14);
+  Py_VISIT(traverse_module_state->__pyx_tuple__15);
+  Py_VISIT(traverse_module_state->__pyx_tuple__16);
+  Py_VISIT(traverse_module_state->__pyx_tuple__17);
+  Py_VISIT(traverse_module_state->__pyx_tuple__18);
+  Py_VISIT(traverse_module_state->__pyx_tuple__19);
+  Py_VISIT(traverse_module_state->__pyx_tuple__20);
+  Py_VISIT(traverse_module_state->__pyx_tuple__21);
+  Py_VISIT(traverse_module_state->__pyx_tuple__22);
+  Py_VISIT(traverse_module_state->__pyx_tuple__23);
+  Py_VISIT(traverse_module_state->__pyx_tuple__24);
+  Py_VISIT(traverse_module_state->__pyx_tuple__26);
+  Py_VISIT(traverse_module_state->__pyx_tuple__28);
+  Py_VISIT(traverse_module_state->__pyx_tuple__30);
+  Py_VISIT(traverse_module_state->__pyx_tuple__32);
+  Py_VISIT(traverse_module_state->__pyx_codeobj__25); /* __pyx_codeobj__* fields start here */
+  Py_VISIT(traverse_module_state->__pyx_codeobj__27);
+  Py_VISIT(traverse_module_state->__pyx_codeobj__29);
+  Py_VISIT(traverse_module_state->__pyx_codeobj__31);
+  Py_VISIT(traverse_module_state->__pyx_codeobj__33);
+  Py_VISIT(traverse_module_state->__pyx_codeobj__34);
+  return 0; /* reached only when no Py_VISIT above returned early */
+}
+#endif /* CYTHON_USE_MODULE_STATE */
+/* #### Code section: module_state_defines ### */ /* each macro below maps a bare name to the same-named field of __pyx_mstate_global */
+#define __pyx_d __pyx_mstate_global->__pyx_d
+#define __pyx_b __pyx_mstate_global->__pyx_b
+#define __pyx_cython_runtime __pyx_mstate_global->__pyx_cython_runtime
+#define __pyx_empty_tuple __pyx_mstate_global->__pyx_empty_tuple
+#define __pyx_empty_bytes __pyx_mstate_global->__pyx_empty_bytes
+#define __pyx_empty_unicode __pyx_mstate_global->__pyx_empty_unicode
+#ifdef __Pyx_CyFunction_USED
+#define __pyx_CyFunctionType __pyx_mstate_global->__pyx_CyFunctionType
+#endif /* __Pyx_CyFunction_USED */
+#ifdef __Pyx_FusedFunction_USED
+#define __pyx_FusedFunctionType __pyx_mstate_global->__pyx_FusedFunctionType
+#endif /* __Pyx_FusedFunction_USED */
+#ifdef __Pyx_Generator_USED
+#define __pyx_GeneratorType __pyx_mstate_global->__pyx_GeneratorType
+#endif /* __Pyx_Generator_USED */
+#ifdef __Pyx_IterableCoroutine_USED
+#define __pyx_IterableCoroutineType __pyx_mstate_global->__pyx_IterableCoroutineType
+#endif /* __Pyx_IterableCoroutine_USED */
+#ifdef __Pyx_Coroutine_USED
+#define __pyx_CoroutineAwaitType __pyx_mstate_global->__pyx_CoroutineAwaitType
+#endif /* __Pyx_Coroutine_USED */
+#ifdef __Pyx_Coroutine_USED
+#define __pyx_CoroutineType __pyx_mstate_global->__pyx_CoroutineType
+#endif /* __Pyx_Coroutine_USED */
+#if CYTHON_USE_MODULE_STATE /* the empty #if/#endif pairs in this section define nothing */
+#endif
+#if CYTHON_USE_MODULE_STATE
+#endif
+#if CYTHON_USE_MODULE_STATE
+#endif
+#if CYTHON_USE_MODULE_STATE
+#endif
+#if CYTHON_USE_MODULE_STATE
+#endif
+#if CYTHON_USE_MODULE_STATE
+#endif
+#if CYTHON_USE_MODULE_STATE
+#endif
+#if CYTHON_USE_MODULE_STATE
+#endif
+#define __pyx_ptype_7cpython_4type_type __pyx_mstate_global->__pyx_ptype_7cpython_4type_type
+#if CYTHON_USE_MODULE_STATE
+#endif
+#if CYTHON_USE_MODULE_STATE
+#endif
+#if CYTHON_USE_MODULE_STATE
+#endif
+#if CYTHON_USE_MODULE_STATE
+#endif
+#if CYTHON_USE_MODULE_STATE
+#endif
+#define __pyx_ptype_5numpy_dtype __pyx_mstate_global->__pyx_ptype_5numpy_dtype
+#define __pyx_ptype_5numpy_flatiter __pyx_mstate_global->__pyx_ptype_5numpy_flatiter
+#define __pyx_ptype_5numpy_broadcast __pyx_mstate_global->__pyx_ptype_5numpy_broadcast
+#define __pyx_ptype_5numpy_ndarray __pyx_mstate_global->__pyx_ptype_5numpy_ndarray
+#define __pyx_ptype_5numpy_generic __pyx_mstate_global->__pyx_ptype_5numpy_generic
+#define __pyx_ptype_5numpy_number __pyx_mstate_global->__pyx_ptype_5numpy_number
+#define __pyx_ptype_5numpy_integer __pyx_mstate_global->__pyx_ptype_5numpy_integer
+#define __pyx_ptype_5numpy_signedinteger __pyx_mstate_global->__pyx_ptype_5numpy_signedinteger
+#define __pyx_ptype_5numpy_unsignedinteger __pyx_mstate_global->__pyx_ptype_5numpy_unsignedinteger
+#define __pyx_ptype_5numpy_inexact __pyx_mstate_global->__pyx_ptype_5numpy_inexact
+#define __pyx_ptype_5numpy_floating __pyx_mstate_global->__pyx_ptype_5numpy_floating
+#define __pyx_ptype_5numpy_complexfloating __pyx_mstate_global->__pyx_ptype_5numpy_complexfloating
+#define __pyx_ptype_5numpy_flexible __pyx_mstate_global->__pyx_ptype_5numpy_flexible
+#define __pyx_ptype_5numpy_character __pyx_mstate_global->__pyx_ptype_5numpy_character
+#define __pyx_ptype_5numpy_ufunc __pyx_mstate_global->__pyx_ptype_5numpy_ufunc
+#if CYTHON_USE_MODULE_STATE /* the __pyx_type_* aliases below exist only when module state is enabled */
+#define __pyx_type_7fairseq_4data_22token_block_utils_fast_DatasetSearcher __pyx_mstate_global->__pyx_type_7fairseq_4data_22token_block_utils_fast_DatasetSearcher
+#define __pyx_type___pyx_array __pyx_mstate_global->__pyx_type___pyx_array
+#define __pyx_type___pyx_MemviewEnum __pyx_mstate_global->__pyx_type___pyx_MemviewEnum
+#define __pyx_type___pyx_memoryview __pyx_mstate_global->__pyx_type___pyx_memoryview
+#define __pyx_type___pyx_memoryviewslice __pyx_mstate_global->__pyx_type___pyx_memoryviewslice
+#endif /* CYTHON_USE_MODULE_STATE */
+#define __pyx_ptype_7fairseq_4data_22token_block_utils_fast_DatasetSearcher __pyx_mstate_global->__pyx_ptype_7fairseq_4data_22token_block_utils_fast_DatasetSearcher
+#define __pyx_array_type __pyx_mstate_global->__pyx_array_type
+#define __pyx_MemviewEnum_type __pyx_mstate_global->__pyx_MemviewEnum_type
+#define __pyx_memoryview_type __pyx_mstate_global->__pyx_memoryview_type
+#define __pyx_memoryviewslice_type __pyx_mstate_global->__pyx_memoryviewslice_type
+#define __pyx_kp_u_ __pyx_mstate_global->__pyx_kp_u_ /* from here on: aliases for the __pyx_kp_* / __pyx_n_* module-state fields, in the same order __pyx_m_traverse visits them */
+#define __pyx_n_s_ASCII __pyx_mstate_global->__pyx_n_s_ASCII
+#define __pyx_kp_s_All_dimensions_preceding_dimensi __pyx_mstate_global->__pyx_kp_s_All_dimensions_preceding_dimensi
+#define __pyx_n_s_AssertionError __pyx_mstate_global->__pyx_n_s_AssertionError
+#define __pyx_kp_s_Buffer_view_does_not_expose_stri __pyx_mstate_global->__pyx_kp_s_Buffer_view_does_not_expose_stri
+#define __pyx_kp_s_Can_only_create_a_buffer_that_is __pyx_mstate_global->__pyx_kp_s_Can_only_create_a_buffer_that_is
+#define __pyx_kp_s_Cannot_assign_to_read_only_memor __pyx_mstate_global->__pyx_kp_s_Cannot_assign_to_read_only_memor
+#define __pyx_kp_s_Cannot_create_writable_memory_vi __pyx_mstate_global->__pyx_kp_s_Cannot_create_writable_memory_vi
+#define __pyx_kp_u_Cannot_index_with_type __pyx_mstate_global->__pyx_kp_u_Cannot_index_with_type
+#define __pyx_kp_s_Cannot_transpose_memoryview_with __pyx_mstate_global->__pyx_kp_s_Cannot_transpose_memoryview_with
+#define __pyx_n_s_DTYPE __pyx_mstate_global->__pyx_n_s_DTYPE
+#define __pyx_n_s_DatasetSearcher __pyx_mstate_global->__pyx_n_s_DatasetSearcher
+#define __pyx_n_s_DatasetSearcher___reduce_cython __pyx_mstate_global->__pyx_n_s_DatasetSearcher___reduce_cython
+#define __pyx_n_s_DatasetSearcher___setstate_cytho __pyx_mstate_global->__pyx_n_s_DatasetSearcher___setstate_cytho
+#define __pyx_kp_s_Dimension_d_is_not_direct __pyx_mstate_global->__pyx_kp_s_Dimension_d_is_not_direct
+#define __pyx_n_s_Ellipsis __pyx_mstate_global->__pyx_n_s_Ellipsis
+#define __pyx_kp_s_Empty_shape_tuple_for_cython_arr __pyx_mstate_global->__pyx_kp_s_Empty_shape_tuple_for_cython_arr
+#define __pyx_n_s_ImportError __pyx_mstate_global->__pyx_n_s_ImportError
+#define __pyx_kp_s_Incompatible_checksums_0x_x_vs_0 __pyx_mstate_global->__pyx_kp_s_Incompatible_checksums_0x_x_vs_0
+#define __pyx_kp_s_Incompatible_checksums_0x_x_vs_0_2 __pyx_mstate_global->__pyx_kp_s_Incompatible_checksums_0x_x_vs_0_2
+#define __pyx_n_s_IndexError __pyx_mstate_global->__pyx_n_s_IndexError
+#define __pyx_kp_s_Index_out_of_bounds_axis_d __pyx_mstate_global->__pyx_kp_s_Index_out_of_bounds_axis_d
+#define __pyx_kp_s_Indirect_dimensions_not_supporte __pyx_mstate_global->__pyx_kp_s_Indirect_dimensions_not_supporte
+#define __pyx_kp_u_Invalid_break_mode __pyx_mstate_global->__pyx_kp_u_Invalid_break_mode
+#define __pyx_kp_u_Invalid_mode_expected_c_or_fortr __pyx_mstate_global->__pyx_kp_u_Invalid_mode_expected_c_or_fortr
+#define __pyx_kp_u_Invalid_shape_in_axis __pyx_mstate_global->__pyx_kp_u_Invalid_shape_in_axis
+#define __pyx_n_s_MemoryError __pyx_mstate_global->__pyx_n_s_MemoryError
+#define __pyx_kp_s_MemoryView_of_r_at_0x_x __pyx_mstate_global->__pyx_kp_s_MemoryView_of_r_at_0x_x
+#define __pyx_kp_s_MemoryView_of_r_object __pyx_mstate_global->__pyx_kp_s_MemoryView_of_r_object
+#define __pyx_n_b_O __pyx_mstate_global->__pyx_n_b_O
+#define __pyx_kp_u_Out_of_bounds_on_buffer_access_a __pyx_mstate_global->__pyx_kp_u_Out_of_bounds_on_buffer_access_a
+#define __pyx_n_s_PickleError __pyx_mstate_global->__pyx_n_s_PickleError
+#define __pyx_n_s_Sequence __pyx_mstate_global->__pyx_n_s_Sequence
+#define __pyx_kp_s_Step_may_not_be_zero_axis_d __pyx_mstate_global->__pyx_kp_s_Step_may_not_be_zero_axis_d
+#define __pyx_n_s_TypeError __pyx_mstate_global->__pyx_n_s_TypeError
+#define __pyx_kp_s_Unable_to_convert_item_to_object __pyx_mstate_global->__pyx_kp_s_Unable_to_convert_item_to_object
+#define __pyx_n_s_ValueError __pyx_mstate_global->__pyx_n_s_ValueError
+#define __pyx_n_s_View_MemoryView __pyx_mstate_global->__pyx_n_s_View_MemoryView
+#define __pyx_kp_u__2 __pyx_mstate_global->__pyx_kp_u__2
+#define __pyx_n_s__3 __pyx_mstate_global->__pyx_n_s__3
+#define __pyx_n_s__35 __pyx_mstate_global->__pyx_n_s__35
+#define __pyx_kp_u__6 __pyx_mstate_global->__pyx_kp_u__6
+#define __pyx_kp_u__7 __pyx_mstate_global->__pyx_kp_u__7
+#define __pyx_n_s_abc __pyx_mstate_global->__pyx_n_s_abc
+#define __pyx_n_s_allocate_buffer __pyx_mstate_global->__pyx_n_s_allocate_buffer
+#define __pyx_kp_u_and __pyx_mstate_global->__pyx_kp_u_and
+#define __pyx_n_s_asyncio_coroutines __pyx_mstate_global->__pyx_n_s_asyncio_coroutines
+#define __pyx_n_s_axis __pyx_mstate_global->__pyx_n_s_axis
+#define __pyx_n_s_base __pyx_mstate_global->__pyx_n_s_base
+#define __pyx_n_s_block_size __pyx_mstate_global->__pyx_n_s_block_size
+#define __pyx_n_s_break_mode __pyx_mstate_global->__pyx_n_s_break_mode
+#define __pyx_n_s_c __pyx_mstate_global->__pyx_n_s_c
+#define __pyx_n_u_c __pyx_mstate_global->__pyx_n_u_c
+#define __pyx_n_s_chain __pyx_mstate_global->__pyx_n_s_chain
+#define __pyx_n_s_class __pyx_mstate_global->__pyx_n_s_class
+#define __pyx_n_s_class_getitem __pyx_mstate_global->__pyx_n_s_class_getitem
+#define __pyx_n_s_cline_in_traceback __pyx_mstate_global->__pyx_n_s_cline_in_traceback
+#define __pyx_n_s_collections __pyx_mstate_global->__pyx_n_s_collections
+#define __pyx_kp_s_collections_abc __pyx_mstate_global->__pyx_kp_s_collections_abc
+#define __pyx_n_u_complete __pyx_mstate_global->__pyx_n_u_complete
+#define __pyx_n_u_complete_doc __pyx_mstate_global->__pyx_n_u_complete_doc
+#define __pyx_kp_s_contiguous_and_direct __pyx_mstate_global->__pyx_kp_s_contiguous_and_direct
+#define __pyx_kp_s_contiguous_and_indirect __pyx_mstate_global->__pyx_kp_s_contiguous_and_indirect
+#define __pyx_n_s_count __pyx_mstate_global->__pyx_n_s_count
+#define __pyx_n_s_cumsum __pyx_mstate_global->__pyx_n_s_cumsum
+#define __pyx_n_s_dict __pyx_mstate_global->__pyx_n_s_dict
+#define __pyx_n_s_dict_2 __pyx_mstate_global->__pyx_n_s_dict_2
+#define __pyx_kp_u_disable __pyx_mstate_global->__pyx_kp_u_disable
+#define __pyx_n_s_document_sep_len __pyx_mstate_global->__pyx_n_s_document_sep_len
+#define __pyx_n_s_dtype __pyx_mstate_global->__pyx_n_s_dtype
+#define __pyx_n_s_dtype_is_object __pyx_mstate_global->__pyx_n_s_dtype_is_object
+#define __pyx_kp_u_enable __pyx_mstate_global->__pyx_kp_u_enable
+#define __pyx_n_s_encode __pyx_mstate_global->__pyx_n_s_encode
+#define __pyx_n_s_enumerate __pyx_mstate_global->__pyx_n_s_enumerate
+#define __pyx_n_u_eos __pyx_mstate_global->__pyx_n_u_eos
+#define __pyx_n_s_error __pyx_mstate_global->__pyx_n_s_error
+#define __pyx_kp_s_fairseq_data_token_block_utils_f __pyx_mstate_global->__pyx_kp_s_fairseq_data_token_block_utils_f
+#define __pyx_n_s_fairseq_data_token_block_utils_f_2 __pyx_mstate_global->__pyx_n_s_fairseq_data_token_block_utils_f_2
+#define __pyx_n_s_flags __pyx_mstate_global->__pyx_n_s_flags
+#define __pyx_n_s_format __pyx_mstate_global->__pyx_n_s_format
+#define __pyx_n_s_fortran __pyx_mstate_global->__pyx_n_s_fortran
+#define __pyx_n_u_fortran __pyx_mstate_global->__pyx_n_u_fortran
+#define __pyx_n_s_from_iterable __pyx_mstate_global->__pyx_n_s_from_iterable
+#define __pyx_n_s_fromiter __pyx_mstate_global->__pyx_n_s_fromiter
+#define __pyx_kp_u_gc __pyx_mstate_global->__pyx_kp_u_gc
+#define __pyx_n_s_get_block_to_dataset_index_fast __pyx_mstate_global->__pyx_n_s_get_block_to_dataset_index_fast
+#define __pyx_n_s_get_slice_indices_fast __pyx_mstate_global->__pyx_n_s_get_slice_indices_fast
+#define __pyx_n_s_getstate __pyx_mstate_global->__pyx_n_s_getstate
+#define __pyx_kp_u_got __pyx_mstate_global->__pyx_kp_u_got
+#define __pyx_kp_u_got_differing_extents_in_dimensi __pyx_mstate_global->__pyx_kp_u_got_differing_extents_in_dimensi
+#define __pyx_n_s_id __pyx_mstate_global->__pyx_n_s_id
+#define __pyx_n_s_import __pyx_mstate_global->__pyx_n_s_import
+#define __pyx_n_s_index __pyx_mstate_global->__pyx_n_s_index
+#define __pyx_n_s_initializing __pyx_mstate_global->__pyx_n_s_initializing
+#define __pyx_n_s_int64 __pyx_mstate_global->__pyx_n_s_int64
+#define __pyx_n_s_is_coroutine __pyx_mstate_global->__pyx_n_s_is_coroutine
+#define __pyx_kp_u_isenabled __pyx_mstate_global->__pyx_kp_u_isenabled
+#define __pyx_n_s_itemsize __pyx_mstate_global->__pyx_n_s_itemsize
+#define __pyx_kp_s_itemsize_0_for_cython_array __pyx_mstate_global->__pyx_kp_s_itemsize_0_for_cython_array
+#define __pyx_n_s_itertools __pyx_mstate_global->__pyx_n_s_itertools
+#define __pyx_n_s_main __pyx_mstate_global->__pyx_n_s_main
+#define __pyx_n_s_memview __pyx_mstate_global->__pyx_n_s_memview
+#define __pyx_n_s_mode __pyx_mstate_global->__pyx_n_s_mode
+#define __pyx_n_s_name __pyx_mstate_global->__pyx_n_s_name
+#define __pyx_n_s_name_2 __pyx_mstate_global->__pyx_n_s_name_2
+#define __pyx_n_s_ndim __pyx_mstate_global->__pyx_n_s_ndim
+#define __pyx_n_s_new __pyx_mstate_global->__pyx_n_s_new
+#define __pyx_kp_s_no_default___reduce___due_to_non __pyx_mstate_global->__pyx_kp_s_no_default___reduce___due_to_non
+#define __pyx_n_u_none __pyx_mstate_global->__pyx_n_u_none
+#define __pyx_n_s_np __pyx_mstate_global->__pyx_n_s_np
+#define __pyx_n_s_numpy __pyx_mstate_global->__pyx_n_s_numpy
+#define __pyx_kp_u_numpy__core_multiarray_failed_to __pyx_mstate_global->__pyx_kp_u_numpy__core_multiarray_failed_to
+#define __pyx_kp_u_numpy__core_umath_failed_to_impo __pyx_mstate_global->__pyx_kp_u_numpy__core_umath_failed_to_impo
+#define __pyx_n_s_obj __pyx_mstate_global->__pyx_n_s_obj
+#define __pyx_n_s_pack __pyx_mstate_global->__pyx_n_s_pack
+#define __pyx_n_s_pickle __pyx_mstate_global->__pyx_n_s_pickle
+#define __pyx_n_s_pyx_PickleError __pyx_mstate_global->__pyx_n_s_pyx_PickleError
+#define __pyx_n_s_pyx_checksum __pyx_mstate_global->__pyx_n_s_pyx_checksum
+#define __pyx_n_s_pyx_result __pyx_mstate_global->__pyx_n_s_pyx_result
+#define __pyx_n_s_pyx_state __pyx_mstate_global->__pyx_n_s_pyx_state
+#define __pyx_n_s_pyx_type __pyx_mstate_global->__pyx_n_s_pyx_type
+#define __pyx_n_s_pyx_unpickle_DatasetSearcher __pyx_mstate_global->__pyx_n_s_pyx_unpickle_DatasetSearcher
+#define __pyx_n_s_pyx_unpickle_Enum __pyx_mstate_global->__pyx_n_s_pyx_unpickle_Enum
+#define __pyx_n_s_pyx_vtable __pyx_mstate_global->__pyx_n_s_pyx_vtable
+#define __pyx_n_s_range __pyx_mstate_global->__pyx_n_s_range
+#define __pyx_n_s_reduce __pyx_mstate_global->__pyx_n_s_reduce
+#define __pyx_n_s_reduce_cython __pyx_mstate_global->__pyx_n_s_reduce_cython
+#define __pyx_n_s_reduce_ex __pyx_mstate_global->__pyx_n_s_reduce_ex
+#define __pyx_n_s_register __pyx_mstate_global->__pyx_n_s_register
+#define __pyx_n_s_reshape __pyx_mstate_global->__pyx_n_s_reshape
+#define __pyx_n_s_self __pyx_mstate_global->__pyx_n_s_self
+#define __pyx_n_s_setstate __pyx_mstate_global->__pyx_n_s_setstate
+#define __pyx_n_s_setstate_cython __pyx_mstate_global->__pyx_n_s_setstate_cython
+#define __pyx_n_s_shape __pyx_mstate_global->__pyx_n_s_shape
+#define __pyx_n_s_size __pyx_mstate_global->__pyx_n_s_size
+#define __pyx_n_s_sizes __pyx_mstate_global->__pyx_n_s_sizes
+#define __pyx_n_s_slice_indices __pyx_mstate_global->__pyx_n_s_slice_indices
+#define __pyx_n_s_spec __pyx_mstate_global->__pyx_n_s_spec
+#define __pyx_n_s_start __pyx_mstate_global->__pyx_n_s_start
+#define __pyx_n_s_state __pyx_mstate_global->__pyx_n_s_state
+#define __pyx_n_s_step __pyx_mstate_global->__pyx_n_s_step
+#define __pyx_n_s_stop __pyx_mstate_global->__pyx_n_s_stop
+#define __pyx_kp_s_strided_and_direct __pyx_mstate_global->__pyx_kp_s_strided_and_direct
+#define __pyx_kp_s_strided_and_direct_or_indirect __pyx_mstate_global->__pyx_kp_s_strided_and_direct_or_indirect
+#define __pyx_kp_s_strided_and_indirect __pyx_mstate_global->__pyx_kp_s_strided_and_indirect
+#define __pyx_kp_s_stringsource __pyx_mstate_global->__pyx_kp_s_stringsource
+#define __pyx_n_s_struct __pyx_mstate_global->__pyx_n_s_struct
+#define __pyx_n_s_sum __pyx_mstate_global->__pyx_n_s_sum
+#define __pyx_n_s_sys __pyx_mstate_global->__pyx_n_s_sys
+#define __pyx_n_s_test __pyx_mstate_global->__pyx_n_s_test
+#define __pyx_n_s_torch __pyx_mstate_global->__pyx_n_s_torch
+#define __pyx_kp_s_unable_to_allocate_array_data __pyx_mstate_global->__pyx_kp_s_unable_to_allocate_array_data
+#define __pyx_kp_s_unable_to_allocate_shape_and_str __pyx_mstate_global->__pyx_kp_s_unable_to_allocate_shape_and_str
+#define __pyx_n_s_unpack __pyx_mstate_global->__pyx_n_s_unpack
+#define __pyx_n_s_update __pyx_mstate_global->__pyx_n_s_update
+#define __pyx_n_s_use_setstate __pyx_mstate_global->__pyx_n_s_use_setstate
+#define __pyx_n_s_version_info __pyx_mstate_global->__pyx_n_s_version_info
+#define __pyx_n_s_zeros __pyx_mstate_global->__pyx_n_s_zeros
+#define __pyx_int_0 __pyx_mstate_global->__pyx_int_0
+#define __pyx_int_1 __pyx_mstate_global->__pyx_int_1
+#define __pyx_int_2 __pyx_mstate_global->__pyx_int_2
+#define __pyx_int_3 __pyx_mstate_global->__pyx_int_3
+#define __pyx_int_48422178 __pyx_mstate_global->__pyx_int_48422178
+#define __pyx_int_107161605 __pyx_mstate_global->__pyx_int_107161605
+#define __pyx_int_112105877 __pyx_mstate_global->__pyx_int_112105877
+#define __pyx_int_136983863 __pyx_mstate_global->__pyx_int_136983863
+#define __pyx_int_147225413 __pyx_mstate_global->__pyx_int_147225413
+#define __pyx_int_184977713 __pyx_mstate_global->__pyx_int_184977713
+#define __pyx_int_neg_1 __pyx_mstate_global->__pyx_int_neg_1
+#define __pyx_slice__5 __pyx_mstate_global->__pyx_slice__5
+#define __pyx_tuple__4 __pyx_mstate_global->__pyx_tuple__4
+#define __pyx_tuple__8 __pyx_mstate_global->__pyx_tuple__8
+#define __pyx_tuple__9 __pyx_mstate_global->__pyx_tuple__9
+#define __pyx_slice__11 __pyx_mstate_global->__pyx_slice__11
+#define __pyx_tuple__10 __pyx_mstate_global->__pyx_tuple__10
+#define __pyx_tuple__12 __pyx_mstate_global->__pyx_tuple__12
+#define __pyx_tuple__13 __pyx_mstate_global->__pyx_tuple__13
+#define __pyx_tuple__14 __pyx_mstate_global->__pyx_tuple__14
+#define __pyx_tuple__15 __pyx_mstate_global->__pyx_tuple__15
+#define __pyx_tuple__16 __pyx_mstate_global->__pyx_tuple__16
+#define __pyx_tuple__17 __pyx_mstate_global->__pyx_tuple__17
+#define __pyx_tuple__18 __pyx_mstate_global->__pyx_tuple__18
+#define __pyx_tuple__19 __pyx_mstate_global->__pyx_tuple__19
+#define __pyx_tuple__20 __pyx_mstate_global->__pyx_tuple__20
+#define __pyx_tuple__21 __pyx_mstate_global->__pyx_tuple__21
+#define __pyx_tuple__22 __pyx_mstate_global->__pyx_tuple__22
+#define __pyx_tuple__23 __pyx_mstate_global->__pyx_tuple__23
+#define __pyx_tuple__24 __pyx_mstate_global->__pyx_tuple__24
+#define __pyx_tuple__26 __pyx_mstate_global->__pyx_tuple__26
+#define __pyx_tuple__28 __pyx_mstate_global->__pyx_tuple__28
+#define __pyx_tuple__30 __pyx_mstate_global->__pyx_tuple__30
+#define __pyx_tuple__32 __pyx_mstate_global->__pyx_tuple__32
+#define __pyx_codeobj__25 __pyx_mstate_global->__pyx_codeobj__25
+#define __pyx_codeobj__27 __pyx_mstate_global->__pyx_codeobj__27
+#define __pyx_codeobj__29 __pyx_mstate_global->__pyx_codeobj__29
+#define __pyx_codeobj__31 __pyx_mstate_global->__pyx_codeobj__31
+#define __pyx_codeobj__33 __pyx_mstate_global->__pyx_codeobj__33
+#define __pyx_codeobj__34 __pyx_mstate_global->__pyx_codeobj__34
+/* #### Code section: module_code ### */
+
+/* "View.MemoryView":131
+ *         cdef bint dtype_is_object
+ * 
+ *     def __cinit__(array self, tuple shape, Py_ssize_t itemsize, format not None,             # <<<<<<<<<<<<<<
+ *                   mode="c", bint allocate_buffer=True):
+ * 
+ */
+
+/* Python wrapper for array.__cinit__(shape, itemsize, format, mode="c", allocate_buffer=True): unpacks the positional tuple and keyword dict into values[], converts itemsize and allocate_buffer to C values, validates shape and format, then calls the implementation function. Returns the implementation's result, or -1 when argument handling fails. */
+static int __pyx_array___cinit__(PyObject *__pyx_v_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/
+static int __pyx_array___cinit__(PyObject *__pyx_v_self, PyObject *__pyx_args, PyObject *__pyx_kwds) {
+  PyObject *__pyx_v_shape = 0;
+  Py_ssize_t __pyx_v_itemsize;
+  PyObject *__pyx_v_format = 0;
+  PyObject *__pyx_v_mode = 0;
+  int __pyx_v_allocate_buffer;
+  CYTHON_UNUSED Py_ssize_t __pyx_nargs;
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject* values[5] = {0,0,0,0,0}; /* slots: 0=shape, 1=itemsize, 2=format, 3=mode, 4=allocate_buffer (same order as __pyx_pyargnames below) */
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  int __pyx_r;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__cinit__ (wrapper)", 0);
+  #if CYTHON_ASSUME_SAFE_MACROS
+  __pyx_nargs = PyTuple_GET_SIZE(__pyx_args);
+  #else
+  __pyx_nargs = PyTuple_Size(__pyx_args); if (unlikely(__pyx_nargs < 0)) return -1;
+  #endif
+  __pyx_kwvalues = __Pyx_KwValues_VARARGS(__pyx_args, __pyx_nargs);
+  {
+    PyObject **__pyx_pyargnames[] = {&__pyx_n_s_shape,&__pyx_n_s_itemsize,&__pyx_n_s_format,&__pyx_n_s_mode,&__pyx_n_s_allocate_buffer,0};
+    values[3] = __Pyx_Arg_NewRef_VARARGS(((PyObject *)__pyx_n_s_c)); /* pre-load the default for mode (mode="c" in the .pyx signature) */
+    if (__pyx_kwds) {
+      Py_ssize_t kw_args;
+      switch (__pyx_nargs) { /* copy the positional arguments; cases fall through from the highest index down */
+        case  5: values[4] = __Pyx_Arg_VARARGS(__pyx_args, 4);
+        CYTHON_FALLTHROUGH;
+        case  4: values[3] = __Pyx_Arg_VARARGS(__pyx_args, 3);
+        CYTHON_FALLTHROUGH;
+        case  3: values[2] = __Pyx_Arg_VARARGS(__pyx_args, 2);
+        CYTHON_FALLTHROUGH;
+        case  2: values[1] = __Pyx_Arg_VARARGS(__pyx_args, 1);
+        CYTHON_FALLTHROUGH;
+        case  1: values[0] = __Pyx_Arg_VARARGS(__pyx_args, 0);
+        CYTHON_FALLTHROUGH;
+        case  0: break;
+        default: goto __pyx_L5_argtuple_error; /* any positional count outside 0..5 is rejected */
+      }
+      kw_args = __Pyx_NumKwargs_VARARGS(__pyx_kwds);
+      switch (__pyx_nargs) { /* fill the slots not given positionally from keywords, entering at the first missing slot */
+        case  0: /* shape: required */
+        if (likely((values[0] = __Pyx_GetKwValue_VARARGS(__pyx_kwds, __pyx_kwvalues, __pyx_n_s_shape)) != 0)) {
+          (void)__Pyx_Arg_NewRef_VARARGS(values[0]);
+          kw_args--;
+        }
+        else if (unlikely(PyErr_Occurred())) __PYX_ERR(1, 131, __pyx_L3_error)
+        else goto __pyx_L5_argtuple_error;
+        CYTHON_FALLTHROUGH;
+        case  1: /* itemsize: required */
+        if (likely((values[1] = __Pyx_GetKwValue_VARARGS(__pyx_kwds, __pyx_kwvalues, __pyx_n_s_itemsize)) != 0)) {
+          (void)__Pyx_Arg_NewRef_VARARGS(values[1]);
+          kw_args--;
+        }
+        else if (unlikely(PyErr_Occurred())) __PYX_ERR(1, 131, __pyx_L3_error)
+        else {
+          __Pyx_RaiseArgtupleInvalid("__cinit__", 0, 3, 5, 1); __PYX_ERR(1, 131, __pyx_L3_error)
+        }
+        CYTHON_FALLTHROUGH;
+        case  2: /* format: required */
+        if (likely((values[2] = __Pyx_GetKwValue_VARARGS(__pyx_kwds, __pyx_kwvalues, __pyx_n_s_format)) != 0)) {
+          (void)__Pyx_Arg_NewRef_VARARGS(values[2]);
+          kw_args--;
+        }
+        else if (unlikely(PyErr_Occurred())) __PYX_ERR(1, 131, __pyx_L3_error)
+        else {
+          __Pyx_RaiseArgtupleInvalid("__cinit__", 0, 3, 5, 2); __PYX_ERR(1, 131, __pyx_L3_error)
+        }
+        CYTHON_FALLTHROUGH;
+        case  3: /* mode: optional, the pre-loaded default stays if the keyword is absent */
+        if (kw_args > 0) {
+          PyObject* value = __Pyx_GetKwValue_VARARGS(__pyx_kwds, __pyx_kwvalues, __pyx_n_s_mode);
+          if (value) { values[3] = __Pyx_Arg_NewRef_VARARGS(value); kw_args--; }
+          else if (unlikely(PyErr_Occurred())) __PYX_ERR(1, 131, __pyx_L3_error)
+        }
+        CYTHON_FALLTHROUGH;
+        case  4: /* allocate_buffer: optional, values[4] stays NULL if the keyword is absent */
+        if (kw_args > 0) {
+          PyObject* value = __Pyx_GetKwValue_VARARGS(__pyx_kwds, __pyx_kwvalues, __pyx_n_s_allocate_buffer);
+          if (value) { values[4] = __Pyx_Arg_NewRef_VARARGS(value); kw_args--; }
+          else if (unlikely(PyErr_Occurred())) __PYX_ERR(1, 131, __pyx_L3_error)
+        }
+      }
+      if (unlikely(kw_args > 0)) { /* keywords still unaccounted for go to __Pyx_ParseOptionalKeywords; NOTE(review): presumably that helper reports unexpected or duplicate names, confirm in its definition */
+        const Py_ssize_t kwd_pos_args = __pyx_nargs;
+        if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_kwvalues, __pyx_pyargnames, 0, values + 0, kwd_pos_args, "__cinit__") < 0)) __PYX_ERR(1, 131, __pyx_L3_error)
+      }
+    } else { /* no keyword dict: positional-only call, exactly 3, 4 or 5 arguments are accepted */
+      switch (__pyx_nargs) {
+        case  5: values[4] = __Pyx_Arg_VARARGS(__pyx_args, 4);
+        CYTHON_FALLTHROUGH;
+        case  4: values[3] = __Pyx_Arg_VARARGS(__pyx_args, 3);
+        CYTHON_FALLTHROUGH;
+        case  3: values[2] = __Pyx_Arg_VARARGS(__pyx_args, 2);
+        values[1] = __Pyx_Arg_VARARGS(__pyx_args, 1);
+        values[0] = __Pyx_Arg_VARARGS(__pyx_args, 0);
+        break;
+        default: goto __pyx_L5_argtuple_error;
+      }
+    }
+    __pyx_v_shape = ((PyObject*)values[0]);
+    __pyx_v_itemsize = __Pyx_PyIndex_AsSsize_t(values[1]); if (unlikely((__pyx_v_itemsize == (Py_ssize_t)-1) && PyErr_Occurred())) __PYX_ERR(1, 131, __pyx_L3_error)
+    __pyx_v_format = values[2];
+    __pyx_v_mode = values[3];
+    if (values[4]) { /* allocate_buffer was supplied: take its truth value */
+      __pyx_v_allocate_buffer = __Pyx_PyObject_IsTrue(values[4]); if (unlikely((__pyx_v_allocate_buffer == (int)-1) && PyErr_Occurred())) __PYX_ERR(1, 132, __pyx_L3_error)
+    } else {
+
+      /* "View.MemoryView":132
+ * 
+ *     def __cinit__(array self, tuple shape, Py_ssize_t itemsize, format not None,
+ *                   mode="c", bint allocate_buffer=True):             # <<<<<<<<<<<<<<
+ * 
+ *         cdef int idx
+ */
+      __pyx_v_allocate_buffer = ((int)1); /* default: allocate_buffer=True */
+    }
+  }
+  goto __pyx_L6_skip;
+  __pyx_L5_argtuple_error:; /* wrong number of positional arguments */
+  __Pyx_RaiseArgtupleInvalid("__cinit__", 0, 3, 5, __pyx_nargs); __PYX_ERR(1, 131, __pyx_L3_error)
+  __pyx_L6_skip:;
+  goto __pyx_L4_argument_unpacking_done;
+  __pyx_L3_error:; /* failure while unpacking: release values[], record a traceback entry, return -1 */
+  {
+    Py_ssize_t __pyx_temp;
+    for (__pyx_temp=0; __pyx_temp < (Py_ssize_t)(sizeof(values)/sizeof(values[0])); ++__pyx_temp) {
+      __Pyx_Arg_XDECREF_VARARGS(values[__pyx_temp]);
+    }
+  }
+  __Pyx_AddTraceback("View.MemoryView.array.__cinit__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __Pyx_RefNannyFinishContext();
+  return -1;
+  __pyx_L4_argument_unpacking_done:;
+  if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_shape), (&PyTuple_Type), 1, "shape", 1))) __PYX_ERR(1, 131, __pyx_L1_error) /* shape is declared `tuple` in the .pyx signature, so it is checked against PyTuple_Type */
+  if (unlikely(((PyObject *)__pyx_v_format) == Py_None)) { /* enforces `format not None` */
+    PyErr_Format(PyExc_TypeError, "Argument '%.200s' must not be None", "format"); __PYX_ERR(1, 131, __pyx_L1_error)
+  }
+  __pyx_r = __pyx_array___pyx_pf_15View_dot_MemoryView_5array___cinit__(((struct __pyx_array_obj *)__pyx_v_self), __pyx_v_shape, __pyx_v_itemsize, __pyx_v_format, __pyx_v_mode, __pyx_v_allocate_buffer);
+
+  /* "View.MemoryView":131
+ *         cdef bint dtype_is_object
+ * 
+ *     def __cinit__(array self, tuple shape, Py_ssize_t itemsize, format not None,             # <<<<<<<<<<<<<<
+ *                   mode="c", bint allocate_buffer=True):
+ * 
+ */
+
+  /* function exit code: the success path and __pyx_L1_error both fall into __pyx_L0, which releases values[] before returning */
+  goto __pyx_L0;
+  __pyx_L1_error:;
+  __pyx_r = -1;
+  __pyx_L0:;
+  {
+    Py_ssize_t __pyx_temp;
+    for (__pyx_temp=0; __pyx_temp < (Py_ssize_t)(sizeof(values)/sizeof(values[0])); ++__pyx_temp) {
+      __Pyx_Arg_XDECREF_VARARGS(values[__pyx_temp]);
+    }
+  }
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static int __pyx_array___pyx_pf_15View_dot_MemoryView_5array___cinit__(struct __pyx_array_obj *__pyx_v_self, PyObject *__pyx_v_shape, Py_ssize_t __pyx_v_itemsize, PyObject *__pyx_v_format, PyObject *__pyx_v_mode, int __pyx_v_allocate_buffer) { /* Implementation of array.__cinit__: validates shape/itemsize/format/mode, fills the ndim, itemsize, _format, format, _shape, _strides, mode, len, free_data and dtype_is_object fields of self, and optionally allocates the data buffer. Returns 0 on success, -1 after raising on failure. */
+  int __pyx_v_idx; /* axis index from enumerate(shape) */
+  Py_ssize_t __pyx_v_dim; /* extent of the current axis */
+  char __pyx_v_order; /* 'C' or 'F', chosen from mode */
+  int __pyx_r;
+  __Pyx_RefNannyDeclarations
+  Py_ssize_t __pyx_t_1;
+  int __pyx_t_2;
+  int __pyx_t_3;
+  PyObject *__pyx_t_4 = NULL;
+  PyObject *__pyx_t_5 = NULL;
+  PyObject *__pyx_t_6 = NULL;
+  unsigned int __pyx_t_7;
+  char *__pyx_t_8;
+  int __pyx_t_9;
+  Py_ssize_t __pyx_t_10;
+  Py_UCS4 __pyx_t_11;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("__cinit__", 0);
+  __Pyx_INCREF(__pyx_v_format); /* take a reference because format may be rebound to the encoded bytes below; released at __pyx_L0 */
+
+  /* "View.MemoryView":137
+ *         cdef Py_ssize_t dim
+ * 
+ *         self.ndim = <int> len(shape)             # <<<<<<<<<<<<<<
+ *         self.itemsize = itemsize
+ * 
+ */
+  if (unlikely(__pyx_v_shape == Py_None)) { /* len(None) must raise TypeError */
+    PyErr_SetString(PyExc_TypeError, "object of type 'NoneType' has no len()");
+    __PYX_ERR(1, 137, __pyx_L1_error)
+  }
+  __pyx_t_1 = __Pyx_PyTuple_GET_SIZE(__pyx_v_shape); if (unlikely(__pyx_t_1 == ((Py_ssize_t)-1))) __PYX_ERR(1, 137, __pyx_L1_error)
+  __pyx_v_self->ndim = ((int)__pyx_t_1); /* the tuple length is narrowed to int, matching the <int> cast in the .pyx source */
+
+  /* "View.MemoryView":138
+ * 
+ *         self.ndim = <int> len(shape)
+ *         self.itemsize = itemsize             # <<<<<<<<<<<<<<
+ * 
+ *         if not self.ndim:
+ */
+  __pyx_v_self->itemsize = __pyx_v_itemsize;
+
+  /* "View.MemoryView":140
+ *         self.itemsize = itemsize
+ * 
+ *         if not self.ndim:             # <<<<<<<<<<<<<<
+ *             raise ValueError, "Empty shape tuple for cython.array"
+ * 
+ */
+  __pyx_t_2 = (!(__pyx_v_self->ndim != 0));
+  if (unlikely(__pyx_t_2)) {
+
+    /* "View.MemoryView":141
+ * 
+ *         if not self.ndim:
+ *             raise ValueError, "Empty shape tuple for cython.array"             # <<<<<<<<<<<<<<
+ * 
+ *         if itemsize <= 0:
+ */
+    __Pyx_Raise(__pyx_builtin_ValueError, __pyx_kp_s_Empty_shape_tuple_for_cython_arr, 0, 0);
+    __PYX_ERR(1, 141, __pyx_L1_error)
+
+    /* "View.MemoryView":140
+ *         self.itemsize = itemsize
+ * 
+ *         if not self.ndim:             # <<<<<<<<<<<<<<
+ *             raise ValueError, "Empty shape tuple for cython.array"
+ * 
+ */
+  }
+
+  /* "View.MemoryView":143
+ *             raise ValueError, "Empty shape tuple for cython.array"
+ * 
+ *         if itemsize <= 0:             # <<<<<<<<<<<<<<
+ *             raise ValueError, "itemsize <= 0 for cython.array"
+ * 
+ */
+  __pyx_t_2 = (__pyx_v_itemsize <= 0);
+  if (unlikely(__pyx_t_2)) {
+
+    /* "View.MemoryView":144
+ * 
+ *         if itemsize <= 0:
+ *             raise ValueError, "itemsize <= 0 for cython.array"             # <<<<<<<<<<<<<<
+ * 
+ *         if not isinstance(format, bytes):
+ */
+    __Pyx_Raise(__pyx_builtin_ValueError, __pyx_kp_s_itemsize_0_for_cython_array, 0, 0);
+    __PYX_ERR(1, 144, __pyx_L1_error)
+
+    /* "View.MemoryView":143
+ *             raise ValueError, "Empty shape tuple for cython.array"
+ * 
+ *         if itemsize <= 0:             # <<<<<<<<<<<<<<
+ *             raise ValueError, "itemsize <= 0 for cython.array"
+ * 
+ */
+  }
+
+  /* "View.MemoryView":146
+ *             raise ValueError, "itemsize <= 0 for cython.array"
+ * 
+ *         if not isinstance(format, bytes):             # <<<<<<<<<<<<<<
+ *             format = format.encode('ASCII')
+ *         self._format = format  # keep a reference to the byte string
+ */
+  __pyx_t_2 = PyBytes_Check(__pyx_v_format); 
+  __pyx_t_3 = (!__pyx_t_2);
+  if (__pyx_t_3) {
+
+    /* "View.MemoryView":147
+ * 
+ *         if not isinstance(format, bytes):
+ *             format = format.encode('ASCII')             # <<<<<<<<<<<<<<
+ *         self._format = format  # keep a reference to the byte string
+ *         self.format = self._format
+ */
+    __pyx_t_5 = __Pyx_PyObject_GetAttrStr(__pyx_v_format, __pyx_n_s_encode); if (unlikely(!__pyx_t_5)) __PYX_ERR(1, 147, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_5);
+    __pyx_t_6 = NULL;
+    __pyx_t_7 = 0;
+    #if CYTHON_UNPACK_METHODS
+    if (likely(PyMethod_Check(__pyx_t_5))) { /* bound method: split it into its self object (t_6) and underlying function (t_5) so self can be passed as an explicit first argument */
+      __pyx_t_6 = PyMethod_GET_SELF(__pyx_t_5);
+      if (likely(__pyx_t_6)) {
+        PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_5);
+        __Pyx_INCREF(__pyx_t_6);
+        __Pyx_INCREF(function);
+        __Pyx_DECREF_SET(__pyx_t_5, function);
+        __pyx_t_7 = 1;
+      }
+    }
+    #endif
+    {
+      PyObject *__pyx_callargs[2] = {__pyx_t_6, __pyx_n_s_ASCII};
+      __pyx_t_4 = __Pyx_PyObject_FastCall(__pyx_t_5, __pyx_callargs+1-__pyx_t_7, 1+__pyx_t_7); /* t_7 is 1 only when the method was unpacked; otherwise the leading self slot is skipped and one argument is passed */
+      __Pyx_XDECREF(__pyx_t_6); __pyx_t_6 = 0;
+      if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 147, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_4);
+      __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
+    }
+    __Pyx_DECREF_SET(__pyx_v_format, __pyx_t_4); /* rebind format to the encode() result, dropping the previous reference */
+    __pyx_t_4 = 0;
+
+    /* "View.MemoryView":146
+ *             raise ValueError, "itemsize <= 0 for cython.array"
+ * 
+ *         if not isinstance(format, bytes):             # <<<<<<<<<<<<<<
+ *             format = format.encode('ASCII')
+ *         self._format = format  # keep a reference to the byte string
+ */
+  }
+
+  /* "View.MemoryView":148
+ *         if not isinstance(format, bytes):
+ *             format = format.encode('ASCII')
+ *         self._format = format  # keep a reference to the byte string             # <<<<<<<<<<<<<<
+ *         self.format = self._format
+ * 
+ */
+  if (!(likely(PyBytes_CheckExact(__pyx_v_format))||((__pyx_v_format) == Py_None) || __Pyx_RaiseUnexpectedTypeError("bytes", __pyx_v_format))) __PYX_ERR(1, 148, __pyx_L1_error) /* only an exact bytes object or None may be stored in _format */
+  __pyx_t_4 = __pyx_v_format;
+  __Pyx_INCREF(__pyx_t_4);
+  __Pyx_GIVEREF(__pyx_t_4);
+  __Pyx_GOTREF(__pyx_v_self->_format);
+  __Pyx_DECREF(__pyx_v_self->_format); /* drop the previous _format value before storing the new reference */
+  __pyx_v_self->_format = ((PyObject*)__pyx_t_4);
+  __pyx_t_4 = 0;
+
+  /* "View.MemoryView":149
+ *             format = format.encode('ASCII')
+ *         self._format = format  # keep a reference to the byte string
+ *         self.format = self._format             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  if (unlikely(__pyx_v_self->_format == Py_None)) {
+    PyErr_SetString(PyExc_TypeError, "expected bytes, NoneType found");
+    __PYX_ERR(1, 149, __pyx_L1_error)
+  }
+  __pyx_t_8 = __Pyx_PyBytes_AsWritableString(__pyx_v_self->_format); if (unlikely((!__pyx_t_8) && PyErr_Occurred())) __PYX_ERR(1, 149, __pyx_L1_error)
+  __pyx_v_self->format = __pyx_t_8; /* char* obtained from _format; the .pyx comment says _format is held to keep the byte string referenced */
+
+  /* "View.MemoryView":152
+ * 
+ * 
+ *         self._shape = <Py_ssize_t *> PyObject_Malloc(sizeof(Py_ssize_t)*self.ndim*2)             # <<<<<<<<<<<<<<
+ *         self._strides = self._shape + self.ndim
+ * 
+ */
+  __pyx_v_self->_shape = ((Py_ssize_t *)PyObject_Malloc((((sizeof(Py_ssize_t)) * __pyx_v_self->ndim) * 2))); /* a single allocation of 2*ndim entries holds both shape and strides */
+
+  /* "View.MemoryView":153
+ * 
+ *         self._shape = <Py_ssize_t *> PyObject_Malloc(sizeof(Py_ssize_t)*self.ndim*2)
+ *         self._strides = self._shape + self.ndim             # <<<<<<<<<<<<<<
+ * 
+ *         if not self._shape:
+ */
+  __pyx_v_self->_strides = (__pyx_v_self->_shape + __pyx_v_self->ndim); /* strides start right after the ndim shape entries; NOTE(review): computed before the NULL check below, in the same order as the .pyx source */
+
+  /* "View.MemoryView":155
+ *         self._strides = self._shape + self.ndim
+ * 
+ *         if not self._shape:             # <<<<<<<<<<<<<<
+ *             raise MemoryError, "unable to allocate shape and strides."
+ * 
+ */
+  __pyx_t_3 = (!(__pyx_v_self->_shape != 0));
+  if (unlikely(__pyx_t_3)) {
+
+    /* "View.MemoryView":156
+ * 
+ *         if not self._shape:
+ *             raise MemoryError, "unable to allocate shape and strides."             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+    __Pyx_Raise(__pyx_builtin_MemoryError, __pyx_kp_s_unable_to_allocate_shape_and_str, 0, 0);
+    __PYX_ERR(1, 156, __pyx_L1_error)
+
+    /* "View.MemoryView":155
+ *         self._strides = self._shape + self.ndim
+ * 
+ *         if not self._shape:             # <<<<<<<<<<<<<<
+ *             raise MemoryError, "unable to allocate shape and strides."
+ * 
+ */
+  }
+
+  /* "View.MemoryView":159
+ * 
+ * 
+ *         for idx, dim in enumerate(shape):             # <<<<<<<<<<<<<<
+ *             if dim <= 0:
+ *                 raise ValueError, f"Invalid shape in axis {idx}: {dim}."
+ */
+  __pyx_t_9 = 0; /* enumerate() counter */
+  __pyx_t_4 = __pyx_v_shape; __Pyx_INCREF(__pyx_t_4); /* reference to shape held for the duration of the loop */
+  __pyx_t_1 = 0; /* tuple position */
+  for (;;) {
+    {
+      Py_ssize_t __pyx_temp = __Pyx_PyTuple_GET_SIZE(__pyx_t_4);
+      #if !CYTHON_ASSUME_SAFE_MACROS
+      if (unlikely((__pyx_temp < 0))) __PYX_ERR(1, 159, __pyx_L1_error)
+      #endif
+      if (__pyx_t_1 >= __pyx_temp) break;
+    }
+    #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS
+    __pyx_t_5 = PyTuple_GET_ITEM(__pyx_t_4, __pyx_t_1); __Pyx_INCREF(__pyx_t_5); __pyx_t_1++; if (unlikely((0 < 0))) __PYX_ERR(1, 159, __pyx_L1_error)
+    #else
+    __pyx_t_5 = __Pyx_PySequence_ITEM(__pyx_t_4, __pyx_t_1); __pyx_t_1++; if (unlikely(!__pyx_t_5)) __PYX_ERR(1, 159, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_5);
+    #endif
+    __pyx_t_10 = __Pyx_PyIndex_AsSsize_t(__pyx_t_5); if (unlikely((__pyx_t_10 == (Py_ssize_t)-1) && PyErr_Occurred())) __PYX_ERR(1, 159, __pyx_L1_error)
+    __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
+    __pyx_v_dim = __pyx_t_10;
+    __pyx_v_idx = __pyx_t_9;
+    __pyx_t_9 = (__pyx_t_9 + 1);
+
+    /* "View.MemoryView":160
+ * 
+ *         for idx, dim in enumerate(shape):
+ *             if dim <= 0:             # <<<<<<<<<<<<<<
+ *                 raise ValueError, f"Invalid shape in axis {idx}: {dim}."
+ *             self._shape[idx] = dim
+ */
+    __pyx_t_3 = (__pyx_v_dim <= 0);
+    if (unlikely(__pyx_t_3)) { /* NOTE(review): this and the later error exits do not free self->_shape inside this function; presumably the type's deallocator releases it, confirm */
+
+      /* "View.MemoryView":161
+ *         for idx, dim in enumerate(shape):
+ *             if dim <= 0:
+ *                 raise ValueError, f"Invalid shape in axis {idx}: {dim}."             # <<<<<<<<<<<<<<
+ *             self._shape[idx] = dim
+ * 
+ */
+      __pyx_t_5 = PyTuple_New(5); if (unlikely(!__pyx_t_5)) __PYX_ERR(1, 161, __pyx_L1_error) /* the f-string is assembled from 5 pieces: literal, idx, literal, dim, literal */
+      __Pyx_GOTREF(__pyx_t_5);
+      __pyx_t_10 = 0; /* running total of the result length */
+      __pyx_t_11 = 127; /* NOTE(review): passed as the last argument of __Pyx_PyUnicode_Join; presumably the maximum code point of the result, confirm in the helper */
+      __Pyx_INCREF(__pyx_kp_u_Invalid_shape_in_axis);
+      __pyx_t_10 += 22; /* length of the leading literal piece */
+      __Pyx_GIVEREF(__pyx_kp_u_Invalid_shape_in_axis);
+      PyTuple_SET_ITEM(__pyx_t_5, 0, __pyx_kp_u_Invalid_shape_in_axis);
+      __pyx_t_6 = __Pyx_PyUnicode_From_int(__pyx_v_idx, 0, ' ', 'd'); if (unlikely(!__pyx_t_6)) __PYX_ERR(1, 161, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_6);
+      __pyx_t_10 += __Pyx_PyUnicode_GET_LENGTH(__pyx_t_6);
+      __Pyx_GIVEREF(__pyx_t_6);
+      PyTuple_SET_ITEM(__pyx_t_5, 1, __pyx_t_6);
+      __pyx_t_6 = 0;
+      __Pyx_INCREF(__pyx_kp_u_);
+      __pyx_t_10 += 2; /* length of the middle literal piece */
+      __Pyx_GIVEREF(__pyx_kp_u_);
+      PyTuple_SET_ITEM(__pyx_t_5, 2, __pyx_kp_u_);
+      __pyx_t_6 = __Pyx_PyUnicode_From_Py_ssize_t(__pyx_v_dim, 0, ' ', 'd'); if (unlikely(!__pyx_t_6)) __PYX_ERR(1, 161, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_6);
+      __pyx_t_10 += __Pyx_PyUnicode_GET_LENGTH(__pyx_t_6);
+      __Pyx_GIVEREF(__pyx_t_6);
+      PyTuple_SET_ITEM(__pyx_t_5, 3, __pyx_t_6);
+      __pyx_t_6 = 0;
+      __Pyx_INCREF(__pyx_kp_u__2);
+      __pyx_t_10 += 1; /* length of the trailing literal piece */
+      __Pyx_GIVEREF(__pyx_kp_u__2);
+      PyTuple_SET_ITEM(__pyx_t_5, 4, __pyx_kp_u__2);
+      __pyx_t_6 = __Pyx_PyUnicode_Join(__pyx_t_5, 5, __pyx_t_10, __pyx_t_11); if (unlikely(!__pyx_t_6)) __PYX_ERR(1, 161, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_6);
+      __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
+      __Pyx_Raise(__pyx_builtin_ValueError, __pyx_t_6, 0, 0);
+      __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0;
+      __PYX_ERR(1, 161, __pyx_L1_error)
+
+      /* "View.MemoryView":160
+ * 
+ *         for idx, dim in enumerate(shape):
+ *             if dim <= 0:             # <<<<<<<<<<<<<<
+ *                 raise ValueError, f"Invalid shape in axis {idx}: {dim}."
+ *             self._shape[idx] = dim
+ */
+    }
+
+    /* "View.MemoryView":162
+ *             if dim <= 0:
+ *                 raise ValueError, f"Invalid shape in axis {idx}: {dim}."
+ *             self._shape[idx] = dim             # <<<<<<<<<<<<<<
+ * 
+ *         cdef char order
+ */
+    (__pyx_v_self->_shape[__pyx_v_idx]) = __pyx_v_dim;
+
+    /* "View.MemoryView":159
+ * 
+ * 
+ *         for idx, dim in enumerate(shape):             # <<<<<<<<<<<<<<
+ *             if dim <= 0:
+ *                 raise ValueError, f"Invalid shape in axis {idx}: {dim}."
+ */
+  }
+  __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; /* release the loop's reference to shape */
+
+  /* "View.MemoryView":165
+ * 
+ *         cdef char order
+ *         if mode == 'c':             # <<<<<<<<<<<<<<
+ *             order = b'C'
+ *             self.mode = u'c'
+ */
+  __pyx_t_3 = (__Pyx_PyString_Equals(__pyx_v_mode, __pyx_n_s_c, Py_EQ)); if (unlikely((__pyx_t_3 < 0))) __PYX_ERR(1, 165, __pyx_L1_error)
+  if (__pyx_t_3) {
+
+    /* "View.MemoryView":166
+ *         cdef char order
+ *         if mode == 'c':
+ *             order = b'C'             # <<<<<<<<<<<<<<
+ *             self.mode = u'c'
+ *         elif mode == 'fortran':
+ */
+    __pyx_v_order = 'C';
+
+    /* "View.MemoryView":167
+ *         if mode == 'c':
+ *             order = b'C'
+ *             self.mode = u'c'             # <<<<<<<<<<<<<<
+ *         elif mode == 'fortran':
+ *             order = b'F'
+ */
+    __Pyx_INCREF(__pyx_n_u_c);
+    __Pyx_GIVEREF(__pyx_n_u_c);
+    __Pyx_GOTREF(__pyx_v_self->mode);
+    __Pyx_DECREF(__pyx_v_self->mode);
+    __pyx_v_self->mode = __pyx_n_u_c;
+
+    /* "View.MemoryView":165
+ * 
+ *         cdef char order
+ *         if mode == 'c':             # <<<<<<<<<<<<<<
+ *             order = b'C'
+ *             self.mode = u'c'
+ */
+    goto __pyx_L11;
+  }
+
+  /* "View.MemoryView":168
+ *             order = b'C'
+ *             self.mode = u'c'
+ *         elif mode == 'fortran':             # <<<<<<<<<<<<<<
+ *             order = b'F'
+ *             self.mode = u'fortran'
+ */
+  __pyx_t_3 = (__Pyx_PyString_Equals(__pyx_v_mode, __pyx_n_s_fortran, Py_EQ)); if (unlikely((__pyx_t_3 < 0))) __PYX_ERR(1, 168, __pyx_L1_error)
+  if (likely(__pyx_t_3)) {
+
+    /* "View.MemoryView":169
+ *             self.mode = u'c'
+ *         elif mode == 'fortran':
+ *             order = b'F'             # <<<<<<<<<<<<<<
+ *             self.mode = u'fortran'
+ *         else:
+ */
+    __pyx_v_order = 'F';
+
+    /* "View.MemoryView":170
+ *         elif mode == 'fortran':
+ *             order = b'F'
+ *             self.mode = u'fortran'             # <<<<<<<<<<<<<<
+ *         else:
+ *             raise ValueError, f"Invalid mode, expected 'c' or 'fortran', got {mode}"
+ */
+    __Pyx_INCREF(__pyx_n_u_fortran);
+    __Pyx_GIVEREF(__pyx_n_u_fortran);
+    __Pyx_GOTREF(__pyx_v_self->mode);
+    __Pyx_DECREF(__pyx_v_self->mode);
+    __pyx_v_self->mode = __pyx_n_u_fortran;
+
+    /* "View.MemoryView":168
+ *             order = b'C'
+ *             self.mode = u'c'
+ *         elif mode == 'fortran':             # <<<<<<<<<<<<<<
+ *             order = b'F'
+ *             self.mode = u'fortran'
+ */
+    goto __pyx_L11;
+  }
+
+  /* "View.MemoryView":172
+ *             self.mode = u'fortran'
+ *         else:
+ *             raise ValueError, f"Invalid mode, expected 'c' or 'fortran', got {mode}"             # <<<<<<<<<<<<<<
+ * 
+ *         self.len = fill_contig_strides_array(self._shape, self._strides, itemsize, self.ndim, order)
+ */
+  /*else: mode matched neither 'c' nor 'fortran', so this branch always raises */ {
+    __pyx_t_4 = __Pyx_PyObject_FormatSimple(__pyx_v_mode, __pyx_empty_unicode); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 172, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_4);
+    __pyx_t_6 = __Pyx_PyUnicode_Concat(__pyx_kp_u_Invalid_mode_expected_c_or_fortr, __pyx_t_4); if (unlikely(!__pyx_t_6)) __PYX_ERR(1, 172, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_6);
+    __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+    __Pyx_Raise(__pyx_builtin_ValueError, __pyx_t_6, 0, 0);
+    __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0;
+    __PYX_ERR(1, 172, __pyx_L1_error)
+  }
+  __pyx_L11:;
+
+  /* "View.MemoryView":174
+ *             raise ValueError, f"Invalid mode, expected 'c' or 'fortran', got {mode}"
+ * 
+ *         self.len = fill_contig_strides_array(self._shape, self._strides, itemsize, self.ndim, order)             # <<<<<<<<<<<<<<
+ * 
+ *         self.free_data = allocate_buffer
+ */
+  __pyx_v_self->len = __pyx_fill_contig_strides_array(__pyx_v_self->_shape, __pyx_v_self->_strides, __pyx_v_itemsize, __pyx_v_self->ndim, __pyx_v_order); /* NOTE(review): presumably writes the strides for the chosen order and returns the total byte length; confirm in the helper */
+
+  /* "View.MemoryView":176
+ *         self.len = fill_contig_strides_array(self._shape, self._strides, itemsize, self.ndim, order)
+ * 
+ *         self.free_data = allocate_buffer             # <<<<<<<<<<<<<<
+ *         self.dtype_is_object = format == b'O'
+ * 
+ */
+  __pyx_v_self->free_data = __pyx_v_allocate_buffer;
+
+  /* "View.MemoryView":177
+ * 
+ *         self.free_data = allocate_buffer
+ *         self.dtype_is_object = format == b'O'             # <<<<<<<<<<<<<<
+ * 
+ *         if allocate_buffer:
+ */
+  __pyx_t_6 = PyObject_RichCompare(__pyx_v_format, __pyx_n_b_O, Py_EQ); __Pyx_XGOTREF(__pyx_t_6); if (unlikely(!__pyx_t_6)) __PYX_ERR(1, 177, __pyx_L1_error)
+  __pyx_t_3 = __Pyx_PyObject_IsTrue(__pyx_t_6); if (unlikely((__pyx_t_3 == (int)-1) && PyErr_Occurred())) __PYX_ERR(1, 177, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0;
+  __pyx_v_self->dtype_is_object = __pyx_t_3;
+
+  /* "View.MemoryView":179
+ *         self.dtype_is_object = format == b'O'
+ * 
+ *         if allocate_buffer:             # <<<<<<<<<<<<<<
+ *             _allocate_buffer(self)
+ * 
+ */
+  if (__pyx_v_allocate_buffer) {
+
+    /* "View.MemoryView":180
+ * 
+ *         if allocate_buffer:
+ *             _allocate_buffer(self)             # <<<<<<<<<<<<<<
+ * 
+ *     @cname('getbuffer')
+ */
+    __pyx_t_9 = __pyx_array_allocate_buffer(__pyx_v_self); if (unlikely(__pyx_t_9 == ((int)-1))) __PYX_ERR(1, 180, __pyx_L1_error) /* a return value of -1 is treated as failure */
+
+    /* "View.MemoryView":179
+ *         self.dtype_is_object = format == b'O'
+ * 
+ *         if allocate_buffer:             # <<<<<<<<<<<<<<
+ *             _allocate_buffer(self)
+ * 
+ */
+  }
+
+  /* "View.MemoryView":131
+ *         cdef bint dtype_is_object
+ * 
+ *     def __cinit__(array self, tuple shape, Py_ssize_t itemsize, format not None,             # <<<<<<<<<<<<<<
+ *                   mode="c", bint allocate_buffer=True):
+ * 
+ */
+
+  /* function exit code: success returns 0; the error label drops any live temporaries, records a traceback entry and returns -1 */
+  __pyx_r = 0;
+  goto __pyx_L0;
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_4);
+  __Pyx_XDECREF(__pyx_t_5);
+  __Pyx_XDECREF(__pyx_t_6);
+  __Pyx_AddTraceback("View.MemoryView.array.__cinit__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = -1;
+  __pyx_L0:;
+  __Pyx_XDECREF(__pyx_v_format); /* balances the INCREF taken on entry (or releases the encoded bytes that replaced it) */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":182
+ *             _allocate_buffer(self)
+ * 
+ *     @cname('getbuffer')             # <<<<<<<<<<<<<<
+ *     def __getbuffer__(self, Py_buffer *info, int flags):
+ *         cdef int bufmode = -1
+ */
+
+/* Python wrapper for array.__getbuffer__: casts self to struct __pyx_array_obj * and forwards info and flags unchanged to the implementation function. */
+CYTHON_UNUSED static int __pyx_array_getbuffer(PyObject *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags); /*proto*/
+CYTHON_UNUSED static int __pyx_array_getbuffer(PyObject *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags) {
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  int __pyx_r;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__getbuffer__ (wrapper)", 0);
+  __pyx_kwvalues = __Pyx_KwValues_VARARGS(__pyx_args, __pyx_nargs); /* NOTE(review): __pyx_args/__pyx_nargs are not declared in this function; presumably __Pyx_KwValues_VARARGS is a macro that discards its arguments - confirm against its definition */
+  __pyx_r = __pyx_array___pyx_pf_15View_dot_MemoryView_5array_2__getbuffer__(((struct __pyx_array_obj *)__pyx_v_self), ((Py_buffer *)__pyx_v_info), ((int)__pyx_v_flags));
+
+  /* function exit code: the implementation's int result is returned unchanged */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static int __pyx_array___pyx_pf_15View_dot_MemoryView_5array_2__getbuffer__(struct __pyx_array_obj *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags) { /* Fills *info from the array's fields according to flags; returns 0 on success, -1 with an exception set on failure. */
+  int __pyx_v_bufmode;
+  int __pyx_r;
+  __Pyx_RefNannyDeclarations
+  int __pyx_t_1;
+  char *__pyx_t_2;
+  Py_ssize_t __pyx_t_3;
+  int __pyx_t_4;
+  Py_ssize_t *__pyx_t_5;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  if (unlikely(__pyx_v_info == NULL)) { /* a NULL view is rejected with BufferError before the RefNanny context is set up */
+    PyErr_SetString(PyExc_BufferError, "PyObject_GetBuffer: view==NULL argument is obsolete");
+    return -1;
+  }
+  __Pyx_RefNannySetupContext("__getbuffer__", 0);
+  __pyx_v_info->obj = Py_None; __Pyx_INCREF(Py_None); /* placeholder owner: replaced by self near the end of the function, released by the exit code otherwise */
+  __Pyx_GIVEREF(__pyx_v_info->obj);
+
+  /* "View.MemoryView":184
+ *     @cname('getbuffer')
+ *     def __getbuffer__(self, Py_buffer *info, int flags):
+ *         cdef int bufmode = -1             # <<<<<<<<<<<<<<
+ *         if flags & (PyBUF_C_CONTIGUOUS | PyBUF_F_CONTIGUOUS | PyBUF_ANY_CONTIGUOUS):
+ *             if self.mode == u"c":
+ */
+  __pyx_v_bufmode = -1;
+
+  /* "View.MemoryView":185
+ *     def __getbuffer__(self, Py_buffer *info, int flags):
+ *         cdef int bufmode = -1
+ *         if flags & (PyBUF_C_CONTIGUOUS | PyBUF_F_CONTIGUOUS | PyBUF_ANY_CONTIGUOUS):             # <<<<<<<<<<<<<<
+ *             if self.mode == u"c":
+ *                 bufmode = PyBUF_C_CONTIGUOUS | PyBUF_ANY_CONTIGUOUS
+ */
+  __pyx_t_1 = ((__pyx_v_flags & ((PyBUF_C_CONTIGUOUS | PyBUF_F_CONTIGUOUS) | PyBUF_ANY_CONTIGUOUS)) != 0);
+  if (__pyx_t_1) {
+
+    /* "View.MemoryView":186
+ *         cdef int bufmode = -1
+ *         if flags & (PyBUF_C_CONTIGUOUS | PyBUF_F_CONTIGUOUS | PyBUF_ANY_CONTIGUOUS):
+ *             if self.mode == u"c":             # <<<<<<<<<<<<<<
+ *                 bufmode = PyBUF_C_CONTIGUOUS | PyBUF_ANY_CONTIGUOUS
+ *             elif self.mode == u"fortran":
+ */
+    __pyx_t_1 = (__Pyx_PyUnicode_Equals(__pyx_v_self->mode, __pyx_n_u_c, Py_EQ)); if (unlikely((__pyx_t_1 < 0))) __PYX_ERR(1, 186, __pyx_L1_error)
+    if (__pyx_t_1) {
+
+      /* "View.MemoryView":187
+ *         if flags & (PyBUF_C_CONTIGUOUS | PyBUF_F_CONTIGUOUS | PyBUF_ANY_CONTIGUOUS):
+ *             if self.mode == u"c":
+ *                 bufmode = PyBUF_C_CONTIGUOUS | PyBUF_ANY_CONTIGUOUS             # <<<<<<<<<<<<<<
+ *             elif self.mode == u"fortran":
+ *                 bufmode = PyBUF_F_CONTIGUOUS | PyBUF_ANY_CONTIGUOUS
+ */
+      __pyx_v_bufmode = (PyBUF_C_CONTIGUOUS | PyBUF_ANY_CONTIGUOUS);
+
+      /* "View.MemoryView":186
+ *         cdef int bufmode = -1
+ *         if flags & (PyBUF_C_CONTIGUOUS | PyBUF_F_CONTIGUOUS | PyBUF_ANY_CONTIGUOUS):
+ *             if self.mode == u"c":             # <<<<<<<<<<<<<<
+ *                 bufmode = PyBUF_C_CONTIGUOUS | PyBUF_ANY_CONTIGUOUS
+ *             elif self.mode == u"fortran":
+ */
+      goto __pyx_L4;
+    }
+
+    /* "View.MemoryView":188
+ *             if self.mode == u"c":
+ *                 bufmode = PyBUF_C_CONTIGUOUS | PyBUF_ANY_CONTIGUOUS
+ *             elif self.mode == u"fortran":             # <<<<<<<<<<<<<<
+ *                 bufmode = PyBUF_F_CONTIGUOUS | PyBUF_ANY_CONTIGUOUS
+ *             if not (flags & bufmode):
+ */
+    __pyx_t_1 = (__Pyx_PyUnicode_Equals(__pyx_v_self->mode, __pyx_n_u_fortran, Py_EQ)); if (unlikely((__pyx_t_1 < 0))) __PYX_ERR(1, 188, __pyx_L1_error)
+    if (__pyx_t_1) {
+
+      /* "View.MemoryView":189
+ *                 bufmode = PyBUF_C_CONTIGUOUS | PyBUF_ANY_CONTIGUOUS
+ *             elif self.mode == u"fortran":
+ *                 bufmode = PyBUF_F_CONTIGUOUS | PyBUF_ANY_CONTIGUOUS             # <<<<<<<<<<<<<<
+ *             if not (flags & bufmode):
+ *                 raise ValueError, "Can only create a buffer that is contiguous in memory."
+ */
+      __pyx_v_bufmode = (PyBUF_F_CONTIGUOUS | PyBUF_ANY_CONTIGUOUS);
+
+      /* "View.MemoryView":188
+ *             if self.mode == u"c":
+ *                 bufmode = PyBUF_C_CONTIGUOUS | PyBUF_ANY_CONTIGUOUS
+ *             elif self.mode == u"fortran":             # <<<<<<<<<<<<<<
+ *                 bufmode = PyBUF_F_CONTIGUOUS | PyBUF_ANY_CONTIGUOUS
+ *             if not (flags & bufmode):
+ */
+    }
+    __pyx_L4:;
+
+    /* "View.MemoryView":190
+ *             elif self.mode == u"fortran":
+ *                 bufmode = PyBUF_F_CONTIGUOUS | PyBUF_ANY_CONTIGUOUS
+ *             if not (flags & bufmode):             # <<<<<<<<<<<<<<
+ *                 raise ValueError, "Can only create a buffer that is contiguous in memory."
+ *         info.buf = self.data
+ */
+    __pyx_t_1 = (!((__pyx_v_flags & __pyx_v_bufmode) != 0)); /* if mode matched neither "c" nor "fortran", bufmode is still -1 and flags is nonzero in this branch, so the ValueError below is not raised */
+    if (unlikely(__pyx_t_1)) {
+
+      /* "View.MemoryView":191
+ *                 bufmode = PyBUF_F_CONTIGUOUS | PyBUF_ANY_CONTIGUOUS
+ *             if not (flags & bufmode):
+ *                 raise ValueError, "Can only create a buffer that is contiguous in memory."             # <<<<<<<<<<<<<<
+ *         info.buf = self.data
+ *         info.len = self.len
+ */
+      __Pyx_Raise(__pyx_builtin_ValueError, __pyx_kp_s_Can_only_create_a_buffer_that_is, 0, 0);
+      __PYX_ERR(1, 191, __pyx_L1_error)
+
+      /* "View.MemoryView":190
+ *             elif self.mode == u"fortran":
+ *                 bufmode = PyBUF_F_CONTIGUOUS | PyBUF_ANY_CONTIGUOUS
+ *             if not (flags & bufmode):             # <<<<<<<<<<<<<<
+ *                 raise ValueError, "Can only create a buffer that is contiguous in memory."
+ *         info.buf = self.data
+ */
+    }
+
+    /* "View.MemoryView":185
+ *     def __getbuffer__(self, Py_buffer *info, int flags):
+ *         cdef int bufmode = -1
+ *         if flags & (PyBUF_C_CONTIGUOUS | PyBUF_F_CONTIGUOUS | PyBUF_ANY_CONTIGUOUS):             # <<<<<<<<<<<<<<
+ *             if self.mode == u"c":
+ *                 bufmode = PyBUF_C_CONTIGUOUS | PyBUF_ANY_CONTIGUOUS
+ */
+  }
+
+  /* "View.MemoryView":192
+ *             if not (flags & bufmode):
+ *                 raise ValueError, "Can only create a buffer that is contiguous in memory."
+ *         info.buf = self.data             # <<<<<<<<<<<<<<
+ *         info.len = self.len
+ * 
+ */
+  __pyx_t_2 = __pyx_v_self->data;
+  __pyx_v_info->buf = __pyx_t_2;
+
+  /* "View.MemoryView":193
+ *                 raise ValueError, "Can only create a buffer that is contiguous in memory."
+ *         info.buf = self.data
+ *         info.len = self.len             # <<<<<<<<<<<<<<
+ * 
+ *         if flags & PyBUF_STRIDES:
+ */
+  __pyx_t_3 = __pyx_v_self->len;
+  __pyx_v_info->len = __pyx_t_3;
+
+  /* "View.MemoryView":195
+ *         info.len = self.len
+ * 
+ *         if flags & PyBUF_STRIDES:             # <<<<<<<<<<<<<<
+ *             info.ndim = self.ndim
+ *             info.shape = self._shape
+ */
+  __pyx_t_1 = ((__pyx_v_flags & PyBUF_STRIDES) != 0);
+  if (__pyx_t_1) {
+
+    /* "View.MemoryView":196
+ * 
+ *         if flags & PyBUF_STRIDES:
+ *             info.ndim = self.ndim             # <<<<<<<<<<<<<<
+ *             info.shape = self._shape
+ *             info.strides = self._strides
+ */
+    __pyx_t_4 = __pyx_v_self->ndim;
+    __pyx_v_info->ndim = __pyx_t_4;
+
+    /* "View.MemoryView":197
+ *         if flags & PyBUF_STRIDES:
+ *             info.ndim = self.ndim
+ *             info.shape = self._shape             # <<<<<<<<<<<<<<
+ *             info.strides = self._strides
+ *         else:
+ */
+    __pyx_t_5 = __pyx_v_self->_shape;
+    __pyx_v_info->shape = __pyx_t_5;
+
+    /* "View.MemoryView":198
+ *             info.ndim = self.ndim
+ *             info.shape = self._shape
+ *             info.strides = self._strides             # <<<<<<<<<<<<<<
+ *         else:
+ *             info.ndim = 1
+ */
+    __pyx_t_5 = __pyx_v_self->_strides;
+    __pyx_v_info->strides = __pyx_t_5;
+
+    /* "View.MemoryView":195
+ *         info.len = self.len
+ * 
+ *         if flags & PyBUF_STRIDES:             # <<<<<<<<<<<<<<
+ *             info.ndim = self.ndim
+ *             info.shape = self._shape
+ */
+    goto __pyx_L6;
+  }
+
+  /* "View.MemoryView":200
+ *             info.strides = self._strides
+ *         else:
+ *             info.ndim = 1             # <<<<<<<<<<<<<<
+ *             info.shape = &self.len if flags & PyBUF_ND else NULL
+ *             info.strides = NULL
+ */
+  /*else*/ {
+    __pyx_v_info->ndim = 1; /* without PyBUF_STRIDES the buffer is reported as one-dimensional */
+
+    /* "View.MemoryView":201
+ *         else:
+ *             info.ndim = 1
+ *             info.shape = &self.len if flags & PyBUF_ND else NULL             # <<<<<<<<<<<<<<
+ *             info.strides = NULL
+ * 
+ */
+    __pyx_t_1 = ((__pyx_v_flags & PyBUF_ND) != 0);
+    if (__pyx_t_1) {
+      __pyx_t_5 = (&__pyx_v_self->len); /* the single shape entry points at the array's own len field */
+    } else {
+      __pyx_t_5 = NULL;
+    }
+    __pyx_v_info->shape = __pyx_t_5;
+
+    /* "View.MemoryView":202
+ *             info.ndim = 1
+ *             info.shape = &self.len if flags & PyBUF_ND else NULL
+ *             info.strides = NULL             # <<<<<<<<<<<<<<
+ * 
+ *         info.suboffsets = NULL
+ */
+    __pyx_v_info->strides = NULL;
+  }
+  __pyx_L6:;
+
+  /* "View.MemoryView":204
+ *             info.strides = NULL
+ * 
+ *         info.suboffsets = NULL             # <<<<<<<<<<<<<<
+ *         info.itemsize = self.itemsize
+ *         info.readonly = 0
+ */
+  __pyx_v_info->suboffsets = NULL;
+
+  /* "View.MemoryView":205
+ * 
+ *         info.suboffsets = NULL
+ *         info.itemsize = self.itemsize             # <<<<<<<<<<<<<<
+ *         info.readonly = 0
+ *         info.format = self.format if flags & PyBUF_FORMAT else NULL
+ */
+  __pyx_t_3 = __pyx_v_self->itemsize;
+  __pyx_v_info->itemsize = __pyx_t_3;
+
+  /* "View.MemoryView":206
+ *         info.suboffsets = NULL
+ *         info.itemsize = self.itemsize
+ *         info.readonly = 0             # <<<<<<<<<<<<<<
+ *         info.format = self.format if flags & PyBUF_FORMAT else NULL
+ *         info.obj = self
+ */
+  __pyx_v_info->readonly = 0;
+
+  /* "View.MemoryView":207
+ *         info.itemsize = self.itemsize
+ *         info.readonly = 0
+ *         info.format = self.format if flags & PyBUF_FORMAT else NULL             # <<<<<<<<<<<<<<
+ *         info.obj = self
+ * 
+ */
+  __pyx_t_1 = ((__pyx_v_flags & PyBUF_FORMAT) != 0);
+  if (__pyx_t_1) {
+    __pyx_t_2 = __pyx_v_self->format;
+  } else {
+    __pyx_t_2 = NULL;
+  }
+  __pyx_v_info->format = __pyx_t_2;
+
+  /* "View.MemoryView":208
+ *         info.readonly = 0
+ *         info.format = self.format if flags & PyBUF_FORMAT else NULL
+ *         info.obj = self             # <<<<<<<<<<<<<<
+ * 
+ *     def __dealloc__(array self):
+ */
+  __Pyx_INCREF((PyObject *)__pyx_v_self);
+  __Pyx_GIVEREF((PyObject *)__pyx_v_self);
+  __Pyx_GOTREF(__pyx_v_info->obj);
+  __Pyx_DECREF(__pyx_v_info->obj); /* releases the previous info->obj (the None placeholder set at function entry) before storing self */
+  __pyx_v_info->obj = ((PyObject *)__pyx_v_self);
+
+  /* "View.MemoryView":182
+ *             _allocate_buffer(self)
+ * 
+ *     @cname('getbuffer')             # <<<<<<<<<<<<<<
+ *     def __getbuffer__(self, Py_buffer *info, int flags):
+ *         cdef int bufmode = -1
+ */
+
+  /* function exit code: success falls through to L0 with 0; the error path records a traceback, returns -1 and clears info->obj */
+  __pyx_r = 0;
+  goto __pyx_L0;
+  __pyx_L1_error:;
+  __Pyx_AddTraceback("View.MemoryView.array.__getbuffer__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = -1;
+  if (__pyx_v_info->obj != NULL) { /* on error, drop whatever owner reference info->obj currently holds */
+    __Pyx_GOTREF(__pyx_v_info->obj);
+    __Pyx_DECREF(__pyx_v_info->obj); __pyx_v_info->obj = 0;
+  }
+  goto __pyx_L2;
+  __pyx_L0:;
+  if (__pyx_v_info->obj == Py_None) { /* on success, only a leftover None placeholder is released and reset to NULL */
+    __Pyx_GOTREF(__pyx_v_info->obj);
+    __Pyx_DECREF(__pyx_v_info->obj); __pyx_v_info->obj = 0;
+  }
+  __pyx_L2:;
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":210
+ *         info.obj = self
+ * 
+ *     def __dealloc__(array self):             # <<<<<<<<<<<<<<
+ *         if self.callback_free_data != NULL:
+ *             self.callback_free_data(self.data)
+ */
+
+/* Python wrapper for array.__dealloc__: casts self to struct __pyx_array_obj * and calls the implementation; nothing is returned. */
+static void __pyx_array___dealloc__(PyObject *__pyx_v_self); /*proto*/
+static void __pyx_array___dealloc__(PyObject *__pyx_v_self) {
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__dealloc__ (wrapper)", 0);
+  __pyx_kwvalues = __Pyx_KwValues_VARARGS(__pyx_args, __pyx_nargs); /* NOTE(review): __pyx_args/__pyx_nargs are not declared in this function; presumably __Pyx_KwValues_VARARGS is a macro that discards its arguments - confirm against its definition */
+  __pyx_array___pyx_pf_15View_dot_MemoryView_5array_4__dealloc__(((struct __pyx_array_obj *)__pyx_v_self));
+
+  /* function exit code: void function, only the RefNanny context is closed */
+  __Pyx_RefNannyFinishContext();
+}
+
+static void __pyx_array___pyx_pf_15View_dot_MemoryView_5array_4__dealloc__(struct __pyx_array_obj *__pyx_v_self) { /* Releases self->data (via callback_free_data if set, otherwise free() when free_data is set and data is non-NULL), then frees self->_shape. */
+  int __pyx_t_1;
+  int __pyx_t_2;
+
+  /* "View.MemoryView":211
+ * 
+ *     def __dealloc__(array self):
+ *         if self.callback_free_data != NULL:             # <<<<<<<<<<<<<<
+ *             self.callback_free_data(self.data)
+ *         elif self.free_data and self.data is not NULL:
+ */
+  __pyx_t_1 = (__pyx_v_self->callback_free_data != NULL);
+  if (__pyx_t_1) {
+
+    /* "View.MemoryView":212
+ *     def __dealloc__(array self):
+ *         if self.callback_free_data != NULL:
+ *             self.callback_free_data(self.data)             # <<<<<<<<<<<<<<
+ *         elif self.free_data and self.data is not NULL:
+ *             if self.dtype_is_object:
+ */
+    __pyx_v_self->callback_free_data(__pyx_v_self->data); /* a registered callback takes precedence: the free() branch below is skipped */
+
+    /* "View.MemoryView":211
+ * 
+ *     def __dealloc__(array self):
+ *         if self.callback_free_data != NULL:             # <<<<<<<<<<<<<<
+ *             self.callback_free_data(self.data)
+ *         elif self.free_data and self.data is not NULL:
+ */
+    goto __pyx_L3;
+  }
+
+  /* "View.MemoryView":213
+ *         if self.callback_free_data != NULL:
+ *             self.callback_free_data(self.data)
+ *         elif self.free_data and self.data is not NULL:             # <<<<<<<<<<<<<<
+ *             if self.dtype_is_object:
+ *                 refcount_objects_in_slice(self.data, self._shape, self._strides, self.ndim, inc=False)
+ */
+  if (__pyx_v_self->free_data) { /* short-circuit evaluation of "free_data and data is not NULL": the data pointer is only tested when free_data is true */
+  } else {
+    __pyx_t_1 = __pyx_v_self->free_data;
+    goto __pyx_L4_bool_binop_done;
+  }
+  __pyx_t_2 = (__pyx_v_self->data != NULL);
+  __pyx_t_1 = __pyx_t_2;
+  __pyx_L4_bool_binop_done:;
+  if (__pyx_t_1) {
+
+    /* "View.MemoryView":214
+ *             self.callback_free_data(self.data)
+ *         elif self.free_data and self.data is not NULL:
+ *             if self.dtype_is_object:             # <<<<<<<<<<<<<<
+ *                 refcount_objects_in_slice(self.data, self._shape, self._strides, self.ndim, inc=False)
+ *             free(self.data)
+ */
+    if (__pyx_v_self->dtype_is_object) {
+
+      /* "View.MemoryView":215
+ *         elif self.free_data and self.data is not NULL:
+ *             if self.dtype_is_object:
+ *                 refcount_objects_in_slice(self.data, self._shape, self._strides, self.ndim, inc=False)             # <<<<<<<<<<<<<<
+ *             free(self.data)
+ *         PyObject_Free(self._shape)
+ */
+      __pyx_memoryview_refcount_objects_in_slice(__pyx_v_self->data, __pyx_v_self->_shape, __pyx_v_self->_strides, __pyx_v_self->ndim, 0); /* the trailing 0 is inc=False from the Cython source; called before the buffer itself is freed */
+
+      /* "View.MemoryView":214
+ *             self.callback_free_data(self.data)
+ *         elif self.free_data and self.data is not NULL:
+ *             if self.dtype_is_object:             # <<<<<<<<<<<<<<
+ *                 refcount_objects_in_slice(self.data, self._shape, self._strides, self.ndim, inc=False)
+ *             free(self.data)
+ */
+    }
+
+    /* "View.MemoryView":216
+ *             if self.dtype_is_object:
+ *                 refcount_objects_in_slice(self.data, self._shape, self._strides, self.ndim, inc=False)
+ *             free(self.data)             # <<<<<<<<<<<<<<
+ *         PyObject_Free(self._shape)
+ * 
+ */
+    free(__pyx_v_self->data);
+
+    /* "View.MemoryView":213
+ *         if self.callback_free_data != NULL:
+ *             self.callback_free_data(self.data)
+ *         elif self.free_data and self.data is not NULL:             # <<<<<<<<<<<<<<
+ *             if self.dtype_is_object:
+ *                 refcount_objects_in_slice(self.data, self._shape, self._strides, self.ndim, inc=False)
+ */
+  }
+  __pyx_L3:;
+
+  /* "View.MemoryView":217
+ *                 refcount_objects_in_slice(self.data, self._shape, self._strides, self.ndim, inc=False)
+ *             free(self.data)
+ *         PyObject_Free(self._shape)             # <<<<<<<<<<<<<<
+ * 
+ *     @property
+ */
+  PyObject_Free(__pyx_v_self->_shape); /* reached on every path. NOTE(review): _strides is not freed in this function; presumably it shares _shape's allocation - confirm in __cinit__ */
+
+  /* "View.MemoryView":210
+ *         info.obj = self
+ * 
+ *     def __dealloc__(array self):             # <<<<<<<<<<<<<<
+ *         if self.callback_free_data != NULL:
+ *             self.callback_free_data(self.data)
+ */
+
+  /* function exit code: nothing to clean up, no return value */
+}
+
+/* "View.MemoryView":219
+ *         PyObject_Free(self._shape)
+ * 
+ *     @property             # <<<<<<<<<<<<<<
+ *     def memview(self):
+ *         return self.get_memview()
+ */
+
+/* Python wrapper for the array.memview property getter: casts self to struct __pyx_array_obj * and returns the implementation's result. */
+static PyObject *__pyx_pw_15View_dot_MemoryView_5array_7memview_1__get__(PyObject *__pyx_v_self); /*proto*/
+static PyObject *__pyx_pw_15View_dot_MemoryView_5array_7memview_1__get__(PyObject *__pyx_v_self) {
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__get__ (wrapper)", 0);
+  __pyx_kwvalues = __Pyx_KwValues_VARARGS(__pyx_args, __pyx_nargs); /* NOTE(review): __pyx_args/__pyx_nargs are not declared in this function; presumably __Pyx_KwValues_VARARGS is a macro that discards its arguments - confirm against its definition */
+  __pyx_r = __pyx_pf_15View_dot_MemoryView_5array_7memview___get__(((struct __pyx_array_obj *)__pyx_v_self));
+
+  /* function exit code: the implementation's result (NULL on error) is returned unchanged */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static PyObject *__pyx_pf_15View_dot_MemoryView_5array_7memview___get__(struct __pyx_array_obj *__pyx_v_self) { /* Returns the object produced by the vtable's get_memview(self), or NULL with a traceback entry added on failure. */
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("__get__", 1);
+
+  /* "View.MemoryView":221
+ *     @property
+ *     def memview(self):
+ *         return self.get_memview()             # <<<<<<<<<<<<<<
+ * 
+ *     @cname('get_memview')
+ */
+  __Pyx_XDECREF(__pyx_r);
+  __pyx_t_1 = ((struct __pyx_vtabstruct_array *)__pyx_v_self->__pyx_vtab)->get_memview(__pyx_v_self); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 221, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_r = __pyx_t_1;
+  __pyx_t_1 = 0; /* the reference now lives in __pyx_r; the temporary is cleared */
+  goto __pyx_L0;
+
+  /* "View.MemoryView":219
+ *         PyObject_Free(self._shape)
+ * 
+ *     @property             # <<<<<<<<<<<<<<
+ *     def memview(self):
+ *         return self.get_memview()
+ */
+
+  /* function exit code: the error path releases the temporary, records a traceback and returns NULL */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_AddTraceback("View.MemoryView.array.memview.__get__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":224
+ * 
+ *     @cname('get_memview')
+ *     cdef get_memview(self):             # <<<<<<<<<<<<<<
+ *         flags =  PyBUF_ANY_CONTIGUOUS|PyBUF_FORMAT|PyBUF_WRITABLE
+ *         return  memoryview(self, flags, self.dtype_is_object)
+ */
+
+static PyObject *__pyx_array_get_memview(struct __pyx_array_obj *__pyx_v_self) { /* Calls the memoryview type with the tuple (self, flags, dtype_is_object) and returns the result; returns 0 (NULL) with a traceback entry added on failure. */
+  int __pyx_v_flags;
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  PyObject *__pyx_t_2 = NULL;
+  PyObject *__pyx_t_3 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("get_memview", 1);
+
+  /* "View.MemoryView":225
+ *     @cname('get_memview')
+ *     cdef get_memview(self):
+ *         flags =  PyBUF_ANY_CONTIGUOUS|PyBUF_FORMAT|PyBUF_WRITABLE             # <<<<<<<<<<<<<<
+ *         return  memoryview(self, flags, self.dtype_is_object)
+ * 
+ */
+  __pyx_v_flags = ((PyBUF_ANY_CONTIGUOUS | PyBUF_FORMAT) | PyBUF_WRITABLE);
+
+  /* "View.MemoryView":226
+ *     cdef get_memview(self):
+ *         flags =  PyBUF_ANY_CONTIGUOUS|PyBUF_FORMAT|PyBUF_WRITABLE
+ *         return  memoryview(self, flags, self.dtype_is_object)             # <<<<<<<<<<<<<<
+ * 
+ *     def __len__(self):
+ */
+  __Pyx_XDECREF(__pyx_r);
+  __pyx_t_1 = __Pyx_PyInt_From_int(__pyx_v_flags); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 226, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_t_2 = __Pyx_PyBool_FromLong(__pyx_v_self->dtype_is_object); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 226, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_2);
+  __pyx_t_3 = PyTuple_New(3); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 226, __pyx_L1_error) /* argument tuple: (self, flags, dtype_is_object) */
+  __Pyx_GOTREF(__pyx_t_3);
+  __Pyx_INCREF((PyObject *)__pyx_v_self);
+  __Pyx_GIVEREF((PyObject *)__pyx_v_self);
+  if (__Pyx_PyTuple_SET_ITEM(__pyx_t_3, 0, ((PyObject *)__pyx_v_self))) __PYX_ERR(1, 226, __pyx_L1_error);
+  __Pyx_GIVEREF(__pyx_t_1);
+  if (__Pyx_PyTuple_SET_ITEM(__pyx_t_3, 1, __pyx_t_1)) __PYX_ERR(1, 226, __pyx_L1_error);
+  __Pyx_GIVEREF(__pyx_t_2);
+  if (__Pyx_PyTuple_SET_ITEM(__pyx_t_3, 2, __pyx_t_2)) __PYX_ERR(1, 226, __pyx_L1_error);
+  __pyx_t_1 = 0; /* temporaries cleared after being stored in the tuple (presumably ownership moved to the tuple - confirm against __Pyx_PyTuple_SET_ITEM) */
+  __pyx_t_2 = 0;
+  __pyx_t_2 = __Pyx_PyObject_Call(((PyObject *)__pyx_memoryview_type), __pyx_t_3, NULL); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 226, __pyx_L1_error) /* __pyx_t_2 is reused to hold the call result */
+  __Pyx_GOTREF(__pyx_t_2);
+  __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+  __pyx_r = __pyx_t_2;
+  __pyx_t_2 = 0;
+  goto __pyx_L0;
+
+  /* "View.MemoryView":224
+ * 
+ *     @cname('get_memview')
+ *     cdef get_memview(self):             # <<<<<<<<<<<<<<
+ *         flags =  PyBUF_ANY_CONTIGUOUS|PyBUF_FORMAT|PyBUF_WRITABLE
+ *         return  memoryview(self, flags, self.dtype_is_object)
+ */
+
+  /* function exit code: the error path releases any live temporaries, records a traceback and returns 0 */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_XDECREF(__pyx_t_2);
+  __Pyx_XDECREF(__pyx_t_3);
+  __Pyx_AddTraceback("View.MemoryView.array.get_memview", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = 0;
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":228
+ *         return  memoryview(self, flags, self.dtype_is_object)
+ * 
+ *     def __len__(self):             # <<<<<<<<<<<<<<
+ *         return self._shape[0]
+ * 
+ */
+
+/* Python wrapper for array.__len__: casts self to struct __pyx_array_obj * and returns the implementation's Py_ssize_t result. */
+static Py_ssize_t __pyx_array___len__(PyObject *__pyx_v_self); /*proto*/
+static Py_ssize_t __pyx_array___len__(PyObject *__pyx_v_self) {
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  Py_ssize_t __pyx_r;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__len__ (wrapper)", 0);
+  __pyx_kwvalues = __Pyx_KwValues_VARARGS(__pyx_args, __pyx_nargs); /* NOTE(review): __pyx_args/__pyx_nargs are not declared in this function; presumably __Pyx_KwValues_VARARGS is a macro that discards its arguments - confirm against its definition */
+  __pyx_r = __pyx_array___pyx_pf_15View_dot_MemoryView_5array_6__len__(((struct __pyx_array_obj *)__pyx_v_self));
+
+  /* function exit code: the implementation's result is returned unchanged */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static Py_ssize_t __pyx_array___pyx_pf_15View_dot_MemoryView_5array_6__len__(struct __pyx_array_obj *__pyx_v_self) { /* Returns the first entry of self->_shape; there is no error path. */
+  Py_ssize_t __pyx_r;
+
+  /* "View.MemoryView":229
+ * 
+ *     def __len__(self):
+ *         return self._shape[0]             # <<<<<<<<<<<<<<
+ * 
+ *     def __getattr__(self, attr):
+ */
+  __pyx_r = (__pyx_v_self->_shape[0]); /* read without any NULL or ndim check in this function */
+  goto __pyx_L0;
+
+  /* "View.MemoryView":228
+ *         return  memoryview(self, flags, self.dtype_is_object)
+ * 
+ *     def __len__(self):             # <<<<<<<<<<<<<<
+ *         return self._shape[0]
+ * 
+ */
+
+  /* function exit code: single exit, returns the value read above */
+  __pyx_L0:;
+  return __pyx_r;
+}
+
+/* "View.MemoryView":231
+ *         return self._shape[0]
+ * 
+ *     def __getattr__(self, attr):             # <<<<<<<<<<<<<<
+ *         return getattr(self.memview, attr)
+ * 
+ */
+
+/* Python wrapper for array.__getattr__: casts self to struct __pyx_array_obj * and passes attr through to the implementation. */
+static PyObject *__pyx_array___getattr__(PyObject *__pyx_v_self, PyObject *__pyx_v_attr); /*proto*/
+static PyObject *__pyx_array___getattr__(PyObject *__pyx_v_self, PyObject *__pyx_v_attr) {
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__getattr__ (wrapper)", 0);
+  __pyx_kwvalues = __Pyx_KwValues_VARARGS(__pyx_args, __pyx_nargs); /* NOTE(review): __pyx_args/__pyx_nargs are not declared in this function; presumably __Pyx_KwValues_VARARGS is a macro that discards its arguments - confirm against its definition */
+  __pyx_r = __pyx_array___pyx_pf_15View_dot_MemoryView_5array_8__getattr__(((struct __pyx_array_obj *)__pyx_v_self), ((PyObject *)__pyx_v_attr));
+
+  /* function exit code: the implementation's result (NULL on error) is returned unchanged */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static PyObject *__pyx_array___pyx_pf_15View_dot_MemoryView_5array_8__getattr__(struct __pyx_array_obj *__pyx_v_self, PyObject *__pyx_v_attr) { /* Looks up attr on self.memview and returns it; returns NULL with a traceback entry added on failure. */
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  PyObject *__pyx_t_2 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("__getattr__", 1);
+
+  /* "View.MemoryView":232
+ * 
+ *     def __getattr__(self, attr):
+ *         return getattr(self.memview, attr)             # <<<<<<<<<<<<<<
+ * 
+ *     def __getitem__(self, item):
+ */
+  __Pyx_XDECREF(__pyx_r);
+  __pyx_t_1 = __Pyx_PyObject_GetAttrStr(((PyObject *)__pyx_v_self), __pyx_n_s_memview); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 232, __pyx_L1_error) /* the memview attribute is fetched on every call */
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_t_2 = __Pyx_GetAttr(__pyx_t_1, __pyx_v_attr); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 232, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_2);
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; /* the memview object is only needed for the lookup */
+  __pyx_r = __pyx_t_2;
+  __pyx_t_2 = 0;
+  goto __pyx_L0;
+
+  /* "View.MemoryView":231
+ *         return self._shape[0]
+ * 
+ *     def __getattr__(self, attr):             # <<<<<<<<<<<<<<
+ *         return getattr(self.memview, attr)
+ * 
+ */
+
+  /* function exit code: the error path releases any live temporaries, records a traceback and returns NULL */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_XDECREF(__pyx_t_2);
+  __Pyx_AddTraceback("View.MemoryView.array.__getattr__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":234
+ *         return getattr(self.memview, attr)
+ * 
+ *     def __getitem__(self, item):             # <<<<<<<<<<<<<<
+ *         return self.memview[item]
+ * 
+ */
+
+/* Python wrapper for array.__getitem__: casts self to struct __pyx_array_obj * and passes item through to the implementation. */
+static PyObject *__pyx_array___getitem__(PyObject *__pyx_v_self, PyObject *__pyx_v_item); /*proto*/
+static PyObject *__pyx_array___getitem__(PyObject *__pyx_v_self, PyObject *__pyx_v_item) {
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__getitem__ (wrapper)", 0);
+  __pyx_kwvalues = __Pyx_KwValues_VARARGS(__pyx_args, __pyx_nargs); /* NOTE(review): __pyx_args/__pyx_nargs are not declared in this function; presumably __Pyx_KwValues_VARARGS is a macro that discards its arguments - confirm against its definition */
+  __pyx_r = __pyx_array___pyx_pf_15View_dot_MemoryView_5array_10__getitem__(((struct __pyx_array_obj *)__pyx_v_self), ((PyObject *)__pyx_v_item));
+
+  /* function exit code: the implementation's result (NULL on error) is returned unchanged */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static PyObject *__pyx_array___pyx_pf_15View_dot_MemoryView_5array_10__getitem__(struct __pyx_array_obj *__pyx_v_self, PyObject *__pyx_v_item) { /* Returns self.memview[item]; returns NULL with a traceback entry added on failure. */
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  PyObject *__pyx_t_2 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("__getitem__", 1);
+
+  /* "View.MemoryView":235
+ * 
+ *     def __getitem__(self, item):
+ *         return self.memview[item]             # <<<<<<<<<<<<<<
+ * 
+ *     def __setitem__(self, item, value):
+ */
+  __Pyx_XDECREF(__pyx_r);
+  __pyx_t_1 = __Pyx_PyObject_GetAttrStr(((PyObject *)__pyx_v_self), __pyx_n_s_memview); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 235, __pyx_L1_error) /* the memview attribute is fetched on every call */
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_t_2 = __Pyx_PyObject_GetItem(__pyx_t_1, __pyx_v_item); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 235, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_2);
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; /* the memview object is only needed for the subscript */
+  __pyx_r = __pyx_t_2;
+  __pyx_t_2 = 0;
+  goto __pyx_L0;
+
+  /* "View.MemoryView":234
+ *         return getattr(self.memview, attr)
+ * 
+ *     def __getitem__(self, item):             # <<<<<<<<<<<<<<
+ *         return self.memview[item]
+ * 
+ */
+
+  /* function exit code: the error path releases any live temporaries, records a traceback and returns NULL */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_XDECREF(__pyx_t_2);
+  __Pyx_AddTraceback("View.MemoryView.array.__getitem__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":237
+ *         return self.memview[item]
+ * 
+ *     def __setitem__(self, item, value):             # <<<<<<<<<<<<<<
+ *         self.memview[item] = value
+ * 
+ */
+
+/* Python wrapper for array.__setitem__: casts self to struct __pyx_array_obj * and passes item and value through to the implementation. */
+static int __pyx_array___setitem__(PyObject *__pyx_v_self, PyObject *__pyx_v_item, PyObject *__pyx_v_value); /*proto*/
+static int __pyx_array___setitem__(PyObject *__pyx_v_self, PyObject *__pyx_v_item, PyObject *__pyx_v_value) {
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  int __pyx_r;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__setitem__ (wrapper)", 0);
+  __pyx_kwvalues = __Pyx_KwValues_VARARGS(__pyx_args, __pyx_nargs); /* NOTE(review): __pyx_args/__pyx_nargs are not declared in this function; presumably __Pyx_KwValues_VARARGS is a macro that discards its arguments - confirm against its definition */
+  __pyx_r = __pyx_array___pyx_pf_15View_dot_MemoryView_5array_12__setitem__(((struct __pyx_array_obj *)__pyx_v_self), ((PyObject *)__pyx_v_item), ((PyObject *)__pyx_v_value));
+
+  /* function exit code: the implementation's int result is returned unchanged */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static int __pyx_array___pyx_pf_15View_dot_MemoryView_5array_12__setitem__(struct __pyx_array_obj *__pyx_v_self, PyObject *__pyx_v_item, PyObject *__pyx_v_value) { /* Performs self.memview[item] = value; returns 0 on success, -1 with a traceback entry added on failure. */
+  int __pyx_r;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("__setitem__", 1);
+
+  /* "View.MemoryView":238
+ * 
+ *     def __setitem__(self, item, value):
+ *         self.memview[item] = value             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  __pyx_t_1 = __Pyx_PyObject_GetAttrStr(((PyObject *)__pyx_v_self), __pyx_n_s_memview); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 238, __pyx_L1_error) /* the memview attribute is fetched on every call */
+  __Pyx_GOTREF(__pyx_t_1);
+  if (unlikely((PyObject_SetItem(__pyx_t_1, __pyx_v_item, __pyx_v_value) < 0))) __PYX_ERR(1, 238, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; /* the memview object is only needed for the assignment */
+
+  /* "View.MemoryView":237
+ *         return self.memview[item]
+ * 
+ *     def __setitem__(self, item, value):             # <<<<<<<<<<<<<<
+ *         self.memview[item] = value
+ * 
+ */
+
+  /* function exit code: success returns 0; the error path releases the temporary, records a traceback and returns -1 */
+  __pyx_r = 0;
+  goto __pyx_L0;
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_AddTraceback("View.MemoryView.array.__setitem__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = -1;
+  __pyx_L0:;
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "(tree fragment)":1
+ * def __reduce_cython__(self):             # <<<<<<<<<<<<<<
+ *     raise TypeError, "no default __reduce__ due to non-trivial __cinit__"
+ * def __setstate_cython__(self, __pyx_state):
+ */
+
+/* Python wrapper for array.__reduce_cython__: validates that no arguments were passed, then calls the implementation. The signature depends on CYTHON_METH_FASTCALL. */
+static PyObject *__pyx_pw___pyx_array_1__reduce_cython__(PyObject *__pyx_v_self, 
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+); /*proto*/
+static PyObject *__pyx_pw___pyx_array_1__reduce_cython__(PyObject *__pyx_v_self, 
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+) {
+  #if !CYTHON_METH_FASTCALL
+  CYTHON_UNUSED Py_ssize_t __pyx_nargs; /* without fastcall the count is not a parameter, so it is derived from the args tuple below */
+  #endif
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__reduce_cython__ (wrapper)", 0);
+  #if !CYTHON_METH_FASTCALL
+  #if CYTHON_ASSUME_SAFE_MACROS
+  __pyx_nargs = PyTuple_GET_SIZE(__pyx_args);
+  #else
+  __pyx_nargs = PyTuple_Size(__pyx_args); if (unlikely(__pyx_nargs < 0)) return NULL;
+  #endif
+  #endif
+  __pyx_kwvalues = __Pyx_KwValues_FASTCALL(__pyx_args, __pyx_nargs);
+  if (unlikely(__pyx_nargs > 0)) { /* any positional argument is an error */
+    __Pyx_RaiseArgtupleInvalid("__reduce_cython__", 1, 0, 0, __pyx_nargs); return NULL;}
+  if (unlikely(__pyx_kwds) && __Pyx_NumKwargs_FASTCALL(__pyx_kwds) && unlikely(!__Pyx_CheckKeywordStrings(__pyx_kwds, "__reduce_cython__", 0))) return NULL; /* presumably rejects keyword arguments as well - confirm against __Pyx_CheckKeywordStrings */
+  __pyx_r = __pyx_pf___pyx_array___reduce_cython__(((struct __pyx_array_obj *)__pyx_v_self));
+
+  /* function exit code: the implementation's result is returned unchanged; the early returns above skip __Pyx_RefNannyFinishContext */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static PyObject *__pyx_pf___pyx_array___reduce_cython__(CYTHON_UNUSED struct __pyx_array_obj *__pyx_v_self) { /* implementation of array.__reduce_cython__: unconditionally raises TypeError and returns NULL */
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("__reduce_cython__", 1);
+
+  /* "(tree fragment)":2
+ * def __reduce_cython__(self):
+ *     raise TypeError, "no default __reduce__ due to non-trivial __cinit__"             # <<<<<<<<<<<<<<
+ * def __setstate_cython__(self, __pyx_state):
+ *     raise TypeError, "no default __reduce__ due to non-trivial __cinit__"
+ */
+  __Pyx_Raise(__pyx_builtin_TypeError, __pyx_kp_s_no_default___reduce___due_to_non, 0, 0); /* set the TypeError with the constant message object */
+  __PYX_ERR(1, 2, __pyx_L1_error) /* presumably records the source position and jumps to the error label - confirm against the macro */
+
+  /* "(tree fragment)":1
+ * def __reduce_cython__(self):             # <<<<<<<<<<<<<<
+ *     raise TypeError, "no default __reduce__ due to non-trivial __cinit__"
+ * def __setstate_cython__(self, __pyx_state):
+ */
+
+  /* function exit code */
+  __pyx_L1_error:; /* the only exit path: there is no success return in this function */
+  __Pyx_AddTraceback("View.MemoryView.array.__reduce_cython__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "(tree fragment)":3
+ * def __reduce_cython__(self):
+ *     raise TypeError, "no default __reduce__ due to non-trivial __cinit__"
+ * def __setstate_cython__(self, __pyx_state):             # <<<<<<<<<<<<<<
+ *     raise TypeError, "no default __reduce__ due to non-trivial __cinit__"
+ */
+
+/* Python wrapper for array.__setstate_cython__: unpacks exactly one argument (positional or keyword) into values[0] and forwards it with self to the implementation function. */
+static PyObject *__pyx_pw___pyx_array_3__setstate_cython__(PyObject *__pyx_v_self, 
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+); /*proto*/
+static PyObject *__pyx_pw___pyx_array_3__setstate_cython__(PyObject *__pyx_v_self, 
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+) {
+  CYTHON_UNUSED PyObject *__pyx_v___pyx_state = 0;
+  #if !CYTHON_METH_FASTCALL
+  CYTHON_UNUSED Py_ssize_t __pyx_nargs; /* not a parameter in this configuration; computed from the args tuple below */
+  #endif
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject* values[1] = {0}; /* slot for the single unpacked argument; released on every exit path below */
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__setstate_cython__ (wrapper)", 0);
+  #if !CYTHON_METH_FASTCALL
+  #if CYTHON_ASSUME_SAFE_MACROS
+  __pyx_nargs = PyTuple_GET_SIZE(__pyx_args);
+  #else
+  __pyx_nargs = PyTuple_Size(__pyx_args); if (unlikely(__pyx_nargs < 0)) return NULL;
+  #endif
+  #endif
+  __pyx_kwvalues = __Pyx_KwValues_FASTCALL(__pyx_args, __pyx_nargs);
+  {
+    PyObject **__pyx_pyargnames[] = {&__pyx_n_s_pyx_state,0}; /* zero-terminated list of accepted keyword names */
+    if (__pyx_kwds) {
+      Py_ssize_t kw_args;
+      switch (__pyx_nargs) { /* first pass: pick up the positional argument, if one was given */
+        case  1: values[0] = __Pyx_Arg_FASTCALL(__pyx_args, 0);
+        CYTHON_FALLTHROUGH;
+        case  0: break;
+        default: goto __pyx_L5_argtuple_error;
+      }
+      kw_args = __Pyx_NumKwargs_FASTCALL(__pyx_kwds);
+      switch (__pyx_nargs) { /* second pass: with no positional argument, the value must come from the keywords */
+        case  0:
+        if (likely((values[0] = __Pyx_GetKwValue_FASTCALL(__pyx_kwds, __pyx_kwvalues, __pyx_n_s_pyx_state)) != 0)) {
+          (void)__Pyx_Arg_NewRef_FASTCALL(values[0]);
+          kw_args--;
+        }
+        else if (unlikely(PyErr_Occurred())) __PYX_ERR(1, 3, __pyx_L3_error)
+        else goto __pyx_L5_argtuple_error; /* keyword absent and no error set: report a wrong argument count */
+      }
+      if (unlikely(kw_args > 0)) { /* keywords remain that were not consumed above; let the generic parser handle them */
+        const Py_ssize_t kwd_pos_args = __pyx_nargs;
+        if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_kwvalues, __pyx_pyargnames, 0, values + 0, kwd_pos_args, "__setstate_cython__") < 0)) __PYX_ERR(1, 3, __pyx_L3_error)
+      }
+    } else if (unlikely(__pyx_nargs != 1)) { /* no keywords: exactly one positional argument is required */
+      goto __pyx_L5_argtuple_error;
+    } else {
+      values[0] = __Pyx_Arg_FASTCALL(__pyx_args, 0);
+    }
+    __pyx_v___pyx_state = values[0];
+  }
+  goto __pyx_L6_skip;
+  __pyx_L5_argtuple_error:;
+  __Pyx_RaiseArgtupleInvalid("__setstate_cython__", 1, 1, 1, __pyx_nargs); __PYX_ERR(1, 3, __pyx_L3_error)
+  __pyx_L6_skip:;
+  goto __pyx_L4_argument_unpacking_done;
+  __pyx_L3_error:; /* argument-unpacking failure: release values[], add a traceback entry, return NULL */
+  {
+    Py_ssize_t __pyx_temp;
+    for (__pyx_temp=0; __pyx_temp < (Py_ssize_t)(sizeof(values)/sizeof(values[0])); ++__pyx_temp) {
+      __Pyx_Arg_XDECREF_FASTCALL(values[__pyx_temp]);
+    }
+  }
+  __Pyx_AddTraceback("View.MemoryView.array.__setstate_cython__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __Pyx_RefNannyFinishContext();
+  return NULL;
+  __pyx_L4_argument_unpacking_done:;
+  __pyx_r = __pyx_pf___pyx_array_2__setstate_cython__(((struct __pyx_array_obj *)__pyx_v_self), __pyx_v___pyx_state);
+
+  /* function exit code */
+  {
+    Py_ssize_t __pyx_temp;
+    for (__pyx_temp=0; __pyx_temp < (Py_ssize_t)(sizeof(values)/sizeof(values[0])); ++__pyx_temp) {
+      __Pyx_Arg_XDECREF_FASTCALL(values[__pyx_temp]);
+    }
+  }
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static PyObject *__pyx_pf___pyx_array_2__setstate_cython__(CYTHON_UNUSED struct __pyx_array_obj *__pyx_v_self, CYTHON_UNUSED PyObject *__pyx_v___pyx_state) { /* implementation of array.__setstate_cython__: ignores both arguments, raises TypeError, returns NULL */
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("__setstate_cython__", 1);
+
+  /* "(tree fragment)":4
+ *     raise TypeError, "no default __reduce__ due to non-trivial __cinit__"
+ * def __setstate_cython__(self, __pyx_state):
+ *     raise TypeError, "no default __reduce__ due to non-trivial __cinit__"             # <<<<<<<<<<<<<<
+ */
+  __Pyx_Raise(__pyx_builtin_TypeError, __pyx_kp_s_no_default___reduce___due_to_non, 0, 0); /* same message constant as used by __reduce_cython__ */
+  __PYX_ERR(1, 4, __pyx_L1_error) /* presumably records the source position and jumps to the error label - confirm against the macro */
+
+  /* "(tree fragment)":3
+ * def __reduce_cython__(self):
+ *     raise TypeError, "no default __reduce__ due to non-trivial __cinit__"
+ * def __setstate_cython__(self, __pyx_state):             # <<<<<<<<<<<<<<
+ *     raise TypeError, "no default __reduce__ due to non-trivial __cinit__"
+ */
+
+  /* function exit code */
+  __pyx_L1_error:; /* the only exit path: there is no success return in this function */
+  __Pyx_AddTraceback("View.MemoryView.array.__setstate_cython__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":248
+ * 
+ * @cname("__pyx_array_allocate_buffer")
+ * cdef int _allocate_buffer(array self) except -1:             # <<<<<<<<<<<<<<
+ * 
+ * Allocates self->data with malloc(self->len) and, for object dtypes, fills every slot with Py_None.
+ * Returns 0 on success; returns -1 with a Python exception set on failure. */
+
+static int __pyx_array_allocate_buffer(struct __pyx_array_obj *__pyx_v_self) {
+  Py_ssize_t __pyx_v_i;
+  PyObject **__pyx_v_p; /* self->data viewed as an array of object pointers (object dtype only) */
+  int __pyx_r;
+  int __pyx_t_1;
+  Py_ssize_t __pyx_t_2;
+  Py_ssize_t __pyx_t_3;
+  Py_ssize_t __pyx_t_4;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+
+  /* "View.MemoryView":254
+ *     cdef PyObject **p
+ * 
+ *     self.free_data = True             # <<<<<<<<<<<<<<
+ *     self.data = <char *>malloc(self.len)
+ *     if not self.data:
+ */
+  __pyx_v_self->free_data = 1; /* set before the allocation is attempted */
+
+  /* "View.MemoryView":255
+ * 
+ *     self.free_data = True
+ *     self.data = <char *>malloc(self.len)             # <<<<<<<<<<<<<<
+ *     if not self.data:
+ *         raise MemoryError, "unable to allocate array data."
+ */
+  __pyx_v_self->data = ((char *)malloc(__pyx_v_self->len)); /* uninitialised storage of self->len bytes */
+
+  /* "View.MemoryView":256
+ *     self.free_data = True
+ *     self.data = <char *>malloc(self.len)
+ *     if not self.data:             # <<<<<<<<<<<<<<
+ *         raise MemoryError, "unable to allocate array data."
+ * 
+ */
+  __pyx_t_1 = (!(__pyx_v_self->data != 0)); /* true when malloc returned NULL */
+  if (unlikely(__pyx_t_1)) {
+
+    /* "View.MemoryView":257
+ *     self.data = <char *>malloc(self.len)
+ *     if not self.data:
+ *         raise MemoryError, "unable to allocate array data."             # <<<<<<<<<<<<<<
+ * 
+ *     if self.dtype_is_object:
+ */
+    __Pyx_Raise(__pyx_builtin_MemoryError, __pyx_kp_s_unable_to_allocate_array_data, 0, 0);
+    __PYX_ERR(1, 257, __pyx_L1_error)
+
+    /* "View.MemoryView":256
+ *     self.free_data = True
+ *     self.data = <char *>malloc(self.len)
+ *     if not self.data:             # <<<<<<<<<<<<<<
+ *         raise MemoryError, "unable to allocate array data."
+ * 
+ */
+  }
+
+  /* "View.MemoryView":259
+ *         raise MemoryError, "unable to allocate array data."
+ * 
+ *     if self.dtype_is_object:             # <<<<<<<<<<<<<<
+ *         p = <PyObject **> self.data
+ *         for i in range(self.len // self.itemsize):
+ */
+  if (__pyx_v_self->dtype_is_object) {
+
+    /* "View.MemoryView":260
+ * 
+ *     if self.dtype_is_object:
+ *         p = <PyObject **> self.data             # <<<<<<<<<<<<<<
+ *         for i in range(self.len // self.itemsize):
+ *             p[i] = Py_None
+ */
+    __pyx_v_p = ((PyObject **)__pyx_v_self->data);
+
+    /* "View.MemoryView":261
+ *     if self.dtype_is_object:
+ *         p = <PyObject **> self.data
+ *         for i in range(self.len // self.itemsize):             # <<<<<<<<<<<<<<
+ *             p[i] = Py_None
+ *             Py_INCREF(Py_None)
+ */
+    if (unlikely(__pyx_v_self->itemsize == 0)) { /* guard the division below against a zero divisor */
+      PyErr_SetString(PyExc_ZeroDivisionError, "integer division or modulo by zero");
+      __PYX_ERR(1, 261, __pyx_L1_error)
+    }
+    else if (sizeof(Py_ssize_t) == sizeof(long) && (!(((Py_ssize_t)-1) > 0)) && unlikely(__pyx_v_self->itemsize == (Py_ssize_t)-1)  && unlikely(__Pyx_UNARY_NEG_WOULD_OVERFLOW(__pyx_v_self->len))) { /* guard the division by -1 whose negation would overflow */
+      PyErr_SetString(PyExc_OverflowError, "value too large to perform division");
+      __PYX_ERR(1, 261, __pyx_L1_error)
+    }
+    __pyx_t_2 = __Pyx_div_Py_ssize_t(__pyx_v_self->len, __pyx_v_self->itemsize); /* number of slots to initialise */
+    __pyx_t_3 = __pyx_t_2;
+    for (__pyx_t_4 = 0; __pyx_t_4 < __pyx_t_3; __pyx_t_4+=1) {
+      __pyx_v_i = __pyx_t_4;
+
+      /* "View.MemoryView":262
+ *         p = <PyObject **> self.data
+ *         for i in range(self.len // self.itemsize):
+ *             p[i] = Py_None             # <<<<<<<<<<<<<<
+ *             Py_INCREF(Py_None)
+ *     return 0
+ */
+      (__pyx_v_p[__pyx_v_i]) = Py_None;
+
+      /* "View.MemoryView":263
+ *         for i in range(self.len // self.itemsize):
+ *             p[i] = Py_None
+ *             Py_INCREF(Py_None)             # <<<<<<<<<<<<<<
+ *     return 0
+ * 
+ */
+      Py_INCREF(Py_None); /* one reference to None per slot stored above */
+    }
+
+    /* "View.MemoryView":259
+ *         raise MemoryError, "unable to allocate array data."
+ * 
+ *     if self.dtype_is_object:             # <<<<<<<<<<<<<<
+ *         p = <PyObject **> self.data
+ *         for i in range(self.len // self.itemsize):
+ */
+  }
+
+  /* "View.MemoryView":264
+ *             p[i] = Py_None
+ *             Py_INCREF(Py_None)
+ *     return 0             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  __pyx_r = 0;
+  goto __pyx_L0;
+
+  /* "View.MemoryView":248
+ * 
+ * @cname("__pyx_array_allocate_buffer")
+ * cdef int _allocate_buffer(array self) except -1:             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+
+  /* function exit code */
+  __pyx_L1_error:; /* NOTE(review): self->data is not freed here; presumably the owner releases it because free_data is set - confirm */
+  __Pyx_AddTraceback("View.MemoryView._allocate_buffer", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = -1;
+  __pyx_L0:;
+  return __pyx_r;
+}
+
+/* "View.MemoryView":268
+ * Builds an array object from C-level arguments. With buf == NULL the array is constructed with default keyword arguments;
+ * otherwise it is constructed with allocate_buffer=False and its data pointer is set to buf. Returns a new reference, or NULL on error.
+ * cdef array array_cwrapper(tuple shape, Py_ssize_t itemsize, char *format, char *c_mode, char *buf):             # <<<<<<<<<<<<<<
+ *     cdef array result
+ *     cdef str mode = "fortran" if c_mode[0] == b'f' else "c"  # this often comes from a constant C string.
+ */
+
+static struct __pyx_array_obj *__pyx_array_new(PyObject *__pyx_v_shape, Py_ssize_t __pyx_v_itemsize, char *__pyx_v_format, char *__pyx_v_c_mode, char *__pyx_v_buf) {
+  struct __pyx_array_obj *__pyx_v_result = 0;
+  PyObject *__pyx_v_mode = 0;
+  struct __pyx_array_obj *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  int __pyx_t_2;
+  PyObject *__pyx_t_3 = NULL;
+  PyObject *__pyx_t_4 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("array_cwrapper", 1);
+
+  /* "View.MemoryView":270
+ * cdef array array_cwrapper(tuple shape, Py_ssize_t itemsize, char *format, char *c_mode, char *buf):
+ *     cdef array result
+ *     cdef str mode = "fortran" if c_mode[0] == b'f' else "c"  # this often comes from a constant C string.             # <<<<<<<<<<<<<<
+ * 
+ *     if buf is NULL:
+ */
+  __pyx_t_2 = ((__pyx_v_c_mode[0]) == 'f'); /* only the first byte of c_mode is inspected */
+  if (__pyx_t_2) {
+    __Pyx_INCREF(__pyx_n_s_fortran);
+    __pyx_t_1 = __pyx_n_s_fortran;
+  } else {
+    __Pyx_INCREF(__pyx_n_s_c);
+    __pyx_t_1 = __pyx_n_s_c;
+  }
+  __pyx_v_mode = ((PyObject*)__pyx_t_1); /* mode now owns the reference taken above */
+  __pyx_t_1 = 0;
+
+  /* "View.MemoryView":272
+ *     cdef str mode = "fortran" if c_mode[0] == b'f' else "c"  # this often comes from a constant C string.
+ * 
+ *     if buf is NULL:             # <<<<<<<<<<<<<<
+ *         result = array.__new__(array, shape, itemsize, format, mode)
+ *     else:
+ */
+  __pyx_t_2 = (__pyx_v_buf == NULL);
+  if (__pyx_t_2) {
+
+    /* "View.MemoryView":273
+ * 
+ *     if buf is NULL:
+ *         result = array.__new__(array, shape, itemsize, format, mode)             # <<<<<<<<<<<<<<
+ *     else:
+ *         result = array.__new__(array, shape, itemsize, format, mode, allocate_buffer=False)
+ */
+    __pyx_t_1 = PyInt_FromSsize_t(__pyx_v_itemsize); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 273, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_1);
+    __pyx_t_3 = __Pyx_PyBytes_FromString(__pyx_v_format); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 273, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_3);
+    __pyx_t_4 = PyTuple_New(4); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 273, __pyx_L1_error) /* positional args: (shape, itemsize, format, mode) */
+    __Pyx_GOTREF(__pyx_t_4);
+    __Pyx_INCREF(__pyx_v_shape);
+    __Pyx_GIVEREF(__pyx_v_shape);
+    if (__Pyx_PyTuple_SET_ITEM(__pyx_t_4, 0, __pyx_v_shape)) __PYX_ERR(1, 273, __pyx_L1_error);
+    __Pyx_GIVEREF(__pyx_t_1);
+    if (__Pyx_PyTuple_SET_ITEM(__pyx_t_4, 1, __pyx_t_1)) __PYX_ERR(1, 273, __pyx_L1_error);
+    __Pyx_GIVEREF(__pyx_t_3);
+    if (__Pyx_PyTuple_SET_ITEM(__pyx_t_4, 2, __pyx_t_3)) __PYX_ERR(1, 273, __pyx_L1_error);
+    __Pyx_INCREF(__pyx_v_mode);
+    __Pyx_GIVEREF(__pyx_v_mode);
+    if (__Pyx_PyTuple_SET_ITEM(__pyx_t_4, 3, __pyx_v_mode)) __PYX_ERR(1, 273, __pyx_L1_error);
+    __pyx_t_1 = 0; /* temporaries cleared so the error path does not release them again (references presumably now belong to the tuple) */
+    __pyx_t_3 = 0;
+    __pyx_t_3 = ((PyObject *)__pyx_tp_new_array(((PyTypeObject *)__pyx_array_type), __pyx_t_4, NULL)); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 273, __pyx_L1_error) /* no keyword arguments in this branch */
+    __Pyx_GOTREF((PyObject *)__pyx_t_3);
+    __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+    __pyx_v_result = ((struct __pyx_array_obj *)__pyx_t_3);
+    __pyx_t_3 = 0;
+
+    /* "View.MemoryView":272
+ *     cdef str mode = "fortran" if c_mode[0] == b'f' else "c"  # this often comes from a constant C string.
+ * 
+ *     if buf is NULL:             # <<<<<<<<<<<<<<
+ *         result = array.__new__(array, shape, itemsize, format, mode)
+ *     else:
+ */
+    goto __pyx_L3;
+  }
+
+  /* "View.MemoryView":275
+ *         result = array.__new__(array, shape, itemsize, format, mode)
+ *     else:
+ *         result = array.__new__(array, shape, itemsize, format, mode, allocate_buffer=False)             # <<<<<<<<<<<<<<
+ *         result.data = buf
+ * 
+ */
+  /*else*/ {
+    __pyx_t_3 = PyInt_FromSsize_t(__pyx_v_itemsize); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 275, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_3);
+    __pyx_t_4 = __Pyx_PyBytes_FromString(__pyx_v_format); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 275, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_4);
+    __pyx_t_1 = PyTuple_New(4); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 275, __pyx_L1_error) /* positional args: (shape, itemsize, format, mode) */
+    __Pyx_GOTREF(__pyx_t_1);
+    __Pyx_INCREF(__pyx_v_shape);
+    __Pyx_GIVEREF(__pyx_v_shape);
+    if (__Pyx_PyTuple_SET_ITEM(__pyx_t_1, 0, __pyx_v_shape)) __PYX_ERR(1, 275, __pyx_L1_error);
+    __Pyx_GIVEREF(__pyx_t_3);
+    if (__Pyx_PyTuple_SET_ITEM(__pyx_t_1, 1, __pyx_t_3)) __PYX_ERR(1, 275, __pyx_L1_error);
+    __Pyx_GIVEREF(__pyx_t_4);
+    if (__Pyx_PyTuple_SET_ITEM(__pyx_t_1, 2, __pyx_t_4)) __PYX_ERR(1, 275, __pyx_L1_error);
+    __Pyx_INCREF(__pyx_v_mode);
+    __Pyx_GIVEREF(__pyx_v_mode);
+    if (__Pyx_PyTuple_SET_ITEM(__pyx_t_1, 3, __pyx_v_mode)) __PYX_ERR(1, 275, __pyx_L1_error);
+    __pyx_t_3 = 0; /* temporaries cleared so the error path does not release them again (references presumably now belong to the tuple) */
+    __pyx_t_4 = 0;
+    __pyx_t_4 = __Pyx_PyDict_NewPresized(1); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 275, __pyx_L1_error) /* keyword dict for the constructor call */
+    __Pyx_GOTREF(__pyx_t_4);
+    if (PyDict_SetItem(__pyx_t_4, __pyx_n_s_allocate_buffer, Py_False) < 0) __PYX_ERR(1, 275, __pyx_L1_error)
+    __pyx_t_3 = ((PyObject *)__pyx_tp_new_array(((PyTypeObject *)__pyx_array_type), __pyx_t_1, __pyx_t_4)); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 275, __pyx_L1_error)
+    __Pyx_GOTREF((PyObject *)__pyx_t_3);
+    __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+    __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+    __pyx_v_result = ((struct __pyx_array_obj *)__pyx_t_3);
+    __pyx_t_3 = 0;
+
+    /* "View.MemoryView":276
+ *     else:
+ *         result = array.__new__(array, shape, itemsize, format, mode, allocate_buffer=False)
+ *         result.data = buf             # <<<<<<<<<<<<<<
+ * 
+ *     return result
+ */
+    __pyx_v_result->data = __pyx_v_buf; /* the caller's pointer is stored as-is; nothing is copied here */
+  }
+  __pyx_L3:;
+
+  /* "View.MemoryView":278
+ *         result.data = buf
+ * 
+ *     return result             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  __Pyx_XDECREF((PyObject *)__pyx_r);
+  __Pyx_INCREF((PyObject *)__pyx_v_result); /* extra reference for the return value; the local one is dropped at the common exit */
+  __pyx_r = __pyx_v_result;
+  goto __pyx_L0;
+
+  /* "View.MemoryView":268
+ * 
+ * @cname("__pyx_array_new")
+ * cdef array array_cwrapper(tuple shape, Py_ssize_t itemsize, char *format, char *c_mode, char *buf):             # <<<<<<<<<<<<<<
+ *     cdef array result
+ *     cdef str mode = "fortran" if c_mode[0] == b'f' else "c"  # this often comes from a constant C string.
+ */
+
+  /* function exit code */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_XDECREF(__pyx_t_3);
+  __Pyx_XDECREF(__pyx_t_4);
+  __Pyx_AddTraceback("View.MemoryView.array_cwrapper", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = 0;
+  __pyx_L0:; /* common exit for success and error: release the locals */
+  __Pyx_XDECREF((PyObject *)__pyx_v_result);
+  __Pyx_XDECREF(__pyx_v_mode);
+  __Pyx_XGIVEREF((PyObject *)__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":304
+ * cdef class Enum(object):
+ *     cdef object name
+ *     def __init__(self, name):             # <<<<<<<<<<<<<<
+ *         self.name = name
+ *     def __repr__(self):
+ */
+
+/* Python wrapper for Enum.__init__: unpacks exactly one argument (positional or keyword) and forwards it with self to the implementation; returns -1 if unpacking fails. */
+static int __pyx_MemviewEnum___init__(PyObject *__pyx_v_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/
+static int __pyx_MemviewEnum___init__(PyObject *__pyx_v_self, PyObject *__pyx_args, PyObject *__pyx_kwds) {
+  PyObject *__pyx_v_name = 0;
+  CYTHON_UNUSED Py_ssize_t __pyx_nargs;
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject* values[1] = {0}; /* slot for the single unpacked argument; released on every exit path below */
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  int __pyx_r;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__init__ (wrapper)", 0);
+  #if CYTHON_ASSUME_SAFE_MACROS
+  __pyx_nargs = PyTuple_GET_SIZE(__pyx_args);
+  #else
+  __pyx_nargs = PyTuple_Size(__pyx_args); if (unlikely(__pyx_nargs < 0)) return -1;
+  #endif
+  __pyx_kwvalues = __Pyx_KwValues_VARARGS(__pyx_args, __pyx_nargs);
+  {
+    PyObject **__pyx_pyargnames[] = {&__pyx_n_s_name,0}; /* zero-terminated list of accepted keyword names */
+    if (__pyx_kwds) {
+      Py_ssize_t kw_args;
+      switch (__pyx_nargs) { /* first pass: pick up the positional argument, if one was given */
+        case  1: values[0] = __Pyx_Arg_VARARGS(__pyx_args, 0);
+        CYTHON_FALLTHROUGH;
+        case  0: break;
+        default: goto __pyx_L5_argtuple_error;
+      }
+      kw_args = __Pyx_NumKwargs_VARARGS(__pyx_kwds);
+      switch (__pyx_nargs) { /* second pass: with no positional argument, the value must come from the keywords */
+        case  0:
+        if (likely((values[0] = __Pyx_GetKwValue_VARARGS(__pyx_kwds, __pyx_kwvalues, __pyx_n_s_name)) != 0)) {
+          (void)__Pyx_Arg_NewRef_VARARGS(values[0]);
+          kw_args--;
+        }
+        else if (unlikely(PyErr_Occurred())) __PYX_ERR(1, 304, __pyx_L3_error)
+        else goto __pyx_L5_argtuple_error; /* keyword absent and no error set: report a wrong argument count */
+      }
+      if (unlikely(kw_args > 0)) { /* keywords remain that were not consumed above; let the generic parser handle them */
+        const Py_ssize_t kwd_pos_args = __pyx_nargs;
+        if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_kwvalues, __pyx_pyargnames, 0, values + 0, kwd_pos_args, "__init__") < 0)) __PYX_ERR(1, 304, __pyx_L3_error)
+      }
+    } else if (unlikely(__pyx_nargs != 1)) { /* no keywords: exactly one positional argument is required */
+      goto __pyx_L5_argtuple_error;
+    } else {
+      values[0] = __Pyx_Arg_VARARGS(__pyx_args, 0);
+    }
+    __pyx_v_name = values[0];
+  }
+  goto __pyx_L6_skip;
+  __pyx_L5_argtuple_error:;
+  __Pyx_RaiseArgtupleInvalid("__init__", 1, 1, 1, __pyx_nargs); __PYX_ERR(1, 304, __pyx_L3_error)
+  __pyx_L6_skip:;
+  goto __pyx_L4_argument_unpacking_done;
+  __pyx_L3_error:; /* argument-unpacking failure: release values[], add a traceback entry, return -1 */
+  {
+    Py_ssize_t __pyx_temp;
+    for (__pyx_temp=0; __pyx_temp < (Py_ssize_t)(sizeof(values)/sizeof(values[0])); ++__pyx_temp) {
+      __Pyx_Arg_XDECREF_VARARGS(values[__pyx_temp]);
+    }
+  }
+  __Pyx_AddTraceback("View.MemoryView.Enum.__init__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __Pyx_RefNannyFinishContext();
+  return -1;
+  __pyx_L4_argument_unpacking_done:;
+  __pyx_r = __pyx_MemviewEnum___pyx_pf_15View_dot_MemoryView_4Enum___init__(((struct __pyx_MemviewEnum_obj *)__pyx_v_self), __pyx_v_name);
+
+  /* function exit code */
+  {
+    Py_ssize_t __pyx_temp;
+    for (__pyx_temp=0; __pyx_temp < (Py_ssize_t)(sizeof(values)/sizeof(values[0])); ++__pyx_temp) {
+      __Pyx_Arg_XDECREF_VARARGS(values[__pyx_temp]);
+    }
+  }
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static int __pyx_MemviewEnum___pyx_pf_15View_dot_MemoryView_4Enum___init__(struct __pyx_MemviewEnum_obj *__pyx_v_self, PyObject *__pyx_v_name) { /* implementation of Enum.__init__: stores name in self->name; always returns 0 */
+  int __pyx_r;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__init__", 1);
+
+  /* "View.MemoryView":305
+ *     cdef object name
+ *     def __init__(self, name):
+ *         self.name = name             # <<<<<<<<<<<<<<
+ *     def __repr__(self):
+ *         return self.name
+ */
+  __Pyx_INCREF(__pyx_v_name); /* take a reference for the copy kept in self->name */
+  __Pyx_GIVEREF(__pyx_v_name);
+  __Pyx_GOTREF(__pyx_v_self->name);
+  __Pyx_DECREF(__pyx_v_self->name); /* release the previous value; assumes self->name is already non-NULL here - TODO confirm against the type's tp_new */
+  __pyx_v_self->name = __pyx_v_name;
+
+  /* "View.MemoryView":304
+ * cdef class Enum(object):
+ *     cdef object name
+ *     def __init__(self, name):             # <<<<<<<<<<<<<<
+ *         self.name = name
+ *     def __repr__(self):
+ */
+
+  /* function exit code */
+  __pyx_r = 0; /* no failure path in this function */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":306
+ *     def __init__(self, name):
+ *         self.name = name
+ *     def __repr__(self):             # <<<<<<<<<<<<<<
+ *         return self.name
+ * 
+ */
+
+/* Python wrapper for Enum.__repr__: takes only self and forwards it to the implementation function. */
+static PyObject *__pyx_MemviewEnum___repr__(PyObject *__pyx_v_self); /*proto*/
+static PyObject *__pyx_MemviewEnum___repr__(PyObject *__pyx_v_self) {
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__repr__ (wrapper)", 0);
+  __pyx_kwvalues = __Pyx_KwValues_VARARGS(__pyx_args, __pyx_nargs); /* NOTE(review): __pyx_args and __pyx_nargs are not declared in this function; this presumably compiles only because the macro discards its arguments - confirm against the macro definition */
+  __pyx_r = __pyx_MemviewEnum___pyx_pf_15View_dot_MemoryView_4Enum_2__repr__(((struct __pyx_MemviewEnum_obj *)__pyx_v_self));
+
+  /* function exit code */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static PyObject *__pyx_MemviewEnum___pyx_pf_15View_dot_MemoryView_4Enum_2__repr__(struct __pyx_MemviewEnum_obj *__pyx_v_self) { /* implementation of Enum.__repr__: returns a new reference to self->name */
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__repr__", 1);
+
+  /* "View.MemoryView":307
+ *         self.name = name
+ *     def __repr__(self):
+ *         return self.name             # <<<<<<<<<<<<<<
+ * 
+ * cdef generic = Enum("<strided and direct or indirect>")
+ */
+  __Pyx_XDECREF(__pyx_r); /* __pyx_r is still NULL at this point */
+  __Pyx_INCREF(__pyx_v_self->name); /* the stored object is returned as-is; no conversion or type check happens here */
+  __pyx_r = __pyx_v_self->name;
+  goto __pyx_L0;
+
+  /* "View.MemoryView":306
+ *     def __init__(self, name):
+ *         self.name = name
+ *     def __repr__(self):             # <<<<<<<<<<<<<<
+ *         return self.name
+ * 
+ */
+
+  /* function exit code */
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "(tree fragment)":1
+ * def __reduce_cython__(self):             # <<<<<<<<<<<<<<
+ *     cdef tuple state
+ *     cdef object _dict
+ */
+
+/* Python wrapper for Enum.__reduce_cython__: validates that no arguments were passed, then forwards self to the implementation function. */
+static PyObject *__pyx_pw___pyx_MemviewEnum_1__reduce_cython__(PyObject *__pyx_v_self, 
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+); /*proto*/
+static PyObject *__pyx_pw___pyx_MemviewEnum_1__reduce_cython__(PyObject *__pyx_v_self, 
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+) {
+  #if !CYTHON_METH_FASTCALL
+  CYTHON_UNUSED Py_ssize_t __pyx_nargs; /* not a parameter in this configuration; computed from the args tuple below */
+  #endif
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__reduce_cython__ (wrapper)", 0);
+  #if !CYTHON_METH_FASTCALL
+  #if CYTHON_ASSUME_SAFE_MACROS
+  __pyx_nargs = PyTuple_GET_SIZE(__pyx_args);
+  #else
+  __pyx_nargs = PyTuple_Size(__pyx_args); if (unlikely(__pyx_nargs < 0)) return NULL;
+  #endif
+  #endif
+  __pyx_kwvalues = __Pyx_KwValues_FASTCALL(__pyx_args, __pyx_nargs);
+  if (unlikely(__pyx_nargs > 0)) { /* any positional argument is an error */
+    __Pyx_RaiseArgtupleInvalid("__reduce_cython__", 1, 0, 0, __pyx_nargs); return NULL;}
+  if (unlikely(__pyx_kwds) && __Pyx_NumKwargs_FASTCALL(__pyx_kwds) && unlikely(!__Pyx_CheckKeywordStrings(__pyx_kwds, "__reduce_cython__", 0))) return NULL; /* non-empty keywords are checked by the helper; a failed check returns NULL */
+  __pyx_r = __pyx_pf___pyx_MemviewEnum___reduce_cython__(((struct __pyx_MemviewEnum_obj *)__pyx_v_self));
+
+  /* function exit code */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static PyObject *__pyx_pf___pyx_MemviewEnum___reduce_cython__(struct __pyx_MemviewEnum_obj *__pyx_v_self) {
+  PyObject *__pyx_v_state = 0;
+  PyObject *__pyx_v__dict = 0;
+  int __pyx_v_use_setstate;
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  int __pyx_t_2;
+  PyObject *__pyx_t_3 = NULL;
+  PyObject *__pyx_t_4 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("__reduce_cython__", 1);
+
+  /* "(tree fragment)":5
+ *     cdef object _dict
+ *     cdef bint use_setstate
+ *     state = (self.name,)             # <<<<<<<<<<<<<<
+ *     _dict = getattr(self, '__dict__', None)
+ *     if _dict is not None:
+ */
+  __pyx_t_1 = PyTuple_New(1); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 5, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __Pyx_INCREF(__pyx_v_self->name);
+  __Pyx_GIVEREF(__pyx_v_self->name);
+  if (__Pyx_PyTuple_SET_ITEM(__pyx_t_1, 0, __pyx_v_self->name)) __PYX_ERR(1, 5, __pyx_L1_error);
+  __pyx_v_state = ((PyObject*)__pyx_t_1);
+  __pyx_t_1 = 0;
+
+  /* "(tree fragment)":6
+ *     cdef bint use_setstate
+ *     state = (self.name,)
+ *     _dict = getattr(self, '__dict__', None)             # <<<<<<<<<<<<<<
+ *     if _dict is not None:
+ *         state += (_dict,)
+ */
+  __pyx_t_1 = __Pyx_GetAttr3(((PyObject *)__pyx_v_self), __pyx_n_s_dict, Py_None); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 6, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_v__dict = __pyx_t_1;
+  __pyx_t_1 = 0;
+
+  /* "(tree fragment)":7
+ *     state = (self.name,)
+ *     _dict = getattr(self, '__dict__', None)
+ *     if _dict is not None:             # <<<<<<<<<<<<<<
+ *         state += (_dict,)
+ *         use_setstate = True
+ */
+  __pyx_t_2 = (__pyx_v__dict != Py_None);
+  if (__pyx_t_2) {
+
+    /* "(tree fragment)":8
+ *     _dict = getattr(self, '__dict__', None)
+ *     if _dict is not None:
+ *         state += (_dict,)             # <<<<<<<<<<<<<<
+ *         use_setstate = True
+ *     else:
+ */
+    __pyx_t_1 = PyTuple_New(1); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 8, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_1);
+    __Pyx_INCREF(__pyx_v__dict);
+    __Pyx_GIVEREF(__pyx_v__dict);
+    if (__Pyx_PyTuple_SET_ITEM(__pyx_t_1, 0, __pyx_v__dict)) __PYX_ERR(1, 8, __pyx_L1_error);
+    __pyx_t_3 = PyNumber_InPlaceAdd(__pyx_v_state, __pyx_t_1); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 8, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_3);
+    __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+    __Pyx_DECREF_SET(__pyx_v_state, ((PyObject*)__pyx_t_3));
+    __pyx_t_3 = 0;
+
+    /* "(tree fragment)":9
+ *     if _dict is not None:
+ *         state += (_dict,)
+ *         use_setstate = True             # <<<<<<<<<<<<<<
+ *     else:
+ *         use_setstate = self.name is not None
+ */
+    __pyx_v_use_setstate = 1;
+
+    /* "(tree fragment)":7
+ *     state = (self.name,)
+ *     _dict = getattr(self, '__dict__', None)
+ *     if _dict is not None:             # <<<<<<<<<<<<<<
+ *         state += (_dict,)
+ *         use_setstate = True
+ */
+    goto __pyx_L3;
+  }
+
+  /* "(tree fragment)":11
+ *         use_setstate = True
+ *     else:
+ *         use_setstate = self.name is not None             # <<<<<<<<<<<<<<
+ *     if use_setstate:
+ *         return __pyx_unpickle_Enum, (type(self), 0x82a3537, None), state
+ */
+  /*else*/ {
+    __pyx_t_2 = (__pyx_v_self->name != Py_None);
+    __pyx_v_use_setstate = __pyx_t_2;
+  }
+  __pyx_L3:;
+
+  /* "(tree fragment)":12
+ *     else:
+ *         use_setstate = self.name is not None
+ *     if use_setstate:             # <<<<<<<<<<<<<<
+ *         return __pyx_unpickle_Enum, (type(self), 0x82a3537, None), state
+ *     else:
+ */
+  if (__pyx_v_use_setstate) {
+
+    /* "(tree fragment)":13
+ *         use_setstate = self.name is not None
+ *     if use_setstate:
+ *         return __pyx_unpickle_Enum, (type(self), 0x82a3537, None), state             # <<<<<<<<<<<<<<
+ *     else:
+ *         return __pyx_unpickle_Enum, (type(self), 0x82a3537, state)
+ */
+    __Pyx_XDECREF(__pyx_r);
+    __Pyx_GetModuleGlobalName(__pyx_t_3, __pyx_n_s_pyx_unpickle_Enum); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 13, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_3);
+    __pyx_t_1 = PyTuple_New(3); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 13, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_1);
+    __Pyx_INCREF(((PyObject *)Py_TYPE(((PyObject *)__pyx_v_self))));
+    __Pyx_GIVEREF(((PyObject *)Py_TYPE(((PyObject *)__pyx_v_self))));
+    if (__Pyx_PyTuple_SET_ITEM(__pyx_t_1, 0, ((PyObject *)Py_TYPE(((PyObject *)__pyx_v_self))))) __PYX_ERR(1, 13, __pyx_L1_error);
+    __Pyx_INCREF(__pyx_int_136983863);
+    __Pyx_GIVEREF(__pyx_int_136983863);
+    if (__Pyx_PyTuple_SET_ITEM(__pyx_t_1, 1, __pyx_int_136983863)) __PYX_ERR(1, 13, __pyx_L1_error);
+    __Pyx_INCREF(Py_None);
+    __Pyx_GIVEREF(Py_None);
+    if (__Pyx_PyTuple_SET_ITEM(__pyx_t_1, 2, Py_None)) __PYX_ERR(1, 13, __pyx_L1_error);
+    __pyx_t_4 = PyTuple_New(3); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 13, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_4);
+    __Pyx_GIVEREF(__pyx_t_3);
+    if (__Pyx_PyTuple_SET_ITEM(__pyx_t_4, 0, __pyx_t_3)) __PYX_ERR(1, 13, __pyx_L1_error);
+    __Pyx_GIVEREF(__pyx_t_1);
+    if (__Pyx_PyTuple_SET_ITEM(__pyx_t_4, 1, __pyx_t_1)) __PYX_ERR(1, 13, __pyx_L1_error);
+    __Pyx_INCREF(__pyx_v_state);
+    __Pyx_GIVEREF(__pyx_v_state);
+    if (__Pyx_PyTuple_SET_ITEM(__pyx_t_4, 2, __pyx_v_state)) __PYX_ERR(1, 13, __pyx_L1_error);
+    __pyx_t_3 = 0;
+    __pyx_t_1 = 0;
+    __pyx_r = __pyx_t_4;
+    __pyx_t_4 = 0;
+    goto __pyx_L0;
+
+    /* "(tree fragment)":12
+ *     else:
+ *         use_setstate = self.name is not None
+ *     if use_setstate:             # <<<<<<<<<<<<<<
+ *         return __pyx_unpickle_Enum, (type(self), 0x82a3537, None), state
+ *     else:
+ */
+  }
+
+  /* "(tree fragment)":15
+ *         return __pyx_unpickle_Enum, (type(self), 0x82a3537, None), state
+ *     else:
+ *         return __pyx_unpickle_Enum, (type(self), 0x82a3537, state)             # <<<<<<<<<<<<<<
+ * def __setstate_cython__(self, __pyx_state):
+ *     __pyx_unpickle_Enum__set_state(self, __pyx_state)
+ */
+  /*else*/ {
+    __Pyx_XDECREF(__pyx_r);
+    __Pyx_GetModuleGlobalName(__pyx_t_4, __pyx_n_s_pyx_unpickle_Enum); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 15, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_4);
+    __pyx_t_1 = PyTuple_New(3); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 15, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_1);
+    __Pyx_INCREF(((PyObject *)Py_TYPE(((PyObject *)__pyx_v_self))));
+    __Pyx_GIVEREF(((PyObject *)Py_TYPE(((PyObject *)__pyx_v_self))));
+    if (__Pyx_PyTuple_SET_ITEM(__pyx_t_1, 0, ((PyObject *)Py_TYPE(((PyObject *)__pyx_v_self))))) __PYX_ERR(1, 15, __pyx_L1_error);
+    __Pyx_INCREF(__pyx_int_136983863);
+    __Pyx_GIVEREF(__pyx_int_136983863);
+    if (__Pyx_PyTuple_SET_ITEM(__pyx_t_1, 1, __pyx_int_136983863)) __PYX_ERR(1, 15, __pyx_L1_error);
+    __Pyx_INCREF(__pyx_v_state);
+    __Pyx_GIVEREF(__pyx_v_state);
+    if (__Pyx_PyTuple_SET_ITEM(__pyx_t_1, 2, __pyx_v_state)) __PYX_ERR(1, 15, __pyx_L1_error);
+    __pyx_t_3 = PyTuple_New(2); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 15, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_3);
+    __Pyx_GIVEREF(__pyx_t_4);
+    if (__Pyx_PyTuple_SET_ITEM(__pyx_t_3, 0, __pyx_t_4)) __PYX_ERR(1, 15, __pyx_L1_error);
+    __Pyx_GIVEREF(__pyx_t_1);
+    if (__Pyx_PyTuple_SET_ITEM(__pyx_t_3, 1, __pyx_t_1)) __PYX_ERR(1, 15, __pyx_L1_error);
+    __pyx_t_4 = 0;
+    __pyx_t_1 = 0;
+    __pyx_r = __pyx_t_3;
+    __pyx_t_3 = 0;
+    goto __pyx_L0;
+  }
+
+  /* "(tree fragment)":1
+ * def __reduce_cython__(self):             # <<<<<<<<<<<<<<
+ *     cdef tuple state
+ *     cdef object _dict
+ */
+
+  /* function exit code */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_XDECREF(__pyx_t_3);
+  __Pyx_XDECREF(__pyx_t_4);
+  __Pyx_AddTraceback("View.MemoryView.Enum.__reduce_cython__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __pyx_L0:;
+  __Pyx_XDECREF(__pyx_v_state);
+  __Pyx_XDECREF(__pyx_v__dict);
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "(tree fragment)":16
+ *     else:
+ *         return __pyx_unpickle_Enum, (type(self), 0x82a3537, state)
+ * def __setstate_cython__(self, __pyx_state):             # <<<<<<<<<<<<<<
+ *     __pyx_unpickle_Enum__set_state(self, __pyx_state)
+ */
+
+/* Python wrapper for Enum.__setstate_cython__: unpacks the single `__pyx_state` argument (positional or keyword) and forwards it to the implementation function; returns NULL after adding a traceback entry if unpacking fails. */
+static PyObject *__pyx_pw___pyx_MemviewEnum_3__setstate_cython__(PyObject *__pyx_v_self, 
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+); /*proto*/
+static PyObject *__pyx_pw___pyx_MemviewEnum_3__setstate_cython__(PyObject *__pyx_v_self, 
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+) {
+  PyObject *__pyx_v___pyx_state = 0;
+  #if !CYTHON_METH_FASTCALL
+  CYTHON_UNUSED Py_ssize_t __pyx_nargs;
+  #endif
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject* values[1] = {0}; /* values[0] receives the `__pyx_state` argument */
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__setstate_cython__ (wrapper)", 0);
+  #if !CYTHON_METH_FASTCALL
+  #if CYTHON_ASSUME_SAFE_MACROS
+  __pyx_nargs = PyTuple_GET_SIZE(__pyx_args);
+  #else
+  __pyx_nargs = PyTuple_Size(__pyx_args); if (unlikely(__pyx_nargs < 0)) return NULL;
+  #endif
+  #endif
+  __pyx_kwvalues = __Pyx_KwValues_FASTCALL(__pyx_args, __pyx_nargs);
+  {
+    PyObject **__pyx_pyargnames[] = {&__pyx_n_s_pyx_state,0}; /* NULL-terminated list of accepted keyword names */
+    if (__pyx_kwds) { /* keywords were supplied: combine positional and keyword lookup */
+      Py_ssize_t kw_args;
+      switch (__pyx_nargs) {
+        case  1: values[0] = __Pyx_Arg_FASTCALL(__pyx_args, 0);
+        CYTHON_FALLTHROUGH;
+        case  0: break;
+        default: goto __pyx_L5_argtuple_error; /* more than one positional argument */
+      }
+      kw_args = __Pyx_NumKwargs_FASTCALL(__pyx_kwds);
+      switch (__pyx_nargs) {
+        case  0: /* not given positionally: look `__pyx_state` up among the keywords */
+        if (likely((values[0] = __Pyx_GetKwValue_FASTCALL(__pyx_kwds, __pyx_kwvalues, __pyx_n_s_pyx_state)) != 0)) {
+          (void)__Pyx_Arg_NewRef_FASTCALL(values[0]);
+          kw_args--;
+        }
+        else if (unlikely(PyErr_Occurred())) __PYX_ERR(1, 16, __pyx_L3_error)
+        else goto __pyx_L5_argtuple_error;
+      }
+      if (unlikely(kw_args > 0)) { /* keywords remain that were not consumed above */
+        const Py_ssize_t kwd_pos_args = __pyx_nargs;
+        if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_kwvalues, __pyx_pyargnames, 0, values + 0, kwd_pos_args, "__setstate_cython__") < 0)) __PYX_ERR(1, 16, __pyx_L3_error)
+      }
+    } else if (unlikely(__pyx_nargs != 1)) { /* no keywords: exactly one positional argument is required */
+      goto __pyx_L5_argtuple_error;
+    } else {
+      values[0] = __Pyx_Arg_FASTCALL(__pyx_args, 0);
+    }
+    __pyx_v___pyx_state = values[0];
+  }
+  goto __pyx_L6_skip;
+  __pyx_L5_argtuple_error:;
+  __Pyx_RaiseArgtupleInvalid("__setstate_cython__", 1, 1, 1, __pyx_nargs); __PYX_ERR(1, 16, __pyx_L3_error)
+  __pyx_L6_skip:;
+  goto __pyx_L4_argument_unpacking_done;
+  __pyx_L3_error:; /* unpacking failed: run the XDECREF macro over values[], add a traceback entry, return NULL */
+  {
+    Py_ssize_t __pyx_temp;
+    for (__pyx_temp=0; __pyx_temp < (Py_ssize_t)(sizeof(values)/sizeof(values[0])); ++__pyx_temp) {
+      __Pyx_Arg_XDECREF_FASTCALL(values[__pyx_temp]);
+    }
+  }
+  __Pyx_AddTraceback("View.MemoryView.Enum.__setstate_cython__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __Pyx_RefNannyFinishContext();
+  return NULL;
+  __pyx_L4_argument_unpacking_done:;
+  __pyx_r = __pyx_pf___pyx_MemviewEnum_2__setstate_cython__(((struct __pyx_MemviewEnum_obj *)__pyx_v_self), __pyx_v___pyx_state); /* delegate to the implementation; its result is returned unchanged */
+
+  /* function exit code: same values[] cleanup loop as the error path, then return the implementation's result */
+  {
+    Py_ssize_t __pyx_temp;
+    for (__pyx_temp=0; __pyx_temp < (Py_ssize_t)(sizeof(values)/sizeof(values[0])); ++__pyx_temp) {
+      __Pyx_Arg_XDECREF_FASTCALL(values[__pyx_temp]);
+    }
+  }
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static PyObject *__pyx_pf___pyx_MemviewEnum_2__setstate_cython__(struct __pyx_MemviewEnum_obj *__pyx_v_self, PyObject *__pyx_v___pyx_state) { /* Implementation of Enum.__setstate_cython__: type-checks __pyx_state, passes it to __pyx_unpickle_Enum__set_state, and returns None (NULL on error). */
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("__setstate_cython__", 1);
+
+  /* "(tree fragment)":17
+ *         return __pyx_unpickle_Enum, (type(self), 0x82a3537, state)
+ * def __setstate_cython__(self, __pyx_state):
+ *     __pyx_unpickle_Enum__set_state(self, __pyx_state)             # <<<<<<<<<<<<<<
+ */
+  if (!(likely(PyTuple_CheckExact(__pyx_v___pyx_state))||((__pyx_v___pyx_state) == Py_None) || __Pyx_RaiseUnexpectedTypeError("tuple", __pyx_v___pyx_state))) __PYX_ERR(1, 17, __pyx_L1_error) /* only an exact tuple or None is accepted; anything else goes through __Pyx_RaiseUnexpectedTypeError and the error exit */
+  __pyx_t_1 = __pyx_unpickle_Enum__set_state(__pyx_v_self, ((PyObject*)__pyx_v___pyx_state)); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 17, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; /* the helper's return value is not used, only released */
+
+  /* "(tree fragment)":16
+ *     else:
+ *         return __pyx_unpickle_Enum, (type(self), 0x82a3537, state)
+ * def __setstate_cython__(self, __pyx_state):             # <<<<<<<<<<<<<<
+ *     __pyx_unpickle_Enum__set_state(self, __pyx_state)
+ */
+
+  /* function exit code: success returns a new reference to None; the error label returns NULL after adding a traceback entry */
+  __pyx_r = Py_None; __Pyx_INCREF(Py_None);
+  goto __pyx_L0;
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_AddTraceback("View.MemoryView.Enum.__setstate_cython__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":349
+ *     cdef __Pyx_TypeInfo *typeinfo
+ * 
+ *     def __cinit__(memoryview self, object obj, int flags, bint dtype_is_object=False):             # <<<<<<<<<<<<<<
+ *         self.obj = obj
+ *         self.flags = flags
+ */
+
+/* Python wrapper for memoryview.__cinit__: unpacks `obj` and `flags` (required) and `dtype_is_object` (optional, defaults to 0), converts them to C values, and calls the implementation; returns -1 if unpacking or conversion fails. */
+static int __pyx_memoryview___cinit__(PyObject *__pyx_v_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/
+static int __pyx_memoryview___cinit__(PyObject *__pyx_v_self, PyObject *__pyx_args, PyObject *__pyx_kwds) {
+  PyObject *__pyx_v_obj = 0;
+  int __pyx_v_flags;
+  int __pyx_v_dtype_is_object;
+  CYTHON_UNUSED Py_ssize_t __pyx_nargs;
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject* values[3] = {0,0,0}; /* slots: [0] obj, [1] flags, [2] dtype_is_object */
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  int __pyx_r;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__cinit__ (wrapper)", 0);
+  #if CYTHON_ASSUME_SAFE_MACROS
+  __pyx_nargs = PyTuple_GET_SIZE(__pyx_args);
+  #else
+  __pyx_nargs = PyTuple_Size(__pyx_args); if (unlikely(__pyx_nargs < 0)) return -1;
+  #endif
+  __pyx_kwvalues = __Pyx_KwValues_VARARGS(__pyx_args, __pyx_nargs);
+  {
+    PyObject **__pyx_pyargnames[] = {&__pyx_n_s_obj,&__pyx_n_s_flags,&__pyx_n_s_dtype_is_object,0}; /* NULL-terminated list of accepted keyword names, in parameter order */
+    if (__pyx_kwds) { /* keywords were supplied: take positionals first, then fill the rest by name */
+      Py_ssize_t kw_args;
+      switch (__pyx_nargs) {
+        case  3: values[2] = __Pyx_Arg_VARARGS(__pyx_args, 2);
+        CYTHON_FALLTHROUGH;
+        case  2: values[1] = __Pyx_Arg_VARARGS(__pyx_args, 1);
+        CYTHON_FALLTHROUGH;
+        case  1: values[0] = __Pyx_Arg_VARARGS(__pyx_args, 0);
+        CYTHON_FALLTHROUGH;
+        case  0: break;
+        default: goto __pyx_L5_argtuple_error; /* more than three positional arguments */
+      }
+      kw_args = __Pyx_NumKwargs_VARARGS(__pyx_kwds);
+      switch (__pyx_nargs) { /* enter at the first parameter not given positionally and fall through the rest */
+        case  0:
+        if (likely((values[0] = __Pyx_GetKwValue_VARARGS(__pyx_kwds, __pyx_kwvalues, __pyx_n_s_obj)) != 0)) {
+          (void)__Pyx_Arg_NewRef_VARARGS(values[0]);
+          kw_args--;
+        }
+        else if (unlikely(PyErr_Occurred())) __PYX_ERR(1, 349, __pyx_L3_error)
+        else goto __pyx_L5_argtuple_error;
+        CYTHON_FALLTHROUGH;
+        case  1:
+        if (likely((values[1] = __Pyx_GetKwValue_VARARGS(__pyx_kwds, __pyx_kwvalues, __pyx_n_s_flags)) != 0)) {
+          (void)__Pyx_Arg_NewRef_VARARGS(values[1]);
+          kw_args--;
+        }
+        else if (unlikely(PyErr_Occurred())) __PYX_ERR(1, 349, __pyx_L3_error)
+        else {
+          __Pyx_RaiseArgtupleInvalid("__cinit__", 0, 2, 3, 1); __PYX_ERR(1, 349, __pyx_L3_error)
+        }
+        CYTHON_FALLTHROUGH;
+        case  2: /* dtype_is_object is optional: a missing keyword is not an error */
+        if (kw_args > 0) {
+          PyObject* value = __Pyx_GetKwValue_VARARGS(__pyx_kwds, __pyx_kwvalues, __pyx_n_s_dtype_is_object);
+          if (value) { values[2] = __Pyx_Arg_NewRef_VARARGS(value); kw_args--; }
+          else if (unlikely(PyErr_Occurred())) __PYX_ERR(1, 349, __pyx_L3_error)
+        }
+      }
+      if (unlikely(kw_args > 0)) { /* keywords remain that were not consumed above */
+        const Py_ssize_t kwd_pos_args = __pyx_nargs;
+        if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_kwvalues, __pyx_pyargnames, 0, values + 0, kwd_pos_args, "__cinit__") < 0)) __PYX_ERR(1, 349, __pyx_L3_error)
+      }
+    } else { /* positional-only call: 2 or 3 arguments are accepted */
+      switch (__pyx_nargs) {
+        case  3: values[2] = __Pyx_Arg_VARARGS(__pyx_args, 2);
+        CYTHON_FALLTHROUGH;
+        case  2: values[1] = __Pyx_Arg_VARARGS(__pyx_args, 1);
+        values[0] = __Pyx_Arg_VARARGS(__pyx_args, 0);
+        break;
+        default: goto __pyx_L5_argtuple_error;
+      }
+    }
+    __pyx_v_obj = values[0];
+    __pyx_v_flags = __Pyx_PyInt_As_int(values[1]); if (unlikely((__pyx_v_flags == (int)-1) && PyErr_Occurred())) __PYX_ERR(1, 349, __pyx_L3_error) /* -1 is only an error when an exception is also pending */
+    if (values[2]) {
+      __pyx_v_dtype_is_object = __Pyx_PyObject_IsTrue(values[2]); if (unlikely((__pyx_v_dtype_is_object == (int)-1) && PyErr_Occurred())) __PYX_ERR(1, 349, __pyx_L3_error)
+    } else {
+      __pyx_v_dtype_is_object = ((int)0); /* default for the omitted argument (dtype_is_object=False) */
+    }
+  }
+  goto __pyx_L6_skip;
+  __pyx_L5_argtuple_error:;
+  __Pyx_RaiseArgtupleInvalid("__cinit__", 0, 2, 3, __pyx_nargs); __PYX_ERR(1, 349, __pyx_L3_error)
+  __pyx_L6_skip:;
+  goto __pyx_L4_argument_unpacking_done;
+  __pyx_L3_error:; /* unpacking failed: run the XDECREF macro over values[], add a traceback entry, return -1 */
+  {
+    Py_ssize_t __pyx_temp;
+    for (__pyx_temp=0; __pyx_temp < (Py_ssize_t)(sizeof(values)/sizeof(values[0])); ++__pyx_temp) {
+      __Pyx_Arg_XDECREF_VARARGS(values[__pyx_temp]);
+    }
+  }
+  __Pyx_AddTraceback("View.MemoryView.memoryview.__cinit__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __Pyx_RefNannyFinishContext();
+  return -1;
+  __pyx_L4_argument_unpacking_done:;
+  __pyx_r = __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview___cinit__(((struct __pyx_memoryview_obj *)__pyx_v_self), __pyx_v_obj, __pyx_v_flags, __pyx_v_dtype_is_object); /* delegate to the implementation; its status code is returned unchanged */
+
+  /* function exit code: same values[] cleanup loop as the error path, then return the implementation's status */
+  {
+    Py_ssize_t __pyx_temp;
+    for (__pyx_temp=0; __pyx_temp < (Py_ssize_t)(sizeof(values)/sizeof(values[0])); ++__pyx_temp) {
+      __Pyx_Arg_XDECREF_VARARGS(values[__pyx_temp]);
+    }
+  }
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static int __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview___cinit__(struct __pyx_memoryview_obj *__pyx_v_self, PyObject *__pyx_v_obj, int __pyx_v_flags, int __pyx_v_dtype_is_object) { /* Implementation of memoryview.__cinit__: stores obj and flags, acquires the buffer, picks or allocates a lock when atomics are disabled, and sets dtype_is_object and typeinfo. Returns 0 on success, -1 on failure. */
+  int __pyx_r;
+  __Pyx_RefNannyDeclarations
+  int __pyx_t_1;
+  int __pyx_t_2;
+  int __pyx_t_3;
+  Py_intptr_t __pyx_t_4;
+  size_t __pyx_t_5;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("__cinit__", 1);
+
+  /* "View.MemoryView":350
+ * 
+ *     def __cinit__(memoryview self, object obj, int flags, bint dtype_is_object=False):
+ *         self.obj = obj             # <<<<<<<<<<<<<<
+ *         self.flags = flags
+ *         if type(self) is memoryview or obj is not None:
+ */
+  __Pyx_INCREF(__pyx_v_obj); /* new reference for the value about to be stored in self->obj */
+  __Pyx_GIVEREF(__pyx_v_obj);
+  __Pyx_GOTREF(__pyx_v_self->obj);
+  __Pyx_DECREF(__pyx_v_self->obj); /* release the value previously held in self->obj */
+  __pyx_v_self->obj = __pyx_v_obj;
+
+  /* "View.MemoryView":351
+ *     def __cinit__(memoryview self, object obj, int flags, bint dtype_is_object=False):
+ *         self.obj = obj
+ *         self.flags = flags             # <<<<<<<<<<<<<<
+ *         if type(self) is memoryview or obj is not None:
+ *             __Pyx_GetBuffer(obj, &self.view, flags)
+ */
+  __pyx_v_self->flags = __pyx_v_flags;
+
+  /* "View.MemoryView":352
+ *         self.obj = obj
+ *         self.flags = flags
+ *         if type(self) is memoryview or obj is not None:             # <<<<<<<<<<<<<<
+ *             __Pyx_GetBuffer(obj, &self.view, flags)
+ *             if <PyObject *> self.view.obj == NULL:
+ */
+  __pyx_t_2 = (((PyObject *)Py_TYPE(((PyObject *)__pyx_v_self))) == ((PyObject *)__pyx_memoryview_type)); /* exact type comparison, not a subclass check */
+  if (!__pyx_t_2) {
+  } else {
+    __pyx_t_1 = __pyx_t_2; /* short-circuit `or`: the first operand is already true */
+    goto __pyx_L4_bool_binop_done;
+  }
+  __pyx_t_2 = (__pyx_v_obj != Py_None);
+  __pyx_t_1 = __pyx_t_2;
+  __pyx_L4_bool_binop_done:;
+  if (__pyx_t_1) {
+
+    /* "View.MemoryView":353
+ *         self.flags = flags
+ *         if type(self) is memoryview or obj is not None:
+ *             __Pyx_GetBuffer(obj, &self.view, flags)             # <<<<<<<<<<<<<<
+ *             if <PyObject *> self.view.obj == NULL:
+ *                 (<__pyx_buffer *> &self.view).obj = Py_None
+ */
+    __pyx_t_3 = __Pyx_GetBuffer(__pyx_v_obj, (&__pyx_v_self->view), __pyx_v_flags); if (unlikely(__pyx_t_3 == ((int)-1))) __PYX_ERR(1, 353, __pyx_L1_error) /* a -1 result is treated as failure and jumps to the error exit */
+
+    /* "View.MemoryView":354
+ *         if type(self) is memoryview or obj is not None:
+ *             __Pyx_GetBuffer(obj, &self.view, flags)
+ *             if <PyObject *> self.view.obj == NULL:             # <<<<<<<<<<<<<<
+ *                 (<__pyx_buffer *> &self.view).obj = Py_None
+ *                 Py_INCREF(Py_None)
+ */
+    __pyx_t_1 = (((PyObject *)__pyx_v_self->view.obj) == NULL);
+    if (__pyx_t_1) {
+
+      /* "View.MemoryView":355
+ *             __Pyx_GetBuffer(obj, &self.view, flags)
+ *             if <PyObject *> self.view.obj == NULL:
+ *                 (<__pyx_buffer *> &self.view).obj = Py_None             # <<<<<<<<<<<<<<
+ *                 Py_INCREF(Py_None)
+ * 
+ */
+      ((Py_buffer *)(&__pyx_v_self->view))->obj = Py_None; /* replace a NULL view.obj with None; the INCREF below gives the view its own reference */
+
+      /* "View.MemoryView":356
+ *             if <PyObject *> self.view.obj == NULL:
+ *                 (<__pyx_buffer *> &self.view).obj = Py_None
+ *                 Py_INCREF(Py_None)             # <<<<<<<<<<<<<<
+ * 
+ *         if not __PYX_CYTHON_ATOMICS_ENABLED():
+ */
+      Py_INCREF(Py_None);
+
+      /* "View.MemoryView":354
+ *         if type(self) is memoryview or obj is not None:
+ *             __Pyx_GetBuffer(obj, &self.view, flags)
+ *             if <PyObject *> self.view.obj == NULL:             # <<<<<<<<<<<<<<
+ *                 (<__pyx_buffer *> &self.view).obj = Py_None
+ *                 Py_INCREF(Py_None)
+ */
+    }
+
+    /* "View.MemoryView":352
+ *         self.obj = obj
+ *         self.flags = flags
+ *         if type(self) is memoryview or obj is not None:             # <<<<<<<<<<<<<<
+ *             __Pyx_GetBuffer(obj, &self.view, flags)
+ *             if <PyObject *> self.view.obj == NULL:
+ */
+  }
+
+  /* "View.MemoryView":358
+ *                 Py_INCREF(Py_None)
+ * 
+ *         if not __PYX_CYTHON_ATOMICS_ENABLED():             # <<<<<<<<<<<<<<
+ *             global __pyx_memoryview_thread_locks_used
+ *             if __pyx_memoryview_thread_locks_used < 8:
+ */
+  __pyx_t_1 = (!__PYX_CYTHON_ATOMICS_ENABLED()); /* a lock is only set up when atomics are not enabled */
+  if (__pyx_t_1) {
+
+    /* "View.MemoryView":360
+ *         if not __PYX_CYTHON_ATOMICS_ENABLED():
+ *             global __pyx_memoryview_thread_locks_used
+ *             if __pyx_memoryview_thread_locks_used < 8:             # <<<<<<<<<<<<<<
+ *                 self.lock = __pyx_memoryview_thread_locks[__pyx_memoryview_thread_locks_used]
+ *                 __pyx_memoryview_thread_locks_used += 1
+ */
+    __pyx_t_1 = (__pyx_memoryview_thread_locks_used < 8); /* fewer than 8 table entries in use: take the next one from __pyx_memoryview_thread_locks */
+    if (__pyx_t_1) {
+
+      /* "View.MemoryView":361
+ *             global __pyx_memoryview_thread_locks_used
+ *             if __pyx_memoryview_thread_locks_used < 8:
+ *                 self.lock = __pyx_memoryview_thread_locks[__pyx_memoryview_thread_locks_used]             # <<<<<<<<<<<<<<
+ *                 __pyx_memoryview_thread_locks_used += 1
+ *             if self.lock is NULL:
+ */
+      __pyx_v_self->lock = (__pyx_memoryview_thread_locks[__pyx_memoryview_thread_locks_used]);
+
+      /* "View.MemoryView":362
+ *             if __pyx_memoryview_thread_locks_used < 8:
+ *                 self.lock = __pyx_memoryview_thread_locks[__pyx_memoryview_thread_locks_used]
+ *                 __pyx_memoryview_thread_locks_used += 1             # <<<<<<<<<<<<<<
+ *             if self.lock is NULL:
+ *                 self.lock = PyThread_allocate_lock()
+ */
+      __pyx_memoryview_thread_locks_used = (__pyx_memoryview_thread_locks_used + 1);
+
+      /* "View.MemoryView":360
+ *         if not __PYX_CYTHON_ATOMICS_ENABLED():
+ *             global __pyx_memoryview_thread_locks_used
+ *             if __pyx_memoryview_thread_locks_used < 8:             # <<<<<<<<<<<<<<
+ *                 self.lock = __pyx_memoryview_thread_locks[__pyx_memoryview_thread_locks_used]
+ *                 __pyx_memoryview_thread_locks_used += 1
+ */
+    }
+
+    /* "View.MemoryView":363
+ *                 self.lock = __pyx_memoryview_thread_locks[__pyx_memoryview_thread_locks_used]
+ *                 __pyx_memoryview_thread_locks_used += 1
+ *             if self.lock is NULL:             # <<<<<<<<<<<<<<
+ *                 self.lock = PyThread_allocate_lock()
+ *                 if self.lock is NULL:
+ */
+    __pyx_t_1 = (__pyx_v_self->lock == NULL); /* lock is still NULL at this point: fall back to PyThread_allocate_lock() */
+    if (__pyx_t_1) {
+
+      /* "View.MemoryView":364
+ *                 __pyx_memoryview_thread_locks_used += 1
+ *             if self.lock is NULL:
+ *                 self.lock = PyThread_allocate_lock()             # <<<<<<<<<<<<<<
+ *                 if self.lock is NULL:
+ *                     raise MemoryError
+ */
+      __pyx_v_self->lock = PyThread_allocate_lock();
+
+      /* "View.MemoryView":365
+ *             if self.lock is NULL:
+ *                 self.lock = PyThread_allocate_lock()
+ *                 if self.lock is NULL:             # <<<<<<<<<<<<<<
+ *                     raise MemoryError
+ * 
+ */
+      __pyx_t_1 = (__pyx_v_self->lock == NULL);
+      if (unlikely(__pyx_t_1)) {
+
+        /* "View.MemoryView":366
+ *                 self.lock = PyThread_allocate_lock()
+ *                 if self.lock is NULL:
+ *                     raise MemoryError             # <<<<<<<<<<<<<<
+ * 
+ *         if flags & PyBUF_FORMAT:
+ */
+        PyErr_NoMemory(); __PYX_ERR(1, 366, __pyx_L1_error)
+
+        /* "View.MemoryView":365
+ *             if self.lock is NULL:
+ *                 self.lock = PyThread_allocate_lock()
+ *                 if self.lock is NULL:             # <<<<<<<<<<<<<<
+ *                     raise MemoryError
+ * 
+ */
+      }
+
+      /* "View.MemoryView":363
+ *                 self.lock = __pyx_memoryview_thread_locks[__pyx_memoryview_thread_locks_used]
+ *                 __pyx_memoryview_thread_locks_used += 1
+ *             if self.lock is NULL:             # <<<<<<<<<<<<<<
+ *                 self.lock = PyThread_allocate_lock()
+ *                 if self.lock is NULL:
+ */
+    }
+
+    /* "View.MemoryView":358
+ *                 Py_INCREF(Py_None)
+ * 
+ *         if not __PYX_CYTHON_ATOMICS_ENABLED():             # <<<<<<<<<<<<<<
+ *             global __pyx_memoryview_thread_locks_used
+ *             if __pyx_memoryview_thread_locks_used < 8:
+ */
+  }
+
+  /* "View.MemoryView":368
+ *                     raise MemoryError
+ * 
+ *         if flags & PyBUF_FORMAT:             # <<<<<<<<<<<<<<
+ *             self.dtype_is_object = (self.view.format[0] == b'O' and self.view.format[1] == b'\0')
+ *         else:
+ */
+  __pyx_t_1 = ((__pyx_v_flags & PyBUF_FORMAT) != 0);
+  if (__pyx_t_1) {
+
+    /* "View.MemoryView":369
+ * 
+ *         if flags & PyBUF_FORMAT:
+ *             self.dtype_is_object = (self.view.format[0] == b'O' and self.view.format[1] == b'\0')             # <<<<<<<<<<<<<<
+ *         else:
+ *             self.dtype_is_object = dtype_is_object
+ */
+    __pyx_t_2 = ((__pyx_v_self->view.format[0]) == 'O'); /* together with the next test: true only when the format string is exactly "O" */
+    if (__pyx_t_2) {
+    } else {
+      __pyx_t_1 = __pyx_t_2; /* short-circuit `and`: the first operand is already false */
+      goto __pyx_L12_bool_binop_done;
+    }
+    __pyx_t_2 = ((__pyx_v_self->view.format[1]) == '\x00');
+    __pyx_t_1 = __pyx_t_2;
+    __pyx_L12_bool_binop_done:;
+    __pyx_v_self->dtype_is_object = __pyx_t_1;
+
+    /* "View.MemoryView":368
+ *                     raise MemoryError
+ * 
+ *         if flags & PyBUF_FORMAT:             # <<<<<<<<<<<<<<
+ *             self.dtype_is_object = (self.view.format[0] == b'O' and self.view.format[1] == b'\0')
+ *         else:
+ */
+    goto __pyx_L11;
+  }
+
+  /* "View.MemoryView":371
+ *             self.dtype_is_object = (self.view.format[0] == b'O' and self.view.format[1] == b'\0')
+ *         else:
+ *             self.dtype_is_object = dtype_is_object             # <<<<<<<<<<<<<<
+ * 
+ *         assert <Py_intptr_t><void*>(&self.acquisition_count) % sizeof(__pyx_atomic_int_type) == 0
+ */
+  /*else*/ {
+    __pyx_v_self->dtype_is_object = __pyx_v_dtype_is_object; /* PyBUF_FORMAT not requested: use the caller-supplied value */
+  }
+  __pyx_L11:;
+
+  /* "View.MemoryView":373
+ *             self.dtype_is_object = dtype_is_object
+ * 
+ *         assert <Py_intptr_t><void*>(&self.acquisition_count) % sizeof(__pyx_atomic_int_type) == 0             # <<<<<<<<<<<<<<
+ *         self.typeinfo = NULL
+ * 
+ */
+  #ifndef CYTHON_WITHOUT_ASSERTIONS
+  if (unlikely(__pyx_assertions_enabled())) {
+    __pyx_t_4 = ((Py_intptr_t)((void *)(&__pyx_v_self->acquisition_count))); /* address of acquisition_count, checked below to be a multiple of sizeof(__pyx_atomic_int_type) */
+    __pyx_t_5 = (sizeof(__pyx_atomic_int_type));
+    if (unlikely(__pyx_t_5 == 0)) {
+      PyErr_SetString(PyExc_ZeroDivisionError, "integer division or modulo by zero");
+      __PYX_ERR(1, 373, __pyx_L1_error)
+    }
+    __pyx_t_1 = ((__pyx_t_4 % __pyx_t_5) == 0);
+    if (unlikely(!__pyx_t_1)) {
+      __Pyx_Raise(__pyx_builtin_AssertionError, 0, 0, 0);
+      __PYX_ERR(1, 373, __pyx_L1_error)
+    }
+  }
+  #else
+  if ((1)); else __PYX_ERR(1, 373, __pyx_L1_error) /* assertions compiled out: the condition is the constant 1, so the error branch is never taken */
+  #endif
+
+  /* "View.MemoryView":374
+ * 
+ *         assert <Py_intptr_t><void*>(&self.acquisition_count) % sizeof(__pyx_atomic_int_type) == 0
+ *         self.typeinfo = NULL             # <<<<<<<<<<<<<<
+ * 
+ *     def __dealloc__(memoryview self):
+ */
+  __pyx_v_self->typeinfo = NULL;
+
+  /* "View.MemoryView":349
+ *     cdef __Pyx_TypeInfo *typeinfo
+ * 
+ *     def __cinit__(memoryview self, object obj, int flags, bint dtype_is_object=False):             # <<<<<<<<<<<<<<
+ *         self.obj = obj
+ *         self.flags = flags
+ */
+
+  /* function exit code: 0 on the normal path; the error label adds a traceback entry and returns -1 */
+  __pyx_r = 0;
+  goto __pyx_L0;
+  __pyx_L1_error:;
+  __Pyx_AddTraceback("View.MemoryView.memoryview.__cinit__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = -1;
+  __pyx_L0:;
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":376
+ *         self.typeinfo = NULL
+ * 
+ *     def __dealloc__(memoryview self):             # <<<<<<<<<<<<<<
+ *         if self.obj is not None:
+ *             __Pyx_ReleaseBuffer(&self.view)
+ */
+
+/* Python wrapper for memoryview.__dealloc__: takes only `self` and forwards it, cast to struct __pyx_memoryview_obj *, to the implementation function. */
+static void __pyx_memoryview___dealloc__(PyObject *__pyx_v_self); /*proto*/
+static void __pyx_memoryview___dealloc__(PyObject *__pyx_v_self) {
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__dealloc__ (wrapper)", 0);
+  __pyx_kwvalues = __Pyx_KwValues_VARARGS(__pyx_args, __pyx_nargs); /* NOTE(review): __pyx_args and __pyx_nargs are not declared in this function; presumably this only compiles because __Pyx_KwValues_VARARGS is a macro that discards its arguments - confirm against the macro definition */
+  __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_2__dealloc__(((struct __pyx_memoryview_obj *)__pyx_v_self));
+
+  /* function exit code: nothing to return, only the RefNanny context is closed */
+  __Pyx_RefNannyFinishContext();
+}
+
+static void __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_2__dealloc__(struct __pyx_memoryview_obj *__pyx_v_self) {
+  int __pyx_v_i;
+  int __pyx_t_1;
+  int __pyx_t_2;
+  int __pyx_t_3;
+  int __pyx_t_4;
+  PyThread_type_lock __pyx_t_5;
+  PyThread_type_lock __pyx_t_6;
+
+  /* "View.MemoryView":377
+ * 
+ *     def __dealloc__(memoryview self):
+ *         if self.obj is not None:             # <<<<<<<<<<<<<<
+ *             __Pyx_ReleaseBuffer(&self.view)
+ *         elif (<__pyx_buffer *> &self.view).obj == Py_None:
+ */
+  __pyx_t_1 = (__pyx_v_self->obj != Py_None);
+  if (__pyx_t_1) {
+
+    /* "View.MemoryView":378
+ *     def __dealloc__(memoryview self):
+ *         if self.obj is not None:
+ *             __Pyx_ReleaseBuffer(&self.view)             # <<<<<<<<<<<<<<
+ *         elif (<__pyx_buffer *> &self.view).obj == Py_None:
+ * 
+ */
+    __Pyx_ReleaseBuffer((&__pyx_v_self->view));
+
+    /* "View.MemoryView":377
+ * 
+ *     def __dealloc__(memoryview self):
+ *         if self.obj is not None:             # <<<<<<<<<<<<<<
+ *             __Pyx_ReleaseBuffer(&self.view)
+ *         elif (<__pyx_buffer *> &self.view).obj == Py_None:
+ */
+    goto __pyx_L3;
+  }
+
+  /* "View.MemoryView":379
+ *         if self.obj is not None:
+ *             __Pyx_ReleaseBuffer(&self.view)
+ *         elif (<__pyx_buffer *> &self.view).obj == Py_None:             # <<<<<<<<<<<<<<
+ * 
+ *             (<__pyx_buffer *> &self.view).obj = NULL
+ */
+  __pyx_t_1 = (((Py_buffer *)(&__pyx_v_self->view))->obj == Py_None);
+  if (__pyx_t_1) {
+
+    /* "View.MemoryView":381
+ *         elif (<__pyx_buffer *> &self.view).obj == Py_None:
+ * 
+ *             (<__pyx_buffer *> &self.view).obj = NULL             # <<<<<<<<<<<<<<
+ *             Py_DECREF(Py_None)
+ * 
+ */
+    ((Py_buffer *)(&__pyx_v_self->view))->obj = NULL;
+
+    /* "View.MemoryView":382
+ * 
+ *             (<__pyx_buffer *> &self.view).obj = NULL
+ *             Py_DECREF(Py_None)             # <<<<<<<<<<<<<<
+ * 
+ *         cdef int i
+ */
+    Py_DECREF(Py_None);
+
+    /* "View.MemoryView":379
+ *         if self.obj is not None:
+ *             __Pyx_ReleaseBuffer(&self.view)
+ *         elif (<__pyx_buffer *> &self.view).obj == Py_None:             # <<<<<<<<<<<<<<
+ * 
+ *             (<__pyx_buffer *> &self.view).obj = NULL
+ */
+  }
+  __pyx_L3:;
+
+  /* "View.MemoryView":386
+ *         cdef int i
+ *         global __pyx_memoryview_thread_locks_used
+ *         if self.lock != NULL:             # <<<<<<<<<<<<<<
+ *             for i in range(__pyx_memoryview_thread_locks_used):
+ *                 if __pyx_memoryview_thread_locks[i] is self.lock:
+ */
+  __pyx_t_1 = (__pyx_v_self->lock != NULL);
+  if (__pyx_t_1) {
+
+    /* "View.MemoryView":387
+ *         global __pyx_memoryview_thread_locks_used
+ *         if self.lock != NULL:
+ *             for i in range(__pyx_memoryview_thread_locks_used):             # <<<<<<<<<<<<<<
+ *                 if __pyx_memoryview_thread_locks[i] is self.lock:
+ *                     __pyx_memoryview_thread_locks_used -= 1
+ */
+    __pyx_t_2 = __pyx_memoryview_thread_locks_used;
+    __pyx_t_3 = __pyx_t_2;
+    for (__pyx_t_4 = 0; __pyx_t_4 < __pyx_t_3; __pyx_t_4+=1) {
+      __pyx_v_i = __pyx_t_4;
+
+      /* "View.MemoryView":388
+ *         if self.lock != NULL:
+ *             for i in range(__pyx_memoryview_thread_locks_used):
+ *                 if __pyx_memoryview_thread_locks[i] is self.lock:             # <<<<<<<<<<<<<<
+ *                     __pyx_memoryview_thread_locks_used -= 1
+ *                     if i != __pyx_memoryview_thread_locks_used:
+ */
+      __pyx_t_1 = ((__pyx_memoryview_thread_locks[__pyx_v_i]) == __pyx_v_self->lock);
+      if (__pyx_t_1) {
+
+        /* "View.MemoryView":389
+ *             for i in range(__pyx_memoryview_thread_locks_used):
+ *                 if __pyx_memoryview_thread_locks[i] is self.lock:
+ *                     __pyx_memoryview_thread_locks_used -= 1             # <<<<<<<<<<<<<<
+ *                     if i != __pyx_memoryview_thread_locks_used:
+ *                         __pyx_memoryview_thread_locks[i], __pyx_memoryview_thread_locks[__pyx_memoryview_thread_locks_used] = (
+ */
+        __pyx_memoryview_thread_locks_used = (__pyx_memoryview_thread_locks_used - 1);
+
+        /* "View.MemoryView":390
+ *                 if __pyx_memoryview_thread_locks[i] is self.lock:
+ *                     __pyx_memoryview_thread_locks_used -= 1
+ *                     if i != __pyx_memoryview_thread_locks_used:             # <<<<<<<<<<<<<<
+ *                         __pyx_memoryview_thread_locks[i], __pyx_memoryview_thread_locks[__pyx_memoryview_thread_locks_used] = (
+ *                             __pyx_memoryview_thread_locks[__pyx_memoryview_thread_locks_used], __pyx_memoryview_thread_locks[i])
+ */
+        __pyx_t_1 = (__pyx_v_i != __pyx_memoryview_thread_locks_used);
+        if (__pyx_t_1) {
+
+          /* "View.MemoryView":392
+ *                     if i != __pyx_memoryview_thread_locks_used:
+ *                         __pyx_memoryview_thread_locks[i], __pyx_memoryview_thread_locks[__pyx_memoryview_thread_locks_used] = (
+ *                             __pyx_memoryview_thread_locks[__pyx_memoryview_thread_locks_used], __pyx_memoryview_thread_locks[i])             # <<<<<<<<<<<<<<
+ *                     break
+ *             else:
+ */
+          __pyx_t_5 = (__pyx_memoryview_thread_locks[__pyx_memoryview_thread_locks_used]);
+          __pyx_t_6 = (__pyx_memoryview_thread_locks[__pyx_v_i]);
+
+          /* "View.MemoryView":391
+ *                     __pyx_memoryview_thread_locks_used -= 1
+ *                     if i != __pyx_memoryview_thread_locks_used:
+ *                         __pyx_memoryview_thread_locks[i], __pyx_memoryview_thread_locks[__pyx_memoryview_thread_locks_used] = (             # <<<<<<<<<<<<<<
+ *                             __pyx_memoryview_thread_locks[__pyx_memoryview_thread_locks_used], __pyx_memoryview_thread_locks[i])
+ *                     break
+ */
+          (__pyx_memoryview_thread_locks[__pyx_v_i]) = __pyx_t_5;
+          (__pyx_memoryview_thread_locks[__pyx_memoryview_thread_locks_used]) = __pyx_t_6;
+
+          /* "View.MemoryView":390
+ *                 if __pyx_memoryview_thread_locks[i] is self.lock:
+ *                     __pyx_memoryview_thread_locks_used -= 1
+ *                     if i != __pyx_memoryview_thread_locks_used:             # <<<<<<<<<<<<<<
+ *                         __pyx_memoryview_thread_locks[i], __pyx_memoryview_thread_locks[__pyx_memoryview_thread_locks_used] = (
+ *                             __pyx_memoryview_thread_locks[__pyx_memoryview_thread_locks_used], __pyx_memoryview_thread_locks[i])
+ */
+        }
+
+        /* "View.MemoryView":393
+ *                         __pyx_memoryview_thread_locks[i], __pyx_memoryview_thread_locks[__pyx_memoryview_thread_locks_used] = (
+ *                             __pyx_memoryview_thread_locks[__pyx_memoryview_thread_locks_used], __pyx_memoryview_thread_locks[i])
+ *                     break             # <<<<<<<<<<<<<<
+ *             else:
+ *                 PyThread_free_lock(self.lock)
+ */
+        goto __pyx_L6_break;
+
+        /* "View.MemoryView":388
+ *         if self.lock != NULL:
+ *             for i in range(__pyx_memoryview_thread_locks_used):
+ *                 if __pyx_memoryview_thread_locks[i] is self.lock:             # <<<<<<<<<<<<<<
+ *                     __pyx_memoryview_thread_locks_used -= 1
+ *                     if i != __pyx_memoryview_thread_locks_used:
+ */
+      }
+    }
+    /*else*/ {
+
+      /* "View.MemoryView":395
+ *                     break
+ *             else:
+ *                 PyThread_free_lock(self.lock)             # <<<<<<<<<<<<<<
+ * 
+ *     cdef char *get_item_pointer(memoryview self, object index) except NULL:
+ */
+      PyThread_free_lock(__pyx_v_self->lock);
+    }
+    __pyx_L6_break:;
+
+    /* "View.MemoryView":386
+ *         cdef int i
+ *         global __pyx_memoryview_thread_locks_used
+ *         if self.lock != NULL:             # <<<<<<<<<<<<<<
+ *             for i in range(__pyx_memoryview_thread_locks_used):
+ *                 if __pyx_memoryview_thread_locks[i] is self.lock:
+ */
+  }
+
+  /* "View.MemoryView":376
+ *         self.typeinfo = NULL
+ * 
+ *     def __dealloc__(memoryview self):             # <<<<<<<<<<<<<<
+ *         if self.obj is not None:
+ *             __Pyx_ReleaseBuffer(&self.view)
+ */
+
+  /* function exit code */
+}
+
+/* "View.MemoryView":397
+ *                 PyThread_free_lock(self.lock)
+ * 
+ *     cdef char *get_item_pointer(memoryview self, object index) except NULL:             # <<<<<<<<<<<<<<
+ *         cdef Py_ssize_t dim
+ *         cdef char *itemp = <char *> self.view.buf
+ *
+ * Generated C for memoryview.get_item_pointer(index).
+ *
+ * Starts at self->view.buf and, for every element of `index` (numbered
+ * 0, 1, 2, ... as `dim`), converts the element to a Py_ssize_t and passes
+ * it to __pyx_pybuffer_index() together with the current pointer; the
+ * pointer returned by that call becomes the new current pointer.
+ *
+ * Parameters:
+ *   __pyx_v_self   the memoryview object whose `view` buffer is indexed
+ *   __pyx_v_index  an iterable of index objects; exact lists and tuples are
+ *                  read by position, anything else through PyObject_GetIter
+ *
+ * Returns the final pointer, or NULL after adding a traceback entry when
+ * any step fails (the Cython `except NULL` convention shown above).
+ */
+
+static char *__pyx_memoryview_get_item_pointer(struct __pyx_memoryview_obj *__pyx_v_self, PyObject *__pyx_v_index) {
+  Py_ssize_t __pyx_v_dim;
+  char *__pyx_v_itemp;
+  PyObject *__pyx_v_idx = NULL;
+  char *__pyx_r;
+  __Pyx_RefNannyDeclarations
+  Py_ssize_t __pyx_t_1;
+  PyObject *__pyx_t_2 = NULL;
+  Py_ssize_t __pyx_t_3;
+  PyObject *(*__pyx_t_4)(PyObject *);
+  PyObject *__pyx_t_5 = NULL;
+  Py_ssize_t __pyx_t_6;
+  char *__pyx_t_7;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("get_item_pointer", 1);
+
+  /* "View.MemoryView":399
+ *     cdef char *get_item_pointer(memoryview self, object index) except NULL:
+ *         cdef Py_ssize_t dim
+ *         cdef char *itemp = <char *> self.view.buf             # <<<<<<<<<<<<<<
+ * 
+ *         for dim, idx in enumerate(index):
+ */
+  __pyx_v_itemp = ((char *)__pyx_v_self->view.buf);
+
+  /* "View.MemoryView":401
+ *         cdef char *itemp = <char *> self.view.buf
+ * 
+ *         for dim, idx in enumerate(index):             # <<<<<<<<<<<<<<
+ *             itemp = pybuffer_index(&self.view, itemp, idx, dim)
+ * 
+ */
+  __pyx_t_1 = 0; /* enumerate() counter; copied into `dim` on each iteration */
+  if (likely(PyList_CheckExact(__pyx_v_index)) || PyTuple_CheckExact(__pyx_v_index)) { /* exact list/tuple: read items by position, no iterator object */
+    __pyx_t_2 = __pyx_v_index; __Pyx_INCREF(__pyx_t_2);
+    __pyx_t_3 = 0; /* position of the next item to read */
+    __pyx_t_4 = NULL; /* NULL selects the positional branch of the loop below */
+  } else {
+    __pyx_t_3 = -1; __pyx_t_2 = PyObject_GetIter(__pyx_v_index); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 401, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_2);
+    __pyx_t_4 = __Pyx_PyObject_GetIterNextFunc(__pyx_t_2); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 401, __pyx_L1_error) /* non-NULL selects the iterator branch of the loop below */
+  }
+  for (;;) {
+    if (likely(!__pyx_t_4)) {
+      if (likely(PyList_CheckExact(__pyx_t_2))) {
+        {
+          Py_ssize_t __pyx_temp = __Pyx_PyList_GET_SIZE(__pyx_t_2);
+          #if !CYTHON_ASSUME_SAFE_MACROS
+          if (unlikely((__pyx_temp < 0))) __PYX_ERR(1, 401, __pyx_L1_error)
+          #endif
+          if (__pyx_t_3 >= __pyx_temp) break; /* size is re-read on every pass, so the loop ends when the position reaches the current length */
+        }
+        #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS
+        __pyx_t_5 = PyList_GET_ITEM(__pyx_t_2, __pyx_t_3); __Pyx_INCREF(__pyx_t_5); __pyx_t_3++; if (unlikely((0 < 0))) __PYX_ERR(1, 401, __pyx_L1_error) /* (0 < 0) is always false: this error branch is never taken */
+        #else
+        __pyx_t_5 = __Pyx_PySequence_ITEM(__pyx_t_2, __pyx_t_3); __pyx_t_3++; if (unlikely(!__pyx_t_5)) __PYX_ERR(1, 401, __pyx_L1_error)
+        __Pyx_GOTREF(__pyx_t_5);
+        #endif
+      } else { /* not an exact list, so (given the check before the loop) an exact tuple */
+        {
+          Py_ssize_t __pyx_temp = __Pyx_PyTuple_GET_SIZE(__pyx_t_2);
+          #if !CYTHON_ASSUME_SAFE_MACROS
+          if (unlikely((__pyx_temp < 0))) __PYX_ERR(1, 401, __pyx_L1_error)
+          #endif
+          if (__pyx_t_3 >= __pyx_temp) break;
+        }
+        #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS
+        __pyx_t_5 = PyTuple_GET_ITEM(__pyx_t_2, __pyx_t_3); __Pyx_INCREF(__pyx_t_5); __pyx_t_3++; if (unlikely((0 < 0))) __PYX_ERR(1, 401, __pyx_L1_error) /* (0 < 0) is always false: this error branch is never taken */
+        #else
+        __pyx_t_5 = __Pyx_PySequence_ITEM(__pyx_t_2, __pyx_t_3); __pyx_t_3++; if (unlikely(!__pyx_t_5)) __PYX_ERR(1, 401, __pyx_L1_error)
+        __Pyx_GOTREF(__pyx_t_5);
+        #endif
+      }
+    } else {
+      __pyx_t_5 = __pyx_t_4(__pyx_t_2);
+      if (unlikely(!__pyx_t_5)) { /* NULL from the next-function: either exhausted or an error */
+        PyObject* exc_type = PyErr_Occurred();
+        if (exc_type) {
+          if (likely(__Pyx_PyErr_GivenExceptionMatches(exc_type, PyExc_StopIteration))) PyErr_Clear(); /* StopIteration is the normal end of iteration */
+          else __PYX_ERR(1, 401, __pyx_L1_error) /* any other exception is propagated */
+        }
+        break;
+      }
+      __Pyx_GOTREF(__pyx_t_5);
+    }
+    __Pyx_XDECREF_SET(__pyx_v_idx, __pyx_t_5);
+    __pyx_t_5 = 0; /* the reference is now held by __pyx_v_idx, so the error path must not release it through the temp */
+    __pyx_v_dim = __pyx_t_1;
+    __pyx_t_1 = (__pyx_t_1 + 1);
+
+    /* "View.MemoryView":402
+ * 
+ *         for dim, idx in enumerate(index):
+ *             itemp = pybuffer_index(&self.view, itemp, idx, dim)             # <<<<<<<<<<<<<<
+ * 
+ *         return itemp
+ */
+    __pyx_t_6 = __Pyx_PyIndex_AsSsize_t(__pyx_v_idx); if (unlikely((__pyx_t_6 == (Py_ssize_t)-1) && PyErr_Occurred())) __PYX_ERR(1, 402, __pyx_L1_error) /* -1 is only an error when an exception is also set */
+    __pyx_t_7 = __pyx_pybuffer_index((&__pyx_v_self->view), __pyx_v_itemp, __pyx_t_6, __pyx_v_dim); if (unlikely(__pyx_t_7 == ((char *)NULL))) __PYX_ERR(1, 402, __pyx_L1_error) /* NULL is treated as failure of the indexing helper */
+    __pyx_v_itemp = __pyx_t_7;
+
+    /* "View.MemoryView":401
+ *         cdef char *itemp = <char *> self.view.buf
+ * 
+ *         for dim, idx in enumerate(index):             # <<<<<<<<<<<<<<
+ *             itemp = pybuffer_index(&self.view, itemp, idx, dim)
+ * 
+ */
+  }
+  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; /* done with the list/tuple/iterator reference taken before the loop */
+
+  /* "View.MemoryView":404
+ *             itemp = pybuffer_index(&self.view, itemp, idx, dim)
+ * 
+ *         return itemp             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  __pyx_r = __pyx_v_itemp;
+  goto __pyx_L0;
+
+  /* "View.MemoryView":397
+ *                 PyThread_free_lock(self.lock)
+ * 
+ *     cdef char *get_item_pointer(memoryview self, object index) except NULL:             # <<<<<<<<<<<<<<
+ *         cdef Py_ssize_t dim
+ *         cdef char *itemp = <char *> self.view.buf
+ */
+
+  /* function exit code */
+  __pyx_L1_error:; /* error exit: release live temporaries, record a traceback entry, return NULL */
+  __Pyx_XDECREF(__pyx_t_2);
+  __Pyx_XDECREF(__pyx_t_5);
+  __Pyx_AddTraceback("View.MemoryView.memoryview.get_item_pointer", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __pyx_L0:; /* common exit, reached on both success and error */
+  __Pyx_XDECREF(__pyx_v_idx);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":407
+ * 
+ * 
+ *     def __getitem__(memoryview self, object index):             # <<<<<<<<<<<<<<
+ *         if index is Ellipsis:
+ *             return self
+ */
+
+/* Python wrapper
+ *
+ * Entry point for memoryview.__getitem__ taking (self, index) as PyObject
+ * pointers. It casts `self` to struct __pyx_memoryview_obj * and forwards
+ * both arguments to the __pyx_pf_..._4__getitem__ implementation, returning
+ * that function's result unchanged.
+ *
+ * NOTE(review): __pyx_args and __pyx_nargs are not declared anywhere in this
+ * function. This presumably relies on __Pyx_KwValues_VARARGS being a macro
+ * that discards its arguments - confirm against its definition, which is
+ * outside this part of the file. __pyx_kwvalues is assigned but never read
+ * here (it is declared CYTHON_UNUSED).
+ */
+static PyObject *__pyx_memoryview___getitem__(PyObject *__pyx_v_self, PyObject *__pyx_v_index); /*proto*/
+static PyObject *__pyx_memoryview___getitem__(PyObject *__pyx_v_self, PyObject *__pyx_v_index) {
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__getitem__ (wrapper)", 0);
+  __pyx_kwvalues = __Pyx_KwValues_VARARGS(__pyx_args, __pyx_nargs);
+  __pyx_r = __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_4__getitem__(((struct __pyx_memoryview_obj *)__pyx_v_self), ((PyObject *)__pyx_v_index));
+
+  /* function exit code */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static PyObject *__pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_4__getitem__(struct __pyx_memoryview_obj *__pyx_v_self, PyObject *__pyx_v_index) { /*
+   * Implementation of memoryview.__getitem__(index).
+   *
+   * - If `index` is the Ellipsis object, returns `self` (as a new reference).
+   * - Otherwise calls _unellipsify(index, self->view.ndim) and unpacks its
+   *   result into (have_slices, indices).
+   *   - have_slices truthy: returns __pyx_memview_slice(self, indices).
+   *   - otherwise: resolves the item address through the vtable's
+   *     get_item_pointer(indices) and returns
+   *     convert_item_to_object(address).
+   *
+   * Returns NULL after adding a traceback entry when any step fails.
+   */
+  PyObject *__pyx_v_have_slices = NULL;
+  PyObject *__pyx_v_indices = NULL;
+  char *__pyx_v_itemp;
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  int __pyx_t_1;
+  PyObject *__pyx_t_2 = NULL;
+  PyObject *__pyx_t_3 = NULL;
+  PyObject *__pyx_t_4 = NULL;
+  char *__pyx_t_5;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("__getitem__", 1);
+
+  /* "View.MemoryView":408
+ * 
+ *     def __getitem__(memoryview self, object index):
+ *         if index is Ellipsis:             # <<<<<<<<<<<<<<
+ *             return self
+ * 
+ */
+  __pyx_t_1 = (__pyx_v_index == __pyx_builtin_Ellipsis); /* identity comparison, matching `is` in the source */
+  if (__pyx_t_1) {
+
+    /* "View.MemoryView":409
+ *     def __getitem__(memoryview self, object index):
+ *         if index is Ellipsis:
+ *             return self             # <<<<<<<<<<<<<<
+ * 
+ *         have_slices, indices = _unellipsify(index, self.view.ndim)
+ */
+    __Pyx_XDECREF(__pyx_r);
+    __Pyx_INCREF((PyObject *)__pyx_v_self); /* the caller receives its own reference to self */
+    __pyx_r = ((PyObject *)__pyx_v_self);
+    goto __pyx_L0;
+
+    /* "View.MemoryView":408
+ * 
+ *     def __getitem__(memoryview self, object index):
+ *         if index is Ellipsis:             # <<<<<<<<<<<<<<
+ *             return self
+ * 
+ */
+  }
+
+  /* "View.MemoryView":411
+ *             return self
+ * 
+ *         have_slices, indices = _unellipsify(index, self.view.ndim)             # <<<<<<<<<<<<<<
+ * 
+ *         cdef char *itemp
+ */
+  __pyx_t_2 = _unellipsify(__pyx_v_index, __pyx_v_self->view.ndim); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 411, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_2);
+  if (likely(__pyx_t_2 != Py_None)) { /* tuple unpacking: None or a length other than 2 is an error */
+    PyObject* sequence = __pyx_t_2;
+    Py_ssize_t size = __Pyx_PySequence_SIZE(sequence);
+    if (unlikely(size != 2)) {
+      if (size > 2) __Pyx_RaiseTooManyValuesError(2);
+      else if (size >= 0) __Pyx_RaiseNeedMoreValuesError(size);
+      __PYX_ERR(1, 411, __pyx_L1_error)
+    }
+    #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS
+    __pyx_t_3 = PyTuple_GET_ITEM(sequence, 0); /* tuple macro used directly: assumes _unellipsify returns a tuple - TODO confirm, its definition is not shown here */
+    __pyx_t_4 = PyTuple_GET_ITEM(sequence, 1); 
+    __Pyx_INCREF(__pyx_t_3);
+    __Pyx_INCREF(__pyx_t_4);
+    #else
+    __pyx_t_3 = PySequence_ITEM(sequence, 0); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 411, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_3);
+    __pyx_t_4 = PySequence_ITEM(sequence, 1); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 411, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_4);
+    #endif
+    __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; /* both items are held separately now; drop the container */
+  } else {
+    __Pyx_RaiseNoneNotIterableError(); __PYX_ERR(1, 411, __pyx_L1_error)
+  }
+  __pyx_v_have_slices = __pyx_t_3;
+  __pyx_t_3 = 0; /* reference moved to the named local; zeroing keeps the error path from releasing it twice */
+  __pyx_v_indices = __pyx_t_4;
+  __pyx_t_4 = 0;
+
+  /* "View.MemoryView":414
+ * 
+ *         cdef char *itemp
+ *         if have_slices:             # <<<<<<<<<<<<<<
+ *             return memview_slice(self, indices)
+ *         else:
+ */
+  __pyx_t_1 = __Pyx_PyObject_IsTrue(__pyx_v_have_slices); if (unlikely((__pyx_t_1 < 0))) __PYX_ERR(1, 414, __pyx_L1_error) /* a negative result means the truth test itself failed */
+  if (__pyx_t_1) {
+
+    /* "View.MemoryView":415
+ *         cdef char *itemp
+ *         if have_slices:
+ *             return memview_slice(self, indices)             # <<<<<<<<<<<<<<
+ *         else:
+ *             itemp = self.get_item_pointer(indices)
+ */
+    __Pyx_XDECREF(__pyx_r);
+    __pyx_t_2 = ((PyObject *)__pyx_memview_slice(__pyx_v_self, __pyx_v_indices)); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 415, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_2);
+    __pyx_r = __pyx_t_2;
+    __pyx_t_2 = 0;
+    goto __pyx_L0;
+
+    /* "View.MemoryView":414
+ * 
+ *         cdef char *itemp
+ *         if have_slices:             # <<<<<<<<<<<<<<
+ *             return memview_slice(self, indices)
+ *         else:
+ */
+  }
+
+  /* "View.MemoryView":417
+ *             return memview_slice(self, indices)
+ *         else:
+ *             itemp = self.get_item_pointer(indices)             # <<<<<<<<<<<<<<
+ *             return self.convert_item_to_object(itemp)
+ * 
+ */
+  /*else*/ {
+    __pyx_t_5 = ((struct __pyx_vtabstruct_memoryview *)__pyx_v_self->__pyx_vtab)->get_item_pointer(__pyx_v_self, __pyx_v_indices); if (unlikely(__pyx_t_5 == ((char *)NULL))) __PYX_ERR(1, 417, __pyx_L1_error) /* called through the vtable; NULL signals an error */
+    __pyx_v_itemp = __pyx_t_5;
+
+    /* "View.MemoryView":418
+ *         else:
+ *             itemp = self.get_item_pointer(indices)
+ *             return self.convert_item_to_object(itemp)             # <<<<<<<<<<<<<<
+ * 
+ *     def __setitem__(memoryview self, object index, object value):
+ */
+    __Pyx_XDECREF(__pyx_r);
+    __pyx_t_2 = ((struct __pyx_vtabstruct_memoryview *)__pyx_v_self->__pyx_vtab)->convert_item_to_object(__pyx_v_self, __pyx_v_itemp); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 418, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_2);
+    __pyx_r = __pyx_t_2;
+    __pyx_t_2 = 0;
+    goto __pyx_L0;
+  }
+
+  /* "View.MemoryView":407
+ * 
+ * 
+ *     def __getitem__(memoryview self, object index):             # <<<<<<<<<<<<<<
+ *         if index is Ellipsis:
+ *             return self
+ */
+
+  /* function exit code */
+  __pyx_L1_error:; /* error exit: release live temporaries, record a traceback entry, return NULL */
+  __Pyx_XDECREF(__pyx_t_2);
+  __Pyx_XDECREF(__pyx_t_3);
+  __Pyx_XDECREF(__pyx_t_4);
+  __Pyx_AddTraceback("View.MemoryView.memoryview.__getitem__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __pyx_L0:; /* common exit: named locals are released on every path */
+  __Pyx_XDECREF(__pyx_v_have_slices);
+  __Pyx_XDECREF(__pyx_v_indices);
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":420
+ *             return self.convert_item_to_object(itemp)
+ * 
+ *     def __setitem__(memoryview self, object index, object value):             # <<<<<<<<<<<<<<
+ *         if self.view.readonly:
+ *             raise TypeError, "Cannot assign to read-only memoryview"
+ */
+
+/* Python wrapper
+ *
+ * Entry point for memoryview.__setitem__ taking (self, index, value) as
+ * PyObject pointers. It casts `self` to struct __pyx_memoryview_obj * and
+ * forwards all three arguments to the __pyx_pf_..._6__setitem__
+ * implementation, returning that function's int result unchanged.
+ *
+ * NOTE(review): __pyx_args and __pyx_nargs are not declared anywhere in this
+ * function. This presumably relies on __Pyx_KwValues_VARARGS being a macro
+ * that discards its arguments - confirm against its definition, which is
+ * outside this part of the file. __pyx_kwvalues is assigned but never read
+ * here (it is declared CYTHON_UNUSED).
+ */
+static int __pyx_memoryview___setitem__(PyObject *__pyx_v_self, PyObject *__pyx_v_index, PyObject *__pyx_v_value); /*proto*/
+static int __pyx_memoryview___setitem__(PyObject *__pyx_v_self, PyObject *__pyx_v_index, PyObject *__pyx_v_value) {
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  int __pyx_r;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__setitem__ (wrapper)", 0);
+  __pyx_kwvalues = __Pyx_KwValues_VARARGS(__pyx_args, __pyx_nargs);
+  __pyx_r = __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_6__setitem__(((struct __pyx_memoryview_obj *)__pyx_v_self), ((PyObject *)__pyx_v_index), ((PyObject *)__pyx_v_value));
+
+  /* function exit code */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static int __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_6__setitem__(struct __pyx_memoryview_obj *__pyx_v_self, PyObject *__pyx_v_index, PyObject *__pyx_v_value) { /*
+   * Implementation of memoryview.__setitem__(index, value).
+   *
+   * 1. Raises TypeError if self->view.readonly is set.
+   * 2. Calls _unellipsify(index, self->view.ndim) and unpacks the result
+   *    into (have_slices, index), replacing the local `index`.
+   * 3. have_slices truthy:
+   *      obj = self.is_slice(value)
+   *      - obj is not None: setitem_slice_assignment(self[index], obj)
+   *      - obj is None:     setitem_slice_assign_scalar(self[index], value)
+   *    have_slices falsy:   setitem_indexed(index, value)
+   *    All of these are dispatched through the object's vtable.
+   *
+   * Returns 0 on success, or -1 after adding a traceback entry on failure.
+   */
+  PyObject *__pyx_v_have_slices = NULL;
+  PyObject *__pyx_v_obj = NULL;
+  int __pyx_r;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  PyObject *__pyx_t_2 = NULL;
+  PyObject *__pyx_t_3 = NULL;
+  int __pyx_t_4;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("__setitem__", 0);
+  __Pyx_INCREF(__pyx_v_index); /* the local is rebound below and released at exit, so it needs its own reference */
+
+  /* "View.MemoryView":421
+ * 
+ *     def __setitem__(memoryview self, object index, object value):
+ *         if self.view.readonly:             # <<<<<<<<<<<<<<
+ *             raise TypeError, "Cannot assign to read-only memoryview"
+ * 
+ */
+  if (unlikely(__pyx_v_self->view.readonly)) {
+
+    /* "View.MemoryView":422
+ *     def __setitem__(memoryview self, object index, object value):
+ *         if self.view.readonly:
+ *             raise TypeError, "Cannot assign to read-only memoryview"             # <<<<<<<<<<<<<<
+ * 
+ *         have_slices, index = _unellipsify(index, self.view.ndim)
+ */
+    __Pyx_Raise(__pyx_builtin_TypeError, __pyx_kp_s_Cannot_assign_to_read_only_memor, 0, 0);
+    __PYX_ERR(1, 422, __pyx_L1_error)
+
+    /* "View.MemoryView":421
+ * 
+ *     def __setitem__(memoryview self, object index, object value):
+ *         if self.view.readonly:             # <<<<<<<<<<<<<<
+ *             raise TypeError, "Cannot assign to read-only memoryview"
+ * 
+ */
+  }
+
+  /* "View.MemoryView":424
+ *             raise TypeError, "Cannot assign to read-only memoryview"
+ * 
+ *         have_slices, index = _unellipsify(index, self.view.ndim)             # <<<<<<<<<<<<<<
+ * 
+ *         if have_slices:
+ */
+  __pyx_t_1 = _unellipsify(__pyx_v_index, __pyx_v_self->view.ndim); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 424, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  if (likely(__pyx_t_1 != Py_None)) { /* tuple unpacking: None or a length other than 2 is an error */
+    PyObject* sequence = __pyx_t_1;
+    Py_ssize_t size = __Pyx_PySequence_SIZE(sequence);
+    if (unlikely(size != 2)) {
+      if (size > 2) __Pyx_RaiseTooManyValuesError(2);
+      else if (size >= 0) __Pyx_RaiseNeedMoreValuesError(size);
+      __PYX_ERR(1, 424, __pyx_L1_error)
+    }
+    #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS
+    __pyx_t_2 = PyTuple_GET_ITEM(sequence, 0); /* tuple macro used directly: assumes _unellipsify returns a tuple - TODO confirm, its definition is not shown here */
+    __pyx_t_3 = PyTuple_GET_ITEM(sequence, 1); 
+    __Pyx_INCREF(__pyx_t_2);
+    __Pyx_INCREF(__pyx_t_3);
+    #else
+    __pyx_t_2 = PySequence_ITEM(sequence, 0); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 424, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_2);
+    __pyx_t_3 = PySequence_ITEM(sequence, 1); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 424, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_3);
+    #endif
+    __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; /* both items are held separately now; drop the container */
+  } else {
+    __Pyx_RaiseNoneNotIterableError(); __PYX_ERR(1, 424, __pyx_L1_error)
+  }
+  __pyx_v_have_slices = __pyx_t_2;
+  __pyx_t_2 = 0; /* reference moved to the named local; zeroing keeps the error path from releasing it twice */
+  __Pyx_DECREF_SET(__pyx_v_index, __pyx_t_3); /* rebinds the local `index` to the second unpacked item */
+  __pyx_t_3 = 0;
+
+  /* "View.MemoryView":426
+ *         have_slices, index = _unellipsify(index, self.view.ndim)
+ * 
+ *         if have_slices:             # <<<<<<<<<<<<<<
+ *             obj = self.is_slice(value)
+ *             if obj is not None:
+ */
+  __pyx_t_4 = __Pyx_PyObject_IsTrue(__pyx_v_have_slices); if (unlikely((__pyx_t_4 < 0))) __PYX_ERR(1, 426, __pyx_L1_error) /* a negative result means the truth test itself failed */
+  if (__pyx_t_4) {
+
+    /* "View.MemoryView":427
+ * 
+ *         if have_slices:
+ *             obj = self.is_slice(value)             # <<<<<<<<<<<<<<
+ *             if obj is not None:
+ *                 self.setitem_slice_assignment(self[index], obj)
+ */
+    __pyx_t_1 = ((struct __pyx_vtabstruct_memoryview *)__pyx_v_self->__pyx_vtab)->is_slice(__pyx_v_self, __pyx_v_value); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 427, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_1);
+    __pyx_v_obj = __pyx_t_1;
+    __pyx_t_1 = 0;
+
+    /* "View.MemoryView":428
+ *         if have_slices:
+ *             obj = self.is_slice(value)
+ *             if obj is not None:             # <<<<<<<<<<<<<<
+ *                 self.setitem_slice_assignment(self[index], obj)
+ *             else:
+ */
+    __pyx_t_4 = (__pyx_v_obj != Py_None);
+    if (__pyx_t_4) {
+
+      /* "View.MemoryView":429
+ *             obj = self.is_slice(value)
+ *             if obj is not None:
+ *                 self.setitem_slice_assignment(self[index], obj)             # <<<<<<<<<<<<<<
+ *             else:
+ *                 self.setitem_slice_assign_scalar(self[index], value)
+ */
+      __pyx_t_1 = __Pyx_PyObject_GetItem(((PyObject *)__pyx_v_self), __pyx_v_index); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 429, __pyx_L1_error) /* self[index] through generic item lookup */
+      __Pyx_GOTREF(__pyx_t_1);
+      __pyx_t_3 = ((struct __pyx_vtabstruct_memoryview *)__pyx_v_self->__pyx_vtab)->setitem_slice_assignment(__pyx_v_self, __pyx_t_1, __pyx_v_obj); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 429, __pyx_L1_error) /* destination is passed as a plain PyObject *, so no type test is done here */
+      __Pyx_GOTREF(__pyx_t_3);
+      __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+      __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; /* the call's return value is not used */
+
+      /* "View.MemoryView":428
+ *         if have_slices:
+ *             obj = self.is_slice(value)
+ *             if obj is not None:             # <<<<<<<<<<<<<<
+ *                 self.setitem_slice_assignment(self[index], obj)
+ *             else:
+ */
+      goto __pyx_L5;
+    }
+
+    /* "View.MemoryView":431
+ *                 self.setitem_slice_assignment(self[index], obj)
+ *             else:
+ *                 self.setitem_slice_assign_scalar(self[index], value)             # <<<<<<<<<<<<<<
+ *         else:
+ *             self.setitem_indexed(index, value)
+ */
+    /*else*/ {
+      __pyx_t_3 = __Pyx_PyObject_GetItem(((PyObject *)__pyx_v_self), __pyx_v_index); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 431, __pyx_L1_error) /* self[index] through generic item lookup */
+      __Pyx_GOTREF(__pyx_t_3);
+      if (!(likely(((__pyx_t_3) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_3, __pyx_memoryview_type))))) __PYX_ERR(1, 431, __pyx_L1_error) /* must be None or a memoryview before the struct cast on the next line */
+      __pyx_t_1 = ((struct __pyx_vtabstruct_memoryview *)__pyx_v_self->__pyx_vtab)->setitem_slice_assign_scalar(__pyx_v_self, ((struct __pyx_memoryview_obj *)__pyx_t_3), __pyx_v_value); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 431, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_1);
+      __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+      __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; /* the call's return value is not used */
+    }
+    __pyx_L5:;
+
+    /* "View.MemoryView":426
+ *         have_slices, index = _unellipsify(index, self.view.ndim)
+ * 
+ *         if have_slices:             # <<<<<<<<<<<<<<
+ *             obj = self.is_slice(value)
+ *             if obj is not None:
+ */
+    goto __pyx_L4;
+  }
+
+  /* "View.MemoryView":433
+ *                 self.setitem_slice_assign_scalar(self[index], value)
+ *         else:
+ *             self.setitem_indexed(index, value)             # <<<<<<<<<<<<<<
+ * 
+ *     cdef is_slice(self, obj):
+ */
+  /*else*/ {
+    __pyx_t_1 = ((struct __pyx_vtabstruct_memoryview *)__pyx_v_self->__pyx_vtab)->setitem_indexed(__pyx_v_self, __pyx_v_index, __pyx_v_value); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 433, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_1);
+    __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; /* the call's return value is not used */
+  }
+  __pyx_L4:;
+
+  /* "View.MemoryView":420
+ *             return self.convert_item_to_object(itemp)
+ * 
+ *     def __setitem__(memoryview self, object index, object value):             # <<<<<<<<<<<<<<
+ *         if self.view.readonly:
+ *             raise TypeError, "Cannot assign to read-only memoryview"
+ */
+
+  /* function exit code */
+  __pyx_r = 0; /* success */
+  goto __pyx_L0;
+  __pyx_L1_error:; /* error exit: release live temporaries, record a traceback entry, return -1 */
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_XDECREF(__pyx_t_2);
+  __Pyx_XDECREF(__pyx_t_3);
+  __Pyx_AddTraceback("View.MemoryView.memoryview.__setitem__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = -1;
+  __pyx_L0:; /* common exit: named locals, including the rebound index, are released on every path */
+  __Pyx_XDECREF(__pyx_v_have_slices);
+  __Pyx_XDECREF(__pyx_v_obj);
+  __Pyx_XDECREF(__pyx_v_index);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":435
+ *             self.setitem_indexed(index, value)
+ * 
+ *     cdef is_slice(self, obj):             # <<<<<<<<<<<<<<
+ *         if not isinstance(obj, memoryview):
+ *             try:
+ *
+ * Generated C for memoryview.is_slice(obj).
+ *
+ * - If `obj` already passes the type check against the memoryview type, it
+ *   is returned as is (as a new reference).
+ * - Otherwise the memoryview type is called with three positional arguments:
+ *   obj, (self->flags & ~PyBUF_WRITABLE) | PyBUF_ANY_CONTIGUOUS, and
+ *   self->dtype_is_object as a bool. The resulting object is returned.
+ * - If that construction fails with an exception matching TypeError, the
+ *   exception is consumed and None is returned instead.
+ * - Any other failure propagates: a traceback entry is added and the
+ *   function returns NULL.
+ */
+
+static PyObject *__pyx_memoryview_is_slice(struct __pyx_memoryview_obj *__pyx_v_self, PyObject *__pyx_v_obj) {
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  int __pyx_t_1;
+  int __pyx_t_2;
+  PyObject *__pyx_t_3 = NULL;
+  PyObject *__pyx_t_4 = NULL;
+  PyObject *__pyx_t_5 = NULL;
+  PyObject *__pyx_t_6 = NULL;
+  PyObject *__pyx_t_7 = NULL;
+  PyObject *__pyx_t_8 = NULL;
+  int __pyx_t_9;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("is_slice", 0);
+  __Pyx_INCREF(__pyx_v_obj); /* the local may be rebound below and is released at exit, so it needs its own reference */
+
+  /* "View.MemoryView":436
+ * 
+ *     cdef is_slice(self, obj):
+ *         if not isinstance(obj, memoryview):             # <<<<<<<<<<<<<<
+ *             try:
+ *                 obj = memoryview(obj, self.flags & ~PyBUF_WRITABLE | PyBUF_ANY_CONTIGUOUS,
+ */
+  __pyx_t_1 = __Pyx_TypeCheck(__pyx_v_obj, __pyx_memoryview_type); 
+  __pyx_t_2 = (!__pyx_t_1);
+  if (__pyx_t_2) {
+
+    /* "View.MemoryView":437
+ *     cdef is_slice(self, obj):
+ *         if not isinstance(obj, memoryview):
+ *             try:             # <<<<<<<<<<<<<<
+ *                 obj = memoryview(obj, self.flags & ~PyBUF_WRITABLE | PyBUF_ANY_CONTIGUOUS,
+ *                                  self.dtype_is_object)
+ */
+    {
+      __Pyx_PyThreadState_declare
+      __Pyx_PyThreadState_assign
+      __Pyx_ExceptionSave(&__pyx_t_3, &__pyx_t_4, &__pyx_t_5); /* t_3..t_5 are handed back via __Pyx_ExceptionReset on both except exits, or released on the no-error path; presumably the exception state in effect before the try - confirm */
+      __Pyx_XGOTREF(__pyx_t_3);
+      __Pyx_XGOTREF(__pyx_t_4);
+      __Pyx_XGOTREF(__pyx_t_5);
+      /*try:*/ {
+
+        /* "View.MemoryView":438
+ *         if not isinstance(obj, memoryview):
+ *             try:
+ *                 obj = memoryview(obj, self.flags & ~PyBUF_WRITABLE | PyBUF_ANY_CONTIGUOUS,             # <<<<<<<<<<<<<<
+ *                                  self.dtype_is_object)
+ *             except TypeError:
+ */
+        __pyx_t_6 = __Pyx_PyInt_From_int(((__pyx_v_self->flags & (~PyBUF_WRITABLE)) | PyBUF_ANY_CONTIGUOUS)); if (unlikely(!__pyx_t_6)) __PYX_ERR(1, 438, __pyx_L4_error) /* grouping is explicit: clear the writable bit first, then OR in the contiguity flag */
+        __Pyx_GOTREF(__pyx_t_6);
+
+        /* "View.MemoryView":439
+ *             try:
+ *                 obj = memoryview(obj, self.flags & ~PyBUF_WRITABLE | PyBUF_ANY_CONTIGUOUS,
+ *                                  self.dtype_is_object)             # <<<<<<<<<<<<<<
+ *             except TypeError:
+ *                 return None
+ */
+        __pyx_t_7 = __Pyx_PyBool_FromLong(__pyx_v_self->dtype_is_object); if (unlikely(!__pyx_t_7)) __PYX_ERR(1, 439, __pyx_L4_error)
+        __Pyx_GOTREF(__pyx_t_7);
+
+        /* "View.MemoryView":438
+ *         if not isinstance(obj, memoryview):
+ *             try:
+ *                 obj = memoryview(obj, self.flags & ~PyBUF_WRITABLE | PyBUF_ANY_CONTIGUOUS,             # <<<<<<<<<<<<<<
+ *                                  self.dtype_is_object)
+ *             except TypeError:
+ */
+        __pyx_t_8 = PyTuple_New(3); if (unlikely(!__pyx_t_8)) __PYX_ERR(1, 438, __pyx_L4_error) /* positional-argument tuple for the constructor call */
+        __Pyx_GOTREF(__pyx_t_8);
+        __Pyx_INCREF(__pyx_v_obj);
+        __Pyx_GIVEREF(__pyx_v_obj);
+        if (__Pyx_PyTuple_SET_ITEM(__pyx_t_8, 0, __pyx_v_obj)) __PYX_ERR(1, 438, __pyx_L4_error);
+        __Pyx_GIVEREF(__pyx_t_6);
+        if (__Pyx_PyTuple_SET_ITEM(__pyx_t_8, 1, __pyx_t_6)) __PYX_ERR(1, 438, __pyx_L4_error);
+        __Pyx_GIVEREF(__pyx_t_7);
+        if (__Pyx_PyTuple_SET_ITEM(__pyx_t_8, 2, __pyx_t_7)) __PYX_ERR(1, 438, __pyx_L4_error);
+        __pyx_t_6 = 0; /* temps are zeroed without a decref: their references were given to the tuple above */
+        __pyx_t_7 = 0;
+        __pyx_t_7 = __Pyx_PyObject_Call(((PyObject *)__pyx_memoryview_type), __pyx_t_8, NULL); if (unlikely(!__pyx_t_7)) __PYX_ERR(1, 438, __pyx_L4_error)
+        __Pyx_GOTREF(__pyx_t_7);
+        __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0;
+        __Pyx_DECREF_SET(__pyx_v_obj, __pyx_t_7); /* rebinds the local `obj` to the newly constructed object */
+        __pyx_t_7 = 0;
+
+        /* "View.MemoryView":437
+ *     cdef is_slice(self, obj):
+ *         if not isinstance(obj, memoryview):
+ *             try:             # <<<<<<<<<<<<<<
+ *                 obj = memoryview(obj, self.flags & ~PyBUF_WRITABLE | PyBUF_ANY_CONTIGUOUS,
+ *                                  self.dtype_is_object)
+ */
+      }
+      __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0; /* no exception was raised: the saved references are just released */
+      __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0;
+      __Pyx_XDECREF(__pyx_t_5); __pyx_t_5 = 0;
+      goto __pyx_L9_try_end;
+      __pyx_L4_error:; /* the try body failed: release whatever temporaries it still holds */
+      __Pyx_XDECREF(__pyx_t_6); __pyx_t_6 = 0;
+      __Pyx_XDECREF(__pyx_t_7); __pyx_t_7 = 0;
+      __Pyx_XDECREF(__pyx_t_8); __pyx_t_8 = 0;
+
+      /* "View.MemoryView":440
+ *                 obj = memoryview(obj, self.flags & ~PyBUF_WRITABLE | PyBUF_ANY_CONTIGUOUS,
+ *                                  self.dtype_is_object)
+ *             except TypeError:             # <<<<<<<<<<<<<<
+ *                 return None
+ * 
+ */
+      __pyx_t_9 = __Pyx_PyErr_ExceptionMatches(__pyx_builtin_TypeError); /* only TypeError is handled; anything else falls through to the except-error exit */
+      if (__pyx_t_9) {
+        __Pyx_AddTraceback("View.MemoryView.memoryview.is_slice", __pyx_clineno, __pyx_lineno, __pyx_filename);
+        if (__Pyx_GetException(&__pyx_t_7, &__pyx_t_8, &__pyx_t_6) < 0) __PYX_ERR(1, 440, __pyx_L6_except_error)
+        __Pyx_XGOTREF(__pyx_t_7);
+        __Pyx_XGOTREF(__pyx_t_8);
+        __Pyx_XGOTREF(__pyx_t_6);
+
+        /* "View.MemoryView":441
+ *                                  self.dtype_is_object)
+ *             except TypeError:
+ *                 return None             # <<<<<<<<<<<<<<
+ * 
+ *         return obj
+ */
+        __Pyx_XDECREF(__pyx_r);
+        __pyx_r = Py_None; __Pyx_INCREF(Py_None);
+        __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; /* the fetched exception objects are not used; release them */
+        __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0;
+        __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0;
+        goto __pyx_L7_except_return;
+      }
+      goto __pyx_L6_except_error;
+
+      /* "View.MemoryView":437
+ *     cdef is_slice(self, obj):
+ *         if not isinstance(obj, memoryview):
+ *             try:             # <<<<<<<<<<<<<<
+ *                 obj = memoryview(obj, self.flags & ~PyBUF_WRITABLE | PyBUF_ANY_CONTIGUOUS,
+ *                                  self.dtype_is_object)
+ */
+      __pyx_L6_except_error:; /* unhandled exception: hand back the saved state, then take the function's error exit */
+      __Pyx_XGIVEREF(__pyx_t_3);
+      __Pyx_XGIVEREF(__pyx_t_4);
+      __Pyx_XGIVEREF(__pyx_t_5);
+      __Pyx_ExceptionReset(__pyx_t_3, __pyx_t_4, __pyx_t_5);
+      goto __pyx_L1_error;
+      __pyx_L7_except_return:; /* `return None` from the except clause: hand back the saved state, then return normally */
+      __Pyx_XGIVEREF(__pyx_t_3);
+      __Pyx_XGIVEREF(__pyx_t_4);
+      __Pyx_XGIVEREF(__pyx_t_5);
+      __Pyx_ExceptionReset(__pyx_t_3, __pyx_t_4, __pyx_t_5);
+      goto __pyx_L0;
+      __pyx_L9_try_end:;
+    }
+
+    /* "View.MemoryView":436
+ * 
+ *     cdef is_slice(self, obj):
+ *         if not isinstance(obj, memoryview):             # <<<<<<<<<<<<<<
+ *             try:
+ *                 obj = memoryview(obj, self.flags & ~PyBUF_WRITABLE | PyBUF_ANY_CONTIGUOUS,
+ */
+  }
+
+  /* "View.MemoryView":443
+ *                 return None
+ * 
+ *         return obj             # <<<<<<<<<<<<<<
+ * 
+ *     cdef setitem_slice_assignment(self, dst, src):
+ */
+  __Pyx_XDECREF(__pyx_r);
+  __Pyx_INCREF(__pyx_v_obj); /* the returned reference is separate from the local's, which is released at exit */
+  __pyx_r = __pyx_v_obj;
+  goto __pyx_L0;
+
+  /* "View.MemoryView":435
+ *             self.setitem_indexed(index, value)
+ * 
+ *     cdef is_slice(self, obj):             # <<<<<<<<<<<<<<
+ *         if not isinstance(obj, memoryview):
+ *             try:
+ */
+
+  /* function exit code */
+  __pyx_L1_error:; /* error exit: release live temporaries, record a traceback entry, return NULL */
+  __Pyx_XDECREF(__pyx_t_6);
+  __Pyx_XDECREF(__pyx_t_7);
+  __Pyx_XDECREF(__pyx_t_8);
+  __Pyx_AddTraceback("View.MemoryView.memoryview.is_slice", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = 0;
+  __pyx_L0:; /* common exit: the local obj reference is released on every path */
+  __Pyx_XDECREF(__pyx_v_obj);
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":445
+ *         return obj
+ * 
+ *     cdef setitem_slice_assignment(self, dst, src):             # <<<<<<<<<<<<<<
+ *         cdef __Pyx_memviewslice dst_slice
+ *         cdef __Pyx_memviewslice src_slice
+ *
+ * Generated C for memoryview.setitem_slice_assignment.
+ *
+ * Resolves `src` and `dst` to __Pyx_memviewslice structs and passes them,
+ * together with each object's `ndim` attribute and self->dtype_is_object,
+ * to __pyx_memoryview_copy_contents.
+ *
+ * Returns a new reference to Py_None on success. On failure it returns NULL
+ * after adding a traceback entry; the failure points are the two memoryview
+ * type tests, the two slice lookups, the `ndim` attribute reads and their
+ * conversion to int, and a -1 result from the copy helper.
+ */
+
+static PyObject *__pyx_memoryview_setitem_slice_assignment(struct __pyx_memoryview_obj *__pyx_v_self, PyObject *__pyx_v_dst, PyObject *__pyx_v_src) {
+  __Pyx_memviewslice __pyx_v_dst_slice;
+  __Pyx_memviewslice __pyx_v_src_slice;
+  __Pyx_memviewslice __pyx_v_msrc;
+  __Pyx_memviewslice __pyx_v_mdst;
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  __Pyx_memviewslice *__pyx_t_1;
+  PyObject *__pyx_t_2 = NULL;
+  int __pyx_t_3;
+  int __pyx_t_4;
+  int __pyx_t_5;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("setitem_slice_assignment", 1);
+
+  /* "View.MemoryView":448
+ *         cdef __Pyx_memviewslice dst_slice
+ *         cdef __Pyx_memviewslice src_slice
+ *         cdef __Pyx_memviewslice msrc = get_slice_from_memview(src, &src_slice)[0]             # <<<<<<<<<<<<<<
+ *         cdef __Pyx_memviewslice mdst = get_slice_from_memview(dst, &dst_slice)[0]
+ * 
+ *
+ * The type test below lets Py_None through; the object is then cast and
+ * handed to the slice helper unchanged.
+ * NOTE(review): how the helper treats None is not visible in this chunk - confirm.
+ * The struct the helper points at is copied by value into msrc.
+ */
+  if (!(likely(((__pyx_v_src) == Py_None) || likely(__Pyx_TypeTest(__pyx_v_src, __pyx_memoryview_type))))) __PYX_ERR(1, 448, __pyx_L1_error)
+  __pyx_t_1 = __pyx_memoryview_get_slice_from_memoryview(((struct __pyx_memoryview_obj *)__pyx_v_src), (&__pyx_v_src_slice)); if (unlikely(__pyx_t_1 == ((__Pyx_memviewslice *)NULL))) __PYX_ERR(1, 448, __pyx_L1_error)
+  __pyx_v_msrc = (__pyx_t_1[0]);
+
+  /* "View.MemoryView":449
+ *         cdef __Pyx_memviewslice src_slice
+ *         cdef __Pyx_memviewslice msrc = get_slice_from_memview(src, &src_slice)[0]
+ *         cdef __Pyx_memviewslice mdst = get_slice_from_memview(dst, &dst_slice)[0]             # <<<<<<<<<<<<<<
+ * 
+ *         memoryview_copy_contents(msrc, mdst, src.ndim, dst.ndim, self.dtype_is_object)
+ *
+ * Same sequence as for src: type test, slice lookup, by-value copy into mdst.
+ */
+  if (!(likely(((__pyx_v_dst) == Py_None) || likely(__Pyx_TypeTest(__pyx_v_dst, __pyx_memoryview_type))))) __PYX_ERR(1, 449, __pyx_L1_error)
+  __pyx_t_1 = __pyx_memoryview_get_slice_from_memoryview(((struct __pyx_memoryview_obj *)__pyx_v_dst), (&__pyx_v_dst_slice)); if (unlikely(__pyx_t_1 == ((__Pyx_memviewslice *)NULL))) __PYX_ERR(1, 449, __pyx_L1_error)
+  __pyx_v_mdst = (__pyx_t_1[0]);
+
+  /* "View.MemoryView":451
+ *         cdef __Pyx_memviewslice mdst = get_slice_from_memview(dst, &dst_slice)[0]
+ * 
+ *         memoryview_copy_contents(msrc, mdst, src.ndim, dst.ndim, self.dtype_is_object)             # <<<<<<<<<<<<<<
+ * 
+ *     cdef setitem_slice_assign_scalar(self, memoryview dst, value):
+ *
+ * src and dst are declared as plain objects, so `ndim` is read through the
+ * generic attribute lookup and then narrowed to a C int (temporaries t_3 and
+ * t_4) before the copy call. The temporary holding the attribute object is
+ * released right after each conversion.
+ */
+  __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_v_src, __pyx_n_s_ndim); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 451, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_2);
+  __pyx_t_3 = __Pyx_PyInt_As_int(__pyx_t_2); if (unlikely((__pyx_t_3 == (int)-1) && PyErr_Occurred())) __PYX_ERR(1, 451, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+  __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_v_dst, __pyx_n_s_ndim); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 451, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_2);
+  __pyx_t_4 = __Pyx_PyInt_As_int(__pyx_t_2); if (unlikely((__pyx_t_4 == (int)-1) && PyErr_Occurred())) __PYX_ERR(1, 451, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+  __pyx_t_5 = __pyx_memoryview_copy_contents(__pyx_v_msrc, __pyx_v_mdst, __pyx_t_3, __pyx_t_4, __pyx_v_self->dtype_is_object); if (unlikely(__pyx_t_5 == ((int)-1))) __PYX_ERR(1, 451, __pyx_L1_error)
+
+  /* "View.MemoryView":445
+ *         return obj
+ * 
+ *     cdef setitem_slice_assignment(self, dst, src):             # <<<<<<<<<<<<<<
+ *         cdef __Pyx_memviewslice dst_slice
+ *         cdef __Pyx_memviewslice src_slice
+ */
+
+  /* function exit code: success returns a new reference to None; the error
+   * label drops the one object temporary, records a traceback and returns NULL */
+  __pyx_r = Py_None; __Pyx_INCREF(Py_None);
+  goto __pyx_L0;
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_2);
+  __Pyx_AddTraceback("View.MemoryView.memoryview.setitem_slice_assignment", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = 0;
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":453
+ *         memoryview_copy_contents(msrc, mdst, src.ndim, dst.ndim, self.dtype_is_object)
+ * 
+ *     cdef setitem_slice_assign_scalar(self, memoryview dst, value):             # <<<<<<<<<<<<<<
+ *         cdef int array[128]
+ *         cdef void *tmp = NULL
+ *
+ * Generated C for memoryview.setitem_slice_assign_scalar.
+ *
+ * Converts `value` once into a scratch item buffer and passes that buffer,
+ * with the slice obtained from `dst`, to __pyx_memoryview_slice_assign_scalar.
+ *
+ * Scratch storage: the on-stack `array` (0x80 ints) is used unless
+ * self->view.itemsize exceeds sizeof(array); in that case a buffer of
+ * itemsize bytes comes from PyMem_Malloc and is released in the finally block.
+ *
+ * Returns a new reference to Py_None on success. Returns NULL with a
+ * traceback entry when the slice lookup, the allocation, the item conversion
+ * or the direct-dimension check fails.
+ */
+
+static PyObject *__pyx_memoryview_setitem_slice_assign_scalar(struct __pyx_memoryview_obj *__pyx_v_self, struct __pyx_memoryview_obj *__pyx_v_dst, PyObject *__pyx_v_value) {
+  int __pyx_v_array[0x80];
+  void *__pyx_v_tmp;
+  void *__pyx_v_item;
+  __Pyx_memviewslice *__pyx_v_dst_slice;
+  __Pyx_memviewslice __pyx_v_tmp_slice;
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  __Pyx_memviewslice *__pyx_t_1;
+  int __pyx_t_2;
+  PyObject *__pyx_t_3 = NULL;
+  int __pyx_t_4;
+  int __pyx_t_5;
+  char const *__pyx_t_6;
+  PyObject *__pyx_t_7 = NULL;
+  PyObject *__pyx_t_8 = NULL;
+  PyObject *__pyx_t_9 = NULL;
+  PyObject *__pyx_t_10 = NULL;
+  PyObject *__pyx_t_11 = NULL;
+  PyObject *__pyx_t_12 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("setitem_slice_assign_scalar", 1);
+
+  /* "View.MemoryView":455
+ *     cdef setitem_slice_assign_scalar(self, memoryview dst, value):
+ *         cdef int array[128]
+ *         cdef void *tmp = NULL             # <<<<<<<<<<<<<<
+ *         cdef void *item
+ * 
+ *
+ * tmp stays NULL unless the heap path below is taken, so the unconditional
+ * PyMem_Free(tmp) in the finally block is given NULL on the stack-array path.
+ */
+  __pyx_v_tmp = NULL;
+
+  /* "View.MemoryView":460
+ *         cdef __Pyx_memviewslice *dst_slice
+ *         cdef __Pyx_memviewslice tmp_slice
+ *         dst_slice = get_slice_from_memview(dst, &tmp_slice)             # <<<<<<<<<<<<<<
+ * 
+ *         if <size_t>self.view.itemsize > sizeof(array):
+ */
+  __pyx_t_1 = __pyx_memoryview_get_slice_from_memoryview(__pyx_v_dst, (&__pyx_v_tmp_slice)); if (unlikely(__pyx_t_1 == ((__Pyx_memviewslice *)NULL))) __PYX_ERR(1, 460, __pyx_L1_error)
+  __pyx_v_dst_slice = __pyx_t_1;
+
+  /* "View.MemoryView":462
+ *         dst_slice = get_slice_from_memview(dst, &tmp_slice)
+ * 
+ *         if <size_t>self.view.itemsize > sizeof(array):             # <<<<<<<<<<<<<<
+ *             tmp = PyMem_Malloc(self.view.itemsize)
+ *             if tmp == NULL:
+ *
+ * The comparison is against the byte size of the whole stack array, not its
+ * element count.
+ */
+  __pyx_t_2 = (((size_t)__pyx_v_self->view.itemsize) > (sizeof(__pyx_v_array)));
+  if (__pyx_t_2) {
+
+    /* "View.MemoryView":463
+ * 
+ *         if <size_t>self.view.itemsize > sizeof(array):
+ *             tmp = PyMem_Malloc(self.view.itemsize)             # <<<<<<<<<<<<<<
+ *             if tmp == NULL:
+ *                 raise MemoryError
+ */
+    __pyx_v_tmp = PyMem_Malloc(__pyx_v_self->view.itemsize);
+
+    /* "View.MemoryView":464
+ *         if <size_t>self.view.itemsize > sizeof(array):
+ *             tmp = PyMem_Malloc(self.view.itemsize)
+ *             if tmp == NULL:             # <<<<<<<<<<<<<<
+ *                 raise MemoryError
+ *             item = tmp
+ */
+    __pyx_t_2 = (__pyx_v_tmp == NULL);
+    if (unlikely(__pyx_t_2)) {
+
+      /* "View.MemoryView":465
+ *             tmp = PyMem_Malloc(self.view.itemsize)
+ *             if tmp == NULL:
+ *                 raise MemoryError             # <<<<<<<<<<<<<<
+ *             item = tmp
+ *         else:
+ *
+ * tmp is NULL on this path, so jumping straight to the function-level error
+ * label (bypassing the finally block) leaves nothing to free.
+ */
+      PyErr_NoMemory(); __PYX_ERR(1, 465, __pyx_L1_error)
+
+      /* "View.MemoryView":464
+ *         if <size_t>self.view.itemsize > sizeof(array):
+ *             tmp = PyMem_Malloc(self.view.itemsize)
+ *             if tmp == NULL:             # <<<<<<<<<<<<<<
+ *                 raise MemoryError
+ *             item = tmp
+ */
+    }
+
+    /* "View.MemoryView":466
+ *             if tmp == NULL:
+ *                 raise MemoryError
+ *             item = tmp             # <<<<<<<<<<<<<<
+ *         else:
+ *             item = <void *> array
+ */
+    __pyx_v_item = __pyx_v_tmp;
+
+    /* "View.MemoryView":462
+ *         dst_slice = get_slice_from_memview(dst, &tmp_slice)
+ * 
+ *         if <size_t>self.view.itemsize > sizeof(array):             # <<<<<<<<<<<<<<
+ *             tmp = PyMem_Malloc(self.view.itemsize)
+ *             if tmp == NULL:
+ */
+    goto __pyx_L3;
+  }
+
+  /* "View.MemoryView":468
+ *             item = tmp
+ *         else:
+ *             item = <void *> array             # <<<<<<<<<<<<<<
+ * 
+ *         try:
+ */
+  /*else*/ {
+    __pyx_v_item = ((void *)__pyx_v_array);
+  }
+  __pyx_L3:;
+
+  /* "View.MemoryView":470
+ *             item = <void *> array
+ * 
+ *         try:             # <<<<<<<<<<<<<<
+ *             if self.dtype_is_object:
+ *                 (<PyObject **> item)[0] = <PyObject *> value
+ *
+ * From here on every failure jumps to __pyx_L6_error, the exception exit of
+ * the finally block, so that tmp is released before the error propagates.
+ */
+  /*try:*/ {
+
+    /* "View.MemoryView":471
+ * 
+ *         try:
+ *             if self.dtype_is_object:             # <<<<<<<<<<<<<<
+ *                 (<PyObject **> item)[0] = <PyObject *> value
+ *             else:
+ */
+    if (__pyx_v_self->dtype_is_object) {
+
+      /* "View.MemoryView":472
+ *         try:
+ *             if self.dtype_is_object:
+ *                 (<PyObject **> item)[0] = <PyObject *> value             # <<<<<<<<<<<<<<
+ *             else:
+ *                 self.assign_item_from_object(<char *> item, value)
+ *
+ * The object pointer is stored as-is; no reference is taken by this statement.
+ */
+      (((PyObject **)__pyx_v_item)[0]) = ((PyObject *)__pyx_v_value);
+
+      /* "View.MemoryView":471
+ * 
+ *         try:
+ *             if self.dtype_is_object:             # <<<<<<<<<<<<<<
+ *                 (<PyObject **> item)[0] = <PyObject *> value
+ *             else:
+ */
+      goto __pyx_L8;
+    }
+
+    /* "View.MemoryView":474
+ *                 (<PyObject **> item)[0] = <PyObject *> value
+ *             else:
+ *                 self.assign_item_from_object(<char *> item, value)             # <<<<<<<<<<<<<<
+ * 
+ * 
+ *
+ * Dispatched through the vtable. The returned object is only checked for
+ * NULL and then released.
+ */
+    /*else*/ {
+      __pyx_t_3 = ((struct __pyx_vtabstruct_memoryview *)__pyx_v_self->__pyx_vtab)->assign_item_from_object(__pyx_v_self, ((char *)__pyx_v_item), __pyx_v_value); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 474, __pyx_L6_error)
+      __Pyx_GOTREF(__pyx_t_3);
+      __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+    }
+    __pyx_L8:;
+
+    /* "View.MemoryView":478
+ * 
+ * 
+ *             if self.view.suboffsets != NULL:             # <<<<<<<<<<<<<<
+ *                 assert_direct_dimensions(self.view.suboffsets, self.view.ndim)
+ *             slice_assign_scalar(dst_slice, dst.view.ndim, self.view.itemsize,
+ */
+    __pyx_t_2 = (__pyx_v_self->view.suboffsets != NULL);
+    if (__pyx_t_2) {
+
+      /* "View.MemoryView":479
+ * 
+ *             if self.view.suboffsets != NULL:
+ *                 assert_direct_dimensions(self.view.suboffsets, self.view.ndim)             # <<<<<<<<<<<<<<
+ *             slice_assign_scalar(dst_slice, dst.view.ndim, self.view.itemsize,
+ *                                 item, self.dtype_is_object)
+ *
+ * A -1 result is treated as an error and routed through the finally block.
+ */
+      __pyx_t_4 = assert_direct_dimensions(__pyx_v_self->view.suboffsets, __pyx_v_self->view.ndim); if (unlikely(__pyx_t_4 == ((int)-1))) __PYX_ERR(1, 479, __pyx_L6_error)
+
+      /* "View.MemoryView":478
+ * 
+ * 
+ *             if self.view.suboffsets != NULL:             # <<<<<<<<<<<<<<
+ *                 assert_direct_dimensions(self.view.suboffsets, self.view.ndim)
+ *             slice_assign_scalar(dst_slice, dst.view.ndim, self.view.itemsize,
+ */
+    }
+
+    /* "View.MemoryView":480
+ *             if self.view.suboffsets != NULL:
+ *                 assert_direct_dimensions(self.view.suboffsets, self.view.ndim)
+ *             slice_assign_scalar(dst_slice, dst.view.ndim, self.view.itemsize,             # <<<<<<<<<<<<<<
+ *                                 item, self.dtype_is_object)
+ *         finally:
+ *
+ * Note the mix of sources: ndim comes from dst, itemsize and dtype_is_object
+ * from self. No result of this call is inspected here.
+ */
+    __pyx_memoryview_slice_assign_scalar(__pyx_v_dst_slice, __pyx_v_dst->view.ndim, __pyx_v_self->view.itemsize, __pyx_v_item, __pyx_v_self->dtype_is_object);
+  }
+
+  /* "View.MemoryView":483
+ *                                 item, self.dtype_is_object)
+ *         finally:
+ *             PyMem_Free(tmp)             # <<<<<<<<<<<<<<
+ * 
+ *     cdef setitem_indexed(self, index, value):
+ *
+ * PyMem_Free(tmp) runs on both exits. On the exception exit the pending
+ * error and the recorded line/file position are parked in temporaries around
+ * the free, then restored before control reaches the function error label.
+ */
+  /*finally:*/ {
+    /*normal exit:*/{
+      PyMem_Free(__pyx_v_tmp);
+      goto __pyx_L7;
+    }
+    __pyx_L6_error:;
+    /*exception exit:*/{
+      __Pyx_PyThreadState_declare
+      __Pyx_PyThreadState_assign
+      __pyx_t_7 = 0; __pyx_t_8 = 0; __pyx_t_9 = 0; __pyx_t_10 = 0; __pyx_t_11 = 0; __pyx_t_12 = 0;
+      __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0;
+      if (PY_MAJOR_VERSION >= 3) __Pyx_ExceptionSwap(&__pyx_t_10, &__pyx_t_11, &__pyx_t_12);
+      if ((PY_MAJOR_VERSION < 3) || unlikely(__Pyx_GetException(&__pyx_t_7, &__pyx_t_8, &__pyx_t_9) < 0)) __Pyx_ErrFetch(&__pyx_t_7, &__pyx_t_8, &__pyx_t_9);
+      __Pyx_XGOTREF(__pyx_t_7);
+      __Pyx_XGOTREF(__pyx_t_8);
+      __Pyx_XGOTREF(__pyx_t_9);
+      __Pyx_XGOTREF(__pyx_t_10);
+      __Pyx_XGOTREF(__pyx_t_11);
+      __Pyx_XGOTREF(__pyx_t_12);
+      __pyx_t_4 = __pyx_lineno; __pyx_t_5 = __pyx_clineno; __pyx_t_6 = __pyx_filename;
+      {
+        PyMem_Free(__pyx_v_tmp);
+      }
+      if (PY_MAJOR_VERSION >= 3) {
+        __Pyx_XGIVEREF(__pyx_t_10);
+        __Pyx_XGIVEREF(__pyx_t_11);
+        __Pyx_XGIVEREF(__pyx_t_12);
+        __Pyx_ExceptionReset(__pyx_t_10, __pyx_t_11, __pyx_t_12);
+      }
+      __Pyx_XGIVEREF(__pyx_t_7);
+      __Pyx_XGIVEREF(__pyx_t_8);
+      __Pyx_XGIVEREF(__pyx_t_9);
+      __Pyx_ErrRestore(__pyx_t_7, __pyx_t_8, __pyx_t_9);
+      __pyx_t_7 = 0; __pyx_t_8 = 0; __pyx_t_9 = 0; __pyx_t_10 = 0; __pyx_t_11 = 0; __pyx_t_12 = 0;
+      __pyx_lineno = __pyx_t_4; __pyx_clineno = __pyx_t_5; __pyx_filename = __pyx_t_6;
+      goto __pyx_L1_error;
+    }
+    __pyx_L7:;
+  }
+
+  /* "View.MemoryView":453
+ *         memoryview_copy_contents(msrc, mdst, src.ndim, dst.ndim, self.dtype_is_object)
+ * 
+ *     cdef setitem_slice_assign_scalar(self, memoryview dst, value):             # <<<<<<<<<<<<<<
+ *         cdef int array[128]
+ *         cdef void *tmp = NULL
+ */
+
+  /* function exit code: success returns a new reference to None; the error
+   * label records a traceback and returns NULL */
+  __pyx_r = Py_None; __Pyx_INCREF(Py_None);
+  goto __pyx_L0;
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_3);
+  __Pyx_AddTraceback("View.MemoryView.memoryview.setitem_slice_assign_scalar", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = 0;
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":485
+ *             PyMem_Free(tmp)
+ * 
+ *     cdef setitem_indexed(self, index, value):             # <<<<<<<<<<<<<<
+ *         cdef char *itemp = self.get_item_pointer(index)
+ *         self.assign_item_from_object(itemp, value)
+ *
+ * Generated C for memoryview.setitem_indexed.
+ *
+ * Asks the vtable's get_item_pointer for the address that `index` refers to,
+ * then has the vtable's assign_item_from_object write `value` at that address.
+ *
+ * Returns a new reference to Py_None on success. Returns NULL with a
+ * traceback entry when get_item_pointer yields NULL or
+ * assign_item_from_object fails.
+ */
+
+static PyObject *__pyx_memoryview_setitem_indexed(struct __pyx_memoryview_obj *__pyx_v_self, PyObject *__pyx_v_index, PyObject *__pyx_v_value) {
+  char *__pyx_v_itemp;
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  char *__pyx_t_1;
+  PyObject *__pyx_t_2 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("setitem_indexed", 1);
+
+  /* "View.MemoryView":486
+ * 
+ *     cdef setitem_indexed(self, index, value):
+ *         cdef char *itemp = self.get_item_pointer(index)             # <<<<<<<<<<<<<<
+ *         self.assign_item_from_object(itemp, value)
+ * 
+ *
+ * A NULL pointer is the error signal of get_item_pointer at this call site.
+ */
+  __pyx_t_1 = ((struct __pyx_vtabstruct_memoryview *)__pyx_v_self->__pyx_vtab)->get_item_pointer(__pyx_v_self, __pyx_v_index); if (unlikely(__pyx_t_1 == ((char *)NULL))) __PYX_ERR(1, 486, __pyx_L1_error)
+  __pyx_v_itemp = __pyx_t_1;
+
+  /* "View.MemoryView":487
+ *     cdef setitem_indexed(self, index, value):
+ *         cdef char *itemp = self.get_item_pointer(index)
+ *         self.assign_item_from_object(itemp, value)             # <<<<<<<<<<<<<<
+ * 
+ *     cdef convert_item_to_object(self, char *itemp):
+ *
+ * The returned object is only checked for NULL and then released.
+ */
+  __pyx_t_2 = ((struct __pyx_vtabstruct_memoryview *)__pyx_v_self->__pyx_vtab)->assign_item_from_object(__pyx_v_self, __pyx_v_itemp, __pyx_v_value); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 487, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_2);
+  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+
+  /* "View.MemoryView":485
+ *             PyMem_Free(tmp)
+ * 
+ *     cdef setitem_indexed(self, index, value):             # <<<<<<<<<<<<<<
+ *         cdef char *itemp = self.get_item_pointer(index)
+ *         self.assign_item_from_object(itemp, value)
+ */
+
+  /* function exit code: success returns a new reference to None; the error
+   * label records a traceback and returns NULL */
+  __pyx_r = Py_None; __Pyx_INCREF(Py_None);
+  goto __pyx_L0;
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_2);
+  __Pyx_AddTraceback("View.MemoryView.memoryview.setitem_indexed", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = 0;
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":489
+ *         self.assign_item_from_object(itemp, value)
+ * 
+ *     cdef convert_item_to_object(self, char *itemp):             # <<<<<<<<<<<<<<
+ *         """Only used if instantiated manually by the user, or if Cython doesn't
+ *         know how to convert the type"""
+ *
+ * Generated C for memoryview.convert_item_to_object.
+ *
+ * Imports the `struct` module, copies self->view.itemsize bytes starting at
+ * `itemp` into a bytes object and calls struct.unpack(self->view.format, bytes).
+ *
+ * Returns (new reference): element 0 of the unpack result when the format
+ * string has length 1, otherwise the unpack result itself.
+ *
+ * Errors: when unpack raises struct.error it is replaced by
+ * ValueError("Unable to convert item to object"). Any other exception
+ * propagates unchanged. In every error case NULL is returned with a traceback
+ * entry added.
+ */
+
+static PyObject *__pyx_memoryview_convert_item_to_object(struct __pyx_memoryview_obj *__pyx_v_self, char *__pyx_v_itemp) {
+  PyObject *__pyx_v_struct = NULL;
+  PyObject *__pyx_v_bytesitem = 0;
+  PyObject *__pyx_v_result = NULL;
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  PyObject *__pyx_t_2 = NULL;
+  PyObject *__pyx_t_3 = NULL;
+  PyObject *__pyx_t_4 = NULL;
+  PyObject *__pyx_t_5 = NULL;
+  PyObject *__pyx_t_6 = NULL;
+  PyObject *__pyx_t_7 = NULL;
+  unsigned int __pyx_t_8;
+  Py_ssize_t __pyx_t_9;
+  int __pyx_t_10;
+  int __pyx_t_11;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("convert_item_to_object", 1);
+
+  /* "View.MemoryView":492
+ *         """Only used if instantiated manually by the user, or if Cython doesn't
+ *         know how to convert the type"""
+ *         import struct             # <<<<<<<<<<<<<<
+ *         cdef bytes bytesitem
+ * 
+ *
+ * The import is executed on every call; the module reference is owned by the
+ * local __pyx_v_struct and dropped at function exit.
+ */
+  __pyx_t_1 = __Pyx_ImportDottedModule(__pyx_n_s_struct, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 492, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_v_struct = __pyx_t_1;
+  __pyx_t_1 = 0;
+
+  /* "View.MemoryView":495
+ *         cdef bytes bytesitem
+ * 
+ *         bytesitem = itemp[:self.view.itemsize]             # <<<<<<<<<<<<<<
+ *         try:
+ *             result = struct.unpack(self.view.format, bytesitem)
+ */
+  __pyx_t_1 = __Pyx_PyBytes_FromStringAndSize(__pyx_v_itemp + 0, __pyx_v_self->view.itemsize - 0); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 495, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_v_bytesitem = ((PyObject*)__pyx_t_1);
+  __pyx_t_1 = 0;
+
+  /* "View.MemoryView":496
+ * 
+ *         bytesitem = itemp[:self.view.itemsize]
+ *         try:             # <<<<<<<<<<<<<<
+ *             result = struct.unpack(self.view.format, bytesitem)
+ *         except struct.error:
+ *
+ * The exception state that was current on entry is saved in t_2..t_4 and
+ * reinstated on both ways out of this try statement (the except_error and
+ * except_return labels).
+ */
+  {
+    __Pyx_PyThreadState_declare
+    __Pyx_PyThreadState_assign
+    __Pyx_ExceptionSave(&__pyx_t_2, &__pyx_t_3, &__pyx_t_4);
+    __Pyx_XGOTREF(__pyx_t_2);
+    __Pyx_XGOTREF(__pyx_t_3);
+    __Pyx_XGOTREF(__pyx_t_4);
+    /*try:*/ {
+
+      /* "View.MemoryView":497
+ *         bytesitem = itemp[:self.view.itemsize]
+ *         try:
+ *             result = struct.unpack(self.view.format, bytesitem)             # <<<<<<<<<<<<<<
+ *         except struct.error:
+ *             raise ValueError, "Unable to convert item to object"
+ *
+ * When the looked-up `unpack` is a bound method (and CYTHON_UNPACK_METHODS is
+ * enabled) it is split into function and self; t_8 then shifts the argument
+ * window so that self is passed as the first positional argument.
+ */
+      __pyx_t_5 = __Pyx_PyObject_GetAttrStr(__pyx_v_struct, __pyx_n_s_unpack); if (unlikely(!__pyx_t_5)) __PYX_ERR(1, 497, __pyx_L3_error)
+      __Pyx_GOTREF(__pyx_t_5);
+      __pyx_t_6 = __Pyx_PyBytes_FromString(__pyx_v_self->view.format); if (unlikely(!__pyx_t_6)) __PYX_ERR(1, 497, __pyx_L3_error)
+      __Pyx_GOTREF(__pyx_t_6);
+      __pyx_t_7 = NULL;
+      __pyx_t_8 = 0;
+      #if CYTHON_UNPACK_METHODS
+      if (likely(PyMethod_Check(__pyx_t_5))) {
+        __pyx_t_7 = PyMethod_GET_SELF(__pyx_t_5);
+        if (likely(__pyx_t_7)) {
+          PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_5);
+          __Pyx_INCREF(__pyx_t_7);
+          __Pyx_INCREF(function);
+          __Pyx_DECREF_SET(__pyx_t_5, function);
+          __pyx_t_8 = 1;
+        }
+      }
+      #endif
+      {
+        PyObject *__pyx_callargs[3] = {__pyx_t_7, __pyx_t_6, __pyx_v_bytesitem};
+        __pyx_t_1 = __Pyx_PyObject_FastCall(__pyx_t_5, __pyx_callargs+1-__pyx_t_8, 2+__pyx_t_8);
+        __Pyx_XDECREF(__pyx_t_7); __pyx_t_7 = 0;
+        __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0;
+        if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 497, __pyx_L3_error)
+        __Pyx_GOTREF(__pyx_t_1);
+        __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
+      }
+      __pyx_v_result = __pyx_t_1;
+      __pyx_t_1 = 0;
+
+      /* "View.MemoryView":496
+ * 
+ *         bytesitem = itemp[:self.view.itemsize]
+ *         try:             # <<<<<<<<<<<<<<
+ *             result = struct.unpack(self.view.format, bytesitem)
+ *         except struct.error:
+ */
+    }
+
+    /* "View.MemoryView":501
+ *             raise ValueError, "Unable to convert item to object"
+ *         else:
+ *             if len(self.view.format) == 1:             # <<<<<<<<<<<<<<
+ *                 return result[0]
+ *             return result
+ *
+ * Errors raised in this else branch go to the except_error label directly,
+ * so they are not matched against struct.error.
+ */
+    /*else:*/ {
+      __pyx_t_9 = __Pyx_ssize_strlen(__pyx_v_self->view.format); if (unlikely(__pyx_t_9 == ((Py_ssize_t)-1))) __PYX_ERR(1, 501, __pyx_L5_except_error)
+      __pyx_t_10 = (__pyx_t_9 == 1);
+      if (__pyx_t_10) {
+
+        /* "View.MemoryView":502
+ *         else:
+ *             if len(self.view.format) == 1:
+ *                 return result[0]             # <<<<<<<<<<<<<<
+ *             return result
+ * 
+ */
+        __Pyx_XDECREF(__pyx_r);
+        __pyx_t_1 = __Pyx_GetItemInt(__pyx_v_result, 0, long, 1, __Pyx_PyInt_From_long, 0, 0, 1); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 502, __pyx_L5_except_error)
+        __Pyx_GOTREF(__pyx_t_1);
+        __pyx_r = __pyx_t_1;
+        __pyx_t_1 = 0;
+        goto __pyx_L6_except_return;
+
+        /* "View.MemoryView":501
+ *             raise ValueError, "Unable to convert item to object"
+ *         else:
+ *             if len(self.view.format) == 1:             # <<<<<<<<<<<<<<
+ *                 return result[0]
+ *             return result
+ */
+      }
+
+      /* "View.MemoryView":503
+ *             if len(self.view.format) == 1:
+ *                 return result[0]
+ *             return result             # <<<<<<<<<<<<<<
+ * 
+ *     cdef assign_item_from_object(self, char *itemp, object value):
+ */
+      __Pyx_XDECREF(__pyx_r);
+      __Pyx_INCREF(__pyx_v_result);
+      __pyx_r = __pyx_v_result;
+      goto __pyx_L6_except_return;
+    }
+    __pyx_L3_error:;
+    __Pyx_XDECREF(__pyx_t_1); __pyx_t_1 = 0;
+    __Pyx_XDECREF(__pyx_t_5); __pyx_t_5 = 0;
+    __Pyx_XDECREF(__pyx_t_6); __pyx_t_6 = 0;
+    __Pyx_XDECREF(__pyx_t_7); __pyx_t_7 = 0;
+
+    /* "View.MemoryView":498
+ *         try:
+ *             result = struct.unpack(self.view.format, bytesitem)
+ *         except struct.error:             # <<<<<<<<<<<<<<
+ *             raise ValueError, "Unable to convert item to object"
+ *         else:
+ *
+ * The pending error is fetched only so its type can be compared with
+ * struct.error, and is put back immediately afterwards. If it does not
+ * match, control falls through to the except_error label with the original
+ * exception still set.
+ */
+    __Pyx_ErrFetch(&__pyx_t_1, &__pyx_t_5, &__pyx_t_6);
+    __pyx_t_7 = __Pyx_PyObject_GetAttrStr(__pyx_v_struct, __pyx_n_s_error); if (unlikely(!__pyx_t_7)) __PYX_ERR(1, 498, __pyx_L5_except_error)
+    __Pyx_GOTREF(__pyx_t_7);
+    __pyx_t_11 = __Pyx_PyErr_GivenExceptionMatches(__pyx_t_1, __pyx_t_7);
+    __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0;
+    __Pyx_ErrRestore(__pyx_t_1, __pyx_t_5, __pyx_t_6);
+    __pyx_t_1 = 0; __pyx_t_5 = 0; __pyx_t_6 = 0;
+    if (__pyx_t_11) {
+      __Pyx_AddTraceback("View.MemoryView.memoryview.convert_item_to_object", __pyx_clineno, __pyx_lineno, __pyx_filename);
+      if (__Pyx_GetException(&__pyx_t_6, &__pyx_t_5, &__pyx_t_1) < 0) __PYX_ERR(1, 498, __pyx_L5_except_error)
+      __Pyx_XGOTREF(__pyx_t_6);
+      __Pyx_XGOTREF(__pyx_t_5);
+      __Pyx_XGOTREF(__pyx_t_1);
+
+      /* "View.MemoryView":499
+ *             result = struct.unpack(self.view.format, bytesitem)
+ *         except struct.error:
+ *             raise ValueError, "Unable to convert item to object"             # <<<<<<<<<<<<<<
+ *         else:
+ *             if len(self.view.format) == 1:
+ */
+      __Pyx_Raise(__pyx_builtin_ValueError, __pyx_kp_s_Unable_to_convert_item_to_object, 0, 0);
+      __PYX_ERR(1, 499, __pyx_L5_except_error)
+    }
+    goto __pyx_L5_except_error;
+
+    /* "View.MemoryView":496
+ * 
+ *         bytesitem = itemp[:self.view.itemsize]
+ *         try:             # <<<<<<<<<<<<<<
+ *             result = struct.unpack(self.view.format, bytesitem)
+ *         except struct.error:
+ */
+    __pyx_L5_except_error:;
+    __Pyx_XGIVEREF(__pyx_t_2);
+    __Pyx_XGIVEREF(__pyx_t_3);
+    __Pyx_XGIVEREF(__pyx_t_4);
+    __Pyx_ExceptionReset(__pyx_t_2, __pyx_t_3, __pyx_t_4);
+    goto __pyx_L1_error;
+    __pyx_L6_except_return:;
+    __Pyx_XGIVEREF(__pyx_t_2);
+    __Pyx_XGIVEREF(__pyx_t_3);
+    __Pyx_XGIVEREF(__pyx_t_4);
+    __Pyx_ExceptionReset(__pyx_t_2, __pyx_t_3, __pyx_t_4);
+    goto __pyx_L0;
+  }
+
+  /* "View.MemoryView":489
+ *         self.assign_item_from_object(itemp, value)
+ * 
+ *     cdef convert_item_to_object(self, char *itemp):             # <<<<<<<<<<<<<<
+ *         """Only used if instantiated manually by the user, or if Cython doesn't
+ *         know how to convert the type"""
+ */
+
+  /* function exit code: the error label drops the object temporaries, records
+   * a traceback and returns NULL; both paths release the three locals */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_XDECREF(__pyx_t_5);
+  __Pyx_XDECREF(__pyx_t_6);
+  __Pyx_XDECREF(__pyx_t_7);
+  __Pyx_AddTraceback("View.MemoryView.memoryview.convert_item_to_object", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = 0;
+  __pyx_L0:;
+  __Pyx_XDECREF(__pyx_v_struct);
+  __Pyx_XDECREF(__pyx_v_bytesitem);
+  __Pyx_XDECREF(__pyx_v_result);
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":505
+ *             return result
+ * 
+ *     cdef assign_item_from_object(self, char *itemp, object value):             # <<<<<<<<<<<<<<
+ *         """Only used if instantiated manually by the user, or if Cython doesn't
+ *         know how to convert the type"""
+ */
+
+static PyObject *__pyx_memoryview_assign_item_from_object(struct __pyx_memoryview_obj *__pyx_v_self, char *__pyx_v_itemp, PyObject *__pyx_v_value) {
+  PyObject *__pyx_v_struct = NULL;
+  char __pyx_v_c;
+  PyObject *__pyx_v_bytesvalue = 0;
+  Py_ssize_t __pyx_v_i;
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  int __pyx_t_2;
+  PyObject *__pyx_t_3 = NULL;
+  PyObject *__pyx_t_4 = NULL;
+  PyObject *__pyx_t_5 = NULL;
+  unsigned int __pyx_t_6;
+  Py_ssize_t __pyx_t_7;
+  PyObject *__pyx_t_8 = NULL;
+  char *__pyx_t_9;
+  char *__pyx_t_10;
+  char *__pyx_t_11;
+  char *__pyx_t_12;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("assign_item_from_object", 1);
+
+  /* "View.MemoryView":508
+ *         """Only used if instantiated manually by the user, or if Cython doesn't
+ *         know how to convert the type"""
+ *         import struct             # <<<<<<<<<<<<<<
+ *         cdef char c
+ *         cdef bytes bytesvalue
+ */
+  __pyx_t_1 = __Pyx_ImportDottedModule(__pyx_n_s_struct, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 508, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_v_struct = __pyx_t_1;
+  __pyx_t_1 = 0;
+
+  /* "View.MemoryView":513
+ *         cdef Py_ssize_t i
+ * 
+ *         if isinstance(value, tuple):             # <<<<<<<<<<<<<<
+ *             bytesvalue = struct.pack(self.view.format, *value)
+ *         else:
+ */
+  __pyx_t_2 = PyTuple_Check(__pyx_v_value); 
+  if (__pyx_t_2) {
+
+    /* "View.MemoryView":514
+ * 
+ *         if isinstance(value, tuple):
+ *             bytesvalue = struct.pack(self.view.format, *value)             # <<<<<<<<<<<<<<
+ *         else:
+ *             bytesvalue = struct.pack(self.view.format, value)
+ */
+    __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_struct, __pyx_n_s_pack); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 514, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_1);
+    __pyx_t_3 = __Pyx_PyBytes_FromString(__pyx_v_self->view.format); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 514, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_3);
+    __pyx_t_4 = PyTuple_New(1); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 514, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_4);
+    __Pyx_GIVEREF(__pyx_t_3);
+    if (__Pyx_PyTuple_SET_ITEM(__pyx_t_4, 0, __pyx_t_3)) __PYX_ERR(1, 514, __pyx_L1_error);
+    __pyx_t_3 = 0;
+    __pyx_t_3 = __Pyx_PySequence_Tuple(__pyx_v_value); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 514, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_3);
+    __pyx_t_5 = PyNumber_Add(__pyx_t_4, __pyx_t_3); if (unlikely(!__pyx_t_5)) __PYX_ERR(1, 514, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_5);
+    __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+    __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+    __pyx_t_3 = __Pyx_PyObject_Call(__pyx_t_1, __pyx_t_5, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 514, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_3);
+    __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+    __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
+    if (!(likely(PyBytes_CheckExact(__pyx_t_3))||((__pyx_t_3) == Py_None) || __Pyx_RaiseUnexpectedTypeError("bytes", __pyx_t_3))) __PYX_ERR(1, 514, __pyx_L1_error)
+    __pyx_v_bytesvalue = ((PyObject*)__pyx_t_3);
+    __pyx_t_3 = 0;
+
+    /* "View.MemoryView":513
+ *         cdef Py_ssize_t i
+ * 
+ *         if isinstance(value, tuple):             # <<<<<<<<<<<<<<
+ *             bytesvalue = struct.pack(self.view.format, *value)
+ *         else:
+ */
+    goto __pyx_L3;
+  }
+
+  /* "View.MemoryView":516
+ *             bytesvalue = struct.pack(self.view.format, *value)
+ *         else:
+ *             bytesvalue = struct.pack(self.view.format, value)             # <<<<<<<<<<<<<<
+ * 
+ *         for i, c in enumerate(bytesvalue):
+ */
+  /*else*/ {
+    __pyx_t_5 = __Pyx_PyObject_GetAttrStr(__pyx_v_struct, __pyx_n_s_pack); if (unlikely(!__pyx_t_5)) __PYX_ERR(1, 516, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_5);
+    __pyx_t_1 = __Pyx_PyBytes_FromString(__pyx_v_self->view.format); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 516, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_1);
+    __pyx_t_4 = NULL;
+    __pyx_t_6 = 0;
+    #if CYTHON_UNPACK_METHODS
+    if (likely(PyMethod_Check(__pyx_t_5))) {
+      __pyx_t_4 = PyMethod_GET_SELF(__pyx_t_5);
+      if (likely(__pyx_t_4)) {
+        PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_5);
+        __Pyx_INCREF(__pyx_t_4);
+        __Pyx_INCREF(function);
+        __Pyx_DECREF_SET(__pyx_t_5, function);
+        __pyx_t_6 = 1;
+      }
+    }
+    #endif
+    {
+      PyObject *__pyx_callargs[3] = {__pyx_t_4, __pyx_t_1, __pyx_v_value};
+      __pyx_t_3 = __Pyx_PyObject_FastCall(__pyx_t_5, __pyx_callargs+1-__pyx_t_6, 2+__pyx_t_6);
+      __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0;
+      __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+      if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 516, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_3);
+      __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
+    }
+    if (!(likely(PyBytes_CheckExact(__pyx_t_3))||((__pyx_t_3) == Py_None) || __Pyx_RaiseUnexpectedTypeError("bytes", __pyx_t_3))) __PYX_ERR(1, 516, __pyx_L1_error)
+    __pyx_v_bytesvalue = ((PyObject*)__pyx_t_3);
+    __pyx_t_3 = 0;
+  }
+  __pyx_L3:;
+
+  /* "View.MemoryView":518
+ *             bytesvalue = struct.pack(self.view.format, value)
+ * 
+ *         for i, c in enumerate(bytesvalue):             # <<<<<<<<<<<<<<
+ *             itemp[i] = c
+ * 
+ */
+  __pyx_t_7 = 0;
+  if (unlikely(__pyx_v_bytesvalue == Py_None)) {
+    PyErr_SetString(PyExc_TypeError, "'NoneType' is not iterable");
+    __PYX_ERR(1, 518, __pyx_L1_error)
+  }
+  __Pyx_INCREF(__pyx_v_bytesvalue);
+  __pyx_t_8 = __pyx_v_bytesvalue;
+  __pyx_t_10 = PyBytes_AS_STRING(__pyx_t_8);
+  __pyx_t_11 = (__pyx_t_10 + PyBytes_GET_SIZE(__pyx_t_8));
+  for (__pyx_t_12 = __pyx_t_10; __pyx_t_12 < __pyx_t_11; __pyx_t_12++) {
+    __pyx_t_9 = __pyx_t_12;
+    __pyx_v_c = (__pyx_t_9[0]);
+
+    /* "View.MemoryView":519
+ * 
+ *         for i, c in enumerate(bytesvalue):
+ *             itemp[i] = c             # <<<<<<<<<<<<<<
+ * 
+ *     @cname('getbuffer')
+ */
+    __pyx_v_i = __pyx_t_7;
+
+    /* "View.MemoryView":518
+ *             bytesvalue = struct.pack(self.view.format, value)
+ * 
+ *         for i, c in enumerate(bytesvalue):             # <<<<<<<<<<<<<<
+ *             itemp[i] = c
+ * 
+ */
+    __pyx_t_7 = (__pyx_t_7 + 1);
+
+    /* "View.MemoryView":519
+ * 
+ *         for i, c in enumerate(bytesvalue):
+ *             itemp[i] = c             # <<<<<<<<<<<<<<
+ * 
+ *     @cname('getbuffer')
+ */
+    (__pyx_v_itemp[__pyx_v_i]) = __pyx_v_c;
+  }
+  __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0;
+
+  /* "View.MemoryView":505
+ *             return result
+ * 
+ *     cdef assign_item_from_object(self, char *itemp, object value):             # <<<<<<<<<<<<<<
+ *         """Only used if instantiated manually by the user, or if Cython doesn't
+ *         know how to convert the type"""
+ */
+
+  /* function exit code */
+  __pyx_r = Py_None; __Pyx_INCREF(Py_None);
+  goto __pyx_L0;
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_XDECREF(__pyx_t_3);
+  __Pyx_XDECREF(__pyx_t_4);
+  __Pyx_XDECREF(__pyx_t_5);
+  __Pyx_XDECREF(__pyx_t_8);
+  __Pyx_AddTraceback("View.MemoryView.memoryview.assign_item_from_object", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = 0;
+  __pyx_L0:;
+  __Pyx_XDECREF(__pyx_v_struct);
+  __Pyx_XDECREF(__pyx_v_bytesvalue);
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":521
+ *             itemp[i] = c
+ * 
+ *     @cname('getbuffer')             # <<<<<<<<<<<<<<
+ *     def __getbuffer__(self, Py_buffer *info, int flags):
+ *         if flags & PyBUF_WRITABLE and self.view.readonly:
+ */
+
+/* Python wrapper */
+CYTHON_UNUSED static int __pyx_memoryview_getbuffer(PyObject *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags); /*proto*/
+CYTHON_UNUSED static int __pyx_memoryview_getbuffer(PyObject *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags) { /* Wrapper for memoryview.__getbuffer__: casts self to the memoryview struct, forwards to the __pyx_pf_ implementation and returns its int status unchanged. */
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  int __pyx_r;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__getbuffer__ (wrapper)", 0);
+  __pyx_kwvalues = __Pyx_KwValues_VARARGS(__pyx_args, __pyx_nargs); /* NOTE(review): __pyx_args/__pyx_nargs are not declared in this function; presumably the macro discards its arguments -- confirm against the macro definition. */
+  __pyx_r = __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_8__getbuffer__(((struct __pyx_memoryview_obj *)__pyx_v_self), ((Py_buffer *)__pyx_v_info), ((int)__pyx_v_flags));
+
+  /* function exit code */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static int __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_8__getbuffer__(struct __pyx_memoryview_obj *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags) { /* Implements memoryview.__getbuffer__: copies fields of self->view into *info, gated by the PyBUF_* bits in flags. Returns 0 on success, -1 with an exception set on failure. */
+  int __pyx_r;
+  __Pyx_RefNannyDeclarations
+  int __pyx_t_1;
+  int __pyx_t_2;
+  Py_ssize_t *__pyx_t_3;
+  char *__pyx_t_4;
+  void *__pyx_t_5;
+  int __pyx_t_6;
+  Py_ssize_t __pyx_t_7;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  if (unlikely(__pyx_v_info == NULL)) { /* a NULL view is rejected with BufferError before the refnanny context is set up, hence the direct return */
+    PyErr_SetString(PyExc_BufferError, "PyObject_GetBuffer: view==NULL argument is obsolete");
+    return -1;
+  }
+  __Pyx_RefNannySetupContext("__getbuffer__", 0);
+  __pyx_v_info->obj = Py_None; __Pyx_INCREF(Py_None); /* placeholder owner: replaced by self on success, released on the exit paths at the bottom */
+  __Pyx_GIVEREF(__pyx_v_info->obj);
+
+  /* "View.MemoryView":523
+ *     @cname('getbuffer')
+ *     def __getbuffer__(self, Py_buffer *info, int flags):
+ *         if flags & PyBUF_WRITABLE and self.view.readonly:             # <<<<<<<<<<<<<<
+ *             raise ValueError, "Cannot create writable memory view from read-only memoryview"
+ * 
+ */
+  __pyx_t_2 = ((__pyx_v_flags & PyBUF_WRITABLE) != 0); /* short-circuit `and`: view.readonly is only read when a writable buffer was requested */
+  if (__pyx_t_2) {
+  } else {
+    __pyx_t_1 = __pyx_t_2;
+    goto __pyx_L4_bool_binop_done;
+  }
+  __pyx_t_1 = __pyx_v_self->view.readonly;
+  __pyx_L4_bool_binop_done:;
+  if (unlikely(__pyx_t_1)) {
+
+    /* "View.MemoryView":524
+ *     def __getbuffer__(self, Py_buffer *info, int flags):
+ *         if flags & PyBUF_WRITABLE and self.view.readonly:
+ *             raise ValueError, "Cannot create writable memory view from read-only memoryview"             # <<<<<<<<<<<<<<
+ * 
+ *         if flags & PyBUF_ND:
+ */
+    __Pyx_Raise(__pyx_builtin_ValueError, __pyx_kp_s_Cannot_create_writable_memory_vi, 0, 0);
+    __PYX_ERR(1, 524, __pyx_L1_error)
+
+    /* "View.MemoryView":523
+ *     @cname('getbuffer')
+ *     def __getbuffer__(self, Py_buffer *info, int flags):
+ *         if flags & PyBUF_WRITABLE and self.view.readonly:             # <<<<<<<<<<<<<<
+ *             raise ValueError, "Cannot create writable memory view from read-only memoryview"
+ * 
+ */
+  }
+
+  /* "View.MemoryView":526
+ *             raise ValueError, "Cannot create writable memory view from read-only memoryview"
+ * 
+ *         if flags & PyBUF_ND:             # <<<<<<<<<<<<<<
+ *             info.shape = self.view.shape
+ *         else:
+ */
+  __pyx_t_1 = ((__pyx_v_flags & PyBUF_ND) != 0); /* shape pointer is shared with self->view only when PyBUF_ND is set, otherwise NULL */
+  if (__pyx_t_1) {
+
+    /* "View.MemoryView":527
+ * 
+ *         if flags & PyBUF_ND:
+ *             info.shape = self.view.shape             # <<<<<<<<<<<<<<
+ *         else:
+ *             info.shape = NULL
+ */
+    __pyx_t_3 = __pyx_v_self->view.shape;
+    __pyx_v_info->shape = __pyx_t_3;
+
+    /* "View.MemoryView":526
+ *             raise ValueError, "Cannot create writable memory view from read-only memoryview"
+ * 
+ *         if flags & PyBUF_ND:             # <<<<<<<<<<<<<<
+ *             info.shape = self.view.shape
+ *         else:
+ */
+    goto __pyx_L6;
+  }
+
+  /* "View.MemoryView":529
+ *             info.shape = self.view.shape
+ *         else:
+ *             info.shape = NULL             # <<<<<<<<<<<<<<
+ * 
+ *         if flags & PyBUF_STRIDES:
+ */
+  /*else*/ {
+    __pyx_v_info->shape = NULL;
+  }
+  __pyx_L6:;
+
+  /* "View.MemoryView":531
+ *             info.shape = NULL
+ * 
+ *         if flags & PyBUF_STRIDES:             # <<<<<<<<<<<<<<
+ *             info.strides = self.view.strides
+ *         else:
+ */
+  __pyx_t_1 = ((__pyx_v_flags & PyBUF_STRIDES) != 0); /* strides pointer is shared with self->view only when PyBUF_STRIDES is set, otherwise NULL */
+  if (__pyx_t_1) {
+
+    /* "View.MemoryView":532
+ * 
+ *         if flags & PyBUF_STRIDES:
+ *             info.strides = self.view.strides             # <<<<<<<<<<<<<<
+ *         else:
+ *             info.strides = NULL
+ */
+    __pyx_t_3 = __pyx_v_self->view.strides;
+    __pyx_v_info->strides = __pyx_t_3;
+
+    /* "View.MemoryView":531
+ *             info.shape = NULL
+ * 
+ *         if flags & PyBUF_STRIDES:             # <<<<<<<<<<<<<<
+ *             info.strides = self.view.strides
+ *         else:
+ */
+    goto __pyx_L7;
+  }
+
+  /* "View.MemoryView":534
+ *             info.strides = self.view.strides
+ *         else:
+ *             info.strides = NULL             # <<<<<<<<<<<<<<
+ * 
+ *         if flags & PyBUF_INDIRECT:
+ */
+  /*else*/ {
+    __pyx_v_info->strides = NULL;
+  }
+  __pyx_L7:;
+
+  /* "View.MemoryView":536
+ *             info.strides = NULL
+ * 
+ *         if flags & PyBUF_INDIRECT:             # <<<<<<<<<<<<<<
+ *             info.suboffsets = self.view.suboffsets
+ *         else:
+ */
+  __pyx_t_1 = ((__pyx_v_flags & PyBUF_INDIRECT) != 0); /* suboffsets pointer is shared with self->view only when PyBUF_INDIRECT is set, otherwise NULL */
+  if (__pyx_t_1) {
+
+    /* "View.MemoryView":537
+ * 
+ *         if flags & PyBUF_INDIRECT:
+ *             info.suboffsets = self.view.suboffsets             # <<<<<<<<<<<<<<
+ *         else:
+ *             info.suboffsets = NULL
+ */
+    __pyx_t_3 = __pyx_v_self->view.suboffsets;
+    __pyx_v_info->suboffsets = __pyx_t_3;
+
+    /* "View.MemoryView":536
+ *             info.strides = NULL
+ * 
+ *         if flags & PyBUF_INDIRECT:             # <<<<<<<<<<<<<<
+ *             info.suboffsets = self.view.suboffsets
+ *         else:
+ */
+    goto __pyx_L8;
+  }
+
+  /* "View.MemoryView":539
+ *             info.suboffsets = self.view.suboffsets
+ *         else:
+ *             info.suboffsets = NULL             # <<<<<<<<<<<<<<
+ * 
+ *         if flags & PyBUF_FORMAT:
+ */
+  /*else*/ {
+    __pyx_v_info->suboffsets = NULL;
+  }
+  __pyx_L8:;
+
+  /* "View.MemoryView":541
+ *             info.suboffsets = NULL
+ * 
+ *         if flags & PyBUF_FORMAT:             # <<<<<<<<<<<<<<
+ *             info.format = self.view.format
+ *         else:
+ */
+  __pyx_t_1 = ((__pyx_v_flags & PyBUF_FORMAT) != 0); /* format pointer is shared with self->view only when PyBUF_FORMAT is set, otherwise NULL */
+  if (__pyx_t_1) {
+
+    /* "View.MemoryView":542
+ * 
+ *         if flags & PyBUF_FORMAT:
+ *             info.format = self.view.format             # <<<<<<<<<<<<<<
+ *         else:
+ *             info.format = NULL
+ */
+    __pyx_t_4 = __pyx_v_self->view.format;
+    __pyx_v_info->format = __pyx_t_4;
+
+    /* "View.MemoryView":541
+ *             info.suboffsets = NULL
+ * 
+ *         if flags & PyBUF_FORMAT:             # <<<<<<<<<<<<<<
+ *             info.format = self.view.format
+ *         else:
+ */
+    goto __pyx_L9;
+  }
+
+  /* "View.MemoryView":544
+ *             info.format = self.view.format
+ *         else:
+ *             info.format = NULL             # <<<<<<<<<<<<<<
+ * 
+ *         info.buf = self.view.buf
+ */
+  /*else*/ {
+    __pyx_v_info->format = NULL;
+  }
+  __pyx_L9:;
+
+  /* "View.MemoryView":546
+ *             info.format = NULL
+ * 
+ *         info.buf = self.view.buf             # <<<<<<<<<<<<<<
+ *         info.ndim = self.view.ndim
+ *         info.itemsize = self.view.itemsize
+ */
+  __pyx_t_5 = __pyx_v_self->view.buf; /* from here on the fields are copied unconditionally, regardless of flags */
+  __pyx_v_info->buf = __pyx_t_5;
+
+  /* "View.MemoryView":547
+ * 
+ *         info.buf = self.view.buf
+ *         info.ndim = self.view.ndim             # <<<<<<<<<<<<<<
+ *         info.itemsize = self.view.itemsize
+ *         info.len = self.view.len
+ */
+  __pyx_t_6 = __pyx_v_self->view.ndim;
+  __pyx_v_info->ndim = __pyx_t_6;
+
+  /* "View.MemoryView":548
+ *         info.buf = self.view.buf
+ *         info.ndim = self.view.ndim
+ *         info.itemsize = self.view.itemsize             # <<<<<<<<<<<<<<
+ *         info.len = self.view.len
+ *         info.readonly = self.view.readonly
+ */
+  __pyx_t_7 = __pyx_v_self->view.itemsize;
+  __pyx_v_info->itemsize = __pyx_t_7;
+
+  /* "View.MemoryView":549
+ *         info.ndim = self.view.ndim
+ *         info.itemsize = self.view.itemsize
+ *         info.len = self.view.len             # <<<<<<<<<<<<<<
+ *         info.readonly = self.view.readonly
+ *         info.obj = self
+ */
+  __pyx_t_7 = __pyx_v_self->view.len;
+  __pyx_v_info->len = __pyx_t_7;
+
+  /* "View.MemoryView":550
+ *         info.itemsize = self.view.itemsize
+ *         info.len = self.view.len
+ *         info.readonly = self.view.readonly             # <<<<<<<<<<<<<<
+ *         info.obj = self
+ * 
+ */
+  __pyx_t_1 = __pyx_v_self->view.readonly;
+  __pyx_v_info->readonly = __pyx_t_1;
+
+  /* "View.MemoryView":551
+ *         info.len = self.view.len
+ *         info.readonly = self.view.readonly
+ *         info.obj = self             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  __Pyx_INCREF((PyObject *)__pyx_v_self); /* new reference to self, stored in info->obj below */
+  __Pyx_GIVEREF((PyObject *)__pyx_v_self);
+  __Pyx_GOTREF(__pyx_v_info->obj);
+  __Pyx_DECREF(__pyx_v_info->obj); /* drops the Py_None placeholder installed at function entry */
+  __pyx_v_info->obj = ((PyObject *)__pyx_v_self);
+
+  /* "View.MemoryView":521
+ *             itemp[i] = c
+ * 
+ *     @cname('getbuffer')             # <<<<<<<<<<<<<<
+ *     def __getbuffer__(self, Py_buffer *info, int flags):
+ *         if flags & PyBUF_WRITABLE and self.view.readonly:
+ */
+
+  /* function exit code */
+  __pyx_r = 0;
+  goto __pyx_L0;
+  __pyx_L1_error:;
+  __Pyx_AddTraceback("View.MemoryView.memoryview.__getbuffer__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = -1;
+  if (__pyx_v_info->obj != NULL) { /* error path: release whatever info->obj currently holds and leave it NULL */
+    __Pyx_GOTREF(__pyx_v_info->obj);
+    __Pyx_DECREF(__pyx_v_info->obj); __pyx_v_info->obj = 0;
+  }
+  goto __pyx_L2;
+  __pyx_L0:;
+  if (__pyx_v_info->obj == Py_None) { /* success path: info->obj is cleared only if it is still the Py_None placeholder */
+    __Pyx_GOTREF(__pyx_v_info->obj);
+    __Pyx_DECREF(__pyx_v_info->obj); __pyx_v_info->obj = 0;
+  }
+  __pyx_L2:;
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":554
+ * 
+ * 
+ *     @property             # <<<<<<<<<<<<<<
+ *     def T(self):
+ *         cdef _memoryviewslice result = memoryview_copy(self)
+ */
+
+/* Python wrapper */
+static PyObject *__pyx_pw_15View_dot_MemoryView_10memoryview_1T_1__get__(PyObject *__pyx_v_self); /*proto*/
+static PyObject *__pyx_pw_15View_dot_MemoryView_10memoryview_1T_1__get__(PyObject *__pyx_v_self) { /* Wrapper for the memoryview.T getter: casts self to the memoryview struct and returns the __pyx_pf_ implementation's result unchanged. */
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__get__ (wrapper)", 0);
+  __pyx_kwvalues = __Pyx_KwValues_VARARGS(__pyx_args, __pyx_nargs); /* NOTE(review): __pyx_args/__pyx_nargs are not declared in this function; presumably the macro discards its arguments -- confirm against the macro definition. */
+  __pyx_r = __pyx_pf_15View_dot_MemoryView_10memoryview_1T___get__(((struct __pyx_memoryview_obj *)__pyx_v_self));
+
+  /* function exit code */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static PyObject *__pyx_pf_15View_dot_MemoryView_10memoryview_1T___get__(struct __pyx_memoryview_obj *__pyx_v_self) { /* Getter for memoryview.T: copies self with __pyx_memoryview_copy_object, passes the copy's from_slice to __pyx_memslice_transpose and returns the copy as a new reference; NULL on error. */
+  struct __pyx_memoryviewslice_obj *__pyx_v_result = 0;
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  int __pyx_t_2;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("__get__", 1);
+
+  /* "View.MemoryView":556
+ *     @property
+ *     def T(self):
+ *         cdef _memoryviewslice result = memoryview_copy(self)             # <<<<<<<<<<<<<<
+ *         transpose_memslice(&result.from_slice)
+ *         return result
+ */
+  __pyx_t_1 = __pyx_memoryview_copy_object(__pyx_v_self); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 556, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  if (!(likely(((__pyx_t_1) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_1, __pyx_memoryviewslice_type))))) __PYX_ERR(1, 556, __pyx_L1_error) /* the copy must be None or pass the _memoryviewslice type test */
+  __pyx_v_result = ((struct __pyx_memoryviewslice_obj *)__pyx_t_1); /* ownership of the reference moves from the temporary to result */
+  __pyx_t_1 = 0;
+
+  /* "View.MemoryView":557
+ *     def T(self):
+ *         cdef _memoryviewslice result = memoryview_copy(self)
+ *         transpose_memslice(&result.from_slice)             # <<<<<<<<<<<<<<
+ *         return result
+ * 
+ */
+  __pyx_t_2 = __pyx_memslice_transpose((&__pyx_v_result->from_slice)); if (unlikely(__pyx_t_2 == ((int)-1))) __PYX_ERR(1, 557, __pyx_L1_error) /* a return value of -1 is treated as failure */
+
+  /* "View.MemoryView":558
+ *         cdef _memoryviewslice result = memoryview_copy(self)
+ *         transpose_memslice(&result.from_slice)
+ *         return result             # <<<<<<<<<<<<<<
+ * 
+ *     @property
+ */
+  __Pyx_XDECREF(__pyx_r);
+  __Pyx_INCREF((PyObject *)__pyx_v_result); /* extra reference for the return value; the local reference is dropped at __pyx_L0 */
+  __pyx_r = ((PyObject *)__pyx_v_result);
+  goto __pyx_L0;
+
+  /* "View.MemoryView":554
+ * 
+ * 
+ *     @property             # <<<<<<<<<<<<<<
+ *     def T(self):
+ *         cdef _memoryviewslice result = memoryview_copy(self)
+ */
+
+  /* function exit code */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_AddTraceback("View.MemoryView.memoryview.T.__get__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __pyx_L0:;
+  __Pyx_XDECREF((PyObject *)__pyx_v_result); /* runs on both the success and the error path; result may still be NULL here */
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":560
+ *         return result
+ * 
+ *     @property             # <<<<<<<<<<<<<<
+ *     def base(self):
+ *         return self._get_base()
+ */
+
+/* Python wrapper */
+static PyObject *__pyx_pw_15View_dot_MemoryView_10memoryview_4base_1__get__(PyObject *__pyx_v_self); /*proto*/
+static PyObject *__pyx_pw_15View_dot_MemoryView_10memoryview_4base_1__get__(PyObject *__pyx_v_self) { /* Wrapper for the memoryview.base getter: casts self to the memoryview struct and returns the __pyx_pf_ implementation's result unchanged. */
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__get__ (wrapper)", 0);
+  __pyx_kwvalues = __Pyx_KwValues_VARARGS(__pyx_args, __pyx_nargs); /* NOTE(review): __pyx_args/__pyx_nargs are not declared in this function; presumably the macro discards its arguments -- confirm against the macro definition. */
+  __pyx_r = __pyx_pf_15View_dot_MemoryView_10memoryview_4base___get__(((struct __pyx_memoryview_obj *)__pyx_v_self));
+
+  /* function exit code */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static PyObject *__pyx_pf_15View_dot_MemoryView_10memoryview_4base___get__(struct __pyx_memoryview_obj *__pyx_v_self) { /* Getter for memoryview.base: returns whatever the _get_base slot of self's vtable yields; NULL on error. */
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("__get__", 1);
+
+  /* "View.MemoryView":562
+ *     @property
+ *     def base(self):
+ *         return self._get_base()             # <<<<<<<<<<<<<<
+ * 
+ *     cdef _get_base(self):
+ */
+  __Pyx_XDECREF(__pyx_r);
+  __pyx_t_1 = ((struct __pyx_vtabstruct_memoryview *)__pyx_v_self->__pyx_vtab)->_get_base(__pyx_v_self); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 562, __pyx_L1_error) /* dispatched through the vtable rather than calling __pyx_memoryview__get_base directly */
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_r = __pyx_t_1; /* the reference moves from the temporary to the return value */
+  __pyx_t_1 = 0;
+  goto __pyx_L0;
+
+  /* "View.MemoryView":560
+ *         return result
+ * 
+ *     @property             # <<<<<<<<<<<<<<
+ *     def base(self):
+ *         return self._get_base()
+ */
+
+  /* function exit code */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_AddTraceback("View.MemoryView.memoryview.base.__get__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":564
+ *         return self._get_base()
+ * 
+ *     cdef _get_base(self):             # <<<<<<<<<<<<<<
+ *         return self.obj
+ * 
+ */
+
+static PyObject *__pyx_memoryview__get_base(struct __pyx_memoryview_obj *__pyx_v_self) { /* cdef memoryview._get_base: returns a new reference to self->obj; this function has no error path. */
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("_get_base", 1);
+
+  /* "View.MemoryView":565
+ * 
+ *     cdef _get_base(self):
+ *         return self.obj             # <<<<<<<<<<<<<<
+ * 
+ *     @property
+ */
+  __Pyx_XDECREF(__pyx_r);
+  __Pyx_INCREF(__pyx_v_self->obj); /* the caller receives its own reference; self keeps its reference to obj */
+  __pyx_r = __pyx_v_self->obj;
+  goto __pyx_L0;
+
+  /* "View.MemoryView":564
+ *         return self._get_base()
+ * 
+ *     cdef _get_base(self):             # <<<<<<<<<<<<<<
+ *         return self.obj
+ * 
+ */
+
+  /* function exit code */
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":567
+ *         return self.obj
+ * 
+ *     @property             # <<<<<<<<<<<<<<
+ *     def shape(self):
+ *         return tuple([length for length in self.view.shape[:self.view.ndim]])
+ */
+
+/* Python wrapper */
+static PyObject *__pyx_pw_15View_dot_MemoryView_10memoryview_5shape_1__get__(PyObject *__pyx_v_self); /*proto*/
+static PyObject *__pyx_pw_15View_dot_MemoryView_10memoryview_5shape_1__get__(PyObject *__pyx_v_self) { /* Wrapper for the memoryview.shape getter: casts self to the memoryview struct and returns the __pyx_pf_ implementation's result unchanged. */
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__get__ (wrapper)", 0);
+  __pyx_kwvalues = __Pyx_KwValues_VARARGS(__pyx_args, __pyx_nargs); /* NOTE(review): __pyx_args/__pyx_nargs are not declared in this function; presumably the macro discards its arguments -- confirm against the macro definition. */
+  __pyx_r = __pyx_pf_15View_dot_MemoryView_10memoryview_5shape___get__(((struct __pyx_memoryview_obj *)__pyx_v_self));
+
+  /* function exit code */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static PyObject *__pyx_pf_15View_dot_MemoryView_10memoryview_5shape___get__(struct __pyx_memoryview_obj *__pyx_v_self) { /* Getter for memoryview.shape: returns a new tuple holding the first view.ndim entries of view.shape as Python ints; NULL on error. */
+  Py_ssize_t __pyx_7genexpr__pyx_v_length;
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  Py_ssize_t *__pyx_t_2;
+  Py_ssize_t *__pyx_t_3;
+  Py_ssize_t *__pyx_t_4;
+  PyObject *__pyx_t_5 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("__get__", 1);
+
+  /* "View.MemoryView":569
+ *     @property
+ *     def shape(self):
+ *         return tuple([length for length in self.view.shape[:self.view.ndim]])             # <<<<<<<<<<<<<<
+ * 
+ *     @property
+ */
+  __Pyx_XDECREF(__pyx_r);
+  { /* enter inner scope */
+    __pyx_t_1 = PyList_New(0); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 569, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_1);
+    __pyx_t_3 = (__pyx_v_self->view.shape + __pyx_v_self->view.ndim); /* one-past-the-end pointer for the ndim extents */
+    for (__pyx_t_4 = __pyx_v_self->view.shape; __pyx_t_4 < __pyx_t_3; __pyx_t_4++) {
+      __pyx_t_2 = __pyx_t_4;
+      __pyx_7genexpr__pyx_v_length = (__pyx_t_2[0]);
+      __pyx_t_5 = PyInt_FromSsize_t(__pyx_7genexpr__pyx_v_length); if (unlikely(!__pyx_t_5)) __PYX_ERR(1, 569, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_5);
+      if (unlikely(__Pyx_ListComp_Append(__pyx_t_1, (PyObject*)__pyx_t_5))) __PYX_ERR(1, 569, __pyx_L1_error)
+      __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; /* the temporary's own reference is released once the append has succeeded */
+    }
+  } /* exit inner scope */
+  __pyx_t_5 = PyList_AsTuple(((PyObject*)__pyx_t_1)); if (unlikely(!__pyx_t_5)) __PYX_ERR(1, 569, __pyx_L1_error) /* list -> tuple; the temporary list is released right after */
+  __Pyx_GOTREF(__pyx_t_5);
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+  __pyx_r = __pyx_t_5;
+  __pyx_t_5 = 0;
+  goto __pyx_L0;
+
+  /* "View.MemoryView":567
+ *         return self.obj
+ * 
+ *     @property             # <<<<<<<<<<<<<<
+ *     def shape(self):
+ *         return tuple([length for length in self.view.shape[:self.view.ndim]])
+ */
+
+  /* function exit code */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_XDECREF(__pyx_t_5);
+  __Pyx_AddTraceback("View.MemoryView.memoryview.shape.__get__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":571
+ *         return tuple([length for length in self.view.shape[:self.view.ndim]])
+ * 
+ *     @property             # <<<<<<<<<<<<<<
+ *     def strides(self):
+ *         if self.view.strides == NULL:
+ */
+
+/* Python wrapper */
+static PyObject *__pyx_pw_15View_dot_MemoryView_10memoryview_7strides_1__get__(PyObject *__pyx_v_self); /*proto*/
+static PyObject *__pyx_pw_15View_dot_MemoryView_10memoryview_7strides_1__get__(PyObject *__pyx_v_self) { /* Wrapper for the memoryview.strides getter: casts self to the memoryview struct and returns the __pyx_pf_ implementation's result unchanged. */
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__get__ (wrapper)", 0);
+  __pyx_kwvalues = __Pyx_KwValues_VARARGS(__pyx_args, __pyx_nargs); /* NOTE(review): __pyx_args/__pyx_nargs are not declared in this function; presumably the macro discards its arguments -- confirm against the macro definition. */
+  __pyx_r = __pyx_pf_15View_dot_MemoryView_10memoryview_7strides___get__(((struct __pyx_memoryview_obj *)__pyx_v_self));
+
+  /* function exit code */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static PyObject *__pyx_pf_15View_dot_MemoryView_10memoryview_7strides___get__(struct __pyx_memoryview_obj *__pyx_v_self) { /* Getter for memoryview.strides: raises ValueError when view.strides is NULL, otherwise returns a new tuple of the first view.ndim strides as Python ints; NULL on error. */
+  Py_ssize_t __pyx_8genexpr1__pyx_v_stride;
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  int __pyx_t_1;
+  PyObject *__pyx_t_2 = NULL;
+  Py_ssize_t *__pyx_t_3;
+  Py_ssize_t *__pyx_t_4;
+  Py_ssize_t *__pyx_t_5;
+  PyObject *__pyx_t_6 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("__get__", 1);
+
+  /* "View.MemoryView":573
+ *     @property
+ *     def strides(self):
+ *         if self.view.strides == NULL:             # <<<<<<<<<<<<<<
+ * 
+ *             raise ValueError, "Buffer view does not expose strides"
+ */
+  __pyx_t_1 = (__pyx_v_self->view.strides == NULL); /* guard: the loop below dereferences view.strides */
+  if (unlikely(__pyx_t_1)) {
+
+    /* "View.MemoryView":575
+ *         if self.view.strides == NULL:
+ * 
+ *             raise ValueError, "Buffer view does not expose strides"             # <<<<<<<<<<<<<<
+ * 
+ *         return tuple([stride for stride in self.view.strides[:self.view.ndim]])
+ */
+    __Pyx_Raise(__pyx_builtin_ValueError, __pyx_kp_s_Buffer_view_does_not_expose_stri, 0, 0);
+    __PYX_ERR(1, 575, __pyx_L1_error)
+
+    /* "View.MemoryView":573
+ *     @property
+ *     def strides(self):
+ *         if self.view.strides == NULL:             # <<<<<<<<<<<<<<
+ * 
+ *             raise ValueError, "Buffer view does not expose strides"
+ */
+  }
+
+  /* "View.MemoryView":577
+ *             raise ValueError, "Buffer view does not expose strides"
+ * 
+ *         return tuple([stride for stride in self.view.strides[:self.view.ndim]])             # <<<<<<<<<<<<<<
+ * 
+ *     @property
+ */
+  __Pyx_XDECREF(__pyx_r);
+  { /* enter inner scope */
+    __pyx_t_2 = PyList_New(0); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 577, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_2);
+    __pyx_t_4 = (__pyx_v_self->view.strides + __pyx_v_self->view.ndim); /* one-past-the-end pointer for the ndim strides */
+    for (__pyx_t_5 = __pyx_v_self->view.strides; __pyx_t_5 < __pyx_t_4; __pyx_t_5++) {
+      __pyx_t_3 = __pyx_t_5;
+      __pyx_8genexpr1__pyx_v_stride = (__pyx_t_3[0]);
+      __pyx_t_6 = PyInt_FromSsize_t(__pyx_8genexpr1__pyx_v_stride); if (unlikely(!__pyx_t_6)) __PYX_ERR(1, 577, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_6);
+      if (unlikely(__Pyx_ListComp_Append(__pyx_t_2, (PyObject*)__pyx_t_6))) __PYX_ERR(1, 577, __pyx_L1_error)
+      __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; /* the temporary's own reference is released once the append has succeeded */
+    }
+  } /* exit inner scope */
+  __pyx_t_6 = PyList_AsTuple(((PyObject*)__pyx_t_2)); if (unlikely(!__pyx_t_6)) __PYX_ERR(1, 577, __pyx_L1_error) /* list -> tuple; the temporary list is released right after */
+  __Pyx_GOTREF(__pyx_t_6);
+  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+  __pyx_r = __pyx_t_6;
+  __pyx_t_6 = 0;
+  goto __pyx_L0;
+
+  /* "View.MemoryView":571
+ *         return tuple([length for length in self.view.shape[:self.view.ndim]])
+ * 
+ *     @property             # <<<<<<<<<<<<<<
+ *     def strides(self):
+ *         if self.view.strides == NULL:
+ */
+
+  /* function exit code */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_2);
+  __Pyx_XDECREF(__pyx_t_6);
+  __Pyx_AddTraceback("View.MemoryView.memoryview.strides.__get__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":579
+ *         return tuple([stride for stride in self.view.strides[:self.view.ndim]])
+ * 
+ *     @property             # <<<<<<<<<<<<<<
+ *     def suboffsets(self):
+ *         if self.view.suboffsets == NULL:
+ */
+
+/* Python wrapper */
+static PyObject *__pyx_pw_15View_dot_MemoryView_10memoryview_10suboffsets_1__get__(PyObject *__pyx_v_self); /*proto*/
+static PyObject *__pyx_pw_15View_dot_MemoryView_10memoryview_10suboffsets_1__get__(PyObject *__pyx_v_self) { /* Wrapper for the memoryview.suboffsets getter: casts self to the memoryview struct and returns the __pyx_pf_ implementation's result unchanged. */
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__get__ (wrapper)", 0);
+  __pyx_kwvalues = __Pyx_KwValues_VARARGS(__pyx_args, __pyx_nargs); /* NOTE(review): __pyx_args/__pyx_nargs are not declared in this function; presumably the macro discards its arguments -- confirm against the macro definition. */
+  __pyx_r = __pyx_pf_15View_dot_MemoryView_10memoryview_10suboffsets___get__(((struct __pyx_memoryview_obj *)__pyx_v_self));
+
+  /* function exit code */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static PyObject *__pyx_pf_15View_dot_MemoryView_10memoryview_10suboffsets___get__(struct __pyx_memoryview_obj *__pyx_v_self) { /* Getter for memoryview.suboffsets: when view.suboffsets is NULL returns __pyx_tuple__4 repeated view.ndim times, otherwise a new tuple of the first view.ndim suboffsets as Python ints; NULL on error. */
+  Py_ssize_t __pyx_8genexpr2__pyx_v_suboffset;
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  int __pyx_t_1;
+  PyObject *__pyx_t_2 = NULL;
+  Py_ssize_t *__pyx_t_3;
+  Py_ssize_t *__pyx_t_4;
+  Py_ssize_t *__pyx_t_5;
+  PyObject *__pyx_t_6 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("__get__", 1);
+
+  /* "View.MemoryView":581
+ *     @property
+ *     def suboffsets(self):
+ *         if self.view.suboffsets == NULL:             # <<<<<<<<<<<<<<
+ *             return (-1,) * self.view.ndim
+ * 
+ */
+  __pyx_t_1 = (__pyx_v_self->view.suboffsets == NULL); /* unlike the strides getter, a NULL array is not an error here */
+  if (__pyx_t_1) {
+
+    /* "View.MemoryView":582
+ *     def suboffsets(self):
+ *         if self.view.suboffsets == NULL:
+ *             return (-1,) * self.view.ndim             # <<<<<<<<<<<<<<
+ * 
+ *         return tuple([suboffset for suboffset in self.view.suboffsets[:self.view.ndim]])
+ */
+    __Pyx_XDECREF(__pyx_r);
+    __pyx_t_2 = __Pyx_PySequence_Multiply(__pyx_tuple__4, __pyx_v_self->view.ndim); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 582, __pyx_L1_error) /* __pyx_tuple__4 is presumably the cached (-1,) constant from the Cython source quoted above -- defined elsewhere in this file */
+    __Pyx_GOTREF(__pyx_t_2);
+    __pyx_r = __pyx_t_2;
+    __pyx_t_2 = 0;
+    goto __pyx_L0;
+
+    /* "View.MemoryView":581
+ *     @property
+ *     def suboffsets(self):
+ *         if self.view.suboffsets == NULL:             # <<<<<<<<<<<<<<
+ *             return (-1,) * self.view.ndim
+ * 
+ */
+  }
+
+  /* "View.MemoryView":584
+ *             return (-1,) * self.view.ndim
+ * 
+ *         return tuple([suboffset for suboffset in self.view.suboffsets[:self.view.ndim]])             # <<<<<<<<<<<<<<
+ * 
+ *     @property
+ */
+  __Pyx_XDECREF(__pyx_r);
+  { /* enter inner scope */
+    __pyx_t_2 = PyList_New(0); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 584, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_2);
+    __pyx_t_4 = (__pyx_v_self->view.suboffsets + __pyx_v_self->view.ndim); /* one-past-the-end pointer for the ndim suboffsets */
+    for (__pyx_t_5 = __pyx_v_self->view.suboffsets; __pyx_t_5 < __pyx_t_4; __pyx_t_5++) {
+      __pyx_t_3 = __pyx_t_5;
+      __pyx_8genexpr2__pyx_v_suboffset = (__pyx_t_3[0]);
+      __pyx_t_6 = PyInt_FromSsize_t(__pyx_8genexpr2__pyx_v_suboffset); if (unlikely(!__pyx_t_6)) __PYX_ERR(1, 584, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_6);
+      if (unlikely(__Pyx_ListComp_Append(__pyx_t_2, (PyObject*)__pyx_t_6))) __PYX_ERR(1, 584, __pyx_L1_error)
+      __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; /* the temporary's own reference is released once the append has succeeded */
+    }
+  } /* exit inner scope */
+  __pyx_t_6 = PyList_AsTuple(((PyObject*)__pyx_t_2)); if (unlikely(!__pyx_t_6)) __PYX_ERR(1, 584, __pyx_L1_error) /* list -> tuple; the temporary list is released right after */
+  __Pyx_GOTREF(__pyx_t_6);
+  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+  __pyx_r = __pyx_t_6;
+  __pyx_t_6 = 0;
+  goto __pyx_L0;
+
+  /* "View.MemoryView":579
+ *         return tuple([stride for stride in self.view.strides[:self.view.ndim]])
+ * 
+ *     @property             # <<<<<<<<<<<<<<
+ *     def suboffsets(self):
+ *         if self.view.suboffsets == NULL:
+ */
+
+  /* function exit code */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_2);
+  __Pyx_XDECREF(__pyx_t_6);
+  __Pyx_AddTraceback("View.MemoryView.memoryview.suboffsets.__get__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":586
+ *         return tuple([suboffset for suboffset in self.view.suboffsets[:self.view.ndim]])
+ * 
+ *     @property             # <<<<<<<<<<<<<<
+ *     def ndim(self):
+ *         return self.view.ndim
+ */
+
+/* Python wrapper */
+static PyObject *__pyx_pw_15View_dot_MemoryView_10memoryview_4ndim_1__get__(PyObject *__pyx_v_self); /*proto*/
+static PyObject *__pyx_pw_15View_dot_MemoryView_10memoryview_4ndim_1__get__(PyObject *__pyx_v_self) { /* Wrapper for the memoryview.ndim getter: casts self to the memoryview struct and returns the __pyx_pf_ implementation's result unchanged. */
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__get__ (wrapper)", 0);
+  __pyx_kwvalues = __Pyx_KwValues_VARARGS(__pyx_args, __pyx_nargs); /* NOTE(review): __pyx_args/__pyx_nargs are not declared in this function; presumably the macro discards its arguments -- confirm against the macro definition. */
+  __pyx_r = __pyx_pf_15View_dot_MemoryView_10memoryview_4ndim___get__(((struct __pyx_memoryview_obj *)__pyx_v_self));
+
+  /* function exit code */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static PyObject *__pyx_pf_15View_dot_MemoryView_10memoryview_4ndim___get__(struct __pyx_memoryview_obj *__pyx_v_self) { /* Getter body for memoryview.ndim: returns self->view.ndim converted by __Pyx_PyInt_From_int, or NULL after recording a traceback entry. */
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("__get__", 1);
+
+  /* "View.MemoryView":588
+ *     @property
+ *     def ndim(self):
+ *         return self.view.ndim             # <<<<<<<<<<<<<<
+ * 
+ *     @property
+ */
+  __Pyx_XDECREF(__pyx_r);
+  __pyx_t_1 = __Pyx_PyInt_From_int(__pyx_v_self->view.ndim); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 588, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_r = __pyx_t_1; /* hand the temporary over to the return slot */
+  __pyx_t_1 = 0; /* cleared so the temporary is no longer tracked separately */
+  goto __pyx_L0;
+
+  /* "View.MemoryView":586
+ *         return tuple([suboffset for suboffset in self.view.suboffsets[:self.view.ndim]])
+ * 
+ *     @property             # <<<<<<<<<<<<<<
+ *     def ndim(self):
+ *         return self.view.ndim
+ */
+
+  /* function exit code */
+  __pyx_L1_error:; /* error path: release the temporary, add a traceback frame, return NULL */
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_AddTraceback("View.MemoryView.memoryview.ndim.__get__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __pyx_L0:; /* common exit shared by the success and error paths */
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":590
+ *         return self.view.ndim
+ * 
+ *     @property             # <<<<<<<<<<<<<<
+ *     def itemsize(self):
+ *         return self.view.itemsize
+ */
+
+/* Python wrapper for the memoryview.itemsize property getter: casts self to the memoryview struct and delegates to the implementation function. */
+static PyObject *__pyx_pw_15View_dot_MemoryView_10memoryview_8itemsize_1__get__(PyObject *__pyx_v_self); /*proto*/
+static PyObject *__pyx_pw_15View_dot_MemoryView_10memoryview_8itemsize_1__get__(PyObject *__pyx_v_self) {
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__get__ (wrapper)", 0);
+  __pyx_kwvalues = __Pyx_KwValues_VARARGS(__pyx_args, __pyx_nargs); /* NOTE(review): __pyx_args/__pyx_nargs are not declared in this function; presumably the macro discards its arguments -- confirm against its definition */
+  __pyx_r = __pyx_pf_15View_dot_MemoryView_10memoryview_8itemsize___get__(((struct __pyx_memoryview_obj *)__pyx_v_self));
+
+  /* function exit code */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r; /* the implementation's result (possibly NULL) is passed through unchanged */
+}
+
+static PyObject *__pyx_pf_15View_dot_MemoryView_10memoryview_8itemsize___get__(struct __pyx_memoryview_obj *__pyx_v_self) { /* Getter body for memoryview.itemsize: returns self->view.itemsize converted by PyInt_FromSsize_t, or NULL after recording a traceback entry. */
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("__get__", 1);
+
+  /* "View.MemoryView":592
+ *     @property
+ *     def itemsize(self):
+ *         return self.view.itemsize             # <<<<<<<<<<<<<<
+ * 
+ *     @property
+ */
+  __Pyx_XDECREF(__pyx_r);
+  __pyx_t_1 = PyInt_FromSsize_t(__pyx_v_self->view.itemsize); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 592, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_r = __pyx_t_1; /* hand the temporary over to the return slot */
+  __pyx_t_1 = 0; /* cleared so the temporary is no longer tracked separately */
+  goto __pyx_L0;
+
+  /* "View.MemoryView":590
+ *         return self.view.ndim
+ * 
+ *     @property             # <<<<<<<<<<<<<<
+ *     def itemsize(self):
+ *         return self.view.itemsize
+ */
+
+  /* function exit code */
+  __pyx_L1_error:; /* error path: release the temporary, add a traceback frame, return NULL */
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_AddTraceback("View.MemoryView.memoryview.itemsize.__get__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __pyx_L0:; /* common exit shared by the success and error paths */
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":594
+ *         return self.view.itemsize
+ * 
+ *     @property             # <<<<<<<<<<<<<<
+ *     def nbytes(self):
+ *         return self.size * self.view.itemsize
+ */
+
+/* Python wrapper for the memoryview.nbytes property getter: casts self to the memoryview struct and delegates to the implementation function. */
+static PyObject *__pyx_pw_15View_dot_MemoryView_10memoryview_6nbytes_1__get__(PyObject *__pyx_v_self); /*proto*/
+static PyObject *__pyx_pw_15View_dot_MemoryView_10memoryview_6nbytes_1__get__(PyObject *__pyx_v_self) {
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__get__ (wrapper)", 0);
+  __pyx_kwvalues = __Pyx_KwValues_VARARGS(__pyx_args, __pyx_nargs); /* NOTE(review): __pyx_args/__pyx_nargs are not declared in this function; presumably the macro discards its arguments -- confirm against its definition */
+  __pyx_r = __pyx_pf_15View_dot_MemoryView_10memoryview_6nbytes___get__(((struct __pyx_memoryview_obj *)__pyx_v_self));
+
+  /* function exit code */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r; /* the implementation's result (possibly NULL) is passed through unchanged */
+}
+
+static PyObject *__pyx_pf_15View_dot_MemoryView_10memoryview_6nbytes___get__(struct __pyx_memoryview_obj *__pyx_v_self) { /* Getter body for memoryview.nbytes: multiplies the object's "size" attribute by view.itemsize; returns NULL after recording a traceback entry on failure. */
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  PyObject *__pyx_t_2 = NULL;
+  PyObject *__pyx_t_3 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("__get__", 1);
+
+  /* "View.MemoryView":596
+ *     @property
+ *     def nbytes(self):
+ *         return self.size * self.view.itemsize             # <<<<<<<<<<<<<<
+ * 
+ *     @property
+ */
+  __Pyx_XDECREF(__pyx_r);
+  __pyx_t_1 = __Pyx_PyObject_GetAttrStr(((PyObject *)__pyx_v_self), __pyx_n_s_size); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 596, __pyx_L1_error) /* size is fetched by attribute lookup on self, not read from a struct field */
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_t_2 = PyInt_FromSsize_t(__pyx_v_self->view.itemsize); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 596, __pyx_L1_error) /* itemsize is boxed so the product is computed on Python objects */
+  __Pyx_GOTREF(__pyx_t_2);
+  __pyx_t_3 = PyNumber_Multiply(__pyx_t_1, __pyx_t_2); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 596, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_3);
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; /* operands are no longer needed once the product exists */
+  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+  __pyx_r = __pyx_t_3; /* hand the product over to the return slot */
+  __pyx_t_3 = 0;
+  goto __pyx_L0;
+
+  /* "View.MemoryView":594
+ *         return self.view.itemsize
+ * 
+ *     @property             # <<<<<<<<<<<<<<
+ *     def nbytes(self):
+ *         return self.size * self.view.itemsize
+ */
+
+  /* function exit code */
+  __pyx_L1_error:; /* error path: release whichever temporaries are still set, add a traceback frame, return NULL */
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_XDECREF(__pyx_t_2);
+  __Pyx_XDECREF(__pyx_t_3);
+  __Pyx_AddTraceback("View.MemoryView.memoryview.nbytes.__get__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __pyx_L0:; /* common exit shared by the success and error paths */
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":598
+ *         return self.size * self.view.itemsize
+ * 
+ *     @property             # <<<<<<<<<<<<<<
+ *     def size(self):
+ *         if self._size is None:
+ */
+
+/* Python wrapper for the memoryview.size property getter: casts self to the memoryview struct and delegates to the implementation function. */
+static PyObject *__pyx_pw_15View_dot_MemoryView_10memoryview_4size_1__get__(PyObject *__pyx_v_self); /*proto*/
+static PyObject *__pyx_pw_15View_dot_MemoryView_10memoryview_4size_1__get__(PyObject *__pyx_v_self) {
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__get__ (wrapper)", 0);
+  __pyx_kwvalues = __Pyx_KwValues_VARARGS(__pyx_args, __pyx_nargs); /* NOTE(review): __pyx_args/__pyx_nargs are not declared in this function; presumably the macro discards its arguments -- confirm against its definition */
+  __pyx_r = __pyx_pf_15View_dot_MemoryView_10memoryview_4size___get__(((struct __pyx_memoryview_obj *)__pyx_v_self));
+
+  /* function exit code */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r; /* the implementation's result (possibly NULL) is passed through unchanged */
+}
+
+static PyObject *__pyx_pf_15View_dot_MemoryView_10memoryview_4size___get__(struct __pyx_memoryview_obj *__pyx_v_self) { /* Getter body for memoryview.size: when self->_size is None, multiplies the first view.ndim entries of view.shape together and stores the product in self->_size; returns self->_size, or NULL after recording a traceback entry. */
+  PyObject *__pyx_v_result = NULL;
+  PyObject *__pyx_v_length = NULL;
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  int __pyx_t_1;
+  Py_ssize_t *__pyx_t_2;
+  Py_ssize_t *__pyx_t_3;
+  Py_ssize_t *__pyx_t_4;
+  PyObject *__pyx_t_5 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("__get__", 1);
+
+  /* "View.MemoryView":600
+ *     @property
+ *     def size(self):
+ *         if self._size is None:             # <<<<<<<<<<<<<<
+ *             result = 1
+ * 
+ */
+  __pyx_t_1 = (__pyx_v_self->_size == Py_None); /* identity test: compute only when no value has been stored yet */
+  if (__pyx_t_1) {
+
+    /* "View.MemoryView":601
+ *     def size(self):
+ *         if self._size is None:
+ *             result = 1             # <<<<<<<<<<<<<<
+ * 
+ *             for length in self.view.shape[:self.view.ndim]:
+ */
+    __Pyx_INCREF(__pyx_int_1); /* the local takes its own reference to the constant 1 */
+    __pyx_v_result = __pyx_int_1;
+
+    /* "View.MemoryView":603
+ *             result = 1
+ * 
+ *             for length in self.view.shape[:self.view.ndim]:             # <<<<<<<<<<<<<<
+ *                 result *= length
+ * 
+ */
+    __pyx_t_3 = (__pyx_v_self->view.shape + __pyx_v_self->view.ndim); /* one-past-the-end pointer for the first ndim shape entries */
+    for (__pyx_t_4 = __pyx_v_self->view.shape; __pyx_t_4 < __pyx_t_3; __pyx_t_4++) {
+      __pyx_t_2 = __pyx_t_4;
+      __pyx_t_5 = PyInt_FromSsize_t((__pyx_t_2[0])); if (unlikely(!__pyx_t_5)) __PYX_ERR(1, 603, __pyx_L1_error) /* box the current extent as a Python object */
+      __Pyx_GOTREF(__pyx_t_5);
+      __Pyx_XDECREF_SET(__pyx_v_length, __pyx_t_5); /* replaces the previous iteration's length, if any */
+      __pyx_t_5 = 0;
+
+      /* "View.MemoryView":604
+ * 
+ *             for length in self.view.shape[:self.view.ndim]:
+ *                 result *= length             # <<<<<<<<<<<<<<
+ * 
+ *             self._size = result
+ */
+      __pyx_t_5 = PyNumber_InPlaceMultiply(__pyx_v_result, __pyx_v_length); if (unlikely(!__pyx_t_5)) __PYX_ERR(1, 604, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_5);
+      __Pyx_DECREF_SET(__pyx_v_result, __pyx_t_5); /* the running product replaces the old result */
+      __pyx_t_5 = 0;
+    }
+
+    /* "View.MemoryView":606
+ *                 result *= length
+ * 
+ *             self._size = result             # <<<<<<<<<<<<<<
+ * 
+ *         return self._size
+ */
+    __Pyx_INCREF(__pyx_v_result); /* extra reference for the struct field; the local keeps its own until the exit code */
+    __Pyx_GIVEREF(__pyx_v_result);
+    __Pyx_GOTREF(__pyx_v_self->_size);
+    __Pyx_DECREF(__pyx_v_self->_size); /* drop the previously stored value (None on this branch) */
+    __pyx_v_self->_size = __pyx_v_result;
+
+    /* "View.MemoryView":600
+ *     @property
+ *     def size(self):
+ *         if self._size is None:             # <<<<<<<<<<<<<<
+ *             result = 1
+ * 
+ */
+  }
+
+  /* "View.MemoryView":608
+ *             self._size = result
+ * 
+ *         return self._size             # <<<<<<<<<<<<<<
+ * 
+ *     def __len__(self):
+ */
+  __Pyx_XDECREF(__pyx_r);
+  __Pyx_INCREF(__pyx_v_self->_size); /* the caller receives its own reference to the stored value */
+  __pyx_r = __pyx_v_self->_size;
+  goto __pyx_L0;
+
+  /* "View.MemoryView":598
+ *         return self.size * self.view.itemsize
+ * 
+ *     @property             # <<<<<<<<<<<<<<
+ *     def size(self):
+ *         if self._size is None:
+ */
+
+  /* function exit code */
+  __pyx_L1_error:; /* error path: release the pending temporary, add a traceback frame, return NULL */
+  __Pyx_XDECREF(__pyx_t_5);
+  __Pyx_AddTraceback("View.MemoryView.memoryview.size.__get__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __pyx_L0:; /* common exit: the locals are released on both the success and the error path */
+  __Pyx_XDECREF(__pyx_v_result);
+  __Pyx_XDECREF(__pyx_v_length);
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":610
+ *         return self._size
+ * 
+ *     def __len__(self):             # <<<<<<<<<<<<<<
+ *         if self.view.ndim >= 1:
+ *             return self.view.shape[0]
+ */
+
+/* Python wrapper for memoryview.__len__: casts self to the memoryview struct and delegates to the implementation function, returning its Py_ssize_t result. */
+static Py_ssize_t __pyx_memoryview___len__(PyObject *__pyx_v_self); /*proto*/
+static Py_ssize_t __pyx_memoryview___len__(PyObject *__pyx_v_self) {
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  Py_ssize_t __pyx_r;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__len__ (wrapper)", 0);
+  __pyx_kwvalues = __Pyx_KwValues_VARARGS(__pyx_args, __pyx_nargs); /* NOTE(review): __pyx_args/__pyx_nargs are not declared in this function; presumably the macro discards its arguments -- confirm against its definition */
+  __pyx_r = __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_10__len__(((struct __pyx_memoryview_obj *)__pyx_v_self));
+
+  /* function exit code */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r; /* the implementation's result is passed through unchanged */
+}
+
+static Py_ssize_t __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_10__len__(struct __pyx_memoryview_obj *__pyx_v_self) { /* Body of memoryview.__len__: returns view.shape[0] when view.ndim >= 1, otherwise 0. Creates no Python objects, so there is no error path. */
+  Py_ssize_t __pyx_r;
+  int __pyx_t_1;
+
+  /* "View.MemoryView":611
+ * 
+ *     def __len__(self):
+ *         if self.view.ndim >= 1:             # <<<<<<<<<<<<<<
+ *             return self.view.shape[0]
+ * 
+ */
+  __pyx_t_1 = (__pyx_v_self->view.ndim >= 1); /* shape[0] is only read when at least one dimension exists */
+  if (__pyx_t_1) {
+
+    /* "View.MemoryView":612
+ *     def __len__(self):
+ *         if self.view.ndim >= 1:
+ *             return self.view.shape[0]             # <<<<<<<<<<<<<<
+ * 
+ *         return 0
+ */
+    __pyx_r = (__pyx_v_self->view.shape[0]); /* length is the extent of the first dimension */
+    goto __pyx_L0;
+
+    /* "View.MemoryView":611
+ * 
+ *     def __len__(self):
+ *         if self.view.ndim >= 1:             # <<<<<<<<<<<<<<
+ *             return self.view.shape[0]
+ * 
+ */
+  }
+
+  /* "View.MemoryView":614
+ *             return self.view.shape[0]
+ * 
+ *         return 0             # <<<<<<<<<<<<<<
+ * 
+ *     def __repr__(self):
+ */
+  __pyx_r = 0; /* ndim < 1: report a length of zero */
+  goto __pyx_L0;
+
+  /* "View.MemoryView":610
+ *         return self._size
+ * 
+ *     def __len__(self):             # <<<<<<<<<<<<<<
+ *         if self.view.ndim >= 1:
+ *             return self.view.shape[0]
+ */
+
+  /* function exit code */
+  __pyx_L0:;
+  return __pyx_r;
+}
+
+/* "View.MemoryView":616
+ *         return 0
+ * 
+ *     def __repr__(self):             # <<<<<<<<<<<<<<
+ *         return "<MemoryView of %r at 0x%x>" % (self.base.__class__.__name__,
+ *                                                id(self))
+ */
+
+/* Python wrapper for memoryview.__repr__: casts self to the memoryview struct and delegates to the implementation function. */
+static PyObject *__pyx_memoryview___repr__(PyObject *__pyx_v_self); /*proto*/
+static PyObject *__pyx_memoryview___repr__(PyObject *__pyx_v_self) {
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__repr__ (wrapper)", 0);
+  __pyx_kwvalues = __Pyx_KwValues_VARARGS(__pyx_args, __pyx_nargs); /* NOTE(review): __pyx_args/__pyx_nargs are not declared in this function; presumably the macro discards its arguments -- confirm against its definition */
+  __pyx_r = __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_12__repr__(((struct __pyx_memoryview_obj *)__pyx_v_self));
+
+  /* function exit code */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r; /* the implementation's result (possibly NULL) is passed through unchanged */
+}
+
+static PyObject *__pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_12__repr__(struct __pyx_memoryview_obj *__pyx_v_self) { /* Body of memoryview.__repr__: formats a two-item tuple (self.base.__class__.__name__, id(self)) with the "<MemoryView of %r at 0x%x>" template; returns NULL after recording a traceback entry on failure. */
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  PyObject *__pyx_t_2 = NULL;
+  PyObject *__pyx_t_3 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("__repr__", 1);
+
+  /* "View.MemoryView":617
+ * 
+ *     def __repr__(self):
+ *         return "<MemoryView of %r at 0x%x>" % (self.base.__class__.__name__,             # <<<<<<<<<<<<<<
+ *                                                id(self))
+ * 
+ */
+  __Pyx_XDECREF(__pyx_r);
+  __pyx_t_1 = __Pyx_PyObject_GetAttrStr(((PyObject *)__pyx_v_self), __pyx_n_s_base); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 617, __pyx_L1_error) /* self.base */
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_class); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 617, __pyx_L1_error) /* .__class__ */
+  __Pyx_GOTREF(__pyx_t_2);
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+  __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_t_2, __pyx_n_s_name_2); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 617, __pyx_L1_error) /* .__name__; t_1 is reused for the final attribute */
+  __Pyx_GOTREF(__pyx_t_1);
+  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+
+  /* "View.MemoryView":618
+ *     def __repr__(self):
+ *         return "<MemoryView of %r at 0x%x>" % (self.base.__class__.__name__,
+ *                                                id(self))             # <<<<<<<<<<<<<<
+ * 
+ *     def __str__(self):
+ */
+  __pyx_t_2 = __Pyx_PyObject_CallOneArg(__pyx_builtin_id, ((PyObject *)__pyx_v_self)); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 618, __pyx_L1_error) /* id(self) */
+  __Pyx_GOTREF(__pyx_t_2);
+
+  /* "View.MemoryView":617
+ * 
+ *     def __repr__(self):
+ *         return "<MemoryView of %r at 0x%x>" % (self.base.__class__.__name__,             # <<<<<<<<<<<<<<
+ *                                                id(self))
+ * 
+ */
+  __pyx_t_3 = PyTuple_New(2); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 617, __pyx_L1_error) /* argument tuple for the format operation */
+  __Pyx_GOTREF(__pyx_t_3);
+  __Pyx_GIVEREF(__pyx_t_1);
+  if (__Pyx_PyTuple_SET_ITEM(__pyx_t_3, 0, __pyx_t_1)) __PYX_ERR(1, 617, __pyx_L1_error);
+  __Pyx_GIVEREF(__pyx_t_2);
+  if (__Pyx_PyTuple_SET_ITEM(__pyx_t_3, 1, __pyx_t_2)) __PYX_ERR(1, 617, __pyx_L1_error);
+  __pyx_t_1 = 0; /* temporaries are zeroed once placed in the tuple so the exit code does not release them */
+  __pyx_t_2 = 0;
+  __pyx_t_2 = __Pyx_PyString_Format(__pyx_kp_s_MemoryView_of_r_at_0x_x, __pyx_t_3); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 617, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_2);
+  __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; /* the argument tuple is no longer needed after formatting */
+  __pyx_r = __pyx_t_2; /* hand the formatted string over to the return slot */
+  __pyx_t_2 = 0;
+  goto __pyx_L0;
+
+  /* "View.MemoryView":616
+ *         return 0
+ * 
+ *     def __repr__(self):             # <<<<<<<<<<<<<<
+ *         return "<MemoryView of %r at 0x%x>" % (self.base.__class__.__name__,
+ *                                                id(self))
+ */
+
+  /* function exit code */
+  __pyx_L1_error:; /* error path: release whichever temporaries are still set, add a traceback frame, return NULL */
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_XDECREF(__pyx_t_2);
+  __Pyx_XDECREF(__pyx_t_3);
+  __Pyx_AddTraceback("View.MemoryView.memoryview.__repr__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __pyx_L0:; /* common exit shared by the success and error paths */
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":620
+ *                                                id(self))
+ * 
+ *     def __str__(self):             # <<<<<<<<<<<<<<
+ *         return "<MemoryView of %r object>" % (self.base.__class__.__name__,)
+ * 
+ */
+
+/* Python wrapper for memoryview.__str__: casts self to the memoryview struct and delegates to the implementation function. */
+static PyObject *__pyx_memoryview___str__(PyObject *__pyx_v_self); /*proto*/
+static PyObject *__pyx_memoryview___str__(PyObject *__pyx_v_self) {
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__str__ (wrapper)", 0);
+  __pyx_kwvalues = __Pyx_KwValues_VARARGS(__pyx_args, __pyx_nargs); /* NOTE(review): __pyx_args/__pyx_nargs are not declared in this function; presumably the macro discards its arguments -- confirm against its definition */
+  __pyx_r = __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_14__str__(((struct __pyx_memoryview_obj *)__pyx_v_self));
+
+  /* function exit code */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r; /* the implementation's result (possibly NULL) is passed through unchanged */
+}
+
+static PyObject *__pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_14__str__(struct __pyx_memoryview_obj *__pyx_v_self) { /* Body of memoryview.__str__: formats a one-item tuple (self.base.__class__.__name__,) with the "<MemoryView of %r object>" template; returns NULL after recording a traceback entry on failure. */
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  PyObject *__pyx_t_2 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("__str__", 1);
+
+  /* "View.MemoryView":621
+ * 
+ *     def __str__(self):
+ *         return "<MemoryView of %r object>" % (self.base.__class__.__name__,)             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  __Pyx_XDECREF(__pyx_r);
+  __pyx_t_1 = __Pyx_PyObject_GetAttrStr(((PyObject *)__pyx_v_self), __pyx_n_s_base); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 621, __pyx_L1_error) /* self.base */
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_class); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 621, __pyx_L1_error) /* .__class__ */
+  __Pyx_GOTREF(__pyx_t_2);
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+  __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_t_2, __pyx_n_s_name_2); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 621, __pyx_L1_error) /* .__name__; t_1 is reused for the final attribute */
+  __Pyx_GOTREF(__pyx_t_1);
+  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+  __pyx_t_2 = PyTuple_New(1); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 621, __pyx_L1_error) /* argument tuple for the format operation */
+  __Pyx_GOTREF(__pyx_t_2);
+  __Pyx_GIVEREF(__pyx_t_1);
+  if (__Pyx_PyTuple_SET_ITEM(__pyx_t_2, 0, __pyx_t_1)) __PYX_ERR(1, 621, __pyx_L1_error);
+  __pyx_t_1 = 0; /* zeroed once placed in the tuple so the exit code does not release it */
+  __pyx_t_1 = __Pyx_PyString_Format(__pyx_kp_s_MemoryView_of_r_object, __pyx_t_2); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 621, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; /* the argument tuple is no longer needed after formatting */
+  __pyx_r = __pyx_t_1; /* hand the formatted string over to the return slot */
+  __pyx_t_1 = 0;
+  goto __pyx_L0;
+
+  /* "View.MemoryView":620
+ *                                                id(self))
+ * 
+ *     def __str__(self):             # <<<<<<<<<<<<<<
+ *         return "<MemoryView of %r object>" % (self.base.__class__.__name__,)
+ * 
+ */
+
+  /* function exit code */
+  __pyx_L1_error:; /* error path: release whichever temporaries are still set, add a traceback frame, return NULL */
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_XDECREF(__pyx_t_2);
+  __Pyx_AddTraceback("View.MemoryView.memoryview.__str__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __pyx_L0:; /* common exit shared by the success and error paths */
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":624
+ * 
+ * 
+ *     def is_c_contig(self):             # <<<<<<<<<<<<<<
+ *         cdef __Pyx_memviewslice *mslice
+ *         cdef __Pyx_memviewslice tmp
+ */
+
+/* Python wrapper for memoryview.is_c_contig(): rejects positional and keyword arguments, then delegates to the implementation function. The signature depends on CYTHON_METH_FASTCALL. */
+static PyObject *__pyx_memoryview_is_c_contig(PyObject *__pyx_v_self, 
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+); /*proto*/
+static PyObject *__pyx_memoryview_is_c_contig(PyObject *__pyx_v_self, 
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+) {
+  #if !CYTHON_METH_FASTCALL
+  CYTHON_UNUSED Py_ssize_t __pyx_nargs; /* without fastcall the count is not a parameter and is derived from the args tuple below */
+  #endif
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("is_c_contig (wrapper)", 0);
+  #if !CYTHON_METH_FASTCALL
+  #if CYTHON_ASSUME_SAFE_MACROS
+  __pyx_nargs = PyTuple_GET_SIZE(__pyx_args);
+  #else
+  __pyx_nargs = PyTuple_Size(__pyx_args); if (unlikely(__pyx_nargs < 0)) return NULL;
+  #endif
+  #endif
+  __pyx_kwvalues = __Pyx_KwValues_FASTCALL(__pyx_args, __pyx_nargs);
+  if (unlikely(__pyx_nargs > 0)) { /* the method takes no positional arguments */
+    __Pyx_RaiseArgtupleInvalid("is_c_contig", 1, 0, 0, __pyx_nargs); return NULL;} /* NOTE(review): returns without calling __Pyx_RefNannyFinishContext -- presumably harmless when refnanny is disabled; confirm */
+  if (unlikely(__pyx_kwds) && __Pyx_NumKwargs_FASTCALL(__pyx_kwds) && unlikely(!__Pyx_CheckKeywordStrings(__pyx_kwds, "is_c_contig", 0))) return NULL; /* non-empty keywords are checked by __Pyx_CheckKeywordStrings; a failed check returns NULL */
+  __pyx_r = __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_16is_c_contig(((struct __pyx_memoryview_obj *)__pyx_v_self));
+
+  /* function exit code */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r; /* the implementation's result (possibly NULL) is passed through unchanged */
+}
+
+static PyObject *__pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_16is_c_contig(struct __pyx_memoryview_obj *__pyx_v_self) { /* Body of memoryview.is_c_contig(): obtains a slice for self and returns the 'C'-order result of __pyx_memviewslice_is_contig boxed by __Pyx_PyBool_FromLong; returns NULL after recording a traceback entry on failure. */
+  __Pyx_memviewslice *__pyx_v_mslice;
+  __Pyx_memviewslice __pyx_v_tmp; /* scratch slice whose address is passed to the slice lookup helper */
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  __Pyx_memviewslice *__pyx_t_1;
+  PyObject *__pyx_t_2 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("is_c_contig", 1);
+
+  /* "View.MemoryView":627
+ *         cdef __Pyx_memviewslice *mslice
+ *         cdef __Pyx_memviewslice tmp
+ *         mslice = get_slice_from_memview(self, &tmp)             # <<<<<<<<<<<<<<
+ *         return slice_is_contig(mslice[0], 'C', self.view.ndim)
+ * 
+ */
+  __pyx_t_1 = __pyx_memoryview_get_slice_from_memoryview(__pyx_v_self, (&__pyx_v_tmp)); if (unlikely(__pyx_t_1 == ((__Pyx_memviewslice *)NULL))) __PYX_ERR(1, 627, __pyx_L1_error) /* a NULL slice pointer is treated as an error */
+  __pyx_v_mslice = __pyx_t_1;
+
+  /* "View.MemoryView":628
+ *         cdef __Pyx_memviewslice tmp
+ *         mslice = get_slice_from_memview(self, &tmp)
+ *         return slice_is_contig(mslice[0], 'C', self.view.ndim)             # <<<<<<<<<<<<<<
+ * 
+ *     def is_f_contig(self):
+ */
+  __Pyx_XDECREF(__pyx_r);
+  __pyx_t_2 = __Pyx_PyBool_FromLong(__pyx_memviewslice_is_contig((__pyx_v_mslice[0]), 'C', __pyx_v_self->view.ndim)); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 628, __pyx_L1_error) /* the slice struct is passed by value together with the order character and ndim */
+  __Pyx_GOTREF(__pyx_t_2);
+  __pyx_r = __pyx_t_2; /* hand the result over to the return slot */
+  __pyx_t_2 = 0;
+  goto __pyx_L0;
+
+  /* "View.MemoryView":624
+ * 
+ * 
+ *     def is_c_contig(self):             # <<<<<<<<<<<<<<
+ *         cdef __Pyx_memviewslice *mslice
+ *         cdef __Pyx_memviewslice tmp
+ */
+
+  /* function exit code */
+  __pyx_L1_error:; /* error path: release the temporary, add a traceback frame, return NULL */
+  __Pyx_XDECREF(__pyx_t_2);
+  __Pyx_AddTraceback("View.MemoryView.memoryview.is_c_contig", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __pyx_L0:; /* common exit shared by the success and error paths */
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":630
+ *         return slice_is_contig(mslice[0], 'C', self.view.ndim)
+ * 
+ *     def is_f_contig(self):             # <<<<<<<<<<<<<<
+ *         cdef __Pyx_memviewslice *mslice
+ *         cdef __Pyx_memviewslice tmp
+ */
+
+/* Python wrapper for memoryview.is_f_contig(): rejects positional and keyword arguments, then delegates to the implementation function. The signature depends on CYTHON_METH_FASTCALL. */
+static PyObject *__pyx_memoryview_is_f_contig(PyObject *__pyx_v_self, 
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+); /*proto*/
+static PyObject *__pyx_memoryview_is_f_contig(PyObject *__pyx_v_self, 
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+) {
+  #if !CYTHON_METH_FASTCALL
+  CYTHON_UNUSED Py_ssize_t __pyx_nargs; /* without fastcall the count is not a parameter and is derived from the args tuple below */
+  #endif
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("is_f_contig (wrapper)", 0);
+  #if !CYTHON_METH_FASTCALL
+  #if CYTHON_ASSUME_SAFE_MACROS
+  __pyx_nargs = PyTuple_GET_SIZE(__pyx_args);
+  #else
+  __pyx_nargs = PyTuple_Size(__pyx_args); if (unlikely(__pyx_nargs < 0)) return NULL;
+  #endif
+  #endif
+  __pyx_kwvalues = __Pyx_KwValues_FASTCALL(__pyx_args, __pyx_nargs);
+  if (unlikely(__pyx_nargs > 0)) { /* the method takes no positional arguments */
+    __Pyx_RaiseArgtupleInvalid("is_f_contig", 1, 0, 0, __pyx_nargs); return NULL;} /* NOTE(review): returns without calling __Pyx_RefNannyFinishContext -- presumably harmless when refnanny is disabled; confirm */
+  if (unlikely(__pyx_kwds) && __Pyx_NumKwargs_FASTCALL(__pyx_kwds) && unlikely(!__Pyx_CheckKeywordStrings(__pyx_kwds, "is_f_contig", 0))) return NULL; /* non-empty keywords are checked by __Pyx_CheckKeywordStrings; a failed check returns NULL */
+  __pyx_r = __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_18is_f_contig(((struct __pyx_memoryview_obj *)__pyx_v_self));
+
+  /* function exit code */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r; /* the implementation's result (possibly NULL) is passed through unchanged */
+}
+
+static PyObject *__pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_18is_f_contig(struct __pyx_memoryview_obj *__pyx_v_self) { /* Body of memoryview.is_f_contig(): obtains a slice for self and returns the 'F'-order result of __pyx_memviewslice_is_contig boxed by __Pyx_PyBool_FromLong; returns NULL after recording a traceback entry on failure. */
+  __Pyx_memviewslice *__pyx_v_mslice;
+  __Pyx_memviewslice __pyx_v_tmp; /* scratch slice whose address is passed to the slice lookup helper */
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  __Pyx_memviewslice *__pyx_t_1;
+  PyObject *__pyx_t_2 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("is_f_contig", 1);
+
+  /* "View.MemoryView":633
+ *         cdef __Pyx_memviewslice *mslice
+ *         cdef __Pyx_memviewslice tmp
+ *         mslice = get_slice_from_memview(self, &tmp)             # <<<<<<<<<<<<<<
+ *         return slice_is_contig(mslice[0], 'F', self.view.ndim)
+ * 
+ */
+  __pyx_t_1 = __pyx_memoryview_get_slice_from_memoryview(__pyx_v_self, (&__pyx_v_tmp)); if (unlikely(__pyx_t_1 == ((__Pyx_memviewslice *)NULL))) __PYX_ERR(1, 633, __pyx_L1_error) /* a NULL slice pointer is treated as an error */
+  __pyx_v_mslice = __pyx_t_1;
+
+  /* "View.MemoryView":634
+ *         cdef __Pyx_memviewslice tmp
+ *         mslice = get_slice_from_memview(self, &tmp)
+ *         return slice_is_contig(mslice[0], 'F', self.view.ndim)             # <<<<<<<<<<<<<<
+ * 
+ *     def copy(self):
+ */
+  __Pyx_XDECREF(__pyx_r);
+  __pyx_t_2 = __Pyx_PyBool_FromLong(__pyx_memviewslice_is_contig((__pyx_v_mslice[0]), 'F', __pyx_v_self->view.ndim)); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 634, __pyx_L1_error) /* the slice struct is passed by value together with the order character and ndim */
+  __Pyx_GOTREF(__pyx_t_2);
+  __pyx_r = __pyx_t_2; /* hand the result over to the return slot */
+  __pyx_t_2 = 0;
+  goto __pyx_L0;
+
+  /* "View.MemoryView":630
+ *         return slice_is_contig(mslice[0], 'C', self.view.ndim)
+ * 
+ *     def is_f_contig(self):             # <<<<<<<<<<<<<<
+ *         cdef __Pyx_memviewslice *mslice
+ *         cdef __Pyx_memviewslice tmp
+ */
+
+  /* function exit code */
+  __pyx_L1_error:; /* error path: release the temporary, add a traceback frame, return NULL */
+  __Pyx_XDECREF(__pyx_t_2);
+  __Pyx_AddTraceback("View.MemoryView.memoryview.is_f_contig", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __pyx_L0:; /* common exit shared by the success and error paths */
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":636
+ *         return slice_is_contig(mslice[0], 'F', self.view.ndim)
+ * 
+ *     def copy(self):             # <<<<<<<<<<<<<<
+ *         cdef __Pyx_memviewslice mslice
+ *         cdef int flags = self.flags & ~PyBUF_F_CONTIGUOUS
+ */
+
+/* Python wrapper for memoryview.copy(): rejects positional and keyword arguments, then delegates to the implementation function. The signature depends on CYTHON_METH_FASTCALL. */
+static PyObject *__pyx_memoryview_copy(PyObject *__pyx_v_self, 
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+); /*proto*/
+static PyObject *__pyx_memoryview_copy(PyObject *__pyx_v_self, 
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+) {
+  #if !CYTHON_METH_FASTCALL
+  CYTHON_UNUSED Py_ssize_t __pyx_nargs; /* without fastcall the count is not a parameter and is derived from the args tuple below */
+  #endif
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("copy (wrapper)", 0);
+  #if !CYTHON_METH_FASTCALL
+  #if CYTHON_ASSUME_SAFE_MACROS
+  __pyx_nargs = PyTuple_GET_SIZE(__pyx_args);
+  #else
+  __pyx_nargs = PyTuple_Size(__pyx_args); if (unlikely(__pyx_nargs < 0)) return NULL;
+  #endif
+  #endif
+  __pyx_kwvalues = __Pyx_KwValues_FASTCALL(__pyx_args, __pyx_nargs);
+  if (unlikely(__pyx_nargs > 0)) { /* the method takes no positional arguments */
+    __Pyx_RaiseArgtupleInvalid("copy", 1, 0, 0, __pyx_nargs); return NULL;} /* NOTE(review): returns without calling __Pyx_RefNannyFinishContext -- presumably harmless when refnanny is disabled; confirm */
+  if (unlikely(__pyx_kwds) && __Pyx_NumKwargs_FASTCALL(__pyx_kwds) && unlikely(!__Pyx_CheckKeywordStrings(__pyx_kwds, "copy", 0))) return NULL; /* non-empty keywords are checked by __Pyx_CheckKeywordStrings; a failed check returns NULL */
+  __pyx_r = __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_20copy(((struct __pyx_memoryview_obj *)__pyx_v_self));
+
+  /* function exit code */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r; /* the implementation's result (possibly NULL) is passed through unchanged */
+}
+
+static PyObject *__pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_20copy(struct __pyx_memoryview_obj *__pyx_v_self) { /* Body of memoryview.copy(): fills a slice from self, asks __pyx_memoryview_copy_new_contig for a "c"-order copy, and returns the object built from it by __pyx_memoryview_copy_object_from_slice; returns NULL after recording a traceback entry on failure. */
+  __Pyx_memviewslice __pyx_v_mslice;
+  int __pyx_v_flags;
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  __Pyx_memviewslice __pyx_t_1;
+  PyObject *__pyx_t_2 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("copy", 1);
+
+  /* "View.MemoryView":638
+ *     def copy(self):
+ *         cdef __Pyx_memviewslice mslice
+ *         cdef int flags = self.flags & ~PyBUF_F_CONTIGUOUS             # <<<<<<<<<<<<<<
+ * 
+ *         slice_copy(self, &mslice)
+ */
+  __pyx_v_flags = (__pyx_v_self->flags & (~PyBUF_F_CONTIGUOUS)); /* start from self's flags with the PyBUF_F_CONTIGUOUS bits cleared */
+
+  /* "View.MemoryView":640
+ *         cdef int flags = self.flags & ~PyBUF_F_CONTIGUOUS
+ * 
+ *         slice_copy(self, &mslice)             # <<<<<<<<<<<<<<
+ *         mslice = slice_copy_contig(&mslice, "c", self.view.ndim,
+ *                                    self.view.itemsize,
+ */
+  __pyx_memoryview_slice_copy(__pyx_v_self, (&__pyx_v_mslice)); /* populates mslice from self; no return value is checked */
+
+  /* "View.MemoryView":641
+ * 
+ *         slice_copy(self, &mslice)
+ *         mslice = slice_copy_contig(&mslice, "c", self.view.ndim,             # <<<<<<<<<<<<<<
+ *                                    self.view.itemsize,
+ *                                    flags|PyBUF_C_CONTIGUOUS,
+ */
+  __pyx_t_1 = __pyx_memoryview_copy_new_contig((&__pyx_v_mslice), ((char *)"c"), __pyx_v_self->view.ndim, __pyx_v_self->view.itemsize, (__pyx_v_flags | PyBUF_C_CONTIGUOUS), __pyx_v_self->dtype_is_object); if (unlikely(PyErr_Occurred())) __PYX_ERR(1, 641, __pyx_L1_error) /* the helper returns a struct by value, so failure is detected through PyErr_Occurred() */
+  __pyx_v_mslice = __pyx_t_1; /* mslice now holds the slice returned by the copy helper */
+
+  /* "View.MemoryView":646
+ *                                    self.dtype_is_object)
+ * 
+ *         return memoryview_copy_from_slice(self, &mslice)             # <<<<<<<<<<<<<<
+ * 
+ *     def copy_fortran(self):
+ */
+  __Pyx_XDECREF(__pyx_r);
+  __pyx_t_2 = __pyx_memoryview_copy_object_from_slice(__pyx_v_self, (&__pyx_v_mslice)); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 646, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_2);
+  __pyx_r = __pyx_t_2; /* hand the new object over to the return slot */
+  __pyx_t_2 = 0;
+  goto __pyx_L0;
+
+  /* "View.MemoryView":636
+ *         return slice_is_contig(mslice[0], 'F', self.view.ndim)
+ * 
+ *     def copy(self):             # <<<<<<<<<<<<<<
+ *         cdef __Pyx_memviewslice mslice
+ *         cdef int flags = self.flags & ~PyBUF_F_CONTIGUOUS
+ */
+
+  /* function exit code */
+  __pyx_L1_error:; /* error path: release the temporary, add a traceback frame, return NULL */
+  __Pyx_XDECREF(__pyx_t_2);
+  __Pyx_AddTraceback("View.MemoryView.memoryview.copy", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __pyx_L0:; /* common exit shared by the success and error paths */
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":648
+ *         return memoryview_copy_from_slice(self, &mslice)
+ * 
+ *     def copy_fortran(self):             # <<<<<<<<<<<<<<
+ *         cdef __Pyx_memviewslice src, dst
+ *         cdef int flags = self.flags & ~PyBUF_C_CONTIGUOUS
+ */
+
+/* Python wrapper for memoryview.copy_fortran(): takes no arguments besides self, validates that, then forwards to the implementation function. */
+static PyObject *__pyx_memoryview_copy_fortran(PyObject *__pyx_v_self, 
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+); /*proto*/
+static PyObject *__pyx_memoryview_copy_fortran(PyObject *__pyx_v_self, 
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+) {
+  #if !CYTHON_METH_FASTCALL
+  CYTHON_UNUSED Py_ssize_t __pyx_nargs; /* not a parameter in this configuration; filled in from the args tuple below */
+  #endif
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("copy_fortran (wrapper)", 0);
+  #if !CYTHON_METH_FASTCALL
+  #if CYTHON_ASSUME_SAFE_MACROS
+  __pyx_nargs = PyTuple_GET_SIZE(__pyx_args);
+  #else
+  __pyx_nargs = PyTuple_Size(__pyx_args); if (unlikely(__pyx_nargs < 0)) return NULL;
+  #endif
+  #endif
+  __pyx_kwvalues = __Pyx_KwValues_FASTCALL(__pyx_args, __pyx_nargs);
+  if (unlikely(__pyx_nargs > 0)) { /* any positional argument is an error */
+    __Pyx_RaiseArgtupleInvalid("copy_fortran", 1, 0, 0, __pyx_nargs); return NULL;}
+  if (unlikely(__pyx_kwds) && __Pyx_NumKwargs_FASTCALL(__pyx_kwds) && unlikely(!__Pyx_CheckKeywordStrings(__pyx_kwds, "copy_fortran", 0))) return NULL; /* non-empty keywords that fail the keyword check abort the call */
+  __pyx_r = __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_22copy_fortran(((struct __pyx_memoryview_obj *)__pyx_v_self));
+
+  /* function exit code: the implementation's result (possibly NULL) is returned unchanged */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static PyObject *__pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_22copy_fortran(struct __pyx_memoryview_obj *__pyx_v_self) { /* Implementation of memoryview.copy_fortran(): requests a "fortran" contiguous copy of self's slice and returns the object built from it; returns NULL on the error path. */
+  __Pyx_memviewslice __pyx_v_src;
+  __Pyx_memviewslice __pyx_v_dst;
+  int __pyx_v_flags;
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  __Pyx_memviewslice __pyx_t_1;
+  PyObject *__pyx_t_2 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("copy_fortran", 1);
+
+  /* "View.MemoryView":650
+ *     def copy_fortran(self):
+ *         cdef __Pyx_memviewslice src, dst
+ *         cdef int flags = self.flags & ~PyBUF_C_CONTIGUOUS             # <<<<<<<<<<<<<<
+ * 
+ *         slice_copy(self, &src)
+ */
+  __pyx_v_flags = (__pyx_v_self->flags & (~PyBUF_C_CONTIGUOUS)); /* clear the C-contiguous bits; PyBUF_F_CONTIGUOUS is OR-ed in at the copy call below */
+
+  /* "View.MemoryView":652
+ *         cdef int flags = self.flags & ~PyBUF_C_CONTIGUOUS
+ * 
+ *         slice_copy(self, &src)             # <<<<<<<<<<<<<<
+ *         dst = slice_copy_contig(&src, "fortran", self.view.ndim,
+ *                                 self.view.itemsize,
+ */
+  __pyx_memoryview_slice_copy(__pyx_v_self, (&__pyx_v_src)); /* fills src from self */
+
+  /* "View.MemoryView":653
+ * 
+ *         slice_copy(self, &src)
+ *         dst = slice_copy_contig(&src, "fortran", self.view.ndim,             # <<<<<<<<<<<<<<
+ *                                 self.view.itemsize,
+ *                                 flags|PyBUF_F_CONTIGUOUS,
+ */
+  __pyx_t_1 = __pyx_memoryview_copy_new_contig((&__pyx_v_src), ((char *)"fortran"), __pyx_v_self->view.ndim, __pyx_v_self->view.itemsize, (__pyx_v_flags | PyBUF_F_CONTIGUOUS), __pyx_v_self->dtype_is_object); if (unlikely(PyErr_Occurred())) __PYX_ERR(1, 653, __pyx_L1_error)
+  __pyx_v_dst = __pyx_t_1; /* the helper returns a struct, so failure is detected through PyErr_Occurred() above */
+
+  /* "View.MemoryView":658
+ *                                 self.dtype_is_object)
+ * 
+ *         return memoryview_copy_from_slice(self, &dst)             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  __Pyx_XDECREF(__pyx_r);
+  __pyx_t_2 = __pyx_memoryview_copy_object_from_slice(__pyx_v_self, (&__pyx_v_dst)); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 658, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_2);
+  __pyx_r = __pyx_t_2; /* ownership moves from the temporary to the return value */
+  __pyx_t_2 = 0;
+  goto __pyx_L0;
+
+  /* "View.MemoryView":648
+ *         return memoryview_copy_from_slice(self, &mslice)
+ * 
+ *     def copy_fortran(self):             # <<<<<<<<<<<<<<
+ *         cdef __Pyx_memviewslice src, dst
+ *         cdef int flags = self.flags & ~PyBUF_C_CONTIGUOUS
+ */
+
+  /* function exit code: the error label releases the temporary, records a traceback entry and returns NULL */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_2);
+  __Pyx_AddTraceback("View.MemoryView.memoryview.copy_fortran", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "(tree fragment)":1
+ * def __reduce_cython__(self):             # <<<<<<<<<<<<<<
+ *     raise TypeError, "no default __reduce__ due to non-trivial __cinit__"
+ * def __setstate_cython__(self, __pyx_state):
+ */
+
+/* Python wrapper for memoryview.__reduce_cython__(): takes no arguments besides self, validates that, then forwards to the implementation function. */
+static PyObject *__pyx_pw___pyx_memoryview_1__reduce_cython__(PyObject *__pyx_v_self, 
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+); /*proto*/
+static PyObject *__pyx_pw___pyx_memoryview_1__reduce_cython__(PyObject *__pyx_v_self, 
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+) {
+  #if !CYTHON_METH_FASTCALL
+  CYTHON_UNUSED Py_ssize_t __pyx_nargs; /* not a parameter in this configuration; filled in from the args tuple below */
+  #endif
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__reduce_cython__ (wrapper)", 0);
+  #if !CYTHON_METH_FASTCALL
+  #if CYTHON_ASSUME_SAFE_MACROS
+  __pyx_nargs = PyTuple_GET_SIZE(__pyx_args);
+  #else
+  __pyx_nargs = PyTuple_Size(__pyx_args); if (unlikely(__pyx_nargs < 0)) return NULL;
+  #endif
+  #endif
+  __pyx_kwvalues = __Pyx_KwValues_FASTCALL(__pyx_args, __pyx_nargs);
+  if (unlikely(__pyx_nargs > 0)) { /* any positional argument is an error */
+    __Pyx_RaiseArgtupleInvalid("__reduce_cython__", 1, 0, 0, __pyx_nargs); return NULL;}
+  if (unlikely(__pyx_kwds) && __Pyx_NumKwargs_FASTCALL(__pyx_kwds) && unlikely(!__Pyx_CheckKeywordStrings(__pyx_kwds, "__reduce_cython__", 0))) return NULL; /* non-empty keywords that fail the keyword check abort the call */
+  __pyx_r = __pyx_pf___pyx_memoryview___reduce_cython__(((struct __pyx_memoryview_obj *)__pyx_v_self));
+
+  /* function exit code: the implementation's result is returned unchanged */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static PyObject *__pyx_pf___pyx_memoryview___reduce_cython__(CYTHON_UNUSED struct __pyx_memoryview_obj *__pyx_v_self) { /* Implementation of __reduce_cython__: unconditionally raises TypeError and returns NULL; self is unused. */
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("__reduce_cython__", 1);
+
+  /* "(tree fragment)":2
+ * def __reduce_cython__(self):
+ *     raise TypeError, "no default __reduce__ due to non-trivial __cinit__"             # <<<<<<<<<<<<<<
+ * def __setstate_cython__(self, __pyx_state):
+ *     raise TypeError, "no default __reduce__ due to non-trivial __cinit__"
+ */
+  __Pyx_Raise(__pyx_builtin_TypeError, __pyx_kp_s_no_default___reduce___due_to_non, 0, 0); /* message object is the string constant for the text echoed above */
+  __PYX_ERR(1, 2, __pyx_L1_error)
+
+  /* "(tree fragment)":1
+ * def __reduce_cython__(self):             # <<<<<<<<<<<<<<
+ *     raise TypeError, "no default __reduce__ due to non-trivial __cinit__"
+ * def __setstate_cython__(self, __pyx_state):
+ */
+
+  /* function exit code: only the error exit exists, since the body always raises */
+  __pyx_L1_error:;
+  __Pyx_AddTraceback("View.MemoryView.memoryview.__reduce_cython__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "(tree fragment)":3
+ * def __reduce_cython__(self):
+ *     raise TypeError, "no default __reduce__ due to non-trivial __cinit__"
+ * def __setstate_cython__(self, __pyx_state):             # <<<<<<<<<<<<<<
+ *     raise TypeError, "no default __reduce__ due to non-trivial __cinit__"
+ */
+
+/* Python wrapper for memoryview.__setstate_cython__(__pyx_state): unpacks exactly one argument, given positionally or by keyword, then forwards to the implementation function. */
+static PyObject *__pyx_pw___pyx_memoryview_3__setstate_cython__(PyObject *__pyx_v_self, 
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+); /*proto*/
+static PyObject *__pyx_pw___pyx_memoryview_3__setstate_cython__(PyObject *__pyx_v_self, 
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+) {
+  CYTHON_UNUSED PyObject *__pyx_v___pyx_state = 0;
+  #if !CYTHON_METH_FASTCALL
+  CYTHON_UNUSED Py_ssize_t __pyx_nargs; /* not a parameter in this configuration; filled in from the args tuple below */
+  #endif
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject* values[1] = {0}; /* single slot receiving the __pyx_state argument */
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__setstate_cython__ (wrapper)", 0);
+  #if !CYTHON_METH_FASTCALL
+  #if CYTHON_ASSUME_SAFE_MACROS
+  __pyx_nargs = PyTuple_GET_SIZE(__pyx_args);
+  #else
+  __pyx_nargs = PyTuple_Size(__pyx_args); if (unlikely(__pyx_nargs < 0)) return NULL;
+  #endif
+  #endif
+  __pyx_kwvalues = __Pyx_KwValues_FASTCALL(__pyx_args, __pyx_nargs);
+  {
+    PyObject **__pyx_pyargnames[] = {&__pyx_n_s_pyx_state,0}; /* zero-terminated list of accepted argument names */
+    if (__pyx_kwds) {
+      Py_ssize_t kw_args;
+      switch (__pyx_nargs) { /* first take whatever was passed positionally (at most one) */
+        case  1: values[0] = __Pyx_Arg_FASTCALL(__pyx_args, 0);
+        CYTHON_FALLTHROUGH;
+        case  0: break;
+        default: goto __pyx_L5_argtuple_error;
+      }
+      kw_args = __Pyx_NumKwargs_FASTCALL(__pyx_kwds);
+      switch (__pyx_nargs) { /* then look the argument up by keyword if it was not positional */
+        case  0:
+        if (likely((values[0] = __Pyx_GetKwValue_FASTCALL(__pyx_kwds, __pyx_kwvalues, __pyx_n_s_pyx_state)) != 0)) {
+          (void)__Pyx_Arg_NewRef_FASTCALL(values[0]);
+          kw_args--;
+        }
+        else if (unlikely(PyErr_Occurred())) __PYX_ERR(1, 3, __pyx_L3_error)
+        else goto __pyx_L5_argtuple_error;
+      }
+      if (unlikely(kw_args > 0)) { /* keywords remain unconsumed: hand them to the generic keyword parser, which fails with a negative result */
+        const Py_ssize_t kwd_pos_args = __pyx_nargs;
+        if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_kwvalues, __pyx_pyargnames, 0, values + 0, kwd_pos_args, "__setstate_cython__") < 0)) __PYX_ERR(1, 3, __pyx_L3_error)
+      }
+    } else if (unlikely(__pyx_nargs != 1)) { /* no keywords: exactly one positional argument is required */
+      goto __pyx_L5_argtuple_error;
+    } else {
+      values[0] = __Pyx_Arg_FASTCALL(__pyx_args, 0);
+    }
+    __pyx_v___pyx_state = values[0];
+  }
+  goto __pyx_L6_skip;
+  __pyx_L5_argtuple_error:;
+  __Pyx_RaiseArgtupleInvalid("__setstate_cython__", 1, 1, 1, __pyx_nargs); __PYX_ERR(1, 3, __pyx_L3_error)
+  __pyx_L6_skip:;
+  goto __pyx_L4_argument_unpacking_done;
+  __pyx_L3_error:; /* unpacking failed: release collected values, add a traceback entry, return NULL */
+  {
+    Py_ssize_t __pyx_temp;
+    for (__pyx_temp=0; __pyx_temp < (Py_ssize_t)(sizeof(values)/sizeof(values[0])); ++__pyx_temp) {
+      __Pyx_Arg_XDECREF_FASTCALL(values[__pyx_temp]);
+    }
+  }
+  __Pyx_AddTraceback("View.MemoryView.memoryview.__setstate_cython__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __Pyx_RefNannyFinishContext();
+  return NULL;
+  __pyx_L4_argument_unpacking_done:;
+  __pyx_r = __pyx_pf___pyx_memoryview_2__setstate_cython__(((struct __pyx_memoryview_obj *)__pyx_v_self), __pyx_v___pyx_state);
+
+  /* function exit code: release the unpacked argument values before returning the implementation's result */
+  {
+    Py_ssize_t __pyx_temp;
+    for (__pyx_temp=0; __pyx_temp < (Py_ssize_t)(sizeof(values)/sizeof(values[0])); ++__pyx_temp) {
+      __Pyx_Arg_XDECREF_FASTCALL(values[__pyx_temp]);
+    }
+  }
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static PyObject *__pyx_pf___pyx_memoryview_2__setstate_cython__(CYTHON_UNUSED struct __pyx_memoryview_obj *__pyx_v_self, CYTHON_UNUSED PyObject *__pyx_v___pyx_state) { /* Implementation of __setstate_cython__: unconditionally raises TypeError and returns NULL; both parameters are unused. */
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("__setstate_cython__", 1);
+
+  /* "(tree fragment)":4
+ *     raise TypeError, "no default __reduce__ due to non-trivial __cinit__"
+ * def __setstate_cython__(self, __pyx_state):
+ *     raise TypeError, "no default __reduce__ due to non-trivial __cinit__"             # <<<<<<<<<<<<<<
+ */
+  __Pyx_Raise(__pyx_builtin_TypeError, __pyx_kp_s_no_default___reduce___due_to_non, 0, 0); /* same message constant as used by __reduce_cython__ */
+  __PYX_ERR(1, 4, __pyx_L1_error)
+
+  /* "(tree fragment)":3
+ * def __reduce_cython__(self):
+ *     raise TypeError, "no default __reduce__ due to non-trivial __cinit__"
+ * def __setstate_cython__(self, __pyx_state):             # <<<<<<<<<<<<<<
+ *     raise TypeError, "no default __reduce__ due to non-trivial __cinit__"
+ */
+
+  /* function exit code: only the error exit exists, since the body always raises */
+  __pyx_L1_error:;
+  __Pyx_AddTraceback("View.MemoryView.memoryview.__setstate_cython__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":662
+ * 
+ * @cname('__pyx_memoryview_new')
+ * cdef memoryview_cwrapper(object o, int flags, bint dtype_is_object, __Pyx_TypeInfo *typeinfo):             # <<<<<<<<<<<<<<
+ *     cdef memoryview result = memoryview(o, flags, dtype_is_object)
+ *     result.typeinfo = typeinfo
+ */
+
+static PyObject *__pyx_memoryview_new(PyObject *__pyx_v_o, int __pyx_v_flags, int __pyx_v_dtype_is_object, __Pyx_TypeInfo *__pyx_v_typeinfo) { /* Calls the memoryview type with (o, flags, dtype_is_object), stores typeinfo on the new object and returns a new reference to it; returns 0 on error. */
+  struct __pyx_memoryview_obj *__pyx_v_result = 0;
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  PyObject *__pyx_t_2 = NULL;
+  PyObject *__pyx_t_3 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("memoryview_cwrapper", 1);
+
+  /* "View.MemoryView":663
+ * @cname('__pyx_memoryview_new')
+ * cdef memoryview_cwrapper(object o, int flags, bint dtype_is_object, __Pyx_TypeInfo *typeinfo):
+ *     cdef memoryview result = memoryview(o, flags, dtype_is_object)             # <<<<<<<<<<<<<<
+ *     result.typeinfo = typeinfo
+ *     return result
+ */
+  __pyx_t_1 = __Pyx_PyInt_From_int(__pyx_v_flags); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 663, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_t_2 = __Pyx_PyBool_FromLong(__pyx_v_dtype_is_object); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 663, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_2);
+  __pyx_t_3 = PyTuple_New(3); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 663, __pyx_L1_error) /* positional-argument tuple for the constructor call */
+  __Pyx_GOTREF(__pyx_t_3);
+  __Pyx_INCREF(__pyx_v_o); /* o is a borrowed parameter; take a reference for the tuple slot */
+  __Pyx_GIVEREF(__pyx_v_o);
+  if (__Pyx_PyTuple_SET_ITEM(__pyx_t_3, 0, __pyx_v_o)) __PYX_ERR(1, 663, __pyx_L1_error);
+  __Pyx_GIVEREF(__pyx_t_1);
+  if (__Pyx_PyTuple_SET_ITEM(__pyx_t_3, 1, __pyx_t_1)) __PYX_ERR(1, 663, __pyx_L1_error);
+  __Pyx_GIVEREF(__pyx_t_2);
+  if (__Pyx_PyTuple_SET_ITEM(__pyx_t_3, 2, __pyx_t_2)) __PYX_ERR(1, 663, __pyx_L1_error);
+  __pyx_t_1 = 0; /* temporaries were handed to the tuple; clear them so the error path does not release them again */
+  __pyx_t_2 = 0;
+  __pyx_t_2 = __Pyx_PyObject_Call(((PyObject *)__pyx_memoryview_type), __pyx_t_3, NULL); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 663, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_2);
+  __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+  __pyx_v_result = ((struct __pyx_memoryview_obj *)__pyx_t_2); /* the local variable takes over the reference from the temporary */
+  __pyx_t_2 = 0;
+
+  /* "View.MemoryView":664
+ * cdef memoryview_cwrapper(object o, int flags, bint dtype_is_object, __Pyx_TypeInfo *typeinfo):
+ *     cdef memoryview result = memoryview(o, flags, dtype_is_object)
+ *     result.typeinfo = typeinfo             # <<<<<<<<<<<<<<
+ *     return result
+ * 
+ */
+  __pyx_v_result->typeinfo = __pyx_v_typeinfo; /* plain pointer store */
+
+  /* "View.MemoryView":665
+ *     cdef memoryview result = memoryview(o, flags, dtype_is_object)
+ *     result.typeinfo = typeinfo
+ *     return result             # <<<<<<<<<<<<<<
+ * 
+ * @cname('__pyx_memoryview_check')
+ */
+  __Pyx_XDECREF(__pyx_r);
+  __Pyx_INCREF((PyObject *)__pyx_v_result); /* extra reference for the return value; the local's own reference is dropped at __pyx_L0 */
+  __pyx_r = ((PyObject *)__pyx_v_result);
+  goto __pyx_L0;
+
+  /* "View.MemoryView":662
+ * 
+ * @cname('__pyx_memoryview_new')
+ * cdef memoryview_cwrapper(object o, int flags, bint dtype_is_object, __Pyx_TypeInfo *typeinfo):             # <<<<<<<<<<<<<<
+ *     cdef memoryview result = memoryview(o, flags, dtype_is_object)
+ *     result.typeinfo = typeinfo
+ */
+
+  /* function exit code: the error label releases live temporaries, adds a traceback entry and sets the result to 0 */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_XDECREF(__pyx_t_2);
+  __Pyx_XDECREF(__pyx_t_3);
+  __Pyx_AddTraceback("View.MemoryView.memoryview_cwrapper", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = 0;
+  __pyx_L0:;
+  __Pyx_XDECREF((PyObject *)__pyx_v_result);
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":668
+ * 
+ * @cname('__pyx_memoryview_check')
+ * cdef inline bint memoryview_check(object o) noexcept:             # <<<<<<<<<<<<<<
+ *     return isinstance(o, memoryview)
+ * 
+ */
+
+static CYTHON_INLINE int __pyx_memoryview_check(PyObject *__pyx_v_o) { /* Returns the result of __Pyx_TypeCheck(o, memoryview type); this function has no error path. */
+  int __pyx_r;
+  int __pyx_t_1;
+
+  /* "View.MemoryView":669
+ * @cname('__pyx_memoryview_check')
+ * cdef inline bint memoryview_check(object o) noexcept:
+ *     return isinstance(o, memoryview)             # <<<<<<<<<<<<<<
+ * 
+ * cdef tuple _unellipsify(object index, int ndim):
+ */
+  __pyx_t_1 = __Pyx_TypeCheck(__pyx_v_o, __pyx_memoryview_type); 
+  __pyx_r = __pyx_t_1;
+  goto __pyx_L0;
+
+  /* "View.MemoryView":668
+ * 
+ * @cname('__pyx_memoryview_check')
+ * cdef inline bint memoryview_check(object o) noexcept:             # <<<<<<<<<<<<<<
+ *     return isinstance(o, memoryview)
+ * 
+ */
+
+  /* function exit code: no cleanup needed, no references are held */
+  __pyx_L0:;
+  return __pyx_r;
+}
+
+/* "View.MemoryView":671
+ *     return isinstance(o, memoryview)
+ * 
+ * cdef tuple _unellipsify(object index, int ndim):             # <<<<<<<<<<<<<<
+ *     """
+ *     Replace all ellipses with full slices and fill incomplete indices with
+ */
+
+static PyObject *_unellipsify(PyObject *__pyx_v_index, int __pyx_v_ndim) { /* Normalizes an index into a list of ndim entries pre-filled with the full-slice constant, and returns the 2-tuple (have_slices or nslices, tuple(result)); returns 0 on error. */
+  Py_ssize_t __pyx_v_idx;
+  PyObject *__pyx_v_tup = NULL;
+  PyObject *__pyx_v_result = NULL;
+  int __pyx_v_have_slices;
+  int __pyx_v_seen_ellipsis;
+  PyObject *__pyx_v_item = NULL;
+  Py_ssize_t __pyx_v_nslices;
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  int __pyx_t_2;
+  PyObject *__pyx_t_3 = NULL;
+  Py_ssize_t __pyx_t_4;
+  Py_ssize_t __pyx_t_5;
+  Py_UCS4 __pyx_t_6;
+  PyObject *__pyx_t_7 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("_unellipsify", 1);
+
+  /* "View.MemoryView":677
+ *     """
+ *     cdef Py_ssize_t idx
+ *     tup = <tuple>index if isinstance(index, tuple) else (index,)             # <<<<<<<<<<<<<<
+ * 
+ *     result = [slice(None)] * ndim
+ */
+  __pyx_t_2 = PyTuple_Check(__pyx_v_index); 
+  if (__pyx_t_2) { /* already a tuple: use it directly with a new reference */
+    __Pyx_INCREF(((PyObject*)__pyx_v_index));
+    __pyx_t_1 = __pyx_v_index;
+  } else { /* anything else is wrapped in a 1-tuple */
+    __pyx_t_3 = PyTuple_New(1); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 677, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_3);
+    __Pyx_INCREF(__pyx_v_index);
+    __Pyx_GIVEREF(__pyx_v_index);
+    if (__Pyx_PyTuple_SET_ITEM(__pyx_t_3, 0, __pyx_v_index)) __PYX_ERR(1, 677, __pyx_L1_error);
+    __pyx_t_1 = __pyx_t_3;
+    __pyx_t_3 = 0;
+  }
+  __pyx_v_tup = ((PyObject*)__pyx_t_1);
+  __pyx_t_1 = 0;
+
+  /* "View.MemoryView":679
+ *     tup = <tuple>index if isinstance(index, tuple) else (index,)
+ * 
+ *     result = [slice(None)] * ndim             # <<<<<<<<<<<<<<
+ *     have_slices = False
+ *     seen_ellipsis = False
+ */
+  __pyx_t_1 = PyList_New(1 * ((__pyx_v_ndim<0) ? 0:__pyx_v_ndim)); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 679, __pyx_L1_error) /* a negative ndim yields an empty list */
+  __Pyx_GOTREF(__pyx_t_1);
+  { Py_ssize_t __pyx_temp;
+    for (__pyx_temp=0; __pyx_temp < __pyx_v_ndim; __pyx_temp++) { /* every slot gets a reference to the same slice constant */
+      __Pyx_INCREF(__pyx_slice__5);
+      __Pyx_GIVEREF(__pyx_slice__5);
+      if (__Pyx_PyList_SET_ITEM(__pyx_t_1, __pyx_temp, __pyx_slice__5)) __PYX_ERR(1, 679, __pyx_L1_error);
+    }
+  }
+  __pyx_v_result = ((PyObject*)__pyx_t_1);
+  __pyx_t_1 = 0;
+
+  /* "View.MemoryView":680
+ * 
+ *     result = [slice(None)] * ndim
+ *     have_slices = False             # <<<<<<<<<<<<<<
+ *     seen_ellipsis = False
+ *     idx = 0
+ */
+  __pyx_v_have_slices = 0;
+
+  /* "View.MemoryView":681
+ *     result = [slice(None)] * ndim
+ *     have_slices = False
+ *     seen_ellipsis = False             # <<<<<<<<<<<<<<
+ *     idx = 0
+ *     for item in tup:
+ */
+  __pyx_v_seen_ellipsis = 0;
+
+  /* "View.MemoryView":682
+ *     have_slices = False
+ *     seen_ellipsis = False
+ *     idx = 0             # <<<<<<<<<<<<<<
+ *     for item in tup:
+ *         if item is Ellipsis:
+ */
+  __pyx_v_idx = 0; /* write position in result */
+
+  /* "View.MemoryView":683
+ *     seen_ellipsis = False
+ *     idx = 0
+ *     for item in tup:             # <<<<<<<<<<<<<<
+ *         if item is Ellipsis:
+ *             if not seen_ellipsis:
+ */
+  if (unlikely(__pyx_v_tup == Py_None)) {
+    PyErr_SetString(PyExc_TypeError, "'NoneType' object is not iterable");
+    __PYX_ERR(1, 683, __pyx_L1_error)
+  }
+  __pyx_t_1 = __pyx_v_tup; __Pyx_INCREF(__pyx_t_1); /* t_1 holds the tuple for the duration of the loop; t_4 is the iteration position */
+  __pyx_t_4 = 0;
+  for (;;) {
+    {
+      Py_ssize_t __pyx_temp = __Pyx_PyTuple_GET_SIZE(__pyx_t_1);
+      #if !CYTHON_ASSUME_SAFE_MACROS
+      if (unlikely((__pyx_temp < 0))) __PYX_ERR(1, 683, __pyx_L1_error)
+      #endif
+      if (__pyx_t_4 >= __pyx_temp) break;
+    }
+    #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS
+    __pyx_t_3 = PyTuple_GET_ITEM(__pyx_t_1, __pyx_t_4); __Pyx_INCREF(__pyx_t_3); __pyx_t_4++; if (unlikely((0 < 0))) __PYX_ERR(1, 683, __pyx_L1_error) /* the (0 < 0) test is constant false */
+    #else
+    __pyx_t_3 = __Pyx_PySequence_ITEM(__pyx_t_1, __pyx_t_4); __pyx_t_4++; if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 683, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_3);
+    #endif
+    __Pyx_XDECREF_SET(__pyx_v_item, __pyx_t_3);
+    __pyx_t_3 = 0;
+
+    /* "View.MemoryView":684
+ *     idx = 0
+ *     for item in tup:
+ *         if item is Ellipsis:             # <<<<<<<<<<<<<<
+ *             if not seen_ellipsis:
+ *                 idx += ndim - len(tup)
+ */
+    __pyx_t_2 = (__pyx_v_item == __pyx_builtin_Ellipsis); /* identity comparison, matching `is` in the source */
+    if (__pyx_t_2) {
+
+      /* "View.MemoryView":685
+ *     for item in tup:
+ *         if item is Ellipsis:
+ *             if not seen_ellipsis:             # <<<<<<<<<<<<<<
+ *                 idx += ndim - len(tup)
+ *                 seen_ellipsis = True
+ */
+      __pyx_t_2 = (!__pyx_v_seen_ellipsis);
+      if (__pyx_t_2) { /* only the first Ellipsis advances idx by the extra ndim - len(tup) */
+
+        /* "View.MemoryView":686
+ *         if item is Ellipsis:
+ *             if not seen_ellipsis:
+ *                 idx += ndim - len(tup)             # <<<<<<<<<<<<<<
+ *                 seen_ellipsis = True
+ *             have_slices = True
+ */
+        if (unlikely(__pyx_v_tup == Py_None)) {
+          PyErr_SetString(PyExc_TypeError, "object of type 'NoneType' has no len()");
+          __PYX_ERR(1, 686, __pyx_L1_error)
+        }
+        __pyx_t_5 = __Pyx_PyTuple_GET_SIZE(__pyx_v_tup); if (unlikely(__pyx_t_5 == ((Py_ssize_t)-1))) __PYX_ERR(1, 686, __pyx_L1_error)
+        __pyx_v_idx = (__pyx_v_idx + (__pyx_v_ndim - __pyx_t_5));
+
+        /* "View.MemoryView":687
+ *             if not seen_ellipsis:
+ *                 idx += ndim - len(tup)
+ *                 seen_ellipsis = True             # <<<<<<<<<<<<<<
+ *             have_slices = True
+ *         else:
+ */
+        __pyx_v_seen_ellipsis = 1;
+
+        /* "View.MemoryView":685
+ *     for item in tup:
+ *         if item is Ellipsis:
+ *             if not seen_ellipsis:             # <<<<<<<<<<<<<<
+ *                 idx += ndim - len(tup)
+ *                 seen_ellipsis = True
+ */
+      }
+
+      /* "View.MemoryView":688
+ *                 idx += ndim - len(tup)
+ *                 seen_ellipsis = True
+ *             have_slices = True             # <<<<<<<<<<<<<<
+ *         else:
+ *             if isinstance(item, slice):
+ */
+      __pyx_v_have_slices = 1; /* set for every Ellipsis, including repeated ones */
+
+      /* "View.MemoryView":684
+ *     idx = 0
+ *     for item in tup:
+ *         if item is Ellipsis:             # <<<<<<<<<<<<<<
+ *             if not seen_ellipsis:
+ *                 idx += ndim - len(tup)
+ */
+      goto __pyx_L5;
+    }
+
+    /* "View.MemoryView":690
+ *             have_slices = True
+ *         else:
+ *             if isinstance(item, slice):             # <<<<<<<<<<<<<<
+ *                 have_slices = True
+ *             elif not PyIndex_Check(item):
+ */
+    /*else: item is not Ellipsis*/ {
+      __pyx_t_2 = PySlice_Check(__pyx_v_item); 
+      if (__pyx_t_2) {
+
+        /* "View.MemoryView":691
+ *         else:
+ *             if isinstance(item, slice):
+ *                 have_slices = True             # <<<<<<<<<<<<<<
+ *             elif not PyIndex_Check(item):
+ *                 raise TypeError, f"Cannot index with type '{type(item)}'"
+ */
+        __pyx_v_have_slices = 1;
+
+        /* "View.MemoryView":690
+ *             have_slices = True
+ *         else:
+ *             if isinstance(item, slice):             # <<<<<<<<<<<<<<
+ *                 have_slices = True
+ *             elif not PyIndex_Check(item):
+ */
+        goto __pyx_L7;
+      }
+
+      /* "View.MemoryView":692
+ *             if isinstance(item, slice):
+ *                 have_slices = True
+ *             elif not PyIndex_Check(item):             # <<<<<<<<<<<<<<
+ *                 raise TypeError, f"Cannot index with type '{type(item)}'"
+ *             result[idx] = item
+ */
+      __pyx_t_2 = (!(PyIndex_Check(__pyx_v_item) != 0));
+      if (unlikely(__pyx_t_2)) { /* neither a slice nor an index object: build the f-string message and raise TypeError */
+
+        /* "View.MemoryView":693
+ *                 have_slices = True
+ *             elif not PyIndex_Check(item):
+ *                 raise TypeError, f"Cannot index with type '{type(item)}'"             # <<<<<<<<<<<<<<
+ *             result[idx] = item
+ *         idx += 1
+ */
+        __pyx_t_3 = PyTuple_New(3); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 693, __pyx_L1_error) /* three parts: prefix, formatted type, closing piece */
+        __Pyx_GOTREF(__pyx_t_3);
+        __pyx_t_5 = 0; /* running total length passed to the join */
+        __pyx_t_6 = 127; /* running maximum character value passed to the join */
+        __Pyx_INCREF(__pyx_kp_u_Cannot_index_with_type);
+        __pyx_t_5 += 24; /* length of the prefix "Cannot index with type '" in the echoed source */
+        __Pyx_GIVEREF(__pyx_kp_u_Cannot_index_with_type);
+        PyTuple_SET_ITEM(__pyx_t_3, 0, __pyx_kp_u_Cannot_index_with_type);
+        __pyx_t_7 = __Pyx_PyObject_FormatSimple(((PyObject *)Py_TYPE(__pyx_v_item)), __pyx_empty_unicode); if (unlikely(!__pyx_t_7)) __PYX_ERR(1, 693, __pyx_L1_error)
+        __Pyx_GOTREF(__pyx_t_7);
+        __pyx_t_6 = (__Pyx_PyUnicode_MAX_CHAR_VALUE(__pyx_t_7) > __pyx_t_6) ? __Pyx_PyUnicode_MAX_CHAR_VALUE(__pyx_t_7) : __pyx_t_6;
+        __pyx_t_5 += __Pyx_PyUnicode_GET_LENGTH(__pyx_t_7);
+        __Pyx_GIVEREF(__pyx_t_7);
+        PyTuple_SET_ITEM(__pyx_t_3, 1, __pyx_t_7);
+        __pyx_t_7 = 0;
+        __Pyx_INCREF(__pyx_kp_u__6);
+        __pyx_t_5 += 1; /* one-character trailing piece */
+        __Pyx_GIVEREF(__pyx_kp_u__6);
+        PyTuple_SET_ITEM(__pyx_t_3, 2, __pyx_kp_u__6);
+        __pyx_t_7 = __Pyx_PyUnicode_Join(__pyx_t_3, 3, __pyx_t_5, __pyx_t_6); if (unlikely(!__pyx_t_7)) __PYX_ERR(1, 693, __pyx_L1_error)
+        __Pyx_GOTREF(__pyx_t_7);
+        __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+        __Pyx_Raise(__pyx_builtin_TypeError, __pyx_t_7, 0, 0);
+        __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0;
+        __PYX_ERR(1, 693, __pyx_L1_error)
+
+        /* "View.MemoryView":692
+ *             if isinstance(item, slice):
+ *                 have_slices = True
+ *             elif not PyIndex_Check(item):             # <<<<<<<<<<<<<<
+ *                 raise TypeError, f"Cannot index with type '{type(item)}'"
+ *             result[idx] = item
+ */
+      }
+      __pyx_L7:;
+
+      /* "View.MemoryView":694
+ *             elif not PyIndex_Check(item):
+ *                 raise TypeError, f"Cannot index with type '{type(item)}'"
+ *             result[idx] = item             # <<<<<<<<<<<<<<
+ *         idx += 1
+ * 
+ */
+      if (unlikely((__Pyx_SetItemInt(__pyx_v_result, __pyx_v_idx, __pyx_v_item, Py_ssize_t, 1, PyInt_FromSsize_t, 1, 1, 1) < 0))) __PYX_ERR(1, 694, __pyx_L1_error)
+    }
+    __pyx_L5:;
+
+    /* "View.MemoryView":695
+ *                 raise TypeError, f"Cannot index with type '{type(item)}'"
+ *             result[idx] = item
+ *         idx += 1             # <<<<<<<<<<<<<<
+ * 
+ *     nslices = ndim - idx
+ */
+    __pyx_v_idx = (__pyx_v_idx + 1); /* runs for Ellipsis and non-Ellipsis items alike */
+
+    /* "View.MemoryView":683
+ *     seen_ellipsis = False
+ *     idx = 0
+ *     for item in tup:             # <<<<<<<<<<<<<<
+ *         if item is Ellipsis:
+ *             if not seen_ellipsis:
+ */
+  }
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+
+  /* "View.MemoryView":697
+ *         idx += 1
+ * 
+ *     nslices = ndim - idx             # <<<<<<<<<<<<<<
+ *     return have_slices or nslices, tuple(result)
+ * 
+ */
+  __pyx_v_nslices = (__pyx_v_ndim - __pyx_v_idx); /* number of trailing positions the index did not cover */
+
+  /* "View.MemoryView":698
+ * 
+ *     nslices = ndim - idx
+ *     return have_slices or nslices, tuple(result)             # <<<<<<<<<<<<<<
+ * 
+ * cdef int assert_direct_dimensions(Py_ssize_t *suboffsets, int ndim) except -1:
+ */
+  __Pyx_XDECREF(__pyx_r);
+  if (!__pyx_v_have_slices) { /* `or` short-circuit: fall through to use nslices when have_slices is false */
+  } else {
+    __pyx_t_7 = __Pyx_PyBool_FromLong(__pyx_v_have_slices); if (unlikely(!__pyx_t_7)) __PYX_ERR(1, 698, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_7);
+    __pyx_t_1 = __pyx_t_7;
+    __pyx_t_7 = 0;
+    goto __pyx_L9_bool_binop_done;
+  }
+  __pyx_t_7 = PyInt_FromSsize_t(__pyx_v_nslices); if (unlikely(!__pyx_t_7)) __PYX_ERR(1, 698, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_7);
+  __pyx_t_1 = __pyx_t_7;
+  __pyx_t_7 = 0;
+  __pyx_L9_bool_binop_done:;
+  __pyx_t_7 = PyList_AsTuple(__pyx_v_result); if (unlikely(!__pyx_t_7)) __PYX_ERR(1, 698, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_7);
+  __pyx_t_3 = PyTuple_New(2); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 698, __pyx_L1_error) /* the 2-tuple that is returned */
+  __Pyx_GOTREF(__pyx_t_3);
+  __Pyx_GIVEREF(__pyx_t_1);
+  if (__Pyx_PyTuple_SET_ITEM(__pyx_t_3, 0, __pyx_t_1)) __PYX_ERR(1, 698, __pyx_L1_error);
+  __Pyx_GIVEREF(__pyx_t_7);
+  if (__Pyx_PyTuple_SET_ITEM(__pyx_t_3, 1, __pyx_t_7)) __PYX_ERR(1, 698, __pyx_L1_error);
+  __pyx_t_1 = 0;
+  __pyx_t_7 = 0;
+  __pyx_r = ((PyObject*)__pyx_t_3);
+  __pyx_t_3 = 0;
+  goto __pyx_L0;
+
+  /* "View.MemoryView":671
+ *     return isinstance(o, memoryview)
+ * 
+ * cdef tuple _unellipsify(object index, int ndim):             # <<<<<<<<<<<<<<
+ *     """
+ *     Replace all ellipses with full slices and fill incomplete indices with
+ */
+
+  /* function exit code: the error label releases live temporaries and sets the result to 0; both exits then release the locals */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_XDECREF(__pyx_t_3);
+  __Pyx_XDECREF(__pyx_t_7);
+  __Pyx_AddTraceback("View.MemoryView._unellipsify", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = 0;
+  __pyx_L0:;
+  __Pyx_XDECREF(__pyx_v_tup);
+  __Pyx_XDECREF(__pyx_v_result);
+  __Pyx_XDECREF(__pyx_v_item);
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":700
+ *     return have_slices or nslices, tuple(result)
+ * 
+ * cdef int assert_direct_dimensions(Py_ssize_t *suboffsets, int ndim) except -1:             # <<<<<<<<<<<<<<
+ *     for suboffset in suboffsets[:ndim]:
+ *         if suboffset >= 0:
+ */
+
+static int assert_direct_dimensions(Py_ssize_t *__pyx_v_suboffsets, int __pyx_v_ndim) { /* Scans suboffsets[0 .. ndim-1]; raises ValueError and returns -1 at the first entry >= 0, otherwise returns 0. */
+  Py_ssize_t __pyx_v_suboffset;
+  int __pyx_r;
+  Py_ssize_t *__pyx_t_1;
+  Py_ssize_t *__pyx_t_2;
+  Py_ssize_t *__pyx_t_3;
+  int __pyx_t_4;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+
+  /* "View.MemoryView":701
+ * 
+ * cdef int assert_direct_dimensions(Py_ssize_t *suboffsets, int ndim) except -1:
+ *     for suboffset in suboffsets[:ndim]:             # <<<<<<<<<<<<<<
+ *         if suboffset >= 0:
+ *             raise ValueError, "Indirect dimensions not supported"
+ */
+  __pyx_t_2 = (__pyx_v_suboffsets + __pyx_v_ndim); /* one-past-the-end pointer bounding the loop */
+  for (__pyx_t_3 = __pyx_v_suboffsets; __pyx_t_3 < __pyx_t_2; __pyx_t_3++) {
+    __pyx_t_1 = __pyx_t_3;
+    __pyx_v_suboffset = (__pyx_t_1[0]);
+
+    /* "View.MemoryView":702
+ * cdef int assert_direct_dimensions(Py_ssize_t *suboffsets, int ndim) except -1:
+ *     for suboffset in suboffsets[:ndim]:
+ *         if suboffset >= 0:             # <<<<<<<<<<<<<<
+ *             raise ValueError, "Indirect dimensions not supported"
+ *     return 0  # return type just used as an error flag
+ */
+    __pyx_t_4 = (__pyx_v_suboffset >= 0); /* only negative entries pass the check */
+    if (unlikely(__pyx_t_4)) {
+
+      /* "View.MemoryView":703
+ *     for suboffset in suboffsets[:ndim]:
+ *         if suboffset >= 0:
+ *             raise ValueError, "Indirect dimensions not supported"             # <<<<<<<<<<<<<<
+ *     return 0  # return type just used as an error flag
+ * 
+ */
+      __Pyx_Raise(__pyx_builtin_ValueError, __pyx_kp_s_Indirect_dimensions_not_supporte, 0, 0);
+      __PYX_ERR(1, 703, __pyx_L1_error)
+
+      /* "View.MemoryView":702
+ * cdef int assert_direct_dimensions(Py_ssize_t *suboffsets, int ndim) except -1:
+ *     for suboffset in suboffsets[:ndim]:
+ *         if suboffset >= 0:             # <<<<<<<<<<<<<<
+ *             raise ValueError, "Indirect dimensions not supported"
+ *     return 0  # return type just used as an error flag
+ */
+    }
+  }
+
+  /* "View.MemoryView":704
+ *         if suboffset >= 0:
+ *             raise ValueError, "Indirect dimensions not supported"
+ *     return 0  # return type just used as an error flag             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  __pyx_r = 0;
+  goto __pyx_L0;
+
+  /* "View.MemoryView":700
+ *     return have_slices or nslices, tuple(result)
+ * 
+ * cdef int assert_direct_dimensions(Py_ssize_t *suboffsets, int ndim) except -1:             # <<<<<<<<<<<<<<
+ *     for suboffset in suboffsets[:ndim]:
+ *         if suboffset >= 0:
+ */
+
+  /* function exit code: -1 is the error flag matching `except -1` in the echoed declaration */
+  __pyx_L1_error:;
+  __Pyx_AddTraceback("View.MemoryView.assert_direct_dimensions", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = -1;
+  __pyx_L0:;
+  return __pyx_r;
+}
+
+/* "View.MemoryView":711
+ * 
+ * @cname('__pyx_memview_slice')
+ * cdef memoryview memview_slice(memoryview memview, object indices):             # <<<<<<<<<<<<<<
+ *     cdef int new_ndim = 0, suboffset_dim = -1, dim
+ *     cdef bint negative_step
+ */
+/* Applies each entry of indices to memview in order (integer index, None, or an object with start/stop/step) and returns the result of memoryview_fromslice; returns NULL with an exception set on error. */
+static struct __pyx_memoryview_obj *__pyx_memview_slice(struct __pyx_memoryview_obj *__pyx_v_memview, PyObject *__pyx_v_indices) {
+  int __pyx_v_new_ndim;
+  int __pyx_v_suboffset_dim;
+  int __pyx_v_dim;
+  __Pyx_memviewslice __pyx_v_src;
+  __Pyx_memviewslice __pyx_v_dst;
+  __Pyx_memviewslice *__pyx_v_p_src;
+  struct __pyx_memoryviewslice_obj *__pyx_v_memviewsliceobj = 0;
+  __Pyx_memviewslice *__pyx_v_p_dst;
+  int *__pyx_v_p_suboffset_dim;
+  Py_ssize_t __pyx_v_start;
+  Py_ssize_t __pyx_v_stop;
+  Py_ssize_t __pyx_v_step;
+  Py_ssize_t __pyx_v_cindex;
+  int __pyx_v_have_start;
+  int __pyx_v_have_stop;
+  int __pyx_v_have_step;
+  PyObject *__pyx_v_index = NULL;
+  struct __pyx_memoryview_obj *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  int __pyx_t_1;
+  PyObject *__pyx_t_2 = NULL;
+  struct __pyx_memoryview_obj *__pyx_t_3;
+  char *__pyx_t_4;
+  int __pyx_t_5;
+  Py_ssize_t __pyx_t_6;
+  PyObject *(*__pyx_t_7)(PyObject *);
+  PyObject *__pyx_t_8 = NULL;
+  Py_ssize_t __pyx_t_9;
+  int __pyx_t_10;
+  Py_ssize_t __pyx_t_11;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("memview_slice", 1);
+
+  /* "View.MemoryView":712
+ * @cname('__pyx_memview_slice')
+ * cdef memoryview memview_slice(memoryview memview, object indices):
+ *     cdef int new_ndim = 0, suboffset_dim = -1, dim             # <<<<<<<<<<<<<<
+ *     cdef bint negative_step
+ *     cdef __Pyx_memviewslice src, dst
+ */
+  __pyx_v_new_ndim = 0; /* number of dimensions written to dst so far */
+  __pyx_v_suboffset_dim = -1;
+
+  /* "View.MemoryView":719
+ * 
+ * 
+ *     memset(&dst, 0, sizeof(dst))             # <<<<<<<<<<<<<<
+ * 
+ *     cdef _memoryviewslice memviewsliceobj
+ */
+  (void)(memset((&__pyx_v_dst), 0, (sizeof(__pyx_v_dst)))); /* every byte of dst starts out zero */
+
+  /* "View.MemoryView":723
+ *     cdef _memoryviewslice memviewsliceobj
+ * 
+ *     assert memview.view.ndim > 0             # <<<<<<<<<<<<<<
+ * 
+ *     if isinstance(memview, _memoryviewslice):
+ */
+  #ifndef CYTHON_WITHOUT_ASSERTIONS
+  if (unlikely(__pyx_assertions_enabled())) {
+    __pyx_t_1 = (__pyx_v_memview->view.ndim > 0);
+    if (unlikely(!__pyx_t_1)) {
+      __Pyx_Raise(__pyx_builtin_AssertionError, 0, 0, 0);
+      __PYX_ERR(1, 723, __pyx_L1_error)
+    }
+  }
+  #else
+  if ((1)); else __PYX_ERR(1, 723, __pyx_L1_error) /* assertions compiled out: the condition is constant, so the error branch is never taken */
+  #endif
+
+  /* "View.MemoryView":725
+ *     assert memview.view.ndim > 0
+ * 
+ *     if isinstance(memview, _memoryviewslice):             # <<<<<<<<<<<<<<
+ *         memviewsliceobj = memview
+ *         p_src = &memviewsliceobj.from_slice
+ */
+  __pyx_t_1 = __Pyx_TypeCheck(((PyObject *)__pyx_v_memview), __pyx_memoryviewslice_type); 
+  if (__pyx_t_1) {
+
+    /* "View.MemoryView":726
+ * 
+ *     if isinstance(memview, _memoryviewslice):
+ *         memviewsliceobj = memview             # <<<<<<<<<<<<<<
+ *         p_src = &memviewsliceobj.from_slice
+ *     else:
+ */
+    if (!(likely(((((PyObject *)__pyx_v_memview)) == Py_None) || likely(__Pyx_TypeTest(((PyObject *)__pyx_v_memview), __pyx_memoryviewslice_type))))) __PYX_ERR(1, 726, __pyx_L1_error)
+    __pyx_t_2 = ((PyObject *)__pyx_v_memview);
+    __Pyx_INCREF(__pyx_t_2); /* memviewsliceobj holds its own reference; it is released at __pyx_L0 */
+    __pyx_v_memviewsliceobj = ((struct __pyx_memoryviewslice_obj *)__pyx_t_2);
+    __pyx_t_2 = 0;
+
+    /* "View.MemoryView":727
+ *     if isinstance(memview, _memoryviewslice):
+ *         memviewsliceobj = memview
+ *         p_src = &memviewsliceobj.from_slice             # <<<<<<<<<<<<<<
+ *     else:
+ *         slice_copy(memview, &src)
+ */
+    __pyx_v_p_src = (&__pyx_v_memviewsliceobj->from_slice); /* source slice is the one stored in the _memoryviewslice object */
+
+    /* "View.MemoryView":725
+ *     assert memview.view.ndim > 0
+ * 
+ *     if isinstance(memview, _memoryviewslice):             # <<<<<<<<<<<<<<
+ *         memviewsliceobj = memview
+ *         p_src = &memviewsliceobj.from_slice
+ */
+    goto __pyx_L3;
+  }
+
+  /* "View.MemoryView":729
+ *         p_src = &memviewsliceobj.from_slice
+ *     else:
+ *         slice_copy(memview, &src)             # <<<<<<<<<<<<<<
+ *         p_src = &src
+ * 
+ */
+  /*else*/ {
+    __pyx_memoryview_slice_copy(__pyx_v_memview, (&__pyx_v_src)); /* helper defined elsewhere; presumably fills the local src from memview - confirm against slice_copy */
+
+    /* "View.MemoryView":730
+ *     else:
+ *         slice_copy(memview, &src)
+ *         p_src = &src             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+    __pyx_v_p_src = (&__pyx_v_src);
+  }
+  __pyx_L3:;
+
+  /* "View.MemoryView":736
+ * 
+ * 
+ *     dst.memview = p_src.memview             # <<<<<<<<<<<<<<
+ *     dst.data = p_src.data
+ * 
+ */
+  __pyx_t_3 = __pyx_v_p_src->memview;
+  __pyx_v_dst.memview = __pyx_t_3; /* plain pointer copy; no reference count is changed here */
+
+  /* "View.MemoryView":737
+ * 
+ *     dst.memview = p_src.memview
+ *     dst.data = p_src.data             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  __pyx_t_4 = __pyx_v_p_src->data;
+  __pyx_v_dst.data = __pyx_t_4;
+
+  /* "View.MemoryView":742
+ * 
+ * 
+ *     cdef __Pyx_memviewslice *p_dst = &dst             # <<<<<<<<<<<<<<
+ *     cdef int *p_suboffset_dim = &suboffset_dim
+ *     cdef Py_ssize_t start, stop, step, cindex
+ */
+  __pyx_v_p_dst = (&__pyx_v_dst);
+
+  /* "View.MemoryView":743
+ * 
+ *     cdef __Pyx_memviewslice *p_dst = &dst
+ *     cdef int *p_suboffset_dim = &suboffset_dim             # <<<<<<<<<<<<<<
+ *     cdef Py_ssize_t start, stop, step, cindex
+ *     cdef bint have_start, have_stop, have_step
+ */
+  __pyx_v_p_suboffset_dim = (&__pyx_v_suboffset_dim); /* passed by pointer to slice_memviewslice on every call below */
+
+  /* "View.MemoryView":747
+ *     cdef bint have_start, have_stop, have_step
+ * 
+ *     for dim, index in enumerate(indices):             # <<<<<<<<<<<<<<
+ *         if PyIndex_Check(index):
+ *             cindex = index
+ */
+  __pyx_t_5 = 0; /* enumerate() counter; copied into dim at the top of each iteration */
+  if (likely(PyList_CheckExact(__pyx_v_indices)) || PyTuple_CheckExact(__pyx_v_indices)) {
+    __pyx_t_2 = __pyx_v_indices; __Pyx_INCREF(__pyx_t_2);
+    __pyx_t_6 = 0; /* positional cursor for the exact list/tuple path */
+    __pyx_t_7 = NULL; /* NULL next-function selects the indexed list/tuple path in the loop below */
+  } else {
+    __pyx_t_6 = -1; __pyx_t_2 = PyObject_GetIter(__pyx_v_indices); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 747, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_2);
+    __pyx_t_7 = __Pyx_PyObject_GetIterNextFunc(__pyx_t_2); if (unlikely(!__pyx_t_7)) __PYX_ERR(1, 747, __pyx_L1_error) /* any other iterable: items are fetched by calling this function */
+  }
+  for (;;) {
+    if (likely(!__pyx_t_7)) {
+      if (likely(PyList_CheckExact(__pyx_t_2))) {
+        {
+          Py_ssize_t __pyx_temp = __Pyx_PyList_GET_SIZE(__pyx_t_2);
+          #if !CYTHON_ASSUME_SAFE_MACROS
+          if (unlikely((__pyx_temp < 0))) __PYX_ERR(1, 747, __pyx_L1_error)
+          #endif
+          if (__pyx_t_6 >= __pyx_temp) break;
+        }
+        #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS
+        __pyx_t_8 = PyList_GET_ITEM(__pyx_t_2, __pyx_t_6); __Pyx_INCREF(__pyx_t_8); __pyx_t_6++; if (unlikely((0 < 0))) __PYX_ERR(1, 747, __pyx_L1_error)
+        #else
+        __pyx_t_8 = __Pyx_PySequence_ITEM(__pyx_t_2, __pyx_t_6); __pyx_t_6++; if (unlikely(!__pyx_t_8)) __PYX_ERR(1, 747, __pyx_L1_error)
+        __Pyx_GOTREF(__pyx_t_8);
+        #endif
+      } else {
+        {
+          Py_ssize_t __pyx_temp = __Pyx_PyTuple_GET_SIZE(__pyx_t_2);
+          #if !CYTHON_ASSUME_SAFE_MACROS
+          if (unlikely((__pyx_temp < 0))) __PYX_ERR(1, 747, __pyx_L1_error)
+          #endif
+          if (__pyx_t_6 >= __pyx_temp) break;
+        }
+        #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS
+        __pyx_t_8 = PyTuple_GET_ITEM(__pyx_t_2, __pyx_t_6); __Pyx_INCREF(__pyx_t_8); __pyx_t_6++; if (unlikely((0 < 0))) __PYX_ERR(1, 747, __pyx_L1_error)
+        #else
+        __pyx_t_8 = __Pyx_PySequence_ITEM(__pyx_t_2, __pyx_t_6); __pyx_t_6++; if (unlikely(!__pyx_t_8)) __PYX_ERR(1, 747, __pyx_L1_error)
+        __Pyx_GOTREF(__pyx_t_8);
+        #endif
+      }
+    } else {
+      __pyx_t_8 = __pyx_t_7(__pyx_t_2);
+      if (unlikely(!__pyx_t_8)) {
+        PyObject* exc_type = PyErr_Occurred();
+        if (exc_type) {
+          if (likely(__Pyx_PyErr_GivenExceptionMatches(exc_type, PyExc_StopIteration))) PyErr_Clear(); /* StopIteration just ends the loop */
+          else __PYX_ERR(1, 747, __pyx_L1_error)
+        }
+        break;
+      }
+      __Pyx_GOTREF(__pyx_t_8);
+    }
+    __Pyx_XDECREF_SET(__pyx_v_index, __pyx_t_8);
+    __pyx_t_8 = 0;
+    __pyx_v_dim = __pyx_t_5; /* dim = position of this entry within indices */
+    __pyx_t_5 = (__pyx_t_5 + 1);
+
+    /* "View.MemoryView":748
+ * 
+ *     for dim, index in enumerate(indices):
+ *         if PyIndex_Check(index):             # <<<<<<<<<<<<<<
+ *             cindex = index
+ *             slice_memviewslice(
+ */
+    __pyx_t_1 = (PyIndex_Check(__pyx_v_index) != 0);
+    if (__pyx_t_1) {
+
+      /* "View.MemoryView":749
+ *     for dim, index in enumerate(indices):
+ *         if PyIndex_Check(index):
+ *             cindex = index             # <<<<<<<<<<<<<<
+ *             slice_memviewslice(
+ *                 p_dst, p_src.shape[dim], p_src.strides[dim], p_src.suboffsets[dim],
+ */
+      __pyx_t_9 = __Pyx_PyIndex_AsSsize_t(__pyx_v_index); if (unlikely((__pyx_t_9 == (Py_ssize_t)-1) && PyErr_Occurred())) __PYX_ERR(1, 749, __pyx_L1_error)
+      __pyx_v_cindex = __pyx_t_9;
+
+      /* "View.MemoryView":750
+ *         if PyIndex_Check(index):
+ *             cindex = index
+ *             slice_memviewslice(             # <<<<<<<<<<<<<<
+ *                 p_dst, p_src.shape[dim], p_src.strides[dim], p_src.suboffsets[dim],
+ *                 dim, new_ndim, p_suboffset_dim,
+ */
+      __pyx_t_10 = __pyx_memoryview_slice_memviewslice(__pyx_v_p_dst, (__pyx_v_p_src->shape[__pyx_v_dim]), (__pyx_v_p_src->strides[__pyx_v_dim]), (__pyx_v_p_src->suboffsets[__pyx_v_dim]), __pyx_v_dim, __pyx_v_new_ndim, __pyx_v_p_suboffset_dim, __pyx_v_cindex, 0, 0, 0, 0, 0, 0); if (unlikely(__pyx_t_10 == ((int)-1))) __PYX_ERR(1, 750, __pyx_L1_error)
+      /* integer index: cindex is passed as start, the final is_slice argument is 0, and new_ndim is not advanced in this branch */
+      /* "View.MemoryView":748
+ * 
+ *     for dim, index in enumerate(indices):
+ *         if PyIndex_Check(index):             # <<<<<<<<<<<<<<
+ *             cindex = index
+ *             slice_memviewslice(
+ */
+      goto __pyx_L6;
+    }
+
+    /* "View.MemoryView":756
+ *                 0, 0, 0, # have_{start,stop,step}
+ *                 False)
+ *         elif index is None:             # <<<<<<<<<<<<<<
+ *             p_dst.shape[new_ndim] = 1
+ *             p_dst.strides[new_ndim] = 0
+ */
+    __pyx_t_1 = (__pyx_v_index == Py_None);
+    if (__pyx_t_1) {
+
+      /* "View.MemoryView":757
+ *                 False)
+ *         elif index is None:
+ *             p_dst.shape[new_ndim] = 1             # <<<<<<<<<<<<<<
+ *             p_dst.strides[new_ndim] = 0
+ *             p_dst.suboffsets[new_ndim] = -1
+ */
+      (__pyx_v_p_dst->shape[__pyx_v_new_ndim]) = 1; /* None entry: add an output dimension of extent 1 without reading p_src */
+
+      /* "View.MemoryView":758
+ *         elif index is None:
+ *             p_dst.shape[new_ndim] = 1
+ *             p_dst.strides[new_ndim] = 0             # <<<<<<<<<<<<<<
+ *             p_dst.suboffsets[new_ndim] = -1
+ *             new_ndim += 1
+ */
+      (__pyx_v_p_dst->strides[__pyx_v_new_ndim]) = 0;
+
+      /* "View.MemoryView":759
+ *             p_dst.shape[new_ndim] = 1
+ *             p_dst.strides[new_ndim] = 0
+ *             p_dst.suboffsets[new_ndim] = -1             # <<<<<<<<<<<<<<
+ *             new_ndim += 1
+ *         else:
+ */
+      (__pyx_v_p_dst->suboffsets[__pyx_v_new_ndim]) = -1L;
+
+      /* "View.MemoryView":760
+ *             p_dst.strides[new_ndim] = 0
+ *             p_dst.suboffsets[new_ndim] = -1
+ *             new_ndim += 1             # <<<<<<<<<<<<<<
+ *         else:
+ *             start = index.start or 0
+ */
+      __pyx_v_new_ndim = (__pyx_v_new_ndim + 1);
+
+      /* "View.MemoryView":756
+ *                 0, 0, 0, # have_{start,stop,step}
+ *                 False)
+ *         elif index is None:             # <<<<<<<<<<<<<<
+ *             p_dst.shape[new_ndim] = 1
+ *             p_dst.strides[new_ndim] = 0
+ */
+      goto __pyx_L6;
+    }
+
+    /* "View.MemoryView":762
+ *             new_ndim += 1
+ *         else:
+ *             start = index.start or 0             # <<<<<<<<<<<<<<
+ *             stop = index.stop or 0
+ *             step = index.step or 0
+ */
+    /*else*/ {
+      __pyx_t_8 = __Pyx_PyObject_GetAttrStr(__pyx_v_index, __pyx_n_s_start); if (unlikely(!__pyx_t_8)) __PYX_ERR(1, 762, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_8);
+      __pyx_t_1 = __Pyx_PyObject_IsTrue(__pyx_t_8); if (unlikely((__pyx_t_1 < 0))) __PYX_ERR(1, 762, __pyx_L1_error)
+      if (!__pyx_t_1) {
+        __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0;
+      } else {
+        __pyx_t_11 = __Pyx_PyIndex_AsSsize_t(__pyx_t_8); if (unlikely((__pyx_t_11 == (Py_ssize_t)-1) && PyErr_Occurred())) __PYX_ERR(1, 762, __pyx_L1_error)
+        __pyx_t_9 = __pyx_t_11;
+        __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0;
+        goto __pyx_L7_bool_binop_done;
+      }
+      __pyx_t_9 = 0; /* a falsy index.start (e.g. None or 0) takes the "or 0" default */
+      __pyx_L7_bool_binop_done:;
+      __pyx_v_start = __pyx_t_9;
+
+      /* "View.MemoryView":763
+ *         else:
+ *             start = index.start or 0
+ *             stop = index.stop or 0             # <<<<<<<<<<<<<<
+ *             step = index.step or 0
+ * 
+ */
+      __pyx_t_8 = __Pyx_PyObject_GetAttrStr(__pyx_v_index, __pyx_n_s_stop); if (unlikely(!__pyx_t_8)) __PYX_ERR(1, 763, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_8);
+      __pyx_t_1 = __Pyx_PyObject_IsTrue(__pyx_t_8); if (unlikely((__pyx_t_1 < 0))) __PYX_ERR(1, 763, __pyx_L1_error)
+      if (!__pyx_t_1) {
+        __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0;
+      } else {
+        __pyx_t_11 = __Pyx_PyIndex_AsSsize_t(__pyx_t_8); if (unlikely((__pyx_t_11 == (Py_ssize_t)-1) && PyErr_Occurred())) __PYX_ERR(1, 763, __pyx_L1_error)
+        __pyx_t_9 = __pyx_t_11;
+        __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0;
+        goto __pyx_L9_bool_binop_done;
+      }
+      __pyx_t_9 = 0; /* same "or 0" default for a falsy index.stop */
+      __pyx_L9_bool_binop_done:;
+      __pyx_v_stop = __pyx_t_9;
+
+      /* "View.MemoryView":764
+ *             start = index.start or 0
+ *             stop = index.stop or 0
+ *             step = index.step or 0             # <<<<<<<<<<<<<<
+ * 
+ *             have_start = index.start is not None
+ */
+      __pyx_t_8 = __Pyx_PyObject_GetAttrStr(__pyx_v_index, __pyx_n_s_step); if (unlikely(!__pyx_t_8)) __PYX_ERR(1, 764, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_8);
+      __pyx_t_1 = __Pyx_PyObject_IsTrue(__pyx_t_8); if (unlikely((__pyx_t_1 < 0))) __PYX_ERR(1, 764, __pyx_L1_error)
+      if (!__pyx_t_1) {
+        __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0;
+      } else {
+        __pyx_t_11 = __Pyx_PyIndex_AsSsize_t(__pyx_t_8); if (unlikely((__pyx_t_11 == (Py_ssize_t)-1) && PyErr_Occurred())) __PYX_ERR(1, 764, __pyx_L1_error)
+        __pyx_t_9 = __pyx_t_11;
+        __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0;
+        goto __pyx_L11_bool_binop_done;
+      }
+      __pyx_t_9 = 0; /* same "or 0" default for a falsy index.step */
+      __pyx_L11_bool_binop_done:;
+      __pyx_v_step = __pyx_t_9;
+
+      /* "View.MemoryView":766
+ *             step = index.step or 0
+ * 
+ *             have_start = index.start is not None             # <<<<<<<<<<<<<<
+ *             have_stop = index.stop is not None
+ *             have_step = index.step is not None
+ */
+      __pyx_t_8 = __Pyx_PyObject_GetAttrStr(__pyx_v_index, __pyx_n_s_start); if (unlikely(!__pyx_t_8)) __PYX_ERR(1, 766, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_8);
+      __pyx_t_1 = (__pyx_t_8 != Py_None); /* the have_* flags record whether each attribute was given, independent of its value */
+      __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0;
+      __pyx_v_have_start = __pyx_t_1;
+
+      /* "View.MemoryView":767
+ * 
+ *             have_start = index.start is not None
+ *             have_stop = index.stop is not None             # <<<<<<<<<<<<<<
+ *             have_step = index.step is not None
+ * 
+ */
+      __pyx_t_8 = __Pyx_PyObject_GetAttrStr(__pyx_v_index, __pyx_n_s_stop); if (unlikely(!__pyx_t_8)) __PYX_ERR(1, 767, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_8);
+      __pyx_t_1 = (__pyx_t_8 != Py_None);
+      __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0;
+      __pyx_v_have_stop = __pyx_t_1;
+
+      /* "View.MemoryView":768
+ *             have_start = index.start is not None
+ *             have_stop = index.stop is not None
+ *             have_step = index.step is not None             # <<<<<<<<<<<<<<
+ * 
+ *             slice_memviewslice(
+ */
+      __pyx_t_8 = __Pyx_PyObject_GetAttrStr(__pyx_v_index, __pyx_n_s_step); if (unlikely(!__pyx_t_8)) __PYX_ERR(1, 768, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_8);
+      __pyx_t_1 = (__pyx_t_8 != Py_None);
+      __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0;
+      __pyx_v_have_step = __pyx_t_1;
+
+      /* "View.MemoryView":770
+ *             have_step = index.step is not None
+ * 
+ *             slice_memviewslice(             # <<<<<<<<<<<<<<
+ *                 p_dst, p_src.shape[dim], p_src.strides[dim], p_src.suboffsets[dim],
+ *                 dim, new_ndim, p_suboffset_dim,
+ */
+      __pyx_t_10 = __pyx_memoryview_slice_memviewslice(__pyx_v_p_dst, (__pyx_v_p_src->shape[__pyx_v_dim]), (__pyx_v_p_src->strides[__pyx_v_dim]), (__pyx_v_p_src->suboffsets[__pyx_v_dim]), __pyx_v_dim, __pyx_v_new_ndim, __pyx_v_p_suboffset_dim, __pyx_v_start, __pyx_v_stop, __pyx_v_step, __pyx_v_have_start, __pyx_v_have_stop, __pyx_v_have_step, 1); if (unlikely(__pyx_t_10 == ((int)-1))) __PYX_ERR(1, 770, __pyx_L1_error)
+      /* slice-like entry: the final is_slice argument is 1, and new_ndim is advanced right after the call */
+      /* "View.MemoryView":776
+ *                 have_start, have_stop, have_step,
+ *                 True)
+ *             new_ndim += 1             # <<<<<<<<<<<<<<
+ * 
+ *     if isinstance(memview, _memoryviewslice):
+ */
+      __pyx_v_new_ndim = (__pyx_v_new_ndim + 1);
+    }
+    __pyx_L6:;
+
+    /* "View.MemoryView":747
+ *     cdef bint have_start, have_stop, have_step
+ * 
+ *     for dim, index in enumerate(indices):             # <<<<<<<<<<<<<<
+ *         if PyIndex_Check(index):
+ *             cindex = index
+ */
+  }
+  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; /* loop finished: drop the reference to the list/tuple or iterator */
+
+  /* "View.MemoryView":778
+ *             new_ndim += 1
+ * 
+ *     if isinstance(memview, _memoryviewslice):             # <<<<<<<<<<<<<<
+ *         return memoryview_fromslice(dst, new_ndim,
+ *                                     memviewsliceobj.to_object_func,
+ */
+  __pyx_t_1 = __Pyx_TypeCheck(((PyObject *)__pyx_v_memview), __pyx_memoryviewslice_type); 
+  if (__pyx_t_1) {
+
+    /* "View.MemoryView":779
+ * 
+ *     if isinstance(memview, _memoryviewslice):
+ *         return memoryview_fromslice(dst, new_ndim,             # <<<<<<<<<<<<<<
+ *                                     memviewsliceobj.to_object_func,
+ *                                     memviewsliceobj.to_dtype_func,
+ */
+    __Pyx_XDECREF((PyObject *)__pyx_r);
+
+    /* "View.MemoryView":780
+ *     if isinstance(memview, _memoryviewslice):
+ *         return memoryview_fromslice(dst, new_ndim,
+ *                                     memviewsliceobj.to_object_func,             # <<<<<<<<<<<<<<
+ *                                     memviewsliceobj.to_dtype_func,
+ *                                     memview.dtype_is_object)
+ */
+    if (unlikely(!__pyx_v_memviewsliceobj)) { __Pyx_RaiseUnboundLocalError("memviewsliceobj"); __PYX_ERR(1, 780, __pyx_L1_error) } /* generated guard: memviewsliceobj is only assigned in the first isinstance branch above */
+
+    /* "View.MemoryView":781
+ *         return memoryview_fromslice(dst, new_ndim,
+ *                                     memviewsliceobj.to_object_func,
+ *                                     memviewsliceobj.to_dtype_func,             # <<<<<<<<<<<<<<
+ *                                     memview.dtype_is_object)
+ *     else:
+ */
+    if (unlikely(!__pyx_v_memviewsliceobj)) { __Pyx_RaiseUnboundLocalError("memviewsliceobj"); __PYX_ERR(1, 781, __pyx_L1_error) }
+
+    /* "View.MemoryView":779
+ * 
+ *     if isinstance(memview, _memoryviewslice):
+ *         return memoryview_fromslice(dst, new_ndim,             # <<<<<<<<<<<<<<
+ *                                     memviewsliceobj.to_object_func,
+ *                                     memviewsliceobj.to_dtype_func,
+ */
+    __pyx_t_2 = __pyx_memoryview_fromslice(__pyx_v_dst, __pyx_v_new_ndim, __pyx_v_memviewsliceobj->to_object_func, __pyx_v_memviewsliceobj->to_dtype_func, __pyx_v_memview->dtype_is_object); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 779, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_2); /* slice source: the result reuses the source object's to_object_func/to_dtype_func */
+    if (!(likely(((__pyx_t_2) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_2, __pyx_memoryview_type))))) __PYX_ERR(1, 779, __pyx_L1_error)
+    __pyx_r = ((struct __pyx_memoryview_obj *)__pyx_t_2);
+    __pyx_t_2 = 0;
+    goto __pyx_L0;
+
+    /* "View.MemoryView":778
+ *             new_ndim += 1
+ * 
+ *     if isinstance(memview, _memoryviewslice):             # <<<<<<<<<<<<<<
+ *         return memoryview_fromslice(dst, new_ndim,
+ *                                     memviewsliceobj.to_object_func,
+ */
+  }
+
+  /* "View.MemoryView":784
+ *                                     memview.dtype_is_object)
+ *     else:
+ *         return memoryview_fromslice(dst, new_ndim, NULL, NULL,             # <<<<<<<<<<<<<<
+ *                                     memview.dtype_is_object)
+ * 
+ */
+  /*else*/ {
+    __Pyx_XDECREF((PyObject *)__pyx_r);
+
+    /* "View.MemoryView":785
+ *     else:
+ *         return memoryview_fromslice(dst, new_ndim, NULL, NULL,
+ *                                     memview.dtype_is_object)             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+    __pyx_t_2 = __pyx_memoryview_fromslice(__pyx_v_dst, __pyx_v_new_ndim, NULL, NULL, __pyx_v_memview->dtype_is_object); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 784, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_2); /* plain memoryview source: both conversion function pointers are passed as NULL */
+
+    /* "View.MemoryView":784
+ *                                     memview.dtype_is_object)
+ *     else:
+ *         return memoryview_fromslice(dst, new_ndim, NULL, NULL,             # <<<<<<<<<<<<<<
+ *                                     memview.dtype_is_object)
+ * 
+ */
+    if (!(likely(((__pyx_t_2) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_2, __pyx_memoryview_type))))) __PYX_ERR(1, 784, __pyx_L1_error)
+    __pyx_r = ((struct __pyx_memoryview_obj *)__pyx_t_2);
+    __pyx_t_2 = 0;
+    goto __pyx_L0;
+  }
+
+  /* "View.MemoryView":711
+ * 
+ * @cname('__pyx_memview_slice')
+ * cdef memoryview memview_slice(memoryview memview, object indices):             # <<<<<<<<<<<<<<
+ *     cdef int new_ndim = 0, suboffset_dim = -1, dim
+ *     cdef bint negative_step
+ */
+
+  /* function exit code */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_2); /* error path: release any temporaries still held */
+  __Pyx_XDECREF(__pyx_t_8);
+  __Pyx_AddTraceback("View.MemoryView.memview_slice", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = 0; /* error path returns NULL */
+  __pyx_L0:;
+  __Pyx_XDECREF((PyObject *)__pyx_v_memviewsliceobj); /* shared exit for success and error: release the object locals */
+  __Pyx_XDECREF(__pyx_v_index);
+  __Pyx_XGIVEREF((PyObject *)__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":793
+ * 
+ * @cname('__pyx_memoryview_slice_memviewslice')
+ * cdef int slice_memviewslice(             # <<<<<<<<<<<<<<
+ *         __Pyx_memviewslice *dst,
+ *         Py_ssize_t shape, Py_ssize_t stride, Py_ssize_t suboffset,
+ */
+
+static int __pyx_memoryview_slice_memviewslice(__Pyx_memviewslice *__pyx_v_dst, Py_ssize_t __pyx_v_shape, Py_ssize_t __pyx_v_stride, Py_ssize_t __pyx_v_suboffset, int __pyx_v_dim, int __pyx_v_new_ndim, int *__pyx_v_suboffset_dim, Py_ssize_t __pyx_v_start, Py_ssize_t __pyx_v_stop, Py_ssize_t __pyx_v_step, int __pyx_v_have_start, int __pyx_v_have_stop, int __pyx_v_have_step, int __pyx_v_is_slice) {
+  Py_ssize_t __pyx_v_new_shape;
+  int __pyx_v_negative_step;
+  int __pyx_r;
+  int __pyx_t_1;
+  int __pyx_t_2;
+  int __pyx_t_3;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  #ifdef WITH_THREAD
+  PyGILState_STATE __pyx_gilstate_save;
+  #endif
+
+  /* "View.MemoryView":813
+ *     cdef bint negative_step
+ * 
+ *     if not is_slice:             # <<<<<<<<<<<<<<
+ * 
+ *         if start < 0:
+ */
+  __pyx_t_1 = (!__pyx_v_is_slice);
+  if (__pyx_t_1) {
+
+    /* "View.MemoryView":815
+ *     if not is_slice:
+ * 
+ *         if start < 0:             # <<<<<<<<<<<<<<
+ *             start += shape
+ *         if not 0 <= start < shape:
+ */
+    __pyx_t_1 = (__pyx_v_start < 0);
+    if (__pyx_t_1) {
+
+      /* "View.MemoryView":816
+ * 
+ *         if start < 0:
+ *             start += shape             # <<<<<<<<<<<<<<
+ *         if not 0 <= start < shape:
+ *             _err_dim(PyExc_IndexError, "Index out of bounds (axis %d)", dim)
+ */
+      __pyx_v_start = (__pyx_v_start + __pyx_v_shape);
+
+      /* "View.MemoryView":815
+ *     if not is_slice:
+ * 
+ *         if start < 0:             # <<<<<<<<<<<<<<
+ *             start += shape
+ *         if not 0 <= start < shape:
+ */
+    }
+
+    /* "View.MemoryView":817
+ *         if start < 0:
+ *             start += shape
+ *         if not 0 <= start < shape:             # <<<<<<<<<<<<<<
+ *             _err_dim(PyExc_IndexError, "Index out of bounds (axis %d)", dim)
+ *     else:
+ */
+    __pyx_t_1 = (0 <= __pyx_v_start);
+    if (__pyx_t_1) {
+      __pyx_t_1 = (__pyx_v_start < __pyx_v_shape);
+    }
+    __pyx_t_2 = (!__pyx_t_1);
+    if (__pyx_t_2) {
+
+      /* "View.MemoryView":818
+ *             start += shape
+ *         if not 0 <= start < shape:
+ *             _err_dim(PyExc_IndexError, "Index out of bounds (axis %d)", dim)             # <<<<<<<<<<<<<<
+ *     else:
+ * 
+ */
+      __pyx_t_3 = __pyx_memoryview_err_dim(PyExc_IndexError, __pyx_kp_s_Index_out_of_bounds_axis_d, __pyx_v_dim); if (unlikely(__pyx_t_3 == ((int)-1))) __PYX_ERR(1, 818, __pyx_L1_error)
+
+      /* "View.MemoryView":817
+ *         if start < 0:
+ *             start += shape
+ *         if not 0 <= start < shape:             # <<<<<<<<<<<<<<
+ *             _err_dim(PyExc_IndexError, "Index out of bounds (axis %d)", dim)
+ *     else:
+ */
+    }
+
+    /* "View.MemoryView":813
+ *     cdef bint negative_step
+ * 
+ *     if not is_slice:             # <<<<<<<<<<<<<<
+ * 
+ *         if start < 0:
+ */
+    goto __pyx_L3;
+  }
+
+  /* "View.MemoryView":821
+ *     else:
+ * 
+ *         if have_step:             # <<<<<<<<<<<<<<
+ *             negative_step = step < 0
+ *             if step == 0:
+ */
+  /*else*/ {
+    __pyx_t_2 = (__pyx_v_have_step != 0);
+    if (__pyx_t_2) {
+
+      /* "View.MemoryView":822
+ * 
+ *         if have_step:
+ *             negative_step = step < 0             # <<<<<<<<<<<<<<
+ *             if step == 0:
+ *                 _err_dim(PyExc_ValueError, "Step may not be zero (axis %d)", dim)
+ */
+      __pyx_v_negative_step = (__pyx_v_step < 0);
+
+      /* "View.MemoryView":823
+ *         if have_step:
+ *             negative_step = step < 0
+ *             if step == 0:             # <<<<<<<<<<<<<<
+ *                 _err_dim(PyExc_ValueError, "Step may not be zero (axis %d)", dim)
+ *         else:
+ */
+      __pyx_t_2 = (__pyx_v_step == 0);
+      if (__pyx_t_2) {
+
+        /* "View.MemoryView":824
+ *             negative_step = step < 0
+ *             if step == 0:
+ *                 _err_dim(PyExc_ValueError, "Step may not be zero (axis %d)", dim)             # <<<<<<<<<<<<<<
+ *         else:
+ *             negative_step = False
+ */
+        __pyx_t_3 = __pyx_memoryview_err_dim(PyExc_ValueError, __pyx_kp_s_Step_may_not_be_zero_axis_d, __pyx_v_dim); if (unlikely(__pyx_t_3 == ((int)-1))) __PYX_ERR(1, 824, __pyx_L1_error)
+
+        /* "View.MemoryView":823
+ *         if have_step:
+ *             negative_step = step < 0
+ *             if step == 0:             # <<<<<<<<<<<<<<
+ *                 _err_dim(PyExc_ValueError, "Step may not be zero (axis %d)", dim)
+ *         else:
+ */
+      }
+
+      /* "View.MemoryView":821
+ *     else:
+ * 
+ *         if have_step:             # <<<<<<<<<<<<<<
+ *             negative_step = step < 0
+ *             if step == 0:
+ */
+      goto __pyx_L6;
+    }
+
+    /* "View.MemoryView":826
+ *                 _err_dim(PyExc_ValueError, "Step may not be zero (axis %d)", dim)
+ *         else:
+ *             negative_step = False             # <<<<<<<<<<<<<<
+ *             step = 1
+ * 
+ */
+    /*else*/ {
+      __pyx_v_negative_step = 0;
+
+      /* "View.MemoryView":827
+ *         else:
+ *             negative_step = False
+ *             step = 1             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+      __pyx_v_step = 1;
+    }
+    __pyx_L6:;
+
+    /* "View.MemoryView":830
+ * 
+ * 
+ *         if have_start:             # <<<<<<<<<<<<<<
+ *             if start < 0:
+ *                 start += shape
+ */
+    __pyx_t_2 = (__pyx_v_have_start != 0);
+    if (__pyx_t_2) {
+
+      /* "View.MemoryView":831
+ * 
+ *         if have_start:
+ *             if start < 0:             # <<<<<<<<<<<<<<
+ *                 start += shape
+ *                 if start < 0:
+ */
+      __pyx_t_2 = (__pyx_v_start < 0);
+      if (__pyx_t_2) {
+
+        /* "View.MemoryView":832
+ *         if have_start:
+ *             if start < 0:
+ *                 start += shape             # <<<<<<<<<<<<<<
+ *                 if start < 0:
+ *                     start = 0
+ */
+        __pyx_v_start = (__pyx_v_start + __pyx_v_shape);
+
+        /* "View.MemoryView":833
+ *             if start < 0:
+ *                 start += shape
+ *                 if start < 0:             # <<<<<<<<<<<<<<
+ *                     start = 0
+ *             elif start >= shape:
+ */
+        __pyx_t_2 = (__pyx_v_start < 0);
+        if (__pyx_t_2) {
+
+          /* "View.MemoryView":834
+ *                 start += shape
+ *                 if start < 0:
+ *                     start = 0             # <<<<<<<<<<<<<<
+ *             elif start >= shape:
+ *                 if negative_step:
+ */
+          __pyx_v_start = 0;
+
+          /* "View.MemoryView":833
+ *             if start < 0:
+ *                 start += shape
+ *                 if start < 0:             # <<<<<<<<<<<<<<
+ *                     start = 0
+ *             elif start >= shape:
+ */
+        }
+
+        /* "View.MemoryView":831
+ * 
+ *         if have_start:
+ *             if start < 0:             # <<<<<<<<<<<<<<
+ *                 start += shape
+ *                 if start < 0:
+ */
+        goto __pyx_L9;
+      }
+
+      /* "View.MemoryView":835
+ *                 if start < 0:
+ *                     start = 0
+ *             elif start >= shape:             # <<<<<<<<<<<<<<
+ *                 if negative_step:
+ *                     start = shape - 1
+ */
+      __pyx_t_2 = (__pyx_v_start >= __pyx_v_shape);
+      if (__pyx_t_2) {
+
+        /* "View.MemoryView":836
+ *                     start = 0
+ *             elif start >= shape:
+ *                 if negative_step:             # <<<<<<<<<<<<<<
+ *                     start = shape - 1
+ *                 else:
+ */
+        if (__pyx_v_negative_step) {
+
+          /* "View.MemoryView":837
+ *             elif start >= shape:
+ *                 if negative_step:
+ *                     start = shape - 1             # <<<<<<<<<<<<<<
+ *                 else:
+ *                     start = shape
+ */
+          __pyx_v_start = (__pyx_v_shape - 1);
+
+          /* "View.MemoryView":836
+ *                     start = 0
+ *             elif start >= shape:
+ *                 if negative_step:             # <<<<<<<<<<<<<<
+ *                     start = shape - 1
+ *                 else:
+ */
+          goto __pyx_L11;
+        }
+
+        /* "View.MemoryView":839
+ *                     start = shape - 1
+ *                 else:
+ *                     start = shape             # <<<<<<<<<<<<<<
+ *         else:
+ *             if negative_step:
+ */
+        /*else*/ {
+          __pyx_v_start = __pyx_v_shape;
+        }
+        __pyx_L11:;
+
+        /* "View.MemoryView":835
+ *                 if start < 0:
+ *                     start = 0
+ *             elif start >= shape:             # <<<<<<<<<<<<<<
+ *                 if negative_step:
+ *                     start = shape - 1
+ */
+      }
+      __pyx_L9:;
+
+      /* "View.MemoryView":830
+ * 
+ * 
+ *         if have_start:             # <<<<<<<<<<<<<<
+ *             if start < 0:
+ *                 start += shape
+ */
+      goto __pyx_L8;
+    }
+
+    /* "View.MemoryView":841
+ *                     start = shape
+ *         else:
+ *             if negative_step:             # <<<<<<<<<<<<<<
+ *                 start = shape - 1
+ *             else:
+ */
+    /*else*/ {
+      if (__pyx_v_negative_step) {
+
+        /* "View.MemoryView":842
+ *         else:
+ *             if negative_step:
+ *                 start = shape - 1             # <<<<<<<<<<<<<<
+ *             else:
+ *                 start = 0
+ */
+        __pyx_v_start = (__pyx_v_shape - 1);
+
+        /* "View.MemoryView":841
+ *                     start = shape
+ *         else:
+ *             if negative_step:             # <<<<<<<<<<<<<<
+ *                 start = shape - 1
+ *             else:
+ */
+        goto __pyx_L12;
+      }
+
+      /* "View.MemoryView":844
+ *                 start = shape - 1
+ *             else:
+ *                 start = 0             # <<<<<<<<<<<<<<
+ * 
+ *         if have_stop:
+ */
+      /*else*/ {
+        __pyx_v_start = 0;
+      }
+      __pyx_L12:;
+    }
+    __pyx_L8:;
+
+    /* "View.MemoryView":846
+ *                 start = 0
+ * 
+ *         if have_stop:             # <<<<<<<<<<<<<<
+ *             if stop < 0:
+ *                 stop += shape
+ */
+    __pyx_t_2 = (__pyx_v_have_stop != 0);
+    if (__pyx_t_2) {
+
+      /* "View.MemoryView":847
+ * 
+ *         if have_stop:
+ *             if stop < 0:             # <<<<<<<<<<<<<<
+ *                 stop += shape
+ *                 if stop < 0:
+ */
+      __pyx_t_2 = (__pyx_v_stop < 0);
+      if (__pyx_t_2) {
+
+        /* "View.MemoryView":848
+ *         if have_stop:
+ *             if stop < 0:
+ *                 stop += shape             # <<<<<<<<<<<<<<
+ *                 if stop < 0:
+ *                     stop = 0
+ */
+        __pyx_v_stop = (__pyx_v_stop + __pyx_v_shape);
+
+        /* "View.MemoryView":849
+ *             if stop < 0:
+ *                 stop += shape
+ *                 if stop < 0:             # <<<<<<<<<<<<<<
+ *                     stop = 0
+ *             elif stop > shape:
+ */
+        __pyx_t_2 = (__pyx_v_stop < 0);
+        if (__pyx_t_2) {
+
+          /* "View.MemoryView":850
+ *                 stop += shape
+ *                 if stop < 0:
+ *                     stop = 0             # <<<<<<<<<<<<<<
+ *             elif stop > shape:
+ *                 stop = shape
+ */
+          __pyx_v_stop = 0;
+
+          /* "View.MemoryView":849
+ *             if stop < 0:
+ *                 stop += shape
+ *                 if stop < 0:             # <<<<<<<<<<<<<<
+ *                     stop = 0
+ *             elif stop > shape:
+ */
+        }
+
+        /* "View.MemoryView":847
+ * 
+ *         if have_stop:
+ *             if stop < 0:             # <<<<<<<<<<<<<<
+ *                 stop += shape
+ *                 if stop < 0:
+ */
+        goto __pyx_L14;
+      }
+
+      /* "View.MemoryView":851
+ *                 if stop < 0:
+ *                     stop = 0
+ *             elif stop > shape:             # <<<<<<<<<<<<<<
+ *                 stop = shape
+ *         else:
+ */
+      __pyx_t_2 = (__pyx_v_stop > __pyx_v_shape);
+      if (__pyx_t_2) {
+
+        /* "View.MemoryView":852
+ *                     stop = 0
+ *             elif stop > shape:
+ *                 stop = shape             # <<<<<<<<<<<<<<
+ *         else:
+ *             if negative_step:
+ */
+        __pyx_v_stop = __pyx_v_shape;
+
+        /* "View.MemoryView":851
+ *                 if stop < 0:
+ *                     stop = 0
+ *             elif stop > shape:             # <<<<<<<<<<<<<<
+ *                 stop = shape
+ *         else:
+ */
+      }
+      __pyx_L14:;
+
+      /* "View.MemoryView":846
+ *                 start = 0
+ * 
+ *         if have_stop:             # <<<<<<<<<<<<<<
+ *             if stop < 0:
+ *                 stop += shape
+ */
+      goto __pyx_L13;
+    }
+
+    /* "View.MemoryView":854
+ *                 stop = shape
+ *         else:
+ *             if negative_step:             # <<<<<<<<<<<<<<
+ *                 stop = -1
+ *             else:
+ */
+    /*else*/ {
+      if (__pyx_v_negative_step) {
+
+        /* "View.MemoryView":855
+ *         else:
+ *             if negative_step:
+ *                 stop = -1             # <<<<<<<<<<<<<<
+ *             else:
+ *                 stop = shape
+ */
+        __pyx_v_stop = -1L;
+
+        /* "View.MemoryView":854
+ *                 stop = shape
+ *         else:
+ *             if negative_step:             # <<<<<<<<<<<<<<
+ *                 stop = -1
+ *             else:
+ */
+        goto __pyx_L16;
+      }
+
+      /* "View.MemoryView":857
+ *                 stop = -1
+ *             else:
+ *                 stop = shape             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+      /*else*/ {
+        __pyx_v_stop = __pyx_v_shape;
+      }
+      __pyx_L16:;
+    }
+    __pyx_L13:;
+
+    /* "View.MemoryView":861
+ * 
+ *         with cython.cdivision(True):
+ *             new_shape = (stop - start) // step             # <<<<<<<<<<<<<<
+ * 
+ *             if (stop - start) - step * new_shape:
+ */
+    __pyx_v_new_shape = ((__pyx_v_stop - __pyx_v_start) / __pyx_v_step);
+
+    /* "View.MemoryView":863
+ *             new_shape = (stop - start) // step
+ * 
+ *             if (stop - start) - step * new_shape:             # <<<<<<<<<<<<<<
+ *                 new_shape += 1
+ * 
+ */
+    __pyx_t_2 = (((__pyx_v_stop - __pyx_v_start) - (__pyx_v_step * __pyx_v_new_shape)) != 0);
+    if (__pyx_t_2) {
+
+      /* "View.MemoryView":864
+ * 
+ *             if (stop - start) - step * new_shape:
+ *                 new_shape += 1             # <<<<<<<<<<<<<<
+ * 
+ *         if new_shape < 0:
+ */
+      __pyx_v_new_shape = (__pyx_v_new_shape + 1);
+
+      /* "View.MemoryView":863
+ *             new_shape = (stop - start) // step
+ * 
+ *             if (stop - start) - step * new_shape:             # <<<<<<<<<<<<<<
+ *                 new_shape += 1
+ * 
+ */
+    }
+
+    /* "View.MemoryView":866
+ *                 new_shape += 1
+ * 
+ *         if new_shape < 0:             # <<<<<<<<<<<<<<
+ *             new_shape = 0
+ * 
+ */
+    __pyx_t_2 = (__pyx_v_new_shape < 0);
+    if (__pyx_t_2) {
+
+      /* "View.MemoryView":867
+ * 
+ *         if new_shape < 0:
+ *             new_shape = 0             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+      __pyx_v_new_shape = 0;
+
+      /* "View.MemoryView":866
+ *                 new_shape += 1
+ * 
+ *         if new_shape < 0:             # <<<<<<<<<<<<<<
+ *             new_shape = 0
+ * 
+ */
+    }
+
+    /* "View.MemoryView":870
+ * 
+ * 
+ *         dst.strides[new_ndim] = stride * step             # <<<<<<<<<<<<<<
+ *         dst.shape[new_ndim] = new_shape
+ *         dst.suboffsets[new_ndim] = suboffset
+ */
+    (__pyx_v_dst->strides[__pyx_v_new_ndim]) = (__pyx_v_stride * __pyx_v_step);
+
+    /* "View.MemoryView":871
+ * 
+ *         dst.strides[new_ndim] = stride * step
+ *         dst.shape[new_ndim] = new_shape             # <<<<<<<<<<<<<<
+ *         dst.suboffsets[new_ndim] = suboffset
+ * 
+ */
+    (__pyx_v_dst->shape[__pyx_v_new_ndim]) = __pyx_v_new_shape;
+
+    /* "View.MemoryView":872
+ *         dst.strides[new_ndim] = stride * step
+ *         dst.shape[new_ndim] = new_shape
+ *         dst.suboffsets[new_ndim] = suboffset             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+    (__pyx_v_dst->suboffsets[__pyx_v_new_ndim]) = __pyx_v_suboffset;
+  }
+  __pyx_L3:;
+
+  /* "View.MemoryView":875
+ * 
+ * 
+ *     if suboffset_dim[0] < 0:             # <<<<<<<<<<<<<<
+ *         dst.data += start * stride
+ *     else:
+ */
+  __pyx_t_2 = ((__pyx_v_suboffset_dim[0]) < 0);
+  if (__pyx_t_2) {
+
+    /* "View.MemoryView":876
+ * 
+ *     if suboffset_dim[0] < 0:
+ *         dst.data += start * stride             # <<<<<<<<<<<<<<
+ *     else:
+ *         dst.suboffsets[suboffset_dim[0]] += start * stride
+ */
+    __pyx_v_dst->data = (__pyx_v_dst->data + (__pyx_v_start * __pyx_v_stride));
+
+    /* "View.MemoryView":875
+ * 
+ * 
+ *     if suboffset_dim[0] < 0:             # <<<<<<<<<<<<<<
+ *         dst.data += start * stride
+ *     else:
+ */
+    goto __pyx_L19;
+  }
+
+  /* "View.MemoryView":878
+ *         dst.data += start * stride
+ *     else:
+ *         dst.suboffsets[suboffset_dim[0]] += start * stride             # <<<<<<<<<<<<<<
+ * 
+ *     if suboffset >= 0:
+ */
+  /*else*/ {
+    __pyx_t_3 = (__pyx_v_suboffset_dim[0]);
+    (__pyx_v_dst->suboffsets[__pyx_t_3]) = ((__pyx_v_dst->suboffsets[__pyx_t_3]) + (__pyx_v_start * __pyx_v_stride));
+  }
+  __pyx_L19:;
+
+  /* "View.MemoryView":880
+ *         dst.suboffsets[suboffset_dim[0]] += start * stride
+ * 
+ *     if suboffset >= 0:             # <<<<<<<<<<<<<<
+ *         if not is_slice:
+ *             if new_ndim == 0:
+ */
+  __pyx_t_2 = (__pyx_v_suboffset >= 0);
+  if (__pyx_t_2) {
+
+    /* "View.MemoryView":881
+ * 
+ *     if suboffset >= 0:
+ *         if not is_slice:             # <<<<<<<<<<<<<<
+ *             if new_ndim == 0:
+ *                 dst.data = (<char **> dst.data)[0] + suboffset
+ */
+    __pyx_t_2 = (!__pyx_v_is_slice);
+    if (__pyx_t_2) {
+
+      /* "View.MemoryView":882
+ *     if suboffset >= 0:
+ *         if not is_slice:
+ *             if new_ndim == 0:             # <<<<<<<<<<<<<<
+ *                 dst.data = (<char **> dst.data)[0] + suboffset
+ *             else:
+ */
+      __pyx_t_2 = (__pyx_v_new_ndim == 0);
+      if (__pyx_t_2) {
+
+        /* "View.MemoryView":883
+ *         if not is_slice:
+ *             if new_ndim == 0:
+ *                 dst.data = (<char **> dst.data)[0] + suboffset             # <<<<<<<<<<<<<<
+ *             else:
+ *                 _err_dim(PyExc_IndexError, "All dimensions preceding dimension %d "
+ */
+        __pyx_v_dst->data = ((((char **)__pyx_v_dst->data)[0]) + __pyx_v_suboffset);
+
+        /* "View.MemoryView":882
+ *     if suboffset >= 0:
+ *         if not is_slice:
+ *             if new_ndim == 0:             # <<<<<<<<<<<<<<
+ *                 dst.data = (<char **> dst.data)[0] + suboffset
+ *             else:
+ */
+        goto __pyx_L22;
+      }
+
+      /* "View.MemoryView":885
+ *                 dst.data = (<char **> dst.data)[0] + suboffset
+ *             else:
+ *                 _err_dim(PyExc_IndexError, "All dimensions preceding dimension %d "             # <<<<<<<<<<<<<<
+ *                                      "must be indexed and not sliced", dim)
+ *         else:
+ */
+      /*else*/ {
+
+        /* "View.MemoryView":886
+ *             else:
+ *                 _err_dim(PyExc_IndexError, "All dimensions preceding dimension %d "
+ *                                      "must be indexed and not sliced", dim)             # <<<<<<<<<<<<<<
+ *         else:
+ *             suboffset_dim[0] = new_ndim
+ */
+        __pyx_t_3 = __pyx_memoryview_err_dim(PyExc_IndexError, __pyx_kp_s_All_dimensions_preceding_dimensi, __pyx_v_dim); if (unlikely(__pyx_t_3 == ((int)-1))) __PYX_ERR(1, 885, __pyx_L1_error)
+      }
+      __pyx_L22:;
+
+      /* "View.MemoryView":881
+ * 
+ *     if suboffset >= 0:
+ *         if not is_slice:             # <<<<<<<<<<<<<<
+ *             if new_ndim == 0:
+ *                 dst.data = (<char **> dst.data)[0] + suboffset
+ */
+      goto __pyx_L21;
+    }
+
+    /* "View.MemoryView":888
+ *                                      "must be indexed and not sliced", dim)
+ *         else:
+ *             suboffset_dim[0] = new_ndim             # <<<<<<<<<<<<<<
+ * 
+ *     return 0
+ */
+    /*else*/ {
+      (__pyx_v_suboffset_dim[0]) = __pyx_v_new_ndim;
+    }
+    __pyx_L21:;
+
+    /* "View.MemoryView":880
+ *         dst.suboffsets[suboffset_dim[0]] += start * stride
+ * 
+ *     if suboffset >= 0:             # <<<<<<<<<<<<<<
+ *         if not is_slice:
+ *             if new_ndim == 0:
+ */
+  }
+
+  /* "View.MemoryView":890
+ *             suboffset_dim[0] = new_ndim
+ * 
+ *     return 0             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  __pyx_r = 0;
+  goto __pyx_L0;
+
+  /* "View.MemoryView":793
+ * 
+ * @cname('__pyx_memoryview_slice_memviewslice')
+ * cdef int slice_memviewslice(             # <<<<<<<<<<<<<<
+ *         __Pyx_memviewslice *dst,
+ *         Py_ssize_t shape, Py_ssize_t stride, Py_ssize_t suboffset,
+ */
+
+  /* function exit code */
+  __pyx_L1_error:;
+  #ifdef WITH_THREAD
+  __pyx_gilstate_save = __Pyx_PyGILState_Ensure();
+  #endif
+  __Pyx_AddTraceback("View.MemoryView.slice_memviewslice", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = -1;
+  #ifdef WITH_THREAD
+  __Pyx_PyGILState_Release(__pyx_gilstate_save);
+  #endif
+  __pyx_L0:;
+  return __pyx_r;
+}
+
+/* "View.MemoryView":896
+ * 
+ * @cname('__pyx_pybuffer_index')
+ * cdef char *pybuffer_index(Py_buffer *view, char *bufp, Py_ssize_t index,             # <<<<<<<<<<<<<<
+ *                           Py_ssize_t dim) except NULL:
+ *     cdef Py_ssize_t shape, stride, suboffset = -1
+ */
+
+/* Compute the address of item `index` along dimension `dim` of `view`,
+ * using `bufp` as the base pointer for that dimension.
+ *
+ * Parameters:
+ *   view  - buffer whose itemsize/ndim/len/shape/strides/suboffsets are read.
+ *   bufp  - base address the byte offset `index * stride` is added to.
+ *   index - item index; a negative value is wrapped once by adding
+ *           view->shape[dim].
+ *   dim   - dimension being indexed.
+ *
+ * Returns the resulting pointer, or NULL with a Python exception set:
+ *   IndexError        - index is out of bounds (before or after wrapping);
+ *   ZeroDivisionError - view->ndim == 0 and view->itemsize == 0;
+ *   OverflowError     - view->ndim == 0 and len / itemsize would overflow.
+ */
+static char *__pyx_pybuffer_index(Py_buffer *__pyx_v_view, char *__pyx_v_bufp, Py_ssize_t __pyx_v_index, Py_ssize_t __pyx_v_dim) {
+  Py_ssize_t __pyx_v_shape;
+  Py_ssize_t __pyx_v_stride;
+  Py_ssize_t __pyx_v_suboffset;
+  Py_ssize_t __pyx_v_itemsize;
+  char *__pyx_v_resultp;
+  char *__pyx_r;
+  __Pyx_RefNannyDeclarations
+  Py_ssize_t __pyx_t_1;
+  int __pyx_t_2;
+  PyObject *__pyx_t_3 = NULL;
+  Py_UCS4 __pyx_t_4;
+  PyObject *__pyx_t_5 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("pybuffer_index", 1);
+
+  /* "View.MemoryView":898
+ * cdef char *pybuffer_index(Py_buffer *view, char *bufp, Py_ssize_t index,
+ *                           Py_ssize_t dim) except NULL:
+ *     cdef Py_ssize_t shape, stride, suboffset = -1             # <<<<<<<<<<<<<<
+ *     cdef Py_ssize_t itemsize = view.itemsize
+ *     cdef char *resultp
+ */
+  /* A negative suboffset means "no pointer indirection"; see the check before the return. */
+  __pyx_v_suboffset = -1L;
+
+  /* "View.MemoryView":899
+ *                           Py_ssize_t dim) except NULL:
+ *     cdef Py_ssize_t shape, stride, suboffset = -1
+ *     cdef Py_ssize_t itemsize = view.itemsize             # <<<<<<<<<<<<<<
+ *     cdef char *resultp
+ * 
+ */
+  __pyx_t_1 = __pyx_v_view->itemsize;
+  __pyx_v_itemsize = __pyx_t_1;
+
+  /* "View.MemoryView":902
+ *     cdef char *resultp
+ * 
+ *     if view.ndim == 0:             # <<<<<<<<<<<<<<
+ *         shape = view.len // itemsize
+ *         stride = itemsize
+ */
+  __pyx_t_2 = (__pyx_v_view->ndim == 0);
+  if (__pyx_t_2) {
+
+    /* "View.MemoryView":903
+ * 
+ *     if view.ndim == 0:
+ *         shape = view.len // itemsize             # <<<<<<<<<<<<<<
+ *         stride = itemsize
+ *     else:
+ */
+    /* Guards for the Python-semantics floor division below: divisor 0, and the -1 divisor overflow case. */
+    if (unlikely(__pyx_v_itemsize == 0)) {
+      PyErr_SetString(PyExc_ZeroDivisionError, "integer division or modulo by zero");
+      __PYX_ERR(1, 903, __pyx_L1_error)
+    }
+    else if (sizeof(Py_ssize_t) == sizeof(long) && (!(((Py_ssize_t)-1) > 0)) && unlikely(__pyx_v_itemsize == (Py_ssize_t)-1)  && unlikely(__Pyx_UNARY_NEG_WOULD_OVERFLOW(__pyx_v_view->len))) {
+      PyErr_SetString(PyExc_OverflowError, "value too large to perform division");
+      __PYX_ERR(1, 903, __pyx_L1_error)
+    }
+    __pyx_v_shape = __Pyx_div_Py_ssize_t(__pyx_v_view->len, __pyx_v_itemsize);
+
+    /* "View.MemoryView":904
+ *     if view.ndim == 0:
+ *         shape = view.len // itemsize
+ *         stride = itemsize             # <<<<<<<<<<<<<<
+ *     else:
+ *         shape = view.shape[dim]
+ */
+    __pyx_v_stride = __pyx_v_itemsize;
+
+    /* "View.MemoryView":902
+ *     cdef char *resultp
+ * 
+ *     if view.ndim == 0:             # <<<<<<<<<<<<<<
+ *         shape = view.len // itemsize
+ *         stride = itemsize
+ */
+    goto __pyx_L3;
+  }
+
+  /* "View.MemoryView":906
+ *         stride = itemsize
+ *     else:
+ *         shape = view.shape[dim]             # <<<<<<<<<<<<<<
+ *         stride = view.strides[dim]
+ *         if view.suboffsets != NULL:
+ */
+  /*else*/ {
+    __pyx_v_shape = (__pyx_v_view->shape[__pyx_v_dim]);
+
+    /* "View.MemoryView":907
+ *     else:
+ *         shape = view.shape[dim]
+ *         stride = view.strides[dim]             # <<<<<<<<<<<<<<
+ *         if view.suboffsets != NULL:
+ *             suboffset = view.suboffsets[dim]
+ */
+    __pyx_v_stride = (__pyx_v_view->strides[__pyx_v_dim]);
+
+    /* "View.MemoryView":908
+ *         shape = view.shape[dim]
+ *         stride = view.strides[dim]
+ *         if view.suboffsets != NULL:             # <<<<<<<<<<<<<<
+ *             suboffset = view.suboffsets[dim]
+ * 
+ */
+    __pyx_t_2 = (__pyx_v_view->suboffsets != NULL);
+    if (__pyx_t_2) {
+
+      /* "View.MemoryView":909
+ *         stride = view.strides[dim]
+ *         if view.suboffsets != NULL:
+ *             suboffset = view.suboffsets[dim]             # <<<<<<<<<<<<<<
+ * 
+ *     if index < 0:
+ */
+      __pyx_v_suboffset = (__pyx_v_view->suboffsets[__pyx_v_dim]);
+
+      /* "View.MemoryView":908
+ *         shape = view.shape[dim]
+ *         stride = view.strides[dim]
+ *         if view.suboffsets != NULL:             # <<<<<<<<<<<<<<
+ *             suboffset = view.suboffsets[dim]
+ * 
+ */
+    }
+  }
+  __pyx_L3:;
+
+  /* "View.MemoryView":911
+ *             suboffset = view.suboffsets[dim]
+ * 
+ *     if index < 0:             # <<<<<<<<<<<<<<
+ *         index += view.shape[dim]
+ *         if index < 0:
+ */
+  __pyx_t_2 = (__pyx_v_index < 0);
+  if (__pyx_t_2) {
+
+    /* "View.MemoryView":912
+ * 
+ *     if index < 0:
+ *         index += view.shape[dim]             # <<<<<<<<<<<<<<
+ *         if index < 0:
+ *             raise IndexError, f"Out of bounds on buffer access (axis {dim})"
+ */
+    /* NOTE(review): this reads view->shape[dim] directly instead of the local
+     * shape computed above, so it is also evaluated on the ndim == 0 path.
+     * Whether view->shape is valid there depends on the exporter - confirm. */
+    __pyx_v_index = (__pyx_v_index + (__pyx_v_view->shape[__pyx_v_dim]));
+
+    /* "View.MemoryView":913
+ *     if index < 0:
+ *         index += view.shape[dim]
+ *         if index < 0:             # <<<<<<<<<<<<<<
+ *             raise IndexError, f"Out of bounds on buffer access (axis {dim})"
+ * 
+ */
+    __pyx_t_2 = (__pyx_v_index < 0);
+    if (unlikely(__pyx_t_2)) {
+
+      /* "View.MemoryView":914
+ *         index += view.shape[dim]
+ *         if index < 0:
+ *             raise IndexError, f"Out of bounds on buffer access (axis {dim})"             # <<<<<<<<<<<<<<
+ * 
+ *     if index >= shape:
+ */
+      /* Build the f-string from three pieces held in a tuple: constant prefix,
+       * formatted dim, constant suffix. __pyx_t_1 accumulates the length
+       * (37 + len(dim text) + 1) and __pyx_t_4 (127) is passed to the join
+       * helper as its last argument, presumably the maximum code point. */
+      __pyx_t_3 = PyTuple_New(3); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 914, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_3);
+      __pyx_t_1 = 0;
+      __pyx_t_4 = 127;
+      __Pyx_INCREF(__pyx_kp_u_Out_of_bounds_on_buffer_access_a);
+      __pyx_t_1 += 37;
+      __Pyx_GIVEREF(__pyx_kp_u_Out_of_bounds_on_buffer_access_a);
+      PyTuple_SET_ITEM(__pyx_t_3, 0, __pyx_kp_u_Out_of_bounds_on_buffer_access_a);
+      __pyx_t_5 = __Pyx_PyUnicode_From_Py_ssize_t(__pyx_v_dim, 0, ' ', 'd'); if (unlikely(!__pyx_t_5)) __PYX_ERR(1, 914, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_5);
+      __pyx_t_1 += __Pyx_PyUnicode_GET_LENGTH(__pyx_t_5);
+      __Pyx_GIVEREF(__pyx_t_5);
+      PyTuple_SET_ITEM(__pyx_t_3, 1, __pyx_t_5);
+      __pyx_t_5 = 0;
+      __Pyx_INCREF(__pyx_kp_u__7);
+      __pyx_t_1 += 1;
+      __Pyx_GIVEREF(__pyx_kp_u__7);
+      PyTuple_SET_ITEM(__pyx_t_3, 2, __pyx_kp_u__7);
+      __pyx_t_5 = __Pyx_PyUnicode_Join(__pyx_t_3, 3, __pyx_t_1, __pyx_t_4); if (unlikely(!__pyx_t_5)) __PYX_ERR(1, 914, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_5);
+      __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+      __Pyx_Raise(__pyx_builtin_IndexError, __pyx_t_5, 0, 0);
+      __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
+      __PYX_ERR(1, 914, __pyx_L1_error)
+
+      /* "View.MemoryView":913
+ *     if index < 0:
+ *         index += view.shape[dim]
+ *         if index < 0:             # <<<<<<<<<<<<<<
+ *             raise IndexError, f"Out of bounds on buffer access (axis {dim})"
+ * 
+ */
+    }
+
+    /* "View.MemoryView":911
+ *             suboffset = view.suboffsets[dim]
+ * 
+ *     if index < 0:             # <<<<<<<<<<<<<<
+ *         index += view.shape[dim]
+ *         if index < 0:
+ */
+  }
+
+  /* "View.MemoryView":916
+ *             raise IndexError, f"Out of bounds on buffer access (axis {dim})"
+ * 
+ *     if index >= shape:             # <<<<<<<<<<<<<<
+ *         raise IndexError, f"Out of bounds on buffer access (axis {dim})"
+ * 
+ */
+  __pyx_t_2 = (__pyx_v_index >= __pyx_v_shape);
+  if (unlikely(__pyx_t_2)) {
+
+    /* "View.MemoryView":917
+ * 
+ *     if index >= shape:
+ *         raise IndexError, f"Out of bounds on buffer access (axis {dim})"             # <<<<<<<<<<<<<<
+ * 
+ *     resultp = bufp + index * stride
+ */
+    /* Same message construction as the negative-index case above, with the
+     * roles of the temporaries __pyx_t_3 and __pyx_t_5 exchanged. */
+    __pyx_t_5 = PyTuple_New(3); if (unlikely(!__pyx_t_5)) __PYX_ERR(1, 917, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_5);
+    __pyx_t_1 = 0;
+    __pyx_t_4 = 127;
+    __Pyx_INCREF(__pyx_kp_u_Out_of_bounds_on_buffer_access_a);
+    __pyx_t_1 += 37;
+    __Pyx_GIVEREF(__pyx_kp_u_Out_of_bounds_on_buffer_access_a);
+    PyTuple_SET_ITEM(__pyx_t_5, 0, __pyx_kp_u_Out_of_bounds_on_buffer_access_a);
+    __pyx_t_3 = __Pyx_PyUnicode_From_Py_ssize_t(__pyx_v_dim, 0, ' ', 'd'); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 917, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_3);
+    __pyx_t_1 += __Pyx_PyUnicode_GET_LENGTH(__pyx_t_3);
+    __Pyx_GIVEREF(__pyx_t_3);
+    PyTuple_SET_ITEM(__pyx_t_5, 1, __pyx_t_3);
+    __pyx_t_3 = 0;
+    __Pyx_INCREF(__pyx_kp_u__7);
+    __pyx_t_1 += 1;
+    __Pyx_GIVEREF(__pyx_kp_u__7);
+    PyTuple_SET_ITEM(__pyx_t_5, 2, __pyx_kp_u__7);
+    __pyx_t_3 = __Pyx_PyUnicode_Join(__pyx_t_5, 3, __pyx_t_1, __pyx_t_4); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 917, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_3);
+    __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
+    __Pyx_Raise(__pyx_builtin_IndexError, __pyx_t_3, 0, 0);
+    __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+    __PYX_ERR(1, 917, __pyx_L1_error)
+
+    /* "View.MemoryView":916
+ *             raise IndexError, f"Out of bounds on buffer access (axis {dim})"
+ * 
+ *     if index >= shape:             # <<<<<<<<<<<<<<
+ *         raise IndexError, f"Out of bounds on buffer access (axis {dim})"
+ * 
+ */
+  }
+
+  /* "View.MemoryView":919
+ *         raise IndexError, f"Out of bounds on buffer access (axis {dim})"
+ * 
+ *     resultp = bufp + index * stride             # <<<<<<<<<<<<<<
+ *     if suboffset >= 0:
+ *         resultp = (<char **> resultp)[0] + suboffset
+ */
+  __pyx_v_resultp = (__pyx_v_bufp + (__pyx_v_index * __pyx_v_stride));
+
+  /* "View.MemoryView":920
+ * 
+ *     resultp = bufp + index * stride
+ *     if suboffset >= 0:             # <<<<<<<<<<<<<<
+ *         resultp = (<char **> resultp)[0] + suboffset
+ * 
+ */
+  __pyx_t_2 = (__pyx_v_suboffset >= 0);
+  if (__pyx_t_2) {
+
+    /* "View.MemoryView":921
+ *     resultp = bufp + index * stride
+ *     if suboffset >= 0:
+ *         resultp = (<char **> resultp)[0] + suboffset             # <<<<<<<<<<<<<<
+ * 
+ *     return resultp
+ */
+    /* Indirect dimension: the slot holds a pointer; follow it, then add suboffset bytes. */
+    __pyx_v_resultp = ((((char **)__pyx_v_resultp)[0]) + __pyx_v_suboffset);
+
+    /* "View.MemoryView":920
+ * 
+ *     resultp = bufp + index * stride
+ *     if suboffset >= 0:             # <<<<<<<<<<<<<<
+ *         resultp = (<char **> resultp)[0] + suboffset
+ * 
+ */
+  }
+
+  /* "View.MemoryView":923
+ *         resultp = (<char **> resultp)[0] + suboffset
+ * 
+ *     return resultp             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  __pyx_r = __pyx_v_resultp;
+  goto __pyx_L0;
+
+  /* "View.MemoryView":896
+ * 
+ * @cname('__pyx_pybuffer_index')
+ * cdef char *pybuffer_index(Py_buffer *view, char *bufp, Py_ssize_t index,             # <<<<<<<<<<<<<<
+ *                           Py_ssize_t dim) except NULL:
+ *     cdef Py_ssize_t shape, stride, suboffset = -1
+ */
+
+  /* function exit code */
+  /* Error exit: release any temporaries still held, record a traceback entry, return NULL. */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_3);
+  __Pyx_XDECREF(__pyx_t_5);
+  __Pyx_AddTraceback("View.MemoryView.pybuffer_index", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __pyx_L0:;
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":929
+ * 
+ * @cname('__pyx_memslice_transpose')
+ * cdef int transpose_memslice(__Pyx_memviewslice *memslice) except -1 nogil:             # <<<<<<<<<<<<<<
+ *     cdef int ndim = memslice.memview.view.ndim
+ * 
+ */
+
+/* Reverse, in place, the order of the first `ndim` entries of
+ * memslice->shape and memslice->strides, where ndim is read from
+ * memslice->memview->view.ndim.
+ *
+ * Returns 0 on success. Returns -1 when __pyx_memoryview_err reports failure
+ * for a swapped pair (i, j) in which either dimension has a suboffset >= 0;
+ * it is called with PyExc_ValueError.
+ *
+ * Note: the suboffset check runs after the pair has been swapped, so on the
+ * error path shape/strides have already been modified for every pair up to
+ * and including the offending one.
+ */
+static int __pyx_memslice_transpose(__Pyx_memviewslice *__pyx_v_memslice) {
+  int __pyx_v_ndim;
+  Py_ssize_t *__pyx_v_shape;
+  Py_ssize_t *__pyx_v_strides;
+  int __pyx_v_i;
+  int __pyx_v_j;
+  int __pyx_r;
+  int __pyx_t_1;
+  Py_ssize_t *__pyx_t_2;
+  long __pyx_t_3;
+  long __pyx_t_4;
+  Py_ssize_t __pyx_t_5;
+  Py_ssize_t __pyx_t_6;
+  int __pyx_t_7;
+  int __pyx_t_8;
+  int __pyx_t_9;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  #ifdef WITH_THREAD
+  PyGILState_STATE __pyx_gilstate_save;
+  #endif
+
+  /* "View.MemoryView":930
+ * @cname('__pyx_memslice_transpose')
+ * cdef int transpose_memslice(__Pyx_memviewslice *memslice) except -1 nogil:
+ *     cdef int ndim = memslice.memview.view.ndim             # <<<<<<<<<<<<<<
+ * 
+ *     cdef Py_ssize_t *shape = memslice.shape
+ */
+  __pyx_t_1 = __pyx_v_memslice->memview->view.ndim;
+  __pyx_v_ndim = __pyx_t_1;
+
+  /* "View.MemoryView":932
+ *     cdef int ndim = memslice.memview.view.ndim
+ * 
+ *     cdef Py_ssize_t *shape = memslice.shape             # <<<<<<<<<<<<<<
+ *     cdef Py_ssize_t *strides = memslice.strides
+ * 
+ */
+  __pyx_t_2 = __pyx_v_memslice->shape;
+  __pyx_v_shape = __pyx_t_2;
+
+  /* "View.MemoryView":933
+ * 
+ *     cdef Py_ssize_t *shape = memslice.shape
+ *     cdef Py_ssize_t *strides = memslice.strides             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  __pyx_t_2 = __pyx_v_memslice->strides;
+  __pyx_v_strides = __pyx_t_2;
+
+  /* "View.MemoryView":937
+ * 
+ *     cdef int i, j
+ *     for i in range(ndim // 2):             # <<<<<<<<<<<<<<
+ *         j = ndim - 1 - i
+ *         strides[i], strides[j] = strides[j], strides[i]
+ */
+  /* Walk the first half of the dimensions; each i is paired with its mirror j.
+   * The loop bound is computed once, before the loop starts. */
+  __pyx_t_3 = __Pyx_div_long(__pyx_v_ndim, 2);
+  __pyx_t_4 = __pyx_t_3;
+  for (__pyx_t_1 = 0; __pyx_t_1 < __pyx_t_4; __pyx_t_1+=1) {
+    __pyx_v_i = __pyx_t_1;
+
+    /* "View.MemoryView":938
+ *     cdef int i, j
+ *     for i in range(ndim // 2):
+ *         j = ndim - 1 - i             # <<<<<<<<<<<<<<
+ *         strides[i], strides[j] = strides[j], strides[i]
+ *         shape[i], shape[j] = shape[j], shape[i]
+ */
+    __pyx_v_j = ((__pyx_v_ndim - 1) - __pyx_v_i);
+
+    /* "View.MemoryView":939
+ *     for i in range(ndim // 2):
+ *         j = ndim - 1 - i
+ *         strides[i], strides[j] = strides[j], strides[i]             # <<<<<<<<<<<<<<
+ *         shape[i], shape[j] = shape[j], shape[i]
+ * 
+ */
+    /* Tuple swap: both values are loaded into temporaries before either slot is written. */
+    __pyx_t_5 = (__pyx_v_strides[__pyx_v_j]);
+    __pyx_t_6 = (__pyx_v_strides[__pyx_v_i]);
+    (__pyx_v_strides[__pyx_v_i]) = __pyx_t_5;
+    (__pyx_v_strides[__pyx_v_j]) = __pyx_t_6;
+
+    /* "View.MemoryView":940
+ *         j = ndim - 1 - i
+ *         strides[i], strides[j] = strides[j], strides[i]
+ *         shape[i], shape[j] = shape[j], shape[i]             # <<<<<<<<<<<<<<
+ * 
+ *         if memslice.suboffsets[i] >= 0 or memslice.suboffsets[j] >= 0:
+ */
+    __pyx_t_6 = (__pyx_v_shape[__pyx_v_j]);
+    __pyx_t_5 = (__pyx_v_shape[__pyx_v_i]);
+    (__pyx_v_shape[__pyx_v_i]) = __pyx_t_6;
+    (__pyx_v_shape[__pyx_v_j]) = __pyx_t_5;
+
+    /* "View.MemoryView":942
+ *         shape[i], shape[j] = shape[j], shape[i]
+ * 
+ *         if memslice.suboffsets[i] >= 0 or memslice.suboffsets[j] >= 0:             # <<<<<<<<<<<<<<
+ *             _err(PyExc_ValueError, "Cannot transpose memoryview with indirect dimensions")
+ * 
+ */
+    /* Short-circuit `or`: suboffsets[j] is only examined when suboffsets[i] is negative. */
+    __pyx_t_8 = ((__pyx_v_memslice->suboffsets[__pyx_v_i]) >= 0);
+    if (!__pyx_t_8) {
+    } else {
+      __pyx_t_7 = __pyx_t_8;
+      goto __pyx_L6_bool_binop_done;
+    }
+    __pyx_t_8 = ((__pyx_v_memslice->suboffsets[__pyx_v_j]) >= 0);
+    __pyx_t_7 = __pyx_t_8;
+    __pyx_L6_bool_binop_done:;
+    if (__pyx_t_7) {
+
+      /* "View.MemoryView":943
+ * 
+ *         if memslice.suboffsets[i] >= 0 or memslice.suboffsets[j] >= 0:
+ *             _err(PyExc_ValueError, "Cannot transpose memoryview with indirect dimensions")             # <<<<<<<<<<<<<<
+ * 
+ *     return 0
+ */
+      /* A -1 result from the error helper sends control to the error exit below. */
+      __pyx_t_9 = __pyx_memoryview_err(PyExc_ValueError, __pyx_kp_s_Cannot_transpose_memoryview_with); if (unlikely(__pyx_t_9 == ((int)-1))) __PYX_ERR(1, 943, __pyx_L1_error)
+
+      /* "View.MemoryView":942
+ *         shape[i], shape[j] = shape[j], shape[i]
+ * 
+ *         if memslice.suboffsets[i] >= 0 or memslice.suboffsets[j] >= 0:             # <<<<<<<<<<<<<<
+ *             _err(PyExc_ValueError, "Cannot transpose memoryview with indirect dimensions")
+ * 
+ */
+    }
+  }
+
+  /* "View.MemoryView":945
+ *             _err(PyExc_ValueError, "Cannot transpose memoryview with indirect dimensions")
+ * 
+ *     return 0             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  __pyx_r = 0;
+  goto __pyx_L0;
+
+  /* "View.MemoryView":929
+ * 
+ * @cname('__pyx_memslice_transpose')
+ * cdef int transpose_memslice(__Pyx_memviewslice *memslice) except -1 nogil:             # <<<<<<<<<<<<<<
+ *     cdef int ndim = memslice.memview.view.ndim
+ * 
+ */
+
+  /* function exit code */
+  /* Error exit: the source function is declared nogil, so under WITH_THREAD
+   * the GIL is acquired around the traceback call and released afterwards. */
+  __pyx_L1_error:;
+  #ifdef WITH_THREAD
+  __pyx_gilstate_save = __Pyx_PyGILState_Ensure();
+  #endif
+  __Pyx_AddTraceback("View.MemoryView.transpose_memslice", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = -1;
+  #ifdef WITH_THREAD
+  __Pyx_PyGILState_Release(__pyx_gilstate_save);
+  #endif
+  __pyx_L0:;
+  return __pyx_r;
+}
+
+/* "View.MemoryView":963
+ *     cdef int (*to_dtype_func)(char *, object) except 0
+ * 
+ *     def __dealloc__(self):             # <<<<<<<<<<<<<<
+ *         __PYX_XCLEAR_MEMVIEW(&self.from_slice, 1)
+ * 
+ * Wrapper for _memoryviewslice.__dealloc__. It casts __pyx_v_self to
+ * struct __pyx_memoryviewslice_obj * and forwards to the __pyx_pf_
+ * implementation function; it returns nothing and has no error path.
+ * __pyx_kwvalues is assigned but never read afterwards (it is declared
+ * CYTHON_UNUSED).
+ * NOTE(review): __pyx_args and __pyx_nargs are not declared in this
+ * function. Presumably __Pyx_KwValues_VARARGS is a macro that discards
+ * its arguments; confirm against the macro definition.
+ */
+
+/* Python wrapper */
+static void __pyx_memoryviewslice___dealloc__(PyObject *__pyx_v_self); /*proto*/
+static void __pyx_memoryviewslice___dealloc__(PyObject *__pyx_v_self) {
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__dealloc__ (wrapper)", 0);
+  __pyx_kwvalues = __Pyx_KwValues_VARARGS(__pyx_args, __pyx_nargs);
+  __pyx_memoryviewslice___pyx_pf_15View_dot_MemoryView_16_memoryviewslice___dealloc__(((struct __pyx_memoryviewslice_obj *)__pyx_v_self));
+
+  /* function exit code */
+  __Pyx_RefNannyFinishContext();
+}
+
+static void __pyx_memoryviewslice___pyx_pf_15View_dot_MemoryView_16_memoryviewslice___dealloc__(struct __pyx_memoryviewslice_obj *__pyx_v_self) {
+
+  /* "View.MemoryView":964
+ * 
+ *     def __dealloc__(self):
+ *         __PYX_XCLEAR_MEMVIEW(&self.from_slice, 1)             # <<<<<<<<<<<<<<
+ * 
+ *     cdef convert_item_to_object(self, char *itemp):
+ *
+ * Implementation of _memoryviewslice.__dealloc__: the only statement is
+ * the __PYX_XCLEAR_MEMVIEW call on the address of self->from_slice.
+ * NOTE(review): the second argument (1) is presumably a "GIL is held"
+ * flag; confirm against the __PYX_XCLEAR_MEMVIEW definition.
+ */
+  __PYX_XCLEAR_MEMVIEW((&__pyx_v_self->from_slice), 1);
+
+  /* "View.MemoryView":963
+ *     cdef int (*to_dtype_func)(char *, object) except 0
+ * 
+ *     def __dealloc__(self):             # <<<<<<<<<<<<<<
+ *         __PYX_XCLEAR_MEMVIEW(&self.from_slice, 1)
+ * 
+ */
+
+  /* function exit code: nothing to release, no return value */
+}
+
+/* "View.MemoryView":966
+ *         __PYX_XCLEAR_MEMVIEW(&self.from_slice, 1)
+ * 
+ *     cdef convert_item_to_object(self, char *itemp):             # <<<<<<<<<<<<<<
+ *         if self.to_object_func != NULL:
+ *             return self.to_object_func(itemp)
+ *
+ * Converts the item pointed to by itemp into a Python object.
+ *
+ * When self->to_object_func is non-NULL that function pointer is called
+ * with itemp; otherwise the call is delegated to
+ * __pyx_memoryview_convert_item_to_object with self cast to the base
+ * struct __pyx_memoryview_obj *.
+ *
+ * Returns the object produced by the selected converter (the temporary
+ * __pyx_t_2 is moved into __pyx_r and zeroed, so the reference is handed
+ * to the caller). If the converter returns NULL, a traceback entry is
+ * added and 0 is returned.
+ */
+
+static PyObject *__pyx_memoryviewslice_convert_item_to_object(struct __pyx_memoryviewslice_obj *__pyx_v_self, char *__pyx_v_itemp) {
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  int __pyx_t_1;
+  PyObject *__pyx_t_2 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("convert_item_to_object", 1);
+
+  /* "View.MemoryView":967
+ * 
+ *     cdef convert_item_to_object(self, char *itemp):
+ *         if self.to_object_func != NULL:             # <<<<<<<<<<<<<<
+ *             return self.to_object_func(itemp)
+ *         else:
+ */
+  __pyx_t_1 = (__pyx_v_self->to_object_func != NULL);
+  if (__pyx_t_1) {
+
+    /* "View.MemoryView":968
+ *     cdef convert_item_to_object(self, char *itemp):
+ *         if self.to_object_func != NULL:
+ *             return self.to_object_func(itemp)             # <<<<<<<<<<<<<<
+ *         else:
+ *             return memoryview.convert_item_to_object(self, itemp)
+ */
+    __Pyx_XDECREF(__pyx_r);
+    __pyx_t_2 = __pyx_v_self->to_object_func(__pyx_v_itemp); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 968, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_2);
+    __pyx_r = __pyx_t_2;
+    __pyx_t_2 = 0;
+    goto __pyx_L0;
+
+    /* "View.MemoryView":967
+ * 
+ *     cdef convert_item_to_object(self, char *itemp):
+ *         if self.to_object_func != NULL:             # <<<<<<<<<<<<<<
+ *             return self.to_object_func(itemp)
+ *         else:
+ */
+  }
+
+  /* "View.MemoryView":970
+ *             return self.to_object_func(itemp)
+ *         else:
+ *             return memoryview.convert_item_to_object(self, itemp)             # <<<<<<<<<<<<<<
+ * 
+ *     cdef assign_item_from_object(self, char *itemp, object value):
+ */
+  /*else*/ {
+    __Pyx_XDECREF(__pyx_r);
+    __pyx_t_2 = __pyx_memoryview_convert_item_to_object(((struct __pyx_memoryview_obj *)__pyx_v_self), __pyx_v_itemp); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 970, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_2);
+    __pyx_r = __pyx_t_2;
+    __pyx_t_2 = 0;
+    goto __pyx_L0;
+  }
+
+  /* "View.MemoryView":966
+ *         __PYX_XCLEAR_MEMVIEW(&self.from_slice, 1)
+ * 
+ *     cdef convert_item_to_object(self, char *itemp):             # <<<<<<<<<<<<<<
+ *         if self.to_object_func != NULL:
+ *             return self.to_object_func(itemp)
+ */
+
+  /* function exit code: the error label drops the temporary and returns 0;
+   * both branches above jump straight to __pyx_L0 on success */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_2);
+  __Pyx_AddTraceback("View.MemoryView._memoryviewslice.convert_item_to_object", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = 0;
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":972
+ *             return memoryview.convert_item_to_object(self, itemp)
+ * 
+ *     cdef assign_item_from_object(self, char *itemp, object value):             # <<<<<<<<<<<<<<
+ *         if self.to_dtype_func != NULL:
+ *             self.to_dtype_func(itemp, value)
+ *
+ * Writes the Python object value into the item pointed to by itemp.
+ *
+ * When self->to_dtype_func is non-NULL that function pointer is called
+ * with (itemp, value) and a return value of 0 is treated as an error.
+ * Otherwise the call is delegated to
+ * __pyx_memoryview_assign_item_from_object with self cast to the base
+ * struct __pyx_memoryview_obj *; its non-NULL result is released
+ * immediately.
+ *
+ * Returns Py_None (after __Pyx_INCREF) on success. On error a traceback
+ * entry is added and 0 is returned.
+ */
+
+static PyObject *__pyx_memoryviewslice_assign_item_from_object(struct __pyx_memoryviewslice_obj *__pyx_v_self, char *__pyx_v_itemp, PyObject *__pyx_v_value) {
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  int __pyx_t_1;
+  int __pyx_t_2;
+  PyObject *__pyx_t_3 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("assign_item_from_object", 1);
+
+  /* "View.MemoryView":973
+ * 
+ *     cdef assign_item_from_object(self, char *itemp, object value):
+ *         if self.to_dtype_func != NULL:             # <<<<<<<<<<<<<<
+ *             self.to_dtype_func(itemp, value)
+ *         else:
+ */
+  __pyx_t_1 = (__pyx_v_self->to_dtype_func != NULL);
+  if (__pyx_t_1) {
+
+    /* "View.MemoryView":974
+ *     cdef assign_item_from_object(self, char *itemp, object value):
+ *         if self.to_dtype_func != NULL:
+ *             self.to_dtype_func(itemp, value)             # <<<<<<<<<<<<<<
+ *         else:
+ *             memoryview.assign_item_from_object(self, itemp, value)
+ */
+    __pyx_t_2 = __pyx_v_self->to_dtype_func(__pyx_v_itemp, __pyx_v_value); if (unlikely(__pyx_t_2 == ((int)0))) __PYX_ERR(1, 974, __pyx_L1_error)
+
+    /* "View.MemoryView":973
+ * 
+ *     cdef assign_item_from_object(self, char *itemp, object value):
+ *         if self.to_dtype_func != NULL:             # <<<<<<<<<<<<<<
+ *             self.to_dtype_func(itemp, value)
+ *         else:
+ */
+    goto __pyx_L3;
+  }
+
+  /* "View.MemoryView":976
+ *             self.to_dtype_func(itemp, value)
+ *         else:
+ *             memoryview.assign_item_from_object(self, itemp, value)             # <<<<<<<<<<<<<<
+ * 
+ *     cdef _get_base(self):
+ */
+  /*else*/ {
+    __pyx_t_3 = __pyx_memoryview_assign_item_from_object(((struct __pyx_memoryview_obj *)__pyx_v_self), __pyx_v_itemp, __pyx_v_value); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 976, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_3);
+    __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+  }
+  __pyx_L3:;
+
+  /* "View.MemoryView":972
+ *             return memoryview.convert_item_to_object(self, itemp)
+ * 
+ *     cdef assign_item_from_object(self, char *itemp, object value):             # <<<<<<<<<<<<<<
+ *         if self.to_dtype_func != NULL:
+ *             self.to_dtype_func(itemp, value)
+ */
+
+  /* function exit code: success falls through to return Py_None; the
+   * error label releases the temporary and returns 0 */
+  __pyx_r = Py_None; __Pyx_INCREF(Py_None);
+  goto __pyx_L0;
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_3);
+  __Pyx_AddTraceback("View.MemoryView._memoryviewslice.assign_item_from_object", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = 0;
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":978
+ *             memoryview.assign_item_from_object(self, itemp, value)
+ * 
+ *     cdef _get_base(self):             # <<<<<<<<<<<<<<
+ *         return self.from_object
+ * 
+ *
+ * Returns self->from_object after incrementing its reference count.
+ * There is no error path in this function.
+ */
+
+static PyObject *__pyx_memoryviewslice__get_base(struct __pyx_memoryviewslice_obj *__pyx_v_self) {
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("_get_base", 1);
+
+  /* "View.MemoryView":979
+ * 
+ *     cdef _get_base(self):
+ *         return self.from_object             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  __Pyx_XDECREF(__pyx_r);
+  __Pyx_INCREF(__pyx_v_self->from_object);
+  __pyx_r = __pyx_v_self->from_object;
+  goto __pyx_L0;
+
+  /* "View.MemoryView":978
+ *             memoryview.assign_item_from_object(self, itemp, value)
+ * 
+ *     cdef _get_base(self):             # <<<<<<<<<<<<<<
+ *         return self.from_object
+ * 
+ */
+
+  /* function exit code */
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "(tree fragment)":1
+ * def __reduce_cython__(self):             # <<<<<<<<<<<<<<
+ *     raise TypeError, "no default __reduce__ due to non-trivial __cinit__"
+ * def __setstate_cython__(self, __pyx_state):
+ *
+ * Argument-checking wrapper for _memoryviewslice.__reduce_cython__.
+ *
+ * The parameter list depends on CYTHON_METH_FASTCALL: with it, positional
+ * arguments arrive as an array plus a count; without it, they arrive as a
+ * tuple and the count is read from that tuple.
+ *
+ * The method accepts no arguments. If any positional argument is passed,
+ * __Pyx_RaiseArgtupleInvalid is called and NULL is returned. If keyword
+ * arguments are present and __Pyx_CheckKeywordStrings fails, NULL is
+ * returned. Otherwise the result of the __pyx_pf_ implementation is
+ * returned unchanged.
+ * NOTE(review): the early "return NULL" statements leave before
+ * __Pyx_RefNannyFinishContext() is reached; presumably harmless when
+ * RefNanny is compiled out. Confirm.
+ */
+
+/* Python wrapper */
+static PyObject *__pyx_pw___pyx_memoryviewslice_1__reduce_cython__(PyObject *__pyx_v_self, 
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+); /*proto*/
+static PyObject *__pyx_pw___pyx_memoryviewslice_1__reduce_cython__(PyObject *__pyx_v_self, 
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+) {
+  #if !CYTHON_METH_FASTCALL
+  CYTHON_UNUSED Py_ssize_t __pyx_nargs;
+  #endif
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__reduce_cython__ (wrapper)", 0);
+  #if !CYTHON_METH_FASTCALL
+  #if CYTHON_ASSUME_SAFE_MACROS
+  __pyx_nargs = PyTuple_GET_SIZE(__pyx_args);
+  #else
+  __pyx_nargs = PyTuple_Size(__pyx_args); if (unlikely(__pyx_nargs < 0)) return NULL;
+  #endif
+  #endif
+  __pyx_kwvalues = __Pyx_KwValues_FASTCALL(__pyx_args, __pyx_nargs);
+  if (unlikely(__pyx_nargs > 0)) {
+    __Pyx_RaiseArgtupleInvalid("__reduce_cython__", 1, 0, 0, __pyx_nargs); return NULL;}
+  if (unlikely(__pyx_kwds) && __Pyx_NumKwargs_FASTCALL(__pyx_kwds) && unlikely(!__Pyx_CheckKeywordStrings(__pyx_kwds, "__reduce_cython__", 0))) return NULL;
+  __pyx_r = __pyx_pf___pyx_memoryviewslice___reduce_cython__(((struct __pyx_memoryviewslice_obj *)__pyx_v_self));
+
+  /* function exit code */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static PyObject *__pyx_pf___pyx_memoryviewslice___reduce_cython__(CYTHON_UNUSED struct __pyx_memoryviewslice_obj *__pyx_v_self) {
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("__reduce_cython__", 1);
+
+  /* "(tree fragment)":2
+ * def __reduce_cython__(self):
+ *     raise TypeError, "no default __reduce__ due to non-trivial __cinit__"             # <<<<<<<<<<<<<<
+ * def __setstate_cython__(self, __pyx_state):
+ *     raise TypeError, "no default __reduce__ due to non-trivial __cinit__"
+ *
+ * Implementation of __reduce_cython__: it unconditionally raises
+ * TypeError with the interned message constant and then takes the error
+ * exit, so the function always returns NULL. __pyx_v_self is not used.
+ */
+  __Pyx_Raise(__pyx_builtin_TypeError, __pyx_kp_s_no_default___reduce___due_to_non, 0, 0);
+  __PYX_ERR(1, 2, __pyx_L1_error)
+
+  /* "(tree fragment)":1
+ * def __reduce_cython__(self):             # <<<<<<<<<<<<<<
+ *     raise TypeError, "no default __reduce__ due to non-trivial __cinit__"
+ * def __setstate_cython__(self, __pyx_state):
+ */
+
+  /* function exit code: only the error path exists */
+  __pyx_L1_error:;
+  __Pyx_AddTraceback("View.MemoryView._memoryviewslice.__reduce_cython__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "(tree fragment)":3
+ * def __reduce_cython__(self):
+ *     raise TypeError, "no default __reduce__ due to non-trivial __cinit__"
+ * def __setstate_cython__(self, __pyx_state):             # <<<<<<<<<<<<<<
+ *     raise TypeError, "no default __reduce__ due to non-trivial __cinit__"
+ *
+ * Argument-parsing wrapper for _memoryviewslice.__setstate_cython__.
+ *
+ * The parameter list depends on CYTHON_METH_FASTCALL, as in the other
+ * wrappers in this file. Exactly one argument is accepted, either
+ * positionally or under the keyword name held in __pyx_n_s_pyx_state,
+ * and it is stored in values[0].
+ *
+ * Error handling:
+ *   - a wrong number of arguments jumps to __pyx_L5_argtuple_error, which
+ *     raises via __Pyx_RaiseArgtupleInvalid;
+ *   - any parsing failure ends at __pyx_L3_error, which releases the
+ *     entries of values, adds a traceback entry and returns NULL.
+ * On success the __pyx_pf_ implementation is called, the entries of
+ * values are released, and the implementation's result is returned.
+ */
+
+/* Python wrapper */
+static PyObject *__pyx_pw___pyx_memoryviewslice_3__setstate_cython__(PyObject *__pyx_v_self, 
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+); /*proto*/
+static PyObject *__pyx_pw___pyx_memoryviewslice_3__setstate_cython__(PyObject *__pyx_v_self, 
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+) {
+  CYTHON_UNUSED PyObject *__pyx_v___pyx_state = 0;
+  #if !CYTHON_METH_FASTCALL
+  CYTHON_UNUSED Py_ssize_t __pyx_nargs;
+  #endif
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject* values[1] = {0};
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__setstate_cython__ (wrapper)", 0);
+  #if !CYTHON_METH_FASTCALL
+  #if CYTHON_ASSUME_SAFE_MACROS
+  __pyx_nargs = PyTuple_GET_SIZE(__pyx_args);
+  #else
+  __pyx_nargs = PyTuple_Size(__pyx_args); if (unlikely(__pyx_nargs < 0)) return NULL;
+  #endif
+  #endif
+  __pyx_kwvalues = __Pyx_KwValues_FASTCALL(__pyx_args, __pyx_nargs);
+  {
+    PyObject **__pyx_pyargnames[] = {&__pyx_n_s_pyx_state,0};
+    if (__pyx_kwds) {
+      Py_ssize_t kw_args;
+      switch (__pyx_nargs) {
+        case  1: values[0] = __Pyx_Arg_FASTCALL(__pyx_args, 0);
+        CYTHON_FALLTHROUGH;
+        case  0: break;
+        default: goto __pyx_L5_argtuple_error;
+      }
+      kw_args = __Pyx_NumKwargs_FASTCALL(__pyx_kwds);
+      switch (__pyx_nargs) {
+        case  0:
+        if (likely((values[0] = __Pyx_GetKwValue_FASTCALL(__pyx_kwds, __pyx_kwvalues, __pyx_n_s_pyx_state)) != 0)) {
+          (void)__Pyx_Arg_NewRef_FASTCALL(values[0]);
+          kw_args--;
+        }
+        else if (unlikely(PyErr_Occurred())) __PYX_ERR(1, 3, __pyx_L3_error)
+        else goto __pyx_L5_argtuple_error;
+      }
+      if (unlikely(kw_args > 0)) {
+        const Py_ssize_t kwd_pos_args = __pyx_nargs;
+        if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_kwvalues, __pyx_pyargnames, 0, values + 0, kwd_pos_args, "__setstate_cython__") < 0)) __PYX_ERR(1, 3, __pyx_L3_error)
+      }
+    } else if (unlikely(__pyx_nargs != 1)) {
+      goto __pyx_L5_argtuple_error;
+    } else {
+      values[0] = __Pyx_Arg_FASTCALL(__pyx_args, 0);
+    }
+    __pyx_v___pyx_state = values[0];
+  }
+  goto __pyx_L6_skip;
+  __pyx_L5_argtuple_error:;
+  __Pyx_RaiseArgtupleInvalid("__setstate_cython__", 1, 1, 1, __pyx_nargs); __PYX_ERR(1, 3, __pyx_L3_error)
+  __pyx_L6_skip:;
+  goto __pyx_L4_argument_unpacking_done;
+  __pyx_L3_error:;
+  {
+    Py_ssize_t __pyx_temp;
+    for (__pyx_temp=0; __pyx_temp < (Py_ssize_t)(sizeof(values)/sizeof(values[0])); ++__pyx_temp) {
+      __Pyx_Arg_XDECREF_FASTCALL(values[__pyx_temp]);
+    }
+  }
+  __Pyx_AddTraceback("View.MemoryView._memoryviewslice.__setstate_cython__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __Pyx_RefNannyFinishContext();
+  return NULL;
+  __pyx_L4_argument_unpacking_done:;
+  __pyx_r = __pyx_pf___pyx_memoryviewslice_2__setstate_cython__(((struct __pyx_memoryviewslice_obj *)__pyx_v_self), __pyx_v___pyx_state);
+
+  /* function exit code: release the parsed argument slots before returning */
+  {
+    Py_ssize_t __pyx_temp;
+    for (__pyx_temp=0; __pyx_temp < (Py_ssize_t)(sizeof(values)/sizeof(values[0])); ++__pyx_temp) {
+      __Pyx_Arg_XDECREF_FASTCALL(values[__pyx_temp]);
+    }
+  }
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static PyObject *__pyx_pf___pyx_memoryviewslice_2__setstate_cython__(CYTHON_UNUSED struct __pyx_memoryviewslice_obj *__pyx_v_self, CYTHON_UNUSED PyObject *__pyx_v___pyx_state) {
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("__setstate_cython__", 1);
+
+  /* "(tree fragment)":4
+ *     raise TypeError, "no default __reduce__ due to non-trivial __cinit__"
+ * def __setstate_cython__(self, __pyx_state):
+ *     raise TypeError, "no default __reduce__ due to non-trivial __cinit__"             # <<<<<<<<<<<<<<
+ *
+ * Implementation of __setstate_cython__: it unconditionally raises
+ * TypeError with the interned message constant and then takes the error
+ * exit, so the function always returns NULL. Neither __pyx_v_self nor
+ * __pyx_v___pyx_state is used.
+ */
+  __Pyx_Raise(__pyx_builtin_TypeError, __pyx_kp_s_no_default___reduce___due_to_non, 0, 0);
+  __PYX_ERR(1, 4, __pyx_L1_error)
+
+  /* "(tree fragment)":3
+ * def __reduce_cython__(self):
+ *     raise TypeError, "no default __reduce__ due to non-trivial __cinit__"
+ * def __setstate_cython__(self, __pyx_state):             # <<<<<<<<<<<<<<
+ *     raise TypeError, "no default __reduce__ due to non-trivial __cinit__"
+ */
+
+  /* function exit code: only the error path exists */
+  __pyx_L1_error:;
+  __Pyx_AddTraceback("View.MemoryView._memoryviewslice.__setstate_cython__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":999
+ * 
+ * @cname('__pyx_memoryview_fromslice')
+ * cdef memoryview_fromslice(__Pyx_memviewslice memviewslice,             # <<<<<<<<<<<<<<
+ *                           int ndim,
+ *                           object (*to_object_func)(char *),
+ *
+ * Builds a new _memoryviewslice object that wraps memviewslice.
+ *
+ * Parameters:
+ *   memviewslice     slice struct to wrap (passed by value)
+ *   ndim             number of dimensions recorded in the result's view
+ *   to_object_func   stored on the result as to_object_func
+ *   to_dtype_func    stored on the result as to_dtype_func
+ *   dtype_is_object  converted to a Python bool and passed as the third
+ *                    constructor argument
+ *
+ * Steps, in order:
+ *   1. If memviewslice.memview is Py_None, return Py_None.
+ *   2. Create the result via __pyx_tp_new__memoryviewslice with the
+ *      argument tuple (None, 0, dtype_is_object).
+ *   3. Copy memviewslice into result->from_slice and call
+ *      __PYX_INC_MEMVIEW on the local copy.
+ *   4. Set from_object from the source memview's _get_base() and copy
+ *      typeinfo and the Py_buffer view; then override view.buf,
+ *      view.ndim and view.obj (Py_None, with Py_INCREF).
+ *   5. Set flags to PyBUF_RECORDS when the source memview's flags contain
+ *      PyBUF_WRITABLE, otherwise PyBUF_RECORDS_RO.
+ *   6. Point view.shape and view.strides at the arrays inside
+ *      result->from_slice; view.suboffsets stays NULL unless one of the
+ *      first ndim suboffsets is >= 0.
+ *   7. Compute view.len as itemsize multiplied by the first ndim shape
+ *      entries. The multiplication goes through Python integer objects,
+ *      so each iteration can fail and jump to the error exit.
+ *
+ * Returns the new object on success. On error a traceback entry is added
+ * and 0 is returned. In both cases the local references to result and
+ * length are released at __pyx_L0.
+ */
+
+static PyObject *__pyx_memoryview_fromslice(__Pyx_memviewslice __pyx_v_memviewslice, int __pyx_v_ndim, PyObject *(*__pyx_v_to_object_func)(char *), int (*__pyx_v_to_dtype_func)(char *, PyObject *), int __pyx_v_dtype_is_object) {
+  struct __pyx_memoryviewslice_obj *__pyx_v_result = 0;
+  Py_ssize_t __pyx_v_suboffset;
+  PyObject *__pyx_v_length = NULL;
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  int __pyx_t_1;
+  PyObject *__pyx_t_2 = NULL;
+  PyObject *__pyx_t_3 = NULL;
+  __Pyx_TypeInfo *__pyx_t_4;
+  Py_buffer __pyx_t_5;
+  Py_ssize_t *__pyx_t_6;
+  Py_ssize_t *__pyx_t_7;
+  Py_ssize_t *__pyx_t_8;
+  Py_ssize_t __pyx_t_9;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("memoryview_fromslice", 1);
+
+  /* "View.MemoryView":1007
+ *     cdef _memoryviewslice result
+ * 
+ *     if <PyObject *> memviewslice.memview == Py_None:             # <<<<<<<<<<<<<<
+ *         return None
+ * 
+ */
+  __pyx_t_1 = (((PyObject *)__pyx_v_memviewslice.memview) == Py_None);
+  if (__pyx_t_1) {
+
+    /* "View.MemoryView":1008
+ * 
+ *     if <PyObject *> memviewslice.memview == Py_None:
+ *         return None             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+    __Pyx_XDECREF(__pyx_r);
+    __pyx_r = Py_None; __Pyx_INCREF(Py_None);
+    goto __pyx_L0;
+
+    /* "View.MemoryView":1007
+ *     cdef _memoryviewslice result
+ * 
+ *     if <PyObject *> memviewslice.memview == Py_None:             # <<<<<<<<<<<<<<
+ *         return None
+ * 
+ */
+  }
+
+  /* "View.MemoryView":1013
+ * 
+ * 
+ *     result = _memoryviewslice.__new__(_memoryviewslice, None, 0, dtype_is_object)             # <<<<<<<<<<<<<<
+ * 
+ *     result.from_slice = memviewslice
+ */
+  __pyx_t_2 = __Pyx_PyBool_FromLong(__pyx_v_dtype_is_object); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 1013, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_2);
+  __pyx_t_3 = PyTuple_New(3); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 1013, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_3);
+  __Pyx_INCREF(Py_None);
+  __Pyx_GIVEREF(Py_None);
+  if (__Pyx_PyTuple_SET_ITEM(__pyx_t_3, 0, Py_None)) __PYX_ERR(1, 1013, __pyx_L1_error);
+  __Pyx_INCREF(__pyx_int_0);
+  __Pyx_GIVEREF(__pyx_int_0);
+  if (__Pyx_PyTuple_SET_ITEM(__pyx_t_3, 1, __pyx_int_0)) __PYX_ERR(1, 1013, __pyx_L1_error);
+  __Pyx_GIVEREF(__pyx_t_2);
+  if (__Pyx_PyTuple_SET_ITEM(__pyx_t_3, 2, __pyx_t_2)) __PYX_ERR(1, 1013, __pyx_L1_error);
+  __pyx_t_2 = 0;
+  __pyx_t_2 = ((PyObject *)__pyx_tp_new__memoryviewslice(((PyTypeObject *)__pyx_memoryviewslice_type), __pyx_t_3, NULL)); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 1013, __pyx_L1_error)
+  __Pyx_GOTREF((PyObject *)__pyx_t_2);
+  __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+  __pyx_v_result = ((struct __pyx_memoryviewslice_obj *)__pyx_t_2);
+  __pyx_t_2 = 0;
+
+  /* "View.MemoryView":1015
+ *     result = _memoryviewslice.__new__(_memoryviewslice, None, 0, dtype_is_object)
+ * 
+ *     result.from_slice = memviewslice             # <<<<<<<<<<<<<<
+ *     __PYX_INC_MEMVIEW(&memviewslice, 1)
+ * 
+ */
+  __pyx_v_result->from_slice = __pyx_v_memviewslice;
+
+  /* "View.MemoryView":1016
+ * 
+ *     result.from_slice = memviewslice
+ *     __PYX_INC_MEMVIEW(&memviewslice, 1)             # <<<<<<<<<<<<<<
+ * 
+ *     result.from_object = (<memoryview> memviewslice.memview)._get_base()
+ */
+  __PYX_INC_MEMVIEW((&__pyx_v_memviewslice), 1);
+
+  /* "View.MemoryView":1018
+ *     __PYX_INC_MEMVIEW(&memviewslice, 1)
+ * 
+ *     result.from_object = (<memoryview> memviewslice.memview)._get_base()             # <<<<<<<<<<<<<<
+ *     result.typeinfo = memviewslice.memview.typeinfo
+ * 
+ */
+  __pyx_t_2 = ((struct __pyx_vtabstruct_memoryview *)((struct __pyx_memoryview_obj *)__pyx_v_memviewslice.memview)->__pyx_vtab)->_get_base(((struct __pyx_memoryview_obj *)__pyx_v_memviewslice.memview)); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 1018, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_2);
+  __Pyx_GIVEREF(__pyx_t_2);
+  __Pyx_GOTREF(__pyx_v_result->from_object);
+  __Pyx_DECREF(__pyx_v_result->from_object);
+  __pyx_v_result->from_object = __pyx_t_2;
+  __pyx_t_2 = 0;
+
+  /* "View.MemoryView":1019
+ * 
+ *     result.from_object = (<memoryview> memviewslice.memview)._get_base()
+ *     result.typeinfo = memviewslice.memview.typeinfo             # <<<<<<<<<<<<<<
+ * 
+ *     result.view = memviewslice.memview.view
+ */
+  __pyx_t_4 = __pyx_v_memviewslice.memview->typeinfo;
+  __pyx_v_result->__pyx_base.typeinfo = __pyx_t_4;
+
+  /* "View.MemoryView":1021
+ *     result.typeinfo = memviewslice.memview.typeinfo
+ * 
+ *     result.view = memviewslice.memview.view             # <<<<<<<<<<<<<<
+ *     result.view.buf = <void *> memviewslice.data
+ *     result.view.ndim = ndim
+ */
+  __pyx_t_5 = __pyx_v_memviewslice.memview->view;
+  __pyx_v_result->__pyx_base.view = __pyx_t_5;
+
+  /* "View.MemoryView":1022
+ * 
+ *     result.view = memviewslice.memview.view
+ *     result.view.buf = <void *> memviewslice.data             # <<<<<<<<<<<<<<
+ *     result.view.ndim = ndim
+ *     (<__pyx_buffer *> &result.view).obj = Py_None
+ */
+  __pyx_v_result->__pyx_base.view.buf = ((void *)__pyx_v_memviewslice.data);
+
+  /* "View.MemoryView":1023
+ *     result.view = memviewslice.memview.view
+ *     result.view.buf = <void *> memviewslice.data
+ *     result.view.ndim = ndim             # <<<<<<<<<<<<<<
+ *     (<__pyx_buffer *> &result.view).obj = Py_None
+ *     Py_INCREF(Py_None)
+ */
+  __pyx_v_result->__pyx_base.view.ndim = __pyx_v_ndim;
+
+  /* "View.MemoryView":1024
+ *     result.view.buf = <void *> memviewslice.data
+ *     result.view.ndim = ndim
+ *     (<__pyx_buffer *> &result.view).obj = Py_None             # <<<<<<<<<<<<<<
+ *     Py_INCREF(Py_None)
+ * 
+ */
+  ((Py_buffer *)(&__pyx_v_result->__pyx_base.view))->obj = Py_None;
+
+  /* "View.MemoryView":1025
+ *     result.view.ndim = ndim
+ *     (<__pyx_buffer *> &result.view).obj = Py_None
+ *     Py_INCREF(Py_None)             # <<<<<<<<<<<<<<
+ * 
+ *     if (<memoryview>memviewslice.memview).flags & PyBUF_WRITABLE:
+ */
+  Py_INCREF(Py_None);
+
+  /* "View.MemoryView":1027
+ *     Py_INCREF(Py_None)
+ * 
+ *     if (<memoryview>memviewslice.memview).flags & PyBUF_WRITABLE:             # <<<<<<<<<<<<<<
+ *         result.flags = PyBUF_RECORDS
+ *     else:
+ */
+  __pyx_t_1 = ((((struct __pyx_memoryview_obj *)__pyx_v_memviewslice.memview)->flags & PyBUF_WRITABLE) != 0);
+  if (__pyx_t_1) {
+
+    /* "View.MemoryView":1028
+ * 
+ *     if (<memoryview>memviewslice.memview).flags & PyBUF_WRITABLE:
+ *         result.flags = PyBUF_RECORDS             # <<<<<<<<<<<<<<
+ *     else:
+ *         result.flags = PyBUF_RECORDS_RO
+ */
+    __pyx_v_result->__pyx_base.flags = PyBUF_RECORDS;
+
+    /* "View.MemoryView":1027
+ *     Py_INCREF(Py_None)
+ * 
+ *     if (<memoryview>memviewslice.memview).flags & PyBUF_WRITABLE:             # <<<<<<<<<<<<<<
+ *         result.flags = PyBUF_RECORDS
+ *     else:
+ */
+    goto __pyx_L4;
+  }
+
+  /* "View.MemoryView":1030
+ *         result.flags = PyBUF_RECORDS
+ *     else:
+ *         result.flags = PyBUF_RECORDS_RO             # <<<<<<<<<<<<<<
+ * 
+ *     result.view.shape = <Py_ssize_t *> result.from_slice.shape
+ */
+  /*else*/ {
+    __pyx_v_result->__pyx_base.flags = PyBUF_RECORDS_RO;
+  }
+  __pyx_L4:;
+
+  /* "View.MemoryView":1032
+ *         result.flags = PyBUF_RECORDS_RO
+ * 
+ *     result.view.shape = <Py_ssize_t *> result.from_slice.shape             # <<<<<<<<<<<<<<
+ *     result.view.strides = <Py_ssize_t *> result.from_slice.strides
+ * 
+ */
+  __pyx_v_result->__pyx_base.view.shape = ((Py_ssize_t *)__pyx_v_result->from_slice.shape);
+
+  /* "View.MemoryView":1033
+ * 
+ *     result.view.shape = <Py_ssize_t *> result.from_slice.shape
+ *     result.view.strides = <Py_ssize_t *> result.from_slice.strides             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  __pyx_v_result->__pyx_base.view.strides = ((Py_ssize_t *)__pyx_v_result->from_slice.strides);
+
+  /* "View.MemoryView":1036
+ * 
+ * 
+ *     result.view.suboffsets = NULL             # <<<<<<<<<<<<<<
+ *     for suboffset in result.from_slice.suboffsets[:ndim]:
+ *         if suboffset >= 0:
+ */
+  __pyx_v_result->__pyx_base.view.suboffsets = NULL;
+
+  /* "View.MemoryView":1037
+ * 
+ *     result.view.suboffsets = NULL
+ *     for suboffset in result.from_slice.suboffsets[:ndim]:             # <<<<<<<<<<<<<<
+ *         if suboffset >= 0:
+ *             result.view.suboffsets = <Py_ssize_t *> result.from_slice.suboffsets
+ */
+  __pyx_t_7 = (__pyx_v_result->from_slice.suboffsets + __pyx_v_ndim);
+  for (__pyx_t_8 = __pyx_v_result->from_slice.suboffsets; __pyx_t_8 < __pyx_t_7; __pyx_t_8++) {
+    __pyx_t_6 = __pyx_t_8;
+    __pyx_v_suboffset = (__pyx_t_6[0]);
+
+    /* "View.MemoryView":1038
+ *     result.view.suboffsets = NULL
+ *     for suboffset in result.from_slice.suboffsets[:ndim]:
+ *         if suboffset >= 0:             # <<<<<<<<<<<<<<
+ *             result.view.suboffsets = <Py_ssize_t *> result.from_slice.suboffsets
+ *             break
+ */
+    __pyx_t_1 = (__pyx_v_suboffset >= 0);
+    if (__pyx_t_1) {
+
+      /* "View.MemoryView":1039
+ *     for suboffset in result.from_slice.suboffsets[:ndim]:
+ *         if suboffset >= 0:
+ *             result.view.suboffsets = <Py_ssize_t *> result.from_slice.suboffsets             # <<<<<<<<<<<<<<
+ *             break
+ * 
+ */
+      __pyx_v_result->__pyx_base.view.suboffsets = ((Py_ssize_t *)__pyx_v_result->from_slice.suboffsets);
+
+      /* "View.MemoryView":1040
+ *         if suboffset >= 0:
+ *             result.view.suboffsets = <Py_ssize_t *> result.from_slice.suboffsets
+ *             break             # <<<<<<<<<<<<<<
+ * 
+ *     result.view.len = result.view.itemsize
+ */
+      goto __pyx_L6_break;
+
+      /* "View.MemoryView":1038
+ *     result.view.suboffsets = NULL
+ *     for suboffset in result.from_slice.suboffsets[:ndim]:
+ *         if suboffset >= 0:             # <<<<<<<<<<<<<<
+ *             result.view.suboffsets = <Py_ssize_t *> result.from_slice.suboffsets
+ *             break
+ */
+    }
+  }
+  __pyx_L6_break:;
+
+  /* "View.MemoryView":1042
+ *             break
+ * 
+ *     result.view.len = result.view.itemsize             # <<<<<<<<<<<<<<
+ *     for length in result.view.shape[:ndim]:
+ *         result.view.len *= length
+ */
+  __pyx_t_9 = __pyx_v_result->__pyx_base.view.itemsize;
+  __pyx_v_result->__pyx_base.view.len = __pyx_t_9;
+
+  /* "View.MemoryView":1043
+ * 
+ *     result.view.len = result.view.itemsize
+ *     for length in result.view.shape[:ndim]:             # <<<<<<<<<<<<<<
+ *         result.view.len *= length
+ * 
+ */
+  __pyx_t_7 = (__pyx_v_result->__pyx_base.view.shape + __pyx_v_ndim);
+  for (__pyx_t_8 = __pyx_v_result->__pyx_base.view.shape; __pyx_t_8 < __pyx_t_7; __pyx_t_8++) {
+    __pyx_t_6 = __pyx_t_8;
+    __pyx_t_2 = PyInt_FromSsize_t((__pyx_t_6[0])); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 1043, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_2);
+    __Pyx_XDECREF_SET(__pyx_v_length, __pyx_t_2);
+    __pyx_t_2 = 0;
+
+    /* "View.MemoryView":1044
+ *     result.view.len = result.view.itemsize
+ *     for length in result.view.shape[:ndim]:
+ *         result.view.len *= length             # <<<<<<<<<<<<<<
+ * 
+ *     result.to_object_func = to_object_func
+ */
+    __pyx_t_2 = PyInt_FromSsize_t(__pyx_v_result->__pyx_base.view.len); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 1044, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_2);
+    __pyx_t_3 = PyNumber_InPlaceMultiply(__pyx_t_2, __pyx_v_length); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 1044, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_3);
+    __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+    __pyx_t_9 = __Pyx_PyIndex_AsSsize_t(__pyx_t_3); if (unlikely((__pyx_t_9 == (Py_ssize_t)-1) && PyErr_Occurred())) __PYX_ERR(1, 1044, __pyx_L1_error)
+    __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+    __pyx_v_result->__pyx_base.view.len = __pyx_t_9;
+  }
+
+  /* "View.MemoryView":1046
+ *         result.view.len *= length
+ * 
+ *     result.to_object_func = to_object_func             # <<<<<<<<<<<<<<
+ *     result.to_dtype_func = to_dtype_func
+ * 
+ */
+  __pyx_v_result->to_object_func = __pyx_v_to_object_func;
+
+  /* "View.MemoryView":1047
+ * 
+ *     result.to_object_func = to_object_func
+ *     result.to_dtype_func = to_dtype_func             # <<<<<<<<<<<<<<
+ * 
+ *     return result
+ */
+  __pyx_v_result->to_dtype_func = __pyx_v_to_dtype_func;
+
+  /* "View.MemoryView":1049
+ *     result.to_dtype_func = to_dtype_func
+ * 
+ *     return result             # <<<<<<<<<<<<<<
+ * 
+ * @cname('__pyx_memoryview_get_slice_from_memoryview')
+ */
+  __Pyx_XDECREF(__pyx_r);
+  __Pyx_INCREF((PyObject *)__pyx_v_result);
+  __pyx_r = ((PyObject *)__pyx_v_result);
+  goto __pyx_L0;
+
+  /* "View.MemoryView":999
+ * 
+ * @cname('__pyx_memoryview_fromslice')
+ * cdef memoryview_fromslice(__Pyx_memviewslice memviewslice,             # <<<<<<<<<<<<<<
+ *                           int ndim,
+ *                           object (*to_object_func)(char *),
+ */
+
+  /* function exit code: the error label releases both temporaries and sets
+   * the result to 0; __pyx_L0 then drops the local references to result and
+   * length on every path */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_2);
+  __Pyx_XDECREF(__pyx_t_3);
+  __Pyx_AddTraceback("View.MemoryView.memoryview_fromslice", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = 0;
+  __pyx_L0:;
+  __Pyx_XDECREF((PyObject *)__pyx_v_result);
+  __Pyx_XDECREF(__pyx_v_length);
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":1052
+ * 
+ * @cname('__pyx_memoryview_get_slice_from_memoryview')
+ * cdef __Pyx_memviewslice *get_slice_from_memview(memoryview memview,             # <<<<<<<<<<<<<<
+ *                                                    __Pyx_memviewslice *mslice) except NULL:
+ *     cdef _memoryviewslice obj
+ *
+ * Returns a pointer to a slice struct describing memview.
+ *
+ * When memview passes __Pyx_TypeCheck against the _memoryviewslice type,
+ * the returned pointer is the address of that object's own from_slice
+ * member and mslice is not written. The local obj holds an extra
+ * reference to memview only for the duration of the call; it is released
+ * at __pyx_L0 before returning.
+ * Otherwise __pyx_memoryview_slice_copy(memview, mslice) is called and
+ * mslice itself is returned.
+ *
+ * Returns NULL, after adding a traceback entry, if the type test in the
+ * first branch fails.
+ */
+
+static __Pyx_memviewslice *__pyx_memoryview_get_slice_from_memoryview(struct __pyx_memoryview_obj *__pyx_v_memview, __Pyx_memviewslice *__pyx_v_mslice) {
+  struct __pyx_memoryviewslice_obj *__pyx_v_obj = 0;
+  __Pyx_memviewslice *__pyx_r;
+  __Pyx_RefNannyDeclarations
+  int __pyx_t_1;
+  PyObject *__pyx_t_2 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("get_slice_from_memview", 1);
+
+  /* "View.MemoryView":1055
+ *                                                    __Pyx_memviewslice *mslice) except NULL:
+ *     cdef _memoryviewslice obj
+ *     if isinstance(memview, _memoryviewslice):             # <<<<<<<<<<<<<<
+ *         obj = memview
+ *         return &obj.from_slice
+ */
+  __pyx_t_1 = __Pyx_TypeCheck(((PyObject *)__pyx_v_memview), __pyx_memoryviewslice_type); 
+  if (__pyx_t_1) {
+
+    /* "View.MemoryView":1056
+ *     cdef _memoryviewslice obj
+ *     if isinstance(memview, _memoryviewslice):
+ *         obj = memview             # <<<<<<<<<<<<<<
+ *         return &obj.from_slice
+ *     else:
+ */
+    if (!(likely(((((PyObject *)__pyx_v_memview)) == Py_None) || likely(__Pyx_TypeTest(((PyObject *)__pyx_v_memview), __pyx_memoryviewslice_type))))) __PYX_ERR(1, 1056, __pyx_L1_error)
+    __pyx_t_2 = ((PyObject *)__pyx_v_memview);
+    __Pyx_INCREF(__pyx_t_2);
+    __pyx_v_obj = ((struct __pyx_memoryviewslice_obj *)__pyx_t_2);
+    __pyx_t_2 = 0;
+
+    /* "View.MemoryView":1057
+ *     if isinstance(memview, _memoryviewslice):
+ *         obj = memview
+ *         return &obj.from_slice             # <<<<<<<<<<<<<<
+ *     else:
+ *         slice_copy(memview, mslice)
+ */
+    __pyx_r = (&__pyx_v_obj->from_slice);
+    goto __pyx_L0;
+
+    /* "View.MemoryView":1055
+ *                                                    __Pyx_memviewslice *mslice) except NULL:
+ *     cdef _memoryviewslice obj
+ *     if isinstance(memview, _memoryviewslice):             # <<<<<<<<<<<<<<
+ *         obj = memview
+ *         return &obj.from_slice
+ */
+  }
+
+  /* "View.MemoryView":1059
+ *         return &obj.from_slice
+ *     else:
+ *         slice_copy(memview, mslice)             # <<<<<<<<<<<<<<
+ *         return mslice
+ * 
+ */
+  /*else*/ {
+    __pyx_memoryview_slice_copy(__pyx_v_memview, __pyx_v_mslice);
+
+    /* "View.MemoryView":1060
+ *     else:
+ *         slice_copy(memview, mslice)
+ *         return mslice             # <<<<<<<<<<<<<<
+ * 
+ * @cname('__pyx_memoryview_slice_copy')
+ */
+    __pyx_r = __pyx_v_mslice;
+    goto __pyx_L0;
+  }
+
+  /* "View.MemoryView":1052
+ * 
+ * @cname('__pyx_memoryview_get_slice_from_memoryview')
+ * cdef __Pyx_memviewslice *get_slice_from_memview(memoryview memview,             # <<<<<<<<<<<<<<
+ *                                                    __Pyx_memviewslice *mslice) except NULL:
+ *     cdef _memoryviewslice obj
+ */
+
+  /* function exit code: the error label returns NULL; __pyx_L0 releases the
+   * temporary reference held in obj on every path */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_2);
+  __Pyx_AddTraceback("View.MemoryView.get_slice_from_memview", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __pyx_L0:;
+  __Pyx_XDECREF((PyObject *)__pyx_v_obj);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":1063
+ * 
+ * @cname('__pyx_memoryview_slice_copy')
+ * cdef void slice_copy(memoryview memview, __Pyx_memviewslice *dst) noexcept:             # <<<<<<<<<<<<<<
+ *     cdef int dim
+ *     cdef (Py_ssize_t*) shape, strides, suboffsets
+ *
+ * Review summary of the generated function below:
+ *   Fills *dst from memview: dst.memview is set to memview, dst.data to
+ *   memview.view.buf, and for each of the memview.view.ndim dimensions the
+ *   shape, stride and suboffset entries are copied. When the view has no
+ *   suboffsets array (NULL pointer), every dst suboffset is set to -1.
+ *   No reference count is changed here; dst.memview is stored as a plain
+ *   pointer. The function has no error path.
+ */
+
+static void __pyx_memoryview_slice_copy(struct __pyx_memoryview_obj *__pyx_v_memview, __Pyx_memviewslice *__pyx_v_dst) {
+  int __pyx_v_dim;
+  Py_ssize_t *__pyx_v_shape;
+  Py_ssize_t *__pyx_v_strides;
+  Py_ssize_t *__pyx_v_suboffsets;
+  Py_ssize_t *__pyx_t_1;
+  int __pyx_t_2;
+  int __pyx_t_3;
+  int __pyx_t_4;
+  Py_ssize_t __pyx_t_5;
+  int __pyx_t_6;
+
+  /* "View.MemoryView":1067
+ *     cdef (Py_ssize_t*) shape, strides, suboffsets
+ * 
+ *     shape = memview.view.shape             # <<<<<<<<<<<<<<
+ *     strides = memview.view.strides
+ *     suboffsets = memview.view.suboffsets
+ */
+  __pyx_t_1 = __pyx_v_memview->view.shape;
+  __pyx_v_shape = __pyx_t_1;
+
+  /* "View.MemoryView":1068
+ * 
+ *     shape = memview.view.shape
+ *     strides = memview.view.strides             # <<<<<<<<<<<<<<
+ *     suboffsets = memview.view.suboffsets
+ * 
+ */
+  __pyx_t_1 = __pyx_v_memview->view.strides;
+  __pyx_v_strides = __pyx_t_1;
+
+  /* "View.MemoryView":1069
+ *     shape = memview.view.shape
+ *     strides = memview.view.strides
+ *     suboffsets = memview.view.suboffsets             # <<<<<<<<<<<<<<
+ * 
+ *     dst.memview = <__pyx_memoryview *> memview
+ */
+  __pyx_t_1 = __pyx_v_memview->view.suboffsets;
+  __pyx_v_suboffsets = __pyx_t_1;
+
+  /* "View.MemoryView":1071
+ *     suboffsets = memview.view.suboffsets
+ * 
+ *     dst.memview = <__pyx_memoryview *> memview             # <<<<<<<<<<<<<<
+ *     dst.data = <char *> memview.view.buf
+ * 
+ */
+  __pyx_v_dst->memview = ((struct __pyx_memoryview_obj *)__pyx_v_memview);
+
+  /* "View.MemoryView":1072
+ * 
+ *     dst.memview = <__pyx_memoryview *> memview
+ *     dst.data = <char *> memview.view.buf             # <<<<<<<<<<<<<<
+ * 
+ *     for dim in range(memview.view.ndim):
+ */
+  __pyx_v_dst->data = ((char *)__pyx_v_memview->view.buf);
+
+  /* "View.MemoryView":1074
+ *     dst.data = <char *> memview.view.buf
+ * 
+ *     for dim in range(memview.view.ndim):             # <<<<<<<<<<<<<<
+ *         dst.shape[dim] = shape[dim]
+ *         dst.strides[dim] = strides[dim]
+ *
+ * The loop bound is read once into a temporary before iterating.
+ * NOTE(review): shape and strides are indexed without a NULL check; this
+ * presumably relies on the view always providing both arrays - confirm.
+ */
+  __pyx_t_2 = __pyx_v_memview->view.ndim;
+  __pyx_t_3 = __pyx_t_2;
+  for (__pyx_t_4 = 0; __pyx_t_4 < __pyx_t_3; __pyx_t_4+=1) {
+    __pyx_v_dim = __pyx_t_4;
+
+    /* "View.MemoryView":1075
+ * 
+ *     for dim in range(memview.view.ndim):
+ *         dst.shape[dim] = shape[dim]             # <<<<<<<<<<<<<<
+ *         dst.strides[dim] = strides[dim]
+ *         dst.suboffsets[dim] = suboffsets[dim] if suboffsets else -1
+ */
+    (__pyx_v_dst->shape[__pyx_v_dim]) = (__pyx_v_shape[__pyx_v_dim]);
+
+    /* "View.MemoryView":1076
+ *     for dim in range(memview.view.ndim):
+ *         dst.shape[dim] = shape[dim]
+ *         dst.strides[dim] = strides[dim]             # <<<<<<<<<<<<<<
+ *         dst.suboffsets[dim] = suboffsets[dim] if suboffsets else -1
+ * 
+ */
+    (__pyx_v_dst->strides[__pyx_v_dim]) = (__pyx_v_strides[__pyx_v_dim]);
+
+    /* "View.MemoryView":1077
+ *         dst.shape[dim] = shape[dim]
+ *         dst.strides[dim] = strides[dim]
+ *         dst.suboffsets[dim] = suboffsets[dim] if suboffsets else -1             # <<<<<<<<<<<<<<
+ * 
+ * @cname('__pyx_memoryview_copy_object')
+ *
+ * The suboffsets pointer is only dereferenced when it is non-NULL.
+ */
+    __pyx_t_6 = (__pyx_v_suboffsets != 0);
+    if (__pyx_t_6) {
+      __pyx_t_5 = (__pyx_v_suboffsets[__pyx_v_dim]);
+    } else {
+      __pyx_t_5 = -1L;
+    }
+    (__pyx_v_dst->suboffsets[__pyx_v_dim]) = __pyx_t_5;
+  }
+
+  /* "View.MemoryView":1063
+ * 
+ * @cname('__pyx_memoryview_slice_copy')
+ * cdef void slice_copy(memoryview memview, __Pyx_memviewslice *dst) noexcept:             # <<<<<<<<<<<<<<
+ *     cdef int dim
+ *     cdef (Py_ssize_t*) shape, strides, suboffsets
+ */
+
+  /* function exit code */
+}
+
+/* "View.MemoryView":1080
+ * 
+ * @cname('__pyx_memoryview_copy_object')
+ * cdef memoryview_copy(memoryview memview):             # <<<<<<<<<<<<<<
+ *     "Create a new memoryview object"
+ *     cdef __Pyx_memviewslice memviewslice
+ *
+ * Review summary of the generated function below:
+ *   Fills a stack-local slice struct from memview via slice_copy(), then
+ *   returns whatever memoryview_copy_from_slice() builds from it. If that
+ *   call returns NULL, a traceback entry is added and 0 (NULL) is returned.
+ */
+
+static PyObject *__pyx_memoryview_copy_object(struct __pyx_memoryview_obj *__pyx_v_memview) {
+  __Pyx_memviewslice __pyx_v_memviewslice;
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("memoryview_copy", 1);
+
+  /* "View.MemoryView":1083
+ *     "Create a new memoryview object"
+ *     cdef __Pyx_memviewslice memviewslice
+ *     slice_copy(memview, &memviewslice)             # <<<<<<<<<<<<<<
+ *     return memoryview_copy_from_slice(memview, &memviewslice)
+ * 
+ */
+  __pyx_memoryview_slice_copy(__pyx_v_memview, (&__pyx_v_memviewslice));
+
+  /* "View.MemoryView":1084
+ *     cdef __Pyx_memviewslice memviewslice
+ *     slice_copy(memview, &memviewslice)
+ *     return memoryview_copy_from_slice(memview, &memviewslice)             # <<<<<<<<<<<<<<
+ * 
+ * @cname('__pyx_memoryview_copy_object_from_slice')
+ *
+ * Ownership of the callee's result moves from the temporary to the return
+ * value; the temporary is cleared so the exit code does not release it.
+ */
+  __Pyx_XDECREF(__pyx_r);
+  __pyx_t_1 = __pyx_memoryview_copy_object_from_slice(__pyx_v_memview, (&__pyx_v_memviewslice)); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 1084, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_r = __pyx_t_1;
+  __pyx_t_1 = 0;
+  goto __pyx_L0;
+
+  /* "View.MemoryView":1080
+ * 
+ * @cname('__pyx_memoryview_copy_object')
+ * cdef memoryview_copy(memoryview memview):             # <<<<<<<<<<<<<<
+ *     "Create a new memoryview object"
+ *     cdef __Pyx_memviewslice memviewslice
+ */
+
+  /* function exit code: the error label releases the temporary, records a traceback and returns 0 */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_AddTraceback("View.MemoryView.memoryview_copy", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = 0;
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":1087
+ * 
+ * @cname('__pyx_memoryview_copy_object_from_slice')
+ * cdef memoryview_copy_from_slice(memoryview memview, __Pyx_memviewslice *memviewslice):             # <<<<<<<<<<<<<<
+ *     """
+ *     Create a new memoryview object from a given memoryview object and slice.
+ *
+ * Review summary of the generated function below:
+ *   Selects the element conversion callbacks (to_object_func, to_dtype_func):
+ *   they are taken from memview when it passes the type check against the
+ *   _memoryviewslice type, and are NULL otherwise. It then calls
+ *   memoryview_fromslice() with the slice struct passed by value, the
+ *   dimension count memview.view.ndim, the two callbacks and
+ *   memview.dtype_is_object, and returns that result. If the call returns
+ *   NULL, a traceback entry is added and 0 (NULL) is returned.
+ */
+
+static PyObject *__pyx_memoryview_copy_object_from_slice(struct __pyx_memoryview_obj *__pyx_v_memview, __Pyx_memviewslice *__pyx_v_memviewslice) {
+  PyObject *(*__pyx_v_to_object_func)(char *);
+  int (*__pyx_v_to_dtype_func)(char *, PyObject *);
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  int __pyx_t_1;
+  PyObject *(*__pyx_t_2)(char *);
+  int (*__pyx_t_3)(char *, PyObject *);
+  PyObject *__pyx_t_4 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("memoryview_copy_from_slice", 1);
+
+  /* "View.MemoryView":1094
+ *     cdef int (*to_dtype_func)(char *, object) except 0
+ * 
+ *     if isinstance(memview, _memoryviewslice):             # <<<<<<<<<<<<<<
+ *         to_object_func = (<_memoryviewslice> memview).to_object_func
+ *         to_dtype_func = (<_memoryviewslice> memview).to_dtype_func
+ */
+  __pyx_t_1 = __Pyx_TypeCheck(((PyObject *)__pyx_v_memview), __pyx_memoryviewslice_type); 
+  if (__pyx_t_1) {
+
+    /* "View.MemoryView":1095
+ * 
+ *     if isinstance(memview, _memoryviewslice):
+ *         to_object_func = (<_memoryviewslice> memview).to_object_func             # <<<<<<<<<<<<<<
+ *         to_dtype_func = (<_memoryviewslice> memview).to_dtype_func
+ *     else:
+ *
+ * The cast is unchecked here; it is guarded by the type check above.
+ */
+    __pyx_t_2 = ((struct __pyx_memoryviewslice_obj *)__pyx_v_memview)->to_object_func;
+    __pyx_v_to_object_func = __pyx_t_2;
+
+    /* "View.MemoryView":1096
+ *     if isinstance(memview, _memoryviewslice):
+ *         to_object_func = (<_memoryviewslice> memview).to_object_func
+ *         to_dtype_func = (<_memoryviewslice> memview).to_dtype_func             # <<<<<<<<<<<<<<
+ *     else:
+ *         to_object_func = NULL
+ */
+    __pyx_t_3 = ((struct __pyx_memoryviewslice_obj *)__pyx_v_memview)->to_dtype_func;
+    __pyx_v_to_dtype_func = __pyx_t_3;
+
+    /* "View.MemoryView":1094
+ *     cdef int (*to_dtype_func)(char *, object) except 0
+ * 
+ *     if isinstance(memview, _memoryviewslice):             # <<<<<<<<<<<<<<
+ *         to_object_func = (<_memoryviewslice> memview).to_object_func
+ *         to_dtype_func = (<_memoryviewslice> memview).to_dtype_func
+ */
+    goto __pyx_L3;
+  }
+
+  /* "View.MemoryView":1098
+ *         to_dtype_func = (<_memoryviewslice> memview).to_dtype_func
+ *     else:
+ *         to_object_func = NULL             # <<<<<<<<<<<<<<
+ *         to_dtype_func = NULL
+ * 
+ */
+  /*else*/ {
+    __pyx_v_to_object_func = NULL;
+
+    /* "View.MemoryView":1099
+ *     else:
+ *         to_object_func = NULL
+ *         to_dtype_func = NULL             # <<<<<<<<<<<<<<
+ * 
+ *     return memoryview_fromslice(memviewslice[0], memview.view.ndim,
+ */
+    __pyx_v_to_dtype_func = NULL;
+  }
+  __pyx_L3:;
+
+  /* "View.MemoryView":1101
+ *         to_dtype_func = NULL
+ * 
+ *     return memoryview_fromslice(memviewslice[0], memview.view.ndim,             # <<<<<<<<<<<<<<
+ *                                 to_object_func, to_dtype_func,
+ *                                 memview.dtype_is_object)
+ */
+  __Pyx_XDECREF(__pyx_r);
+
+  /* "View.MemoryView":1103
+ *     return memoryview_fromslice(memviewslice[0], memview.view.ndim,
+ *                                 to_object_func, to_dtype_func,
+ *                                 memview.dtype_is_object)             # <<<<<<<<<<<<<<
+ * 
+ * 
+ *
+ * memviewslice[0] dereferences the pointer, so the struct is passed by value.
+ * The error location reported on failure is source line 1101, where the
+ * return statement starts.
+ */
+  __pyx_t_4 = __pyx_memoryview_fromslice((__pyx_v_memviewslice[0]), __pyx_v_memview->view.ndim, __pyx_v_to_object_func, __pyx_v_to_dtype_func, __pyx_v_memview->dtype_is_object); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 1101, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_4);
+  __pyx_r = __pyx_t_4;
+  __pyx_t_4 = 0;
+  goto __pyx_L0;
+
+  /* "View.MemoryView":1087
+ * 
+ * @cname('__pyx_memoryview_copy_object_from_slice')
+ * cdef memoryview_copy_from_slice(memoryview memview, __Pyx_memviewslice *memviewslice):             # <<<<<<<<<<<<<<
+ *     """
+ *     Create a new memoryview object from a given memoryview object and slice.
+ */
+
+  /* function exit code: the error label releases the temporary, records a traceback and returns 0 */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_4);
+  __Pyx_AddTraceback("View.MemoryView.memoryview_copy_from_slice", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = 0;
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "View.MemoryView":1109
+ * 
+ * 
+ * cdef Py_ssize_t abs_py_ssize_t(Py_ssize_t arg) noexcept nogil:             # <<<<<<<<<<<<<<
+ *     return -arg if arg < 0 else arg
+ * 
+ *
+ * Review summary of the generated function below:
+ *   Returns the absolute value of arg: the negation when arg is negative,
+ *   arg itself otherwise. It touches no Python objects and has no error path.
+ *   NOTE(review): there is no guard for the most negative Py_ssize_t value,
+ *   whose negation is not representable; callers in this chunk pass strides.
+ */
+
+static Py_ssize_t abs_py_ssize_t(Py_ssize_t __pyx_v_arg) {
+  Py_ssize_t __pyx_r;
+  Py_ssize_t __pyx_t_1;
+  int __pyx_t_2;
+
+  /* "View.MemoryView":1110
+ * 
+ * cdef Py_ssize_t abs_py_ssize_t(Py_ssize_t arg) noexcept nogil:
+ *     return -arg if arg < 0 else arg             # <<<<<<<<<<<<<<
+ * 
+ * @cname('__pyx_get_best_slice_order')
+ */
+  __pyx_t_2 = (__pyx_v_arg < 0);
+  if (__pyx_t_2) {
+    __pyx_t_1 = (-__pyx_v_arg);
+  } else {
+    __pyx_t_1 = __pyx_v_arg;
+  }
+  __pyx_r = __pyx_t_1;
+  goto __pyx_L0;
+
+  /* "View.MemoryView":1109
+ * 
+ * 
+ * cdef Py_ssize_t abs_py_ssize_t(Py_ssize_t arg) noexcept nogil:             # <<<<<<<<<<<<<<
+ *     return -arg if arg < 0 else arg
+ * 
+ */
+
+  /* function exit code */
+  __pyx_L0:;
+  return __pyx_r;
+}
+
+/* "View.MemoryView":1113
+ * 
+ * @cname('__pyx_get_best_slice_order')
+ * cdef char get_best_order(__Pyx_memviewslice *mslice, int ndim) noexcept nogil:             # <<<<<<<<<<<<<<
+ *     """
+ *     Figure out the best memory access order for a given slice.
+ *
+ * Review summary of the generated function below:
+ *   c_stride is the stride of the last dimension (scanning from ndim - 1 down
+ *   to 0) whose extent is greater than 1; f_stride is the stride of the first
+ *   such dimension (scanning from 0 up). Both stay 0 when no dimension has an
+ *   extent greater than 1. The result is the character 'C' when
+ *   |c_stride| <= |f_stride| and 'F' otherwise, so ties (including the
+ *   all-zero case) yield 'C'.
+ */
+
+static char __pyx_get_best_slice_order(__Pyx_memviewslice *__pyx_v_mslice, int __pyx_v_ndim) {
+  int __pyx_v_i;
+  Py_ssize_t __pyx_v_c_stride;
+  Py_ssize_t __pyx_v_f_stride;
+  char __pyx_r;
+  int __pyx_t_1;
+  int __pyx_t_2;
+  int __pyx_t_3;
+  int __pyx_t_4;
+
+  /* "View.MemoryView":1118
+ *     """
+ *     cdef int i
+ *     cdef Py_ssize_t c_stride = 0             # <<<<<<<<<<<<<<
+ *     cdef Py_ssize_t f_stride = 0
+ * 
+ */
+  __pyx_v_c_stride = 0;
+
+  /* "View.MemoryView":1119
+ *     cdef int i
+ *     cdef Py_ssize_t c_stride = 0
+ *     cdef Py_ssize_t f_stride = 0             # <<<<<<<<<<<<<<
+ * 
+ *     for i in range(ndim - 1, -1, -1):
+ */
+  __pyx_v_f_stride = 0;
+
+  /* "View.MemoryView":1121
+ *     cdef Py_ssize_t f_stride = 0
+ * 
+ *     for i in range(ndim - 1, -1, -1):             # <<<<<<<<<<<<<<
+ *         if mslice.shape[i] > 1:
+ *             c_stride = mslice.strides[i]
+ *
+ * Backward scan: stops at the last dimension with more than one element.
+ */
+  for (__pyx_t_1 = (__pyx_v_ndim - 1); __pyx_t_1 > -1; __pyx_t_1-=1) {
+    __pyx_v_i = __pyx_t_1;
+
+    /* "View.MemoryView":1122
+ * 
+ *     for i in range(ndim - 1, -1, -1):
+ *         if mslice.shape[i] > 1:             # <<<<<<<<<<<<<<
+ *             c_stride = mslice.strides[i]
+ *             break
+ */
+    __pyx_t_2 = ((__pyx_v_mslice->shape[__pyx_v_i]) > 1);
+    if (__pyx_t_2) {
+
+      /* "View.MemoryView":1123
+ *     for i in range(ndim - 1, -1, -1):
+ *         if mslice.shape[i] > 1:
+ *             c_stride = mslice.strides[i]             # <<<<<<<<<<<<<<
+ *             break
+ * 
+ */
+      __pyx_v_c_stride = (__pyx_v_mslice->strides[__pyx_v_i]);
+
+      /* "View.MemoryView":1124
+ *         if mslice.shape[i] > 1:
+ *             c_stride = mslice.strides[i]
+ *             break             # <<<<<<<<<<<<<<
+ * 
+ *     for i in range(ndim):
+ */
+      goto __pyx_L4_break;
+
+      /* "View.MemoryView":1122
+ * 
+ *     for i in range(ndim - 1, -1, -1):
+ *         if mslice.shape[i] > 1:             # <<<<<<<<<<<<<<
+ *             c_stride = mslice.strides[i]
+ *             break
+ */
+    }
+  }
+  __pyx_L4_break:;
+
+  /* "View.MemoryView":1126
+ *             break
+ * 
+ *     for i in range(ndim):             # <<<<<<<<<<<<<<
+ *         if mslice.shape[i] > 1:
+ *             f_stride = mslice.strides[i]
+ *
+ * Forward scan: stops at the first dimension with more than one element.
+ */
+  __pyx_t_1 = __pyx_v_ndim;
+  __pyx_t_3 = __pyx_t_1;
+  for (__pyx_t_4 = 0; __pyx_t_4 < __pyx_t_3; __pyx_t_4+=1) {
+    __pyx_v_i = __pyx_t_4;
+
+    /* "View.MemoryView":1127
+ * 
+ *     for i in range(ndim):
+ *         if mslice.shape[i] > 1:             # <<<<<<<<<<<<<<
+ *             f_stride = mslice.strides[i]
+ *             break
+ */
+    __pyx_t_2 = ((__pyx_v_mslice->shape[__pyx_v_i]) > 1);
+    if (__pyx_t_2) {
+
+      /* "View.MemoryView":1128
+ *     for i in range(ndim):
+ *         if mslice.shape[i] > 1:
+ *             f_stride = mslice.strides[i]             # <<<<<<<<<<<<<<
+ *             break
+ * 
+ */
+      __pyx_v_f_stride = (__pyx_v_mslice->strides[__pyx_v_i]);
+
+      /* "View.MemoryView":1129
+ *         if mslice.shape[i] > 1:
+ *             f_stride = mslice.strides[i]
+ *             break             # <<<<<<<<<<<<<<
+ * 
+ *     if abs_py_ssize_t(c_stride) <= abs_py_ssize_t(f_stride):
+ */
+      goto __pyx_L7_break;
+
+      /* "View.MemoryView":1127
+ * 
+ *     for i in range(ndim):
+ *         if mslice.shape[i] > 1:             # <<<<<<<<<<<<<<
+ *             f_stride = mslice.strides[i]
+ *             break
+ */
+    }
+  }
+  __pyx_L7_break:;
+
+  /* "View.MemoryView":1131
+ *             break
+ * 
+ *     if abs_py_ssize_t(c_stride) <= abs_py_ssize_t(f_stride):             # <<<<<<<<<<<<<<
+ *         return 'C'
+ *     else:
+ *
+ * Magnitudes are compared, so the sign of a stride does not affect the result.
+ */
+  __pyx_t_2 = (abs_py_ssize_t(__pyx_v_c_stride) <= abs_py_ssize_t(__pyx_v_f_stride));
+  if (__pyx_t_2) {
+
+    /* "View.MemoryView":1132
+ * 
+ *     if abs_py_ssize_t(c_stride) <= abs_py_ssize_t(f_stride):
+ *         return 'C'             # <<<<<<<<<<<<<<
+ *     else:
+ *         return 'F'
+ */
+    __pyx_r = 'C';
+    goto __pyx_L0;
+
+    /* "View.MemoryView":1131
+ *             break
+ * 
+ *     if abs_py_ssize_t(c_stride) <= abs_py_ssize_t(f_stride):             # <<<<<<<<<<<<<<
+ *         return 'C'
+ *     else:
+ */
+  }
+
+  /* "View.MemoryView":1134
+ *         return 'C'
+ *     else:
+ *         return 'F'             # <<<<<<<<<<<<<<
+ * 
+ * @cython.cdivision(True)
+ */
+  /*else*/ {
+    __pyx_r = 'F';
+    goto __pyx_L0;
+  }
+
+  /* "View.MemoryView":1113
+ * 
+ * @cname('__pyx_get_best_slice_order')
+ * cdef char get_best_order(__Pyx_memviewslice *mslice, int ndim) noexcept nogil:             # <<<<<<<<<<<<<<
+ *     """
+ *     Figure out the best memory access order for a given slice.
+ */
+
+  /* function exit code */
+  __pyx_L0:;
+  return __pyx_r;
+}
+
+/* "View.MemoryView":1137
+ * 
+ * @cython.cdivision(True)
+ * cdef void _copy_strided_to_strided(char *src_data, Py_ssize_t *src_strides,             # <<<<<<<<<<<<<<
+ *                                    char *dst_data, Py_ssize_t *dst_strides,
+ *                                    Py_ssize_t *src_shape, Py_ssize_t *dst_shape,
+ *
+ * Review summary of the generated function below:
+ *   Recursive element copy from src_data to dst_data, one dimension per call.
+ *   - ndim == 1: when both leading strides are positive and equal to itemsize,
+ *     a single memcpy of itemsize * dst_extent bytes is issued; otherwise each
+ *     element is copied with its own memcpy and both pointers advance by their
+ *     stride.
+ *   - ndim != 1: for each index of the leading dimension the function calls
+ *     itself with the stride and shape pointers advanced by one and ndim - 1,
+ *     then advances both data pointers by their leading stride.
+ *   Every loop is bounded by dst_shape[0]; src_shape[0] is read into a local
+ *   that is not used afterwards. No shape compatibility check is made here.
+ *   NOTE(review): the else branch is also taken for ndim < 1, and the recursion
+ *   has no other base case; this presumably relies on callers passing
+ *   ndim >= 1 - confirm.
+ */
+
+static void _copy_strided_to_strided(char *__pyx_v_src_data, Py_ssize_t *__pyx_v_src_strides, char *__pyx_v_dst_data, Py_ssize_t *__pyx_v_dst_strides, Py_ssize_t *__pyx_v_src_shape, Py_ssize_t *__pyx_v_dst_shape, int __pyx_v_ndim, size_t __pyx_v_itemsize) {
+  CYTHON_UNUSED Py_ssize_t __pyx_v_i;
+  CYTHON_UNUSED Py_ssize_t __pyx_v_src_extent;
+  Py_ssize_t __pyx_v_dst_extent;
+  Py_ssize_t __pyx_v_src_stride;
+  Py_ssize_t __pyx_v_dst_stride;
+  int __pyx_t_1;
+  int __pyx_t_2;
+  Py_ssize_t __pyx_t_3;
+  Py_ssize_t __pyx_t_4;
+  Py_ssize_t __pyx_t_5;
+
+  /* "View.MemoryView":1144
+ * 
+ *     cdef Py_ssize_t i
+ *     cdef Py_ssize_t src_extent = src_shape[0]             # <<<<<<<<<<<<<<
+ *     cdef Py_ssize_t dst_extent = dst_shape[0]
+ *     cdef Py_ssize_t src_stride = src_strides[0]
+ */
+  __pyx_v_src_extent = (__pyx_v_src_shape[0]);
+
+  /* "View.MemoryView":1145
+ *     cdef Py_ssize_t i
+ *     cdef Py_ssize_t src_extent = src_shape[0]
+ *     cdef Py_ssize_t dst_extent = dst_shape[0]             # <<<<<<<<<<<<<<
+ *     cdef Py_ssize_t src_stride = src_strides[0]
+ *     cdef Py_ssize_t dst_stride = dst_strides[0]
+ */
+  __pyx_v_dst_extent = (__pyx_v_dst_shape[0]);
+
+  /* "View.MemoryView":1146
+ *     cdef Py_ssize_t src_extent = src_shape[0]
+ *     cdef Py_ssize_t dst_extent = dst_shape[0]
+ *     cdef Py_ssize_t src_stride = src_strides[0]             # <<<<<<<<<<<<<<
+ *     cdef Py_ssize_t dst_stride = dst_strides[0]
+ * 
+ */
+  __pyx_v_src_stride = (__pyx_v_src_strides[0]);
+
+  /* "View.MemoryView":1147
+ *     cdef Py_ssize_t dst_extent = dst_shape[0]
+ *     cdef Py_ssize_t src_stride = src_strides[0]
+ *     cdef Py_ssize_t dst_stride = dst_strides[0]             # <<<<<<<<<<<<<<
+ * 
+ *     if ndim == 1:
+ */
+  __pyx_v_dst_stride = (__pyx_v_dst_strides[0]);
+
+  /* "View.MemoryView":1149
+ *     cdef Py_ssize_t dst_stride = dst_strides[0]
+ * 
+ *     if ndim == 1:             # <<<<<<<<<<<<<<
+ *         if (src_stride > 0 and dst_stride > 0 and
+ *             <size_t> src_stride == itemsize == <size_t> dst_stride):
+ */
+  __pyx_t_1 = (__pyx_v_ndim == 1);
+  if (__pyx_t_1) {
+
+    /* "View.MemoryView":1150
+ * 
+ *     if ndim == 1:
+ *         if (src_stride > 0 and dst_stride > 0 and             # <<<<<<<<<<<<<<
+ *             <size_t> src_stride == itemsize == <size_t> dst_stride):
+ *             memcpy(dst_data, src_data, itemsize * dst_extent)
+ *
+ * Short-circuit evaluation: the first false operand jumps straight to the
+ * label that holds the final truth value. The sign checks run before the
+ * casts to size_t below, so only positive strides are converted.
+ */
+    __pyx_t_2 = (__pyx_v_src_stride > 0);
+    if (__pyx_t_2) {
+    } else {
+      __pyx_t_1 = __pyx_t_2;
+      goto __pyx_L5_bool_binop_done;
+    }
+    __pyx_t_2 = (__pyx_v_dst_stride > 0);
+    if (__pyx_t_2) {
+    } else {
+      __pyx_t_1 = __pyx_t_2;
+      goto __pyx_L5_bool_binop_done;
+    }
+
+    /* "View.MemoryView":1151
+ *     if ndim == 1:
+ *         if (src_stride > 0 and dst_stride > 0 and
+ *             <size_t> src_stride == itemsize == <size_t> dst_stride):             # <<<<<<<<<<<<<<
+ *             memcpy(dst_data, src_data, itemsize * dst_extent)
+ *         else:
+ *
+ * The chained comparison is expanded into two tests; the second only runs
+ * when the first holds.
+ */
+    __pyx_t_2 = (((size_t)__pyx_v_src_stride) == __pyx_v_itemsize);
+    if (__pyx_t_2) {
+      __pyx_t_2 = (__pyx_v_itemsize == ((size_t)__pyx_v_dst_stride));
+    }
+    __pyx_t_1 = __pyx_t_2;
+    __pyx_L5_bool_binop_done:;
+
+    /* "View.MemoryView":1150
+ * 
+ *     if ndim == 1:
+ *         if (src_stride > 0 and dst_stride > 0 and             # <<<<<<<<<<<<<<
+ *             <size_t> src_stride == itemsize == <size_t> dst_stride):
+ *             memcpy(dst_data, src_data, itemsize * dst_extent)
+ */
+    if (__pyx_t_1) {
+
+      /* "View.MemoryView":1152
+ *         if (src_stride > 0 and dst_stride > 0 and
+ *             <size_t> src_stride == itemsize == <size_t> dst_stride):
+ *             memcpy(dst_data, src_data, itemsize * dst_extent)             # <<<<<<<<<<<<<<
+ *         else:
+ *             for i in range(dst_extent):
+ *
+ * Fast path: both rows are densely packed, so one bulk copy covers the row.
+ */
+      (void)(memcpy(__pyx_v_dst_data, __pyx_v_src_data, (__pyx_v_itemsize * __pyx_v_dst_extent)));
+
+      /* "View.MemoryView":1150
+ * 
+ *     if ndim == 1:
+ *         if (src_stride > 0 and dst_stride > 0 and             # <<<<<<<<<<<<<<
+ *             <size_t> src_stride == itemsize == <size_t> dst_stride):
+ *             memcpy(dst_data, src_data, itemsize * dst_extent)
+ */
+      goto __pyx_L4;
+    }
+
+    /* "View.MemoryView":1154
+ *             memcpy(dst_data, src_data, itemsize * dst_extent)
+ *         else:
+ *             for i in range(dst_extent):             # <<<<<<<<<<<<<<
+ *                 memcpy(dst_data, src_data, itemsize)
+ *                 src_data += src_stride
+ *
+ * Slow path: one memcpy per element, stepping each pointer by its own stride.
+ */
+    /*else*/ {
+      __pyx_t_3 = __pyx_v_dst_extent;
+      __pyx_t_4 = __pyx_t_3;
+      for (__pyx_t_5 = 0; __pyx_t_5 < __pyx_t_4; __pyx_t_5+=1) {
+        __pyx_v_i = __pyx_t_5;
+
+        /* "View.MemoryView":1155
+ *         else:
+ *             for i in range(dst_extent):
+ *                 memcpy(dst_data, src_data, itemsize)             # <<<<<<<<<<<<<<
+ *                 src_data += src_stride
+ *                 dst_data += dst_stride
+ */
+        (void)(memcpy(__pyx_v_dst_data, __pyx_v_src_data, __pyx_v_itemsize));
+
+        /* "View.MemoryView":1156
+ *             for i in range(dst_extent):
+ *                 memcpy(dst_data, src_data, itemsize)
+ *                 src_data += src_stride             # <<<<<<<<<<<<<<
+ *                 dst_data += dst_stride
+ *     else:
+ */
+        __pyx_v_src_data = (__pyx_v_src_data + __pyx_v_src_stride);
+
+        /* "View.MemoryView":1157
+ *                 memcpy(dst_data, src_data, itemsize)
+ *                 src_data += src_stride
+ *                 dst_data += dst_stride             # <<<<<<<<<<<<<<
+ *     else:
+ *         for i in range(dst_extent):
+ */
+        __pyx_v_dst_data = (__pyx_v_dst_data + __pyx_v_dst_stride);
+      }
+    }
+    __pyx_L4:;
+
+    /* "View.MemoryView":1149
+ *     cdef Py_ssize_t dst_stride = dst_strides[0]
+ * 
+ *     if ndim == 1:             # <<<<<<<<<<<<<<
+ *         if (src_stride > 0 and dst_stride > 0 and
+ *             <size_t> src_stride == itemsize == <size_t> dst_stride):
+ */
+    goto __pyx_L3;
+  }
+
+  /* "View.MemoryView":1159
+ *                 dst_data += dst_stride
+ *     else:
+ *         for i in range(dst_extent):             # <<<<<<<<<<<<<<
+ *             _copy_strided_to_strided(src_data, src_strides + 1,
+ *                                      dst_data, dst_strides + 1,
+ *
+ * Recursive case: peel off the leading dimension and copy each sub-block.
+ */
+  /*else*/ {
+    __pyx_t_3 = __pyx_v_dst_extent;
+    __pyx_t_4 = __pyx_t_3;
+    for (__pyx_t_5 = 0; __pyx_t_5 < __pyx_t_4; __pyx_t_5+=1) {
+      __pyx_v_i = __pyx_t_5;
+
+      /* "View.MemoryView":1160
+ *     else:
+ *         for i in range(dst_extent):
+ *             _copy_strided_to_strided(src_data, src_strides + 1,             # <<<<<<<<<<<<<<
+ *                                      dst_data, dst_strides + 1,
+ *                                      src_shape + 1, dst_shape + 1,
+ */
+      _copy_strided_to_strided(__pyx_v_src_data, (__pyx_v_src_strides + 1), __pyx_v_dst_data, (__pyx_v_dst_strides + 1), (__pyx_v_src_shape + 1), (__pyx_v_dst_shape + 1), (__pyx_v_ndim - 1), __pyx_v_itemsize);
+
+      /* "View.MemoryView":1164
+ *                                      src_shape + 1, dst_shape + 1,
+ *                                      ndim - 1, itemsize)
+ *             src_data += src_stride             # <<<<<<<<<<<<<<
+ *             dst_data += dst_stride
+ * 
+ */
+      __pyx_v_src_data = (__pyx_v_src_data + __pyx_v_src_stride);
+
+      /* "View.MemoryView":1165
+ *                                      ndim - 1, itemsize)
+ *             src_data += src_stride
+ *             dst_data += dst_stride             # <<<<<<<<<<<<<<
+ * 
+ * cdef void copy_strided_to_strided(__Pyx_memviewslice *src,
+ */
+      __pyx_v_dst_data = (__pyx_v_dst_data + __pyx_v_dst_stride);
+    }
+  }
+  __pyx_L3:;
+
+  /* "View.MemoryView":1137
+ * 
+ * @cython.cdivision(True)
+ * cdef void _copy_strided_to_strided(char *src_data, Py_ssize_t *src_strides,             # <<<<<<<<<<<<<<
+ *                                    char *dst_data, Py_ssize_t *dst_strides,
+ *                                    Py_ssize_t *src_shape, Py_ssize_t *dst_shape,
+ */
+
+  /* function exit code */
+}
+
+/* "View.MemoryView":1167
+ *             dst_data += dst_stride
+ * 
+ * cdef void copy_strided_to_strided(__Pyx_memviewslice *src,             # <<<<<<<<<<<<<<
+ *                                   __Pyx_memviewslice *dst,
+ *                                   int ndim, size_t itemsize) noexcept nogil:
+ *
+ * Review summary of the generated function below:
+ *   Thin wrapper: unpacks the data pointer, strides and shape arrays of the
+ *   src and dst slice structs and forwards them, with ndim and itemsize, to
+ *   the recursive _copy_strided_to_strided(). It performs no checks itself.
+ */
+
+static void copy_strided_to_strided(__Pyx_memviewslice *__pyx_v_src, __Pyx_memviewslice *__pyx_v_dst, int __pyx_v_ndim, size_t __pyx_v_itemsize) {
+
+  /* "View.MemoryView":1170
+ *                                   __Pyx_memviewslice *dst,
+ *                                   int ndim, size_t itemsize) noexcept nogil:
+ *     _copy_strided_to_strided(src.data, src.strides, dst.data, dst.strides,             # <<<<<<<<<<<<<<
+ *                              src.shape, dst.shape, ndim, itemsize)
+ * 
+ */
+  _copy_strided_to_strided(__pyx_v_src->data, __pyx_v_src->strides, __pyx_v_dst->data, __pyx_v_dst->strides, __pyx_v_src->shape, __pyx_v_dst->shape, __pyx_v_ndim, __pyx_v_itemsize);
+
+  /* "View.MemoryView":1167
+ *             dst_data += dst_stride
+ * 
+ * cdef void copy_strided_to_strided(__Pyx_memviewslice *src,             # <<<<<<<<<<<<<<
+ *                                   __Pyx_memviewslice *dst,
+ *                                   int ndim, size_t itemsize) noexcept nogil:
+ */
+
+  /* function exit code */
+}
+
+/* "View.MemoryView":1174
+ * 
+ * @cname('__pyx_memoryview_slice_get_size')
+ * cdef Py_ssize_t slice_get_size(__Pyx_memviewslice *src, int ndim) noexcept nogil:             # <<<<<<<<<<<<<<
+ *     "Return the size of the memory occupied by the slice in number of bytes"
+ *     cdef Py_ssize_t shape, size = src.memview.view.itemsize
+ *
+ * Review summary of the generated function below:
+ *   Starts from src.memview.view.itemsize and multiplies it by each of the
+ *   first ndim entries of src.shape, returning the product. With ndim == 0 the
+ *   loop does not run and the item size itself is returned. The multiplication
+ *   is not checked for overflow, and src.memview is dereferenced without a
+ *   NULL check.
+ */
+
+static Py_ssize_t __pyx_memoryview_slice_get_size(__Pyx_memviewslice *__pyx_v_src, int __pyx_v_ndim) {
+  Py_ssize_t __pyx_v_shape;
+  Py_ssize_t __pyx_v_size;
+  Py_ssize_t __pyx_r;
+  Py_ssize_t __pyx_t_1;
+  Py_ssize_t *__pyx_t_2;
+  Py_ssize_t *__pyx_t_3;
+  Py_ssize_t *__pyx_t_4;
+
+  /* "View.MemoryView":1176
+ * cdef Py_ssize_t slice_get_size(__Pyx_memviewslice *src, int ndim) noexcept nogil:
+ *     "Return the size of the memory occupied by the slice in number of bytes"
+ *     cdef Py_ssize_t shape, size = src.memview.view.itemsize             # <<<<<<<<<<<<<<
+ * 
+ *     for shape in src.shape[:ndim]:
+ */
+  __pyx_t_1 = __pyx_v_src->memview->view.itemsize;
+  __pyx_v_size = __pyx_t_1;
+
+  /* "View.MemoryView":1178
+ *     cdef Py_ssize_t shape, size = src.memview.view.itemsize
+ * 
+ *     for shape in src.shape[:ndim]:             # <<<<<<<<<<<<<<
+ *         size *= shape
+ * 
+ *
+ * The slice iteration is lowered to a pointer walk from src->shape up to,
+ * but not including, src->shape + ndim.
+ */
+  __pyx_t_3 = (__pyx_v_src->shape + __pyx_v_ndim);
+  for (__pyx_t_4 = __pyx_v_src->shape; __pyx_t_4 < __pyx_t_3; __pyx_t_4++) {
+    __pyx_t_2 = __pyx_t_4;
+    __pyx_v_shape = (__pyx_t_2[0]);
+
+    /* "View.MemoryView":1179
+ * 
+ *     for shape in src.shape[:ndim]:
+ *         size *= shape             # <<<<<<<<<<<<<<
+ * 
+ *     return size
+ */
+    __pyx_v_size = (__pyx_v_size * __pyx_v_shape);
+  }
+
+  /* "View.MemoryView":1181
+ *         size *= shape
+ * 
+ *     return size             # <<<<<<<<<<<<<<
+ * 
+ * @cname('__pyx_fill_contig_strides_array')
+ */
+  __pyx_r = __pyx_v_size;
+  goto __pyx_L0;
+
+  /* "View.MemoryView":1174
+ * 
+ * @cname('__pyx_memoryview_slice_get_size')
+ * cdef Py_ssize_t slice_get_size(__Pyx_memviewslice *src, int ndim) noexcept nogil:             # <<<<<<<<<<<<<<
+ *     "Return the size of the memory occupied by the slice in number of bytes"
+ *     cdef Py_ssize_t shape, size = src.memview.view.itemsize
+ */
+
+  /* function exit code */
+  __pyx_L0:;
+  return __pyx_r;
+}
+
+/* "View.MemoryView":1184
+ * 
+ * @cname('__pyx_fill_contig_strides_array')
+ * cdef Py_ssize_t fill_contig_strides_array(             # <<<<<<<<<<<<<<
+ *                 Py_ssize_t *shape, Py_ssize_t *strides, Py_ssize_t stride,
+ *                 int ndim, char order) noexcept nogil:
+ */
+
+static Py_ssize_t __pyx_fill_contig_strides_array(Py_ssize_t *__pyx_v_shape, Py_ssize_t *__pyx_v_strides, Py_ssize_t __pyx_v_stride, int __pyx_v_ndim, char __pyx_v_order) {
+  int __pyx_v_idx;
+  Py_ssize_t __pyx_r;
+  int __pyx_t_1;
+  int __pyx_t_2;
+  int __pyx_t_3;
+  int __pyx_t_4;
+  /* Writes strides[0..ndim): each entry receives the running value of `stride`, which is then multiplied by shape[idx]; returns the final running value. */
+  /* "View.MemoryView":1193
+ *     cdef int idx
+ * 
+ *     if order == 'F':             # <<<<<<<<<<<<<<
+ *         for idx in range(ndim):
+ *             strides[idx] = stride
+ */
+  __pyx_t_1 = (__pyx_v_order == 'F');  /* 'F' walks idx upward from 0; any other value walks idx downward from ndim - 1 */
+  if (__pyx_t_1) {
+
+    /* "View.MemoryView":1194
+ * 
+ *     if order == 'F':
+ *         for idx in range(ndim):             # <<<<<<<<<<<<<<
+ *             strides[idx] = stride
+ *             stride *= shape[idx]
+ */
+    __pyx_t_2 = __pyx_v_ndim;
+    __pyx_t_3 = __pyx_t_2;  /* loop bound is snapshotted before the loop starts */
+    for (__pyx_t_4 = 0; __pyx_t_4 < __pyx_t_3; __pyx_t_4+=1) {
+      __pyx_v_idx = __pyx_t_4;
+
+      /* "View.MemoryView":1195
+ *     if order == 'F':
+ *         for idx in range(ndim):
+ *             strides[idx] = stride             # <<<<<<<<<<<<<<
+ *             stride *= shape[idx]
+ *     else:
+ */
+      (__pyx_v_strides[__pyx_v_idx]) = __pyx_v_stride;
+
+      /* "View.MemoryView":1196
+ *         for idx in range(ndim):
+ *             strides[idx] = stride
+ *             stride *= shape[idx]             # <<<<<<<<<<<<<<
+ *     else:
+ *         for idx in range(ndim - 1, -1, -1):
+ */
+      __pyx_v_stride = (__pyx_v_stride * (__pyx_v_shape[__pyx_v_idx]));  /* plain C multiply; no overflow check is performed here */
+    }
+
+    /* "View.MemoryView":1193
+ *     cdef int idx
+ * 
+ *     if order == 'F':             # <<<<<<<<<<<<<<
+ *         for idx in range(ndim):
+ *             strides[idx] = stride
+ */
+    goto __pyx_L3;  /* skip the else-branch below */
+  }
+
+  /* "View.MemoryView":1198
+ *             stride *= shape[idx]
+ *     else:
+ *         for idx in range(ndim - 1, -1, -1):             # <<<<<<<<<<<<<<
+ *             strides[idx] = stride
+ *             stride *= shape[idx]
+ */
+  /*else*/ {
+    for (__pyx_t_2 = (__pyx_v_ndim - 1); __pyx_t_2 > -1; __pyx_t_2-=1) {
+      __pyx_v_idx = __pyx_t_2;
+
+      /* "View.MemoryView":1199
+ *     else:
+ *         for idx in range(ndim - 1, -1, -1):
+ *             strides[idx] = stride             # <<<<<<<<<<<<<<
+ *             stride *= shape[idx]
+ * 
+ */
+      (__pyx_v_strides[__pyx_v_idx]) = __pyx_v_stride;
+
+      /* "View.MemoryView":1200
+ *         for idx in range(ndim - 1, -1, -1):
+ *             strides[idx] = stride
+ *             stride *= shape[idx]             # <<<<<<<<<<<<<<
+ * 
+ *     return stride
+ */
+      __pyx_v_stride = (__pyx_v_stride * (__pyx_v_shape[__pyx_v_idx]));  /* same update as the 'F' branch, applied last-to-first */
+    }
+  }
+  __pyx_L3:;
+
+  /* "View.MemoryView":1202
+ *             stride *= shape[idx]
+ * 
+ *     return stride             # <<<<<<<<<<<<<<
+ * 
+ * @cname('__pyx_memoryview_copy_data_to_temp')
+ */
+  __pyx_r = __pyx_v_stride;  /* the incoming stride multiplied by every shape[idx] visited */
+  goto __pyx_L0;
+
+  /* "View.MemoryView":1184
+ * 
+ * @cname('__pyx_fill_contig_strides_array')
+ * cdef Py_ssize_t fill_contig_strides_array(             # <<<<<<<<<<<<<<
+ *                 Py_ssize_t *shape, Py_ssize_t *strides, Py_ssize_t stride,
+ *                 int ndim, char order) noexcept nogil:
+ */
+
+  /* function exit code */
+  __pyx_L0:;
+  return __pyx_r;
+}
+
+/* "View.MemoryView":1205
+ * 
+ * @cname('__pyx_memoryview_copy_data_to_temp')
+ * cdef void *copy_data_to_temp(__Pyx_memviewslice *src,             # <<<<<<<<<<<<<<
+ *                              __Pyx_memviewslice *tmpslice,
+ *                              char order,
+ */
+
+static void *__pyx_memoryview_copy_data_to_temp(__Pyx_memviewslice *__pyx_v_src, __Pyx_memviewslice *__pyx_v_tmpslice, char __pyx_v_order, int __pyx_v_ndim) {
+  int __pyx_v_i;
+  void *__pyx_v_result;
+  size_t __pyx_v_itemsize;
+  size_t __pyx_v_size;
+  void *__pyx_r;
+  Py_ssize_t __pyx_t_1;
+  int __pyx_t_2;
+  int __pyx_t_3;
+  struct __pyx_memoryview_obj *__pyx_t_4;
+  int __pyx_t_5;
+  int __pyx_t_6;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  #ifdef WITH_THREAD
+  PyGILState_STATE __pyx_gilstate_save;
+  #endif
+  /* Copies src into a freshly malloc'ed buffer, fills *tmpslice to describe that buffer with contiguous strides in `order`, and returns the buffer; returns NULL with an exception set if malloc fails. */
+  /* "View.MemoryView":1216
+ *     cdef void *result
+ * 
+ *     cdef size_t itemsize = src.memview.view.itemsize             # <<<<<<<<<<<<<<
+ *     cdef size_t size = slice_get_size(src, ndim)
+ * 
+ */
+  __pyx_t_1 = __pyx_v_src->memview->view.itemsize;
+  __pyx_v_itemsize = __pyx_t_1;  /* Py_ssize_t value stored into a size_t */
+
+  /* "View.MemoryView":1217
+ * 
+ *     cdef size_t itemsize = src.memview.view.itemsize
+ *     cdef size_t size = slice_get_size(src, ndim)             # <<<<<<<<<<<<<<
+ * 
+ *     result = malloc(size)
+ */
+  __pyx_v_size = __pyx_memoryview_slice_get_size(__pyx_v_src, __pyx_v_ndim);  /* byte size: itemsize times each of the first ndim extents */
+
+  /* "View.MemoryView":1219
+ *     cdef size_t size = slice_get_size(src, ndim)
+ * 
+ *     result = malloc(size)             # <<<<<<<<<<<<<<
+ *     if not result:
+ *         _err_no_memory()
+ */
+  __pyx_v_result = malloc(__pyx_v_size);  /* never freed in this function; the pointer is handed back to the caller */
+
+  /* "View.MemoryView":1220
+ * 
+ *     result = malloc(size)
+ *     if not result:             # <<<<<<<<<<<<<<
+ *         _err_no_memory()
+ * 
+ */
+  __pyx_t_2 = (!(__pyx_v_result != 0));
+  if (__pyx_t_2) {
+
+    /* "View.MemoryView":1221
+ *     result = malloc(size)
+ *     if not result:
+ *         _err_no_memory()             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+    __pyx_t_3 = __pyx_memoryview_err_no_memory(); if (unlikely(__pyx_t_3 == ((int)-1))) __PYX_ERR(1, 1221, __pyx_L1_error)  /* helper sets MemoryError and returns -1, so control goes to the error exit */
+
+    /* "View.MemoryView":1220
+ * 
+ *     result = malloc(size)
+ *     if not result:             # <<<<<<<<<<<<<<
+ *         _err_no_memory()
+ * 
+ */
+  }
+
+  /* "View.MemoryView":1224
+ * 
+ * 
+ *     tmpslice.data = <char *> result             # <<<<<<<<<<<<<<
+ *     tmpslice.memview = src.memview
+ *     for i in range(ndim):
+ */
+  __pyx_v_tmpslice->data = ((char *)__pyx_v_result);
+
+  /* "View.MemoryView":1225
+ * 
+ *     tmpslice.data = <char *> result
+ *     tmpslice.memview = src.memview             # <<<<<<<<<<<<<<
+ *     for i in range(ndim):
+ *         tmpslice.shape[i] = src.shape[i]
+ */
+  __pyx_t_4 = __pyx_v_src->memview;
+  __pyx_v_tmpslice->memview = __pyx_t_4;  /* plain pointer copy; no reference-count change is made here */
+
+  /* "View.MemoryView":1226
+ *     tmpslice.data = <char *> result
+ *     tmpslice.memview = src.memview
+ *     for i in range(ndim):             # <<<<<<<<<<<<<<
+ *         tmpslice.shape[i] = src.shape[i]
+ *         tmpslice.suboffsets[i] = -1
+ */
+  __pyx_t_3 = __pyx_v_ndim;
+  __pyx_t_5 = __pyx_t_3;
+  for (__pyx_t_6 = 0; __pyx_t_6 < __pyx_t_5; __pyx_t_6+=1) {
+    __pyx_v_i = __pyx_t_6;
+
+    /* "View.MemoryView":1227
+ *     tmpslice.memview = src.memview
+ *     for i in range(ndim):
+ *         tmpslice.shape[i] = src.shape[i]             # <<<<<<<<<<<<<<
+ *         tmpslice.suboffsets[i] = -1
+ * 
+ */
+    (__pyx_v_tmpslice->shape[__pyx_v_i]) = (__pyx_v_src->shape[__pyx_v_i]);
+
+    /* "View.MemoryView":1228
+ *     for i in range(ndim):
+ *         tmpslice.shape[i] = src.shape[i]
+ *         tmpslice.suboffsets[i] = -1             # <<<<<<<<<<<<<<
+ * 
+ *     fill_contig_strides_array(&tmpslice.shape[0], &tmpslice.strides[0], itemsize, ndim, order)
+ */
+    (__pyx_v_tmpslice->suboffsets[__pyx_v_i]) = -1L;  /* negative suboffset; copy_contents treats values >= 0 as "not direct" */
+  }
+
+  /* "View.MemoryView":1230
+ *         tmpslice.suboffsets[i] = -1
+ * 
+ *     fill_contig_strides_array(&tmpslice.shape[0], &tmpslice.strides[0], itemsize, ndim, order)             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  (void)(__pyx_fill_contig_strides_array((&(__pyx_v_tmpslice->shape[0])), (&(__pyx_v_tmpslice->strides[0])), __pyx_v_itemsize, __pyx_v_ndim, __pyx_v_order));  /* strides start at itemsize; the returned total is deliberately discarded */
+
+  /* "View.MemoryView":1233
+ * 
+ * 
+ *     for i in range(ndim):             # <<<<<<<<<<<<<<
+ *         if tmpslice.shape[i] == 1:
+ *             tmpslice.strides[i] = 0
+ */
+  __pyx_t_3 = __pyx_v_ndim;
+  __pyx_t_5 = __pyx_t_3;
+  for (__pyx_t_6 = 0; __pyx_t_6 < __pyx_t_5; __pyx_t_6+=1) {
+    __pyx_v_i = __pyx_t_6;
+
+    /* "View.MemoryView":1234
+ * 
+ *     for i in range(ndim):
+ *         if tmpslice.shape[i] == 1:             # <<<<<<<<<<<<<<
+ *             tmpslice.strides[i] = 0
+ * 
+ */
+    __pyx_t_2 = ((__pyx_v_tmpslice->shape[__pyx_v_i]) == 1);
+    if (__pyx_t_2) {
+
+      /* "View.MemoryView":1235
+ *     for i in range(ndim):
+ *         if tmpslice.shape[i] == 1:
+ *             tmpslice.strides[i] = 0             # <<<<<<<<<<<<<<
+ * 
+ *     if slice_is_contig(src[0], order, ndim):
+ */
+      (__pyx_v_tmpslice->strides[__pyx_v_i]) = 0;  /* extent-1 dimensions get stride 0, overriding the value computed above */
+
+      /* "View.MemoryView":1234
+ * 
+ *     for i in range(ndim):
+ *         if tmpslice.shape[i] == 1:             # <<<<<<<<<<<<<<
+ *             tmpslice.strides[i] = 0
+ * 
+ */
+    }
+  }
+
+  /* "View.MemoryView":1237
+ *             tmpslice.strides[i] = 0
+ * 
+ *     if slice_is_contig(src[0], order, ndim):             # <<<<<<<<<<<<<<
+ *         memcpy(result, src.data, size)
+ *     else:
+ */
+  __pyx_t_2 = __pyx_memviewslice_is_contig((__pyx_v_src[0]), __pyx_v_order, __pyx_v_ndim);
+  if (__pyx_t_2) {
+
+    /* "View.MemoryView":1238
+ * 
+ *     if slice_is_contig(src[0], order, ndim):
+ *         memcpy(result, src.data, size)             # <<<<<<<<<<<<<<
+ *     else:
+ *         copy_strided_to_strided(src, tmpslice, ndim, itemsize)
+ */
+    (void)(memcpy(__pyx_v_result, __pyx_v_src->data, __pyx_v_size));  /* src is reported contiguous in `order`, so one bulk copy of `size` bytes is used */
+
+    /* "View.MemoryView":1237
+ *             tmpslice.strides[i] = 0
+ * 
+ *     if slice_is_contig(src[0], order, ndim):             # <<<<<<<<<<<<<<
+ *         memcpy(result, src.data, size)
+ *     else:
+ */
+    goto __pyx_L9;
+  }
+
+  /* "View.MemoryView":1240
+ *         memcpy(result, src.data, size)
+ *     else:
+ *         copy_strided_to_strided(src, tmpslice, ndim, itemsize)             # <<<<<<<<<<<<<<
+ * 
+ *     return result
+ */
+  /*else*/ {
+    copy_strided_to_strided(__pyx_v_src, __pyx_v_tmpslice, __pyx_v_ndim, __pyx_v_itemsize);  /* helper defined elsewhere; presumably copies element by element honoring both stride sets - confirm */
+  }
+  __pyx_L9:;
+
+  /* "View.MemoryView":1242
+ *         copy_strided_to_strided(src, tmpslice, ndim, itemsize)
+ * 
+ *     return result             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  __pyx_r = __pyx_v_result;
+  goto __pyx_L0;  /* success path jumps over the error exit */
+
+  /* "View.MemoryView":1205
+ * 
+ * @cname('__pyx_memoryview_copy_data_to_temp')
+ * cdef void *copy_data_to_temp(__Pyx_memviewslice *src,             # <<<<<<<<<<<<<<
+ *                              __Pyx_memviewslice *tmpslice,
+ *                              char order,
+ */
+
+  /* function exit code */
+  __pyx_L1_error:;  /* only reached from the malloc-failure branch above */
+  #ifdef WITH_THREAD
+  __pyx_gilstate_save = __Pyx_PyGILState_Ensure();  /* GIL is taken just for the traceback call */
+  #endif
+  __Pyx_AddTraceback("View.MemoryView.copy_data_to_temp", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;  /* NULL is the failure value */
+  #ifdef WITH_THREAD
+  __Pyx_PyGILState_Release(__pyx_gilstate_save);
+  #endif
+  __pyx_L0:;
+  return __pyx_r;
+}
+
+/* "View.MemoryView":1247
+ * 
+ * @cname('__pyx_memoryview_err_extents')
+ * cdef int _err_extents(int i, Py_ssize_t extent1,             # <<<<<<<<<<<<<<
+ *                              Py_ssize_t extent2) except -1 with gil:
+ *     raise ValueError, f"got differing extents in dimension {i} (got {extent1} and {extent2})"
+ */
+
+static int __pyx_memoryview_err_extents(int __pyx_v_i, Py_ssize_t __pyx_v_extent1, Py_ssize_t __pyx_v_extent2) {
+  int __pyx_r;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  Py_ssize_t __pyx_t_2;
+  Py_UCS4 __pyx_t_3;
+  PyObject *__pyx_t_4 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  #ifdef WITH_THREAD
+  PyGILState_STATE __pyx_gilstate_save = __Pyx_PyGILState_Ensure();  /* GIL is acquired on entry and released on the single exit path */
+  #endif
+  __Pyx_RefNannySetupContext("_err_extents", 0);
+  /* Builds the "got differing extents in dimension ..." message from i, extent1 and extent2, raises ValueError with it, and always returns -1. */
+  /* "View.MemoryView":1249
+ * cdef int _err_extents(int i, Py_ssize_t extent1,
+ *                              Py_ssize_t extent2) except -1 with gil:
+ *     raise ValueError, f"got differing extents in dimension {i} (got {extent1} and {extent2})"             # <<<<<<<<<<<<<<
+ * 
+ * @cname('__pyx_memoryview_err_dim')
+ */
+  __pyx_t_1 = PyTuple_New(7); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 1249, __pyx_L1_error)  /* 7 pieces: 4 literal fragments interleaved with 3 formatted integers */
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_t_2 = 0;  /* running total of the piece lengths, passed to the join below */
+  __pyx_t_3 = 127;  /* passed as the last argument of __Pyx_PyUnicode_Join; presumably the maximum code point (ASCII) - confirm */
+  __Pyx_INCREF(__pyx_kp_u_got_differing_extents_in_dimensi);
+  __pyx_t_2 += 35;  /* length of "got differing extents in dimension " */
+  __Pyx_GIVEREF(__pyx_kp_u_got_differing_extents_in_dimensi);
+  PyTuple_SET_ITEM(__pyx_t_1, 0, __pyx_kp_u_got_differing_extents_in_dimensi);
+  __pyx_t_4 = __Pyx_PyUnicode_From_int(__pyx_v_i, 0, ' ', 'd'); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 1249, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_4);
+  __pyx_t_2 += __Pyx_PyUnicode_GET_LENGTH(__pyx_t_4);
+  __Pyx_GIVEREF(__pyx_t_4);
+  PyTuple_SET_ITEM(__pyx_t_1, 1, __pyx_t_4);
+  __pyx_t_4 = 0;  /* cleared so the error path's XDECREF does not touch the item now stored in the tuple */
+  __Pyx_INCREF(__pyx_kp_u_got);
+  __pyx_t_2 += 6;  /* length of " (got " */
+  __Pyx_GIVEREF(__pyx_kp_u_got);
+  PyTuple_SET_ITEM(__pyx_t_1, 2, __pyx_kp_u_got);
+  __pyx_t_4 = __Pyx_PyUnicode_From_Py_ssize_t(__pyx_v_extent1, 0, ' ', 'd'); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 1249, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_4);
+  __pyx_t_2 += __Pyx_PyUnicode_GET_LENGTH(__pyx_t_4);
+  __Pyx_GIVEREF(__pyx_t_4);
+  PyTuple_SET_ITEM(__pyx_t_1, 3, __pyx_t_4);
+  __pyx_t_4 = 0;
+  __Pyx_INCREF(__pyx_kp_u_and);
+  __pyx_t_2 += 5;  /* length of " and " */
+  __Pyx_GIVEREF(__pyx_kp_u_and);
+  PyTuple_SET_ITEM(__pyx_t_1, 4, __pyx_kp_u_and);
+  __pyx_t_4 = __Pyx_PyUnicode_From_Py_ssize_t(__pyx_v_extent2, 0, ' ', 'd'); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 1249, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_4);
+  __pyx_t_2 += __Pyx_PyUnicode_GET_LENGTH(__pyx_t_4);
+  __Pyx_GIVEREF(__pyx_t_4);
+  PyTuple_SET_ITEM(__pyx_t_1, 5, __pyx_t_4);
+  __pyx_t_4 = 0;
+  __Pyx_INCREF(__pyx_kp_u__7);
+  __pyx_t_2 += 1;  /* length of the closing ")" */
+  __Pyx_GIVEREF(__pyx_kp_u__7);
+  PyTuple_SET_ITEM(__pyx_t_1, 6, __pyx_kp_u__7);
+  __pyx_t_4 = __Pyx_PyUnicode_Join(__pyx_t_1, 7, __pyx_t_2, __pyx_t_3); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 1249, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_4);
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;  /* the pieces tuple is no longer needed once joined */
+  __Pyx_Raise(__pyx_builtin_ValueError, __pyx_t_4, 0, 0);
+  __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+  __PYX_ERR(1, 1249, __pyx_L1_error)  /* unconditional: the function has no success exit */
+
+  /* "View.MemoryView":1247
+ * 
+ * @cname('__pyx_memoryview_err_extents')
+ * cdef int _err_extents(int i, Py_ssize_t extent1,             # <<<<<<<<<<<<<<
+ *                              Py_ssize_t extent2) except -1 with gil:
+ *     raise ValueError, f"got differing extents in dimension {i} (got {extent1} and {extent2})"
+ */
+
+  /* function exit code */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_1);  /* temporaries are NULL/0 unless an intermediate step failed */
+  __Pyx_XDECREF(__pyx_t_4);
+  __Pyx_AddTraceback("View.MemoryView._err_extents", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = -1;  /* the only value this function ever returns */
+  __Pyx_RefNannyFinishContext();
+  #ifdef WITH_THREAD
+  __Pyx_PyGILState_Release(__pyx_gilstate_save);
+  #endif
+  return __pyx_r;
+}
+
+/* "View.MemoryView":1252
+ * 
+ * @cname('__pyx_memoryview_err_dim')
+ * cdef int _err_dim(PyObject *error, str msg, int dim) except -1 with gil:             # <<<<<<<<<<<<<<
+ *     raise <object>error, msg % dim
+ * 
+ */
+
+static int __pyx_memoryview_err_dim(PyObject *__pyx_v_error, PyObject *__pyx_v_msg, int __pyx_v_dim) {
+  int __pyx_r;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  PyObject *__pyx_t_2 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  #ifdef WITH_THREAD
+  PyGILState_STATE __pyx_gilstate_save = __Pyx_PyGILState_Ensure();  /* GIL is acquired on entry and released on the single exit path */
+  #endif
+  __Pyx_RefNannySetupContext("_err_dim", 0);
+  __Pyx_INCREF(__pyx_v_msg);  /* balanced by the XDECREF of msg in the exit code */
+  /* Formats msg with dim (msg % dim), raises the result as an exception of type `error`, and always returns -1. */
+  /* "View.MemoryView":1253
+ * @cname('__pyx_memoryview_err_dim')
+ * cdef int _err_dim(PyObject *error, str msg, int dim) except -1 with gil:
+ *     raise <object>error, msg % dim             # <<<<<<<<<<<<<<
+ * 
+ * @cname('__pyx_memoryview_err')
+ */
+  __pyx_t_1 = __Pyx_PyInt_From_int(__pyx_v_dim); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 1253, __pyx_L1_error)  /* box dim as a Python int for the % formatting */
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_t_2 = __Pyx_PyString_FormatSafe(__pyx_v_msg, __pyx_t_1); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 1253, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_2);
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+  __Pyx_Raise(((PyObject *)__pyx_v_error), __pyx_t_2, 0, 0);
+  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+  __PYX_ERR(1, 1253, __pyx_L1_error)  /* unconditional: the function has no success exit */
+
+  /* "View.MemoryView":1252
+ * 
+ * @cname('__pyx_memoryview_err_dim')
+ * cdef int _err_dim(PyObject *error, str msg, int dim) except -1 with gil:             # <<<<<<<<<<<<<<
+ *     raise <object>error, msg % dim
+ * 
+ */
+
+  /* function exit code */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_1);  /* temporaries are NULL/0 unless an intermediate step failed */
+  __Pyx_XDECREF(__pyx_t_2);
+  __Pyx_AddTraceback("View.MemoryView._err_dim", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = -1;  /* the only value this function ever returns */
+  __Pyx_XDECREF(__pyx_v_msg);
+  __Pyx_RefNannyFinishContext();
+  #ifdef WITH_THREAD
+  __Pyx_PyGILState_Release(__pyx_gilstate_save);
+  #endif
+  return __pyx_r;
+}
+
+/* "View.MemoryView":1256
+ * 
+ * @cname('__pyx_memoryview_err')
+ * cdef int _err(PyObject *error, str msg) except -1 with gil:             # <<<<<<<<<<<<<<
+ *     raise <object>error, msg
+ * 
+ */
+
+static int __pyx_memoryview_err(PyObject *__pyx_v_error, PyObject *__pyx_v_msg) {
+  int __pyx_r;
+  __Pyx_RefNannyDeclarations
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  #ifdef WITH_THREAD
+  PyGILState_STATE __pyx_gilstate_save = __Pyx_PyGILState_Ensure();  /* GIL is acquired on entry and released on the single exit path */
+  #endif
+  __Pyx_RefNannySetupContext("_err", 0);
+  __Pyx_INCREF(__pyx_v_msg);  /* balanced by the XDECREF of msg in the exit code */
+  /* Raises an exception of type `error` with msg as its value and always returns -1. */
+  /* "View.MemoryView":1257
+ * @cname('__pyx_memoryview_err')
+ * cdef int _err(PyObject *error, str msg) except -1 with gil:
+ *     raise <object>error, msg             # <<<<<<<<<<<<<<
+ * 
+ * @cname('__pyx_memoryview_err_no_memory')
+ */
+  __Pyx_Raise(((PyObject *)__pyx_v_error), __pyx_v_msg, 0, 0);
+  __PYX_ERR(1, 1257, __pyx_L1_error)  /* unconditional: the function has no success exit */
+
+  /* "View.MemoryView":1256
+ * 
+ * @cname('__pyx_memoryview_err')
+ * cdef int _err(PyObject *error, str msg) except -1 with gil:             # <<<<<<<<<<<<<<
+ *     raise <object>error, msg
+ * 
+ */
+
+  /* function exit code */
+  __pyx_L1_error:;
+  __Pyx_AddTraceback("View.MemoryView._err", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = -1;  /* the only value this function ever returns */
+  __Pyx_XDECREF(__pyx_v_msg);
+  __Pyx_RefNannyFinishContext();
+  #ifdef WITH_THREAD
+  __Pyx_PyGILState_Release(__pyx_gilstate_save);
+  #endif
+  return __pyx_r;
+}
+
+/* "View.MemoryView":1260
+ * 
+ * @cname('__pyx_memoryview_err_no_memory')
+ * cdef int _err_no_memory() except -1 with gil:             # <<<<<<<<<<<<<<
+ *     raise MemoryError
+ * 
+ */
+
+static int __pyx_memoryview_err_no_memory(void) {
+  int __pyx_r;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  #ifdef WITH_THREAD
+  PyGILState_STATE __pyx_gilstate_save = __Pyx_PyGILState_Ensure();  /* GIL is acquired on entry and released on the single exit path */
+  #endif
+  /* Sets MemoryError via PyErr_NoMemory(), records a traceback entry, and always returns -1. */
+  /* "View.MemoryView":1261
+ * @cname('__pyx_memoryview_err_no_memory')
+ * cdef int _err_no_memory() except -1 with gil:
+ *     raise MemoryError             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  PyErr_NoMemory(); __PYX_ERR(1, 1261, __pyx_L1_error)  /* unconditional: the function has no success exit */
+
+  /* "View.MemoryView":1260
+ * 
+ * @cname('__pyx_memoryview_err_no_memory')
+ * cdef int _err_no_memory() except -1 with gil:             # <<<<<<<<<<<<<<
+ *     raise MemoryError
+ * 
+ */
+
+  /* function exit code */
+  __pyx_L1_error:;
+  __Pyx_AddTraceback("View.MemoryView._err_no_memory", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = -1;  /* the only value this function ever returns */
+  #ifdef WITH_THREAD
+  __Pyx_PyGILState_Release(__pyx_gilstate_save);
+  #endif
+  return __pyx_r;
+}
+
+/* "View.MemoryView":1265
+ * 
+ * @cname('__pyx_memoryview_copy_contents')
+ * cdef int memoryview_copy_contents(__Pyx_memviewslice src,             # <<<<<<<<<<<<<<
+ *                                   __Pyx_memviewslice dst,
+ *                                   int src_ndim, int dst_ndim,
+ */
+
+static int __pyx_memoryview_copy_contents(__Pyx_memviewslice __pyx_v_src, __Pyx_memviewslice __pyx_v_dst, int __pyx_v_src_ndim, int __pyx_v_dst_ndim, int __pyx_v_dtype_is_object) {
+  void *__pyx_v_tmpdata;
+  size_t __pyx_v_itemsize;
+  int __pyx_v_i;
+  char __pyx_v_order;
+  int __pyx_v_broadcasting;
+  int __pyx_v_direct_copy;
+  __Pyx_memviewslice __pyx_v_tmp;
+  int __pyx_v_ndim;
+  int __pyx_r;
+  Py_ssize_t __pyx_t_1;
+  int __pyx_t_2;
+  int __pyx_t_3;
+  int __pyx_t_4;
+  int __pyx_t_5;
+  int __pyx_t_6;
+  void *__pyx_t_7;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  #ifdef WITH_THREAD
+  PyGILState_STATE __pyx_gilstate_save;
+  #endif
+
+  /* "View.MemoryView":1273
+ *     Check for overlapping memory and verify the shapes.
+ *     """
+ *     cdef void *tmpdata = NULL             # <<<<<<<<<<<<<<
+ *     cdef size_t itemsize = src.memview.view.itemsize
+ *     cdef int i
+ */
+  __pyx_v_tmpdata = NULL;
+
+  /* "View.MemoryView":1274
+ *     """
+ *     cdef void *tmpdata = NULL
+ *     cdef size_t itemsize = src.memview.view.itemsize             # <<<<<<<<<<<<<<
+ *     cdef int i
+ *     cdef char order = get_best_order(&src, src_ndim)
+ */
+  __pyx_t_1 = __pyx_v_src.memview->view.itemsize;
+  __pyx_v_itemsize = __pyx_t_1;
+
+  /* "View.MemoryView":1276
+ *     cdef size_t itemsize = src.memview.view.itemsize
+ *     cdef int i
+ *     cdef char order = get_best_order(&src, src_ndim)             # <<<<<<<<<<<<<<
+ *     cdef bint broadcasting = False
+ *     cdef bint direct_copy = False
+ */
+  __pyx_v_order = __pyx_get_best_slice_order((&__pyx_v_src), __pyx_v_src_ndim);
+
+  /* "View.MemoryView":1277
+ *     cdef int i
+ *     cdef char order = get_best_order(&src, src_ndim)
+ *     cdef bint broadcasting = False             # <<<<<<<<<<<<<<
+ *     cdef bint direct_copy = False
+ *     cdef __Pyx_memviewslice tmp
+ */
+  __pyx_v_broadcasting = 0;
+
+  /* "View.MemoryView":1278
+ *     cdef char order = get_best_order(&src, src_ndim)
+ *     cdef bint broadcasting = False
+ *     cdef bint direct_copy = False             # <<<<<<<<<<<<<<
+ *     cdef __Pyx_memviewslice tmp
+ * 
+ */
+  __pyx_v_direct_copy = 0;
+
+  /* "View.MemoryView":1281
+ *     cdef __Pyx_memviewslice tmp
+ * 
+ *     if src_ndim < dst_ndim:             # <<<<<<<<<<<<<<
+ *         broadcast_leading(&src, src_ndim, dst_ndim)
+ *     elif dst_ndim < src_ndim:
+ */
+  __pyx_t_2 = (__pyx_v_src_ndim < __pyx_v_dst_ndim);
+  if (__pyx_t_2) {
+
+    /* "View.MemoryView":1282
+ * 
+ *     if src_ndim < dst_ndim:
+ *         broadcast_leading(&src, src_ndim, dst_ndim)             # <<<<<<<<<<<<<<
+ *     elif dst_ndim < src_ndim:
+ *         broadcast_leading(&dst, dst_ndim, src_ndim)
+ */
+    __pyx_memoryview_broadcast_leading((&__pyx_v_src), __pyx_v_src_ndim, __pyx_v_dst_ndim);
+
+    /* "View.MemoryView":1281
+ *     cdef __Pyx_memviewslice tmp
+ * 
+ *     if src_ndim < dst_ndim:             # <<<<<<<<<<<<<<
+ *         broadcast_leading(&src, src_ndim, dst_ndim)
+ *     elif dst_ndim < src_ndim:
+ */
+    goto __pyx_L3;
+  }
+
+  /* "View.MemoryView":1283
+ *     if src_ndim < dst_ndim:
+ *         broadcast_leading(&src, src_ndim, dst_ndim)
+ *     elif dst_ndim < src_ndim:             # <<<<<<<<<<<<<<
+ *         broadcast_leading(&dst, dst_ndim, src_ndim)
+ * 
+ */
+  __pyx_t_2 = (__pyx_v_dst_ndim < __pyx_v_src_ndim);
+  if (__pyx_t_2) {
+
+    /* "View.MemoryView":1284
+ *         broadcast_leading(&src, src_ndim, dst_ndim)
+ *     elif dst_ndim < src_ndim:
+ *         broadcast_leading(&dst, dst_ndim, src_ndim)             # <<<<<<<<<<<<<<
+ * 
+ *     cdef int ndim = max(src_ndim, dst_ndim)
+ */
+    __pyx_memoryview_broadcast_leading((&__pyx_v_dst), __pyx_v_dst_ndim, __pyx_v_src_ndim);
+
+    /* "View.MemoryView":1283
+ *     if src_ndim < dst_ndim:
+ *         broadcast_leading(&src, src_ndim, dst_ndim)
+ *     elif dst_ndim < src_ndim:             # <<<<<<<<<<<<<<
+ *         broadcast_leading(&dst, dst_ndim, src_ndim)
+ * 
+ */
+  }
+  __pyx_L3:;
+
+  /* "View.MemoryView":1286
+ *         broadcast_leading(&dst, dst_ndim, src_ndim)
+ * 
+ *     cdef int ndim = max(src_ndim, dst_ndim)             # <<<<<<<<<<<<<<
+ * 
+ *     for i in range(ndim):
+ */
+  __pyx_t_3 = __pyx_v_dst_ndim;
+  __pyx_t_4 = __pyx_v_src_ndim;
+  __pyx_t_2 = (__pyx_t_3 > __pyx_t_4);
+  if (__pyx_t_2) {
+    __pyx_t_5 = __pyx_t_3;
+  } else {
+    __pyx_t_5 = __pyx_t_4;
+  }
+  __pyx_v_ndim = __pyx_t_5;
+
+  /* "View.MemoryView":1288
+ *     cdef int ndim = max(src_ndim, dst_ndim)
+ * 
+ *     for i in range(ndim):             # <<<<<<<<<<<<<<
+ *         if src.shape[i] != dst.shape[i]:
+ *             if src.shape[i] == 1:
+ */
+  __pyx_t_5 = __pyx_v_ndim;
+  __pyx_t_3 = __pyx_t_5;
+  for (__pyx_t_4 = 0; __pyx_t_4 < __pyx_t_3; __pyx_t_4+=1) {
+    __pyx_v_i = __pyx_t_4;
+
+    /* "View.MemoryView":1289
+ * 
+ *     for i in range(ndim):
+ *         if src.shape[i] != dst.shape[i]:             # <<<<<<<<<<<<<<
+ *             if src.shape[i] == 1:
+ *                 broadcasting = True
+ */
+    __pyx_t_2 = ((__pyx_v_src.shape[__pyx_v_i]) != (__pyx_v_dst.shape[__pyx_v_i]));
+    if (__pyx_t_2) {
+
+      /* "View.MemoryView":1290
+ *     for i in range(ndim):
+ *         if src.shape[i] != dst.shape[i]:
+ *             if src.shape[i] == 1:             # <<<<<<<<<<<<<<
+ *                 broadcasting = True
+ *                 src.strides[i] = 0
+ */
+      __pyx_t_2 = ((__pyx_v_src.shape[__pyx_v_i]) == 1);
+      if (__pyx_t_2) {
+
+        /* "View.MemoryView":1291
+ *         if src.shape[i] != dst.shape[i]:
+ *             if src.shape[i] == 1:
+ *                 broadcasting = True             # <<<<<<<<<<<<<<
+ *                 src.strides[i] = 0
+ *             else:
+ */
+        __pyx_v_broadcasting = 1;
+
+        /* "View.MemoryView":1292
+ *             if src.shape[i] == 1:
+ *                 broadcasting = True
+ *                 src.strides[i] = 0             # <<<<<<<<<<<<<<
+ *             else:
+ *                 _err_extents(i, dst.shape[i], src.shape[i])
+ */
+        (__pyx_v_src.strides[__pyx_v_i]) = 0;
+
+        /* "View.MemoryView":1290
+ *     for i in range(ndim):
+ *         if src.shape[i] != dst.shape[i]:
+ *             if src.shape[i] == 1:             # <<<<<<<<<<<<<<
+ *                 broadcasting = True
+ *                 src.strides[i] = 0
+ */
+        goto __pyx_L7;
+      }
+
+      /* "View.MemoryView":1294
+ *                 src.strides[i] = 0
+ *             else:
+ *                 _err_extents(i, dst.shape[i], src.shape[i])             # <<<<<<<<<<<<<<
+ * 
+ *         if src.suboffsets[i] >= 0:
+ */
+      /*else*/ {
+        __pyx_t_6 = __pyx_memoryview_err_extents(__pyx_v_i, (__pyx_v_dst.shape[__pyx_v_i]), (__pyx_v_src.shape[__pyx_v_i])); if (unlikely(__pyx_t_6 == ((int)-1))) __PYX_ERR(1, 1294, __pyx_L1_error)
+      }
+      __pyx_L7:;
+
+      /* "View.MemoryView":1289
+ * 
+ *     for i in range(ndim):
+ *         if src.shape[i] != dst.shape[i]:             # <<<<<<<<<<<<<<
+ *             if src.shape[i] == 1:
+ *                 broadcasting = True
+ */
+    }
+
+    /* "View.MemoryView":1296
+ *                 _err_extents(i, dst.shape[i], src.shape[i])
+ * 
+ *         if src.suboffsets[i] >= 0:             # <<<<<<<<<<<<<<
+ *             _err_dim(PyExc_ValueError, "Dimension %d is not direct", i)
+ * 
+ */
+    __pyx_t_2 = ((__pyx_v_src.suboffsets[__pyx_v_i]) >= 0);
+    if (__pyx_t_2) {
+
+      /* "View.MemoryView":1297
+ * 
+ *         if src.suboffsets[i] >= 0:
+ *             _err_dim(PyExc_ValueError, "Dimension %d is not direct", i)             # <<<<<<<<<<<<<<
+ * 
+ *     if slices_overlap(&src, &dst, ndim, itemsize):
+ */
+      __pyx_t_6 = __pyx_memoryview_err_dim(PyExc_ValueError, __pyx_kp_s_Dimension_d_is_not_direct, __pyx_v_i); if (unlikely(__pyx_t_6 == ((int)-1))) __PYX_ERR(1, 1297, __pyx_L1_error)
+
+      /* "View.MemoryView":1296
+ *                 _err_extents(i, dst.shape[i], src.shape[i])
+ * 
+ *         if src.suboffsets[i] >= 0:             # <<<<<<<<<<<<<<
+ *             _err_dim(PyExc_ValueError, "Dimension %d is not direct", i)
+ * 
+ */
+    }
+  }
+
+  /* "View.MemoryView":1299
+ *             _err_dim(PyExc_ValueError, "Dimension %d is not direct", i)
+ * 
+ *     if slices_overlap(&src, &dst, ndim, itemsize):             # <<<<<<<<<<<<<<
+ * 
+ *         if not slice_is_contig(src, order, ndim):
+ */
+  __pyx_t_2 = __pyx_slices_overlap((&__pyx_v_src), (&__pyx_v_dst), __pyx_v_ndim, __pyx_v_itemsize);
+  if (__pyx_t_2) {
+
+    /* "View.MemoryView":1301
+ *     if slices_overlap(&src, &dst, ndim, itemsize):
+ * 
+ *         if not slice_is_contig(src, order, ndim):             # <<<<<<<<<<<<<<
+ *             order = get_best_order(&dst, ndim)
+ * 
+ */
+    __pyx_t_2 = (!__pyx_memviewslice_is_contig(__pyx_v_src, __pyx_v_order, __pyx_v_ndim));
+    if (__pyx_t_2) {
+
+      /* "View.MemoryView":1302
+ * 
+ *         if not slice_is_contig(src, order, ndim):
+ *             order = get_best_order(&dst, ndim)             # <<<<<<<<<<<<<<
+ * 
+ *         tmpdata = copy_data_to_temp(&src, &tmp, order, ndim)
+ */
+      __pyx_v_order = __pyx_get_best_slice_order((&__pyx_v_dst), __pyx_v_ndim);
+
+      /* "View.MemoryView":1301
+ *     if slices_overlap(&src, &dst, ndim, itemsize):
+ * 
+ *         if not slice_is_contig(src, order, ndim):             # <<<<<<<<<<<<<<
+ *             order = get_best_order(&dst, ndim)
+ * 
+ */
+    }
+
+    /* "View.MemoryView":1304
+ *             order = get_best_order(&dst, ndim)
+ * 
+ *         tmpdata = copy_data_to_temp(&src, &tmp, order, ndim)             # <<<<<<<<<<<<<<
+ *         src = tmp
+ * 
+ */
+    __pyx_t_7 = __pyx_memoryview_copy_data_to_temp((&__pyx_v_src), (&__pyx_v_tmp), __pyx_v_order, __pyx_v_ndim); if (unlikely(__pyx_t_7 == ((void *)NULL))) __PYX_ERR(1, 1304, __pyx_L1_error)
+    __pyx_v_tmpdata = __pyx_t_7;
+
+    /* "View.MemoryView":1305
+ * 
+ *         tmpdata = copy_data_to_temp(&src, &tmp, order, ndim)
+ *         src = tmp             # <<<<<<<<<<<<<<
+ * 
+ *     if not broadcasting:
+ */
+    __pyx_v_src = __pyx_v_tmp;
+
+    /* "View.MemoryView":1299
+ *             _err_dim(PyExc_ValueError, "Dimension %d is not direct", i)
+ * 
+ *     if slices_overlap(&src, &dst, ndim, itemsize):             # <<<<<<<<<<<<<<
+ * 
+ *         if not slice_is_contig(src, order, ndim):
+ */
+  }
+
+  /* "View.MemoryView":1307
+ *         src = tmp
+ * 
+ *     if not broadcasting:             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  __pyx_t_2 = (!__pyx_v_broadcasting);
+  if (__pyx_t_2) {
+
+    /* "View.MemoryView":1310
+ * 
+ * 
+ *         if slice_is_contig(src, 'C', ndim):             # <<<<<<<<<<<<<<
+ *             direct_copy = slice_is_contig(dst, 'C', ndim)
+ *         elif slice_is_contig(src, 'F', ndim):
+ */
+    __pyx_t_2 = __pyx_memviewslice_is_contig(__pyx_v_src, 'C', __pyx_v_ndim);
+    if (__pyx_t_2) {
+
+      /* "View.MemoryView":1311
+ * 
+ *         if slice_is_contig(src, 'C', ndim):
+ *             direct_copy = slice_is_contig(dst, 'C', ndim)             # <<<<<<<<<<<<<<
+ *         elif slice_is_contig(src, 'F', ndim):
+ *             direct_copy = slice_is_contig(dst, 'F', ndim)
+ */
+      __pyx_v_direct_copy = __pyx_memviewslice_is_contig(__pyx_v_dst, 'C', __pyx_v_ndim);
+
+      /* "View.MemoryView":1310
+ * 
+ * 
+ *         if slice_is_contig(src, 'C', ndim):             # <<<<<<<<<<<<<<
+ *             direct_copy = slice_is_contig(dst, 'C', ndim)
+ *         elif slice_is_contig(src, 'F', ndim):
+ */
+      goto __pyx_L12;
+    }
+
+    /* "View.MemoryView":1312
+ *         if slice_is_contig(src, 'C', ndim):
+ *             direct_copy = slice_is_contig(dst, 'C', ndim)
+ *         elif slice_is_contig(src, 'F', ndim):             # <<<<<<<<<<<<<<
+ *             direct_copy = slice_is_contig(dst, 'F', ndim)
+ * 
+ */
+    __pyx_t_2 = __pyx_memviewslice_is_contig(__pyx_v_src, 'F', __pyx_v_ndim);
+    if (__pyx_t_2) {
+
+      /* "View.MemoryView":1313
+ *             direct_copy = slice_is_contig(dst, 'C', ndim)
+ *         elif slice_is_contig(src, 'F', ndim):
+ *             direct_copy = slice_is_contig(dst, 'F', ndim)             # <<<<<<<<<<<<<<
+ * 
+ *         if direct_copy:
+ */
+      __pyx_v_direct_copy = __pyx_memviewslice_is_contig(__pyx_v_dst, 'F', __pyx_v_ndim);
+
+      /* "View.MemoryView":1312
+ *         if slice_is_contig(src, 'C', ndim):
+ *             direct_copy = slice_is_contig(dst, 'C', ndim)
+ *         elif slice_is_contig(src, 'F', ndim):             # <<<<<<<<<<<<<<
+ *             direct_copy = slice_is_contig(dst, 'F', ndim)
+ * 
+ */
+    }
+    __pyx_L12:;
+
+    /* "View.MemoryView":1315
+ *             direct_copy = slice_is_contig(dst, 'F', ndim)
+ * 
+ *         if direct_copy:             # <<<<<<<<<<<<<<
+ * 
+ *             refcount_copying(&dst, dtype_is_object, ndim, inc=False)
+ */
+    if (__pyx_v_direct_copy) {
+
+      /* "View.MemoryView":1317
+ *         if direct_copy:
+ * 
+ *             refcount_copying(&dst, dtype_is_object, ndim, inc=False)             # <<<<<<<<<<<<<<
+ *             memcpy(dst.data, src.data, slice_get_size(&src, ndim))
+ *             refcount_copying(&dst, dtype_is_object, ndim, inc=True)
+ */
+      __pyx_memoryview_refcount_copying((&__pyx_v_dst), __pyx_v_dtype_is_object, __pyx_v_ndim, 0);
+
+      /* "View.MemoryView":1318
+ * 
+ *             refcount_copying(&dst, dtype_is_object, ndim, inc=False)
+ *             memcpy(dst.data, src.data, slice_get_size(&src, ndim))             # <<<<<<<<<<<<<<
+ *             refcount_copying(&dst, dtype_is_object, ndim, inc=True)
+ *             free(tmpdata)
+ */
+      (void)(memcpy(__pyx_v_dst.data, __pyx_v_src.data, __pyx_memoryview_slice_get_size((&__pyx_v_src), __pyx_v_ndim)));
+
+      /* "View.MemoryView":1319
+ *             refcount_copying(&dst, dtype_is_object, ndim, inc=False)
+ *             memcpy(dst.data, src.data, slice_get_size(&src, ndim))
+ *             refcount_copying(&dst, dtype_is_object, ndim, inc=True)             # <<<<<<<<<<<<<<
+ *             free(tmpdata)
+ *             return 0
+ */
+      __pyx_memoryview_refcount_copying((&__pyx_v_dst), __pyx_v_dtype_is_object, __pyx_v_ndim, 1);
+
+      /* "View.MemoryView":1320
+ *             memcpy(dst.data, src.data, slice_get_size(&src, ndim))
+ *             refcount_copying(&dst, dtype_is_object, ndim, inc=True)
+ *             free(tmpdata)             # <<<<<<<<<<<<<<
+ *             return 0
+ * 
+ */
+      free(__pyx_v_tmpdata);
+
+      /* "View.MemoryView":1321
+ *             refcount_copying(&dst, dtype_is_object, ndim, inc=True)
+ *             free(tmpdata)
+ *             return 0             # <<<<<<<<<<<<<<
+ * 
+ *     if order == 'F' == get_best_order(&dst, ndim):
+ */
+      __pyx_r = 0;
+      goto __pyx_L0;
+
+      /* "View.MemoryView":1315
+ *             direct_copy = slice_is_contig(dst, 'F', ndim)
+ * 
+ *         if direct_copy:             # <<<<<<<<<<<<<<
+ * 
+ *             refcount_copying(&dst, dtype_is_object, ndim, inc=False)
+ */
+    }
+
+    /* "View.MemoryView":1307
+ *         src = tmp
+ * 
+ *     if not broadcasting:             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  }
+
+  /* "View.MemoryView":1323
+ *             return 0
+ * 
+ *     if order == 'F' == get_best_order(&dst, ndim):             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  __pyx_t_2 = (__pyx_v_order == 'F');
+  if (__pyx_t_2) {
+    __pyx_t_2 = ('F' == __pyx_get_best_slice_order((&__pyx_v_dst), __pyx_v_ndim));
+  }
+  if (__pyx_t_2) {
+
+    /* "View.MemoryView":1326
+ * 
+ * 
+ *         transpose_memslice(&src)             # <<<<<<<<<<<<<<
+ *         transpose_memslice(&dst)
+ * 
+ */
+    __pyx_t_5 = __pyx_memslice_transpose((&__pyx_v_src)); if (unlikely(__pyx_t_5 == ((int)-1))) __PYX_ERR(1, 1326, __pyx_L1_error)
+
+    /* "View.MemoryView":1327
+ * 
+ *         transpose_memslice(&src)
+ *         transpose_memslice(&dst)             # <<<<<<<<<<<<<<
+ * 
+ *     refcount_copying(&dst, dtype_is_object, ndim, inc=False)
+ */
+    __pyx_t_5 = __pyx_memslice_transpose((&__pyx_v_dst)); if (unlikely(__pyx_t_5 == ((int)-1))) __PYX_ERR(1, 1327, __pyx_L1_error)
+
+    /* "View.MemoryView":1323
+ *             return 0
+ * 
+ *     if order == 'F' == get_best_order(&dst, ndim):             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  }
+
+  /* "View.MemoryView":1329
+ *         transpose_memslice(&dst)
+ * 
+ *     refcount_copying(&dst, dtype_is_object, ndim, inc=False)             # <<<<<<<<<<<<<<
+ *     copy_strided_to_strided(&src, &dst, ndim, itemsize)
+ *     refcount_copying(&dst, dtype_is_object, ndim, inc=True)
+ */
+  __pyx_memoryview_refcount_copying((&__pyx_v_dst), __pyx_v_dtype_is_object, __pyx_v_ndim, 0);
+
+  /* "View.MemoryView":1330
+ * 
+ *     refcount_copying(&dst, dtype_is_object, ndim, inc=False)
+ *     copy_strided_to_strided(&src, &dst, ndim, itemsize)             # <<<<<<<<<<<<<<
+ *     refcount_copying(&dst, dtype_is_object, ndim, inc=True)
+ * 
+ */
+  copy_strided_to_strided((&__pyx_v_src), (&__pyx_v_dst), __pyx_v_ndim, __pyx_v_itemsize);
+
+  /* "View.MemoryView":1331
+ *     refcount_copying(&dst, dtype_is_object, ndim, inc=False)
+ *     copy_strided_to_strided(&src, &dst, ndim, itemsize)
+ *     refcount_copying(&dst, dtype_is_object, ndim, inc=True)             # <<<<<<<<<<<<<<
+ * 
+ *     free(tmpdata)
+ */
+  __pyx_memoryview_refcount_copying((&__pyx_v_dst), __pyx_v_dtype_is_object, __pyx_v_ndim, 1);
+
+  /* "View.MemoryView":1333
+ *     refcount_copying(&dst, dtype_is_object, ndim, inc=True)
+ * 
+ *     free(tmpdata)             # <<<<<<<<<<<<<<
+ *     return 0
+ * 
+ */
+  free(__pyx_v_tmpdata);
+
+  /* "View.MemoryView":1334
+ * 
+ *     free(tmpdata)
+ *     return 0             # <<<<<<<<<<<<<<
+ * 
+ * @cname('__pyx_memoryview_broadcast_leading')
+ */
+  __pyx_r = 0;
+  goto __pyx_L0;
+
+  /* "View.MemoryView":1265
+ * 
+ * @cname('__pyx_memoryview_copy_contents')
+ * cdef int memoryview_copy_contents(__Pyx_memviewslice src,             # <<<<<<<<<<<<<<
+ *                                   __Pyx_memviewslice dst,
+ *                                   int src_ndim, int dst_ndim,
+ */
+
+  /* function exit code */
+  __pyx_L1_error:;
+  #ifdef WITH_THREAD
+  __pyx_gilstate_save = __Pyx_PyGILState_Ensure();
+  #endif
+  __Pyx_AddTraceback("View.MemoryView.memoryview_copy_contents", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = -1;
+  #ifdef WITH_THREAD
+  __Pyx_PyGILState_Release(__pyx_gilstate_save);
+  #endif
+  __pyx_L0:;
+  return __pyx_r;
+}
+
+/* "View.MemoryView":1337
+ * 
+ * @cname('__pyx_memoryview_broadcast_leading')
+ * cdef void broadcast_leading(__Pyx_memviewslice *mslice,             # <<<<<<<<<<<<<<
+ *                             int ndim,
+ *                             int ndim_other) noexcept nogil:
+ *
+ * Review note: pads a slice descriptor with leading dimensions, in place.
+ * The first ndim entries of shape, strides and suboffsets are moved up by
+ * offset = ndim_other - ndim slots, and the offset leading slots are then
+ * filled with extent 1, the stride stored in strides[0] and suboffset -1.
+ *
+ * Parameters:
+ *   mslice      descriptor that is modified in place
+ *   ndim        number of entries currently in use
+ *   ndim_other  number of entries wanted after padding
+ *
+ * Nothing is range-checked here. Presumably callers guarantee that
+ * ndim_other >= ndim and that the three arrays can hold ndim_other
+ * entries (TODO confirm against the callers).
+ */
+
+static void __pyx_memoryview_broadcast_leading(__Pyx_memviewslice *__pyx_v_mslice, int __pyx_v_ndim, int __pyx_v_ndim_other) {
+  int __pyx_v_i;
+  int __pyx_v_offset;
+  int __pyx_t_1;
+  int __pyx_t_2;
+  int __pyx_t_3;
+
+  /* "View.MemoryView":1341
+ *                             int ndim_other) noexcept nogil:
+ *     cdef int i
+ *     cdef int offset = ndim_other - ndim             # <<<<<<<<<<<<<<
+ * 
+ *     for i in range(ndim - 1, -1, -1):
+ */
+  __pyx_v_offset = (__pyx_v_ndim_other - __pyx_v_ndim);
+
+  /* "View.MemoryView":1343
+ *     cdef int offset = ndim_other - ndim
+ * 
+ *     for i in range(ndim - 1, -1, -1):             # <<<<<<<<<<<<<<
+ *         mslice.shape[i + offset] = mslice.shape[i]
+ *         mslice.strides[i + offset] = mslice.strides[i]
+ *
+ * Review note: the loop walks from the highest index down. With a positive
+ * offset every write lands above the slot being read, so each entry is read
+ * before the shift can overwrite it.
+ */
+  for (__pyx_t_1 = (__pyx_v_ndim - 1); __pyx_t_1 > -1; __pyx_t_1-=1) {
+    __pyx_v_i = __pyx_t_1;
+
+    /* "View.MemoryView":1344
+ * 
+ *     for i in range(ndim - 1, -1, -1):
+ *         mslice.shape[i + offset] = mslice.shape[i]             # <<<<<<<<<<<<<<
+ *         mslice.strides[i + offset] = mslice.strides[i]
+ *         mslice.suboffsets[i + offset] = mslice.suboffsets[i]
+ */
+    (__pyx_v_mslice->shape[(__pyx_v_i + __pyx_v_offset)]) = (__pyx_v_mslice->shape[__pyx_v_i]);
+
+    /* "View.MemoryView":1345
+ *     for i in range(ndim - 1, -1, -1):
+ *         mslice.shape[i + offset] = mslice.shape[i]
+ *         mslice.strides[i + offset] = mslice.strides[i]             # <<<<<<<<<<<<<<
+ *         mslice.suboffsets[i + offset] = mslice.suboffsets[i]
+ * 
+ */
+    (__pyx_v_mslice->strides[(__pyx_v_i + __pyx_v_offset)]) = (__pyx_v_mslice->strides[__pyx_v_i]);
+
+    /* "View.MemoryView":1346
+ *         mslice.shape[i + offset] = mslice.shape[i]
+ *         mslice.strides[i + offset] = mslice.strides[i]
+ *         mslice.suboffsets[i + offset] = mslice.suboffsets[i]             # <<<<<<<<<<<<<<
+ * 
+ *     for i in range(offset):
+ */
+    (__pyx_v_mslice->suboffsets[(__pyx_v_i + __pyx_v_offset)]) = (__pyx_v_mslice->suboffsets[__pyx_v_i]);
+  }
+
+  /* "View.MemoryView":1348
+ *         mslice.suboffsets[i + offset] = mslice.suboffsets[i]
+ * 
+ *     for i in range(offset):             # <<<<<<<<<<<<<<
+ *         mslice.shape[i] = 1
+ *         mslice.strides[i] = mslice.strides[0]
+ *
+ * Review note: this loop does not run at all when offset is zero or negative.
+ */
+  __pyx_t_1 = __pyx_v_offset;
+  __pyx_t_2 = __pyx_t_1;
+  for (__pyx_t_3 = 0; __pyx_t_3 < __pyx_t_2; __pyx_t_3+=1) {
+    __pyx_v_i = __pyx_t_3;
+
+    /* "View.MemoryView":1349
+ * 
+ *     for i in range(offset):
+ *         mslice.shape[i] = 1             # <<<<<<<<<<<<<<
+ *         mslice.strides[i] = mslice.strides[0]
+ *         mslice.suboffsets[i] = -1
+ */
+    (__pyx_v_mslice->shape[__pyx_v_i]) = 1;
+
+    /* "View.MemoryView":1350
+ *     for i in range(offset):
+ *         mslice.shape[i] = 1
+ *         mslice.strides[i] = mslice.strides[0]             # <<<<<<<<<<<<<<
+ *         mslice.suboffsets[i] = -1
+ * 
+ *
+ * Review note: the shift loop above only writes to indices >= offset, so for
+ * a positive offset strides[0] still holds the value it had on entry. The
+ * first iteration (i == 0) is therefore a self-assignment.
+ */
+    (__pyx_v_mslice->strides[__pyx_v_i]) = (__pyx_v_mslice->strides[0]);
+
+    /* "View.MemoryView":1351
+ *         mslice.shape[i] = 1
+ *         mslice.strides[i] = mslice.strides[0]
+ *         mslice.suboffsets[i] = -1             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+    (__pyx_v_mslice->suboffsets[__pyx_v_i]) = -1L;
+  }
+
+  /* "View.MemoryView":1337
+ * 
+ * @cname('__pyx_memoryview_broadcast_leading')
+ * cdef void broadcast_leading(__Pyx_memviewslice *mslice,             # <<<<<<<<<<<<<<
+ *                             int ndim,
+ *                             int ndim_other) noexcept nogil:
+ */
+
+  /* function exit code */
+}
+
+/* "View.MemoryView":1359
+ * 
+ * @cname('__pyx_memoryview_refcount_copying')
+ * cdef void refcount_copying(__Pyx_memviewslice *dst, bint dtype_is_object, int ndim, bint inc) noexcept nogil:             # <<<<<<<<<<<<<<
+ * 
+ *     if dtype_is_object:
+ *
+ * Review note: guard in front of the reference-count walk over a slice.
+ * When dtype_is_object is zero this function does nothing. Otherwise it
+ * passes the data pointer, shape and strides of dst, together with ndim and
+ * inc, to __pyx_memoryview_refcount_objects_in_slice_with_gil.
+ *
+ * Parameters:
+ *   dst              slice whose elements are visited
+ *   dtype_is_object  nonzero enables the walk
+ *   ndim             number of dimensions to traverse
+ *   inc              forwarded unchanged; further down the call chain a
+ *                    nonzero value selects Py_INCREF and zero Py_DECREF
+ */
+
+static void __pyx_memoryview_refcount_copying(__Pyx_memviewslice *__pyx_v_dst, int __pyx_v_dtype_is_object, int __pyx_v_ndim, int __pyx_v_inc) {
+
+  /* "View.MemoryView":1361
+ * cdef void refcount_copying(__Pyx_memviewslice *dst, bint dtype_is_object, int ndim, bint inc) noexcept nogil:
+ * 
+ *     if dtype_is_object:             # <<<<<<<<<<<<<<
+ *         refcount_objects_in_slice_with_gil(dst.data, dst.shape, dst.strides, ndim, inc)
+ * 
+ */
+  if (__pyx_v_dtype_is_object) {
+
+    /* "View.MemoryView":1362
+ * 
+ *     if dtype_is_object:
+ *         refcount_objects_in_slice_with_gil(dst.data, dst.shape, dst.strides, ndim, inc)             # <<<<<<<<<<<<<<
+ * 
+ * @cname('__pyx_memoryview_refcount_objects_in_slice_with_gil')
+ */
+    __pyx_memoryview_refcount_objects_in_slice_with_gil(__pyx_v_dst->data, __pyx_v_dst->shape, __pyx_v_dst->strides, __pyx_v_ndim, __pyx_v_inc);
+
+    /* "View.MemoryView":1361
+ * cdef void refcount_copying(__Pyx_memviewslice *dst, bint dtype_is_object, int ndim, bint inc) noexcept nogil:
+ * 
+ *     if dtype_is_object:             # <<<<<<<<<<<<<<
+ *         refcount_objects_in_slice_with_gil(dst.data, dst.shape, dst.strides, ndim, inc)
+ * 
+ */
+  }
+
+  /* "View.MemoryView":1359
+ * 
+ * @cname('__pyx_memoryview_refcount_copying')
+ * cdef void refcount_copying(__Pyx_memviewslice *dst, bint dtype_is_object, int ndim, bint inc) noexcept nogil:             # <<<<<<<<<<<<<<
+ * 
+ *     if dtype_is_object:
+ */
+
+  /* function exit code */
+}
+
+/* "View.MemoryView":1365
+ * 
+ * @cname('__pyx_memoryview_refcount_objects_in_slice_with_gil')
+ * cdef void refcount_objects_in_slice_with_gil(char *data, Py_ssize_t *shape,             # <<<<<<<<<<<<<<
+ *                                              Py_ssize_t *strides, int ndim,
+ *                                              bint inc) noexcept with gil:
+ *
+ * Review note: wrapper that brackets the reference-count walk with GIL
+ * handling. When WITH_THREAD is defined it calls __Pyx_PyGILState_Ensure
+ * before and __Pyx_PyGILState_Release after the call to
+ * __pyx_memoryview_refcount_objects_in_slice. All five arguments are
+ * forwarded unchanged. Without WITH_THREAD only the inner call remains.
+ */
+
+static void __pyx_memoryview_refcount_objects_in_slice_with_gil(char *__pyx_v_data, Py_ssize_t *__pyx_v_shape, Py_ssize_t *__pyx_v_strides, int __pyx_v_ndim, int __pyx_v_inc) {
+  #ifdef WITH_THREAD
+  PyGILState_STATE __pyx_gilstate_save = __Pyx_PyGILState_Ensure();
+  #endif
+
+  /* "View.MemoryView":1368
+ *                                              Py_ssize_t *strides, int ndim,
+ *                                              bint inc) noexcept with gil:
+ *     refcount_objects_in_slice(data, shape, strides, ndim, inc)             # <<<<<<<<<<<<<<
+ * 
+ * @cname('__pyx_memoryview_refcount_objects_in_slice')
+ */
+  __pyx_memoryview_refcount_objects_in_slice(__pyx_v_data, __pyx_v_shape, __pyx_v_strides, __pyx_v_ndim, __pyx_v_inc);
+
+  /* "View.MemoryView":1365
+ * 
+ * @cname('__pyx_memoryview_refcount_objects_in_slice_with_gil')
+ * cdef void refcount_objects_in_slice_with_gil(char *data, Py_ssize_t *shape,             # <<<<<<<<<<<<<<
+ *                                              Py_ssize_t *strides, int ndim,
+ *                                              bint inc) noexcept with gil:
+ */
+
+  /* function exit code
+   * Review note: the saved GIL state is released on the only exit path. */
+  #ifdef WITH_THREAD
+  __Pyx_PyGILState_Release(__pyx_gilstate_save);
+  #endif
+}
+
+/* "View.MemoryView":1371
+ * 
+ * @cname('__pyx_memoryview_refcount_objects_in_slice')
+ * cdef void refcount_objects_in_slice(char *data, Py_ssize_t *shape,             # <<<<<<<<<<<<<<
+ *                                     Py_ssize_t *strides, int ndim, bint inc) noexcept:
+ *     cdef Py_ssize_t i
+ *
+ * Review note: recursive walk that adjusts the reference count of every
+ * object pointer stored in a strided block. For each of the shape[0]
+ * positions along the leading dimension it either
+ *   - (ndim == 1) treats the bytes at data as a PyObject pointer and applies
+ *     Py_INCREF when inc is nonzero, Py_DECREF otherwise, or
+ *   - (any other ndim) recurses with shape + 1, strides + 1 and ndim - 1,
+ * and then advances data by strides[0].
+ *
+ * Parameters:
+ *   data     address of the first element of the current sub-block
+ *   shape    extents, leading dimension first
+ *   strides  strides matching shape, leading dimension first
+ *   ndim     number of dimensions left to traverse
+ *   inc      nonzero to increment, zero to decrement
+ *
+ * The pointers are not checked against NULL, and the function itself takes
+ * no lock. Presumably every slot holds a valid object and the caller holds
+ * the GIL, as the with_gil wrapper arranges (TODO confirm for other callers).
+ */
+
+static void __pyx_memoryview_refcount_objects_in_slice(char *__pyx_v_data, Py_ssize_t *__pyx_v_shape, Py_ssize_t *__pyx_v_strides, int __pyx_v_ndim, int __pyx_v_inc) {
+  CYTHON_UNUSED Py_ssize_t __pyx_v_i;
+  Py_ssize_t __pyx_v_stride;
+  Py_ssize_t __pyx_t_1;
+  Py_ssize_t __pyx_t_2;
+  Py_ssize_t __pyx_t_3;
+  int __pyx_t_4;
+
+  /* "View.MemoryView":1374
+ *                                     Py_ssize_t *strides, int ndim, bint inc) noexcept:
+ *     cdef Py_ssize_t i
+ *     cdef Py_ssize_t stride = strides[0]             # <<<<<<<<<<<<<<
+ * 
+ *     for i in range(shape[0]):
+ */
+  __pyx_v_stride = (__pyx_v_strides[0]);
+
+  /* "View.MemoryView":1376
+ *     cdef Py_ssize_t stride = strides[0]
+ * 
+ *     for i in range(shape[0]):             # <<<<<<<<<<<<<<
+ *         if ndim == 1:
+ *             if inc:
+ *
+ * Review note: the extent is read once before the loop. The loop variable i
+ * is assigned but not otherwise used, since progress is tracked through data.
+ */
+  __pyx_t_1 = (__pyx_v_shape[0]);
+  __pyx_t_2 = __pyx_t_1;
+  for (__pyx_t_3 = 0; __pyx_t_3 < __pyx_t_2; __pyx_t_3+=1) {
+    __pyx_v_i = __pyx_t_3;
+
+    /* "View.MemoryView":1377
+ * 
+ *     for i in range(shape[0]):
+ *         if ndim == 1:             # <<<<<<<<<<<<<<
+ *             if inc:
+ *                 Py_INCREF((<PyObject **> data)[0])
+ */
+    __pyx_t_4 = (__pyx_v_ndim == 1);
+    if (__pyx_t_4) {
+
+      /* "View.MemoryView":1378
+ *     for i in range(shape[0]):
+ *         if ndim == 1:
+ *             if inc:             # <<<<<<<<<<<<<<
+ *                 Py_INCREF((<PyObject **> data)[0])
+ *             else:
+ */
+      if (__pyx_v_inc) {
+
+        /* "View.MemoryView":1379
+ *         if ndim == 1:
+ *             if inc:
+ *                 Py_INCREF((<PyObject **> data)[0])             # <<<<<<<<<<<<<<
+ *             else:
+ *                 Py_DECREF((<PyObject **> data)[0])
+ *
+ * Review note: the slot at data is reinterpreted as a PyObject pointer.
+ */
+        Py_INCREF((((PyObject **)__pyx_v_data)[0]));
+
+        /* "View.MemoryView":1378
+ *     for i in range(shape[0]):
+ *         if ndim == 1:
+ *             if inc:             # <<<<<<<<<<<<<<
+ *                 Py_INCREF((<PyObject **> data)[0])
+ *             else:
+ */
+        goto __pyx_L6;
+      }
+
+      /* "View.MemoryView":1381
+ *                 Py_INCREF((<PyObject **> data)[0])
+ *             else:
+ *                 Py_DECREF((<PyObject **> data)[0])             # <<<<<<<<<<<<<<
+ *         else:
+ *             refcount_objects_in_slice(data, shape + 1, strides + 1, ndim - 1, inc)
+ */
+      /*else*/ {
+        Py_DECREF((((PyObject **)__pyx_v_data)[0]));
+      }
+      __pyx_L6:;
+
+      /* "View.MemoryView":1377
+ * 
+ *     for i in range(shape[0]):
+ *         if ndim == 1:             # <<<<<<<<<<<<<<
+ *             if inc:
+ *                 Py_INCREF((<PyObject **> data)[0])
+ */
+      goto __pyx_L5;
+    }
+
+    /* "View.MemoryView":1383
+ *                 Py_DECREF((<PyObject **> data)[0])
+ *         else:
+ *             refcount_objects_in_slice(data, shape + 1, strides + 1, ndim - 1, inc)             # <<<<<<<<<<<<<<
+ * 
+ *         data += stride
+ *
+ * Review note: the recursion drops the leading dimension by advancing both
+ * the shape and strides pointers by one entry.
+ */
+    /*else*/ {
+      __pyx_memoryview_refcount_objects_in_slice(__pyx_v_data, (__pyx_v_shape + 1), (__pyx_v_strides + 1), (__pyx_v_ndim - 1), __pyx_v_inc);
+    }
+    __pyx_L5:;
+
+    /* "View.MemoryView":1385
+ *             refcount_objects_in_slice(data, shape + 1, strides + 1, ndim - 1, inc)
+ * 
+ *         data += stride             # <<<<<<<<<<<<<<
+ * 
+ * 
+ *
+ * Review note: data is a char pointer, so the stride is applied in bytes.
+ */
+    __pyx_v_data = (__pyx_v_data + __pyx_v_stride);
+  }
+
+  /* "View.MemoryView":1371
+ * 
+ * @cname('__pyx_memoryview_refcount_objects_in_slice')
+ * cdef void refcount_objects_in_slice(char *data, Py_ssize_t *shape,             # <<<<<<<<<<<<<<
+ *                                     Py_ssize_t *strides, int ndim, bint inc) noexcept:
+ *     cdef Py_ssize_t i
+ */
+
+  /* function exit code */
+}
+
+/* "View.MemoryView":1391
+ * 
+ * @cname('__pyx_memoryview_slice_assign_scalar')
+ * cdef void slice_assign_scalar(__Pyx_memviewslice *dst, int ndim,             # <<<<<<<<<<<<<<
+ *                               size_t itemsize, void *item,
+ *                               bint dtype_is_object) noexcept nogil:
+ *
+ * Review note: fills every element of dst with a copy of one item, in three
+ * steps that must stay in this order:
+ *   1. refcount_copying with inc = 0, which decrements the objects currently
+ *      stored in dst when dtype_is_object is nonzero,
+ *   2. _slice_assign_scalar, which overwrites each element with itemsize
+ *      bytes taken from item,
+ *   3. refcount_copying with inc = 1, which increments the objects now stored
+ *      in dst when dtype_is_object is nonzero.
+ * For non-object dtypes steps 1 and 3 do nothing.
+ *
+ * Parameters:
+ *   dst              destination slice
+ *   ndim             number of dimensions of dst to traverse
+ *   itemsize         number of bytes copied per element
+ *   item             address of the value to replicate
+ *   dtype_is_object  nonzero when elements are object pointers
+ */
+
+static void __pyx_memoryview_slice_assign_scalar(__Pyx_memviewslice *__pyx_v_dst, int __pyx_v_ndim, size_t __pyx_v_itemsize, void *__pyx_v_item, int __pyx_v_dtype_is_object) {
+
+  /* "View.MemoryView":1394
+ *                               size_t itemsize, void *item,
+ *                               bint dtype_is_object) noexcept nogil:
+ *     refcount_copying(dst, dtype_is_object, ndim, inc=False)             # <<<<<<<<<<<<<<
+ *     _slice_assign_scalar(dst.data, dst.shape, dst.strides, ndim, itemsize, item)
+ *     refcount_copying(dst, dtype_is_object, ndim, inc=True)
+ */
+  __pyx_memoryview_refcount_copying(__pyx_v_dst, __pyx_v_dtype_is_object, __pyx_v_ndim, 0);
+
+  /* "View.MemoryView":1395
+ *                               bint dtype_is_object) noexcept nogil:
+ *     refcount_copying(dst, dtype_is_object, ndim, inc=False)
+ *     _slice_assign_scalar(dst.data, dst.shape, dst.strides, ndim, itemsize, item)             # <<<<<<<<<<<<<<
+ *     refcount_copying(dst, dtype_is_object, ndim, inc=True)
+ * 
+ */
+  __pyx_memoryview__slice_assign_scalar(__pyx_v_dst->data, __pyx_v_dst->shape, __pyx_v_dst->strides, __pyx_v_ndim, __pyx_v_itemsize, __pyx_v_item);
+
+  /* "View.MemoryView":1396
+ *     refcount_copying(dst, dtype_is_object, ndim, inc=False)
+ *     _slice_assign_scalar(dst.data, dst.shape, dst.strides, ndim, itemsize, item)
+ *     refcount_copying(dst, dtype_is_object, ndim, inc=True)             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  __pyx_memoryview_refcount_copying(__pyx_v_dst, __pyx_v_dtype_is_object, __pyx_v_ndim, 1);
+
+  /* "View.MemoryView":1391
+ * 
+ * @cname('__pyx_memoryview_slice_assign_scalar')
+ * cdef void slice_assign_scalar(__Pyx_memviewslice *dst, int ndim,             # <<<<<<<<<<<<<<
+ *                               size_t itemsize, void *item,
+ *                               bint dtype_is_object) noexcept nogil:
+ */
+
+  /* function exit code */
+}
+
+/* "View.MemoryView":1400
+ * 
+ * @cname('__pyx_memoryview__slice_assign_scalar')
+ * cdef void _slice_assign_scalar(char *data, Py_ssize_t *shape,             # <<<<<<<<<<<<<<
+ *                               Py_ssize_t *strides, int ndim,
+ *                               size_t itemsize, void *item) noexcept nogil:
+ *
+ * Review note: recursive worker that writes one item into every element of a
+ * strided block. With ndim == 1 it performs shape[0] memcpy calls of itemsize
+ * bytes from item, stepping data by strides[0] after each. For any other
+ * ndim it recurses once per position of the leading dimension with
+ * shape + 1, strides + 1 and ndim - 1, again stepping data by strides[0].
+ *
+ * Parameters:
+ *   data      address of the first element of the current sub-block
+ *   shape     extents, leading dimension first
+ *   strides   strides matching shape, leading dimension first
+ *   ndim      number of dimensions left to traverse
+ *   itemsize  number of bytes copied per element
+ *   item      address of the value to replicate
+ *
+ * No reference counting happens here; the caller visible in this file,
+ * slice_assign_scalar, handles it around the call.
+ */
+
+static void __pyx_memoryview__slice_assign_scalar(char *__pyx_v_data, Py_ssize_t *__pyx_v_shape, Py_ssize_t *__pyx_v_strides, int __pyx_v_ndim, size_t __pyx_v_itemsize, void *__pyx_v_item) {
+  CYTHON_UNUSED Py_ssize_t __pyx_v_i;
+  Py_ssize_t __pyx_v_stride;
+  Py_ssize_t __pyx_v_extent;
+  int __pyx_t_1;
+  Py_ssize_t __pyx_t_2;
+  Py_ssize_t __pyx_t_3;
+  Py_ssize_t __pyx_t_4;
+
+  /* "View.MemoryView":1404
+ *                               size_t itemsize, void *item) noexcept nogil:
+ *     cdef Py_ssize_t i
+ *     cdef Py_ssize_t stride = strides[0]             # <<<<<<<<<<<<<<
+ *     cdef Py_ssize_t extent = shape[0]
+ * 
+ */
+  __pyx_v_stride = (__pyx_v_strides[0]);
+
+  /* "View.MemoryView":1405
+ *     cdef Py_ssize_t i
+ *     cdef Py_ssize_t stride = strides[0]
+ *     cdef Py_ssize_t extent = shape[0]             # <<<<<<<<<<<<<<
+ * 
+ *     if ndim == 1:
+ */
+  __pyx_v_extent = (__pyx_v_shape[0]);
+
+  /* "View.MemoryView":1407
+ *     cdef Py_ssize_t extent = shape[0]
+ * 
+ *     if ndim == 1:             # <<<<<<<<<<<<<<
+ *         for i in range(extent):
+ *             memcpy(data, item, itemsize)
+ */
+  __pyx_t_1 = (__pyx_v_ndim == 1);
+  if (__pyx_t_1) {
+
+    /* "View.MemoryView":1408
+ * 
+ *     if ndim == 1:
+ *         for i in range(extent):             # <<<<<<<<<<<<<<
+ *             memcpy(data, item, itemsize)
+ *             data += stride
+ */
+    __pyx_t_2 = __pyx_v_extent;
+    __pyx_t_3 = __pyx_t_2;
+    for (__pyx_t_4 = 0; __pyx_t_4 < __pyx_t_3; __pyx_t_4+=1) {
+      __pyx_v_i = __pyx_t_4;
+
+      /* "View.MemoryView":1409
+ *     if ndim == 1:
+ *         for i in range(extent):
+ *             memcpy(data, item, itemsize)             # <<<<<<<<<<<<<<
+ *             data += stride
+ *     else:
+ *
+ * Review note: base case, one element is overwritten per iteration.
+ */
+      (void)(memcpy(__pyx_v_data, __pyx_v_item, __pyx_v_itemsize));
+
+      /* "View.MemoryView":1410
+ *         for i in range(extent):
+ *             memcpy(data, item, itemsize)
+ *             data += stride             # <<<<<<<<<<<<<<
+ *     else:
+ *         for i in range(extent):
+ *
+ * Review note: data is a char pointer, so the stride is applied in bytes.
+ */
+      __pyx_v_data = (__pyx_v_data + __pyx_v_stride);
+    }
+
+    /* "View.MemoryView":1407
+ *     cdef Py_ssize_t extent = shape[0]
+ * 
+ *     if ndim == 1:             # <<<<<<<<<<<<<<
+ *         for i in range(extent):
+ *             memcpy(data, item, itemsize)
+ */
+    goto __pyx_L3;
+  }
+
+  /* "View.MemoryView":1412
+ *             data += stride
+ *     else:
+ *         for i in range(extent):             # <<<<<<<<<<<<<<
+ *             _slice_assign_scalar(data, shape + 1, strides + 1, ndim - 1, itemsize, item)
+ *             data += stride
+ */
+  /*else*/ {
+    __pyx_t_2 = __pyx_v_extent;
+    __pyx_t_3 = __pyx_t_2;
+    for (__pyx_t_4 = 0; __pyx_t_4 < __pyx_t_3; __pyx_t_4+=1) {
+      __pyx_v_i = __pyx_t_4;
+
+      /* "View.MemoryView":1413
+ *     else:
+ *         for i in range(extent):
+ *             _slice_assign_scalar(data, shape + 1, strides + 1, ndim - 1, itemsize, item)             # <<<<<<<<<<<<<<
+ *             data += stride
+ * 
+ *
+ * Review note: the recursion drops the leading dimension by advancing both
+ * the shape and strides pointers by one entry.
+ */
+      __pyx_memoryview__slice_assign_scalar(__pyx_v_data, (__pyx_v_shape + 1), (__pyx_v_strides + 1), (__pyx_v_ndim - 1), __pyx_v_itemsize, __pyx_v_item);
+
+      /* "View.MemoryView":1414
+ *         for i in range(extent):
+ *             _slice_assign_scalar(data, shape + 1, strides + 1, ndim - 1, itemsize, item)
+ *             data += stride             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+      __pyx_v_data = (__pyx_v_data + __pyx_v_stride);
+    }
+  }
+  __pyx_L3:;
+
+  /* "View.MemoryView":1400
+ * 
+ * @cname('__pyx_memoryview__slice_assign_scalar')
+ * cdef void _slice_assign_scalar(char *data, Py_ssize_t *shape,             # <<<<<<<<<<<<<<
+ *                               Py_ssize_t *strides, int ndim,
+ *                               size_t itemsize, void *item) noexcept nogil:
+ */
+
+  /* function exit code */
+}
+
+/* "(tree fragment)":1
+ * def __pyx_unpickle_Enum(__pyx_type, long __pyx_checksum, __pyx_state):             # <<<<<<<<<<<<<<
+ *     cdef object __pyx_PickleError
+ *     cdef object __pyx_result
+ *
+ * Review note: the code below is the Python-callable entry point for
+ * __pyx_unpickle_Enum. It only unpacks arguments and then delegates to
+ * __pyx_pf_15View_dot_MemoryView___pyx_unpickle_Enum.
+ *
+ * Arguments, accepted positionally or by keyword, in this order:
+ *   __pyx_type      passed through as an object
+ *   __pyx_checksum  converted to a C long with __Pyx_PyInt_As_long
+ *   __pyx_state     passed through as an object
+ *
+ * Returns the result of the implementation function, or NULL after an
+ * argument error. On the error path a traceback entry is added before
+ * returning NULL.
+ */
+
+/* Python wrapper
+ * Review note: a forward declaration comes first, then the PyMethodDef entry
+ * that refers to it, then the definition itself. The signature depends on
+ * CYTHON_METH_FASTCALL: an argument array plus count when it is set, an
+ * argument tuple otherwise. */
+static PyObject *__pyx_pw_15View_dot_MemoryView_1__pyx_unpickle_Enum(PyObject *__pyx_self, 
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+); /*proto*/
+static PyMethodDef __pyx_mdef_15View_dot_MemoryView_1__pyx_unpickle_Enum = {"__pyx_unpickle_Enum", (PyCFunction)(void*)(__Pyx_PyCFunction_FastCallWithKeywords)__pyx_pw_15View_dot_MemoryView_1__pyx_unpickle_Enum, __Pyx_METH_FASTCALL|METH_KEYWORDS, 0};
+static PyObject *__pyx_pw_15View_dot_MemoryView_1__pyx_unpickle_Enum(PyObject *__pyx_self, 
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+) {
+  PyObject *__pyx_v___pyx_type = 0;
+  long __pyx_v___pyx_checksum;
+  PyObject *__pyx_v___pyx_state = 0;
+  #if !CYTHON_METH_FASTCALL
+  CYTHON_UNUSED Py_ssize_t __pyx_nargs;
+  #endif
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject* values[3] = {0,0,0};
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__pyx_unpickle_Enum (wrapper)", 0);
+  #if !CYTHON_METH_FASTCALL
+  #if CYTHON_ASSUME_SAFE_MACROS
+  __pyx_nargs = PyTuple_GET_SIZE(__pyx_args);
+  #else
+  __pyx_nargs = PyTuple_Size(__pyx_args); if (unlikely(__pyx_nargs < 0)) return NULL;
+  #endif
+  #endif
+  __pyx_kwvalues = __Pyx_KwValues_FASTCALL(__pyx_args, __pyx_nargs);
+  {
+    PyObject **__pyx_pyargnames[] = {&__pyx_n_s_pyx_type,&__pyx_n_s_pyx_checksum,&__pyx_n_s_pyx_state,0};
+    if (__pyx_kwds) {
+      Py_ssize_t kw_args;
+      switch (__pyx_nargs) {
+        case  3: values[2] = __Pyx_Arg_FASTCALL(__pyx_args, 2);
+        CYTHON_FALLTHROUGH;
+        case  2: values[1] = __Pyx_Arg_FASTCALL(__pyx_args, 1);
+        CYTHON_FALLTHROUGH;
+        case  1: values[0] = __Pyx_Arg_FASTCALL(__pyx_args, 0);
+        CYTHON_FALLTHROUGH;
+        case  0: break;
+        default: goto __pyx_L5_argtuple_error;
+      }
+      kw_args = __Pyx_NumKwargs_FASTCALL(__pyx_kwds);
+      switch (__pyx_nargs) {
+        case  0:
+        if (likely((values[0] = __Pyx_GetKwValue_FASTCALL(__pyx_kwds, __pyx_kwvalues, __pyx_n_s_pyx_type)) != 0)) {
+          (void)__Pyx_Arg_NewRef_FASTCALL(values[0]);
+          kw_args--;
+        }
+        else if (unlikely(PyErr_Occurred())) __PYX_ERR(1, 1, __pyx_L3_error)
+        else goto __pyx_L5_argtuple_error;
+        CYTHON_FALLTHROUGH;
+        case  1:
+        if (likely((values[1] = __Pyx_GetKwValue_FASTCALL(__pyx_kwds, __pyx_kwvalues, __pyx_n_s_pyx_checksum)) != 0)) {
+          (void)__Pyx_Arg_NewRef_FASTCALL(values[1]);
+          kw_args--;
+        }
+        else if (unlikely(PyErr_Occurred())) __PYX_ERR(1, 1, __pyx_L3_error)
+        else {
+          __Pyx_RaiseArgtupleInvalid("__pyx_unpickle_Enum", 1, 3, 3, 1); __PYX_ERR(1, 1, __pyx_L3_error)
+        }
+        CYTHON_FALLTHROUGH;
+        case  2:
+        if (likely((values[2] = __Pyx_GetKwValue_FASTCALL(__pyx_kwds, __pyx_kwvalues, __pyx_n_s_pyx_state)) != 0)) {
+          (void)__Pyx_Arg_NewRef_FASTCALL(values[2]);
+          kw_args--;
+        }
+        else if (unlikely(PyErr_Occurred())) __PYX_ERR(1, 1, __pyx_L3_error)
+        else {
+          __Pyx_RaiseArgtupleInvalid("__pyx_unpickle_Enum", 1, 3, 3, 2); __PYX_ERR(1, 1, __pyx_L3_error)
+        }
+      }
+      if (unlikely(kw_args > 0)) {
+        const Py_ssize_t kwd_pos_args = __pyx_nargs;
+        if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_kwvalues, __pyx_pyargnames, 0, values + 0, kwd_pos_args, "__pyx_unpickle_Enum") < 0)) __PYX_ERR(1, 1, __pyx_L3_error)
+      }
+    } else if (unlikely(__pyx_nargs != 3)) {
+      goto __pyx_L5_argtuple_error;
+    } else {
+      values[0] = __Pyx_Arg_FASTCALL(__pyx_args, 0);
+      values[1] = __Pyx_Arg_FASTCALL(__pyx_args, 1);
+      values[2] = __Pyx_Arg_FASTCALL(__pyx_args, 2);
+    }
+    __pyx_v___pyx_type = values[0];
+    __pyx_v___pyx_checksum = __Pyx_PyInt_As_long(values[1]); if (unlikely((__pyx_v___pyx_checksum == (long)-1) && PyErr_Occurred())) __PYX_ERR(1, 1, __pyx_L3_error)
+    __pyx_v___pyx_state = values[2];
+  }
+  goto __pyx_L6_skip;
+  __pyx_L5_argtuple_error:;
+  __Pyx_RaiseArgtupleInvalid("__pyx_unpickle_Enum", 1, 3, 3, __pyx_nargs); __PYX_ERR(1, 1, __pyx_L3_error)
+  __pyx_L6_skip:;
+  goto __pyx_L4_argument_unpacking_done;
+  __pyx_L3_error:;
+  {
+    Py_ssize_t __pyx_temp;
+    for (__pyx_temp=0; __pyx_temp < (Py_ssize_t)(sizeof(values)/sizeof(values[0])); ++__pyx_temp) {
+      __Pyx_Arg_XDECREF_FASTCALL(values[__pyx_temp]);
+    }
+  }
+  __Pyx_AddTraceback("View.MemoryView.__pyx_unpickle_Enum", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __Pyx_RefNannyFinishContext();
+  return NULL;
+  __pyx_L4_argument_unpacking_done:;
+  __pyx_r = __pyx_pf_15View_dot_MemoryView___pyx_unpickle_Enum(__pyx_self, __pyx_v___pyx_type, __pyx_v___pyx_checksum, __pyx_v___pyx_state);
+
+  /* function exit code
+   * Review note: success path. Every entry of values is passed to the same
+   * XDECREF macro used on the error path above, and then the result of the
+   * implementation function is returned unchanged. */
+  {
+    Py_ssize_t __pyx_temp;
+    for (__pyx_temp=0; __pyx_temp < (Py_ssize_t)(sizeof(values)/sizeof(values[0])); ++__pyx_temp) {
+      __Pyx_Arg_XDECREF_FASTCALL(values[__pyx_temp]);
+    }
+  }
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static PyObject *__pyx_pf_15View_dot_MemoryView___pyx_unpickle_Enum(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v___pyx_type, long __pyx_v___pyx_checksum, PyObject *__pyx_v___pyx_state) {
+  PyObject *__pyx_v___pyx_PickleError = 0;
+  PyObject *__pyx_v___pyx_result = 0;
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  int __pyx_t_2;
+  PyObject *__pyx_t_3 = NULL;
+  PyObject *__pyx_t_4 = NULL;
+  unsigned int __pyx_t_5;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("__pyx_unpickle_Enum", 1);
+
+  /* "(tree fragment)":4
+ *     cdef object __pyx_PickleError
+ *     cdef object __pyx_result
+ *     if __pyx_checksum not in (0x82a3537, 0x6ae9995, 0xb068931):             # <<<<<<<<<<<<<<
+ *         from pickle import PickleError as __pyx_PickleError
+ *         raise __pyx_PickleError, "Incompatible checksums (0x%x vs (0x82a3537, 0x6ae9995, 0xb068931) = (name))" % __pyx_checksum
+ */
+  __pyx_t_1 = __Pyx_PyInt_From_long(__pyx_v___pyx_checksum); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 4, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_t_2 = (__Pyx_PySequence_ContainsTF(__pyx_t_1, __pyx_tuple__8, Py_NE)); if (unlikely((__pyx_t_2 < 0))) __PYX_ERR(1, 4, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+  if (__pyx_t_2) {
+
+    /* "(tree fragment)":5
+ *     cdef object __pyx_result
+ *     if __pyx_checksum not in (0x82a3537, 0x6ae9995, 0xb068931):
+ *         from pickle import PickleError as __pyx_PickleError             # <<<<<<<<<<<<<<
+ *         raise __pyx_PickleError, "Incompatible checksums (0x%x vs (0x82a3537, 0x6ae9995, 0xb068931) = (name))" % __pyx_checksum
+ *     __pyx_result = Enum.__new__(__pyx_type)
+ */
+    __pyx_t_1 = PyList_New(1); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 5, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_1);
+    __Pyx_INCREF(__pyx_n_s_PickleError);
+    __Pyx_GIVEREF(__pyx_n_s_PickleError);
+    if (__Pyx_PyList_SET_ITEM(__pyx_t_1, 0, __pyx_n_s_PickleError)) __PYX_ERR(1, 5, __pyx_L1_error);
+    __pyx_t_3 = __Pyx_Import(__pyx_n_s_pickle, __pyx_t_1, 0); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 5, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_3);
+    __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+    __pyx_t_1 = __Pyx_ImportFrom(__pyx_t_3, __pyx_n_s_PickleError); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 5, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_1);
+    __Pyx_INCREF(__pyx_t_1);
+    __pyx_v___pyx_PickleError = __pyx_t_1;
+    __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+    __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+
+    /* "(tree fragment)":6
+ *     if __pyx_checksum not in (0x82a3537, 0x6ae9995, 0xb068931):
+ *         from pickle import PickleError as __pyx_PickleError
+ *         raise __pyx_PickleError, "Incompatible checksums (0x%x vs (0x82a3537, 0x6ae9995, 0xb068931) = (name))" % __pyx_checksum             # <<<<<<<<<<<<<<
+ *     __pyx_result = Enum.__new__(__pyx_type)
+ *     if __pyx_state is not None:
+ */
+    __pyx_t_3 = __Pyx_PyInt_From_long(__pyx_v___pyx_checksum); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 6, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_3);
+    __pyx_t_1 = __Pyx_PyString_Format(__pyx_kp_s_Incompatible_checksums_0x_x_vs_0, __pyx_t_3); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 6, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_1);
+    __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+    __Pyx_Raise(__pyx_v___pyx_PickleError, __pyx_t_1, 0, 0);
+    __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+    __PYX_ERR(1, 6, __pyx_L1_error)
+
+    /* "(tree fragment)":4
+ *     cdef object __pyx_PickleError
+ *     cdef object __pyx_result
+ *     if __pyx_checksum not in (0x82a3537, 0x6ae9995, 0xb068931):             # <<<<<<<<<<<<<<
+ *         from pickle import PickleError as __pyx_PickleError
+ *         raise __pyx_PickleError, "Incompatible checksums (0x%x vs (0x82a3537, 0x6ae9995, 0xb068931) = (name))" % __pyx_checksum
+ */
+  }
+
+  /* "(tree fragment)":7
+ *         from pickle import PickleError as __pyx_PickleError
+ *         raise __pyx_PickleError, "Incompatible checksums (0x%x vs (0x82a3537, 0x6ae9995, 0xb068931) = (name))" % __pyx_checksum
+ *     __pyx_result = Enum.__new__(__pyx_type)             # <<<<<<<<<<<<<<
+ *     if __pyx_state is not None:
+ *         __pyx_unpickle_Enum__set_state(<Enum> __pyx_result, __pyx_state)
+ */
+  __pyx_t_3 = __Pyx_PyObject_GetAttrStr(((PyObject *)__pyx_MemviewEnum_type), __pyx_n_s_new); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 7, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_3);
+  __pyx_t_4 = NULL;
+  __pyx_t_5 = 0;
+  #if CYTHON_UNPACK_METHODS
+  if (likely(PyMethod_Check(__pyx_t_3))) {
+    __pyx_t_4 = PyMethod_GET_SELF(__pyx_t_3);
+    if (likely(__pyx_t_4)) {
+      PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_3);
+      __Pyx_INCREF(__pyx_t_4);
+      __Pyx_INCREF(function);
+      __Pyx_DECREF_SET(__pyx_t_3, function);
+      __pyx_t_5 = 1;
+    }
+  }
+  #endif
+  {
+    PyObject *__pyx_callargs[2] = {__pyx_t_4, __pyx_v___pyx_type};
+    __pyx_t_1 = __Pyx_PyObject_FastCall(__pyx_t_3, __pyx_callargs+1-__pyx_t_5, 1+__pyx_t_5);
+    __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0;
+    if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 7, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_1);
+    __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+  }
+  __pyx_v___pyx_result = __pyx_t_1;
+  __pyx_t_1 = 0;
+
+  /* "(tree fragment)":8
+ *         raise __pyx_PickleError, "Incompatible checksums (0x%x vs (0x82a3537, 0x6ae9995, 0xb068931) = (name))" % __pyx_checksum
+ *     __pyx_result = Enum.__new__(__pyx_type)
+ *     if __pyx_state is not None:             # <<<<<<<<<<<<<<
+ *         __pyx_unpickle_Enum__set_state(<Enum> __pyx_result, __pyx_state)
+ *     return __pyx_result
+ */
+  __pyx_t_2 = (__pyx_v___pyx_state != Py_None);
+  if (__pyx_t_2) {
+
+    /* "(tree fragment)":9
+ *     __pyx_result = Enum.__new__(__pyx_type)
+ *     if __pyx_state is not None:
+ *         __pyx_unpickle_Enum__set_state(<Enum> __pyx_result, __pyx_state)             # <<<<<<<<<<<<<<
+ *     return __pyx_result
+ * cdef __pyx_unpickle_Enum__set_state(Enum __pyx_result, tuple __pyx_state):
+ */
+    if (!(likely(PyTuple_CheckExact(__pyx_v___pyx_state))||((__pyx_v___pyx_state) == Py_None) || __Pyx_RaiseUnexpectedTypeError("tuple", __pyx_v___pyx_state))) __PYX_ERR(1, 9, __pyx_L1_error)
+    __pyx_t_1 = __pyx_unpickle_Enum__set_state(((struct __pyx_MemviewEnum_obj *)__pyx_v___pyx_result), ((PyObject*)__pyx_v___pyx_state)); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 9, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_1);
+    __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+
+    /* "(tree fragment)":8
+ *         raise __pyx_PickleError, "Incompatible checksums (0x%x vs (0x82a3537, 0x6ae9995, 0xb068931) = (name))" % __pyx_checksum
+ *     __pyx_result = Enum.__new__(__pyx_type)
+ *     if __pyx_state is not None:             # <<<<<<<<<<<<<<
+ *         __pyx_unpickle_Enum__set_state(<Enum> __pyx_result, __pyx_state)
+ *     return __pyx_result
+ */
+  }
+
+  /* "(tree fragment)":10
+ *     if __pyx_state is not None:
+ *         __pyx_unpickle_Enum__set_state(<Enum> __pyx_result, __pyx_state)
+ *     return __pyx_result             # <<<<<<<<<<<<<<
+ * cdef __pyx_unpickle_Enum__set_state(Enum __pyx_result, tuple __pyx_state):
+ *     __pyx_result.name = __pyx_state[0]
+ */
+  __Pyx_XDECREF(__pyx_r);
+  __Pyx_INCREF(__pyx_v___pyx_result);
+  __pyx_r = __pyx_v___pyx_result;
+  goto __pyx_L0;
+
+  /* "(tree fragment)":1
+ * def __pyx_unpickle_Enum(__pyx_type, long __pyx_checksum, __pyx_state):             # <<<<<<<<<<<<<<
+ *     cdef object __pyx_PickleError
+ *     cdef object __pyx_result
+ */
+
+  /* function exit code */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_XDECREF(__pyx_t_3);
+  __Pyx_XDECREF(__pyx_t_4);
+  __Pyx_AddTraceback("View.MemoryView.__pyx_unpickle_Enum", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __pyx_L0:;
+  __Pyx_XDECREF(__pyx_v___pyx_PickleError);
+  __Pyx_XDECREF(__pyx_v___pyx_result);
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "(tree fragment)":11
+ *         __pyx_unpickle_Enum__set_state(<Enum> __pyx_result, __pyx_state)
+ *     return __pyx_result
+ * cdef __pyx_unpickle_Enum__set_state(Enum __pyx_result, tuple __pyx_state):             # <<<<<<<<<<<<<<
+ *     __pyx_result.name = __pyx_state[0]
+ *     if len(__pyx_state) > 1 and hasattr(__pyx_result, '__dict__'):
+ */
+
+static PyObject *__pyx_unpickle_Enum__set_state(struct __pyx_MemviewEnum_obj *__pyx_v___pyx_result, PyObject *__pyx_v___pyx_state) {
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  int __pyx_t_2;
+  Py_ssize_t __pyx_t_3;
+  int __pyx_t_4;
+  PyObject *__pyx_t_5 = NULL;
+  PyObject *__pyx_t_6 = NULL;
+  PyObject *__pyx_t_7 = NULL;
+  unsigned int __pyx_t_8;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("__pyx_unpickle_Enum__set_state", 1);
+
+  /* "(tree fragment)":12
+ *     return __pyx_result
+ * cdef __pyx_unpickle_Enum__set_state(Enum __pyx_result, tuple __pyx_state):
+ *     __pyx_result.name = __pyx_state[0]             # <<<<<<<<<<<<<<
+ *     if len(__pyx_state) > 1 and hasattr(__pyx_result, '__dict__'):
+ *         __pyx_result.__dict__.update(__pyx_state[1])
+ */
+  if (unlikely(__pyx_v___pyx_state == Py_None)) {
+    PyErr_SetString(PyExc_TypeError, "'NoneType' object is not subscriptable");
+    __PYX_ERR(1, 12, __pyx_L1_error)
+  }
+  __pyx_t_1 = __Pyx_GetItemInt_Tuple(__pyx_v___pyx_state, 0, long, 1, __Pyx_PyInt_From_long, 0, 0, 1); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 12, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __Pyx_GIVEREF(__pyx_t_1);
+  __Pyx_GOTREF(__pyx_v___pyx_result->name);
+  __Pyx_DECREF(__pyx_v___pyx_result->name);
+  __pyx_v___pyx_result->name = __pyx_t_1;
+  __pyx_t_1 = 0;
+
+  /* "(tree fragment)":13
+ * cdef __pyx_unpickle_Enum__set_state(Enum __pyx_result, tuple __pyx_state):
+ *     __pyx_result.name = __pyx_state[0]
+ *     if len(__pyx_state) > 1 and hasattr(__pyx_result, '__dict__'):             # <<<<<<<<<<<<<<
+ *         __pyx_result.__dict__.update(__pyx_state[1])
+ */
+  if (unlikely(__pyx_v___pyx_state == Py_None)) {
+    PyErr_SetString(PyExc_TypeError, "object of type 'NoneType' has no len()");
+    __PYX_ERR(1, 13, __pyx_L1_error)
+  }
+  __pyx_t_3 = __Pyx_PyTuple_GET_SIZE(__pyx_v___pyx_state); if (unlikely(__pyx_t_3 == ((Py_ssize_t)-1))) __PYX_ERR(1, 13, __pyx_L1_error)
+  __pyx_t_4 = (__pyx_t_3 > 1);
+  if (__pyx_t_4) {
+  } else {
+    __pyx_t_2 = __pyx_t_4;
+    goto __pyx_L4_bool_binop_done;
+  }
+  __pyx_t_4 = __Pyx_HasAttr(((PyObject *)__pyx_v___pyx_result), __pyx_n_s_dict); if (unlikely(__pyx_t_4 == ((int)-1))) __PYX_ERR(1, 13, __pyx_L1_error)
+  __pyx_t_2 = __pyx_t_4;
+  __pyx_L4_bool_binop_done:;
+  if (__pyx_t_2) {
+
+    /* "(tree fragment)":14
+ *     __pyx_result.name = __pyx_state[0]
+ *     if len(__pyx_state) > 1 and hasattr(__pyx_result, '__dict__'):
+ *         __pyx_result.__dict__.update(__pyx_state[1])             # <<<<<<<<<<<<<<
+ */
+    __pyx_t_5 = __Pyx_PyObject_GetAttrStr(((PyObject *)__pyx_v___pyx_result), __pyx_n_s_dict); if (unlikely(!__pyx_t_5)) __PYX_ERR(1, 14, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_5);
+    __pyx_t_6 = __Pyx_PyObject_GetAttrStr(__pyx_t_5, __pyx_n_s_update); if (unlikely(!__pyx_t_6)) __PYX_ERR(1, 14, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_6);
+    __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
+    if (unlikely(__pyx_v___pyx_state == Py_None)) {
+      PyErr_SetString(PyExc_TypeError, "'NoneType' object is not subscriptable");
+      __PYX_ERR(1, 14, __pyx_L1_error)
+    }
+    __pyx_t_5 = __Pyx_GetItemInt_Tuple(__pyx_v___pyx_state, 1, long, 1, __Pyx_PyInt_From_long, 0, 0, 1); if (unlikely(!__pyx_t_5)) __PYX_ERR(1, 14, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_5);
+    __pyx_t_7 = NULL;
+    __pyx_t_8 = 0;
+    #if CYTHON_UNPACK_METHODS
+    if (likely(PyMethod_Check(__pyx_t_6))) {
+      __pyx_t_7 = PyMethod_GET_SELF(__pyx_t_6);
+      if (likely(__pyx_t_7)) {
+        PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_6);
+        __Pyx_INCREF(__pyx_t_7);
+        __Pyx_INCREF(function);
+        __Pyx_DECREF_SET(__pyx_t_6, function);
+        __pyx_t_8 = 1;
+      }
+    }
+    #endif
+    {
+      PyObject *__pyx_callargs[2] = {__pyx_t_7, __pyx_t_5};
+      __pyx_t_1 = __Pyx_PyObject_FastCall(__pyx_t_6, __pyx_callargs+1-__pyx_t_8, 1+__pyx_t_8);
+      __Pyx_XDECREF(__pyx_t_7); __pyx_t_7 = 0;
+      __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
+      if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 14, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_1);
+      __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0;
+    }
+    __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+
+    /* "(tree fragment)":13
+ * cdef __pyx_unpickle_Enum__set_state(Enum __pyx_result, tuple __pyx_state):
+ *     __pyx_result.name = __pyx_state[0]
+ *     if len(__pyx_state) > 1 and hasattr(__pyx_result, '__dict__'):             # <<<<<<<<<<<<<<
+ *         __pyx_result.__dict__.update(__pyx_state[1])
+ */
+  }
+
+  /* "(tree fragment)":11
+ *         __pyx_unpickle_Enum__set_state(<Enum> __pyx_result, __pyx_state)
+ *     return __pyx_result
+ * cdef __pyx_unpickle_Enum__set_state(Enum __pyx_result, tuple __pyx_state):             # <<<<<<<<<<<<<<
+ *     __pyx_result.name = __pyx_state[0]
+ *     if len(__pyx_state) > 1 and hasattr(__pyx_result, '__dict__'):
+ */
+
+  /* function exit code */
+  __pyx_r = Py_None; __Pyx_INCREF(Py_None);
+  goto __pyx_L0;
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_XDECREF(__pyx_t_5);
+  __Pyx_XDECREF(__pyx_t_6);
+  __Pyx_XDECREF(__pyx_t_7);
+  __Pyx_AddTraceback("View.MemoryView.__pyx_unpickle_Enum__set_state", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = 0;
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":287
+ * 
+ *         @property
+ *         cdef inline npy_intp itemsize(self) noexcept nogil:             # <<<<<<<<<<<<<<
+ *             return PyDataType_ELSIZE(self)
+ * 
+ */
+
+static CYTHON_INLINE npy_intp __pyx_f_5numpy_5dtype_8itemsize_itemsize(PyArray_Descr *__pyx_v_self) {
+  npy_intp __pyx_r;
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":288
+ *         @property
+ *         cdef inline npy_intp itemsize(self) noexcept nogil:
+ *             return PyDataType_ELSIZE(self)             # <<<<<<<<<<<<<<
+ * 
+ *         @property
+ */
+  __pyx_r = PyDataType_ELSIZE(__pyx_v_self);
+  goto __pyx_L0;
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":287
+ * 
+ *         @property
+ *         cdef inline npy_intp itemsize(self) noexcept nogil:             # <<<<<<<<<<<<<<
+ *             return PyDataType_ELSIZE(self)
+ * 
+ */
+
+  /* function exit code */
+  __pyx_L0:;
+  return __pyx_r;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":291
+ * 
+ *         @property
+ *         cdef inline npy_intp alignment(self) noexcept nogil:             # <<<<<<<<<<<<<<
+ *             return PyDataType_ALIGNMENT(self)
+ * 
+ */
+
+static CYTHON_INLINE npy_intp __pyx_f_5numpy_5dtype_9alignment_alignment(PyArray_Descr *__pyx_v_self) {
+  npy_intp __pyx_r;
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":292
+ *         @property
+ *         cdef inline npy_intp alignment(self) noexcept nogil:
+ *             return PyDataType_ALIGNMENT(self)             # <<<<<<<<<<<<<<
+ * 
+ *         # Use fields/names with care as they may be NULL.  You must check
+ */
+  __pyx_r = PyDataType_ALIGNMENT(__pyx_v_self);
+  goto __pyx_L0;
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":291
+ * 
+ *         @property
+ *         cdef inline npy_intp alignment(self) noexcept nogil:             # <<<<<<<<<<<<<<
+ *             return PyDataType_ALIGNMENT(self)
+ * 
+ */
+
+  /* function exit code */
+  __pyx_L0:;
+  return __pyx_r;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":297
+ *         # for this using PyDataType_HASFIELDS.
+ *         @property
+ *         cdef inline object fields(self):             # <<<<<<<<<<<<<<
+ *             return <object>PyDataType_FIELDS(self)
+ * 
+ */
+
+static CYTHON_INLINE PyObject *__pyx_f_5numpy_5dtype_6fields_fields(PyArray_Descr *__pyx_v_self) {
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1;
+  __Pyx_RefNannySetupContext("fields", 1);
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":298
+ *         @property
+ *         cdef inline object fields(self):
+ *             return <object>PyDataType_FIELDS(self)             # <<<<<<<<<<<<<<
+ * 
+ *         @property
+ */
+  __Pyx_XDECREF(__pyx_r);
+  __pyx_t_1 = PyDataType_FIELDS(__pyx_v_self);
+  __Pyx_INCREF(((PyObject *)__pyx_t_1));
+  __pyx_r = ((PyObject *)__pyx_t_1);
+  goto __pyx_L0;
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":297
+ *         # for this using PyDataType_HASFIELDS.
+ *         @property
+ *         cdef inline object fields(self):             # <<<<<<<<<<<<<<
+ *             return <object>PyDataType_FIELDS(self)
+ * 
+ */
+
+  /* function exit code */
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":301
+ * 
+ *         @property
+ *         cdef inline tuple names(self):             # <<<<<<<<<<<<<<
+ *             return <tuple>PyDataType_NAMES(self)
+ * 
+ */
+
+static CYTHON_INLINE PyObject *__pyx_f_5numpy_5dtype_5names_names(PyArray_Descr *__pyx_v_self) {
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1;
+  __Pyx_RefNannySetupContext("names", 1);
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":302
+ *         @property
+ *         cdef inline tuple names(self):
+ *             return <tuple>PyDataType_NAMES(self)             # <<<<<<<<<<<<<<
+ * 
+ *         # Use PyDataType_HASSUBARRAY to test whether this field is
+ */
+  __Pyx_XDECREF(__pyx_r);
+  __pyx_t_1 = PyDataType_NAMES(__pyx_v_self);
+  __Pyx_INCREF(((PyObject*)__pyx_t_1));
+  __pyx_r = ((PyObject*)__pyx_t_1);
+  goto __pyx_L0;
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":301
+ * 
+ *         @property
+ *         cdef inline tuple names(self):             # <<<<<<<<<<<<<<
+ *             return <tuple>PyDataType_NAMES(self)
+ * 
+ */
+
+  /* function exit code */
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":308
+ *         # this field via the inline helper method PyDataType_SHAPE.
+ *         @property
+ *         cdef inline PyArray_ArrayDescr* subarray(self) noexcept nogil:             # <<<<<<<<<<<<<<
+ *             return PyDataType_SUBARRAY(self)
+ * 
+ */
+
+static CYTHON_INLINE PyArray_ArrayDescr *__pyx_f_5numpy_5dtype_8subarray_subarray(PyArray_Descr *__pyx_v_self) {
+  PyArray_ArrayDescr *__pyx_r;
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":309
+ *         @property
+ *         cdef inline PyArray_ArrayDescr* subarray(self) noexcept nogil:
+ *             return PyDataType_SUBARRAY(self)             # <<<<<<<<<<<<<<
+ * 
+ *         @property
+ */
+  __pyx_r = PyDataType_SUBARRAY(__pyx_v_self);
+  goto __pyx_L0;
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":308
+ *         # this field via the inline helper method PyDataType_SHAPE.
+ *         @property
+ *         cdef inline PyArray_ArrayDescr* subarray(self) noexcept nogil:             # <<<<<<<<<<<<<<
+ *             return PyDataType_SUBARRAY(self)
+ * 
+ */
+
+  /* function exit code */
+  __pyx_L0:;
+  return __pyx_r;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":312
+ * 
+ *         @property
+ *         cdef inline npy_uint64 flags(self) noexcept nogil:             # <<<<<<<<<<<<<<
+ *             """The data types flags."""
+ *             return PyDataType_FLAGS(self)
+ */
+
+static CYTHON_INLINE npy_uint64 __pyx_f_5numpy_5dtype_5flags_flags(PyArray_Descr *__pyx_v_self) {
+  npy_uint64 __pyx_r;
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":314
+ *         cdef inline npy_uint64 flags(self) noexcept nogil:
+ *             """The data types flags."""
+ *             return PyDataType_FLAGS(self)             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  __pyx_r = PyDataType_FLAGS(__pyx_v_self);
+  goto __pyx_L0;
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":312
+ * 
+ *         @property
+ *         cdef inline npy_uint64 flags(self) noexcept nogil:             # <<<<<<<<<<<<<<
+ *             """The data types flags."""
+ *             return PyDataType_FLAGS(self)
+ */
+
+  /* function exit code */
+  __pyx_L0:;
+  return __pyx_r;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":324
+ * 
+ *         @property
+ *         cdef inline int numiter(self) noexcept nogil:             # <<<<<<<<<<<<<<
+ *             """The number of arrays that need to be broadcast to the same shape."""
+ *             return PyArray_MultiIter_NUMITER(self)
+ */
+
+static CYTHON_INLINE int __pyx_f_5numpy_9broadcast_7numiter_numiter(PyArrayMultiIterObject *__pyx_v_self) {
+  int __pyx_r;
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":326
+ *         cdef inline int numiter(self) noexcept nogil:
+ *             """The number of arrays that need to be broadcast to the same shape."""
+ *             return PyArray_MultiIter_NUMITER(self)             # <<<<<<<<<<<<<<
+ * 
+ *         @property
+ */
+  __pyx_r = PyArray_MultiIter_NUMITER(__pyx_v_self);
+  goto __pyx_L0;
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":324
+ * 
+ *         @property
+ *         cdef inline int numiter(self) noexcept nogil:             # <<<<<<<<<<<<<<
+ *             """The number of arrays that need to be broadcast to the same shape."""
+ *             return PyArray_MultiIter_NUMITER(self)
+ */
+
+  /* function exit code */
+  __pyx_L0:;
+  return __pyx_r;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":329
+ * 
+ *         @property
+ *         cdef inline npy_intp size(self) noexcept nogil:             # <<<<<<<<<<<<<<
+ *             """The total broadcasted size."""
+ *             return PyArray_MultiIter_SIZE(self)
+ */
+
+static CYTHON_INLINE npy_intp __pyx_f_5numpy_9broadcast_4size_size(PyArrayMultiIterObject *__pyx_v_self) {
+  npy_intp __pyx_r;
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":331
+ *         cdef inline npy_intp size(self) noexcept nogil:
+ *             """The total broadcasted size."""
+ *             return PyArray_MultiIter_SIZE(self)             # <<<<<<<<<<<<<<
+ * 
+ *         @property
+ */
+  __pyx_r = PyArray_MultiIter_SIZE(__pyx_v_self);
+  goto __pyx_L0;
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":329
+ * 
+ *         @property
+ *         cdef inline npy_intp size(self) noexcept nogil:             # <<<<<<<<<<<<<<
+ *             """The total broadcasted size."""
+ *             return PyArray_MultiIter_SIZE(self)
+ */
+
+  /* function exit code */
+  __pyx_L0:;
+  return __pyx_r;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":334
+ * 
+ *         @property
+ *         cdef inline npy_intp index(self) noexcept nogil:             # <<<<<<<<<<<<<<
+ *             """The current (1-d) index into the broadcasted result."""
+ *             return PyArray_MultiIter_INDEX(self)
+ */
+
+static CYTHON_INLINE npy_intp __pyx_f_5numpy_9broadcast_5index_index(PyArrayMultiIterObject *__pyx_v_self) {
+  npy_intp __pyx_r;
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":336
+ *         cdef inline npy_intp index(self) noexcept nogil:
+ *             """The current (1-d) index into the broadcasted result."""
+ *             return PyArray_MultiIter_INDEX(self)             # <<<<<<<<<<<<<<
+ * 
+ *         @property
+ */
+  __pyx_r = PyArray_MultiIter_INDEX(__pyx_v_self);
+  goto __pyx_L0;
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":334
+ * 
+ *         @property
+ *         cdef inline npy_intp index(self) noexcept nogil:             # <<<<<<<<<<<<<<
+ *             """The current (1-d) index into the broadcasted result."""
+ *             return PyArray_MultiIter_INDEX(self)
+ */
+
+  /* function exit code */
+  __pyx_L0:;
+  return __pyx_r;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":339
+ * 
+ *         @property
+ *         cdef inline int nd(self) noexcept nogil:             # <<<<<<<<<<<<<<
+ *             """The number of dimensions in the broadcasted result."""
+ *             return PyArray_MultiIter_NDIM(self)
+ */
+
+static CYTHON_INLINE int __pyx_f_5numpy_9broadcast_2nd_nd(PyArrayMultiIterObject *__pyx_v_self) {
+  int __pyx_r;
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":341
+ *         cdef inline int nd(self) noexcept nogil:
+ *             """The number of dimensions in the broadcasted result."""
+ *             return PyArray_MultiIter_NDIM(self)             # <<<<<<<<<<<<<<
+ * 
+ *         @property
+ */
+  __pyx_r = PyArray_MultiIter_NDIM(__pyx_v_self);
+  goto __pyx_L0;
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":339
+ * 
+ *         @property
+ *         cdef inline int nd(self) noexcept nogil:             # <<<<<<<<<<<<<<
+ *             """The number of dimensions in the broadcasted result."""
+ *             return PyArray_MultiIter_NDIM(self)
+ */
+
+  /* function exit code */
+  __pyx_L0:;
+  return __pyx_r;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":344
+ * 
+ *         @property
+ *         cdef inline npy_intp* dimensions(self) noexcept nogil:             # <<<<<<<<<<<<<<
+ *             """The shape of the broadcasted result."""
+ *             return PyArray_MultiIter_DIMS(self)
+ */
+
+static CYTHON_INLINE npy_intp *__pyx_f_5numpy_9broadcast_10dimensions_dimensions(PyArrayMultiIterObject *__pyx_v_self) {
+  npy_intp *__pyx_r;
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":346
+ *         cdef inline npy_intp* dimensions(self) noexcept nogil:
+ *             """The shape of the broadcasted result."""
+ *             return PyArray_MultiIter_DIMS(self)             # <<<<<<<<<<<<<<
+ * 
+ *         @property
+ */
+  __pyx_r = PyArray_MultiIter_DIMS(__pyx_v_self);
+  goto __pyx_L0;
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":344
+ * 
+ *         @property
+ *         cdef inline npy_intp* dimensions(self) noexcept nogil:             # <<<<<<<<<<<<<<
+ *             """The shape of the broadcasted result."""
+ *             return PyArray_MultiIter_DIMS(self)
+ */
+
+  /* function exit code */
+  __pyx_L0:;
+  return __pyx_r;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":349
+ * 
+ *         @property
+ *         cdef inline void** iters(self) noexcept nogil:             # <<<<<<<<<<<<<<
+ *             """An array of iterator objects that holds the iterators for the arrays to be broadcast together.
+ *             On return, the iterators are adjusted for broadcasting."""
+ */
+
+static CYTHON_INLINE void **__pyx_f_5numpy_9broadcast_5iters_iters(PyArrayMultiIterObject *__pyx_v_self) {
+  void **__pyx_r;
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":352
+ *             """An array of iterator objects that holds the iterators for the arrays to be broadcast together.
+ *             On return, the iterators are adjusted for broadcasting."""
+ *             return PyArray_MultiIter_ITERS(self)             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  __pyx_r = PyArray_MultiIter_ITERS(__pyx_v_self);
+  goto __pyx_L0;
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":349
+ * 
+ *         @property
+ *         cdef inline void** iters(self) noexcept nogil:             # <<<<<<<<<<<<<<
+ *             """An array of iterator objects that holds the iterators for the arrays to be broadcast together.
+ *             On return, the iterators are adjusted for broadcasting."""
+ */
+
+  /* function exit code */
+  __pyx_L0:;
+  return __pyx_r;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":367
+ * 
+ *         @property
+ *         cdef inline PyObject* base(self) noexcept nogil:             # <<<<<<<<<<<<<<
+ *             """Returns a borrowed reference to the object owning the data/memory.
+ *             """
+ */
+
+static CYTHON_INLINE PyObject *__pyx_f_5numpy_7ndarray_4base_base(PyArrayObject *__pyx_v_self) {
+  PyObject *__pyx_r;
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":370
+ *             """Returns a borrowed reference to the object owning the data/memory.
+ *             """
+ *             return PyArray_BASE(self)             # <<<<<<<<<<<<<<
+ * 
+ *         @property
+ */
+  __pyx_r = PyArray_BASE(__pyx_v_self);
+  goto __pyx_L0;
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":367
+ * 
+ *         @property
+ *         cdef inline PyObject* base(self) noexcept nogil:             # <<<<<<<<<<<<<<
+ *             """Returns a borrowed reference to the object owning the data/memory.
+ *             """
+ */
+
+  /* function exit code */
+  __pyx_L0:;
+  return __pyx_r;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":373
+ * 
+ *         @property
+ *         cdef inline dtype descr(self):             # <<<<<<<<<<<<<<
+ *             """Returns an owned reference to the dtype of the array.
+ *             """
+ */
+
+static CYTHON_INLINE PyArray_Descr *__pyx_f_5numpy_7ndarray_5descr_descr(PyArrayObject *__pyx_v_self) {
+  PyArray_Descr *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyArray_Descr *__pyx_t_1;
+  __Pyx_RefNannySetupContext("descr", 1);
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":376
+ *             """Returns an owned reference to the dtype of the array.
+ *             """
+ *             return <dtype>PyArray_DESCR(self)             # <<<<<<<<<<<<<<
+ * 
+ *         @property
+ */
+  __Pyx_XDECREF((PyObject *)__pyx_r);
+  __pyx_t_1 = PyArray_DESCR(__pyx_v_self);
+  __Pyx_INCREF((PyObject *)((PyArray_Descr *)__pyx_t_1));
+  __pyx_r = ((PyArray_Descr *)__pyx_t_1);
+  goto __pyx_L0;
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":373
+ * 
+ *         @property
+ *         cdef inline dtype descr(self):             # <<<<<<<<<<<<<<
+ *             """Returns an owned reference to the dtype of the array.
+ *             """
+ */
+
+  /* function exit code */
+  __pyx_L0:;
+  __Pyx_XGIVEREF((PyObject *)__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":379
+ * 
+ *         @property
+ *         cdef inline int ndim(self) noexcept nogil:             # <<<<<<<<<<<<<<
+ *             """Returns the number of dimensions in the array.
+ *             """
+ */
+
+static CYTHON_INLINE int __pyx_f_5numpy_7ndarray_4ndim_ndim(PyArrayObject *__pyx_v_self) {
+  int __pyx_r;
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":382
+ *             """Returns the number of dimensions in the array.
+ *             """
+ *             return PyArray_NDIM(self)             # <<<<<<<<<<<<<<
+ * 
+ *         @property
+ */
+  __pyx_r = PyArray_NDIM(__pyx_v_self);
+  goto __pyx_L0;
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":379
+ * 
+ *         @property
+ *         cdef inline int ndim(self) noexcept nogil:             # <<<<<<<<<<<<<<
+ *             """Returns the number of dimensions in the array.
+ *             """
+ */
+
+  /* function exit code */
+  __pyx_L0:;
+  return __pyx_r;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":385
+ * 
+ *         @property
+ *         cdef inline npy_intp *shape(self) noexcept nogil:             # <<<<<<<<<<<<<<
+ *             """Returns a pointer to the dimensions/shape of the array.
+ *             The number of elements matches the number of dimensions of the array (ndim).
+ */
+
+static CYTHON_INLINE npy_intp *__pyx_f_5numpy_7ndarray_5shape_shape(PyArrayObject *__pyx_v_self) {  /* C body emitted for the `ndarray.shape` property of numpy's .pxd (declared `noexcept nogil` there). */
+  npy_intp *__pyx_r;  /* return slot; assigned exactly once below */
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":390
+ *             Can return NULL for 0-dimensional arrays.
+ *             """
+ *             return PyArray_DIMS(self)             # <<<<<<<<<<<<<<
+ * 
+ *         @property
+ */
+  __pyx_r = PyArray_DIMS(__pyx_v_self);  /* pointer is handed back as-is (no copy); per the echoed docstring it can be NULL for 0-dimensional arrays */
+  goto __pyx_L0;  /* the only path to the exit label; there is no error label in this function */
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":385
+ * 
+ *         @property
+ *         cdef inline npy_intp *shape(self) noexcept nogil:             # <<<<<<<<<<<<<<
+ *             """Returns a pointer to the dimensions/shape of the array.
+ *             The number of elements matches the number of dimensions of the array (ndim).
+ */
+
+  /* function exit code */
+  __pyx_L0:;  /* nothing to release: no Python references are created here */
+  return __pyx_r;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":393
+ * 
+ *         @property
+ *         cdef inline npy_intp *strides(self) noexcept nogil:             # <<<<<<<<<<<<<<
+ *             """Returns a pointer to the strides of the array.
+ *             The number of elements matches the number of dimensions of the array (ndim).
+ */
+
+static CYTHON_INLINE npy_intp *__pyx_f_5numpy_7ndarray_7strides_strides(PyArrayObject *__pyx_v_self) {  /* C body emitted for the `ndarray.strides` property of numpy's .pxd (declared `noexcept nogil` there). */
+  npy_intp *__pyx_r;  /* return slot; assigned exactly once below */
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":397
+ *             The number of elements matches the number of dimensions of the array (ndim).
+ *             """
+ *             return PyArray_STRIDES(self)             # <<<<<<<<<<<<<<
+ * 
+ *         @property
+ */
+  __pyx_r = PyArray_STRIDES(__pyx_v_self);  /* pointer is handed back as-is (no copy); per the echoed docstring it has one entry per dimension */
+  goto __pyx_L0;  /* the only path to the exit label; there is no error label in this function */
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":393
+ * 
+ *         @property
+ *         cdef inline npy_intp *strides(self) noexcept nogil:             # <<<<<<<<<<<<<<
+ *             """Returns a pointer to the strides of the array.
+ *             The number of elements matches the number of dimensions of the array (ndim).
+ */
+
+  /* function exit code */
+  __pyx_L0:;  /* nothing to release: no Python references are created here */
+  return __pyx_r;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":400
+ * 
+ *         @property
+ *         cdef inline npy_intp size(self) noexcept nogil:             # <<<<<<<<<<<<<<
+ *             """Returns the total size (in number of elements) of the array.
+ *             """
+ */
+
+static CYTHON_INLINE npy_intp __pyx_f_5numpy_7ndarray_4size_size(PyArrayObject *__pyx_v_self) {  /* C body emitted for the `ndarray.size` property of numpy's .pxd (declared `noexcept nogil` there). */
+  npy_intp __pyx_r;  /* return slot; assigned exactly once below */
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":403
+ *             """Returns the total size (in number of elements) of the array.
+ *             """
+ *             return PyArray_SIZE(self)             # <<<<<<<<<<<<<<
+ * 
+ *         @property
+ */
+  __pyx_r = PyArray_SIZE(__pyx_v_self);  /* element count (not bytes) per the echoed docstring; self is forwarded unchecked */
+  goto __pyx_L0;  /* the only path to the exit label; there is no error label in this function */
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":400
+ * 
+ *         @property
+ *         cdef inline npy_intp size(self) noexcept nogil:             # <<<<<<<<<<<<<<
+ *             """Returns the total size (in number of elements) of the array.
+ *             """
+ */
+
+  /* function exit code */
+  __pyx_L0:;  /* nothing to release: no Python references are created here */
+  return __pyx_r;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":406
+ * 
+ *         @property
+ *         cdef inline char* data(self) noexcept nogil:             # <<<<<<<<<<<<<<
+ *             """The pointer to the data buffer as a char*.
+ *             This is provided for legacy reasons to avoid direct struct field access.
+ */
+
+static CYTHON_INLINE char *__pyx_f_5numpy_7ndarray_4data_data(PyArrayObject *__pyx_v_self) {  /* C body emitted for the `ndarray.data` property of numpy's .pxd (declared `noexcept nogil` there). */
+  char *__pyx_r;  /* return slot; assigned exactly once below */
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":412
+ *             of `PyArray_DATA()` instead, which returns a 'void*'.
+ *             """
+ *             return PyArray_BYTES(self)             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  __pyx_r = PyArray_BYTES(__pyx_v_self);  /* data pointer typed as char*; returned as-is with no copy made in this function */
+  goto __pyx_L0;  /* the only path to the exit label; there is no error label in this function */
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":406
+ * 
+ *         @property
+ *         cdef inline char* data(self) noexcept nogil:             # <<<<<<<<<<<<<<
+ *             """The pointer to the data buffer as a char*.
+ *             This is provided for legacy reasons to avoid direct struct field access.
+ */
+
+  /* function exit code */
+  __pyx_L0:;  /* nothing to release: no Python references are created here */
+  return __pyx_r;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":824
+ * ctypedef long double complex clongdouble_t
+ * 
+ * cdef inline object PyArray_MultiIterNew1(a):             # <<<<<<<<<<<<<<
+ *     return PyArray_MultiIterNew(1, <void*>a)
+ * 
+ */
+
+static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew1(PyObject *__pyx_v_a) {  /* One-argument wrapper over the variadic PyArray_MultiIterNew; returns the new object, or NULL on failure. */
+  PyObject *__pyx_r = NULL;  /* return slot */
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;  /* temporary holding the call result until ownership moves to __pyx_r */
+  int __pyx_lineno = 0;  /* position trio passed to __Pyx_AddTraceback on the error path */
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("PyArray_MultiIterNew1", 1);
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":825
+ * 
+ * cdef inline object PyArray_MultiIterNew1(a):
+ *     return PyArray_MultiIterNew(1, <void*>a)             # <<<<<<<<<<<<<<
+ * 
+ * cdef inline object PyArray_MultiIterNew2(a, b):
+ */
+  __Pyx_XDECREF(__pyx_r);  /* __pyx_r is still NULL here (set at declaration, not yet assigned) */
+  __pyx_t_1 = PyArray_MultiIterNew(1, ((void *)__pyx_v_a)); if (unlikely(!__pyx_t_1)) __PYX_ERR(2, 825, __pyx_L1_error)  /* leading 1 is the argument count; a NULL result is routed to __pyx_L1_error */
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_r = __pyx_t_1;  /* move the reference into the return slot... */
+  __pyx_t_1 = 0;  /* ...and clear the temp so it is not referenced twice */
+  goto __pyx_L0;  /* success: skip the error block */
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":824
+ * ctypedef long double complex clongdouble_t
+ * 
+ * cdef inline object PyArray_MultiIterNew1(a):             # <<<<<<<<<<<<<<
+ *     return PyArray_MultiIterNew(1, <void*>a)
+ * 
+ */
+
+  /* function exit code */
+  __pyx_L1_error:;  /* reached only when PyArray_MultiIterNew returned NULL */
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_AddTraceback("numpy.PyArray_MultiIterNew1", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = 0;  /* NULL return tells the caller the call failed */
+  __pyx_L0:;  /* common exit for success and failure */
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":827
+ *     return PyArray_MultiIterNew(1, <void*>a)
+ * 
+ * cdef inline object PyArray_MultiIterNew2(a, b):             # <<<<<<<<<<<<<<
+ *     return PyArray_MultiIterNew(2, <void*>a, <void*>b)
+ * 
+ */
+
+static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew2(PyObject *__pyx_v_a, PyObject *__pyx_v_b) {  /* Two-argument wrapper over the variadic PyArray_MultiIterNew; returns the new object, or NULL on failure. */
+  PyObject *__pyx_r = NULL;  /* return slot */
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;  /* temporary holding the call result until ownership moves to __pyx_r */
+  int __pyx_lineno = 0;  /* position trio passed to __Pyx_AddTraceback on the error path */
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("PyArray_MultiIterNew2", 1);
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":828
+ * 
+ * cdef inline object PyArray_MultiIterNew2(a, b):
+ *     return PyArray_MultiIterNew(2, <void*>a, <void*>b)             # <<<<<<<<<<<<<<
+ * 
+ * cdef inline object PyArray_MultiIterNew3(a, b, c):
+ */
+  __Pyx_XDECREF(__pyx_r);  /* __pyx_r is still NULL here (set at declaration, not yet assigned) */
+  __pyx_t_1 = PyArray_MultiIterNew(2, ((void *)__pyx_v_a), ((void *)__pyx_v_b)); if (unlikely(!__pyx_t_1)) __PYX_ERR(2, 828, __pyx_L1_error)  /* leading 2 is the argument count; a NULL result is routed to __pyx_L1_error */
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_r = __pyx_t_1;  /* move the reference into the return slot... */
+  __pyx_t_1 = 0;  /* ...and clear the temp so it is not referenced twice */
+  goto __pyx_L0;  /* success: skip the error block */
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":827
+ *     return PyArray_MultiIterNew(1, <void*>a)
+ * 
+ * cdef inline object PyArray_MultiIterNew2(a, b):             # <<<<<<<<<<<<<<
+ *     return PyArray_MultiIterNew(2, <void*>a, <void*>b)
+ * 
+ */
+
+  /* function exit code */
+  __pyx_L1_error:;  /* reached only when PyArray_MultiIterNew returned NULL */
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_AddTraceback("numpy.PyArray_MultiIterNew2", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = 0;  /* NULL return tells the caller the call failed */
+  __pyx_L0:;  /* common exit for success and failure */
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":830
+ *     return PyArray_MultiIterNew(2, <void*>a, <void*>b)
+ * 
+ * cdef inline object PyArray_MultiIterNew3(a, b, c):             # <<<<<<<<<<<<<<
+ *     return PyArray_MultiIterNew(3, <void*>a, <void*>b, <void*> c)
+ * 
+ */
+
+static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew3(PyObject *__pyx_v_a, PyObject *__pyx_v_b, PyObject *__pyx_v_c) {  /* Three-argument wrapper over the variadic PyArray_MultiIterNew; returns the new object, or NULL on failure. */
+  PyObject *__pyx_r = NULL;  /* return slot */
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;  /* temporary holding the call result until ownership moves to __pyx_r */
+  int __pyx_lineno = 0;  /* position trio passed to __Pyx_AddTraceback on the error path */
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("PyArray_MultiIterNew3", 1);
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":831
+ * 
+ * cdef inline object PyArray_MultiIterNew3(a, b, c):
+ *     return PyArray_MultiIterNew(3, <void*>a, <void*>b, <void*> c)             # <<<<<<<<<<<<<<
+ * 
+ * cdef inline object PyArray_MultiIterNew4(a, b, c, d):
+ */
+  __Pyx_XDECREF(__pyx_r);  /* __pyx_r is still NULL here (set at declaration, not yet assigned) */
+  __pyx_t_1 = PyArray_MultiIterNew(3, ((void *)__pyx_v_a), ((void *)__pyx_v_b), ((void *)__pyx_v_c)); if (unlikely(!__pyx_t_1)) __PYX_ERR(2, 831, __pyx_L1_error)  /* leading 3 is the argument count; a NULL result is routed to __pyx_L1_error */
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_r = __pyx_t_1;  /* move the reference into the return slot... */
+  __pyx_t_1 = 0;  /* ...and clear the temp so it is not referenced twice */
+  goto __pyx_L0;  /* success: skip the error block */
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":830
+ *     return PyArray_MultiIterNew(2, <void*>a, <void*>b)
+ * 
+ * cdef inline object PyArray_MultiIterNew3(a, b, c):             # <<<<<<<<<<<<<<
+ *     return PyArray_MultiIterNew(3, <void*>a, <void*>b, <void*> c)
+ * 
+ */
+
+  /* function exit code */
+  __pyx_L1_error:;  /* reached only when PyArray_MultiIterNew returned NULL */
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_AddTraceback("numpy.PyArray_MultiIterNew3", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = 0;  /* NULL return tells the caller the call failed */
+  __pyx_L0:;  /* common exit for success and failure */
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":833
+ *     return PyArray_MultiIterNew(3, <void*>a, <void*>b, <void*> c)
+ * 
+ * cdef inline object PyArray_MultiIterNew4(a, b, c, d):             # <<<<<<<<<<<<<<
+ *     return PyArray_MultiIterNew(4, <void*>a, <void*>b, <void*>c, <void*> d)
+ * 
+ */
+
+static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew4(PyObject *__pyx_v_a, PyObject *__pyx_v_b, PyObject *__pyx_v_c, PyObject *__pyx_v_d) {  /* Four-argument wrapper over the variadic PyArray_MultiIterNew; returns the new object, or NULL on failure. */
+  PyObject *__pyx_r = NULL;  /* return slot */
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;  /* temporary holding the call result until ownership moves to __pyx_r */
+  int __pyx_lineno = 0;  /* position trio passed to __Pyx_AddTraceback on the error path */
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("PyArray_MultiIterNew4", 1);
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":834
+ * 
+ * cdef inline object PyArray_MultiIterNew4(a, b, c, d):
+ *     return PyArray_MultiIterNew(4, <void*>a, <void*>b, <void*>c, <void*> d)             # <<<<<<<<<<<<<<
+ * 
+ * cdef inline object PyArray_MultiIterNew5(a, b, c, d, e):
+ */
+  __Pyx_XDECREF(__pyx_r);  /* __pyx_r is still NULL here (set at declaration, not yet assigned) */
+  __pyx_t_1 = PyArray_MultiIterNew(4, ((void *)__pyx_v_a), ((void *)__pyx_v_b), ((void *)__pyx_v_c), ((void *)__pyx_v_d)); if (unlikely(!__pyx_t_1)) __PYX_ERR(2, 834, __pyx_L1_error)  /* leading 4 is the argument count; a NULL result is routed to __pyx_L1_error */
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_r = __pyx_t_1;  /* move the reference into the return slot... */
+  __pyx_t_1 = 0;  /* ...and clear the temp so it is not referenced twice */
+  goto __pyx_L0;  /* success: skip the error block */
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":833
+ *     return PyArray_MultiIterNew(3, <void*>a, <void*>b, <void*> c)
+ * 
+ * cdef inline object PyArray_MultiIterNew4(a, b, c, d):             # <<<<<<<<<<<<<<
+ *     return PyArray_MultiIterNew(4, <void*>a, <void*>b, <void*>c, <void*> d)
+ * 
+ */
+
+  /* function exit code */
+  __pyx_L1_error:;  /* reached only when PyArray_MultiIterNew returned NULL */
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_AddTraceback("numpy.PyArray_MultiIterNew4", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = 0;  /* NULL return tells the caller the call failed */
+  __pyx_L0:;  /* common exit for success and failure */
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":836
+ *     return PyArray_MultiIterNew(4, <void*>a, <void*>b, <void*>c, <void*> d)
+ * 
+ * cdef inline object PyArray_MultiIterNew5(a, b, c, d, e):             # <<<<<<<<<<<<<<
+ *     return PyArray_MultiIterNew(5, <void*>a, <void*>b, <void*>c, <void*> d, <void*> e)
+ * 
+ */
+
+static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew5(PyObject *__pyx_v_a, PyObject *__pyx_v_b, PyObject *__pyx_v_c, PyObject *__pyx_v_d, PyObject *__pyx_v_e) {  /* Five-argument wrapper over the variadic PyArray_MultiIterNew; returns the new object, or NULL on failure. */
+  PyObject *__pyx_r = NULL;  /* return slot */
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;  /* temporary holding the call result until ownership moves to __pyx_r */
+  int __pyx_lineno = 0;  /* position trio passed to __Pyx_AddTraceback on the error path */
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("PyArray_MultiIterNew5", 1);
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":837
+ * 
+ * cdef inline object PyArray_MultiIterNew5(a, b, c, d, e):
+ *     return PyArray_MultiIterNew(5, <void*>a, <void*>b, <void*>c, <void*> d, <void*> e)             # <<<<<<<<<<<<<<
+ * 
+ * cdef inline tuple PyDataType_SHAPE(dtype d):
+ */
+  __Pyx_XDECREF(__pyx_r);  /* __pyx_r is still NULL here (set at declaration, not yet assigned) */
+  __pyx_t_1 = PyArray_MultiIterNew(5, ((void *)__pyx_v_a), ((void *)__pyx_v_b), ((void *)__pyx_v_c), ((void *)__pyx_v_d), ((void *)__pyx_v_e)); if (unlikely(!__pyx_t_1)) __PYX_ERR(2, 837, __pyx_L1_error)  /* leading 5 is the argument count; a NULL result is routed to __pyx_L1_error */
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_r = __pyx_t_1;  /* move the reference into the return slot... */
+  __pyx_t_1 = 0;  /* ...and clear the temp so it is not referenced twice */
+  goto __pyx_L0;  /* success: skip the error block */
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":836
+ *     return PyArray_MultiIterNew(4, <void*>a, <void*>b, <void*>c, <void*> d)
+ * 
+ * cdef inline object PyArray_MultiIterNew5(a, b, c, d, e):             # <<<<<<<<<<<<<<
+ *     return PyArray_MultiIterNew(5, <void*>a, <void*>b, <void*>c, <void*> d, <void*> e)
+ * 
+ */
+
+  /* function exit code */
+  __pyx_L1_error:;  /* reached only when PyArray_MultiIterNew returned NULL */
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_AddTraceback("numpy.PyArray_MultiIterNew5", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = 0;  /* NULL return tells the caller the call failed */
+  __pyx_L0:;  /* common exit for success and failure */
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":839
+ *     return PyArray_MultiIterNew(5, <void*>a, <void*>b, <void*>c, <void*> d, <void*> e)
+ * 
+ * cdef inline tuple PyDataType_SHAPE(dtype d):             # <<<<<<<<<<<<<<
+ *     if PyDataType_HASSUBARRAY(d):
+ *         return <tuple>d.subarray.shape
+ */
+
+static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyDataType_SHAPE(PyArray_Descr *__pyx_v_d) {  /* Returns a new reference to the dtype's subarray shape, or to the shared empty tuple when there is no subarray. */
+  PyObject *__pyx_r = NULL;  /* return slot */
+  __Pyx_RefNannyDeclarations
+  int __pyx_t_1;  /* result of the HASSUBARRAY test */
+  PyObject *__pyx_t_2;  /* left uninitialised; only written and read inside the subarray branch */
+  __Pyx_RefNannySetupContext("PyDataType_SHAPE", 1);
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":840
+ * 
+ * cdef inline tuple PyDataType_SHAPE(dtype d):
+ *     if PyDataType_HASSUBARRAY(d):             # <<<<<<<<<<<<<<
+ *         return <tuple>d.subarray.shape
+ *     else:
+ */
+  __pyx_t_1 = PyDataType_HASSUBARRAY(__pyx_v_d);
+  if (__pyx_t_1) {
+
+    /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":841
+ * cdef inline tuple PyDataType_SHAPE(dtype d):
+ *     if PyDataType_HASSUBARRAY(d):
+ *         return <tuple>d.subarray.shape             # <<<<<<<<<<<<<<
+ *     else:
+ *         return ()
+ */
+    __Pyx_XDECREF(__pyx_r);  /* __pyx_r is still NULL here (set at declaration, not yet assigned) */
+    __pyx_t_2 = __pyx_f_5numpy_5dtype_8subarray_subarray(__pyx_v_d)->shape;  /* field read only; no reference is taken yet */
+    __Pyx_INCREF(((PyObject*)__pyx_t_2));  /* take our own reference before handing it to the caller */
+    __pyx_r = ((PyObject*)__pyx_t_2);  /* plain pointer cast: the `<tuple>` in the source adds no runtime type check here */
+    goto __pyx_L0;
+
+    /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":840
+ * 
+ * cdef inline tuple PyDataType_SHAPE(dtype d):
+ *     if PyDataType_HASSUBARRAY(d):             # <<<<<<<<<<<<<<
+ *         return <tuple>d.subarray.shape
+ *     else:
+ */
+  }
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":843
+ *         return <tuple>d.subarray.shape
+ *     else:
+ *         return ()             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  /*else*/ {
+    __Pyx_XDECREF(__pyx_r);  /* __pyx_r is still NULL on this path as well */
+    __Pyx_INCREF(__pyx_empty_tuple);  /* `()` is served from the module-level __pyx_empty_tuple object, with a new reference */
+    __pyx_r = __pyx_empty_tuple;
+    goto __pyx_L0;
+  }
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":839
+ *     return PyArray_MultiIterNew(5, <void*>a, <void*>b, <void*>c, <void*> d, <void*> e)
+ * 
+ * cdef inline tuple PyDataType_SHAPE(dtype d):             # <<<<<<<<<<<<<<
+ *     if PyDataType_HASSUBARRAY(d):
+ *         return <tuple>d.subarray.shape
+ */
+
+  /* function exit code */
+  __pyx_L0:;  /* only exit: this function has no error label, so both branches arrive here with __pyx_r set */
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1027
+ *     int _import_umath() except -1
+ * 
+ * cdef inline void set_array_base(ndarray arr, object base) except *:             # <<<<<<<<<<<<<<
+ *     Py_INCREF(base) # important to do this before stealing the reference below!
+ *     PyArray_SetBaseObject(arr, base)
+ */
+
+static CYTHON_INLINE void __pyx_f_5numpy_set_array_base(PyArrayObject *__pyx_v_arr, PyObject *__pyx_v_base) {  /* Installs `base` as the base object of `arr`; returns void, so failure is not visible in the return value. */
+  int __pyx_t_1;  /* status returned by PyArray_SetBaseObject */
+  int __pyx_lineno = 0;  /* position trio passed to __Pyx_AddTraceback on the error path */
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1028
+ * 
+ * cdef inline void set_array_base(ndarray arr, object base) except *:
+ *     Py_INCREF(base) # important to do this before stealing the reference below!             # <<<<<<<<<<<<<<
+ *     PyArray_SetBaseObject(arr, base)
+ * 
+ */
+  Py_INCREF(__pyx_v_base);  /* must precede the call below, which steals a reference (per the echoed source comment) */
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1029
+ * cdef inline void set_array_base(ndarray arr, object base) except *:
+ *     Py_INCREF(base) # important to do this before stealing the reference below!
+ *     PyArray_SetBaseObject(arr, base)             # <<<<<<<<<<<<<<
+ * 
+ * cdef inline object get_array_base(ndarray arr):
+ */
+  __pyx_t_1 = PyArray_SetBaseObject(__pyx_v_arr, __pyx_v_base); if (unlikely(__pyx_t_1 == ((int)-1))) __PYX_ERR(2, 1029, __pyx_L1_error)  /* -1 is the failure status and is routed to __pyx_L1_error */
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1027
+ *     int _import_umath() except -1
+ * 
+ * cdef inline void set_array_base(ndarray arr, object base) except *:             # <<<<<<<<<<<<<<
+ *     Py_INCREF(base) # important to do this before stealing the reference below!
+ *     PyArray_SetBaseObject(arr, base)
+ */
+
+  /* function exit code */
+  goto __pyx_L0;  /* success: skip the traceback step */
+  __pyx_L1_error:;
+  __Pyx_AddTraceback("numpy.set_array_base", __pyx_clineno, __pyx_lineno, __pyx_filename);  /* declared `except *` in the source: callers presumably detect failure via the pending exception (confirm at call sites) */
+  __pyx_L0:;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1031
+ *     PyArray_SetBaseObject(arr, base)
+ * 
+ * cdef inline object get_array_base(ndarray arr):             # <<<<<<<<<<<<<<
+ *     base = PyArray_BASE(arr)
+ *     if base is NULL:
+ */
+
+static CYTHON_INLINE PyObject *__pyx_f_5numpy_get_array_base(PyArrayObject *__pyx_v_arr) {  /* Returns a new reference to the array's base object, or to None when the base pointer is NULL. */
+  PyObject *__pyx_v_base;  /* raw pointer read from the array; not owned by this function until INCREF'd below */
+  PyObject *__pyx_r = NULL;  /* return slot */
+  __Pyx_RefNannyDeclarations
+  int __pyx_t_1;  /* result of the NULL test */
+  __Pyx_RefNannySetupContext("get_array_base", 1);
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1032
+ * 
+ * cdef inline object get_array_base(ndarray arr):
+ *     base = PyArray_BASE(arr)             # <<<<<<<<<<<<<<
+ *     if base is NULL:
+ *         return None
+ */
+  __pyx_v_base = PyArray_BASE(__pyx_v_arr);
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1033
+ * cdef inline object get_array_base(ndarray arr):
+ *     base = PyArray_BASE(arr)
+ *     if base is NULL:             # <<<<<<<<<<<<<<
+ *         return None
+ *     return <object>base
+ */
+  __pyx_t_1 = (__pyx_v_base == NULL);
+  if (__pyx_t_1) {
+
+    /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1034
+ *     base = PyArray_BASE(arr)
+ *     if base is NULL:
+ *         return None             # <<<<<<<<<<<<<<
+ *     return <object>base
+ * 
+ */
+    __Pyx_XDECREF(__pyx_r);  /* __pyx_r is still NULL here (set at declaration, not yet assigned) */
+    __pyx_r = Py_None; __Pyx_INCREF(Py_None);  /* a NULL base is reported to Python code as None, with a new reference */
+    goto __pyx_L0;
+
+    /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1033
+ * cdef inline object get_array_base(ndarray arr):
+ *     base = PyArray_BASE(arr)
+ *     if base is NULL:             # <<<<<<<<<<<<<<
+ *         return None
+ *     return <object>base
+ */
+  }
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1035
+ *     if base is NULL:
+ *         return None
+ *     return <object>base             # <<<<<<<<<<<<<<
+ * 
+ * # Versions of the import_* functions which are more suitable for
+ */
+  __Pyx_XDECREF(__pyx_r);  /* __pyx_r is still NULL on this path as well */
+  __Pyx_INCREF(((PyObject *)__pyx_v_base));  /* base is non-NULL here; take a reference so the caller owns what it receives */
+  __pyx_r = ((PyObject *)__pyx_v_base);
+  goto __pyx_L0;
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1031
+ *     PyArray_SetBaseObject(arr, base)
+ * 
+ * cdef inline object get_array_base(ndarray arr):             # <<<<<<<<<<<<<<
+ *     base = PyArray_BASE(arr)
+ *     if base is NULL:
+ */
+
+  /* function exit code */
+  __pyx_L0:;  /* only exit: this function has no error label, so __pyx_r is always set on arrival */
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1039
+ * # Versions of the import_* functions which are more suitable for
+ * # Cython code.
+ * cdef inline int import_array() except -1:             # <<<<<<<<<<<<<<
+ *     try:
+ *         __pyx_import_array()
+ */
+
+static CYTHON_INLINE int __pyx_f_5numpy_import_array(void) {  /* Calls _import_array() inside a try/except; returns 0 on success and -1 on failure (source declares `except -1`). */
+  int __pyx_r;  /* return slot: 0 or -1 */
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;  /* t_1..t_3: exception state captured by __Pyx_ExceptionSave on entry to the try */
+  PyObject *__pyx_t_2 = NULL;
+  PyObject *__pyx_t_3 = NULL;
+  int __pyx_t_4;  /* scratch int: first the import status, later the except-match result */
+  PyObject *__pyx_t_5 = NULL;  /* t_5..t_7: the caught exception, filled by __Pyx_GetException */
+  PyObject *__pyx_t_6 = NULL;
+  PyObject *__pyx_t_7 = NULL;
+  PyObject *__pyx_t_8 = NULL;  /* the ImportError instance built in the handler */
+  int __pyx_lineno = 0;  /* position trio passed to __Pyx_AddTraceback */
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("import_array", 1);
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1040
+ * # Cython code.
+ * cdef inline int import_array() except -1:
+ *     try:             # <<<<<<<<<<<<<<
+ *         __pyx_import_array()
+ *     except Exception:
+ */
+  {
+    __Pyx_PyThreadState_declare
+    __Pyx_PyThreadState_assign
+    __Pyx_ExceptionSave(&__pyx_t_1, &__pyx_t_2, &__pyx_t_3);  /* saved state is either dropped on success or handed to __Pyx_ExceptionReset on failure */
+    __Pyx_XGOTREF(__pyx_t_1);
+    __Pyx_XGOTREF(__pyx_t_2);
+    __Pyx_XGOTREF(__pyx_t_3);
+    /*try:*/ {
+
+      /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1041
+ * cdef inline int import_array() except -1:
+ *     try:
+ *         __pyx_import_array()             # <<<<<<<<<<<<<<
+ *     except Exception:
+ *         raise ImportError("numpy._core.multiarray failed to import")
+ */
+      __pyx_t_4 = _import_array(); if (unlikely(__pyx_t_4 == ((int)-1))) __PYX_ERR(2, 1041, __pyx_L3_error)  /* the source spells this __pyx_import_array(); the emitted call is _import_array(); -1 enters the except clause */
+
+      /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1040
+ * # Cython code.
+ * cdef inline int import_array() except -1:
+ *     try:             # <<<<<<<<<<<<<<
+ *         __pyx_import_array()
+ *     except Exception:
+ */
+    }
+    __Pyx_XDECREF(__pyx_t_1); __pyx_t_1 = 0;  /* success path: release the saved exception state */
+    __Pyx_XDECREF(__pyx_t_2); __pyx_t_2 = 0;
+    __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0;
+    goto __pyx_L8_try_end;
+    __pyx_L3_error:;  /* entry to the except clause */
+
+    /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1042
+ *     try:
+ *         __pyx_import_array()
+ *     except Exception:             # <<<<<<<<<<<<<<
+ *         raise ImportError("numpy._core.multiarray failed to import")
+ * 
+ */
+    __pyx_t_4 = __Pyx_PyErr_ExceptionMatches(((PyObject *)(&((PyTypeObject*)PyExc_Exception)[0])));  /* is the pending error an Exception? */
+    if (__pyx_t_4) {
+      __Pyx_AddTraceback("numpy.import_array", __pyx_clineno, __pyx_lineno, __pyx_filename);
+      if (__Pyx_GetException(&__pyx_t_5, &__pyx_t_6, &__pyx_t_7) < 0) __PYX_ERR(2, 1042, __pyx_L5_except_error)  /* fetch the caught exception into t_5..t_7; they are released at __pyx_L1_error */
+      __Pyx_XGOTREF(__pyx_t_5);
+      __Pyx_XGOTREF(__pyx_t_6);
+      __Pyx_XGOTREF(__pyx_t_7);
+
+      /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1043
+ *         __pyx_import_array()
+ *     except Exception:
+ *         raise ImportError("numpy._core.multiarray failed to import")             # <<<<<<<<<<<<<<
+ * 
+ * cdef inline int import_umath() except -1:
+ */
+      __pyx_t_8 = __Pyx_PyObject_Call(__pyx_builtin_ImportError, __pyx_tuple__9, NULL); if (unlikely(!__pyx_t_8)) __PYX_ERR(2, 1043, __pyx_L5_except_error)  /* __pyx_tuple__9 is built elsewhere; presumably it holds the message shown in the echoed source */
+      __Pyx_GOTREF(__pyx_t_8);
+      __Pyx_Raise(__pyx_t_8, 0, 0, 0);  /* raise the new ImportError */
+      __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0;
+      __PYX_ERR(2, 1043, __pyx_L5_except_error)  /* the handler always ends in a raise, so it always leaves via the error label */
+    }
+    goto __pyx_L5_except_error;  /* pending error did not match Exception: it is not handled here */
+
+    /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1040
+ * # Cython code.
+ * cdef inline int import_array() except -1:
+ *     try:             # <<<<<<<<<<<<<<
+ *         __pyx_import_array()
+ *     except Exception:
+ */
+    __pyx_L5_except_error:;  /* every failure path of the try statement converges here */
+    __Pyx_XGIVEREF(__pyx_t_1);
+    __Pyx_XGIVEREF(__pyx_t_2);
+    __Pyx_XGIVEREF(__pyx_t_3);
+    __Pyx_ExceptionReset(__pyx_t_1, __pyx_t_2, __pyx_t_3);  /* hand back the state captured by __Pyx_ExceptionSave */
+    goto __pyx_L1_error;
+    __pyx_L8_try_end:;  /* reached only when _import_array() succeeded */
+  }
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1039
+ * # Versions of the import_* functions which are more suitable for
+ * # Cython code.
+ * cdef inline int import_array() except -1:             # <<<<<<<<<<<<<<
+ *     try:
+ *         __pyx_import_array()
+ */
+
+  /* function exit code */
+  __pyx_r = 0;  /* success */
+  goto __pyx_L0;
+  __pyx_L1_error:;  /* failure: release whatever temporaries are still held */
+  __Pyx_XDECREF(__pyx_t_5);
+  __Pyx_XDECREF(__pyx_t_6);
+  __Pyx_XDECREF(__pyx_t_7);
+  __Pyx_XDECREF(__pyx_t_8);
+  __Pyx_AddTraceback("numpy.import_array", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = -1;  /* the `except -1` sentinel declared in the source */
+  __pyx_L0:;
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1045
+ *         raise ImportError("numpy._core.multiarray failed to import")
+ * 
+ * cdef inline int import_umath() except -1:             # <<<<<<<<<<<<<<
+ *     try:
+ *         _import_umath()
+ */
+
+static CYTHON_INLINE int __pyx_f_5numpy_import_umath(void) {  /* Calls _import_umath() inside a try/except; returns 0 on success and -1 on failure (source declares `except -1`). */
+  int __pyx_r;  /* return slot: 0 or -1 */
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;  /* t_1..t_3: exception state captured by __Pyx_ExceptionSave on entry to the try */
+  PyObject *__pyx_t_2 = NULL;
+  PyObject *__pyx_t_3 = NULL;
+  int __pyx_t_4;  /* scratch int: first the import status, later the except-match result */
+  PyObject *__pyx_t_5 = NULL;  /* t_5..t_7: the caught exception, filled by __Pyx_GetException */
+  PyObject *__pyx_t_6 = NULL;
+  PyObject *__pyx_t_7 = NULL;
+  PyObject *__pyx_t_8 = NULL;  /* the ImportError instance built in the handler */
+  int __pyx_lineno = 0;  /* position trio passed to __Pyx_AddTraceback */
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("import_umath", 1);
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1046
+ * 
+ * cdef inline int import_umath() except -1:
+ *     try:             # <<<<<<<<<<<<<<
+ *         _import_umath()
+ *     except Exception:
+ */
+  {
+    __Pyx_PyThreadState_declare
+    __Pyx_PyThreadState_assign
+    __Pyx_ExceptionSave(&__pyx_t_1, &__pyx_t_2, &__pyx_t_3);  /* saved state is either dropped on success or handed to __Pyx_ExceptionReset on failure */
+    __Pyx_XGOTREF(__pyx_t_1);
+    __Pyx_XGOTREF(__pyx_t_2);
+    __Pyx_XGOTREF(__pyx_t_3);
+    /*try:*/ {
+
+      /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1047
+ * cdef inline int import_umath() except -1:
+ *     try:
+ *         _import_umath()             # <<<<<<<<<<<<<<
+ *     except Exception:
+ *         raise ImportError("numpy._core.umath failed to import")
+ */
+      __pyx_t_4 = _import_umath(); if (unlikely(__pyx_t_4 == ((int)-1))) __PYX_ERR(2, 1047, __pyx_L3_error)  /* -1 is the failure status and enters the except clause */
+
+      /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1046
+ * 
+ * cdef inline int import_umath() except -1:
+ *     try:             # <<<<<<<<<<<<<<
+ *         _import_umath()
+ *     except Exception:
+ */
+    }
+    __Pyx_XDECREF(__pyx_t_1); __pyx_t_1 = 0;  /* success path: release the saved exception state */
+    __Pyx_XDECREF(__pyx_t_2); __pyx_t_2 = 0;
+    __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0;
+    goto __pyx_L8_try_end;
+    __pyx_L3_error:;  /* entry to the except clause */
+
+    /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1048
+ *     try:
+ *         _import_umath()
+ *     except Exception:             # <<<<<<<<<<<<<<
+ *         raise ImportError("numpy._core.umath failed to import")
+ * 
+ */
+    __pyx_t_4 = __Pyx_PyErr_ExceptionMatches(((PyObject *)(&((PyTypeObject*)PyExc_Exception)[0])));  /* is the pending error an Exception? */
+    if (__pyx_t_4) {
+      __Pyx_AddTraceback("numpy.import_umath", __pyx_clineno, __pyx_lineno, __pyx_filename);
+      if (__Pyx_GetException(&__pyx_t_5, &__pyx_t_6, &__pyx_t_7) < 0) __PYX_ERR(2, 1048, __pyx_L5_except_error)  /* fetch the caught exception into t_5..t_7; they are released at __pyx_L1_error */
+      __Pyx_XGOTREF(__pyx_t_5);
+      __Pyx_XGOTREF(__pyx_t_6);
+      __Pyx_XGOTREF(__pyx_t_7);
+
+      /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1049
+ *         _import_umath()
+ *     except Exception:
+ *         raise ImportError("numpy._core.umath failed to import")             # <<<<<<<<<<<<<<
+ * 
+ * cdef inline int import_ufunc() except -1:
+ */
+      __pyx_t_8 = __Pyx_PyObject_Call(__pyx_builtin_ImportError, __pyx_tuple__10, NULL); if (unlikely(!__pyx_t_8)) __PYX_ERR(2, 1049, __pyx_L5_except_error)  /* __pyx_tuple__10 is built elsewhere; presumably it holds the message shown in the echoed source */
+      __Pyx_GOTREF(__pyx_t_8);
+      __Pyx_Raise(__pyx_t_8, 0, 0, 0);  /* raise the new ImportError */
+      __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0;
+      __PYX_ERR(2, 1049, __pyx_L5_except_error)  /* the handler always ends in a raise, so it always leaves via the error label */
+    }
+    goto __pyx_L5_except_error;  /* pending error did not match Exception: it is not handled here */
+
+    /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1046
+ * 
+ * cdef inline int import_umath() except -1:
+ *     try:             # <<<<<<<<<<<<<<
+ *         _import_umath()
+ *     except Exception:
+ */
+    __pyx_L5_except_error:;  /* every failure path of the try statement converges here */
+    __Pyx_XGIVEREF(__pyx_t_1);
+    __Pyx_XGIVEREF(__pyx_t_2);
+    __Pyx_XGIVEREF(__pyx_t_3);
+    __Pyx_ExceptionReset(__pyx_t_1, __pyx_t_2, __pyx_t_3);  /* hand back the state captured by __Pyx_ExceptionSave */
+    goto __pyx_L1_error;
+    __pyx_L8_try_end:;  /* reached only when _import_umath() succeeded */
+  }
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1045
+ *         raise ImportError("numpy._core.multiarray failed to import")
+ * 
+ * cdef inline int import_umath() except -1:             # <<<<<<<<<<<<<<
+ *     try:
+ *         _import_umath()
+ */
+
+  /* function exit code */
+  __pyx_r = 0;  /* success */
+  goto __pyx_L0;
+  __pyx_L1_error:;  /* failure: release whatever temporaries are still held */
+  __Pyx_XDECREF(__pyx_t_5);
+  __Pyx_XDECREF(__pyx_t_6);
+  __Pyx_XDECREF(__pyx_t_7);
+  __Pyx_XDECREF(__pyx_t_8);
+  __Pyx_AddTraceback("numpy.import_umath", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = -1;  /* the `except -1` sentinel declared in the source */
+  __pyx_L0:;
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1051
+ *         raise ImportError("numpy._core.umath failed to import")
+ * 
+ * cdef inline int import_ufunc() except -1:             # <<<<<<<<<<<<<<
+ *     try:
+ *         _import_umath()
+ */
+
+/* Generated C body of numpy's `cdef inline int import_ufunc() except -1`.
+ * Calls _import_umath(); when that call reports failure and the pending error
+ * matches Exception, the error is replaced by an ImportError built from
+ * __pyx_tuple__10. Returns 0 on success, or -1 with a Python exception set. */
+static CYTHON_INLINE int __pyx_f_5numpy_import_ufunc(void) {
+  int __pyx_r;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  PyObject *__pyx_t_2 = NULL;
+  PyObject *__pyx_t_3 = NULL;
+  int __pyx_t_4;
+  PyObject *__pyx_t_5 = NULL;
+  PyObject *__pyx_t_6 = NULL;
+  PyObject *__pyx_t_7 = NULL;
+  PyObject *__pyx_t_8 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("import_ufunc", 1);
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1052
+ * 
+ * cdef inline int import_ufunc() except -1:
+ *     try:             # <<<<<<<<<<<<<<
+ *         _import_umath()
+ *     except Exception:
+ */
+  {
+    __Pyx_PyThreadState_declare
+    __Pyx_PyThreadState_assign
+    /* Save the exception state in t_1..t_3; it is either dropped on success
+     * or handed back via __Pyx_ExceptionReset on the error path below. */
+    __Pyx_ExceptionSave(&__pyx_t_1, &__pyx_t_2, &__pyx_t_3);
+    __Pyx_XGOTREF(__pyx_t_1);
+    __Pyx_XGOTREF(__pyx_t_2);
+    __Pyx_XGOTREF(__pyx_t_3);
+    /*try:*/ {
+
+      /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1053
+ * cdef inline int import_ufunc() except -1:
+ *     try:
+ *         _import_umath()             # <<<<<<<<<<<<<<
+ *     except Exception:
+ *         raise ImportError("numpy._core.umath failed to import")
+ */
+      __pyx_t_4 = _import_umath(); if (unlikely(__pyx_t_4 == ((int)-1))) __PYX_ERR(2, 1053, __pyx_L3_error)
+
+      /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1052
+ * 
+ * cdef inline int import_ufunc() except -1:
+ *     try:             # <<<<<<<<<<<<<<
+ *         _import_umath()
+ *     except Exception:
+ */
+    }
+    /* Success: the saved exception state is no longer needed. */
+    __Pyx_XDECREF(__pyx_t_1); __pyx_t_1 = 0;
+    __Pyx_XDECREF(__pyx_t_2); __pyx_t_2 = 0;
+    __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0;
+    goto __pyx_L8_try_end;
+    /* Reached when _import_umath() returned -1. */
+    __pyx_L3_error:;
+
+    /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1054
+ *     try:
+ *         _import_umath()
+ *     except Exception:             # <<<<<<<<<<<<<<
+ *         raise ImportError("numpy._core.umath failed to import")
+ * 
+ */
+    __pyx_t_4 = __Pyx_PyErr_ExceptionMatches(((PyObject *)(&((PyTypeObject*)PyExc_Exception)[0])));
+    if (__pyx_t_4) {
+      __Pyx_AddTraceback("numpy.import_ufunc", __pyx_clineno, __pyx_lineno, __pyx_filename);
+      if (__Pyx_GetException(&__pyx_t_5, &__pyx_t_6, &__pyx_t_7) < 0) __PYX_ERR(2, 1054, __pyx_L5_except_error)
+      __Pyx_XGOTREF(__pyx_t_5);
+      __Pyx_XGOTREF(__pyx_t_6);
+      __Pyx_XGOTREF(__pyx_t_7);
+
+      /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1055
+ *         _import_umath()
+ *     except Exception:
+ *         raise ImportError("numpy._core.umath failed to import")             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+      /* Build and raise the ImportError; control always leaves through
+       * __pyx_L5_except_error after the raise. */
+      __pyx_t_8 = __Pyx_PyObject_Call(__pyx_builtin_ImportError, __pyx_tuple__10, NULL); if (unlikely(!__pyx_t_8)) __PYX_ERR(2, 1055, __pyx_L5_except_error)
+      __Pyx_GOTREF(__pyx_t_8);
+      __Pyx_Raise(__pyx_t_8, 0, 0, 0);
+      __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0;
+      __PYX_ERR(2, 1055, __pyx_L5_except_error)
+    }
+    /* Pending error did not match Exception: propagate it unchanged. */
+    goto __pyx_L5_except_error;
+
+    /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1052
+ * 
+ * cdef inline int import_ufunc() except -1:
+ *     try:             # <<<<<<<<<<<<<<
+ *         _import_umath()
+ *     except Exception:
+ */
+    /* Hand the saved exception state back before leaving with an error. */
+    __pyx_L5_except_error:;
+    __Pyx_XGIVEREF(__pyx_t_1);
+    __Pyx_XGIVEREF(__pyx_t_2);
+    __Pyx_XGIVEREF(__pyx_t_3);
+    __Pyx_ExceptionReset(__pyx_t_1, __pyx_t_2, __pyx_t_3);
+    goto __pyx_L1_error;
+    __pyx_L8_try_end:;
+  }
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1051
+ *         raise ImportError("numpy._core.umath failed to import")
+ * 
+ * cdef inline int import_ufunc() except -1:             # <<<<<<<<<<<<<<
+ *     try:
+ *         _import_umath()
+ */
+
+  /* function exit code */
+  __pyx_r = 0;
+  goto __pyx_L0;
+  /* Error exit: release temporaries, record a traceback entry, return -1. */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_5);
+  __Pyx_XDECREF(__pyx_t_6);
+  __Pyx_XDECREF(__pyx_t_7);
+  __Pyx_XDECREF(__pyx_t_8);
+  __Pyx_AddTraceback("numpy.import_ufunc", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = -1;
+  __pyx_L0:;
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1058
+ * 
+ * 
+ * cdef inline bint is_timedelta64_object(object obj) noexcept:             # <<<<<<<<<<<<<<
+ *     """
+ *     Cython equivalent of `isinstance(obj, np.timedelta64)`
+ */
+
+/* Returns the result of PyObject_TypeCheck(obj, &PyTimedeltaArrType_Type):
+ * nonzero when obj is an instance of numpy's timedelta64 scalar type.
+ * Never raises (declared noexcept in the .pxd). */
+static CYTHON_INLINE int __pyx_f_5numpy_is_timedelta64_object(PyObject *__pyx_v_obj) {
+  const int __pyx_is_timedelta = PyObject_TypeCheck(__pyx_v_obj, (&PyTimedeltaArrType_Type));
+  return __pyx_is_timedelta;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1073
+ * 
+ * 
+ * cdef inline bint is_datetime64_object(object obj) noexcept:             # <<<<<<<<<<<<<<
+ *     """
+ *     Cython equivalent of `isinstance(obj, np.datetime64)`
+ */
+
+/* Returns the result of PyObject_TypeCheck(obj, &PyDatetimeArrType_Type):
+ * nonzero when obj is an instance of numpy's datetime64 scalar type.
+ * Never raises (declared noexcept in the .pxd). */
+static CYTHON_INLINE int __pyx_f_5numpy_is_datetime64_object(PyObject *__pyx_v_obj) {
+  const int __pyx_is_datetime = PyObject_TypeCheck(__pyx_v_obj, (&PyDatetimeArrType_Type));
+  return __pyx_is_datetime;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1088
+ * 
+ * 
+ * cdef inline npy_datetime get_datetime64_value(object obj) noexcept nogil:             # <<<<<<<<<<<<<<
+ *     """
+ *     returns the int64 value underlying scalar numpy datetime64 object
+ */
+
+/* Reads the `obval` field of obj reinterpreted as PyDatetimeScalarObject.
+ * No type check is performed here; the caller must pass a datetime64 scalar. */
+static CYTHON_INLINE npy_datetime __pyx_f_5numpy_get_datetime64_value(PyObject *__pyx_v_obj) {
+  PyDatetimeScalarObject *__pyx_scalar = (PyDatetimeScalarObject *)__pyx_v_obj;
+  return __pyx_scalar->obval;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1098
+ * 
+ * 
+ * cdef inline npy_timedelta get_timedelta64_value(object obj) noexcept nogil:             # <<<<<<<<<<<<<<
+ *     """
+ *     returns the int64 value underlying scalar numpy timedelta64 object
+ */
+
+/* Reads the `obval` field of obj reinterpreted as PyTimedeltaScalarObject.
+ * No type check is performed here; the caller must pass a timedelta64 scalar. */
+static CYTHON_INLINE npy_timedelta __pyx_f_5numpy_get_timedelta64_value(PyObject *__pyx_v_obj) {
+  PyTimedeltaScalarObject *__pyx_scalar = (PyTimedeltaScalarObject *)__pyx_v_obj;
+  return __pyx_scalar->obval;
+}
+
+/* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1105
+ * 
+ * 
+ * cdef inline NPY_DATETIMEUNIT get_datetime64_unit(object obj) noexcept nogil:             # <<<<<<<<<<<<<<
+ *     """
+ *     returns the unit part of the dtype for a numpy datetime64 object.
+ */
+
+/* Reads `obmeta.base` from obj reinterpreted as PyDatetimeScalarObject and
+ * returns it cast to NPY_DATETIMEUNIT. No type check is performed here. */
+static CYTHON_INLINE NPY_DATETIMEUNIT __pyx_f_5numpy_get_datetime64_unit(PyObject *__pyx_v_obj) {
+  PyDatetimeScalarObject *__pyx_scalar = (PyDatetimeScalarObject *)__pyx_v_obj;
+  return ((NPY_DATETIMEUNIT)__pyx_scalar->obmeta.base);
+}
+
+/* "fairseq/data/token_block_utils_fast.pyx":22
+ * @cython.wraparound(False)
+ * @cython.nonecheck(False)
+ * cdef np.ndarray[DTYPE_t, ndim=2] _get_slice_indices_none_mode(np.ndarray[DTYPE_t, ndim=1] sizes, int block_size):             # <<<<<<<<<<<<<<
+ *     cdef DTYPE_t total_size = sizes.sum()
+ *     cdef DTYPE_t length = <DTYPE_t> ceil(total_size / <double> block_size)
+ */
+
+/* Generated C for the cdef function `_get_slice_indices_none_mode(sizes, block_size)`.
+ * Steps, as visible below:
+ *   1. total_size = sizes.sum()
+ *   2. length = ceil(total_size / (double)block_size)
+ *   3. slice_indices = np.zeros([length, 2], dtype=DTYPE)
+ *   4. for each row i: start = i * block_size,
+ *      end = min(start + block_size, total_size); row i = (start, end)
+ * Returns a new reference to slice_indices, or NULL with a Python exception
+ * set (ZeroDivisionError "float division" when block_size == 0). */
+static PyArrayObject *__pyx_f_7fairseq_4data_22token_block_utils_fast__get_slice_indices_none_mode(PyArrayObject *__pyx_v_sizes, int __pyx_v_block_size) {
+  __pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t __pyx_v_total_size;
+  __pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t __pyx_v_length;
+  PyArrayObject *__pyx_v_slice_indices = 0;
+  __Pyx_memviewslice __pyx_v_slice_indices_view = { 0, 0, { 0 }, { 0 }, { 0 } };
+  __pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t __pyx_v_i;
+  __pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t __pyx_v_start;
+  __pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t __pyx_v_end;
+  __Pyx_LocalBuf_ND __pyx_pybuffernd_sizes;
+  __Pyx_Buffer __pyx_pybuffer_sizes;
+  __Pyx_LocalBuf_ND __pyx_pybuffernd_slice_indices;
+  __Pyx_Buffer __pyx_pybuffer_slice_indices;
+  PyArrayObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  PyObject *__pyx_t_2 = NULL;
+  PyObject *__pyx_t_3 = NULL;
+  unsigned int __pyx_t_4;
+  __pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t __pyx_t_5;
+  PyObject *__pyx_t_6 = NULL;
+  PyArrayObject *__pyx_t_7 = NULL;
+  __Pyx_memviewslice __pyx_t_8 = { 0, 0, { 0 }, { 0 }, { 0 } };
+  __pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t __pyx_t_9;
+  __pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t __pyx_t_10;
+  __pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t __pyx_t_11;
+  __pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t __pyx_t_12;
+  __pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t __pyx_t_13;
+  int __pyx_t_14;
+  Py_ssize_t __pyx_t_15;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("_get_slice_indices_none_mode", 1);
+  /* Mark both buffer wrappers as "not acquired" so the release calls at the
+   * exits are safe even if acquisition never happened. */
+  __pyx_pybuffer_slice_indices.pybuffer.buf = NULL;
+  __pyx_pybuffer_slice_indices.refcount = 0;
+  __pyx_pybuffernd_slice_indices.data = NULL;
+  __pyx_pybuffernd_slice_indices.rcbuffer = &__pyx_pybuffer_slice_indices;
+  __pyx_pybuffer_sizes.pybuffer.buf = NULL;
+  __pyx_pybuffer_sizes.refcount = 0;
+  __pyx_pybuffernd_sizes.data = NULL;
+  __pyx_pybuffernd_sizes.rcbuffer = &__pyx_pybuffer_sizes;
+  /* Acquire and validate a 1-dimensional DTYPE_t buffer on `sizes`. */
+  {
+    __Pyx_BufFmt_StackElem __pyx_stack[1];
+    if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_sizes.rcbuffer->pybuffer, (PyObject*)__pyx_v_sizes, &__Pyx_TypeInfo_nn___pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t, PyBUF_FORMAT| PyBUF_STRIDES, 1, 0, __pyx_stack) == -1)) __PYX_ERR(0, 22, __pyx_L1_error)
+  }
+  __pyx_pybuffernd_sizes.diminfo[0].strides = __pyx_pybuffernd_sizes.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_sizes.diminfo[0].shape = __pyx_pybuffernd_sizes.rcbuffer->pybuffer.shape[0];
+
+  /* "fairseq/data/token_block_utils_fast.pyx":23
+ * @cython.nonecheck(False)
+ * cdef np.ndarray[DTYPE_t, ndim=2] _get_slice_indices_none_mode(np.ndarray[DTYPE_t, ndim=1] sizes, int block_size):
+ *     cdef DTYPE_t total_size = sizes.sum()             # <<<<<<<<<<<<<<
+ *     cdef DTYPE_t length = <DTYPE_t> ceil(total_size / <double> block_size)
+ *     cdef np.ndarray[DTYPE_t, ndim=2] slice_indices = np.zeros([length, 2], dtype=DTYPE)
+ */
+  __pyx_t_2 = __Pyx_PyObject_GetAttrStr(((PyObject *)__pyx_v_sizes), __pyx_n_s_sum); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 23, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_2);
+  __pyx_t_3 = NULL;
+  __pyx_t_4 = 0;
+  /* If `sum` is a bound method, split it into (function, self) so self can be
+   * passed as the first positional argument of the fast call. */
+  #if CYTHON_UNPACK_METHODS
+  if (likely(PyMethod_Check(__pyx_t_2))) {
+    __pyx_t_3 = PyMethod_GET_SELF(__pyx_t_2);
+    if (likely(__pyx_t_3)) {
+      PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_2);
+      __Pyx_INCREF(__pyx_t_3);
+      __Pyx_INCREF(function);
+      __Pyx_DECREF_SET(__pyx_t_2, function);
+      __pyx_t_4 = 1;
+    }
+  }
+  #endif
+  {
+    PyObject *__pyx_callargs[2] = {__pyx_t_3, NULL};
+    __pyx_t_1 = __Pyx_PyObject_FastCall(__pyx_t_2, __pyx_callargs+1-__pyx_t_4, 0+__pyx_t_4);
+    __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0;
+    if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 23, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_1);
+    __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+  }
+  /* Convert the Python result of sum() to npy_int64; -1 is only an error when
+   * an exception is also pending. */
+  __pyx_t_5 = __Pyx_PyInt_As_npy_int64(__pyx_t_1); if (unlikely((__pyx_t_5 == ((npy_int64)-1)) && PyErr_Occurred())) __PYX_ERR(0, 23, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+  __pyx_v_total_size = __pyx_t_5;
+
+  /* "fairseq/data/token_block_utils_fast.pyx":24
+ * cdef np.ndarray[DTYPE_t, ndim=2] _get_slice_indices_none_mode(np.ndarray[DTYPE_t, ndim=1] sizes, int block_size):
+ *     cdef DTYPE_t total_size = sizes.sum()
+ *     cdef DTYPE_t length = <DTYPE_t> ceil(total_size / <double> block_size)             # <<<<<<<<<<<<<<
+ *     cdef np.ndarray[DTYPE_t, ndim=2] slice_indices = np.zeros([length, 2], dtype=DTYPE)
+ *     cdef DTYPE_t[:, :] slice_indices_view = slice_indices
+ */
+  /* Explicit zero-divisor guard: raises ZeroDivisionError instead of
+   * performing the C floating-point division by zero. */
+  if (unlikely(((double)__pyx_v_block_size) == 0)) {
+    PyErr_SetString(PyExc_ZeroDivisionError, "float division");
+    __PYX_ERR(0, 24, __pyx_L1_error)
+  }
+  __pyx_v_length = ((__pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t)ceil((((double)__pyx_v_total_size) / ((double)__pyx_v_block_size))));
+
+  /* "fairseq/data/token_block_utils_fast.pyx":25
+ *     cdef DTYPE_t total_size = sizes.sum()
+ *     cdef DTYPE_t length = <DTYPE_t> ceil(total_size / <double> block_size)
+ *     cdef np.ndarray[DTYPE_t, ndim=2] slice_indices = np.zeros([length, 2], dtype=DTYPE)             # <<<<<<<<<<<<<<
+ *     cdef DTYPE_t[:, :] slice_indices_view = slice_indices
+ *     cdef DTYPE_t i
+ */
+  /* Build the call np.zeros([length, 2], dtype=DTYPE): t_2 = callable,
+   * t_1 = positional tuple, t_3 = keyword dict. */
+  __Pyx_GetModuleGlobalName(__pyx_t_1, __pyx_n_s_np); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 25, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_zeros); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 25, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_2);
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+  __pyx_t_1 = __Pyx_PyInt_From_npy_int64(__pyx_v_length); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 25, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_t_3 = PyList_New(2); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 25, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_3);
+  __Pyx_GIVEREF(__pyx_t_1);
+  if (__Pyx_PyList_SET_ITEM(__pyx_t_3, 0, __pyx_t_1)) __PYX_ERR(0, 25, __pyx_L1_error);
+  __Pyx_INCREF(__pyx_int_2);
+  __Pyx_GIVEREF(__pyx_int_2);
+  if (__Pyx_PyList_SET_ITEM(__pyx_t_3, 1, __pyx_int_2)) __PYX_ERR(0, 25, __pyx_L1_error);
+  __pyx_t_1 = 0;
+  __pyx_t_1 = PyTuple_New(1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 25, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __Pyx_GIVEREF(__pyx_t_3);
+  if (__Pyx_PyTuple_SET_ITEM(__pyx_t_1, 0, __pyx_t_3)) __PYX_ERR(0, 25, __pyx_L1_error);
+  __pyx_t_3 = 0;
+  __pyx_t_3 = __Pyx_PyDict_NewPresized(1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 25, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_3);
+  __Pyx_GetModuleGlobalName(__pyx_t_6, __pyx_n_s_DTYPE); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 25, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_6);
+  if (PyDict_SetItem(__pyx_t_3, __pyx_n_s_dtype, __pyx_t_6) < 0) __PYX_ERR(0, 25, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0;
+  __pyx_t_6 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_t_1, __pyx_t_3); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 25, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_6);
+  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+  __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+  /* The result must be None or an ndarray before it is treated as one. */
+  if (!(likely(((__pyx_t_6) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_6, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 25, __pyx_L1_error)
+  __pyx_t_7 = ((PyArrayObject *)__pyx_t_6);
+  /* Acquire and validate a 2-dimensional DTYPE_t buffer on the new array; on
+   * failure slice_indices is set to None so the exit path can decref it. */
+  {
+    __Pyx_BufFmt_StackElem __pyx_stack[1];
+    if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_slice_indices.rcbuffer->pybuffer, (PyObject*)__pyx_t_7, &__Pyx_TypeInfo_nn___pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t, PyBUF_FORMAT| PyBUF_STRIDES, 2, 0, __pyx_stack) == -1)) {
+      __pyx_v_slice_indices = ((PyArrayObject *)Py_None); __Pyx_INCREF(Py_None); __pyx_pybuffernd_slice_indices.rcbuffer->pybuffer.buf = NULL;
+      __PYX_ERR(0, 25, __pyx_L1_error)
+    } else {__pyx_pybuffernd_slice_indices.diminfo[0].strides = __pyx_pybuffernd_slice_indices.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_slice_indices.diminfo[0].shape = __pyx_pybuffernd_slice_indices.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_slice_indices.diminfo[1].strides = __pyx_pybuffernd_slice_indices.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_slice_indices.diminfo[1].shape = __pyx_pybuffernd_slice_indices.rcbuffer->pybuffer.shape[1];
+    }
+  }
+  __pyx_t_7 = 0;
+  /* Ownership of the array reference moves from t_6 to slice_indices. */
+  __pyx_v_slice_indices = ((PyArrayObject *)__pyx_t_6);
+  __pyx_t_6 = 0;
+
+  /* "fairseq/data/token_block_utils_fast.pyx":26
+ *     cdef DTYPE_t length = <DTYPE_t> ceil(total_size / <double> block_size)
+ *     cdef np.ndarray[DTYPE_t, ndim=2] slice_indices = np.zeros([length, 2], dtype=DTYPE)
+ *     cdef DTYPE_t[:, :] slice_indices_view = slice_indices             # <<<<<<<<<<<<<<
+ *     cdef DTYPE_t i
+ *     cdef DTYPE_t start
+ */
+  /* Obtain a writable memoryview slice over the same array for raw stores. */
+  __pyx_t_8 = __Pyx_PyObject_to_MemoryviewSlice_dsds_nn___pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t(((PyObject *)__pyx_v_slice_indices), PyBUF_WRITABLE); if (unlikely(!__pyx_t_8.memview)) __PYX_ERR(0, 26, __pyx_L1_error)
+  __pyx_v_slice_indices_view = __pyx_t_8;
+  __pyx_t_8.memview = NULL;
+  __pyx_t_8.data = NULL;
+
+  /* "fairseq/data/token_block_utils_fast.pyx":30
+ *     cdef DTYPE_t start
+ *     cdef DTYPE_t end
+ *     for i in range(length):             # <<<<<<<<<<<<<<
+ *         start = i * block_size
+ *         end = min(start + block_size, total_size)
+ */
+  __pyx_t_5 = __pyx_v_length;
+  __pyx_t_9 = __pyx_t_5;
+  for (__pyx_t_10 = 0; __pyx_t_10 < __pyx_t_9; __pyx_t_10+=1) {
+    __pyx_v_i = __pyx_t_10;
+
+    /* "fairseq/data/token_block_utils_fast.pyx":31
+ *     cdef DTYPE_t end
+ *     for i in range(length):
+ *         start = i * block_size             # <<<<<<<<<<<<<<
+ *         end = min(start + block_size, total_size)
+ *         slice_indices_view[i][0] = start
+ */
+    __pyx_v_start = (__pyx_v_i * __pyx_v_block_size);
+
+    /* "fairseq/data/token_block_utils_fast.pyx":32
+ *     for i in range(length):
+ *         start = i * block_size
+ *         end = min(start + block_size, total_size)             # <<<<<<<<<<<<<<
+ *         slice_indices_view[i][0] = start
+ *         slice_indices_view[i][1] = end
+ */
+    /* Inlined min(): pick total_size only when it is strictly smaller than
+     * start + block_size. */
+    __pyx_t_11 = __pyx_v_total_size;
+    __pyx_t_12 = (__pyx_v_start + __pyx_v_block_size);
+    __pyx_t_14 = (__pyx_t_11 < __pyx_t_12);
+    if (__pyx_t_14) {
+      __pyx_t_13 = __pyx_t_11;
+    } else {
+      __pyx_t_13 = __pyx_t_12;
+    }
+    __pyx_v_end = __pyx_t_13;
+
+    /* "fairseq/data/token_block_utils_fast.pyx":33
+ *         start = i * block_size
+ *         end = min(start + block_size, total_size)
+ *         slice_indices_view[i][0] = start             # <<<<<<<<<<<<<<
+ *         slice_indices_view[i][1] = end
+ *     return slice_indices
+ */
+    /* Raw strided store into element [i][0]; no bounds or wraparound check is
+     * emitted for this access. */
+    __pyx_t_13 = __pyx_v_i;
+    __pyx_t_15 = 0;
+    *((__pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t *) ( /* dim=1 */ (( /* dim=0 */ (__pyx_v_slice_indices_view.data + __pyx_t_13 * __pyx_v_slice_indices_view.strides[0]) ) + __pyx_t_15 * __pyx_v_slice_indices_view.strides[1]) )) = __pyx_v_start;
+
+    /* "fairseq/data/token_block_utils_fast.pyx":34
+ *         end = min(start + block_size, total_size)
+ *         slice_indices_view[i][0] = start
+ *         slice_indices_view[i][1] = end             # <<<<<<<<<<<<<<
+ *     return slice_indices
+ * 
+ */
+    /* Same raw store for element [i][1]. */
+    __pyx_t_13 = __pyx_v_i;
+    __pyx_t_15 = 1;
+    *((__pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t *) ( /* dim=1 */ (( /* dim=0 */ (__pyx_v_slice_indices_view.data + __pyx_t_13 * __pyx_v_slice_indices_view.strides[0]) ) + __pyx_t_15 * __pyx_v_slice_indices_view.strides[1]) )) = __pyx_v_end;
+  }
+
+  /* "fairseq/data/token_block_utils_fast.pyx":35
+ *         slice_indices_view[i][0] = start
+ *         slice_indices_view[i][1] = end
+ *     return slice_indices             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  /* The return value takes its own reference; the local one is dropped at
+   * __pyx_L2 below. */
+  __Pyx_XDECREF((PyObject *)__pyx_r);
+  __Pyx_INCREF((PyObject *)__pyx_v_slice_indices);
+  __pyx_r = ((PyArrayObject *)__pyx_v_slice_indices);
+  goto __pyx_L0;
+
+  /* "fairseq/data/token_block_utils_fast.pyx":22
+ * @cython.wraparound(False)
+ * @cython.nonecheck(False)
+ * cdef np.ndarray[DTYPE_t, ndim=2] _get_slice_indices_none_mode(np.ndarray[DTYPE_t, ndim=1] sizes, int block_size):             # <<<<<<<<<<<<<<
+ *     cdef DTYPE_t total_size = sizes.sum()
+ *     cdef DTYPE_t length = <DTYPE_t> ceil(total_size / <double> block_size)
+ */
+
+  /* function exit code */
+  /* Error exit: drop temporaries, release both buffers while keeping the
+   * pending exception intact, add a traceback entry and return NULL. */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_XDECREF(__pyx_t_2);
+  __Pyx_XDECREF(__pyx_t_3);
+  __Pyx_XDECREF(__pyx_t_6);
+  __PYX_XCLEAR_MEMVIEW(&__pyx_t_8, 1);
+  { PyObject *__pyx_type, *__pyx_value, *__pyx_tb;
+    __Pyx_PyThreadState_declare
+    __Pyx_PyThreadState_assign
+    __Pyx_ErrFetch(&__pyx_type, &__pyx_value, &__pyx_tb);
+    __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_sizes.rcbuffer->pybuffer);
+    __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_slice_indices.rcbuffer->pybuffer);
+  __Pyx_ErrRestore(__pyx_type, __pyx_value, __pyx_tb);}
+  __Pyx_AddTraceback("fairseq.data.token_block_utils_fast._get_slice_indices_none_mode", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = 0;
+  goto __pyx_L2;
+  /* Normal exit: release both buffers. */
+  __pyx_L0:;
+  __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_sizes.rcbuffer->pybuffer);
+  __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_slice_indices.rcbuffer->pybuffer);
+  /* Shared cleanup for both exits: drop the local array and memoryview. */
+  __pyx_L2:;
+  __Pyx_XDECREF((PyObject *)__pyx_v_slice_indices);
+  __PYX_XCLEAR_MEMVIEW(&__pyx_v_slice_indices_view, 1);
+  __Pyx_XGIVEREF((PyObject *)__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "fairseq/data/token_block_utils_fast.pyx":38
+ * 
+ * 
+ * cdef np.ndarray[DTYPE_t, ndim=2] _fast_convert_to_np_array(list list_of_list):             # <<<<<<<<<<<<<<
+ *     """
+ *     Faster function to convert DTYPE_t list of list.
+ */
+
+/* Generated C for the cdef function `_fast_convert_to_np_array(list_of_list)`.
+ * Evaluates
+ *   flat = np.fromiter(chain.from_iterable(list_of_list), DTYPE, -1)
+ *   return flat.reshape((len(list_of_list), -1))
+ * Returns a new reference to the reshaped result (checked to be an ndarray or
+ * None), or NULL with a Python exception set. Passing None as list_of_list
+ * raises TypeError at the len() step if the earlier calls have not already
+ * failed. */
+static PyArrayObject *__pyx_f_7fairseq_4data_22token_block_utils_fast__fast_convert_to_np_array(PyObject *__pyx_v_list_of_list) {
+  PyArrayObject *__pyx_v_flat = 0;
+  __Pyx_LocalBuf_ND __pyx_pybuffernd_flat;
+  __Pyx_Buffer __pyx_pybuffer_flat;
+  PyArrayObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  PyObject *__pyx_t_2 = NULL;
+  PyObject *__pyx_t_3 = NULL;
+  PyObject *__pyx_t_4 = NULL;
+  PyObject *__pyx_t_5 = NULL;
+  unsigned int __pyx_t_6;
+  PyArrayObject *__pyx_t_7 = NULL;
+  Py_ssize_t __pyx_t_8;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("_fast_convert_to_np_array", 1);
+  /* Mark the buffer wrapper as "not acquired" so the release calls at the
+   * exits are safe even if acquisition never happened. */
+  __pyx_pybuffer_flat.pybuffer.buf = NULL;
+  __pyx_pybuffer_flat.refcount = 0;
+  __pyx_pybuffernd_flat.data = NULL;
+  __pyx_pybuffernd_flat.rcbuffer = &__pyx_pybuffer_flat;
+
+  /* "fairseq/data/token_block_utils_fast.pyx":43
+ *     Only fast when there are huge number of rows and low number of columns.
+ *     """
+ *     cdef np.ndarray[DTYPE_t, ndim=1] flat = np.fromiter(chain.from_iterable(list_of_list), DTYPE, -1)             # <<<<<<<<<<<<<<
+ *     return flat.reshape((len(list_of_list), -1))
+ * 
+ */
+  /* t_3 = np.fromiter, t_5 = chain.from_iterable (both looked up at call time). */
+  __Pyx_GetModuleGlobalName(__pyx_t_2, __pyx_n_s_np); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 43, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_2);
+  __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_2, __pyx_n_s_fromiter); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 43, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_3);
+  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+  __Pyx_GetModuleGlobalName(__pyx_t_4, __pyx_n_s_chain); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 43, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_4);
+  __pyx_t_5 = __Pyx_PyObject_GetAttrStr(__pyx_t_4, __pyx_n_s_from_iterable); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 43, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_5);
+  __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+  __pyx_t_4 = NULL;
+  __pyx_t_6 = 0;
+  /* If the callable is a bound method, split it into (function, self) so self
+   * can be passed as the first positional argument of the fast call. */
+  #if CYTHON_UNPACK_METHODS
+  if (unlikely(PyMethod_Check(__pyx_t_5))) {
+    __pyx_t_4 = PyMethod_GET_SELF(__pyx_t_5);
+    if (likely(__pyx_t_4)) {
+      PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_5);
+      __Pyx_INCREF(__pyx_t_4);
+      __Pyx_INCREF(function);
+      __Pyx_DECREF_SET(__pyx_t_5, function);
+      __pyx_t_6 = 1;
+    }
+  }
+  #endif
+  /* t_2 = chain.from_iterable(list_of_list) */
+  {
+    PyObject *__pyx_callargs[2] = {__pyx_t_4, __pyx_v_list_of_list};
+    __pyx_t_2 = __Pyx_PyObject_FastCall(__pyx_t_5, __pyx_callargs+1-__pyx_t_6, 1+__pyx_t_6);
+    __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0;
+    if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 43, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_2);
+    __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
+  }
+  __Pyx_GetModuleGlobalName(__pyx_t_5, __pyx_n_s_DTYPE); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 43, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_5);
+  __pyx_t_4 = NULL;
+  __pyx_t_6 = 0;
+  #if CYTHON_UNPACK_METHODS
+  if (unlikely(PyMethod_Check(__pyx_t_3))) {
+    __pyx_t_4 = PyMethod_GET_SELF(__pyx_t_3);
+    if (likely(__pyx_t_4)) {
+      PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_3);
+      __Pyx_INCREF(__pyx_t_4);
+      __Pyx_INCREF(function);
+      __Pyx_DECREF_SET(__pyx_t_3, function);
+      __pyx_t_6 = 1;
+    }
+  }
+  #endif
+  /* t_1 = np.fromiter(<chain iterator>, DTYPE, -1) */
+  {
+    PyObject *__pyx_callargs[4] = {__pyx_t_4, __pyx_t_2, __pyx_t_5, __pyx_int_neg_1};
+    __pyx_t_1 = __Pyx_PyObject_FastCall(__pyx_t_3, __pyx_callargs+1-__pyx_t_6, 3+__pyx_t_6);
+    __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0;
+    __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+    __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
+    if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 43, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_1);
+    __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+  }
+  /* The result must be None or an ndarray before it is treated as one. */
+  if (!(likely(((__pyx_t_1) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_1, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 43, __pyx_L1_error)
+  __pyx_t_7 = ((PyArrayObject *)__pyx_t_1);
+  /* Acquire and validate a 1-dimensional DTYPE_t buffer on the result; on
+   * failure `flat` is set to None so the exit path can decref it. */
+  {
+    __Pyx_BufFmt_StackElem __pyx_stack[1];
+    if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_flat.rcbuffer->pybuffer, (PyObject*)__pyx_t_7, &__Pyx_TypeInfo_nn___pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t, PyBUF_FORMAT| PyBUF_STRIDES, 1, 0, __pyx_stack) == -1)) {
+      __pyx_v_flat = ((PyArrayObject *)Py_None); __Pyx_INCREF(Py_None); __pyx_pybuffernd_flat.rcbuffer->pybuffer.buf = NULL;
+      __PYX_ERR(0, 43, __pyx_L1_error)
+    } else {__pyx_pybuffernd_flat.diminfo[0].strides = __pyx_pybuffernd_flat.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_flat.diminfo[0].shape = __pyx_pybuffernd_flat.rcbuffer->pybuffer.shape[0];
+    }
+  }
+  __pyx_t_7 = 0;
+  /* Ownership of the array reference moves from t_1 to `flat`. */
+  __pyx_v_flat = ((PyArrayObject *)__pyx_t_1);
+  __pyx_t_1 = 0;
+
+  /* "fairseq/data/token_block_utils_fast.pyx":44
+ *     """
+ *     cdef np.ndarray[DTYPE_t, ndim=1] flat = np.fromiter(chain.from_iterable(list_of_list), DTYPE, -1)
+ *     return flat.reshape((len(list_of_list), -1))             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  __Pyx_XDECREF((PyObject *)__pyx_r);
+  __pyx_t_3 = __Pyx_PyObject_GetAttrStr(((PyObject *)__pyx_v_flat), __pyx_n_s_reshape); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 44, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_3);
+  /* len() on a typed `list` argument: None is rejected explicitly because
+   * the size macro below reads the list object directly. */
+  if (unlikely(__pyx_v_list_of_list == Py_None)) {
+    PyErr_SetString(PyExc_TypeError, "object of type 'NoneType' has no len()");
+    __PYX_ERR(0, 44, __pyx_L1_error)
+  }
+  __pyx_t_8 = __Pyx_PyList_GET_SIZE(__pyx_v_list_of_list); if (unlikely(__pyx_t_8 == ((Py_ssize_t)-1))) __PYX_ERR(0, 44, __pyx_L1_error)
+  __pyx_t_5 = PyInt_FromSsize_t(__pyx_t_8); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 44, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_5);
+  /* t_2 = (len(list_of_list), -1), the shape tuple passed to reshape. */
+  __pyx_t_2 = PyTuple_New(2); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 44, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_2);
+  __Pyx_GIVEREF(__pyx_t_5);
+  if (__Pyx_PyTuple_SET_ITEM(__pyx_t_2, 0, __pyx_t_5)) __PYX_ERR(0, 44, __pyx_L1_error);
+  __Pyx_INCREF(__pyx_int_neg_1);
+  __Pyx_GIVEREF(__pyx_int_neg_1);
+  if (__Pyx_PyTuple_SET_ITEM(__pyx_t_2, 1, __pyx_int_neg_1)) __PYX_ERR(0, 44, __pyx_L1_error);
+  __pyx_t_5 = 0;
+  __pyx_t_5 = NULL;
+  __pyx_t_6 = 0;
+  #if CYTHON_UNPACK_METHODS
+  if (likely(PyMethod_Check(__pyx_t_3))) {
+    __pyx_t_5 = PyMethod_GET_SELF(__pyx_t_3);
+    if (likely(__pyx_t_5)) {
+      PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_3);
+      __Pyx_INCREF(__pyx_t_5);
+      __Pyx_INCREF(function);
+      __Pyx_DECREF_SET(__pyx_t_3, function);
+      __pyx_t_6 = 1;
+    }
+  }
+  #endif
+  /* t_1 = flat.reshape(<shape tuple>) */
+  {
+    PyObject *__pyx_callargs[2] = {__pyx_t_5, __pyx_t_2};
+    __pyx_t_1 = __Pyx_PyObject_FastCall(__pyx_t_3, __pyx_callargs+1-__pyx_t_6, 1+__pyx_t_6);
+    __Pyx_XDECREF(__pyx_t_5); __pyx_t_5 = 0;
+    __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+    if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 44, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_1);
+    __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+  }
+  if (!(likely(((__pyx_t_1) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_1, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 44, __pyx_L1_error)
+  /* Ownership of the reshaped array moves from t_1 to the return value. */
+  __pyx_r = ((PyArrayObject *)__pyx_t_1);
+  __pyx_t_1 = 0;
+  goto __pyx_L0;
+
+  /* "fairseq/data/token_block_utils_fast.pyx":38
+ * 
+ * 
+ * cdef np.ndarray[DTYPE_t, ndim=2] _fast_convert_to_np_array(list list_of_list):             # <<<<<<<<<<<<<<
+ *     """
+ *     Faster function to convert DTYPE_t list of list.
+ */
+
+  /* function exit code */
+  /* Error exit: drop temporaries, release the buffer while keeping the
+   * pending exception intact, add a traceback entry and return NULL. */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_XDECREF(__pyx_t_2);
+  __Pyx_XDECREF(__pyx_t_3);
+  __Pyx_XDECREF(__pyx_t_4);
+  __Pyx_XDECREF(__pyx_t_5);
+  { PyObject *__pyx_type, *__pyx_value, *__pyx_tb;
+    __Pyx_PyThreadState_declare
+    __Pyx_PyThreadState_assign
+    __Pyx_ErrFetch(&__pyx_type, &__pyx_value, &__pyx_tb);
+    __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_flat.rcbuffer->pybuffer);
+  __Pyx_ErrRestore(__pyx_type, __pyx_value, __pyx_tb);}
+  __Pyx_AddTraceback("fairseq.data.token_block_utils_fast._fast_convert_to_np_array", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = 0;
+  goto __pyx_L2;
+  /* Normal exit: release the buffer held on `flat`. */
+  __pyx_L0:;
+  __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_flat.rcbuffer->pybuffer);
+  /* Shared cleanup for both exits: drop the local reference to `flat`. */
+  __pyx_L2:;
+  __Pyx_XDECREF((PyObject *)__pyx_v_flat);
+  __Pyx_XGIVEREF((PyObject *)__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "fairseq/data/token_block_utils_fast.pyx":50
+ * @cython.wraparound(False)
+ * @cython.nonecheck(False)
+ * cpdef np.ndarray[DTYPE_t, ndim=2] _get_slice_indices_fast(np.ndarray[DTYPE_t, ndim=1] sizes, str break_mode, int block_size, int document_sep_len):             # <<<<<<<<<<<<<<
+ *     cdef DTYPE_t tok_idx = 0
+ *     cdef DTYPE_t sz_idx = 0
+ */
+
+static PyObject *__pyx_pw_7fairseq_4data_22token_block_utils_fast_1_get_slice_indices_fast(PyObject *__pyx_self, 
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+); /*proto*/
+static PyArrayObject *__pyx_f_7fairseq_4data_22token_block_utils_fast__get_slice_indices_fast(PyArrayObject *__pyx_v_sizes, PyObject *__pyx_v_break_mode, int __pyx_v_block_size, int __pyx_v_document_sep_len, CYTHON_UNUSED int __pyx_skip_dispatch) {
+  __pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t __pyx_v_tok_idx;
+  __pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t __pyx_v_sz_idx;
+  __pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t __pyx_v_curr_size;
+  CYTHON_UNUSED __pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t __pyx_v_i;
+  __Pyx_memviewslice __pyx_v_sizes_view = { 0, 0, { 0 }, { 0 }, { 0 } };
+  PyArrayObject *__pyx_v_slice_indices = 0;
+  PyObject *__pyx_v_slice_indices_list = 0;
+  PyObject *__pyx_v_cumsum = NULL;
+  __Pyx_LocalBuf_ND __pyx_pybuffernd_sizes;
+  __Pyx_Buffer __pyx_pybuffer_sizes;
+  __Pyx_LocalBuf_ND __pyx_pybuffernd_slice_indices;
+  __Pyx_Buffer __pyx_pybuffer_slice_indices;
+  PyArrayObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  __Pyx_memviewslice __pyx_t_1 = { 0, 0, { 0 }, { 0 }, { 0 } };
+  PyObject *__pyx_t_2 = NULL;
+  int __pyx_t_3;
+  int __pyx_t_4;
+  int __pyx_t_5;
+  PyObject *__pyx_t_6 = NULL;
+  PyObject *__pyx_t_7 = NULL;
+  PyObject *__pyx_t_8 = NULL;
+  Py_ssize_t __pyx_t_9;
+  __pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t __pyx_t_10;
+  PyObject *__pyx_t_11 = NULL;
+  PyObject *__pyx_t_12 = NULL;
+  int __pyx_t_13;
+  PyObject *__pyx_t_14 = NULL;
+  PyArrayObject *__pyx_t_15 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("_get_slice_indices_fast", 1);
+  __pyx_pybuffer_slice_indices.pybuffer.buf = NULL;
+  __pyx_pybuffer_slice_indices.refcount = 0;
+  __pyx_pybuffernd_slice_indices.data = NULL;
+  __pyx_pybuffernd_slice_indices.rcbuffer = &__pyx_pybuffer_slice_indices;
+  __pyx_pybuffer_sizes.pybuffer.buf = NULL;
+  __pyx_pybuffer_sizes.refcount = 0;
+  __pyx_pybuffernd_sizes.data = NULL;
+  __pyx_pybuffernd_sizes.rcbuffer = &__pyx_pybuffer_sizes;
+  {
+    __Pyx_BufFmt_StackElem __pyx_stack[1];
+    if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_sizes.rcbuffer->pybuffer, (PyObject*)__pyx_v_sizes, &__Pyx_TypeInfo_nn___pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t, PyBUF_FORMAT| PyBUF_STRIDES, 1, 0, __pyx_stack) == -1)) __PYX_ERR(0, 50, __pyx_L1_error)
+  }
+  __pyx_pybuffernd_sizes.diminfo[0].strides = __pyx_pybuffernd_sizes.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_sizes.diminfo[0].shape = __pyx_pybuffernd_sizes.rcbuffer->pybuffer.shape[0];
+
+  /* "fairseq/data/token_block_utils_fast.pyx":51
+ * @cython.nonecheck(False)
+ * cpdef np.ndarray[DTYPE_t, ndim=2] _get_slice_indices_fast(np.ndarray[DTYPE_t, ndim=1] sizes, str break_mode, int block_size, int document_sep_len):
+ *     cdef DTYPE_t tok_idx = 0             # <<<<<<<<<<<<<<
+ *     cdef DTYPE_t sz_idx = 0
+ *     cdef DTYPE_t curr_size = 0
+ */
+  __pyx_v_tok_idx = 0;
+
+  /* "fairseq/data/token_block_utils_fast.pyx":52
+ * cpdef np.ndarray[DTYPE_t, ndim=2] _get_slice_indices_fast(np.ndarray[DTYPE_t, ndim=1] sizes, str break_mode, int block_size, int document_sep_len):
+ *     cdef DTYPE_t tok_idx = 0
+ *     cdef DTYPE_t sz_idx = 0             # <<<<<<<<<<<<<<
+ *     cdef DTYPE_t curr_size = 0
+ *     cdef DTYPE_t i = 0
+ */
+  __pyx_v_sz_idx = 0;
+
+  /* "fairseq/data/token_block_utils_fast.pyx":53
+ *     cdef DTYPE_t tok_idx = 0
+ *     cdef DTYPE_t sz_idx = 0
+ *     cdef DTYPE_t curr_size = 0             # <<<<<<<<<<<<<<
+ *     cdef DTYPE_t i = 0
+ *     cdef DTYPE_t length
+ */
+  __pyx_v_curr_size = 0;
+
+  /* "fairseq/data/token_block_utils_fast.pyx":54
+ *     cdef DTYPE_t sz_idx = 0
+ *     cdef DTYPE_t curr_size = 0
+ *     cdef DTYPE_t i = 0             # <<<<<<<<<<<<<<
+ *     cdef DTYPE_t length
+ *     cdef DTYPE_t total_size
+ */
+  __pyx_v_i = 0;
+
+  /* "fairseq/data/token_block_utils_fast.pyx":57
+ *     cdef DTYPE_t length
+ *     cdef DTYPE_t total_size
+ *     cdef DTYPE_t[:] sizes_view = sizes             # <<<<<<<<<<<<<<
+ *     cdef np.ndarray[DTYPE_t, ndim=2] slice_indices
+ *     cdef list slice_indices_list = []
+ */
+  __pyx_t_1 = __Pyx_PyObject_to_MemoryviewSlice_ds_nn___pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t(((PyObject *)__pyx_v_sizes), PyBUF_WRITABLE); if (unlikely(!__pyx_t_1.memview)) __PYX_ERR(0, 57, __pyx_L1_error)
+  __pyx_v_sizes_view = __pyx_t_1;
+  __pyx_t_1.memview = NULL;
+  __pyx_t_1.data = NULL;
+
+  /* "fairseq/data/token_block_utils_fast.pyx":59
+ *     cdef DTYPE_t[:] sizes_view = sizes
+ *     cdef np.ndarray[DTYPE_t, ndim=2] slice_indices
+ *     cdef list slice_indices_list = []             # <<<<<<<<<<<<<<
+ * 
+ *     if break_mode is None or break_mode == 'none':
+ */
+  __pyx_t_2 = PyList_New(0); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 59, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_2);
+  __pyx_v_slice_indices_list = ((PyObject*)__pyx_t_2);
+  __pyx_t_2 = 0;
+
+  /* "fairseq/data/token_block_utils_fast.pyx":61
+ *     cdef list slice_indices_list = []
+ * 
+ *     if break_mode is None or break_mode == 'none':             # <<<<<<<<<<<<<<
+ *         slice_indices = _get_slice_indices_none_mode(sizes, block_size)
+ *     elif break_mode == 'complete':
+ */
+  __pyx_t_4 = (__pyx_v_break_mode == ((PyObject*)Py_None));
+  if (!__pyx_t_4) {
+  } else {
+    __pyx_t_3 = __pyx_t_4;
+    goto __pyx_L4_bool_binop_done;
+  }
+  __pyx_t_4 = (__Pyx_PyUnicode_Equals(__pyx_v_break_mode, __pyx_n_u_none, Py_EQ)); if (unlikely((__pyx_t_4 < 0))) __PYX_ERR(0, 61, __pyx_L1_error)
+  __pyx_t_3 = __pyx_t_4;
+  __pyx_L4_bool_binop_done:;
+  if (__pyx_t_3) {
+
+    /* "fairseq/data/token_block_utils_fast.pyx":62
+ * 
+ *     if break_mode is None or break_mode == 'none':
+ *         slice_indices = _get_slice_indices_none_mode(sizes, block_size)             # <<<<<<<<<<<<<<
+ *     elif break_mode == 'complete':
+ *         while sz_idx < len(sizes_view):
+ */
+    __pyx_t_2 = ((PyObject *)__pyx_f_7fairseq_4data_22token_block_utils_fast__get_slice_indices_none_mode(((PyArrayObject *)__pyx_v_sizes), __pyx_v_block_size)); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 62, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_2);
+    {
+      __Pyx_BufFmt_StackElem __pyx_stack[1];
+      __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_slice_indices.rcbuffer->pybuffer);
+      __pyx_t_5 = __Pyx_GetBufferAndValidate(&__pyx_pybuffernd_slice_indices.rcbuffer->pybuffer, (PyObject*)((PyArrayObject *)__pyx_t_2), &__Pyx_TypeInfo_nn___pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t, PyBUF_FORMAT| PyBUF_STRIDES, 2, 0, __pyx_stack);
+      if (unlikely(__pyx_t_5 < 0)) {
+        PyErr_Fetch(&__pyx_t_6, &__pyx_t_7, &__pyx_t_8);
+        if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_slice_indices.rcbuffer->pybuffer, (PyObject*)__pyx_v_slice_indices, &__Pyx_TypeInfo_nn___pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t, PyBUF_FORMAT| PyBUF_STRIDES, 2, 0, __pyx_stack) == -1)) {
+          Py_XDECREF(__pyx_t_6); Py_XDECREF(__pyx_t_7); Py_XDECREF(__pyx_t_8);
+          __Pyx_RaiseBufferFallbackError();
+        } else {
+          PyErr_Restore(__pyx_t_6, __pyx_t_7, __pyx_t_8);
+        }
+        __pyx_t_6 = __pyx_t_7 = __pyx_t_8 = 0;
+      }
+      __pyx_pybuffernd_slice_indices.diminfo[0].strides = __pyx_pybuffernd_slice_indices.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_slice_indices.diminfo[0].shape = __pyx_pybuffernd_slice_indices.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_slice_indices.diminfo[1].strides = __pyx_pybuffernd_slice_indices.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_slice_indices.diminfo[1].shape = __pyx_pybuffernd_slice_indices.rcbuffer->pybuffer.shape[1];
+      if (unlikely((__pyx_t_5 < 0))) __PYX_ERR(0, 62, __pyx_L1_error)
+    }
+    __pyx_v_slice_indices = ((PyArrayObject *)__pyx_t_2);
+    __pyx_t_2 = 0;
+
+    /* "fairseq/data/token_block_utils_fast.pyx":61
+ *     cdef list slice_indices_list = []
+ * 
+ *     if break_mode is None or break_mode == 'none':             # <<<<<<<<<<<<<<
+ *         slice_indices = _get_slice_indices_none_mode(sizes, block_size)
+ *     elif break_mode == 'complete':
+ */
+    goto __pyx_L3;
+  }
+
+  /* "fairseq/data/token_block_utils_fast.pyx":63
+ *     if break_mode is None or break_mode == 'none':
+ *         slice_indices = _get_slice_indices_none_mode(sizes, block_size)
+ *     elif break_mode == 'complete':             # <<<<<<<<<<<<<<
+ *         while sz_idx < len(sizes_view):
+ *             if curr_size + sizes_view[sz_idx] <= block_size or curr_size == 0:
+ */
+  __pyx_t_3 = (__Pyx_PyUnicode_Equals(__pyx_v_break_mode, __pyx_n_u_complete, Py_EQ)); if (unlikely((__pyx_t_3 < 0))) __PYX_ERR(0, 63, __pyx_L1_error)
+  if (__pyx_t_3) {
+
+    /* "fairseq/data/token_block_utils_fast.pyx":64
+ *         slice_indices = _get_slice_indices_none_mode(sizes, block_size)
+ *     elif break_mode == 'complete':
+ *         while sz_idx < len(sizes_view):             # <<<<<<<<<<<<<<
+ *             if curr_size + sizes_view[sz_idx] <= block_size or curr_size == 0:
+ *                 curr_size += sizes_view[sz_idx]
+ */
+    while (1) {
+      __pyx_t_9 = __Pyx_MemoryView_Len(__pyx_v_sizes_view); 
+      __pyx_t_3 = (__pyx_v_sz_idx < __pyx_t_9);
+      if (!__pyx_t_3) break;
+
+      /* "fairseq/data/token_block_utils_fast.pyx":65
+ *     elif break_mode == 'complete':
+ *         while sz_idx < len(sizes_view):
+ *             if curr_size + sizes_view[sz_idx] <= block_size or curr_size == 0:             # <<<<<<<<<<<<<<
+ *                 curr_size += sizes_view[sz_idx]
+ *                 sz_idx += 1
+ */
+      __pyx_t_10 = __pyx_v_sz_idx;
+      __pyx_t_4 = ((__pyx_v_curr_size + (*((__pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t *) ( /* dim=0 */ (__pyx_v_sizes_view.data + __pyx_t_10 * __pyx_v_sizes_view.strides[0]) )))) <= __pyx_v_block_size);
+      if (!__pyx_t_4) {
+      } else {
+        __pyx_t_3 = __pyx_t_4;
+        goto __pyx_L9_bool_binop_done;
+      }
+      __pyx_t_4 = (__pyx_v_curr_size == 0);
+      __pyx_t_3 = __pyx_t_4;
+      __pyx_L9_bool_binop_done:;
+      if (__pyx_t_3) {
+
+        /* "fairseq/data/token_block_utils_fast.pyx":66
+ *         while sz_idx < len(sizes_view):
+ *             if curr_size + sizes_view[sz_idx] <= block_size or curr_size == 0:
+ *                 curr_size += sizes_view[sz_idx]             # <<<<<<<<<<<<<<
+ *                 sz_idx += 1
+ *             else:
+ */
+        __pyx_t_10 = __pyx_v_sz_idx;
+        __pyx_v_curr_size = (__pyx_v_curr_size + (*((__pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t *) ( /* dim=0 */ (__pyx_v_sizes_view.data + __pyx_t_10 * __pyx_v_sizes_view.strides[0]) ))));
+
+        /* "fairseq/data/token_block_utils_fast.pyx":67
+ *             if curr_size + sizes_view[sz_idx] <= block_size or curr_size == 0:
+ *                 curr_size += sizes_view[sz_idx]
+ *                 sz_idx += 1             # <<<<<<<<<<<<<<
+ *             else:
+ *                 slice_indices_list.append((tok_idx, tok_idx + curr_size))
+ */
+        __pyx_v_sz_idx = (__pyx_v_sz_idx + 1);
+
+        /* "fairseq/data/token_block_utils_fast.pyx":65
+ *     elif break_mode == 'complete':
+ *         while sz_idx < len(sizes_view):
+ *             if curr_size + sizes_view[sz_idx] <= block_size or curr_size == 0:             # <<<<<<<<<<<<<<
+ *                 curr_size += sizes_view[sz_idx]
+ *                 sz_idx += 1
+ */
+        goto __pyx_L8;
+      }
+
+      /* "fairseq/data/token_block_utils_fast.pyx":69
+ *                 sz_idx += 1
+ *             else:
+ *                 slice_indices_list.append((tok_idx, tok_idx + curr_size))             # <<<<<<<<<<<<<<
+ *                 tok_idx += curr_size
+ *                 curr_size = 0
+ */
+      /*else*/ {
+        __pyx_t_2 = __Pyx_PyInt_From_npy_int64(__pyx_v_tok_idx); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 69, __pyx_L1_error)
+        __Pyx_GOTREF(__pyx_t_2);
+        __pyx_t_11 = __Pyx_PyInt_From_npy_int64((__pyx_v_tok_idx + __pyx_v_curr_size)); if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 69, __pyx_L1_error)
+        __Pyx_GOTREF(__pyx_t_11);
+        __pyx_t_12 = PyTuple_New(2); if (unlikely(!__pyx_t_12)) __PYX_ERR(0, 69, __pyx_L1_error)
+        __Pyx_GOTREF(__pyx_t_12);
+        __Pyx_GIVEREF(__pyx_t_2);
+        if (__Pyx_PyTuple_SET_ITEM(__pyx_t_12, 0, __pyx_t_2)) __PYX_ERR(0, 69, __pyx_L1_error);
+        __Pyx_GIVEREF(__pyx_t_11);
+        if (__Pyx_PyTuple_SET_ITEM(__pyx_t_12, 1, __pyx_t_11)) __PYX_ERR(0, 69, __pyx_L1_error);
+        __pyx_t_2 = 0;
+        __pyx_t_11 = 0;
+        __pyx_t_13 = __Pyx_PyList_Append(__pyx_v_slice_indices_list, __pyx_t_12); if (unlikely(__pyx_t_13 == ((int)-1))) __PYX_ERR(0, 69, __pyx_L1_error)
+        __Pyx_DECREF(__pyx_t_12); __pyx_t_12 = 0;
+
+        /* "fairseq/data/token_block_utils_fast.pyx":70
+ *             else:
+ *                 slice_indices_list.append((tok_idx, tok_idx + curr_size))
+ *                 tok_idx += curr_size             # <<<<<<<<<<<<<<
+ *                 curr_size = 0
+ *         if curr_size > 0:
+ */
+        __pyx_v_tok_idx = (__pyx_v_tok_idx + __pyx_v_curr_size);
+
+        /* "fairseq/data/token_block_utils_fast.pyx":71
+ *                 slice_indices_list.append((tok_idx, tok_idx + curr_size))
+ *                 tok_idx += curr_size
+ *                 curr_size = 0             # <<<<<<<<<<<<<<
+ *         if curr_size > 0:
+ *             slice_indices_list.append((tok_idx, tok_idx + curr_size))
+ */
+        __pyx_v_curr_size = 0;
+      }
+      __pyx_L8:;
+    }
+
+    /* "fairseq/data/token_block_utils_fast.pyx":72
+ *                 tok_idx += curr_size
+ *                 curr_size = 0
+ *         if curr_size > 0:             # <<<<<<<<<<<<<<
+ *             slice_indices_list.append((tok_idx, tok_idx + curr_size))
+ *         slice_indices = _fast_convert_to_np_array(slice_indices_list)
+ */
+    __pyx_t_3 = (__pyx_v_curr_size > 0);
+    if (__pyx_t_3) {
+
+      /* "fairseq/data/token_block_utils_fast.pyx":73
+ *                 curr_size = 0
+ *         if curr_size > 0:
+ *             slice_indices_list.append((tok_idx, tok_idx + curr_size))             # <<<<<<<<<<<<<<
+ *         slice_indices = _fast_convert_to_np_array(slice_indices_list)
+ *     elif break_mode == 'complete_doc':
+ */
+      __pyx_t_12 = __Pyx_PyInt_From_npy_int64(__pyx_v_tok_idx); if (unlikely(!__pyx_t_12)) __PYX_ERR(0, 73, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_12);
+      __pyx_t_11 = __Pyx_PyInt_From_npy_int64((__pyx_v_tok_idx + __pyx_v_curr_size)); if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 73, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_11);
+      __pyx_t_2 = PyTuple_New(2); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 73, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_2);
+      __Pyx_GIVEREF(__pyx_t_12);
+      if (__Pyx_PyTuple_SET_ITEM(__pyx_t_2, 0, __pyx_t_12)) __PYX_ERR(0, 73, __pyx_L1_error);
+      __Pyx_GIVEREF(__pyx_t_11);
+      if (__Pyx_PyTuple_SET_ITEM(__pyx_t_2, 1, __pyx_t_11)) __PYX_ERR(0, 73, __pyx_L1_error);
+      __pyx_t_12 = 0;
+      __pyx_t_11 = 0;
+      __pyx_t_13 = __Pyx_PyList_Append(__pyx_v_slice_indices_list, __pyx_t_2); if (unlikely(__pyx_t_13 == ((int)-1))) __PYX_ERR(0, 73, __pyx_L1_error)
+      __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+
+      /* "fairseq/data/token_block_utils_fast.pyx":72
+ *                 tok_idx += curr_size
+ *                 curr_size = 0
+ *         if curr_size > 0:             # <<<<<<<<<<<<<<
+ *             slice_indices_list.append((tok_idx, tok_idx + curr_size))
+ *         slice_indices = _fast_convert_to_np_array(slice_indices_list)
+ */
+    }
+
+    /* "fairseq/data/token_block_utils_fast.pyx":74
+ *         if curr_size > 0:
+ *             slice_indices_list.append((tok_idx, tok_idx + curr_size))
+ *         slice_indices = _fast_convert_to_np_array(slice_indices_list)             # <<<<<<<<<<<<<<
+ *     elif break_mode == 'complete_doc':
+ *         while sz_idx < len(sizes_view):
+ */
+    __pyx_t_2 = ((PyObject *)__pyx_f_7fairseq_4data_22token_block_utils_fast__fast_convert_to_np_array(__pyx_v_slice_indices_list)); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 74, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_2);
+    {
+      __Pyx_BufFmt_StackElem __pyx_stack[1];
+      __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_slice_indices.rcbuffer->pybuffer);
+      __pyx_t_5 = __Pyx_GetBufferAndValidate(&__pyx_pybuffernd_slice_indices.rcbuffer->pybuffer, (PyObject*)((PyArrayObject *)__pyx_t_2), &__Pyx_TypeInfo_nn___pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t, PyBUF_FORMAT| PyBUF_STRIDES, 2, 0, __pyx_stack);
+      if (unlikely(__pyx_t_5 < 0)) {
+        PyErr_Fetch(&__pyx_t_8, &__pyx_t_7, &__pyx_t_6);
+        if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_slice_indices.rcbuffer->pybuffer, (PyObject*)__pyx_v_slice_indices, &__Pyx_TypeInfo_nn___pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t, PyBUF_FORMAT| PyBUF_STRIDES, 2, 0, __pyx_stack) == -1)) {
+          Py_XDECREF(__pyx_t_8); Py_XDECREF(__pyx_t_7); Py_XDECREF(__pyx_t_6);
+          __Pyx_RaiseBufferFallbackError();
+        } else {
+          PyErr_Restore(__pyx_t_8, __pyx_t_7, __pyx_t_6);
+        }
+        __pyx_t_8 = __pyx_t_7 = __pyx_t_6 = 0;
+      }
+      __pyx_pybuffernd_slice_indices.diminfo[0].strides = __pyx_pybuffernd_slice_indices.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_slice_indices.diminfo[0].shape = __pyx_pybuffernd_slice_indices.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_slice_indices.diminfo[1].strides = __pyx_pybuffernd_slice_indices.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_slice_indices.diminfo[1].shape = __pyx_pybuffernd_slice_indices.rcbuffer->pybuffer.shape[1];
+      if (unlikely((__pyx_t_5 < 0))) __PYX_ERR(0, 74, __pyx_L1_error)
+    }
+    __pyx_v_slice_indices = ((PyArrayObject *)__pyx_t_2);
+    __pyx_t_2 = 0;
+
+    /* "fairseq/data/token_block_utils_fast.pyx":63
+ *     if break_mode is None or break_mode == 'none':
+ *         slice_indices = _get_slice_indices_none_mode(sizes, block_size)
+ *     elif break_mode == 'complete':             # <<<<<<<<<<<<<<
+ *         while sz_idx < len(sizes_view):
+ *             if curr_size + sizes_view[sz_idx] <= block_size or curr_size == 0:
+ */
+    goto __pyx_L3;
+  }
+
+  /* "fairseq/data/token_block_utils_fast.pyx":75
+ *             slice_indices_list.append((tok_idx, tok_idx + curr_size))
+ *         slice_indices = _fast_convert_to_np_array(slice_indices_list)
+ *     elif break_mode == 'complete_doc':             # <<<<<<<<<<<<<<
+ *         while sz_idx < len(sizes_view):
+ *             if (
+ */
+  __pyx_t_3 = (__Pyx_PyUnicode_Equals(__pyx_v_break_mode, __pyx_n_u_complete_doc, Py_EQ)); if (unlikely((__pyx_t_3 < 0))) __PYX_ERR(0, 75, __pyx_L1_error)
+  if (__pyx_t_3) {
+
+    /* "fairseq/data/token_block_utils_fast.pyx":76
+ *         slice_indices = _fast_convert_to_np_array(slice_indices_list)
+ *     elif break_mode == 'complete_doc':
+ *         while sz_idx < len(sizes_view):             # <<<<<<<<<<<<<<
+ *             if (
+ *                 (curr_size + sizes_view[sz_idx] <= block_size or curr_size == 0)
+ */
+    while (1) {
+      __pyx_t_9 = __Pyx_MemoryView_Len(__pyx_v_sizes_view); 
+      __pyx_t_3 = (__pyx_v_sz_idx < __pyx_t_9);
+      if (!__pyx_t_3) break;
+
+      /* "fairseq/data/token_block_utils_fast.pyx":78
+ *         while sz_idx < len(sizes_view):
+ *             if (
+ *                 (curr_size + sizes_view[sz_idx] <= block_size or curr_size == 0)             # <<<<<<<<<<<<<<
+ *                 # an empty sentence indicates end-of-document:
+ *                 and sizes_view[sz_idx] != document_sep_len
+ */
+      __pyx_t_10 = __pyx_v_sz_idx;
+      __pyx_t_4 = ((__pyx_v_curr_size + (*((__pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t *) ( /* dim=0 */ (__pyx_v_sizes_view.data + __pyx_t_10 * __pyx_v_sizes_view.strides[0]) )))) <= __pyx_v_block_size);
+      if (!__pyx_t_4) {
+      } else {
+        goto __pyx_L16_next_and;
+      }
+      __pyx_t_4 = (__pyx_v_curr_size == 0);
+      if (__pyx_t_4) {
+      } else {
+        __pyx_t_3 = __pyx_t_4;
+        goto __pyx_L15_bool_binop_done;
+      }
+      __pyx_L16_next_and:;
+
+      /* "fairseq/data/token_block_utils_fast.pyx":80
+ *                 (curr_size + sizes_view[sz_idx] <= block_size or curr_size == 0)
+ *                 # an empty sentence indicates end-of-document:
+ *                 and sizes_view[sz_idx] != document_sep_len             # <<<<<<<<<<<<<<
+ *             ):
+ *                 curr_size += sizes_view[sz_idx]
+ */
+      __pyx_t_10 = __pyx_v_sz_idx;
+      __pyx_t_4 = ((*((__pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t *) ( /* dim=0 */ (__pyx_v_sizes_view.data + __pyx_t_10 * __pyx_v_sizes_view.strides[0]) ))) != __pyx_v_document_sep_len);
+      __pyx_t_3 = __pyx_t_4;
+      __pyx_L15_bool_binop_done:;
+
+      /* "fairseq/data/token_block_utils_fast.pyx":77
+ *     elif break_mode == 'complete_doc':
+ *         while sz_idx < len(sizes_view):
+ *             if (             # <<<<<<<<<<<<<<
+ *                 (curr_size + sizes_view[sz_idx] <= block_size or curr_size == 0)
+ *                 # an empty sentence indicates end-of-document:
+ */
+      if (__pyx_t_3) {
+
+        /* "fairseq/data/token_block_utils_fast.pyx":82
+ *                 and sizes_view[sz_idx] != document_sep_len
+ *             ):
+ *                 curr_size += sizes_view[sz_idx]             # <<<<<<<<<<<<<<
+ *                 sz_idx += 1
+ *             else:
+ */
+        __pyx_t_10 = __pyx_v_sz_idx;
+        __pyx_v_curr_size = (__pyx_v_curr_size + (*((__pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t *) ( /* dim=0 */ (__pyx_v_sizes_view.data + __pyx_t_10 * __pyx_v_sizes_view.strides[0]) ))));
+
+        /* "fairseq/data/token_block_utils_fast.pyx":83
+ *             ):
+ *                 curr_size += sizes_view[sz_idx]
+ *                 sz_idx += 1             # <<<<<<<<<<<<<<
+ *             else:
+ *                 # Only keep non-empty documents.
+ */
+        __pyx_v_sz_idx = (__pyx_v_sz_idx + 1);
+
+        /* "fairseq/data/token_block_utils_fast.pyx":77
+ *     elif break_mode == 'complete_doc':
+ *         while sz_idx < len(sizes_view):
+ *             if (             # <<<<<<<<<<<<<<
+ *                 (curr_size + sizes_view[sz_idx] <= block_size or curr_size == 0)
+ *                 # an empty sentence indicates end-of-document:
+ */
+        goto __pyx_L14;
+      }
+
+      /* "fairseq/data/token_block_utils_fast.pyx":86
+ *             else:
+ *                 # Only keep non-empty documents.
+ *                 if curr_size > 1:             # <<<<<<<<<<<<<<
+ *                     slice_indices_list.append((tok_idx, tok_idx + curr_size))
+ *                 tok_idx += curr_size
+ */
+      /*else*/ {
+        __pyx_t_3 = (__pyx_v_curr_size > 1);
+        if (__pyx_t_3) {
+
+          /* "fairseq/data/token_block_utils_fast.pyx":87
+ *                 # Only keep non-empty documents.
+ *                 if curr_size > 1:
+ *                     slice_indices_list.append((tok_idx, tok_idx + curr_size))             # <<<<<<<<<<<<<<
+ *                 tok_idx += curr_size
+ *                 curr_size = 0
+ */
+          __pyx_t_2 = __Pyx_PyInt_From_npy_int64(__pyx_v_tok_idx); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 87, __pyx_L1_error)
+          __Pyx_GOTREF(__pyx_t_2);
+          __pyx_t_11 = __Pyx_PyInt_From_npy_int64((__pyx_v_tok_idx + __pyx_v_curr_size)); if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 87, __pyx_L1_error)
+          __Pyx_GOTREF(__pyx_t_11);
+          __pyx_t_12 = PyTuple_New(2); if (unlikely(!__pyx_t_12)) __PYX_ERR(0, 87, __pyx_L1_error)
+          __Pyx_GOTREF(__pyx_t_12);
+          __Pyx_GIVEREF(__pyx_t_2);
+          if (__Pyx_PyTuple_SET_ITEM(__pyx_t_12, 0, __pyx_t_2)) __PYX_ERR(0, 87, __pyx_L1_error);
+          __Pyx_GIVEREF(__pyx_t_11);
+          if (__Pyx_PyTuple_SET_ITEM(__pyx_t_12, 1, __pyx_t_11)) __PYX_ERR(0, 87, __pyx_L1_error);
+          __pyx_t_2 = 0;
+          __pyx_t_11 = 0;
+          __pyx_t_13 = __Pyx_PyList_Append(__pyx_v_slice_indices_list, __pyx_t_12); if (unlikely(__pyx_t_13 == ((int)-1))) __PYX_ERR(0, 87, __pyx_L1_error)
+          __Pyx_DECREF(__pyx_t_12); __pyx_t_12 = 0;
+
+          /* "fairseq/data/token_block_utils_fast.pyx":86
+ *             else:
+ *                 # Only keep non-empty documents.
+ *                 if curr_size > 1:             # <<<<<<<<<<<<<<
+ *                     slice_indices_list.append((tok_idx, tok_idx + curr_size))
+ *                 tok_idx += curr_size
+ */
+        }
+
+        /* "fairseq/data/token_block_utils_fast.pyx":88
+ *                 if curr_size > 1:
+ *                     slice_indices_list.append((tok_idx, tok_idx + curr_size))
+ *                 tok_idx += curr_size             # <<<<<<<<<<<<<<
+ *                 curr_size = 0
+ *                 if sizes_view[sz_idx] == document_sep_len:
+ */
+        __pyx_v_tok_idx = (__pyx_v_tok_idx + __pyx_v_curr_size);
+
+        /* "fairseq/data/token_block_utils_fast.pyx":89
+ *                     slice_indices_list.append((tok_idx, tok_idx + curr_size))
+ *                 tok_idx += curr_size
+ *                 curr_size = 0             # <<<<<<<<<<<<<<
+ *                 if sizes_view[sz_idx] == document_sep_len:
+ *                     tok_idx += sizes_view[sz_idx]
+ */
+        __pyx_v_curr_size = 0;
+
+        /* "fairseq/data/token_block_utils_fast.pyx":90
+ *                 tok_idx += curr_size
+ *                 curr_size = 0
+ *                 if sizes_view[sz_idx] == document_sep_len:             # <<<<<<<<<<<<<<
+ *                     tok_idx += sizes_view[sz_idx]
+ *                     sz_idx += 1
+ */
+        __pyx_t_10 = __pyx_v_sz_idx;
+        __pyx_t_3 = ((*((__pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t *) ( /* dim=0 */ (__pyx_v_sizes_view.data + __pyx_t_10 * __pyx_v_sizes_view.strides[0]) ))) == __pyx_v_document_sep_len);
+        if (__pyx_t_3) {
+
+          /* "fairseq/data/token_block_utils_fast.pyx":91
+ *                 curr_size = 0
+ *                 if sizes_view[sz_idx] == document_sep_len:
+ *                     tok_idx += sizes_view[sz_idx]             # <<<<<<<<<<<<<<
+ *                     sz_idx += 1
+ *         if curr_size > 1:
+ */
+          __pyx_t_10 = __pyx_v_sz_idx;
+          __pyx_v_tok_idx = (__pyx_v_tok_idx + (*((__pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t *) ( /* dim=0 */ (__pyx_v_sizes_view.data + __pyx_t_10 * __pyx_v_sizes_view.strides[0]) ))));
+
+          /* "fairseq/data/token_block_utils_fast.pyx":92
+ *                 if sizes_view[sz_idx] == document_sep_len:
+ *                     tok_idx += sizes_view[sz_idx]
+ *                     sz_idx += 1             # <<<<<<<<<<<<<<
+ *         if curr_size > 1:
+ *             slice_indices_list.append((tok_idx, tok_idx + curr_size))
+ */
+          __pyx_v_sz_idx = (__pyx_v_sz_idx + 1);
+
+          /* "fairseq/data/token_block_utils_fast.pyx":90
+ *                 tok_idx += curr_size
+ *                 curr_size = 0
+ *                 if sizes_view[sz_idx] == document_sep_len:             # <<<<<<<<<<<<<<
+ *                     tok_idx += sizes_view[sz_idx]
+ *                     sz_idx += 1
+ */
+        }
+      }
+      __pyx_L14:;
+    }
+
+    /* "fairseq/data/token_block_utils_fast.pyx":93
+ *                     tok_idx += sizes_view[sz_idx]
+ *                     sz_idx += 1
+ *         if curr_size > 1:             # <<<<<<<<<<<<<<
+ *             slice_indices_list.append((tok_idx, tok_idx + curr_size))
+ *         slice_indices = _fast_convert_to_np_array(slice_indices_list)
+ */
+    __pyx_t_3 = (__pyx_v_curr_size > 1);
+    if (__pyx_t_3) {
+
+      /* "fairseq/data/token_block_utils_fast.pyx":94
+ *                     sz_idx += 1
+ *         if curr_size > 1:
+ *             slice_indices_list.append((tok_idx, tok_idx + curr_size))             # <<<<<<<<<<<<<<
+ *         slice_indices = _fast_convert_to_np_array(slice_indices_list)
+ *     elif break_mode == 'eos':
+ */
+      __pyx_t_12 = __Pyx_PyInt_From_npy_int64(__pyx_v_tok_idx); if (unlikely(!__pyx_t_12)) __PYX_ERR(0, 94, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_12);
+      __pyx_t_11 = __Pyx_PyInt_From_npy_int64((__pyx_v_tok_idx + __pyx_v_curr_size)); if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 94, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_11);
+      __pyx_t_2 = PyTuple_New(2); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 94, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_2);
+      __Pyx_GIVEREF(__pyx_t_12);
+      if (__Pyx_PyTuple_SET_ITEM(__pyx_t_2, 0, __pyx_t_12)) __PYX_ERR(0, 94, __pyx_L1_error);
+      __Pyx_GIVEREF(__pyx_t_11);
+      if (__Pyx_PyTuple_SET_ITEM(__pyx_t_2, 1, __pyx_t_11)) __PYX_ERR(0, 94, __pyx_L1_error);
+      __pyx_t_12 = 0;
+      __pyx_t_11 = 0;
+      __pyx_t_13 = __Pyx_PyList_Append(__pyx_v_slice_indices_list, __pyx_t_2); if (unlikely(__pyx_t_13 == ((int)-1))) __PYX_ERR(0, 94, __pyx_L1_error)
+      __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+
+      /* "fairseq/data/token_block_utils_fast.pyx":93
+ *                     tok_idx += sizes_view[sz_idx]
+ *                     sz_idx += 1
+ *         if curr_size > 1:             # <<<<<<<<<<<<<<
+ *             slice_indices_list.append((tok_idx, tok_idx + curr_size))
+ *         slice_indices = _fast_convert_to_np_array(slice_indices_list)
+ */
+    }
+
+    /* "fairseq/data/token_block_utils_fast.pyx":95
+ *         if curr_size > 1:
+ *             slice_indices_list.append((tok_idx, tok_idx + curr_size))
+ *         slice_indices = _fast_convert_to_np_array(slice_indices_list)             # <<<<<<<<<<<<<<
+ *     elif break_mode == 'eos':
+ *         slice_indices = np.zeros((len(sizes), 2), dtype=DTYPE)
+ */
+    __pyx_t_2 = ((PyObject *)__pyx_f_7fairseq_4data_22token_block_utils_fast__fast_convert_to_np_array(__pyx_v_slice_indices_list)); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 95, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_2);
+    {
+      __Pyx_BufFmt_StackElem __pyx_stack[1];
+      __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_slice_indices.rcbuffer->pybuffer);
+      __pyx_t_5 = __Pyx_GetBufferAndValidate(&__pyx_pybuffernd_slice_indices.rcbuffer->pybuffer, (PyObject*)((PyArrayObject *)__pyx_t_2), &__Pyx_TypeInfo_nn___pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t, PyBUF_FORMAT| PyBUF_STRIDES, 2, 0, __pyx_stack);
+      if (unlikely(__pyx_t_5 < 0)) {
+        PyErr_Fetch(&__pyx_t_6, &__pyx_t_7, &__pyx_t_8);
+        if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_slice_indices.rcbuffer->pybuffer, (PyObject*)__pyx_v_slice_indices, &__Pyx_TypeInfo_nn___pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t, PyBUF_FORMAT| PyBUF_STRIDES, 2, 0, __pyx_stack) == -1)) {
+          Py_XDECREF(__pyx_t_6); Py_XDECREF(__pyx_t_7); Py_XDECREF(__pyx_t_8);
+          __Pyx_RaiseBufferFallbackError();
+        } else {
+          PyErr_Restore(__pyx_t_6, __pyx_t_7, __pyx_t_8);
+        }
+        __pyx_t_6 = __pyx_t_7 = __pyx_t_8 = 0;
+      }
+      __pyx_pybuffernd_slice_indices.diminfo[0].strides = __pyx_pybuffernd_slice_indices.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_slice_indices.diminfo[0].shape = __pyx_pybuffernd_slice_indices.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_slice_indices.diminfo[1].strides = __pyx_pybuffernd_slice_indices.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_slice_indices.diminfo[1].shape = __pyx_pybuffernd_slice_indices.rcbuffer->pybuffer.shape[1];
+      if (unlikely((__pyx_t_5 < 0))) __PYX_ERR(0, 95, __pyx_L1_error)
+    }
+    __pyx_v_slice_indices = ((PyArrayObject *)__pyx_t_2);
+    __pyx_t_2 = 0;
+
+    /* "fairseq/data/token_block_utils_fast.pyx":75
+ *             slice_indices_list.append((tok_idx, tok_idx + curr_size))
+ *         slice_indices = _fast_convert_to_np_array(slice_indices_list)
+ *     elif break_mode == 'complete_doc':             # <<<<<<<<<<<<<<
+ *         while sz_idx < len(sizes_view):
+ *             if (
+ */
+    goto __pyx_L3;
+  }
+
+  /* "fairseq/data/token_block_utils_fast.pyx":96
+ *             slice_indices_list.append((tok_idx, tok_idx + curr_size))
+ *         slice_indices = _fast_convert_to_np_array(slice_indices_list)
+ *     elif break_mode == 'eos':             # <<<<<<<<<<<<<<
+ *         slice_indices = np.zeros((len(sizes), 2), dtype=DTYPE)
+ *         cumsum = sizes.cumsum(axis=0)
+ */
+  __pyx_t_3 = (__Pyx_PyUnicode_Equals(__pyx_v_break_mode, __pyx_n_u_eos, Py_EQ)); if (unlikely((__pyx_t_3 < 0))) __PYX_ERR(0, 96, __pyx_L1_error)
+  if (likely(__pyx_t_3)) {
+
+    /* "fairseq/data/token_block_utils_fast.pyx":97
+ *         slice_indices = _fast_convert_to_np_array(slice_indices_list)
+ *     elif break_mode == 'eos':
+ *         slice_indices = np.zeros((len(sizes), 2), dtype=DTYPE)             # <<<<<<<<<<<<<<
+ *         cumsum = sizes.cumsum(axis=0)
+ *         slice_indices[1:, 0] = cumsum[:cumsum.shape[0] - 1]
+ */
+    __Pyx_GetModuleGlobalName(__pyx_t_2, __pyx_n_s_np); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 97, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_2);
+    __pyx_t_11 = __Pyx_PyObject_GetAttrStr(__pyx_t_2, __pyx_n_s_zeros); if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 97, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_11);
+    __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+    __pyx_t_9 = PyObject_Length(((PyObject *)__pyx_v_sizes)); if (unlikely(__pyx_t_9 == ((Py_ssize_t)-1))) __PYX_ERR(0, 97, __pyx_L1_error)
+    __pyx_t_2 = PyInt_FromSsize_t(__pyx_t_9); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 97, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_2);
+    __pyx_t_12 = PyTuple_New(2); if (unlikely(!__pyx_t_12)) __PYX_ERR(0, 97, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_12);
+    __Pyx_GIVEREF(__pyx_t_2);
+    if (__Pyx_PyTuple_SET_ITEM(__pyx_t_12, 0, __pyx_t_2)) __PYX_ERR(0, 97, __pyx_L1_error);
+    __Pyx_INCREF(__pyx_int_2);
+    __Pyx_GIVEREF(__pyx_int_2);
+    if (__Pyx_PyTuple_SET_ITEM(__pyx_t_12, 1, __pyx_int_2)) __PYX_ERR(0, 97, __pyx_L1_error);
+    __pyx_t_2 = 0;
+    __pyx_t_2 = PyTuple_New(1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 97, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_2);
+    __Pyx_GIVEREF(__pyx_t_12);
+    if (__Pyx_PyTuple_SET_ITEM(__pyx_t_2, 0, __pyx_t_12)) __PYX_ERR(0, 97, __pyx_L1_error);
+    __pyx_t_12 = 0;
+    __pyx_t_12 = __Pyx_PyDict_NewPresized(1); if (unlikely(!__pyx_t_12)) __PYX_ERR(0, 97, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_12);
+    __Pyx_GetModuleGlobalName(__pyx_t_14, __pyx_n_s_DTYPE); if (unlikely(!__pyx_t_14)) __PYX_ERR(0, 97, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_14);
+    if (PyDict_SetItem(__pyx_t_12, __pyx_n_s_dtype, __pyx_t_14) < 0) __PYX_ERR(0, 97, __pyx_L1_error)
+    __Pyx_DECREF(__pyx_t_14); __pyx_t_14 = 0;
+    __pyx_t_14 = __Pyx_PyObject_Call(__pyx_t_11, __pyx_t_2, __pyx_t_12); if (unlikely(!__pyx_t_14)) __PYX_ERR(0, 97, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_14);
+    __Pyx_DECREF(__pyx_t_11); __pyx_t_11 = 0;
+    __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+    __Pyx_DECREF(__pyx_t_12); __pyx_t_12 = 0;
+    if (!(likely(((__pyx_t_14) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_14, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 97, __pyx_L1_error)
+    __pyx_t_15 = ((PyArrayObject *)__pyx_t_14);
+    {
+      __Pyx_BufFmt_StackElem __pyx_stack[1];
+      __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_slice_indices.rcbuffer->pybuffer);
+      __pyx_t_5 = __Pyx_GetBufferAndValidate(&__pyx_pybuffernd_slice_indices.rcbuffer->pybuffer, (PyObject*)__pyx_t_15, &__Pyx_TypeInfo_nn___pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t, PyBUF_FORMAT| PyBUF_STRIDES, 2, 0, __pyx_stack);
+      if (unlikely(__pyx_t_5 < 0)) {
+        PyErr_Fetch(&__pyx_t_8, &__pyx_t_7, &__pyx_t_6);
+        if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_slice_indices.rcbuffer->pybuffer, (PyObject*)__pyx_v_slice_indices, &__Pyx_TypeInfo_nn___pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t, PyBUF_FORMAT| PyBUF_STRIDES, 2, 0, __pyx_stack) == -1)) {
+          Py_XDECREF(__pyx_t_8); Py_XDECREF(__pyx_t_7); Py_XDECREF(__pyx_t_6);
+          __Pyx_RaiseBufferFallbackError();
+        } else {
+          PyErr_Restore(__pyx_t_8, __pyx_t_7, __pyx_t_6);
+        }
+        __pyx_t_8 = __pyx_t_7 = __pyx_t_6 = 0;
+      }
+      __pyx_pybuffernd_slice_indices.diminfo[0].strides = __pyx_pybuffernd_slice_indices.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_slice_indices.diminfo[0].shape = __pyx_pybuffernd_slice_indices.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_slice_indices.diminfo[1].strides = __pyx_pybuffernd_slice_indices.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_slice_indices.diminfo[1].shape = __pyx_pybuffernd_slice_indices.rcbuffer->pybuffer.shape[1];
+      if (unlikely((__pyx_t_5 < 0))) __PYX_ERR(0, 97, __pyx_L1_error)
+    }
+    __pyx_t_15 = 0;
+    __pyx_v_slice_indices = ((PyArrayObject *)__pyx_t_14);
+    __pyx_t_14 = 0;
+
+    /* "fairseq/data/token_block_utils_fast.pyx":98
+ *     elif break_mode == 'eos':
+ *         slice_indices = np.zeros((len(sizes), 2), dtype=DTYPE)
+ *         cumsum = sizes.cumsum(axis=0)             # <<<<<<<<<<<<<<
+ *         slice_indices[1:, 0] = cumsum[:cumsum.shape[0] - 1]
+ *         slice_indices[:, 1] = cumsum
+ */
+    __pyx_t_14 = __Pyx_PyObject_GetAttrStr(((PyObject *)__pyx_v_sizes), __pyx_n_s_cumsum); if (unlikely(!__pyx_t_14)) __PYX_ERR(0, 98, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_14);
+    __pyx_t_12 = __Pyx_PyDict_NewPresized(1); if (unlikely(!__pyx_t_12)) __PYX_ERR(0, 98, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_12);
+    if (PyDict_SetItem(__pyx_t_12, __pyx_n_s_axis, __pyx_int_0) < 0) __PYX_ERR(0, 98, __pyx_L1_error)
+    __pyx_t_2 = __Pyx_PyObject_Call(__pyx_t_14, __pyx_empty_tuple, __pyx_t_12); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 98, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_2);
+    __Pyx_DECREF(__pyx_t_14); __pyx_t_14 = 0;
+    __Pyx_DECREF(__pyx_t_12); __pyx_t_12 = 0;
+    __pyx_v_cumsum = __pyx_t_2;
+    __pyx_t_2 = 0;
+
+    /* "fairseq/data/token_block_utils_fast.pyx":99
+ *         slice_indices = np.zeros((len(sizes), 2), dtype=DTYPE)
+ *         cumsum = sizes.cumsum(axis=0)
+ *         slice_indices[1:, 0] = cumsum[:cumsum.shape[0] - 1]             # <<<<<<<<<<<<<<
+ *         slice_indices[:, 1] = cumsum
+ *     else:
+ */
+    __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_v_cumsum, __pyx_n_s_shape); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 99, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_2);
+    __pyx_t_12 = __Pyx_GetItemInt(__pyx_t_2, 0, long, 1, __Pyx_PyInt_From_long, 0, 0, 0); if (unlikely(!__pyx_t_12)) __PYX_ERR(0, 99, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_12);
+    __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+    __pyx_t_2 = __Pyx_PyInt_SubtractObjC(__pyx_t_12, __pyx_int_1, 1, 0, 0); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 99, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_2);
+    __Pyx_DECREF(__pyx_t_12); __pyx_t_12 = 0;
+    __pyx_t_12 = __Pyx_PyObject_GetSlice(__pyx_v_cumsum, 0, 0, NULL, &__pyx_t_2, NULL, 0, 0, 0); if (unlikely(!__pyx_t_12)) __PYX_ERR(0, 99, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_12);
+    __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+    if (unlikely((PyObject_SetItem(((PyObject *)__pyx_v_slice_indices), __pyx_tuple__12, __pyx_t_12) < 0))) __PYX_ERR(0, 99, __pyx_L1_error)
+    __Pyx_DECREF(__pyx_t_12); __pyx_t_12 = 0;
+
+    /* "fairseq/data/token_block_utils_fast.pyx":100
+ *         cumsum = sizes.cumsum(axis=0)
+ *         slice_indices[1:, 0] = cumsum[:cumsum.shape[0] - 1]
+ *         slice_indices[:, 1] = cumsum             # <<<<<<<<<<<<<<
+ *     else:
+ *         raise ValueError('Invalid break_mode: ' + break_mode)
+ */
+    if (unlikely((PyObject_SetItem(((PyObject *)__pyx_v_slice_indices), __pyx_tuple__13, __pyx_v_cumsum) < 0))) __PYX_ERR(0, 100, __pyx_L1_error)
+
+    /* "fairseq/data/token_block_utils_fast.pyx":96
+ *             slice_indices_list.append((tok_idx, tok_idx + curr_size))
+ *         slice_indices = _fast_convert_to_np_array(slice_indices_list)
+ *     elif break_mode == 'eos':             # <<<<<<<<<<<<<<
+ *         slice_indices = np.zeros((len(sizes), 2), dtype=DTYPE)
+ *         cumsum = sizes.cumsum(axis=0)
+ */
+    goto __pyx_L3;
+  }
+
+  /* "fairseq/data/token_block_utils_fast.pyx":102
+ *         slice_indices[:, 1] = cumsum
+ *     else:
+ *         raise ValueError('Invalid break_mode: ' + break_mode)             # <<<<<<<<<<<<<<
+ *     return slice_indices
+ * 
+ */
+  /*else*/ {
+    __pyx_t_12 = __Pyx_PyUnicode_ConcatSafe(__pyx_kp_u_Invalid_break_mode, __pyx_v_break_mode); if (unlikely(!__pyx_t_12)) __PYX_ERR(0, 102, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_12);
+    __pyx_t_2 = __Pyx_PyObject_CallOneArg(__pyx_builtin_ValueError, __pyx_t_12); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 102, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_2);
+    __Pyx_DECREF(__pyx_t_12); __pyx_t_12 = 0;
+    __Pyx_Raise(__pyx_t_2, 0, 0, 0);
+    __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+    __PYX_ERR(0, 102, __pyx_L1_error)
+  }
+  __pyx_L3:;
+
+  /* "fairseq/data/token_block_utils_fast.pyx":103
+ *     else:
+ *         raise ValueError('Invalid break_mode: ' + break_mode)
+ *     return slice_indices             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  __Pyx_XDECREF((PyObject *)__pyx_r);
+  __Pyx_INCREF((PyObject *)__pyx_v_slice_indices);
+  __pyx_r = ((PyArrayObject *)__pyx_v_slice_indices);
+  goto __pyx_L0;
+
+  /* "fairseq/data/token_block_utils_fast.pyx":50
+ * @cython.wraparound(False)
+ * @cython.nonecheck(False)
+ * cpdef np.ndarray[DTYPE_t, ndim=2] _get_slice_indices_fast(np.ndarray[DTYPE_t, ndim=1] sizes, str break_mode, int block_size, int document_sep_len):             # <<<<<<<<<<<<<<
+ *     cdef DTYPE_t tok_idx = 0
+ *     cdef DTYPE_t sz_idx = 0
+ */
+
+  /* function exit code */
+  __pyx_L1_error:;
+  __PYX_XCLEAR_MEMVIEW(&__pyx_t_1, 1);
+  __Pyx_XDECREF(__pyx_t_2);
+  __Pyx_XDECREF(__pyx_t_11);
+  __Pyx_XDECREF(__pyx_t_12);
+  __Pyx_XDECREF(__pyx_t_14);
+  { PyObject *__pyx_type, *__pyx_value, *__pyx_tb;
+    __Pyx_PyThreadState_declare
+    __Pyx_PyThreadState_assign
+    __Pyx_ErrFetch(&__pyx_type, &__pyx_value, &__pyx_tb);
+    __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_sizes.rcbuffer->pybuffer);
+    __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_slice_indices.rcbuffer->pybuffer);
+  __Pyx_ErrRestore(__pyx_type, __pyx_value, __pyx_tb);}
+  __Pyx_AddTraceback("fairseq.data.token_block_utils_fast._get_slice_indices_fast", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = 0;
+  goto __pyx_L2;
+  __pyx_L0:;
+  __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_sizes.rcbuffer->pybuffer);
+  __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_slice_indices.rcbuffer->pybuffer);
+  __pyx_L2:;
+  __PYX_XCLEAR_MEMVIEW(&__pyx_v_sizes_view, 1);
+  __Pyx_XDECREF((PyObject *)__pyx_v_slice_indices);
+  __Pyx_XDECREF(__pyx_v_slice_indices_list);
+  __Pyx_XDECREF(__pyx_v_cumsum);
+  __Pyx_XGIVEREF((PyObject *)__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* Python wrapper: CPython-callable entry point for _get_slice_indices_fast. It unpacks the four arguments (sizes, break_mode, block_size, document_sep_len) from positional and/or keyword form, converts the two integers to C int, type-checks sizes and break_mode, and forwards to the __pyx_pf_ implementation function. Returns NULL on any failure. */
+static PyObject *__pyx_pw_7fairseq_4data_22token_block_utils_fast_1_get_slice_indices_fast(PyObject *__pyx_self, 
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+); /*proto*/
+static PyMethodDef __pyx_mdef_7fairseq_4data_22token_block_utils_fast_1_get_slice_indices_fast = {"_get_slice_indices_fast", (PyCFunction)(void*)(__Pyx_PyCFunction_FastCallWithKeywords)__pyx_pw_7fairseq_4data_22token_block_utils_fast_1_get_slice_indices_fast, __Pyx_METH_FASTCALL|METH_KEYWORDS, 0}; /* method-table entry: Python-visible name, wrapper pointer, calling-convention flags, and 0 in the last field (no docstring) */
+static PyObject *__pyx_pw_7fairseq_4data_22token_block_utils_fast_1_get_slice_indices_fast(PyObject *__pyx_self, 
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+) {
+  PyArrayObject *__pyx_v_sizes = 0;
+  PyObject *__pyx_v_break_mode = 0;
+  int __pyx_v_block_size;
+  int __pyx_v_document_sep_len;
+  #if !CYTHON_METH_FASTCALL
+  CYTHON_UNUSED Py_ssize_t __pyx_nargs;
+  #endif
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject* values[4] = {0,0,0,0}; /* one slot per declared parameter, in signature order */
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("_get_slice_indices_fast (wrapper)", 0);
+  #if !CYTHON_METH_FASTCALL
+  #if CYTHON_ASSUME_SAFE_MACROS
+  __pyx_nargs = PyTuple_GET_SIZE(__pyx_args); /* tuple calling convention: positional count comes from the args tuple */
+  #else
+  __pyx_nargs = PyTuple_Size(__pyx_args); if (unlikely(__pyx_nargs < 0)) return NULL;
+  #endif
+  #endif
+  __pyx_kwvalues = __Pyx_KwValues_FASTCALL(__pyx_args, __pyx_nargs);
+  {
+    PyObject **__pyx_pyargnames[] = {&__pyx_n_s_sizes,&__pyx_n_s_break_mode,&__pyx_n_s_block_size,&__pyx_n_s_document_sep_len,0}; /* 0-terminated list of the accepted keyword names */
+    if (__pyx_kwds) { /* keywords were supplied: merge positional and keyword arguments */
+      Py_ssize_t kw_args;
+      switch (__pyx_nargs) { /* copy the positional arguments that were given; cases fall through from the highest index down to 0 */
+        case  4: values[3] = __Pyx_Arg_FASTCALL(__pyx_args, 3);
+        CYTHON_FALLTHROUGH;
+        case  3: values[2] = __Pyx_Arg_FASTCALL(__pyx_args, 2);
+        CYTHON_FALLTHROUGH;
+        case  2: values[1] = __Pyx_Arg_FASTCALL(__pyx_args, 1);
+        CYTHON_FALLTHROUGH;
+        case  1: values[0] = __Pyx_Arg_FASTCALL(__pyx_args, 0);
+        CYTHON_FALLTHROUGH;
+        case  0: break;
+        default: goto __pyx_L5_argtuple_error; /* any positional count outside 0..4 is rejected */
+      }
+      kw_args = __Pyx_NumKwargs_FASTCALL(__pyx_kwds);
+      switch (__pyx_nargs) { /* fill the remaining slots from keywords, starting at the first index not given positionally and falling through to the last */
+        case  0:
+        if (likely((values[0] = __Pyx_GetKwValue_FASTCALL(__pyx_kwds, __pyx_kwvalues, __pyx_n_s_sizes)) != 0)) {
+          (void)__Pyx_Arg_NewRef_FASTCALL(values[0]); /* NOTE(review): presumably acquires the reference that the cleanup loops later drop -- confirm against the macro definition */
+          kw_args--; /* one supplied keyword accounted for */
+        }
+        else if (unlikely(PyErr_Occurred())) __PYX_ERR(0, 50, __pyx_L3_error) /* the keyword lookup itself raised */
+        else goto __pyx_L5_argtuple_error; /* 'sizes' was given neither positionally nor by keyword */
+        CYTHON_FALLTHROUGH;
+        case  1:
+        if (likely((values[1] = __Pyx_GetKwValue_FASTCALL(__pyx_kwds, __pyx_kwvalues, __pyx_n_s_break_mode)) != 0)) {
+          (void)__Pyx_Arg_NewRef_FASTCALL(values[1]);
+          kw_args--;
+        }
+        else if (unlikely(PyErr_Occurred())) __PYX_ERR(0, 50, __pyx_L3_error)
+        else {
+          __Pyx_RaiseArgtupleInvalid("_get_slice_indices_fast", 1, 4, 4, 1); __PYX_ERR(0, 50, __pyx_L3_error) /* 'break_mode' missing */
+        }
+        CYTHON_FALLTHROUGH;
+        case  2:
+        if (likely((values[2] = __Pyx_GetKwValue_FASTCALL(__pyx_kwds, __pyx_kwvalues, __pyx_n_s_block_size)) != 0)) {
+          (void)__Pyx_Arg_NewRef_FASTCALL(values[2]);
+          kw_args--;
+        }
+        else if (unlikely(PyErr_Occurred())) __PYX_ERR(0, 50, __pyx_L3_error)
+        else {
+          __Pyx_RaiseArgtupleInvalid("_get_slice_indices_fast", 1, 4, 4, 2); __PYX_ERR(0, 50, __pyx_L3_error) /* 'block_size' missing */
+        }
+        CYTHON_FALLTHROUGH;
+        case  3:
+        if (likely((values[3] = __Pyx_GetKwValue_FASTCALL(__pyx_kwds, __pyx_kwvalues, __pyx_n_s_document_sep_len)) != 0)) {
+          (void)__Pyx_Arg_NewRef_FASTCALL(values[3]);
+          kw_args--;
+        }
+        else if (unlikely(PyErr_Occurred())) __PYX_ERR(0, 50, __pyx_L3_error)
+        else {
+          __Pyx_RaiseArgtupleInvalid("_get_slice_indices_fast", 1, 4, 4, 3); __PYX_ERR(0, 50, __pyx_L3_error) /* 'document_sep_len' missing */
+        }
+      }
+      if (unlikely(kw_args > 0)) { /* keywords remain after the four known names were matched */
+        const Py_ssize_t kwd_pos_args = __pyx_nargs;
+        if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_kwvalues, __pyx_pyargnames, 0, values + 0, kwd_pos_args, "_get_slice_indices_fast") < 0)) __PYX_ERR(0, 50, __pyx_L3_error) /* a negative result from the helper is treated as an error */
+      }
+    } else if (unlikely(__pyx_nargs != 4)) { /* no keywords: exactly 4 positional arguments are required */
+      goto __pyx_L5_argtuple_error;
+    } else {
+      values[0] = __Pyx_Arg_FASTCALL(__pyx_args, 0);
+      values[1] = __Pyx_Arg_FASTCALL(__pyx_args, 1);
+      values[2] = __Pyx_Arg_FASTCALL(__pyx_args, 2);
+      values[3] = __Pyx_Arg_FASTCALL(__pyx_args, 3);
+    }
+    __pyx_v_sizes = ((PyArrayObject *)values[0]); /* plain pointer cast; the ndarray type test runs after unpacking (__Pyx_ArgTypeTest below) */
+    __pyx_v_break_mode = ((PyObject*)values[1]); /* likewise checked against PyUnicode_Type after unpacking */
+    __pyx_v_block_size = __Pyx_PyInt_As_int(values[2]); if (unlikely((__pyx_v_block_size == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 50, __pyx_L3_error) /* -1 together with a pending exception signals a failed conversion */
+    __pyx_v_document_sep_len = __Pyx_PyInt_As_int(values[3]); if (unlikely((__pyx_v_document_sep_len == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 50, __pyx_L3_error)
+  }
+  goto __pyx_L6_skip;
+  __pyx_L5_argtuple_error:; /* wrong number of arguments: raise, then take the unpacking-error exit */
+  __Pyx_RaiseArgtupleInvalid("_get_slice_indices_fast", 1, 4, 4, __pyx_nargs); __PYX_ERR(0, 50, __pyx_L3_error)
+  __pyx_L6_skip:;
+  goto __pyx_L4_argument_unpacking_done;
+  __pyx_L3_error:; /* unpacking failed: release the collected argument slots, add a traceback entry, return NULL */
+  {
+    Py_ssize_t __pyx_temp;
+    for (__pyx_temp=0; __pyx_temp < (Py_ssize_t)(sizeof(values)/sizeof(values[0])); ++__pyx_temp) {
+      __Pyx_Arg_XDECREF_FASTCALL(values[__pyx_temp]);
+    }
+  }
+  __Pyx_AddTraceback("fairseq.data.token_block_utils_fast._get_slice_indices_fast", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __Pyx_RefNannyFinishContext();
+  return NULL;
+  __pyx_L4_argument_unpacking_done:;
+  if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_sizes), __pyx_ptype_5numpy_ndarray, 1, "sizes", 0))) __PYX_ERR(0, 50, __pyx_L1_error) /* sizes must pass the numpy.ndarray type test */
+  if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_break_mode), (&PyUnicode_Type), 1, "break_mode", 1))) __PYX_ERR(0, 50, __pyx_L1_error) /* break_mode must pass the unicode (str) type test */
+  __pyx_r = __pyx_pf_7fairseq_4data_22token_block_utils_fast__get_slice_indices_fast(__pyx_self, __pyx_v_sizes, __pyx_v_break_mode, __pyx_v_block_size, __pyx_v_document_sep_len); /* delegate to the implementation function; its result is returned as-is */
+
+  /* function exit code */
+  goto __pyx_L0;
+  __pyx_L1_error:; /* reached when one of the two type tests above fails */
+  __pyx_r = NULL;
+  __pyx_L0:; /* common exit for success and type-test failure: release the argument slots, then return */
+  {
+    Py_ssize_t __pyx_temp;
+    for (__pyx_temp=0; __pyx_temp < (Py_ssize_t)(sizeof(values)/sizeof(values[0])); ++__pyx_temp) {
+      __Pyx_Arg_XDECREF_FASTCALL(values[__pyx_temp]);
+    }
+  }
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static PyObject *__pyx_pf_7fairseq_4data_22token_block_utils_fast__get_slice_indices_fast(CYTHON_UNUSED PyObject *__pyx_self, PyArrayObject *__pyx_v_sizes, PyObject *__pyx_v_break_mode, int __pyx_v_block_size, int __pyx_v_document_sep_len) { /* Implementation behind the Python wrapper: acquires and validates a 1-dimensional DTYPE_t buffer on sizes, calls the C-level __pyx_f_ function with the already-converted arguments, and returns its result (NULL on error). */
+  __Pyx_LocalBuf_ND __pyx_pybuffernd_sizes;
+  __Pyx_Buffer __pyx_pybuffer_sizes;
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("_get_slice_indices_fast", 1);
+  __pyx_pybuffer_sizes.pybuffer.buf = NULL; /* initialise the buffer bookkeeping to an unacquired state before any release call can run */
+  __pyx_pybuffer_sizes.refcount = 0;
+  __pyx_pybuffernd_sizes.data = NULL;
+  __pyx_pybuffernd_sizes.rcbuffer = &__pyx_pybuffer_sizes;
+  {
+    __Pyx_BufFmt_StackElem __pyx_stack[1];
+    if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_sizes.rcbuffer->pybuffer, (PyObject*)__pyx_v_sizes, &__Pyx_TypeInfo_nn___pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t, PyBUF_FORMAT| PyBUF_STRIDES, 1, 0, __pyx_stack) == -1)) __PYX_ERR(0, 50, __pyx_L1_error) /* acquire the buffer of sizes and validate it against DTYPE_t with 1 dimension; -1 means failure */
+  }
+  __pyx_pybuffernd_sizes.diminfo[0].strides = __pyx_pybuffernd_sizes.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_sizes.diminfo[0].shape = __pyx_pybuffernd_sizes.rcbuffer->pybuffer.shape[0]; /* cache stride and shape of dimension 0 */
+  __Pyx_XDECREF(__pyx_r); /* __pyx_r is still NULL here (initialised above and not yet assigned) */
+  __pyx_t_1 = ((PyObject *)__pyx_f_7fairseq_4data_22token_block_utils_fast__get_slice_indices_fast(__pyx_v_sizes, __pyx_v_break_mode, __pyx_v_block_size, __pyx_v_document_sep_len, 0)); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 50, __pyx_L1_error) /* call the C-level implementation; a NULL result takes the error exit. NOTE(review): the trailing 0 is presumably the skip_dispatch flag -- confirm against the __pyx_f_ signature */
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_r = __pyx_t_1; /* move the result from the temporary into the return slot */
+  __pyx_t_1 = 0; /* clear the temporary so it is not released again */
+  goto __pyx_L0;
+
+  /* function exit code */
+  __pyx_L1_error:; /* error exit: drop the temporary, release the buffer while preserving the pending exception, add a traceback entry */
+  __Pyx_XDECREF(__pyx_t_1);
+  { PyObject *__pyx_type, *__pyx_value, *__pyx_tb;
+    __Pyx_PyThreadState_declare
+    __Pyx_PyThreadState_assign
+    __Pyx_ErrFetch(&__pyx_type, &__pyx_value, &__pyx_tb); /* set the pending exception aside while the buffer is released */
+    __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_sizes.rcbuffer->pybuffer);
+  __Pyx_ErrRestore(__pyx_type, __pyx_value, __pyx_tb);} /* put the original exception back */
+  __Pyx_AddTraceback("fairseq.data.token_block_utils_fast._get_slice_indices_fast", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  goto __pyx_L2; /* skip the success-path release; the buffer was already released above */
+  __pyx_L0:; /* success exit: release the buffer acquired on sizes */
+  __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_sizes.rcbuffer->pybuffer);
+  __pyx_L2:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "fairseq/data/token_block_utils_fast.pyx":109
+ * @cython.wraparound(False)
+ * @cython.nonecheck(False)
+ * cpdef np.ndarray[DTYPE_t, ndim=2] _get_block_to_dataset_index_fast(np.ndarray[DTYPE_t, ndim=1] sizes, np.ndarray[DTYPE_t, ndim=2] slice_indices):             # <<<<<<<<<<<<<<
+ *     cdef DTYPE_t start_ds_idx
+ *     cdef DTYPE_t start_offset
+ */
+
+static PyObject *__pyx_pw_7fairseq_4data_22token_block_utils_fast_3_get_block_to_dataset_index_fast(PyObject *__pyx_self, 
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+); /*proto*/
+static PyArrayObject *__pyx_f_7fairseq_4data_22token_block_utils_fast__get_block_to_dataset_index_fast(PyArrayObject *__pyx_v_sizes, PyArrayObject *__pyx_v_slice_indices, CYTHON_UNUSED int __pyx_skip_dispatch) {
+  __pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t __pyx_v_start_ds_idx;
+  __pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t __pyx_v_start_offset;
+  __pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t __pyx_v_end_ds_idx;
+  __pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t __pyx_v_i;
+  __pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t __pyx_v_s;
+  __pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t __pyx_v_e;
+  struct __pyx_obj_7fairseq_4data_22token_block_utils_fast_DatasetSearcher *__pyx_v_ds = 0;
+  PyArrayObject *__pyx_v_block_to_dataset_index = 0;
+  __Pyx_memviewslice __pyx_v_block_to_dataset_index_view = { 0, 0, { 0 }, { 0 }, { 0 } };
+  __Pyx_memviewslice __pyx_v_slice_indices_view = { 0, 0, { 0 }, { 0 }, { 0 } };
+  Py_ssize_t __pyx_v_x_max;
+  __Pyx_LocalBuf_ND __pyx_pybuffernd_block_to_dataset_index;
+  __Pyx_Buffer __pyx_pybuffer_block_to_dataset_index;
+  __Pyx_LocalBuf_ND __pyx_pybuffernd_sizes;
+  __Pyx_Buffer __pyx_pybuffer_sizes;
+  __Pyx_LocalBuf_ND __pyx_pybuffernd_slice_indices;
+  __Pyx_Buffer __pyx_pybuffer_slice_indices;
+  PyArrayObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  PyObject *__pyx_t_2 = NULL;
+  Py_ssize_t __pyx_t_3;
+  PyObject *__pyx_t_4 = NULL;
+  PyObject *__pyx_t_5 = NULL;
+  PyArrayObject *__pyx_t_6 = NULL;
+  __Pyx_memviewslice __pyx_t_7 = { 0, 0, { 0 }, { 0 }, { 0 } };
+  Py_ssize_t __pyx_t_8;
+  __pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t __pyx_t_9;
+  __pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t __pyx_t_10;
+  Py_ssize_t __pyx_t_11;
+  int __pyx_t_12;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("_get_block_to_dataset_index_fast", 1);
+  __pyx_pybuffer_block_to_dataset_index.pybuffer.buf = NULL;
+  __pyx_pybuffer_block_to_dataset_index.refcount = 0;
+  __pyx_pybuffernd_block_to_dataset_index.data = NULL;
+  __pyx_pybuffernd_block_to_dataset_index.rcbuffer = &__pyx_pybuffer_block_to_dataset_index;
+  __pyx_pybuffer_sizes.pybuffer.buf = NULL;
+  __pyx_pybuffer_sizes.refcount = 0;
+  __pyx_pybuffernd_sizes.data = NULL;
+  __pyx_pybuffernd_sizes.rcbuffer = &__pyx_pybuffer_sizes;
+  __pyx_pybuffer_slice_indices.pybuffer.buf = NULL;
+  __pyx_pybuffer_slice_indices.refcount = 0;
+  __pyx_pybuffernd_slice_indices.data = NULL;
+  __pyx_pybuffernd_slice_indices.rcbuffer = &__pyx_pybuffer_slice_indices;
+  {
+    __Pyx_BufFmt_StackElem __pyx_stack[1];
+    if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_sizes.rcbuffer->pybuffer, (PyObject*)__pyx_v_sizes, &__Pyx_TypeInfo_nn___pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t, PyBUF_FORMAT| PyBUF_STRIDES, 1, 0, __pyx_stack) == -1)) __PYX_ERR(0, 109, __pyx_L1_error)
+  }
+  __pyx_pybuffernd_sizes.diminfo[0].strides = __pyx_pybuffernd_sizes.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_sizes.diminfo[0].shape = __pyx_pybuffernd_sizes.rcbuffer->pybuffer.shape[0];
+  {
+    __Pyx_BufFmt_StackElem __pyx_stack[1];
+    if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_slice_indices.rcbuffer->pybuffer, (PyObject*)__pyx_v_slice_indices, &__Pyx_TypeInfo_nn___pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t, PyBUF_FORMAT| PyBUF_STRIDES, 2, 0, __pyx_stack) == -1)) __PYX_ERR(0, 109, __pyx_L1_error)
+  }
+  __pyx_pybuffernd_slice_indices.diminfo[0].strides = __pyx_pybuffernd_slice_indices.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_slice_indices.diminfo[0].shape = __pyx_pybuffernd_slice_indices.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_slice_indices.diminfo[1].strides = __pyx_pybuffernd_slice_indices.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_slice_indices.diminfo[1].shape = __pyx_pybuffernd_slice_indices.rcbuffer->pybuffer.shape[1];
+
+  /* "fairseq/data/token_block_utils_fast.pyx":116
+ *     cdef DTYPE_t s
+ *     cdef DTYPE_t e
+ *     cdef DatasetSearcher ds = DatasetSearcher(sizes)             # <<<<<<<<<<<<<<
+ *     cdef np.ndarray[DTYPE_t, ndim=2] block_to_dataset_index = np.zeros([len(slice_indices), 3], dtype=DTYPE)
+ *     cdef DTYPE_t[:, :] block_to_dataset_index_view = block_to_dataset_index
+ */
+  __pyx_t_1 = __Pyx_PyObject_CallOneArg(((PyObject *)__pyx_ptype_7fairseq_4data_22token_block_utils_fast_DatasetSearcher), ((PyObject *)__pyx_v_sizes)); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 116, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_v_ds = ((struct __pyx_obj_7fairseq_4data_22token_block_utils_fast_DatasetSearcher *)__pyx_t_1);
+  __pyx_t_1 = 0;
+
+  /* "fairseq/data/token_block_utils_fast.pyx":117
+ *     cdef DTYPE_t e
+ *     cdef DatasetSearcher ds = DatasetSearcher(sizes)
+ *     cdef np.ndarray[DTYPE_t, ndim=2] block_to_dataset_index = np.zeros([len(slice_indices), 3], dtype=DTYPE)             # <<<<<<<<<<<<<<
+ *     cdef DTYPE_t[:, :] block_to_dataset_index_view = block_to_dataset_index
+ *     cdef DTYPE_t[:, :] slice_indices_view = slice_indices
+ */
+  __Pyx_GetModuleGlobalName(__pyx_t_1, __pyx_n_s_np); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 117, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_zeros); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 117, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_2);
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+  __pyx_t_3 = PyObject_Length(((PyObject *)__pyx_v_slice_indices)); if (unlikely(__pyx_t_3 == ((Py_ssize_t)-1))) __PYX_ERR(0, 117, __pyx_L1_error)
+  __pyx_t_1 = PyInt_FromSsize_t(__pyx_t_3); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 117, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_t_4 = PyList_New(2); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 117, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_4);
+  __Pyx_GIVEREF(__pyx_t_1);
+  if (__Pyx_PyList_SET_ITEM(__pyx_t_4, 0, __pyx_t_1)) __PYX_ERR(0, 117, __pyx_L1_error);
+  __Pyx_INCREF(__pyx_int_3);
+  __Pyx_GIVEREF(__pyx_int_3);
+  if (__Pyx_PyList_SET_ITEM(__pyx_t_4, 1, __pyx_int_3)) __PYX_ERR(0, 117, __pyx_L1_error);
+  __pyx_t_1 = 0;
+  __pyx_t_1 = PyTuple_New(1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 117, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __Pyx_GIVEREF(__pyx_t_4);
+  if (__Pyx_PyTuple_SET_ITEM(__pyx_t_1, 0, __pyx_t_4)) __PYX_ERR(0, 117, __pyx_L1_error);
+  __pyx_t_4 = 0;
+  __pyx_t_4 = __Pyx_PyDict_NewPresized(1); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 117, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_4);
+  __Pyx_GetModuleGlobalName(__pyx_t_5, __pyx_n_s_DTYPE); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 117, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_5);
+  if (PyDict_SetItem(__pyx_t_4, __pyx_n_s_dtype, __pyx_t_5) < 0) __PYX_ERR(0, 117, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
+  __pyx_t_5 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_t_1, __pyx_t_4); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 117, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_5);
+  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+  __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+  if (!(likely(((__pyx_t_5) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_5, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 117, __pyx_L1_error)
+  __pyx_t_6 = ((PyArrayObject *)__pyx_t_5);
+  {
+    __Pyx_BufFmt_StackElem __pyx_stack[1];
+    if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_block_to_dataset_index.rcbuffer->pybuffer, (PyObject*)__pyx_t_6, &__Pyx_TypeInfo_nn___pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t, PyBUF_FORMAT| PyBUF_STRIDES, 2, 0, __pyx_stack) == -1)) {
+      __pyx_v_block_to_dataset_index = ((PyArrayObject *)Py_None); __Pyx_INCREF(Py_None); __pyx_pybuffernd_block_to_dataset_index.rcbuffer->pybuffer.buf = NULL;
+      __PYX_ERR(0, 117, __pyx_L1_error)
+    } else {__pyx_pybuffernd_block_to_dataset_index.diminfo[0].strides = __pyx_pybuffernd_block_to_dataset_index.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_block_to_dataset_index.diminfo[0].shape = __pyx_pybuffernd_block_to_dataset_index.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_block_to_dataset_index.diminfo[1].strides = __pyx_pybuffernd_block_to_dataset_index.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_block_to_dataset_index.diminfo[1].shape = __pyx_pybuffernd_block_to_dataset_index.rcbuffer->pybuffer.shape[1];
+    }
+  }
+  __pyx_t_6 = 0;
+  __pyx_v_block_to_dataset_index = ((PyArrayObject *)__pyx_t_5);
+  __pyx_t_5 = 0;
+
+  /* "fairseq/data/token_block_utils_fast.pyx":118
+ *     cdef DatasetSearcher ds = DatasetSearcher(sizes)
+ *     cdef np.ndarray[DTYPE_t, ndim=2] block_to_dataset_index = np.zeros([len(slice_indices), 3], dtype=DTYPE)
+ *     cdef DTYPE_t[:, :] block_to_dataset_index_view = block_to_dataset_index             # <<<<<<<<<<<<<<
+ *     cdef DTYPE_t[:, :] slice_indices_view = slice_indices
+ *     cdef Py_ssize_t x_max = slice_indices.shape[0]
+ */
+  __pyx_t_7 = __Pyx_PyObject_to_MemoryviewSlice_dsds_nn___pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t(((PyObject *)__pyx_v_block_to_dataset_index), PyBUF_WRITABLE); if (unlikely(!__pyx_t_7.memview)) __PYX_ERR(0, 118, __pyx_L1_error)
+  __pyx_v_block_to_dataset_index_view = __pyx_t_7;
+  __pyx_t_7.memview = NULL;
+  __pyx_t_7.data = NULL;
+
+  /* "fairseq/data/token_block_utils_fast.pyx":119
+ *     cdef np.ndarray[DTYPE_t, ndim=2] block_to_dataset_index = np.zeros([len(slice_indices), 3], dtype=DTYPE)
+ *     cdef DTYPE_t[:, :] block_to_dataset_index_view = block_to_dataset_index
+ *     cdef DTYPE_t[:, :] slice_indices_view = slice_indices             # <<<<<<<<<<<<<<
+ *     cdef Py_ssize_t x_max = slice_indices.shape[0]
+ * 
+ */
+  __pyx_t_7 = __Pyx_PyObject_to_MemoryviewSlice_dsds_nn___pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t(((PyObject *)__pyx_v_slice_indices), PyBUF_WRITABLE); if (unlikely(!__pyx_t_7.memview)) __PYX_ERR(0, 119, __pyx_L1_error)
+  __pyx_v_slice_indices_view = __pyx_t_7;
+  __pyx_t_7.memview = NULL;
+  __pyx_t_7.data = NULL;
+
+  /* "fairseq/data/token_block_utils_fast.pyx":120
+ *     cdef DTYPE_t[:, :] block_to_dataset_index_view = block_to_dataset_index
+ *     cdef DTYPE_t[:, :] slice_indices_view = slice_indices
+ *     cdef Py_ssize_t x_max = slice_indices.shape[0]             # <<<<<<<<<<<<<<
+ * 
+ *     for i in range(x_max):
+ */
+  __pyx_v_x_max = (__pyx_f_5numpy_7ndarray_5shape_shape(((PyArrayObject *)__pyx_v_slice_indices))[0]);
+
+  /* "fairseq/data/token_block_utils_fast.pyx":122
+ *     cdef Py_ssize_t x_max = slice_indices.shape[0]
+ * 
+ *     for i in range(x_max):             # <<<<<<<<<<<<<<
+ *         s = slice_indices_view[i][0]
+ *         e = slice_indices_view[i][1]
+ */
+  __pyx_t_3 = __pyx_v_x_max;
+  __pyx_t_8 = __pyx_t_3;
+  for (__pyx_t_9 = 0; __pyx_t_9 < __pyx_t_8; __pyx_t_9+=1) {
+    __pyx_v_i = __pyx_t_9;
+
+    /* "fairseq/data/token_block_utils_fast.pyx":123
+ * 
+ *     for i in range(x_max):
+ *         s = slice_indices_view[i][0]             # <<<<<<<<<<<<<<
+ *         e = slice_indices_view[i][1]
+ *         ds.seek(s)
+ */
+    __pyx_t_10 = __pyx_v_i;
+    __pyx_t_11 = 0;
+    __pyx_v_s = (*((__pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t *) ( /* dim=1 */ (( /* dim=0 */ (__pyx_v_slice_indices_view.data + __pyx_t_10 * __pyx_v_slice_indices_view.strides[0]) ) + __pyx_t_11 * __pyx_v_slice_indices_view.strides[1]) )));
+
+    /* "fairseq/data/token_block_utils_fast.pyx":124
+ *     for i in range(x_max):
+ *         s = slice_indices_view[i][0]
+ *         e = slice_indices_view[i][1]             # <<<<<<<<<<<<<<
+ *         ds.seek(s)
+ *         start_ds_idx = ds.current_index
+ */
+    __pyx_t_10 = __pyx_v_i;
+    __pyx_t_11 = 1;
+    __pyx_v_e = (*((__pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t *) ( /* dim=1 */ (( /* dim=0 */ (__pyx_v_slice_indices_view.data + __pyx_t_10 * __pyx_v_slice_indices_view.strides[0]) ) + __pyx_t_11 * __pyx_v_slice_indices_view.strides[1]) )));
+
+    /* "fairseq/data/token_block_utils_fast.pyx":125
+ *         s = slice_indices_view[i][0]
+ *         e = slice_indices_view[i][1]
+ *         ds.seek(s)             # <<<<<<<<<<<<<<
+ *         start_ds_idx = ds.current_index
+ *         start_offset = ds.current_offset
+ */
+    __pyx_t_5 = ((struct __pyx_vtabstruct_7fairseq_4data_22token_block_utils_fast_DatasetSearcher *)__pyx_v_ds->__pyx_vtab)->seek(__pyx_v_ds, __pyx_v_s); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 125, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_5);
+    __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
+
+    /* "fairseq/data/token_block_utils_fast.pyx":126
+ *         e = slice_indices_view[i][1]
+ *         ds.seek(s)
+ *         start_ds_idx = ds.current_index             # <<<<<<<<<<<<<<
+ *         start_offset = ds.current_offset
+ *         if e <= s:
+ */
+    __pyx_t_10 = __pyx_v_ds->current_index;
+    __pyx_v_start_ds_idx = __pyx_t_10;
+
+    /* "fairseq/data/token_block_utils_fast.pyx":127
+ *         ds.seek(s)
+ *         start_ds_idx = ds.current_index
+ *         start_offset = ds.current_offset             # <<<<<<<<<<<<<<
+ *         if e <= s:
+ *             end_ds_idx = start_ds_idx
+ */
+    __pyx_t_10 = __pyx_v_ds->current_offset;
+    __pyx_v_start_offset = __pyx_t_10;
+
+    /* "fairseq/data/token_block_utils_fast.pyx":128
+ *         start_ds_idx = ds.current_index
+ *         start_offset = ds.current_offset
+ *         if e <= s:             # <<<<<<<<<<<<<<
+ *             end_ds_idx = start_ds_idx
+ *         else:
+ */
+    __pyx_t_12 = (__pyx_v_e <= __pyx_v_s);
+    if (__pyx_t_12) {
+
+      /* "fairseq/data/token_block_utils_fast.pyx":129
+ *         start_offset = ds.current_offset
+ *         if e <= s:
+ *             end_ds_idx = start_ds_idx             # <<<<<<<<<<<<<<
+ *         else:
+ *             ds.seek(e - 1)
+ */
+      __pyx_v_end_ds_idx = __pyx_v_start_ds_idx;
+
+      /* "fairseq/data/token_block_utils_fast.pyx":128
+ *         start_ds_idx = ds.current_index
+ *         start_offset = ds.current_offset
+ *         if e <= s:             # <<<<<<<<<<<<<<
+ *             end_ds_idx = start_ds_idx
+ *         else:
+ */
+      goto __pyx_L5;
+    }
+
+    /* "fairseq/data/token_block_utils_fast.pyx":131
+ *             end_ds_idx = start_ds_idx
+ *         else:
+ *             ds.seek(e - 1)             # <<<<<<<<<<<<<<
+ *             end_ds_idx = ds.current_index
+ *         block_to_dataset_index_view[i][0] = start_ds_idx  # starting index in dataset
+ */
+    /*else*/ {
+      __pyx_t_5 = ((struct __pyx_vtabstruct_7fairseq_4data_22token_block_utils_fast_DatasetSearcher *)__pyx_v_ds->__pyx_vtab)->seek(__pyx_v_ds, (__pyx_v_e - 1)); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 131, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_5);
+      __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
+
+      /* "fairseq/data/token_block_utils_fast.pyx":132
+ *         else:
+ *             ds.seek(e - 1)
+ *             end_ds_idx = ds.current_index             # <<<<<<<<<<<<<<
+ *         block_to_dataset_index_view[i][0] = start_ds_idx  # starting index in dataset
+ *         block_to_dataset_index_view[i][1] = start_offset  # starting offset within starting index
+ */
+      __pyx_t_10 = __pyx_v_ds->current_index;
+      __pyx_v_end_ds_idx = __pyx_t_10;
+    }
+    __pyx_L5:;
+
+    /* "fairseq/data/token_block_utils_fast.pyx":133
+ *             ds.seek(e - 1)
+ *             end_ds_idx = ds.current_index
+ *         block_to_dataset_index_view[i][0] = start_ds_idx  # starting index in dataset             # <<<<<<<<<<<<<<
+ *         block_to_dataset_index_view[i][1] = start_offset  # starting offset within starting index
+ *         block_to_dataset_index_view[i][2] = end_ds_idx    # ending index in dataset
+ */
+    __pyx_t_10 = __pyx_v_i;
+    __pyx_t_11 = 0;
+    *((__pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t *) ( /* dim=1 */ (( /* dim=0 */ (__pyx_v_block_to_dataset_index_view.data + __pyx_t_10 * __pyx_v_block_to_dataset_index_view.strides[0]) ) + __pyx_t_11 * __pyx_v_block_to_dataset_index_view.strides[1]) )) = __pyx_v_start_ds_idx;
+
+    /* "fairseq/data/token_block_utils_fast.pyx":134
+ *             end_ds_idx = ds.current_index
+ *         block_to_dataset_index_view[i][0] = start_ds_idx  # starting index in dataset
+ *         block_to_dataset_index_view[i][1] = start_offset  # starting offset within starting index             # <<<<<<<<<<<<<<
+ *         block_to_dataset_index_view[i][2] = end_ds_idx    # ending index in dataset
+ *     return block_to_dataset_index
+ */
+    __pyx_t_10 = __pyx_v_i;
+    __pyx_t_11 = 1;
+    *((__pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t *) ( /* dim=1 */ (( /* dim=0 */ (__pyx_v_block_to_dataset_index_view.data + __pyx_t_10 * __pyx_v_block_to_dataset_index_view.strides[0]) ) + __pyx_t_11 * __pyx_v_block_to_dataset_index_view.strides[1]) )) = __pyx_v_start_offset;
+
+    /* "fairseq/data/token_block_utils_fast.pyx":135
+ *         block_to_dataset_index_view[i][0] = start_ds_idx  # starting index in dataset
+ *         block_to_dataset_index_view[i][1] = start_offset  # starting offset within starting index
+ *         block_to_dataset_index_view[i][2] = end_ds_idx    # ending index in dataset             # <<<<<<<<<<<<<<
+ *     return block_to_dataset_index
+ * 
+ */
+    __pyx_t_10 = __pyx_v_i;
+    __pyx_t_11 = 2;
+    *((__pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t *) ( /* dim=1 */ (( /* dim=0 */ (__pyx_v_block_to_dataset_index_view.data + __pyx_t_10 * __pyx_v_block_to_dataset_index_view.strides[0]) ) + __pyx_t_11 * __pyx_v_block_to_dataset_index_view.strides[1]) )) = __pyx_v_end_ds_idx;
+  }
+
+  /* "fairseq/data/token_block_utils_fast.pyx":136
+ *         block_to_dataset_index_view[i][1] = start_offset  # starting offset within starting index
+ *         block_to_dataset_index_view[i][2] = end_ds_idx    # ending index in dataset
+ *     return block_to_dataset_index             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  __Pyx_XDECREF((PyObject *)__pyx_r);
+  __Pyx_INCREF((PyObject *)__pyx_v_block_to_dataset_index);
+  __pyx_r = ((PyArrayObject *)__pyx_v_block_to_dataset_index);
+  goto __pyx_L0;
+
+  /* "fairseq/data/token_block_utils_fast.pyx":109
+ * @cython.wraparound(False)
+ * @cython.nonecheck(False)
+ * cpdef np.ndarray[DTYPE_t, ndim=2] _get_block_to_dataset_index_fast(np.ndarray[DTYPE_t, ndim=1] sizes, np.ndarray[DTYPE_t, ndim=2] slice_indices):             # <<<<<<<<<<<<<<
+ *     cdef DTYPE_t start_ds_idx
+ *     cdef DTYPE_t start_offset
+ */
+
+  /* function exit code */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_XDECREF(__pyx_t_2);
+  __Pyx_XDECREF(__pyx_t_4);
+  __Pyx_XDECREF(__pyx_t_5);
+  __PYX_XCLEAR_MEMVIEW(&__pyx_t_7, 1);
+  { PyObject *__pyx_type, *__pyx_value, *__pyx_tb;
+    __Pyx_PyThreadState_declare
+    __Pyx_PyThreadState_assign
+    __Pyx_ErrFetch(&__pyx_type, &__pyx_value, &__pyx_tb);
+    __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_block_to_dataset_index.rcbuffer->pybuffer);
+    __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_sizes.rcbuffer->pybuffer);
+    __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_slice_indices.rcbuffer->pybuffer);
+  __Pyx_ErrRestore(__pyx_type, __pyx_value, __pyx_tb);}
+  __Pyx_AddTraceback("fairseq.data.token_block_utils_fast._get_block_to_dataset_index_fast", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = 0;
+  goto __pyx_L2;
+  __pyx_L0:;
+  __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_block_to_dataset_index.rcbuffer->pybuffer);
+  __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_sizes.rcbuffer->pybuffer);
+  __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_slice_indices.rcbuffer->pybuffer);
+  __pyx_L2:;
+  __Pyx_XDECREF((PyObject *)__pyx_v_ds);
+  __Pyx_XDECREF((PyObject *)__pyx_v_block_to_dataset_index);
+  __PYX_XCLEAR_MEMVIEW(&__pyx_v_block_to_dataset_index_view, 1);
+  __PYX_XCLEAR_MEMVIEW(&__pyx_v_slice_indices_view, 1);
+  __Pyx_XGIVEREF((PyObject *)__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* Python wrapper: unpacks `sizes` and `slice_indices` (positional or keyword), checks both against numpy.ndarray, then forwards to the __pyx_pf_ implementation. Returns NULL on any failure. */
+static PyObject *__pyx_pw_7fairseq_4data_22token_block_utils_fast_3_get_block_to_dataset_index_fast(PyObject *__pyx_self, 
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+); /*proto*/
+static PyMethodDef __pyx_mdef_7fairseq_4data_22token_block_utils_fast_3_get_block_to_dataset_index_fast = {"_get_block_to_dataset_index_fast", (PyCFunction)(void*)(__Pyx_PyCFunction_FastCallWithKeywords)__pyx_pw_7fairseq_4data_22token_block_utils_fast_3_get_block_to_dataset_index_fast, __Pyx_METH_FASTCALL|METH_KEYWORDS, 0}; /* method-table entry: Python-visible name, wrapper pointer, calling-convention flags, no docstring */
+static PyObject *__pyx_pw_7fairseq_4data_22token_block_utils_fast_3_get_block_to_dataset_index_fast(PyObject *__pyx_self, 
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+) {
+  PyArrayObject *__pyx_v_sizes = 0;
+  PyArrayObject *__pyx_v_slice_indices = 0;
+  #if !CYTHON_METH_FASTCALL
+  CYTHON_UNUSED Py_ssize_t __pyx_nargs;
+  #endif
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject* values[2] = {0,0}; /* argument slots: [0] = sizes, [1] = slice_indices */
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("_get_block_to_dataset_index_fast (wrapper)", 0);
+  #if !CYTHON_METH_FASTCALL
+  #if CYTHON_ASSUME_SAFE_MACROS
+  __pyx_nargs = PyTuple_GET_SIZE(__pyx_args);
+  #else
+  __pyx_nargs = PyTuple_Size(__pyx_args); if (unlikely(__pyx_nargs < 0)) return NULL;
+  #endif
+  #endif
+  __pyx_kwvalues = __Pyx_KwValues_FASTCALL(__pyx_args, __pyx_nargs);
+  {
+    PyObject **__pyx_pyargnames[] = {&__pyx_n_s_sizes,&__pyx_n_s_slice_indices,0};
+    if (__pyx_kwds) {
+      Py_ssize_t kw_args;
+      switch (__pyx_nargs) { /* copy the positional arguments supplied, highest index first */
+        case  2: values[1] = __Pyx_Arg_FASTCALL(__pyx_args, 1);
+        CYTHON_FALLTHROUGH;
+        case  1: values[0] = __Pyx_Arg_FASTCALL(__pyx_args, 0);
+        CYTHON_FALLTHROUGH;
+        case  0: break;
+        default: goto __pyx_L5_argtuple_error;
+      }
+      kw_args = __Pyx_NumKwargs_FASTCALL(__pyx_kwds);
+      switch (__pyx_nargs) { /* look up by keyword whatever was not passed positionally */
+        case  0:
+        if (likely((values[0] = __Pyx_GetKwValue_FASTCALL(__pyx_kwds, __pyx_kwvalues, __pyx_n_s_sizes)) != 0)) {
+          (void)__Pyx_Arg_NewRef_FASTCALL(values[0]);
+          kw_args--;
+        }
+        else if (unlikely(PyErr_Occurred())) __PYX_ERR(0, 109, __pyx_L3_error)
+        else goto __pyx_L5_argtuple_error; /* `sizes` given neither positionally nor by keyword */
+        CYTHON_FALLTHROUGH;
+        case  1:
+        if (likely((values[1] = __Pyx_GetKwValue_FASTCALL(__pyx_kwds, __pyx_kwvalues, __pyx_n_s_slice_indices)) != 0)) {
+          (void)__Pyx_Arg_NewRef_FASTCALL(values[1]);
+          kw_args--;
+        }
+        else if (unlikely(PyErr_Occurred())) __PYX_ERR(0, 109, __pyx_L3_error)
+        else {
+          __Pyx_RaiseArgtupleInvalid("_get_block_to_dataset_index_fast", 1, 2, 2, 1); __PYX_ERR(0, 109, __pyx_L3_error)
+        }
+      }
+      if (unlikely(kw_args > 0)) { /* keywords remain that the lookups above did not consume */
+        const Py_ssize_t kwd_pos_args = __pyx_nargs;
+        if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_kwvalues, __pyx_pyargnames, 0, values + 0, kwd_pos_args, "_get_block_to_dataset_index_fast") < 0)) __PYX_ERR(0, 109, __pyx_L3_error)
+      }
+    } else if (unlikely(__pyx_nargs != 2)) { /* no keywords: exactly two positional arguments are required */
+      goto __pyx_L5_argtuple_error;
+    } else {
+      values[0] = __Pyx_Arg_FASTCALL(__pyx_args, 0);
+      values[1] = __Pyx_Arg_FASTCALL(__pyx_args, 1);
+    }
+    __pyx_v_sizes = ((PyArrayObject *)values[0]);
+    __pyx_v_slice_indices = ((PyArrayObject *)values[1]);
+  }
+  goto __pyx_L6_skip;
+  __pyx_L5_argtuple_error:;
+  __Pyx_RaiseArgtupleInvalid("_get_block_to_dataset_index_fast", 1, 2, 2, __pyx_nargs); __PYX_ERR(0, 109, __pyx_L3_error)
+  __pyx_L6_skip:;
+  goto __pyx_L4_argument_unpacking_done;
+  __pyx_L3_error:; /* argument-unpacking failure: run __Pyx_Arg_XDECREF_FASTCALL over values[], add a traceback entry, return NULL */
+  {
+    Py_ssize_t __pyx_temp;
+    for (__pyx_temp=0; __pyx_temp < (Py_ssize_t)(sizeof(values)/sizeof(values[0])); ++__pyx_temp) {
+      __Pyx_Arg_XDECREF_FASTCALL(values[__pyx_temp]);
+    }
+  }
+  __Pyx_AddTraceback("fairseq.data.token_block_utils_fast._get_block_to_dataset_index_fast", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __Pyx_RefNannyFinishContext();
+  return NULL;
+  __pyx_L4_argument_unpacking_done:;
+  if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_sizes), __pyx_ptype_5numpy_ndarray, 1, "sizes", 0))) __PYX_ERR(0, 109, __pyx_L1_error) /* NOTE(review): the literal 1 is presumably the "None allowed" flag of __Pyx_ArgTypeTest - confirm against the helper's definition */
+  if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_slice_indices), __pyx_ptype_5numpy_ndarray, 1, "slice_indices", 0))) __PYX_ERR(0, 109, __pyx_L1_error)
+  __pyx_r = __pyx_pf_7fairseq_4data_22token_block_utils_fast_2_get_block_to_dataset_index_fast(__pyx_self, __pyx_v_sizes, __pyx_v_slice_indices);
+
+  /* function exit code */
+  goto __pyx_L0;
+  __pyx_L1_error:;
+  __pyx_r = NULL;
+  __pyx_L0:; /* common exit for success and type-check failure: run __Pyx_Arg_XDECREF_FASTCALL over values[] and return __pyx_r */
+  {
+    Py_ssize_t __pyx_temp;
+    for (__pyx_temp=0; __pyx_temp < (Py_ssize_t)(sizeof(values)/sizeof(values[0])); ++__pyx_temp) {
+      __Pyx_Arg_XDECREF_FASTCALL(values[__pyx_temp]);
+    }
+  }
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static PyObject *__pyx_pf_7fairseq_4data_22token_block_utils_fast_2_get_block_to_dataset_index_fast(CYTHON_UNUSED PyObject *__pyx_self, PyArrayObject *__pyx_v_sizes, PyArrayObject *__pyx_v_slice_indices) { /* Implementation behind the Python wrapper: acquires and validates a buffer on `sizes` (one dimension) and on `slice_indices` (two dimensions), calls the C-level __pyx_f_ entry point, and releases both buffers on every exit path. Returns the result object, or NULL on error. */
+  __Pyx_LocalBuf_ND __pyx_pybuffernd_sizes;
+  __Pyx_Buffer __pyx_pybuffer_sizes;
+  __Pyx_LocalBuf_ND __pyx_pybuffernd_slice_indices;
+  __Pyx_Buffer __pyx_pybuffer_slice_indices;
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("_get_block_to_dataset_index_fast", 1);
+  __pyx_pybuffer_sizes.pybuffer.buf = NULL; /* both buffer records start out empty - presumably so __Pyx_SafeReleaseBuffer is harmless if the error path runs before acquisition; TODO confirm */
+  __pyx_pybuffer_sizes.refcount = 0;
+  __pyx_pybuffernd_sizes.data = NULL;
+  __pyx_pybuffernd_sizes.rcbuffer = &__pyx_pybuffer_sizes;
+  __pyx_pybuffer_slice_indices.pybuffer.buf = NULL;
+  __pyx_pybuffer_slice_indices.refcount = 0;
+  __pyx_pybuffernd_slice_indices.data = NULL;
+  __pyx_pybuffernd_slice_indices.rcbuffer = &__pyx_pybuffer_slice_indices;
+  {
+    __Pyx_BufFmt_StackElem __pyx_stack[1];
+    if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_sizes.rcbuffer->pybuffer, (PyObject*)__pyx_v_sizes, &__Pyx_TypeInfo_nn___pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t, PyBUF_FORMAT| PyBUF_STRIDES, 1, 0, __pyx_stack) == -1)) __PYX_ERR(0, 109, __pyx_L1_error)
+  }
+  __pyx_pybuffernd_sizes.diminfo[0].strides = __pyx_pybuffernd_sizes.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_sizes.diminfo[0].shape = __pyx_pybuffernd_sizes.rcbuffer->pybuffer.shape[0];
+  {
+    __Pyx_BufFmt_StackElem __pyx_stack[1];
+    if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_slice_indices.rcbuffer->pybuffer, (PyObject*)__pyx_v_slice_indices, &__Pyx_TypeInfo_nn___pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t, PyBUF_FORMAT| PyBUF_STRIDES, 2, 0, __pyx_stack) == -1)) __PYX_ERR(0, 109, __pyx_L1_error)
+  }
+  __pyx_pybuffernd_slice_indices.diminfo[0].strides = __pyx_pybuffernd_slice_indices.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_slice_indices.diminfo[0].shape = __pyx_pybuffernd_slice_indices.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_slice_indices.diminfo[1].strides = __pyx_pybuffernd_slice_indices.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_slice_indices.diminfo[1].shape = __pyx_pybuffernd_slice_indices.rcbuffer->pybuffer.shape[1];
+  __Pyx_XDECREF(__pyx_r);
+  __pyx_t_1 = ((PyObject *)__pyx_f_7fairseq_4data_22token_block_utils_fast__get_block_to_dataset_index_fast(__pyx_v_sizes, __pyx_v_slice_indices, 0)); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 109, __pyx_L1_error) /* NOTE(review): the trailing 0 is presumably the cpdef skip-dispatch flag - confirm against the __pyx_f_ signature */
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_r = __pyx_t_1;
+  __pyx_t_1 = 0;
+  goto __pyx_L0;
+
+  /* function exit code */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_1);
+  { PyObject *__pyx_type, *__pyx_value, *__pyx_tb; /* the pending exception is fetched before the buffers are released and restored afterwards */
+    __Pyx_PyThreadState_declare
+    __Pyx_PyThreadState_assign
+    __Pyx_ErrFetch(&__pyx_type, &__pyx_value, &__pyx_tb);
+    __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_sizes.rcbuffer->pybuffer);
+    __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_slice_indices.rcbuffer->pybuffer);
+  __Pyx_ErrRestore(__pyx_type, __pyx_value, __pyx_tb);}
+  __Pyx_AddTraceback("fairseq.data.token_block_utils_fast._get_block_to_dataset_index_fast", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  goto __pyx_L2;
+  __pyx_L0:; /* success path: release both buffers before returning */
+  __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_sizes.rcbuffer->pybuffer);
+  __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_slice_indices.rcbuffer->pybuffer);
+  __pyx_L2:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "fairseq/data/token_block_utils_fast.pyx":147
+ *     cdef DTYPE_t[:] sizes
+ * 
+ *     def __init__(self, DTYPE_t[:] sizes):             # <<<<<<<<<<<<<<
+ *         self.sizes = sizes
+ *         self.reset()
+ */
+
+/* Python wrapper for DatasetSearcher.__init__: unpacks the single argument `sizes` (positional or keyword), converts it to a one-dimensional DTYPE_t memoryview slice, and forwards to the __pyx_pf_ implementation. Returns that implementation's status, or -1 if unpacking or conversion fails. */
+static int __pyx_pw_7fairseq_4data_22token_block_utils_fast_15DatasetSearcher_1__init__(PyObject *__pyx_v_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/
+static int __pyx_pw_7fairseq_4data_22token_block_utils_fast_15DatasetSearcher_1__init__(PyObject *__pyx_v_self, PyObject *__pyx_args, PyObject *__pyx_kwds) {
+  __Pyx_memviewslice __pyx_v_sizes = { 0, 0, { 0 }, { 0 }, { 0 } }; /* zero-initialised: the error path below clears it even when conversion never ran */
+  CYTHON_UNUSED Py_ssize_t __pyx_nargs;
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject* values[1] = {0}; /* argument slot: [0] = sizes */
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  int __pyx_r;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__init__ (wrapper)", 0);
+  #if CYTHON_ASSUME_SAFE_MACROS
+  __pyx_nargs = PyTuple_GET_SIZE(__pyx_args);
+  #else
+  __pyx_nargs = PyTuple_Size(__pyx_args); if (unlikely(__pyx_nargs < 0)) return -1;
+  #endif
+  __pyx_kwvalues = __Pyx_KwValues_VARARGS(__pyx_args, __pyx_nargs);
+  {
+    PyObject **__pyx_pyargnames[] = {&__pyx_n_s_sizes,0};
+    if (__pyx_kwds) {
+      Py_ssize_t kw_args;
+      switch (__pyx_nargs) { /* copy the positional argument if one was supplied */
+        case  1: values[0] = __Pyx_Arg_VARARGS(__pyx_args, 0);
+        CYTHON_FALLTHROUGH;
+        case  0: break;
+        default: goto __pyx_L5_argtuple_error;
+      }
+      kw_args = __Pyx_NumKwargs_VARARGS(__pyx_kwds);
+      switch (__pyx_nargs) { /* otherwise look `sizes` up by keyword */
+        case  0:
+        if (likely((values[0] = __Pyx_GetKwValue_VARARGS(__pyx_kwds, __pyx_kwvalues, __pyx_n_s_sizes)) != 0)) {
+          (void)__Pyx_Arg_NewRef_VARARGS(values[0]);
+          kw_args--;
+        }
+        else if (unlikely(PyErr_Occurred())) __PYX_ERR(0, 147, __pyx_L3_error)
+        else goto __pyx_L5_argtuple_error;
+      }
+      if (unlikely(kw_args > 0)) { /* keywords remain that the lookup above did not consume */
+        const Py_ssize_t kwd_pos_args = __pyx_nargs;
+        if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_kwvalues, __pyx_pyargnames, 0, values + 0, kwd_pos_args, "__init__") < 0)) __PYX_ERR(0, 147, __pyx_L3_error)
+      }
+    } else if (unlikely(__pyx_nargs != 1)) { /* no keywords: exactly one positional argument is required */
+      goto __pyx_L5_argtuple_error;
+    } else {
+      values[0] = __Pyx_Arg_VARARGS(__pyx_args, 0);
+    }
+    __pyx_v_sizes = __Pyx_PyObject_to_MemoryviewSlice_ds_nn___pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t(values[0], PyBUF_WRITABLE); if (unlikely(!__pyx_v_sizes.memview)) __PYX_ERR(0, 147, __pyx_L3_error) /* NOTE(review): the slice is requested with PyBUF_WRITABLE, so a read-only buffer would presumably be rejected here - confirm */
+  }
+  goto __pyx_L6_skip;
+  __pyx_L5_argtuple_error:;
+  __Pyx_RaiseArgtupleInvalid("__init__", 1, 1, 1, __pyx_nargs); __PYX_ERR(0, 147, __pyx_L3_error)
+  __pyx_L6_skip:;
+  goto __pyx_L4_argument_unpacking_done;
+  __pyx_L3_error:; /* failure before the implementation was called: clean up values[] and the slice, add a traceback entry, return -1 */
+  {
+    Py_ssize_t __pyx_temp;
+    for (__pyx_temp=0; __pyx_temp < (Py_ssize_t)(sizeof(values)/sizeof(values[0])); ++__pyx_temp) {
+      __Pyx_Arg_XDECREF_VARARGS(values[__pyx_temp]);
+    }
+  }
+  __PYX_XCLEAR_MEMVIEW(&__pyx_v_sizes, 1);
+  __Pyx_AddTraceback("fairseq.data.token_block_utils_fast.DatasetSearcher.__init__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __Pyx_RefNannyFinishContext();
+  return -1;
+  __pyx_L4_argument_unpacking_done:;
+  __pyx_r = __pyx_pf_7fairseq_4data_22token_block_utils_fast_15DatasetSearcher___init__(((struct __pyx_obj_7fairseq_4data_22token_block_utils_fast_DatasetSearcher *)__pyx_v_self), __pyx_v_sizes);
+
+  /* function exit code */
+  __PYX_XCLEAR_MEMVIEW(&__pyx_v_sizes, 1); /* the wrapper's own slice is cleared whether or not the implementation succeeded */
+  {
+    Py_ssize_t __pyx_temp;
+    for (__pyx_temp=0; __pyx_temp < (Py_ssize_t)(sizeof(values)/sizeof(values[0])); ++__pyx_temp) {
+      __Pyx_Arg_XDECREF_VARARGS(values[__pyx_temp]);
+    }
+  }
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static int __pyx_pf_7fairseq_4data_22token_block_utils_fast_15DatasetSearcher___init__(struct __pyx_obj_7fairseq_4data_22token_block_utils_fast_DatasetSearcher *__pyx_v_self, __Pyx_memviewslice __pyx_v_sizes) { /* Implementation of DatasetSearcher.__init__: stores the `sizes` slice on self, then calls reset() through the vtable. Returns 0 on success, or -1 with a traceback entry if reset() fails. */
+  int __pyx_r;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("__init__", 1);
+
+  /* "fairseq/data/token_block_utils_fast.pyx":148
+ * 
+ *     def __init__(self, DTYPE_t[:] sizes):
+ *         self.sizes = sizes             # <<<<<<<<<<<<<<
+ *         self.reset()
+ * 
+ */
+  __PYX_XCLEAR_MEMVIEW(&__pyx_v_self->sizes, 0); /* clear whatever slice self->sizes held before the assignment */
+  __PYX_INC_MEMVIEW(&__pyx_v_sizes, 1); /* presumably takes the extra reference owned by the copy stored on self - TODO confirm against the macro */
+  __pyx_v_self->sizes = __pyx_v_sizes;
+
+  /* "fairseq/data/token_block_utils_fast.pyx":149
+ *     def __init__(self, DTYPE_t[:] sizes):
+ *         self.sizes = sizes
+ *         self.reset()             # <<<<<<<<<<<<<<
+ * 
+ *     cdef reset(self):
+ */
+  __pyx_t_1 = ((struct __pyx_vtabstruct_7fairseq_4data_22token_block_utils_fast_DatasetSearcher *)__pyx_v_self->__pyx_vtab)->reset(__pyx_v_self); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 149, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; /* the object returned by reset() is not used; only a NULL return matters */
+
+  /* "fairseq/data/token_block_utils_fast.pyx":147
+ *     cdef DTYPE_t[:] sizes
+ * 
+ *     def __init__(self, DTYPE_t[:] sizes):             # <<<<<<<<<<<<<<
+ *         self.sizes = sizes
+ *         self.reset()
+ */
+
+  /* function exit code */
+  __pyx_r = 0;
+  goto __pyx_L0;
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_AddTraceback("fairseq.data.token_block_utils_fast.DatasetSearcher.__init__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = -1;
+  __pyx_L0:;
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "fairseq/data/token_block_utils_fast.pyx":151
+ *         self.reset()
+ * 
+ *     cdef reset(self):             # <<<<<<<<<<<<<<
+ *         self.current_offset = 0     # offset within current index in underlying dataset
+ *         self.current_i = 0          # "flat" index
+ */
+
+static PyObject *__pyx_f_7fairseq_4data_22token_block_utils_fast_15DatasetSearcher_reset(struct __pyx_obj_7fairseq_4data_22token_block_utils_fast_DatasetSearcher *__pyx_v_self) { /* C implementation of `cdef reset(self)`: sets current_offset, current_i and current_index to 0 and returns a new reference to None. This body has no error path. */
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("reset", 1);
+
+  /* "fairseq/data/token_block_utils_fast.pyx":152
+ * 
+ *     cdef reset(self):
+ *         self.current_offset = 0     # offset within current index in underlying dataset             # <<<<<<<<<<<<<<
+ *         self.current_i = 0          # "flat" index
+ *         self.current_index = 0      # index in underlying dataset
+ */
+  __pyx_v_self->current_offset = 0;
+
+  /* "fairseq/data/token_block_utils_fast.pyx":153
+ *     cdef reset(self):
+ *         self.current_offset = 0     # offset within current index in underlying dataset
+ *         self.current_i = 0          # "flat" index             # <<<<<<<<<<<<<<
+ *         self.current_index = 0      # index in underlying dataset
+ * 
+ */
+  __pyx_v_self->current_i = 0;
+
+  /* "fairseq/data/token_block_utils_fast.pyx":154
+ *         self.current_offset = 0     # offset within current index in underlying dataset
+ *         self.current_i = 0          # "flat" index
+ *         self.current_index = 0      # index in underlying dataset             # <<<<<<<<<<<<<<
+ * 
+ *     @cython.boundscheck(False)
+ */
+  __pyx_v_self->current_index = 0;
+
+  /* "fairseq/data/token_block_utils_fast.pyx":151
+ *         self.reset()
+ * 
+ *     cdef reset(self):             # <<<<<<<<<<<<<<
+ *         self.current_offset = 0     # offset within current index in underlying dataset
+ *         self.current_i = 0          # "flat" index
+ */
+
+  /* function exit code */
+  __pyx_r = Py_None; __Pyx_INCREF(Py_None); /* callers receive an owned reference and must release it */
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "fairseq/data/token_block_utils_fast.pyx":159
+ *     @cython.wraparound(False)
+ *     @cython.nonecheck(False)
+ *     cdef int step(self, DTYPE_t i):             # <<<<<<<<<<<<<<
+ *         cdef DTYPE_t to_consume
+ *         cdef DTYPE_t remaining
+ */
+
+static int __pyx_f_7fairseq_4data_22token_block_utils_fast_15DatasetSearcher_step(struct __pyx_obj_7fairseq_4data_22token_block_utils_fast_DatasetSearcher *__pyx_v_self, __pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t __pyx_v_i) {
+  __pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t __pyx_v_to_consume;
+  __pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t __pyx_v_remaining;
+  int __pyx_r;
+  __Pyx_RefNannyDeclarations
+  int __pyx_t_1;
+  PyObject *__pyx_t_2 = NULL;
+  __pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t __pyx_t_3;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("step", 1);
+
+  /* "fairseq/data/token_block_utils_fast.pyx":162
+ *         cdef DTYPE_t to_consume
+ *         cdef DTYPE_t remaining
+ *         if i < self.current_i:             # <<<<<<<<<<<<<<
+ *             self.reset()
+ *         if i > self.current_i:
+ */
+  __pyx_t_1 = (__pyx_v_i < __pyx_v_self->current_i);
+  if (__pyx_t_1) {
+
+    /* "fairseq/data/token_block_utils_fast.pyx":163
+ *         cdef DTYPE_t remaining
+ *         if i < self.current_i:
+ *             self.reset()             # <<<<<<<<<<<<<<
+ *         if i > self.current_i:
+ *             to_consume = i - self.current_i
+ */
+    __pyx_t_2 = ((struct __pyx_vtabstruct_7fairseq_4data_22token_block_utils_fast_DatasetSearcher *)__pyx_v_self->__pyx_vtab)->reset(__pyx_v_self); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 163, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_2);
+    __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+
+    /* "fairseq/data/token_block_utils_fast.pyx":162
+ *         cdef DTYPE_t to_consume
+ *         cdef DTYPE_t remaining
+ *         if i < self.current_i:             # <<<<<<<<<<<<<<
+ *             self.reset()
+ *         if i > self.current_i:
+ */
+  }
+
+  /* "fairseq/data/token_block_utils_fast.pyx":164
+ *         if i < self.current_i:
+ *             self.reset()
+ *         if i > self.current_i:             # <<<<<<<<<<<<<<
+ *             to_consume = i - self.current_i
+ *             remaining = self.sizes[self.current_index] - self.current_offset
+ */
+  __pyx_t_1 = (__pyx_v_i > __pyx_v_self->current_i);
+  if (__pyx_t_1) {
+
+    /* "fairseq/data/token_block_utils_fast.pyx":165
+ *             self.reset()
+ *         if i > self.current_i:
+ *             to_consume = i - self.current_i             # <<<<<<<<<<<<<<
+ *             remaining = self.sizes[self.current_index] - self.current_offset
+ *             if remaining > to_consume:
+ */
+    __pyx_v_to_consume = (__pyx_v_i - __pyx_v_self->current_i);
+
+    /* "fairseq/data/token_block_utils_fast.pyx":166
+ *         if i > self.current_i:
+ *             to_consume = i - self.current_i
+ *             remaining = self.sizes[self.current_index] - self.current_offset             # <<<<<<<<<<<<<<
+ *             if remaining > to_consume:
+ *                 self.current_offset += to_consume
+ */
+    if (unlikely(!__pyx_v_self->sizes.memview)) {PyErr_SetString(PyExc_AttributeError,"Memoryview is not initialized");__PYX_ERR(0, 166, __pyx_L1_error)}
+    __pyx_t_3 = __pyx_v_self->current_index;
+    __pyx_v_remaining = ((*((__pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t *) ( /* dim=0 */ (__pyx_v_self->sizes.data + __pyx_t_3 * __pyx_v_self->sizes.strides[0]) ))) - __pyx_v_self->current_offset);
+
+    /* "fairseq/data/token_block_utils_fast.pyx":167
+ *             to_consume = i - self.current_i
+ *             remaining = self.sizes[self.current_index] - self.current_offset
+ *             if remaining > to_consume:             # <<<<<<<<<<<<<<
+ *                 self.current_offset += to_consume
+ *                 self.current_i += to_consume
+ */
+    __pyx_t_1 = (__pyx_v_remaining > __pyx_v_to_consume);
+    if (__pyx_t_1) {
+
+      /* "fairseq/data/token_block_utils_fast.pyx":168
+ *             remaining = self.sizes[self.current_index] - self.current_offset
+ *             if remaining > to_consume:
+ *                 self.current_offset += to_consume             # <<<<<<<<<<<<<<
+ *                 self.current_i += to_consume
+ *             else:
+ */
+      __pyx_v_self->current_offset = (__pyx_v_self->current_offset + __pyx_v_to_consume);
+
+      /* "fairseq/data/token_block_utils_fast.pyx":169
+ *             if remaining > to_consume:
+ *                 self.current_offset += to_consume
+ *                 self.current_i += to_consume             # <<<<<<<<<<<<<<
+ *             else:
+ *                 assert remaining > 0
+ */
+      __pyx_v_self->current_i = (__pyx_v_self->current_i + __pyx_v_to_consume);
+
+      /* "fairseq/data/token_block_utils_fast.pyx":167
+ *             to_consume = i - self.current_i
+ *             remaining = self.sizes[self.current_index] - self.current_offset
+ *             if remaining > to_consume:             # <<<<<<<<<<<<<<
+ *                 self.current_offset += to_consume
+ *                 self.current_i += to_consume
+ */
+      goto __pyx_L5;
+    }
+
+    /* "fairseq/data/token_block_utils_fast.pyx":171
+ *                 self.current_i += to_consume
+ *             else:
+ *                 assert remaining > 0             # <<<<<<<<<<<<<<
+ *                 self.current_i += remaining
+ *                 self.current_index += 1
+ */
+    /*else*/ {
+      #ifndef CYTHON_WITHOUT_ASSERTIONS
+      if (unlikely(__pyx_assertions_enabled())) {
+        __pyx_t_1 = (__pyx_v_remaining > 0);
+        if (unlikely(!__pyx_t_1)) {
+          __Pyx_Raise(__pyx_builtin_AssertionError, 0, 0, 0);
+          __PYX_ERR(0, 171, __pyx_L1_error)
+        }
+      }
+      #else
+      if ((1)); else __PYX_ERR(0, 171, __pyx_L1_error)
+      #endif
+
+      /* "fairseq/data/token_block_utils_fast.pyx":172
+ *             else:
+ *                 assert remaining > 0
+ *                 self.current_i += remaining             # <<<<<<<<<<<<<<
+ *                 self.current_index += 1
+ *                 self.current_offset = 0
+ */
+      __pyx_v_self->current_i = (__pyx_v_self->current_i + __pyx_v_remaining);
+
+      /* "fairseq/data/token_block_utils_fast.pyx":173
+ *                 assert remaining > 0
+ *                 self.current_i += remaining
+ *                 self.current_index += 1             # <<<<<<<<<<<<<<
+ *                 self.current_offset = 0
+ *                 return 1
+ */
+      __pyx_v_self->current_index = (__pyx_v_self->current_index + 1);
+
+      /* "fairseq/data/token_block_utils_fast.pyx":174
+ *                 self.current_i += remaining
+ *                 self.current_index += 1
+ *                 self.current_offset = 0             # <<<<<<<<<<<<<<
+ *                 return 1
+ *         return 0
+ */
+      __pyx_v_self->current_offset = 0;
+
+      /* "fairseq/data/token_block_utils_fast.pyx":175
+ *                 self.current_index += 1
+ *                 self.current_offset = 0
+ *                 return 1             # <<<<<<<<<<<<<<
+ *         return 0
+ * 
+ */
+      __pyx_r = 1;
+      goto __pyx_L0;
+    }
+    __pyx_L5:;
+
+    /* "fairseq/data/token_block_utils_fast.pyx":164
+ *         if i < self.current_i:
+ *             self.reset()
+ *         if i > self.current_i:             # <<<<<<<<<<<<<<
+ *             to_consume = i - self.current_i
+ *             remaining = self.sizes[self.current_index] - self.current_offset
+ */
+  }
+
+  /* "fairseq/data/token_block_utils_fast.pyx":176
+ *                 self.current_offset = 0
+ *                 return 1
+ *         return 0             # <<<<<<<<<<<<<<
+ * 
+ *     @cython.boundscheck(False)
+ */
+  __pyx_r = 0;
+  goto __pyx_L0;
+
+  /* "fairseq/data/token_block_utils_fast.pyx":159
+ *     @cython.wraparound(False)
+ *     @cython.nonecheck(False)
+ *     cdef int step(self, DTYPE_t i):             # <<<<<<<<<<<<<<
+ *         cdef DTYPE_t to_consume
+ *         cdef DTYPE_t remaining
+ */
+
+  /* function exit code */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_2);
+  __Pyx_AddTraceback("fairseq.data.token_block_utils_fast.DatasetSearcher.step", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = 0;
+  __pyx_L0:;
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "fairseq/data/token_block_utils_fast.pyx":181
+ *     @cython.wraparound(False)
+ *     @cython.nonecheck(False)
+ *     cdef seek(self, DTYPE_t i):             # <<<<<<<<<<<<<<
+ *         cdef int not_done = 1
+ *         while not_done == 1:
+ */
+
+static PyObject *__pyx_f_7fairseq_4data_22token_block_utils_fast_15DatasetSearcher_seek(struct __pyx_obj_7fairseq_4data_22token_block_utils_fast_DatasetSearcher *__pyx_v_self, __pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t __pyx_v_i) { /* C body of the cdef method DatasetSearcher.seek(i): calls step(i) until it stops returning 1, then asserts current_i == i. Returns a new reference to Py_None, or 0 after an error. */
+  int __pyx_v_not_done; /* most recent step() result; the loop below runs while it equals 1 */
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  int __pyx_t_1;
+  int __pyx_t_2;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("seek", 1);
+
+  /* "fairseq/data/token_block_utils_fast.pyx":182
+ *     @cython.nonecheck(False)
+ *     cdef seek(self, DTYPE_t i):
+ *         cdef int not_done = 1             # <<<<<<<<<<<<<<
+ *         while not_done == 1:
+ *             not_done = self.step(i)
+ */
+  __pyx_v_not_done = 1;
+
+  /* "fairseq/data/token_block_utils_fast.pyx":183
+ *     cdef seek(self, DTYPE_t i):
+ *         cdef int not_done = 1
+ *         while not_done == 1:             # <<<<<<<<<<<<<<
+ *             not_done = self.step(i)
+ *         assert self.current_i == i
+ */
+  while (1) { /* the only exit is the break below, taken once not_done != 1 */
+    __pyx_t_1 = (__pyx_v_not_done == 1);
+    if (!__pyx_t_1) break;
+
+    /* "fairseq/data/token_block_utils_fast.pyx":184
+ *         cdef int not_done = 1
+ *         while not_done == 1:
+ *             not_done = self.step(i)             # <<<<<<<<<<<<<<
+ *         assert self.current_i == i
+ */
+    __pyx_t_2 = ((struct __pyx_vtabstruct_7fairseq_4data_22token_block_utils_fast_DatasetSearcher *)__pyx_v_self->__pyx_vtab)->step(__pyx_v_self, __pyx_v_i); if (unlikely(PyErr_Occurred())) __PYX_ERR(0, 184, __pyx_L1_error) /* step is reached through the vtable; a pending Python exception after the call jumps to the error exit */
+    __pyx_v_not_done = __pyx_t_2;
+  }
+
+  /* "fairseq/data/token_block_utils_fast.pyx":185
+ *         while not_done == 1:
+ *             not_done = self.step(i)
+ *         assert self.current_i == i             # <<<<<<<<<<<<<<
+ */
+  #ifndef CYTHON_WITHOUT_ASSERTIONS
+  if (unlikely(__pyx_assertions_enabled())) { /* the comparison is evaluated only when __pyx_assertions_enabled() is true */
+    __pyx_t_1 = (__pyx_v_self->current_i == __pyx_v_i);
+    if (unlikely(!__pyx_t_1)) {
+      __Pyx_Raise(__pyx_builtin_AssertionError, 0, 0, 0); /* bare AssertionError: all remaining arguments are 0 */
+      __PYX_ERR(0, 185, __pyx_L1_error)
+    }
+  }
+  #else
+  if ((1)); else __PYX_ERR(0, 185, __pyx_L1_error) /* assertions compiled out: constant-true condition, so the else branch never runs */
+  #endif
+
+  /* "fairseq/data/token_block_utils_fast.pyx":181
+ *     @cython.wraparound(False)
+ *     @cython.nonecheck(False)
+ *     cdef seek(self, DTYPE_t i):             # <<<<<<<<<<<<<<
+ *         cdef int not_done = 1
+ *         while not_done == 1:
+ */
+
+  /* function exit code */
+  __pyx_r = Py_None; __Pyx_INCREF(Py_None); /* success: hand back a new reference to None */
+  goto __pyx_L0;
+  __pyx_L1_error:;
+  __Pyx_AddTraceback("fairseq.data.token_block_utils_fast.DatasetSearcher.seek", __pyx_clineno, __pyx_lineno, __pyx_filename); /* error exit: add this function to the traceback */
+  __pyx_r = 0; /* error return value */
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "(tree fragment)":1
+ * def __reduce_cython__(self):             # <<<<<<<<<<<<<<
+ *     cdef tuple state
+ *     cdef object _dict
+ */
+
+/* Python wrapper */
+static PyObject *__pyx_pw_7fairseq_4data_22token_block_utils_fast_15DatasetSearcher_3__reduce_cython__(PyObject *__pyx_v_self, 
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+); /*proto*/ /* declared ahead of the PyMethodDef below, which takes the wrapper's address */
+static PyMethodDef __pyx_mdef_7fairseq_4data_22token_block_utils_fast_15DatasetSearcher_3__reduce_cython__ = {"__reduce_cython__", (PyCFunction)(void*)(__Pyx_PyCFunction_FastCallWithKeywords)__pyx_pw_7fairseq_4data_22token_block_utils_fast_15DatasetSearcher_3__reduce_cython__, __Pyx_METH_FASTCALL|METH_KEYWORDS, 0}; /* method-table entry named "__reduce_cython__" pointing at the wrapper; the trailing 0 is the doc field */
+static PyObject *__pyx_pw_7fairseq_4data_22token_block_utils_fast_15DatasetSearcher_3__reduce_cython__(PyObject *__pyx_v_self, 
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+) { /* Python-callable wrapper for __reduce_cython__: rejects positional and keyword arguments, then forwards self to the __pyx_pf_ implementation. */
+  #if !CYTHON_METH_FASTCALL
+  CYTHON_UNUSED Py_ssize_t __pyx_nargs; /* without FASTCALL the count is not a parameter, so it is derived from the args tuple below */
+  #endif
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__reduce_cython__ (wrapper)", 0);
+  #if !CYTHON_METH_FASTCALL
+  #if CYTHON_ASSUME_SAFE_MACROS
+  __pyx_nargs = PyTuple_GET_SIZE(__pyx_args);
+  #else
+  __pyx_nargs = PyTuple_Size(__pyx_args); if (unlikely(__pyx_nargs < 0)) return NULL;
+  #endif
+  #endif
+  __pyx_kwvalues = __Pyx_KwValues_FASTCALL(__pyx_args, __pyx_nargs);
+  if (unlikely(__pyx_nargs > 0)) { /* any positional argument is an error */
+    __Pyx_RaiseArgtupleInvalid("__reduce_cython__", 1, 0, 0, __pyx_nargs); return NULL;}
+  if (unlikely(__pyx_kwds) && __Pyx_NumKwargs_FASTCALL(__pyx_kwds) && unlikely(!__Pyx_CheckKeywordStrings(__pyx_kwds, "__reduce_cython__", 0))) return NULL; /* returns NULL when a non-empty keyword set fails __Pyx_CheckKeywordStrings */
+  __pyx_r = __pyx_pf_7fairseq_4data_22token_block_utils_fast_15DatasetSearcher_2__reduce_cython__(((struct __pyx_obj_7fairseq_4data_22token_block_utils_fast_DatasetSearcher *)__pyx_v_self)); /* self is cast to the extension-type struct without a runtime type check here */
+
+  /* function exit code */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static PyObject *__pyx_pf_7fairseq_4data_22token_block_utils_fast_15DatasetSearcher_2__reduce_cython__(struct __pyx_obj_7fairseq_4data_22token_block_utils_fast_DatasetSearcher *__pyx_v_self) { /* Implementation of __reduce_cython__: packs (current_i, current_index, current_offset, sizes) into a state tuple and returns a tuple built around __pyx_unpickle_DatasetSearcher; returns NULL on error. */
+  PyObject *__pyx_v_state = 0;
+  PyObject *__pyx_v__dict = 0;
+  int __pyx_v_use_setstate; /* 1 when the instance has a non-None __dict__, otherwise 0 */
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  PyObject *__pyx_t_2 = NULL;
+  PyObject *__pyx_t_3 = NULL;
+  PyObject *__pyx_t_4 = NULL;
+  PyObject *__pyx_t_5 = NULL;
+  int __pyx_t_6;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("__reduce_cython__", 1);
+
+  /* "(tree fragment)":5
+ *     cdef object _dict
+ *     cdef bint use_setstate
+ *     state = (self.current_i, self.current_index, self.current_offset, self.sizes)             # <<<<<<<<<<<<<<
+ *     _dict = getattr(self, '__dict__', None)
+ *     if _dict is not None:
+ */
+  __pyx_t_1 = __Pyx_PyInt_From_npy_int64(__pyx_v_self->current_i); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 5, __pyx_L1_error) /* the three scalar fields are boxed into Python ints */
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_t_2 = __Pyx_PyInt_From_npy_int64(__pyx_v_self->current_index); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 5, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_2);
+  __pyx_t_3 = __Pyx_PyInt_From_npy_int64(__pyx_v_self->current_offset); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 5, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_3);
+  if (unlikely(!__pyx_v_self->sizes.memview)) {PyErr_SetString(PyExc_AttributeError,"Memoryview is not initialized");__PYX_ERR(1, 5, __pyx_L1_error)} /* an unset sizes slice raises AttributeError instead of being wrapped */
+  __pyx_t_4 = __pyx_memoryview_fromslice(__pyx_v_self->sizes, 1, (PyObject *(*)(char *)) __pyx_memview_get_nn___pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t, (int (*)(char *, PyObject *)) __pyx_memview_set_nn___pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t, 0);; if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 5, __pyx_L1_error) /* the sizes slice is wrapped as a memoryview object for the state tuple */
+  __Pyx_GOTREF(__pyx_t_4);
+  __pyx_t_5 = PyTuple_New(4); if (unlikely(!__pyx_t_5)) __PYX_ERR(1, 5, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_5);
+  __Pyx_GIVEREF(__pyx_t_1);
+  if (__Pyx_PyTuple_SET_ITEM(__pyx_t_5, 0, __pyx_t_1)) __PYX_ERR(1, 5, __pyx_L1_error);
+  __Pyx_GIVEREF(__pyx_t_2);
+  if (__Pyx_PyTuple_SET_ITEM(__pyx_t_5, 1, __pyx_t_2)) __PYX_ERR(1, 5, __pyx_L1_error);
+  __Pyx_GIVEREF(__pyx_t_3);
+  if (__Pyx_PyTuple_SET_ITEM(__pyx_t_5, 2, __pyx_t_3)) __PYX_ERR(1, 5, __pyx_L1_error);
+  __Pyx_GIVEREF(__pyx_t_4);
+  if (__Pyx_PyTuple_SET_ITEM(__pyx_t_5, 3, __pyx_t_4)) __PYX_ERR(1, 5, __pyx_L1_error);
+  __pyx_t_1 = 0; /* temps are cleared once stored in the tuple, so the XDECREFs on the error exit skip them */
+  __pyx_t_2 = 0;
+  __pyx_t_3 = 0;
+  __pyx_t_4 = 0;
+  __pyx_v_state = ((PyObject*)__pyx_t_5);
+  __pyx_t_5 = 0;
+
+  /* "(tree fragment)":6
+ *     cdef bint use_setstate
+ *     state = (self.current_i, self.current_index, self.current_offset, self.sizes)
+ *     _dict = getattr(self, '__dict__', None)             # <<<<<<<<<<<<<<
+ *     if _dict is not None:
+ *         state += (_dict,)
+ */
+  __pyx_t_5 = __Pyx_GetAttr3(((PyObject *)__pyx_v_self), __pyx_n_s_dict, Py_None); if (unlikely(!__pyx_t_5)) __PYX_ERR(1, 6, __pyx_L1_error) /* three-argument getattr with Py_None as the default */
+  __Pyx_GOTREF(__pyx_t_5);
+  __pyx_v__dict = __pyx_t_5;
+  __pyx_t_5 = 0;
+
+  /* "(tree fragment)":7
+ *     state = (self.current_i, self.current_index, self.current_offset, self.sizes)
+ *     _dict = getattr(self, '__dict__', None)
+ *     if _dict is not None:             # <<<<<<<<<<<<<<
+ *         state += (_dict,)
+ *         use_setstate = True
+ */
+  __pyx_t_6 = (__pyx_v__dict != Py_None);
+  if (__pyx_t_6) {
+
+    /* "(tree fragment)":8
+ *     _dict = getattr(self, '__dict__', None)
+ *     if _dict is not None:
+ *         state += (_dict,)             # <<<<<<<<<<<<<<
+ *         use_setstate = True
+ *     else:
+ */
+    __pyx_t_5 = PyTuple_New(1); if (unlikely(!__pyx_t_5)) __PYX_ERR(1, 8, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_5);
+    __Pyx_INCREF(__pyx_v__dict); /* extra reference for the one-element tuple; the local _dict keeps its own */
+    __Pyx_GIVEREF(__pyx_v__dict);
+    if (__Pyx_PyTuple_SET_ITEM(__pyx_t_5, 0, __pyx_v__dict)) __PYX_ERR(1, 8, __pyx_L1_error);
+    __pyx_t_4 = PyNumber_InPlaceAdd(__pyx_v_state, __pyx_t_5); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 8, __pyx_L1_error) /* state += (_dict,) */
+    __Pyx_GOTREF(__pyx_t_4);
+    __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
+    __Pyx_DECREF_SET(__pyx_v_state, ((PyObject*)__pyx_t_4)); /* rebind state to the concatenated tuple */
+    __pyx_t_4 = 0;
+
+    /* "(tree fragment)":9
+ *     if _dict is not None:
+ *         state += (_dict,)
+ *         use_setstate = True             # <<<<<<<<<<<<<<
+ *     else:
+ *         use_setstate = False
+ */
+    __pyx_v_use_setstate = 1;
+
+    /* "(tree fragment)":7
+ *     state = (self.current_i, self.current_index, self.current_offset, self.sizes)
+ *     _dict = getattr(self, '__dict__', None)
+ *     if _dict is not None:             # <<<<<<<<<<<<<<
+ *         state += (_dict,)
+ *         use_setstate = True
+ */
+    goto __pyx_L3;
+  }
+
+  /* "(tree fragment)":11
+ *         use_setstate = True
+ *     else:
+ *         use_setstate = False             # <<<<<<<<<<<<<<
+ *     if use_setstate:
+ *         return __pyx_unpickle_DatasetSearcher, (type(self), 0x8c67b45, None), state
+ */
+  /*else*/ {
+    __pyx_v_use_setstate = 0;
+  }
+  __pyx_L3:;
+
+  /* "(tree fragment)":12
+ *     else:
+ *         use_setstate = False
+ *     if use_setstate:             # <<<<<<<<<<<<<<
+ *         return __pyx_unpickle_DatasetSearcher, (type(self), 0x8c67b45, None), state
+ *     else:
+ */
+  if (__pyx_v_use_setstate) {
+
+    /* "(tree fragment)":13
+ *         use_setstate = False
+ *     if use_setstate:
+ *         return __pyx_unpickle_DatasetSearcher, (type(self), 0x8c67b45, None), state             # <<<<<<<<<<<<<<
+ *     else:
+ *         return __pyx_unpickle_DatasetSearcher, (type(self), 0x8c67b45, state)
+ */
+    __Pyx_XDECREF(__pyx_r);
+    __Pyx_GetModuleGlobalName(__pyx_t_4, __pyx_n_s_pyx_unpickle_DatasetSearcher); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 13, __pyx_L1_error) /* the unpickle function is looked up by name among the module globals */
+    __Pyx_GOTREF(__pyx_t_4);
+    __pyx_t_5 = PyTuple_New(3); if (unlikely(!__pyx_t_5)) __PYX_ERR(1, 13, __pyx_L1_error) /* inner tuple: (type(self), checksum, None) */
+    __Pyx_GOTREF(__pyx_t_5);
+    __Pyx_INCREF(((PyObject *)Py_TYPE(((PyObject *)__pyx_v_self))));
+    __Pyx_GIVEREF(((PyObject *)Py_TYPE(((PyObject *)__pyx_v_self))));
+    if (__Pyx_PyTuple_SET_ITEM(__pyx_t_5, 0, ((PyObject *)Py_TYPE(((PyObject *)__pyx_v_self))))) __PYX_ERR(1, 13, __pyx_L1_error);
+    __Pyx_INCREF(__pyx_int_147225413); /* 147225413 == 0x8c67b45, the checksum in the echoed source line */
+    __Pyx_GIVEREF(__pyx_int_147225413);
+    if (__Pyx_PyTuple_SET_ITEM(__pyx_t_5, 1, __pyx_int_147225413)) __PYX_ERR(1, 13, __pyx_L1_error);
+    __Pyx_INCREF(Py_None);
+    __Pyx_GIVEREF(Py_None);
+    if (__Pyx_PyTuple_SET_ITEM(__pyx_t_5, 2, Py_None)) __PYX_ERR(1, 13, __pyx_L1_error);
+    __pyx_t_3 = PyTuple_New(3); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 13, __pyx_L1_error) /* outer tuple: (unpickle function, inner tuple, state) */
+    __Pyx_GOTREF(__pyx_t_3);
+    __Pyx_GIVEREF(__pyx_t_4);
+    if (__Pyx_PyTuple_SET_ITEM(__pyx_t_3, 0, __pyx_t_4)) __PYX_ERR(1, 13, __pyx_L1_error);
+    __Pyx_GIVEREF(__pyx_t_5);
+    if (__Pyx_PyTuple_SET_ITEM(__pyx_t_3, 1, __pyx_t_5)) __PYX_ERR(1, 13, __pyx_L1_error);
+    __Pyx_INCREF(__pyx_v_state);
+    __Pyx_GIVEREF(__pyx_v_state);
+    if (__Pyx_PyTuple_SET_ITEM(__pyx_t_3, 2, __pyx_v_state)) __PYX_ERR(1, 13, __pyx_L1_error);
+    __pyx_t_4 = 0;
+    __pyx_t_5 = 0;
+    __pyx_r = __pyx_t_3; /* three-element result, with state passed separately from the constructor arguments */
+    __pyx_t_3 = 0;
+    goto __pyx_L0;
+
+    /* "(tree fragment)":12
+ *     else:
+ *         use_setstate = False
+ *     if use_setstate:             # <<<<<<<<<<<<<<
+ *         return __pyx_unpickle_DatasetSearcher, (type(self), 0x8c67b45, None), state
+ *     else:
+ */
+  }
+
+  /* "(tree fragment)":15
+ *         return __pyx_unpickle_DatasetSearcher, (type(self), 0x8c67b45, None), state
+ *     else:
+ *         return __pyx_unpickle_DatasetSearcher, (type(self), 0x8c67b45, state)             # <<<<<<<<<<<<<<
+ * def __setstate_cython__(self, __pyx_state):
+ *     __pyx_unpickle_DatasetSearcher__set_state(self, __pyx_state)
+ */
+  /*else*/ {
+    __Pyx_XDECREF(__pyx_r);
+    __Pyx_GetModuleGlobalName(__pyx_t_3, __pyx_n_s_pyx_unpickle_DatasetSearcher); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 15, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_3);
+    __pyx_t_5 = PyTuple_New(3); if (unlikely(!__pyx_t_5)) __PYX_ERR(1, 15, __pyx_L1_error) /* inner tuple: (type(self), checksum, state) */
+    __Pyx_GOTREF(__pyx_t_5);
+    __Pyx_INCREF(((PyObject *)Py_TYPE(((PyObject *)__pyx_v_self))));
+    __Pyx_GIVEREF(((PyObject *)Py_TYPE(((PyObject *)__pyx_v_self))));
+    if (__Pyx_PyTuple_SET_ITEM(__pyx_t_5, 0, ((PyObject *)Py_TYPE(((PyObject *)__pyx_v_self))))) __PYX_ERR(1, 15, __pyx_L1_error);
+    __Pyx_INCREF(__pyx_int_147225413); /* 147225413 == 0x8c67b45, the checksum in the echoed source line */
+    __Pyx_GIVEREF(__pyx_int_147225413);
+    if (__Pyx_PyTuple_SET_ITEM(__pyx_t_5, 1, __pyx_int_147225413)) __PYX_ERR(1, 15, __pyx_L1_error);
+    __Pyx_INCREF(__pyx_v_state);
+    __Pyx_GIVEREF(__pyx_v_state);
+    if (__Pyx_PyTuple_SET_ITEM(__pyx_t_5, 2, __pyx_v_state)) __PYX_ERR(1, 15, __pyx_L1_error);
+    __pyx_t_4 = PyTuple_New(2); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 15, __pyx_L1_error) /* outer tuple: (unpickle function, inner tuple) */
+    __Pyx_GOTREF(__pyx_t_4);
+    __Pyx_GIVEREF(__pyx_t_3);
+    if (__Pyx_PyTuple_SET_ITEM(__pyx_t_4, 0, __pyx_t_3)) __PYX_ERR(1, 15, __pyx_L1_error);
+    __Pyx_GIVEREF(__pyx_t_5);
+    if (__Pyx_PyTuple_SET_ITEM(__pyx_t_4, 1, __pyx_t_5)) __PYX_ERR(1, 15, __pyx_L1_error);
+    __pyx_t_3 = 0;
+    __pyx_t_5 = 0;
+    __pyx_r = __pyx_t_4; /* two-element result, with state carried inside the constructor arguments */
+    __pyx_t_4 = 0;
+    goto __pyx_L0;
+  }
+
+  /* "(tree fragment)":1
+ * def __reduce_cython__(self):             # <<<<<<<<<<<<<<
+ *     cdef tuple state
+ *     cdef object _dict
+ */
+
+  /* function exit code */
+  __pyx_L1_error:; /* error exit: drop whichever temporaries are still non-NULL */
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_XDECREF(__pyx_t_2);
+  __Pyx_XDECREF(__pyx_t_3);
+  __Pyx_XDECREF(__pyx_t_4);
+  __Pyx_XDECREF(__pyx_t_5);
+  __Pyx_AddTraceback("fairseq.data.token_block_utils_fast.DatasetSearcher.__reduce_cython__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __pyx_L0:; /* shared exit for success and error: release the local state and _dict references */
+  __Pyx_XDECREF(__pyx_v_state);
+  __Pyx_XDECREF(__pyx_v__dict);
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "(tree fragment)":16
+ *     else:
+ *         return __pyx_unpickle_DatasetSearcher, (type(self), 0x8c67b45, state)
+ * def __setstate_cython__(self, __pyx_state):             # <<<<<<<<<<<<<<
+ *     __pyx_unpickle_DatasetSearcher__set_state(self, __pyx_state)
+ */
+
+/* Python wrapper */
+static PyObject *__pyx_pw_7fairseq_4data_22token_block_utils_fast_15DatasetSearcher_5__setstate_cython__(PyObject *__pyx_v_self, 
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+); /*proto*/ /* declared ahead of the PyMethodDef below, which takes the wrapper's address */
+static PyMethodDef __pyx_mdef_7fairseq_4data_22token_block_utils_fast_15DatasetSearcher_5__setstate_cython__ = {"__setstate_cython__", (PyCFunction)(void*)(__Pyx_PyCFunction_FastCallWithKeywords)__pyx_pw_7fairseq_4data_22token_block_utils_fast_15DatasetSearcher_5__setstate_cython__, __Pyx_METH_FASTCALL|METH_KEYWORDS, 0}; /* method-table entry named "__setstate_cython__" pointing at the wrapper */
+static PyObject *__pyx_pw_7fairseq_4data_22token_block_utils_fast_15DatasetSearcher_5__setstate_cython__(PyObject *__pyx_v_self, 
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+) { /* Python-callable wrapper for __setstate_cython__: unpacks exactly one argument (__pyx_state), given positionally or by keyword, and forwards it to the __pyx_pf_ implementation. */
+  PyObject *__pyx_v___pyx_state = 0;
+  #if !CYTHON_METH_FASTCALL
+  CYTHON_UNUSED Py_ssize_t __pyx_nargs;
+  #endif
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject* values[1] = {0}; /* one slot per declared parameter */
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__setstate_cython__ (wrapper)", 0);
+  #if !CYTHON_METH_FASTCALL
+  #if CYTHON_ASSUME_SAFE_MACROS
+  __pyx_nargs = PyTuple_GET_SIZE(__pyx_args);
+  #else
+  __pyx_nargs = PyTuple_Size(__pyx_args); if (unlikely(__pyx_nargs < 0)) return NULL;
+  #endif
+  #endif
+  __pyx_kwvalues = __Pyx_KwValues_FASTCALL(__pyx_args, __pyx_nargs);
+  {
+    PyObject **__pyx_pyargnames[] = {&__pyx_n_s_pyx_state,0}; /* 0-terminated list of accepted keyword names */
+    if (__pyx_kwds) {
+      Py_ssize_t kw_args;
+      switch (__pyx_nargs) { /* first pass: copy whatever was passed positionally */
+        case  1: values[0] = __Pyx_Arg_FASTCALL(__pyx_args, 0);
+        CYTHON_FALLTHROUGH;
+        case  0: break;
+        default: goto __pyx_L5_argtuple_error;
+      }
+      kw_args = __Pyx_NumKwargs_FASTCALL(__pyx_kwds);
+      switch (__pyx_nargs) { /* second pass: with no positional argument, look __pyx_state up among the keywords */
+        case  0:
+        if (likely((values[0] = __Pyx_GetKwValue_FASTCALL(__pyx_kwds, __pyx_kwvalues, __pyx_n_s_pyx_state)) != 0)) {
+          (void)__Pyx_Arg_NewRef_FASTCALL(values[0]);
+          kw_args--;
+        }
+        else if (unlikely(PyErr_Occurred())) __PYX_ERR(1, 16, __pyx_L3_error)
+        else goto __pyx_L5_argtuple_error; /* not found and no error pending: report a wrong argument count */
+      }
+      if (unlikely(kw_args > 0)) { /* keywords not consumed above are passed to __Pyx_ParseOptionalKeywords */
+        const Py_ssize_t kwd_pos_args = __pyx_nargs;
+        if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_kwvalues, __pyx_pyargnames, 0, values + 0, kwd_pos_args, "__setstate_cython__") < 0)) __PYX_ERR(1, 16, __pyx_L3_error)
+      }
+    } else if (unlikely(__pyx_nargs != 1)) { /* no keywords: exactly one positional argument is required */
+      goto __pyx_L5_argtuple_error;
+    } else {
+      values[0] = __Pyx_Arg_FASTCALL(__pyx_args, 0);
+    }
+    __pyx_v___pyx_state = values[0];
+  }
+  goto __pyx_L6_skip;
+  __pyx_L5_argtuple_error:;
+  __Pyx_RaiseArgtupleInvalid("__setstate_cython__", 1, 1, 1, __pyx_nargs); __PYX_ERR(1, 16, __pyx_L3_error)
+  __pyx_L6_skip:;
+  goto __pyx_L4_argument_unpacking_done;
+  __pyx_L3_error:; /* unpacking failed: release every values[] slot, add the traceback entry and return NULL */
+  {
+    Py_ssize_t __pyx_temp;
+    for (__pyx_temp=0; __pyx_temp < (Py_ssize_t)(sizeof(values)/sizeof(values[0])); ++__pyx_temp) {
+      __Pyx_Arg_XDECREF_FASTCALL(values[__pyx_temp]);
+    }
+  }
+  __Pyx_AddTraceback("fairseq.data.token_block_utils_fast.DatasetSearcher.__setstate_cython__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __Pyx_RefNannyFinishContext();
+  return NULL;
+  __pyx_L4_argument_unpacking_done:;
+  __pyx_r = __pyx_pf_7fairseq_4data_22token_block_utils_fast_15DatasetSearcher_4__setstate_cython__(((struct __pyx_obj_7fairseq_4data_22token_block_utils_fast_DatasetSearcher *)__pyx_v_self), __pyx_v___pyx_state); /* the implementation's result, possibly NULL, is returned unchanged */
+
+  /* function exit code */
+  { /* the same values[] release loop as on the error path runs after the call */
+    Py_ssize_t __pyx_temp;
+    for (__pyx_temp=0; __pyx_temp < (Py_ssize_t)(sizeof(values)/sizeof(values[0])); ++__pyx_temp) {
+      __Pyx_Arg_XDECREF_FASTCALL(values[__pyx_temp]);
+    }
+  }
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static PyObject *__pyx_pf_7fairseq_4data_22token_block_utils_fast_15DatasetSearcher_4__setstate_cython__(struct __pyx_obj_7fairseq_4data_22token_block_utils_fast_DatasetSearcher *__pyx_v_self, PyObject *__pyx_v___pyx_state) { /* Implementation of __setstate_cython__: type-checks __pyx_state and passes it to __pyx_unpickle_DatasetSearcher__set_state. Returns a new reference to Py_None, or NULL on error. */
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("__setstate_cython__", 1);
+
+  /* "(tree fragment)":17
+ *         return __pyx_unpickle_DatasetSearcher, (type(self), 0x8c67b45, state)
+ * def __setstate_cython__(self, __pyx_state):
+ *     __pyx_unpickle_DatasetSearcher__set_state(self, __pyx_state)             # <<<<<<<<<<<<<<
+ */
+  if (!(likely(PyTuple_CheckExact(__pyx_v___pyx_state))||((__pyx_v___pyx_state) == Py_None) || __Pyx_RaiseUnexpectedTypeError("tuple", __pyx_v___pyx_state))) __PYX_ERR(1, 17, __pyx_L1_error) /* accepts an exact tuple or None; anything else goes through __Pyx_RaiseUnexpectedTypeError */
+  __pyx_t_1 = __pyx_f_7fairseq_4data_22token_block_utils_fast___pyx_unpickle_DatasetSearcher__set_state(__pyx_v_self, ((PyObject*)__pyx_v___pyx_state)); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 17, __pyx_L1_error) /* a NULL result from the helper is treated as failure */
+  __Pyx_GOTREF(__pyx_t_1);
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; /* the helper's return value is discarded */
+
+  /* "(tree fragment)":16
+ *     else:
+ *         return __pyx_unpickle_DatasetSearcher, (type(self), 0x8c67b45, state)
+ * def __setstate_cython__(self, __pyx_state):             # <<<<<<<<<<<<<<
+ *     __pyx_unpickle_DatasetSearcher__set_state(self, __pyx_state)
+ */
+
+  /* function exit code */
+  __pyx_r = Py_None; __Pyx_INCREF(Py_None); /* success: hand back a new reference to None */
+  goto __pyx_L0;
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_AddTraceback("fairseq.data.token_block_utils_fast.DatasetSearcher.__setstate_cython__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "(tree fragment)":1
+ * def __pyx_unpickle_DatasetSearcher(__pyx_type, long __pyx_checksum, __pyx_state):             # <<<<<<<<<<<<<<
+ *     cdef object __pyx_PickleError
+ *     cdef object __pyx_result
+ */
+
+/* Python wrapper */
+static PyObject *__pyx_pw_7fairseq_4data_22token_block_utils_fast_5__pyx_unpickle_DatasetSearcher(PyObject *__pyx_self, 
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+); /*proto*/ /* declared ahead of the PyMethodDef below, which takes the wrapper's address */
+static PyMethodDef __pyx_mdef_7fairseq_4data_22token_block_utils_fast_5__pyx_unpickle_DatasetSearcher = {"__pyx_unpickle_DatasetSearcher", (PyCFunction)(void*)(__Pyx_PyCFunction_FastCallWithKeywords)__pyx_pw_7fairseq_4data_22token_block_utils_fast_5__pyx_unpickle_DatasetSearcher, __Pyx_METH_FASTCALL|METH_KEYWORDS, 0}; /* method-table entry named "__pyx_unpickle_DatasetSearcher" pointing at the wrapper */
+static PyObject *__pyx_pw_7fairseq_4data_22token_block_utils_fast_5__pyx_unpickle_DatasetSearcher(PyObject *__pyx_self, 
+#if CYTHON_METH_FASTCALL
+PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
+#else
+PyObject *__pyx_args, PyObject *__pyx_kwds
+#endif
+) { /* Python-callable wrapper for __pyx_unpickle_DatasetSearcher: unpacks (__pyx_type, __pyx_checksum, __pyx_state) from positional and/or keyword arguments, converts the checksum to a C long and calls the __pyx_pf_ implementation. */
+  PyObject *__pyx_v___pyx_type = 0;
+  long __pyx_v___pyx_checksum;
+  PyObject *__pyx_v___pyx_state = 0;
+  #if !CYTHON_METH_FASTCALL
+  CYTHON_UNUSED Py_ssize_t __pyx_nargs;
+  #endif
+  CYTHON_UNUSED PyObject *const *__pyx_kwvalues;
+  PyObject* values[3] = {0,0,0}; /* one slot per declared parameter, in declaration order */
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__pyx_unpickle_DatasetSearcher (wrapper)", 0);
+  #if !CYTHON_METH_FASTCALL
+  #if CYTHON_ASSUME_SAFE_MACROS
+  __pyx_nargs = PyTuple_GET_SIZE(__pyx_args);
+  #else
+  __pyx_nargs = PyTuple_Size(__pyx_args); if (unlikely(__pyx_nargs < 0)) return NULL;
+  #endif
+  #endif
+  __pyx_kwvalues = __Pyx_KwValues_FASTCALL(__pyx_args, __pyx_nargs);
+  {
+    PyObject **__pyx_pyargnames[] = {&__pyx_n_s_pyx_type,&__pyx_n_s_pyx_checksum,&__pyx_n_s_pyx_state,0}; /* 0-terminated list of accepted keyword names */
+    if (__pyx_kwds) {
+      Py_ssize_t kw_args;
+      switch (__pyx_nargs) { /* first pass: the fallthrough copies every positional argument that was supplied */
+        case  3: values[2] = __Pyx_Arg_FASTCALL(__pyx_args, 2);
+        CYTHON_FALLTHROUGH;
+        case  2: values[1] = __Pyx_Arg_FASTCALL(__pyx_args, 1);
+        CYTHON_FALLTHROUGH;
+        case  1: values[0] = __Pyx_Arg_FASTCALL(__pyx_args, 0);
+        CYTHON_FALLTHROUGH;
+        case  0: break;
+        default: goto __pyx_L5_argtuple_error;
+      }
+      kw_args = __Pyx_NumKwargs_FASTCALL(__pyx_kwds);
+      switch (__pyx_nargs) { /* second pass: entering at the positional count, each remaining parameter is looked up by keyword */
+        case  0:
+        if (likely((values[0] = __Pyx_GetKwValue_FASTCALL(__pyx_kwds, __pyx_kwvalues, __pyx_n_s_pyx_type)) != 0)) {
+          (void)__Pyx_Arg_NewRef_FASTCALL(values[0]);
+          kw_args--;
+        }
+        else if (unlikely(PyErr_Occurred())) __PYX_ERR(1, 1, __pyx_L3_error)
+        else goto __pyx_L5_argtuple_error;
+        CYTHON_FALLTHROUGH;
+        case  1:
+        if (likely((values[1] = __Pyx_GetKwValue_FASTCALL(__pyx_kwds, __pyx_kwvalues, __pyx_n_s_pyx_checksum)) != 0)) {
+          (void)__Pyx_Arg_NewRef_FASTCALL(values[1]);
+          kw_args--;
+        }
+        else if (unlikely(PyErr_Occurred())) __PYX_ERR(1, 1, __pyx_L3_error)
+        else {
+          __Pyx_RaiseArgtupleInvalid("__pyx_unpickle_DatasetSearcher", 1, 3, 3, 1); __PYX_ERR(1, 1, __pyx_L3_error) /* __pyx_checksum missing: reported with 1 as the argument count */
+        }
+        CYTHON_FALLTHROUGH;
+        case  2:
+        if (likely((values[2] = __Pyx_GetKwValue_FASTCALL(__pyx_kwds, __pyx_kwvalues, __pyx_n_s_pyx_state)) != 0)) {
+          (void)__Pyx_Arg_NewRef_FASTCALL(values[2]);
+          kw_args--;
+        }
+        else if (unlikely(PyErr_Occurred())) __PYX_ERR(1, 1, __pyx_L3_error)
+        else {
+          __Pyx_RaiseArgtupleInvalid("__pyx_unpickle_DatasetSearcher", 1, 3, 3, 2); __PYX_ERR(1, 1, __pyx_L3_error) /* __pyx_state missing: reported with 2 as the argument count */
+        }
+      }
+      if (unlikely(kw_args > 0)) { /* keywords not consumed above are passed to __Pyx_ParseOptionalKeywords */
+        const Py_ssize_t kwd_pos_args = __pyx_nargs;
+        if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_kwvalues, __pyx_pyargnames, 0, values + 0, kwd_pos_args, "__pyx_unpickle_DatasetSearcher") < 0)) __PYX_ERR(1, 1, __pyx_L3_error)
+      }
+    } else if (unlikely(__pyx_nargs != 3)) { /* no keywords: exactly three positional arguments are required */
+      goto __pyx_L5_argtuple_error;
+    } else {
+      values[0] = __Pyx_Arg_FASTCALL(__pyx_args, 0);
+      values[1] = __Pyx_Arg_FASTCALL(__pyx_args, 1);
+      values[2] = __Pyx_Arg_FASTCALL(__pyx_args, 2);
+    }
+    __pyx_v___pyx_type = values[0];
+    __pyx_v___pyx_checksum = __Pyx_PyInt_As_long(values[1]); if (unlikely((__pyx_v___pyx_checksum == (long)-1) && PyErr_Occurred())) __PYX_ERR(1, 1, __pyx_L3_error) /* -1 counts as a conversion failure only when an exception is also pending */
+    __pyx_v___pyx_state = values[2];
+  }
+  goto __pyx_L6_skip;
+  __pyx_L5_argtuple_error:;
+  __Pyx_RaiseArgtupleInvalid("__pyx_unpickle_DatasetSearcher", 1, 3, 3, __pyx_nargs); __PYX_ERR(1, 1, __pyx_L3_error)
+  __pyx_L6_skip:;
+  goto __pyx_L4_argument_unpacking_done;
+  __pyx_L3_error:; /* unpacking failed: release every values[] slot, add the traceback entry and return NULL */
+  {
+    Py_ssize_t __pyx_temp;
+    for (__pyx_temp=0; __pyx_temp < (Py_ssize_t)(sizeof(values)/sizeof(values[0])); ++__pyx_temp) {
+      __Pyx_Arg_XDECREF_FASTCALL(values[__pyx_temp]);
+    }
+  }
+  __Pyx_AddTraceback("fairseq.data.token_block_utils_fast.__pyx_unpickle_DatasetSearcher", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __Pyx_RefNannyFinishContext();
+  return NULL;
+  __pyx_L4_argument_unpacking_done:;
+  __pyx_r = __pyx_pf_7fairseq_4data_22token_block_utils_fast_4__pyx_unpickle_DatasetSearcher(__pyx_self, __pyx_v___pyx_type, __pyx_v___pyx_checksum, __pyx_v___pyx_state); /* the implementation's result, possibly NULL, is returned unchanged */
+
+  /* function exit code */
+  { /* the same values[] release loop as on the error path runs after the call */
+    Py_ssize_t __pyx_temp;
+    for (__pyx_temp=0; __pyx_temp < (Py_ssize_t)(sizeof(values)/sizeof(values[0])); ++__pyx_temp) {
+      __Pyx_Arg_XDECREF_FASTCALL(values[__pyx_temp]);
+    }
+  }
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static PyObject *__pyx_pf_7fairseq_4data_22token_block_utils_fast_4__pyx_unpickle_DatasetSearcher(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v___pyx_type, long __pyx_v___pyx_checksum, PyObject *__pyx_v___pyx_state) {
+  PyObject *__pyx_v___pyx_PickleError = 0;
+  PyObject *__pyx_v___pyx_result = 0;
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  int __pyx_t_2;
+  PyObject *__pyx_t_3 = NULL;
+  PyObject *__pyx_t_4 = NULL;
+  unsigned int __pyx_t_5;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("__pyx_unpickle_DatasetSearcher", 1);
+
+  /* "(tree fragment)":4
+ *     cdef object __pyx_PickleError
+ *     cdef object __pyx_result
+ *     if __pyx_checksum not in (0x8c67b45, 0x2e2dd22, 0x6632805):             # <<<<<<<<<<<<<<
+ *         from pickle import PickleError as __pyx_PickleError
+ *         raise __pyx_PickleError, "Incompatible checksums (0x%x vs (0x8c67b45, 0x2e2dd22, 0x6632805) = (current_i, current_index, current_offset, sizes))" % __pyx_checksum
+ *
+ * Function summary: C body of the Python-level helper
+ * __pyx_unpickle_DatasetSearcher(__pyx_type, __pyx_checksum, __pyx_state).
+ *   1. Raises pickle.PickleError when the checksum is not contained in
+ *      __pyx_tuple__14 (presumably the three constants shown above; the
+ *      tuple is built elsewhere in the file).
+ *   2. Calls DatasetSearcher.__new__(__pyx_type) to create the instance.
+ *   3. If __pyx_state is not None, restores the fields through
+ *      __pyx_unpickle_DatasetSearcher__set_state().
+ * Returns a new reference to the instance, or NULL with a traceback entry
+ * added when any step fails.
+ */
+  __pyx_t_1 = __Pyx_PyInt_From_long(__pyx_v___pyx_checksum); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 4, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_t_2 = (__Pyx_PySequence_ContainsTF(__pyx_t_1, __pyx_tuple__14, Py_NE)); if (unlikely((__pyx_t_2 < 0))) __PYX_ERR(1, 4, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+  if (__pyx_t_2) {
+
+    /* "(tree fragment)":5
+ *     cdef object __pyx_result
+ *     if __pyx_checksum not in (0x8c67b45, 0x2e2dd22, 0x6632805):
+ *         from pickle import PickleError as __pyx_PickleError             # <<<<<<<<<<<<<<
+ *         raise __pyx_PickleError, "Incompatible checksums (0x%x vs (0x8c67b45, 0x2e2dd22, 0x6632805) = (current_i, current_index, current_offset, sizes))" % __pyx_checksum
+ *     __pyx_result = DatasetSearcher.__new__(__pyx_type)
+ *
+ * The import happens only on this mismatch branch. The one-element list
+ * built below is the "from-list" passed to __Pyx_Import.
+ */
+    __pyx_t_1 = PyList_New(1); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 5, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_1);
+    __Pyx_INCREF(__pyx_n_s_PickleError);
+    __Pyx_GIVEREF(__pyx_n_s_PickleError);
+    if (__Pyx_PyList_SET_ITEM(__pyx_t_1, 0, __pyx_n_s_PickleError)) __PYX_ERR(1, 5, __pyx_L1_error);
+    __pyx_t_3 = __Pyx_Import(__pyx_n_s_pickle, __pyx_t_1, 0); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 5, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_3);
+    __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+    __pyx_t_1 = __Pyx_ImportFrom(__pyx_t_3, __pyx_n_s_PickleError); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 5, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_1);
+    __Pyx_INCREF(__pyx_t_1);
+    __pyx_v___pyx_PickleError = __pyx_t_1;
+    __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+    __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+
+    /* "(tree fragment)":6
+ *     if __pyx_checksum not in (0x8c67b45, 0x2e2dd22, 0x6632805):
+ *         from pickle import PickleError as __pyx_PickleError
+ *         raise __pyx_PickleError, "Incompatible checksums (0x%x vs (0x8c67b45, 0x2e2dd22, 0x6632805) = (current_i, current_index, current_offset, sizes))" % __pyx_checksum             # <<<<<<<<<<<<<<
+ *     __pyx_result = DatasetSearcher.__new__(__pyx_type)
+ *     if __pyx_state is not None:
+ *
+ * The __PYX_ERR that follows __Pyx_Raise is not guarded by any condition,
+ * so this branch always leaves through the error label.
+ */
+    __pyx_t_3 = __Pyx_PyInt_From_long(__pyx_v___pyx_checksum); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 6, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_3);
+    __pyx_t_1 = __Pyx_PyString_Format(__pyx_kp_s_Incompatible_checksums_0x_x_vs_0_2, __pyx_t_3); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 6, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_1);
+    __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+    __Pyx_Raise(__pyx_v___pyx_PickleError, __pyx_t_1, 0, 0);
+    __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+    __PYX_ERR(1, 6, __pyx_L1_error)
+
+    /* "(tree fragment)":4
+ *     cdef object __pyx_PickleError
+ *     cdef object __pyx_result
+ *     if __pyx_checksum not in (0x8c67b45, 0x2e2dd22, 0x6632805):             # <<<<<<<<<<<<<<
+ *         from pickle import PickleError as __pyx_PickleError
+ *         raise __pyx_PickleError, "Incompatible checksums (0x%x vs (0x8c67b45, 0x2e2dd22, 0x6632805) = (current_i, current_index, current_offset, sizes))" % __pyx_checksum
+ */
+  }
+
+  /* "(tree fragment)":7
+ *         from pickle import PickleError as __pyx_PickleError
+ *         raise __pyx_PickleError, "Incompatible checksums (0x%x vs (0x8c67b45, 0x2e2dd22, 0x6632805) = (current_i, current_index, current_offset, sizes))" % __pyx_checksum
+ *     __pyx_result = DatasetSearcher.__new__(__pyx_type)             # <<<<<<<<<<<<<<
+ *     if __pyx_state is not None:
+ *         __pyx_unpickle_DatasetSearcher__set_state(<DatasetSearcher> __pyx_result, __pyx_state)
+ *
+ * __new__ is looked up on the DatasetSearcher type object and called with
+ * __pyx_type as its only argument. When the looked-up attribute is a bound
+ * method, it is unpacked into (function, self) and __pyx_t_5 becomes 1 so
+ * that self is passed as an extra leading argument to the fast call.
+ */
+  __pyx_t_3 = __Pyx_PyObject_GetAttrStr(((PyObject *)__pyx_ptype_7fairseq_4data_22token_block_utils_fast_DatasetSearcher), __pyx_n_s_new); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 7, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_3);
+  __pyx_t_4 = NULL;
+  __pyx_t_5 = 0;
+  #if CYTHON_UNPACK_METHODS
+  if (likely(PyMethod_Check(__pyx_t_3))) {
+    __pyx_t_4 = PyMethod_GET_SELF(__pyx_t_3);
+    if (likely(__pyx_t_4)) {
+      PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_3);
+      __Pyx_INCREF(__pyx_t_4);
+      __Pyx_INCREF(function);
+      __Pyx_DECREF_SET(__pyx_t_3, function);
+      __pyx_t_5 = 1;
+    }
+  }
+  #endif
+  {
+    PyObject *__pyx_callargs[2] = {__pyx_t_4, __pyx_v___pyx_type};
+    __pyx_t_1 = __Pyx_PyObject_FastCall(__pyx_t_3, __pyx_callargs+1-__pyx_t_5, 1+__pyx_t_5);
+    __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0;
+    if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 7, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_1);
+    __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+  }
+  __pyx_v___pyx_result = __pyx_t_1;
+  __pyx_t_1 = 0;
+
+  /* "(tree fragment)":8
+ *         raise __pyx_PickleError, "Incompatible checksums (0x%x vs (0x8c67b45, 0x2e2dd22, 0x6632805) = (current_i, current_index, current_offset, sizes))" % __pyx_checksum
+ *     __pyx_result = DatasetSearcher.__new__(__pyx_type)
+ *     if __pyx_state is not None:             # <<<<<<<<<<<<<<
+ *         __pyx_unpickle_DatasetSearcher__set_state(<DatasetSearcher> __pyx_result, __pyx_state)
+ *     return __pyx_result
+ */
+  __pyx_t_2 = (__pyx_v___pyx_state != Py_None);
+  if (__pyx_t_2) {
+
+    /* "(tree fragment)":9
+ *     __pyx_result = DatasetSearcher.__new__(__pyx_type)
+ *     if __pyx_state is not None:
+ *         __pyx_unpickle_DatasetSearcher__set_state(<DatasetSearcher> __pyx_result, __pyx_state)             # <<<<<<<<<<<<<<
+ *     return __pyx_result
+ * cdef __pyx_unpickle_DatasetSearcher__set_state(DatasetSearcher __pyx_result, tuple __pyx_state):
+ *
+ * The state must be an exact tuple (or None); any other type raises via
+ * __Pyx_RaiseUnexpectedTypeError("tuple", ...). __pyx_result is cast to the
+ * DatasetSearcher struct without a runtime type check at this point.
+ */
+    if (!(likely(PyTuple_CheckExact(__pyx_v___pyx_state))||((__pyx_v___pyx_state) == Py_None) || __Pyx_RaiseUnexpectedTypeError("tuple", __pyx_v___pyx_state))) __PYX_ERR(1, 9, __pyx_L1_error)
+    __pyx_t_1 = __pyx_f_7fairseq_4data_22token_block_utils_fast___pyx_unpickle_DatasetSearcher__set_state(((struct __pyx_obj_7fairseq_4data_22token_block_utils_fast_DatasetSearcher *)__pyx_v___pyx_result), ((PyObject*)__pyx_v___pyx_state)); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 9, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_1);
+    __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+
+    /* "(tree fragment)":8
+ *         raise __pyx_PickleError, "Incompatible checksums (0x%x vs (0x8c67b45, 0x2e2dd22, 0x6632805) = (current_i, current_index, current_offset, sizes))" % __pyx_checksum
+ *     __pyx_result = DatasetSearcher.__new__(__pyx_type)
+ *     if __pyx_state is not None:             # <<<<<<<<<<<<<<
+ *         __pyx_unpickle_DatasetSearcher__set_state(<DatasetSearcher> __pyx_result, __pyx_state)
+ *     return __pyx_result
+ */
+  }
+
+  /* "(tree fragment)":10
+ *     if __pyx_state is not None:
+ *         __pyx_unpickle_DatasetSearcher__set_state(<DatasetSearcher> __pyx_result, __pyx_state)
+ *     return __pyx_result             # <<<<<<<<<<<<<<
+ * cdef __pyx_unpickle_DatasetSearcher__set_state(DatasetSearcher __pyx_result, tuple __pyx_state):
+ *     __pyx_result.current_i = __pyx_state[0]; __pyx_result.current_index = __pyx_state[1]; __pyx_result.current_offset = __pyx_state[2]; __pyx_result.sizes = __pyx_state[3]
+ *
+ * The return value gets its own reference here because the local
+ * __pyx_v___pyx_result is released at the shared exit label below.
+ */
+  __Pyx_XDECREF(__pyx_r);
+  __Pyx_INCREF(__pyx_v___pyx_result);
+  __pyx_r = __pyx_v___pyx_result;
+  goto __pyx_L0;
+
+  /* "(tree fragment)":1
+ * def __pyx_unpickle_DatasetSearcher(__pyx_type, long __pyx_checksum, __pyx_state):             # <<<<<<<<<<<<<<
+ *     cdef object __pyx_PickleError
+ *     cdef object __pyx_result
+ */
+
+  /* function exit code */
+  __pyx_L1_error:; /* error path: release temporaries, add a traceback entry, return NULL */
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_XDECREF(__pyx_t_3);
+  __Pyx_XDECREF(__pyx_t_4);
+  __Pyx_AddTraceback("fairseq.data.token_block_utils_fast.__pyx_unpickle_DatasetSearcher", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __pyx_L0:; /* shared exit: reached by the success goto and by fall-through from the error path */
+  __Pyx_XDECREF(__pyx_v___pyx_PickleError);
+  __Pyx_XDECREF(__pyx_v___pyx_result);
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "(tree fragment)":11
+ *         __pyx_unpickle_DatasetSearcher__set_state(<DatasetSearcher> __pyx_result, __pyx_state)
+ *     return __pyx_result
+ * cdef __pyx_unpickle_DatasetSearcher__set_state(DatasetSearcher __pyx_result, tuple __pyx_state):             # <<<<<<<<<<<<<<
+ *     __pyx_result.current_i = __pyx_state[0]; __pyx_result.current_index = __pyx_state[1]; __pyx_result.current_offset = __pyx_state[2]; __pyx_result.sizes = __pyx_state[3]
+ *     if len(__pyx_state) > 4 and hasattr(__pyx_result, '__dict__'):
+ *
+ * Function summary: copies a pickled state tuple into a DatasetSearcher.
+ *   - items 0..2 are converted with __Pyx_PyInt_As_npy_int64 and stored in
+ *     current_i, current_index and current_offset;
+ *   - item 3 is converted to a memoryview slice requested with
+ *     PyBUF_WRITABLE and stored in sizes;
+ *   - when the tuple has more than 4 items and the instance has a __dict__,
+ *     item 4 is passed to __dict__.update().
+ * Returns a new reference to None on success and 0 (NULL) on failure.
+ * Fields are written one after another, so a failure part-way through
+ * leaves the earlier fields already overwritten.
+ */
+
+static PyObject *__pyx_f_7fairseq_4data_22token_block_utils_fast___pyx_unpickle_DatasetSearcher__set_state(struct __pyx_obj_7fairseq_4data_22token_block_utils_fast_DatasetSearcher *__pyx_v___pyx_result, PyObject *__pyx_v___pyx_state) {
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  __pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t __pyx_t_2;
+  __Pyx_memviewslice __pyx_t_3 = { 0, 0, { 0 }, { 0 }, { 0 } };
+  int __pyx_t_4;
+  Py_ssize_t __pyx_t_5;
+  int __pyx_t_6;
+  PyObject *__pyx_t_7 = NULL;
+  PyObject *__pyx_t_8 = NULL;
+  PyObject *__pyx_t_9 = NULL;
+  unsigned int __pyx_t_10;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("__pyx_unpickle_DatasetSearcher__set_state", 1);
+
+  /* "(tree fragment)":12
+ *     return __pyx_result
+ * cdef __pyx_unpickle_DatasetSearcher__set_state(DatasetSearcher __pyx_result, tuple __pyx_state):
+ *     __pyx_result.current_i = __pyx_state[0]; __pyx_result.current_index = __pyx_state[1]; __pyx_result.current_offset = __pyx_state[2]; __pyx_result.sizes = __pyx_state[3]             # <<<<<<<<<<<<<<
+ *     if len(__pyx_state) > 4 and hasattr(__pyx_result, '__dict__'):
+ *         __pyx_result.__dict__.update(__pyx_state[4])
+ *
+ * The None check is repeated before every subscript because the parameter
+ * is declared as "tuple", which also admits None.
+ */
+  if (unlikely(__pyx_v___pyx_state == Py_None)) {
+    PyErr_SetString(PyExc_TypeError, "'NoneType' object is not subscriptable");
+    __PYX_ERR(1, 12, __pyx_L1_error)
+  }
+  __pyx_t_1 = __Pyx_GetItemInt_Tuple(__pyx_v___pyx_state, 0, long, 1, __Pyx_PyInt_From_long, 0, 0, 1); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 12, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_t_2 = __Pyx_PyInt_As_npy_int64(__pyx_t_1); if (unlikely((__pyx_t_2 == ((npy_int64)-1)) && PyErr_Occurred())) __PYX_ERR(1, 12, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+  __pyx_v___pyx_result->current_i = __pyx_t_2;
+  if (unlikely(__pyx_v___pyx_state == Py_None)) {
+    PyErr_SetString(PyExc_TypeError, "'NoneType' object is not subscriptable");
+    __PYX_ERR(1, 12, __pyx_L1_error)
+  }
+  __pyx_t_1 = __Pyx_GetItemInt_Tuple(__pyx_v___pyx_state, 1, long, 1, __Pyx_PyInt_From_long, 0, 0, 1); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 12, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_t_2 = __Pyx_PyInt_As_npy_int64(__pyx_t_1); if (unlikely((__pyx_t_2 == ((npy_int64)-1)) && PyErr_Occurred())) __PYX_ERR(1, 12, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+  __pyx_v___pyx_result->current_index = __pyx_t_2;
+  if (unlikely(__pyx_v___pyx_state == Py_None)) {
+    PyErr_SetString(PyExc_TypeError, "'NoneType' object is not subscriptable");
+    __PYX_ERR(1, 12, __pyx_L1_error)
+  }
+  __pyx_t_1 = __Pyx_GetItemInt_Tuple(__pyx_v___pyx_state, 2, long, 1, __Pyx_PyInt_From_long, 0, 0, 1); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 12, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_t_2 = __Pyx_PyInt_As_npy_int64(__pyx_t_1); if (unlikely((__pyx_t_2 == ((npy_int64)-1)) && PyErr_Occurred())) __PYX_ERR(1, 12, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+  __pyx_v___pyx_result->current_offset = __pyx_t_2;
+  if (unlikely(__pyx_v___pyx_state == Py_None)) {
+    PyErr_SetString(PyExc_TypeError, "'NoneType' object is not subscriptable");
+    __PYX_ERR(1, 12, __pyx_L1_error)
+  }
+  __pyx_t_1 = __Pyx_GetItemInt_Tuple(__pyx_v___pyx_state, 3, long, 1, __Pyx_PyInt_From_long, 0, 0, 1); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 12, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_t_3 = __Pyx_PyObject_to_MemoryviewSlice_ds_nn___pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t(__pyx_t_1, PyBUF_WRITABLE); if (unlikely(!__pyx_t_3.memview)) __PYX_ERR(1, 12, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+  __PYX_XCLEAR_MEMVIEW(&__pyx_v___pyx_result->sizes, 0); /* drop whatever slice the instance held before */
+  __pyx_v___pyx_result->sizes = __pyx_t_3;
+  __pyx_t_3.memview = NULL; /* slice now belongs to the instance; presumably this makes the error-path clear of __pyx_t_3 a no-op */
+  __pyx_t_3.data = NULL;
+
+  /* "(tree fragment)":13
+ * cdef __pyx_unpickle_DatasetSearcher__set_state(DatasetSearcher __pyx_result, tuple __pyx_state):
+ *     __pyx_result.current_i = __pyx_state[0]; __pyx_result.current_index = __pyx_state[1]; __pyx_result.current_offset = __pyx_state[2]; __pyx_result.sizes = __pyx_state[3]
+ *     if len(__pyx_state) > 4 and hasattr(__pyx_result, '__dict__'):             # <<<<<<<<<<<<<<
+ *         __pyx_result.__dict__.update(__pyx_state[4])
+ *
+ * Short-circuit "and": __Pyx_HasAttr is evaluated only when the length
+ * test succeeds; otherwise control jumps straight to the bool_binop label.
+ */
+  if (unlikely(__pyx_v___pyx_state == Py_None)) {
+    PyErr_SetString(PyExc_TypeError, "object of type 'NoneType' has no len()");
+    __PYX_ERR(1, 13, __pyx_L1_error)
+  }
+  __pyx_t_5 = __Pyx_PyTuple_GET_SIZE(__pyx_v___pyx_state); if (unlikely(__pyx_t_5 == ((Py_ssize_t)-1))) __PYX_ERR(1, 13, __pyx_L1_error)
+  __pyx_t_6 = (__pyx_t_5 > 4);
+  if (__pyx_t_6) {
+  } else {
+    __pyx_t_4 = __pyx_t_6;
+    goto __pyx_L4_bool_binop_done;
+  }
+  __pyx_t_6 = __Pyx_HasAttr(((PyObject *)__pyx_v___pyx_result), __pyx_n_s_dict); if (unlikely(__pyx_t_6 == ((int)-1))) __PYX_ERR(1, 13, __pyx_L1_error)
+  __pyx_t_4 = __pyx_t_6;
+  __pyx_L4_bool_binop_done:;
+  if (__pyx_t_4) {
+
+    /* "(tree fragment)":14
+ *     __pyx_result.current_i = __pyx_state[0]; __pyx_result.current_index = __pyx_state[1]; __pyx_result.current_offset = __pyx_state[2]; __pyx_result.sizes = __pyx_state[3]
+ *     if len(__pyx_state) > 4 and hasattr(__pyx_result, '__dict__'):
+ *         __pyx_result.__dict__.update(__pyx_state[4])             # <<<<<<<<<<<<<<
+ *
+ * Same bound-method unpacking as other generated call sites: when "update"
+ * is a bound method, its self is passed as an extra leading argument.
+ */
+    __pyx_t_7 = __Pyx_PyObject_GetAttrStr(((PyObject *)__pyx_v___pyx_result), __pyx_n_s_dict); if (unlikely(!__pyx_t_7)) __PYX_ERR(1, 14, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_7);
+    __pyx_t_8 = __Pyx_PyObject_GetAttrStr(__pyx_t_7, __pyx_n_s_update); if (unlikely(!__pyx_t_8)) __PYX_ERR(1, 14, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_8);
+    __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0;
+    if (unlikely(__pyx_v___pyx_state == Py_None)) {
+      PyErr_SetString(PyExc_TypeError, "'NoneType' object is not subscriptable");
+      __PYX_ERR(1, 14, __pyx_L1_error)
+    }
+    __pyx_t_7 = __Pyx_GetItemInt_Tuple(__pyx_v___pyx_state, 4, long, 1, __Pyx_PyInt_From_long, 0, 0, 1); if (unlikely(!__pyx_t_7)) __PYX_ERR(1, 14, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_7);
+    __pyx_t_9 = NULL;
+    __pyx_t_10 = 0;
+    #if CYTHON_UNPACK_METHODS
+    if (likely(PyMethod_Check(__pyx_t_8))) {
+      __pyx_t_9 = PyMethod_GET_SELF(__pyx_t_8);
+      if (likely(__pyx_t_9)) {
+        PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_8);
+        __Pyx_INCREF(__pyx_t_9);
+        __Pyx_INCREF(function);
+        __Pyx_DECREF_SET(__pyx_t_8, function);
+        __pyx_t_10 = 1;
+      }
+    }
+    #endif
+    {
+      PyObject *__pyx_callargs[2] = {__pyx_t_9, __pyx_t_7};
+      __pyx_t_1 = __Pyx_PyObject_FastCall(__pyx_t_8, __pyx_callargs+1-__pyx_t_10, 1+__pyx_t_10);
+      __Pyx_XDECREF(__pyx_t_9); __pyx_t_9 = 0;
+      __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0;
+      if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 14, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_1);
+      __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0;
+    }
+    __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+
+    /* "(tree fragment)":13
+ * cdef __pyx_unpickle_DatasetSearcher__set_state(DatasetSearcher __pyx_result, tuple __pyx_state):
+ *     __pyx_result.current_i = __pyx_state[0]; __pyx_result.current_index = __pyx_state[1]; __pyx_result.current_offset = __pyx_state[2]; __pyx_result.sizes = __pyx_state[3]
+ *     if len(__pyx_state) > 4 and hasattr(__pyx_result, '__dict__'):             # <<<<<<<<<<<<<<
+ *         __pyx_result.__dict__.update(__pyx_state[4])
+ */
+  }
+
+  /* "(tree fragment)":11
+ *         __pyx_unpickle_DatasetSearcher__set_state(<DatasetSearcher> __pyx_result, __pyx_state)
+ *     return __pyx_result
+ * cdef __pyx_unpickle_DatasetSearcher__set_state(DatasetSearcher __pyx_result, tuple __pyx_state):             # <<<<<<<<<<<<<<
+ *     __pyx_result.current_i = __pyx_state[0]; __pyx_result.current_index = __pyx_state[1]; __pyx_result.current_offset = __pyx_state[2]; __pyx_result.sizes = __pyx_state[3]
+ *     if len(__pyx_state) > 4 and hasattr(__pyx_result, '__dict__'):
+ */
+
+  /* function exit code */
+  __pyx_r = Py_None; __Pyx_INCREF(Py_None); /* success: return a new reference to None */
+  goto __pyx_L0;
+  __pyx_L1_error:; /* error path: release every live temporary, add a traceback entry, return NULL */
+  __Pyx_XDECREF(__pyx_t_1);
+  __PYX_XCLEAR_MEMVIEW(&__pyx_t_3, 1);
+  __Pyx_XDECREF(__pyx_t_7);
+  __Pyx_XDECREF(__pyx_t_8);
+  __Pyx_XDECREF(__pyx_t_9);
+  __Pyx_AddTraceback("fairseq.data.token_block_utils_fast.__pyx_unpickle_DatasetSearcher__set_state", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = 0;
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+static struct __pyx_vtabstruct_7fairseq_4data_22token_block_utils_fast_DatasetSearcher __pyx_vtable_7fairseq_4data_22token_block_utils_fast_DatasetSearcher; /* file-scope storage for the DatasetSearcher vtable struct; presumably populated during module init (not visible in this section) */
+
+/* tp_new slot for DatasetSearcher.
+ *
+ * Allocates an instance of type `t`, installs the vtable pointer and resets
+ * the `sizes` memoryview slice to an empty state (NULL data / NULL memview).
+ * The positional (`a`) and keyword (`k`) arguments are ignored here.
+ *
+ * Returns a new reference to the object, or NULL (0) when allocation fails.
+ */
+static PyObject *__pyx_tp_new_7fairseq_4data_22token_block_utils_fast_DatasetSearcher(PyTypeObject *t, CYTHON_UNUSED PyObject *a, CYTHON_UNUSED PyObject *k) {
+  struct __pyx_obj_7fairseq_4data_22token_block_utils_fast_DatasetSearcher *p;
+  PyObject *o;
+  #if CYTHON_COMPILING_IN_LIMITED_API
+  allocfunc alloc_func = (allocfunc)PyType_GetSlot(t, Py_tp_alloc);
+  o = alloc_func(t, 0);
+  #else
+  if (likely(!__Pyx_PyType_HasFeature(t, Py_TPFLAGS_IS_ABSTRACT))) {
+    o = (*t->tp_alloc)(t, 0);
+  } else {
+    o = (PyObject *) PyBaseObject_Type.tp_new(t, __pyx_empty_tuple, 0);
+  }
+  #endif
+  /* Allocation can fail on either build path, so the NULL check must sit
+   * outside the conditional; previously the Limited-API path dereferenced
+   * `o` without checking it. */
+  if (unlikely(!o)) return 0;
+  p = ((struct __pyx_obj_7fairseq_4data_22token_block_utils_fast_DatasetSearcher *)o);
+  p->__pyx_vtab = __pyx_vtabptr_7fairseq_4data_22token_block_utils_fast_DatasetSearcher;
+  p->sizes.data = NULL;
+  p->sizes.memview = NULL;
+  return o;
+}
+
+static void __pyx_tp_dealloc_7fairseq_4data_22token_block_utils_fast_DatasetSearcher(PyObject *o) { /* tp_dealloc slot for DatasetSearcher: optional finalizer, release the sizes slice, then free the object */
+  struct __pyx_obj_7fairseq_4data_22token_block_utils_fast_DatasetSearcher *p = (struct __pyx_obj_7fairseq_4data_22token_block_utils_fast_DatasetSearcher *)o;
+  #if CYTHON_USE_TP_FINALIZE
+  if (unlikely((PY_VERSION_HEX >= 0x03080000 || __Pyx_PyType_HasFeature(Py_TYPE(o), Py_TPFLAGS_HAVE_FINALIZE)) && __Pyx_PyObject_GetSlot(o, tp_finalize, destructor)) && (!PyType_IS_GC(Py_TYPE(o)) || !__Pyx_PyObject_GC_IsFinalized(o))) {
+    if (__Pyx_PyObject_GetSlot(o, tp_dealloc, destructor) == __pyx_tp_dealloc_7fairseq_4data_22token_block_utils_fast_DatasetSearcher) { /* only when this function is the type's own tp_dealloc, i.e. not when reached from a subclass with a different dealloc */
+      if (PyObject_CallFinalizerFromDealloc(o)) return; /* non-zero result: stop here and leave the object alive */
+    }
+  }
+  #endif
+  __PYX_XCLEAR_MEMVIEW(&p->sizes, 1);
+  p->sizes.memview = NULL; p->sizes.data = NULL;
+  #if CYTHON_USE_TYPE_SLOTS || CYTHON_COMPILING_IN_PYPY
+  (*Py_TYPE(o)->tp_free)(o);
+  #else
+  {
+    freefunc tp_free = (freefunc)PyType_GetSlot(Py_TYPE(o), Py_tp_free);
+    if (tp_free) tp_free(o);
+  }
+  #endif
+}
+
+static PyMethodDef __pyx_methods_7fairseq_4data_22token_block_utils_fast_DatasetSearcher[] = { /* method table for DatasetSearcher: only the two pickling hooks are listed here */
+  {"__reduce_cython__", (PyCFunction)(void*)(__Pyx_PyCFunction_FastCallWithKeywords)__pyx_pw_7fairseq_4data_22token_block_utils_fast_15DatasetSearcher_3__reduce_cython__, __Pyx_METH_FASTCALL|METH_KEYWORDS, 0},
+  {"__setstate_cython__", (PyCFunction)(void*)(__Pyx_PyCFunction_FastCallWithKeywords)__pyx_pw_7fairseq_4data_22token_block_utils_fast_15DatasetSearcher_5__setstate_cython__, __Pyx_METH_FASTCALL|METH_KEYWORDS, 0},
+  {0, 0, 0, 0} /* all-zero terminating entry */
+};
+#if CYTHON_USE_TYPE_SPECS
+static PyType_Slot __pyx_type_7fairseq_4data_22token_block_utils_fast_DatasetSearcher_slots[] = { /* slot list for the PyType_Spec variant of DatasetSearcher (CYTHON_USE_TYPE_SPECS builds) */
+  {Py_tp_dealloc, (void *)__pyx_tp_dealloc_7fairseq_4data_22token_block_utils_fast_DatasetSearcher},
+  {Py_tp_doc, (void *)PyDoc_STR("Helper for mapping \"flat\" indices to indices and offsets in an\n    underlying dataset.")},
+  {Py_tp_methods, (void *)__pyx_methods_7fairseq_4data_22token_block_utils_fast_DatasetSearcher},
+  {Py_tp_init, (void *)__pyx_pw_7fairseq_4data_22token_block_utils_fast_15DatasetSearcher_1__init__},
+  {Py_tp_new, (void *)__pyx_tp_new_7fairseq_4data_22token_block_utils_fast_DatasetSearcher},
+  {0, 0}, /* terminating entry */
+};
+static PyType_Spec __pyx_type_7fairseq_4data_22token_block_utils_fast_DatasetSearcher_spec = { /* type spec for DatasetSearcher; fields are positional */
+  "fairseq.data.token_block_utils_fast.DatasetSearcher", /* fully qualified type name */
+  sizeof(struct __pyx_obj_7fairseq_4data_22token_block_utils_fast_DatasetSearcher), /* basicsize */
+  0, /* itemsize */
+  Py_TPFLAGS_DEFAULT|Py_TPFLAGS_HAVE_VERSION_TAG|Py_TPFLAGS_CHECKTYPES|Py_TPFLAGS_HAVE_NEWBUFFER|Py_TPFLAGS_BASETYPE, /* flags: BASETYPE allows subclassing */
+  __pyx_type_7fairseq_4data_22token_block_utils_fast_DatasetSearcher_slots,
+};
+#else
+
+static PyTypeObject __pyx_type_7fairseq_4data_22token_block_utils_fast_DatasetSearcher = { /* statically initialised type object for DatasetSearcher (builds without CYTHON_USE_TYPE_SPECS); entries are positional, so their order must follow the PyTypeObject layout */
+  PyVarObject_HEAD_INIT(0, 0)
+  "fairseq.data.token_block_utils_fast.""DatasetSearcher", /*tp_name*/
+  sizeof(struct __pyx_obj_7fairseq_4data_22token_block_utils_fast_DatasetSearcher), /*tp_basicsize*/
+  0, /*tp_itemsize*/
+  __pyx_tp_dealloc_7fairseq_4data_22token_block_utils_fast_DatasetSearcher, /*tp_dealloc*/
+  #if PY_VERSION_HEX < 0x030800b4
+  0, /*tp_print*/
+  #endif
+  #if PY_VERSION_HEX >= 0x030800b4
+  0, /*tp_vectorcall_offset*/
+  #endif
+  0, /*tp_getattr*/
+  0, /*tp_setattr*/
+  #if PY_MAJOR_VERSION < 3
+  0, /*tp_compare*/
+  #endif
+  #if PY_MAJOR_VERSION >= 3
+  0, /*tp_as_async*/
+  #endif
+  0, /*tp_repr*/
+  0, /*tp_as_number*/
+  0, /*tp_as_sequence*/
+  0, /*tp_as_mapping*/
+  0, /*tp_hash*/
+  0, /*tp_call*/
+  0, /*tp_str*/
+  0, /*tp_getattro*/
+  0, /*tp_setattro*/
+  0, /*tp_as_buffer*/
+  Py_TPFLAGS_DEFAULT|Py_TPFLAGS_HAVE_VERSION_TAG|Py_TPFLAGS_CHECKTYPES|Py_TPFLAGS_HAVE_NEWBUFFER|Py_TPFLAGS_BASETYPE, /*tp_flags*/
+  PyDoc_STR("Helper for mapping \"flat\" indices to indices and offsets in an\n    underlying dataset."), /*tp_doc*/
+  0, /*tp_traverse*/
+  0, /*tp_clear*/
+  0, /*tp_richcompare*/
+  0, /*tp_weaklistoffset*/
+  0, /*tp_iter*/
+  0, /*tp_iternext*/
+  __pyx_methods_7fairseq_4data_22token_block_utils_fast_DatasetSearcher, /*tp_methods*/
+  0, /*tp_members*/
+  0, /*tp_getset*/
+  0, /*tp_base*/
+  0, /*tp_dict*/
+  0, /*tp_descr_get*/
+  0, /*tp_descr_set*/
+  #if !CYTHON_USE_TYPE_SPECS
+  0, /*tp_dictoffset*/
+  #endif
+  __pyx_pw_7fairseq_4data_22token_block_utils_fast_15DatasetSearcher_1__init__, /*tp_init*/
+  0, /*tp_alloc*/
+  __pyx_tp_new_7fairseq_4data_22token_block_utils_fast_DatasetSearcher, /*tp_new*/
+  0, /*tp_free*/
+  0, /*tp_is_gc*/
+  0, /*tp_bases*/
+  0, /*tp_mro*/
+  0, /*tp_cache*/
+  0, /*tp_subclasses*/
+  0, /*tp_weaklist*/
+  0, /*tp_del*/
+  0, /*tp_version_tag*/
+  #if PY_VERSION_HEX >= 0x030400a1
+  #if CYTHON_USE_TP_FINALIZE
+  0, /*tp_finalize*/
+  #else
+  NULL, /*tp_finalize*/
+  #endif
+  #endif
+  #if PY_VERSION_HEX >= 0x030800b1 && (!CYTHON_COMPILING_IN_PYPY || PYPY_VERSION_NUM >= 0x07030800)
+  0, /*tp_vectorcall*/
+  #endif
+  #if __PYX_NEED_TP_PRINT_SLOT == 1
+  0, /*tp_print*/
+  #endif
+  #if PY_VERSION_HEX >= 0x030C0000
+  0, /*tp_watched*/
+  #endif
+  #if PY_VERSION_HEX >= 0x030d00A4
+  0, /*tp_versions_used*/
+  #endif
+  #if CYTHON_COMPILING_IN_PYPY && PY_VERSION_HEX >= 0x03090000 && PY_VERSION_HEX < 0x030a0000
+  0, /*tp_pypy_flags*/
+  #endif
+};
+#endif
+static struct __pyx_vtabstruct_array __pyx_vtable_array; /* file-scope storage for the array type's vtable struct; presumably populated during module init (not visible in this section) */
+
+/* tp_new slot for the Cython memoryview `array` type.
+ *
+ * Allocates an instance of type `t`, installs the vtable pointer, sets the
+ * `mode` and `_format` object fields to None (each holding its own reference)
+ * and then runs __pyx_array___cinit__ with the constructor arguments `a`/`k`.
+ *
+ * Returns a new reference to the object, or NULL when allocation or
+ * __cinit__ fails (the partially built object is released in that case).
+ */
+static PyObject *__pyx_tp_new_array(PyTypeObject *t, PyObject *a, PyObject *k) {
+  struct __pyx_array_obj *p;
+  PyObject *o;
+  #if CYTHON_COMPILING_IN_LIMITED_API
+  allocfunc alloc_func = (allocfunc)PyType_GetSlot(t, Py_tp_alloc);
+  o = alloc_func(t, 0);
+  #else
+  if (likely(!__Pyx_PyType_HasFeature(t, Py_TPFLAGS_IS_ABSTRACT))) {
+    o = (*t->tp_alloc)(t, 0);
+  } else {
+    o = (PyObject *) PyBaseObject_Type.tp_new(t, __pyx_empty_tuple, 0);
+  }
+  #endif
+  /* Allocation can fail on either build path, so the NULL check must sit
+   * outside the conditional; previously the Limited-API path dereferenced
+   * `o` without checking it. */
+  if (unlikely(!o)) return 0;
+  p = ((struct __pyx_array_obj *)o);
+  p->__pyx_vtab = __pyx_vtabptr_array;
+  p->mode = ((PyObject*)Py_None); Py_INCREF(Py_None);
+  p->_format = ((PyObject*)Py_None); Py_INCREF(Py_None);
+  if (unlikely(__pyx_array___cinit__(o, a, k) < 0)) goto bad;
+  return o;
+  bad:
+  Py_DECREF(o); o = 0;
+  return NULL;
+}
+
+static void __pyx_tp_dealloc_array(PyObject *o) { /* tp_dealloc slot for the array type: optional finalizer, user-level __dealloc__, clear object fields, free */
+  struct __pyx_array_obj *p = (struct __pyx_array_obj *)o;
+  #if CYTHON_USE_TP_FINALIZE
+  if (unlikely((PY_VERSION_HEX >= 0x03080000 || __Pyx_PyType_HasFeature(Py_TYPE(o), Py_TPFLAGS_HAVE_FINALIZE)) && __Pyx_PyObject_GetSlot(o, tp_finalize, destructor)) && (!PyType_IS_GC(Py_TYPE(o)) || !__Pyx_PyObject_GC_IsFinalized(o))) {
+    if (__Pyx_PyObject_GetSlot(o, tp_dealloc, destructor) == __pyx_tp_dealloc_array) {
+      if (PyObject_CallFinalizerFromDealloc(o)) return; /* non-zero result: stop here and leave the object alive */
+    }
+  }
+  #endif
+  {
+    PyObject *etype, *eval, *etb;
+    PyErr_Fetch(&etype, &eval, &etb); /* stash any in-flight exception so __dealloc__ cannot disturb it */
+    __Pyx_SET_REFCNT(o, Py_REFCNT(o) + 1); /* temporarily raise the refcount while __dealloc__ runs; presumably to prevent re-entrant deallocation */
+    __pyx_array___dealloc__(o);
+    __Pyx_SET_REFCNT(o, Py_REFCNT(o) - 1);
+    PyErr_Restore(etype, eval, etb);
+  }
+  Py_CLEAR(p->mode);
+  Py_CLEAR(p->_format);
+  #if CYTHON_USE_TYPE_SLOTS || CYTHON_COMPILING_IN_PYPY
+  (*Py_TYPE(o)->tp_free)(o);
+  #else
+  {
+    freefunc tp_free = (freefunc)PyType_GetSlot(Py_TYPE(o), Py_tp_free);
+    if (tp_free) tp_free(o);
+  }
+  #endif
+}
+/* sq_item slot for the array type: boxes the C index `i` into a Python
+ * integer and forwards to the type's mp_subscript slot. Returns whatever
+ * mp_subscript returns, or NULL (0) if the index object cannot be created. */
+static PyObject *__pyx_sq_item_array(PyObject *o, Py_ssize_t i) {
+  PyObject *item;
+  PyObject *index = PyInt_FromSsize_t(i);
+  if (!index) {
+    return 0;
+  }
+  item = Py_TYPE(o)->tp_as_mapping->mp_subscript(o, index);
+  Py_DECREF(index);
+  return item;
+}
+
+/* mp_ass_subscript slot for the array type.
+ * A non-NULL value `v` is an assignment and is delegated to __setitem__.
+ * A NULL value is a deletion, which is rejected with NotImplementedError
+ * and a return value of -1. */
+static int __pyx_mp_ass_subscript_array(PyObject *o, PyObject *i, PyObject *v) {
+  __Pyx_TypeName type_name;
+  if (v) {
+    return __pyx_array___setitem__(o, i, v);
+  }
+  /* Deletion request: report it using the object's type name. */
+  type_name = __Pyx_PyType_GetName(Py_TYPE(o));
+  PyErr_Format(PyExc_NotImplementedError,
+    "Subscript deletion not supported by " __Pyx_FMT_TYPENAME, type_name);
+  __Pyx_DECREF_TypeName(type_name);
+  return -1;
+}
+
+/* tp_getattro slot for the array type: generic attribute lookup first, and
+ * only when that fails with AttributeError, fall back to the type's
+ * __getattr__ implementation. Any other failure is propagated unchanged. */
+static PyObject *__pyx_tp_getattro_array(PyObject *o, PyObject *n) {
+  PyObject *attr = __Pyx_PyObject_GenericGetAttr(o, n);
+  if (attr) {
+    return attr;
+  }
+  if (!PyErr_ExceptionMatches(PyExc_AttributeError)) {
+    return attr;
+  }
+  PyErr_Clear();
+  return __pyx_array___getattr__(o, n);
+}
+
+/* Getter used by the "memview" getset entry: forwards to the property's
+ * __get__ wrapper; the closure pointer `x` is unused. */
+static PyObject *__pyx_getprop___pyx_array_memview(PyObject *o, CYTHON_UNUSED void *x) {
+  PyObject *memview = __pyx_pw_15View_dot_MemoryView_5array_7memview_1__get__(o);
+  return memview;
+}
+
+static PyMethodDef __pyx_methods_array[] = { /* method table for the array type: __getattr__ plus the two pickling hooks */
+  {"__getattr__", (PyCFunction)__pyx_array___getattr__, METH_O|METH_COEXIST, 0},
+  {"__reduce_cython__", (PyCFunction)(void*)(__Pyx_PyCFunction_FastCallWithKeywords)__pyx_pw___pyx_array_1__reduce_cython__, __Pyx_METH_FASTCALL|METH_KEYWORDS, 0},
+  {"__setstate_cython__", (PyCFunction)(void*)(__Pyx_PyCFunction_FastCallWithKeywords)__pyx_pw___pyx_array_3__setstate_cython__, __Pyx_METH_FASTCALL|METH_KEYWORDS, 0},
+  {0, 0, 0, 0} /* all-zero terminating entry */
+};
+
+static struct PyGetSetDef __pyx_getsets_array[] = { /* property table for the array type */
+  {(char *)"memview", __pyx_getprop___pyx_array_memview, 0, (char *)0, 0}, /* getter only: the setter, doc and closure fields are all 0 */
+  {0, 0, 0, 0, 0} /* all-zero terminating entry */
+};
+#if CYTHON_USE_TYPE_SPECS
+#if !CYTHON_COMPILING_IN_LIMITED_API
+
+static PyBufferProcs __pyx_tp_as_buffer_array = { /* buffer-protocol table for the array type (type-spec builds outside the Limited API); only bf_getbuffer is provided */
+  #if PY_MAJOR_VERSION < 3
+  0, /*bf_getreadbuffer*/
+  #endif
+  #if PY_MAJOR_VERSION < 3
+  0, /*bf_getwritebuffer*/
+  #endif
+  #if PY_MAJOR_VERSION < 3
+  0, /*bf_getsegcount*/
+  #endif
+  #if PY_MAJOR_VERSION < 3
+  0, /*bf_getcharbuffer*/
+  #endif
+  __pyx_array_getbuffer, /*bf_getbuffer*/
+  0, /*bf_releasebuffer*/
+};
+#endif
+static PyType_Slot __pyx_type___pyx_array_slots[] = { /* slot list for the PyType_Spec variant of the array type */
+  {Py_tp_dealloc, (void *)__pyx_tp_dealloc_array},
+  {Py_sq_length, (void *)__pyx_array___len__}, /* sequence and mapping length share one implementation */
+  {Py_sq_item, (void *)__pyx_sq_item_array},
+  {Py_mp_length, (void *)__pyx_array___len__},
+  {Py_mp_subscript, (void *)__pyx_array___getitem__},
+  {Py_mp_ass_subscript, (void *)__pyx_mp_ass_subscript_array},
+  {Py_tp_getattro, (void *)__pyx_tp_getattro_array},
+  #if defined(Py_bf_getbuffer)
+  {Py_bf_getbuffer, (void *)__pyx_array_getbuffer}, /* registered only when the headers define this slot id */
+  #endif
+  {Py_tp_methods, (void *)__pyx_methods_array},
+  {Py_tp_getset, (void *)__pyx_getsets_array},
+  {Py_tp_new, (void *)__pyx_tp_new_array},
+  {0, 0}, /* terminating entry */
+};
+static PyType_Spec __pyx_type___pyx_array_spec = { /* type spec for the array type; fields are positional */
+  "fairseq.data.token_block_utils_fast.array", /* fully qualified type name */
+  sizeof(struct __pyx_array_obj), /* basicsize */
+  0, /* itemsize */
+  Py_TPFLAGS_DEFAULT|Py_TPFLAGS_HAVE_VERSION_TAG|Py_TPFLAGS_CHECKTYPES|Py_TPFLAGS_HAVE_NEWBUFFER|Py_TPFLAGS_BASETYPE|Py_TPFLAGS_SEQUENCE, /* flags: same set as DatasetSearcher plus Py_TPFLAGS_SEQUENCE */
+  __pyx_type___pyx_array_slots,
+};
+#else
+
+static PySequenceMethods __pyx_tp_as_sequence_array = { /* sequence protocol for the array type: only length and item access are implemented */
+  __pyx_array___len__, /*sq_length*/
+  0, /*sq_concat*/
+  0, /*sq_repeat*/
+  __pyx_sq_item_array, /*sq_item*/
+  0, /*sq_slice*/
+  0, /*sq_ass_item*/
+  0, /*sq_ass_slice*/
+  0, /*sq_contains*/
+  0, /*sq_inplace_concat*/
+  0, /*sq_inplace_repeat*/
+};
+
+static PyMappingMethods __pyx_tp_as_mapping_array = { /* mapping protocol for the array type */
+  __pyx_array___len__, /*mp_length*/
+  __pyx_array___getitem__, /*mp_subscript*/
+  __pyx_mp_ass_subscript_array, /*mp_ass_subscript*/
+};
+
+static PyBufferProcs __pyx_tp_as_buffer_array = { /* buffer-protocol table for the array type (static-type builds); only bf_getbuffer is provided */
+  #if PY_MAJOR_VERSION < 3
+  0, /*bf_getreadbuffer*/
+  #endif
+  #if PY_MAJOR_VERSION < 3
+  0, /*bf_getwritebuffer*/
+  #endif
+  #if PY_MAJOR_VERSION < 3
+  0, /*bf_getsegcount*/
+  #endif
+  #if PY_MAJOR_VERSION < 3
+  0, /*bf_getcharbuffer*/
+  #endif
+  __pyx_array_getbuffer, /*bf_getbuffer*/
+  0, /*bf_releasebuffer*/
+};
+
+static PyTypeObject __pyx_type___pyx_array = { /* statically initialised type object for the array type (builds without CYTHON_USE_TYPE_SPECS); entries are positional, so their order must follow the PyTypeObject layout */
+  PyVarObject_HEAD_INIT(0, 0)
+  "fairseq.data.token_block_utils_fast.""array", /*tp_name*/
+  sizeof(struct __pyx_array_obj), /*tp_basicsize*/
+  0, /*tp_itemsize*/
+  __pyx_tp_dealloc_array, /*tp_dealloc*/
+  #if PY_VERSION_HEX < 0x030800b4
+  0, /*tp_print*/
+  #endif
+  #if PY_VERSION_HEX >= 0x030800b4
+  0, /*tp_vectorcall_offset*/
+  #endif
+  0, /*tp_getattr*/
+  0, /*tp_setattr*/
+  #if PY_MAJOR_VERSION < 3
+  0, /*tp_compare*/
+  #endif
+  #if PY_MAJOR_VERSION >= 3
+  0, /*tp_as_async*/
+  #endif
+  0, /*tp_repr*/
+  0, /*tp_as_number*/
+  &__pyx_tp_as_sequence_array, /*tp_as_sequence*/
+  &__pyx_tp_as_mapping_array, /*tp_as_mapping*/
+  0, /*tp_hash*/
+  0, /*tp_call*/
+  0, /*tp_str*/
+  __pyx_tp_getattro_array, /*tp_getattro*/
+  0, /*tp_setattro*/
+  &__pyx_tp_as_buffer_array, /*tp_as_buffer*/
+  Py_TPFLAGS_DEFAULT|Py_TPFLAGS_HAVE_VERSION_TAG|Py_TPFLAGS_CHECKTYPES|Py_TPFLAGS_HAVE_NEWBUFFER|Py_TPFLAGS_BASETYPE|Py_TPFLAGS_SEQUENCE, /*tp_flags*/
+  0, /*tp_doc*/
+  0, /*tp_traverse*/
+  0, /*tp_clear*/
+  0, /*tp_richcompare*/
+  0, /*tp_weaklistoffset*/
+  0, /*tp_iter*/
+  0, /*tp_iternext*/
+  __pyx_methods_array, /*tp_methods*/
+  0, /*tp_members*/
+  __pyx_getsets_array, /*tp_getset*/
+  0, /*tp_base*/
+  0, /*tp_dict*/
+  0, /*tp_descr_get*/
+  0, /*tp_descr_set*/
+  #if !CYTHON_USE_TYPE_SPECS
+  0, /*tp_dictoffset*/
+  #endif
+  0, /*tp_init*/
+  0, /*tp_alloc*/
+  __pyx_tp_new_array, /*tp_new*/
+  0, /*tp_free*/
+  0, /*tp_is_gc*/
+  0, /*tp_bases*/
+  0, /*tp_mro*/
+  0, /*tp_cache*/
+  0, /*tp_subclasses*/
+  0, /*tp_weaklist*/
+  0, /*tp_del*/
+  0, /*tp_version_tag*/
+  #if PY_VERSION_HEX >= 0x030400a1
+  #if CYTHON_USE_TP_FINALIZE
+  0, /*tp_finalize*/
+  #else
+  NULL, /*tp_finalize*/
+  #endif
+  #endif
+  #if PY_VERSION_HEX >= 0x030800b1 && (!CYTHON_COMPILING_IN_PYPY || PYPY_VERSION_NUM >= 0x07030800)
+  0, /*tp_vectorcall*/
+  #endif
+  #if __PYX_NEED_TP_PRINT_SLOT == 1
+  0, /*tp_print*/
+  #endif
+  #if PY_VERSION_HEX >= 0x030C0000
+  0, /*tp_watched*/
+  #endif
+  #if PY_VERSION_HEX >= 0x030d00A4
+  0, /*tp_versions_used*/
+  #endif
+  #if CYTHON_COMPILING_IN_PYPY && PY_VERSION_HEX >= 0x03090000 && PY_VERSION_HEX < 0x030a0000
+  0, /*tp_pypy_flags*/
+  #endif
+};
+#endif
+
+/* tp_new slot for the memoryview Enum helper type.
+ *
+ * Allocates an instance of type `t` and initialises its `name` field to None
+ * (holding its own reference). The positional (`a`) and keyword (`k`)
+ * arguments are ignored here.
+ *
+ * Returns a new reference to the object, or NULL (0) when allocation fails.
+ */
+static PyObject *__pyx_tp_new_Enum(PyTypeObject *t, CYTHON_UNUSED PyObject *a, CYTHON_UNUSED PyObject *k) {
+  struct __pyx_MemviewEnum_obj *p;
+  PyObject *o;
+  #if CYTHON_COMPILING_IN_LIMITED_API
+  allocfunc alloc_func = (allocfunc)PyType_GetSlot(t, Py_tp_alloc);
+  o = alloc_func(t, 0);
+  #else
+  if (likely(!__Pyx_PyType_HasFeature(t, Py_TPFLAGS_IS_ABSTRACT))) {
+    o = (*t->tp_alloc)(t, 0);
+  } else {
+    o = (PyObject *) PyBaseObject_Type.tp_new(t, __pyx_empty_tuple, 0);
+  }
+  #endif
+  /* Allocation can fail on either build path, so the NULL check must sit
+   * outside the conditional; previously the Limited-API path dereferenced
+   * `o` without checking it. */
+  if (unlikely(!o)) return 0;
+  p = ((struct __pyx_MemviewEnum_obj *)o);
+  p->name = Py_None; Py_INCREF(Py_None);
+  return o;
+}
+
+static void __pyx_tp_dealloc_Enum(PyObject *o) { /* tp_dealloc slot for the Enum helper type: optional finalizer, GC untrack, drop the name reference, free */
+  struct __pyx_MemviewEnum_obj *p = (struct __pyx_MemviewEnum_obj *)o;
+  #if CYTHON_USE_TP_FINALIZE
+  if (unlikely((PY_VERSION_HEX >= 0x03080000 || __Pyx_PyType_HasFeature(Py_TYPE(o), Py_TPFLAGS_HAVE_FINALIZE)) && __Pyx_PyObject_GetSlot(o, tp_finalize, destructor)) && !__Pyx_PyObject_GC_IsFinalized(o)) {
+    if (__Pyx_PyObject_GetSlot(o, tp_dealloc, destructor) == __pyx_tp_dealloc_Enum) {
+      if (PyObject_CallFinalizerFromDealloc(o)) return; /* non-zero result: stop here and leave the object alive */
+    }
+  }
+  #endif
+  PyObject_GC_UnTrack(o); /* untrack before the name field is cleared below */
+  Py_CLEAR(p->name);
+  #if CYTHON_USE_TYPE_SLOTS || CYTHON_COMPILING_IN_PYPY
+  (*Py_TYPE(o)->tp_free)(o);
+  #else
+  {
+    freefunc tp_free = (freefunc)PyType_GetSlot(Py_TYPE(o), Py_tp_free);
+    if (tp_free) tp_free(o);
+  }
+  #endif
+}
+
+static int __pyx_tp_traverse_Enum(PyObject *o, visitproc v, void *a) {
+  /* tp_traverse of the Enum type: hands the single owned reference, `name`,
+   * to the GC visitor `v` (with its argument `a`) and returns the visitor's
+   * result; returns 0 when `name` is NULL. */
+  struct __pyx_MemviewEnum_obj *self = (struct __pyx_MemviewEnum_obj *)o;
+  if (!self->name) return 0;
+  return (*v)(self->name, a);
+}
+
+static int __pyx_tp_clear_Enum(PyObject *o) {
+  /* tp_clear of the Enum type: replaces `name` with None and only afterwards
+   * drops the reference to the previous value, so the field never points at a
+   * released object. Always returns 0. */
+  struct __pyx_MemviewEnum_obj *self = (struct __pyx_MemviewEnum_obj *)o;
+  PyObject *previous = (PyObject *)self->name;
+  Py_INCREF(Py_None);
+  self->name = Py_None;
+  Py_XDECREF(previous);
+  return 0;
+}
+
+static PyObject *__pyx_specialmethod___pyx_MemviewEnum___repr__(PyObject *self, CYTHON_UNUSED PyObject *arg) {
+  /* Two-argument adapter that lets __pyx_MemviewEnum___repr__ be listed in a
+   * PyMethodDef table; the second argument is ignored. */
+  PyObject *result = __pyx_MemviewEnum___repr__(self);
+  return result;
+}
+
+static PyMethodDef __pyx_methods_Enum[] = { /* Method table of the Enum type; each entry is {name, C function, calling-convention flags, docstring (0 = none)} */
+  {"__repr__", (PyCFunction)__pyx_specialmethod___pyx_MemviewEnum___repr__, METH_NOARGS|METH_COEXIST, 0}, /* exposed as a regular method in addition to the tp_repr slot */
+  {"__reduce_cython__", (PyCFunction)(void*)(__Pyx_PyCFunction_FastCallWithKeywords)__pyx_pw___pyx_MemviewEnum_1__reduce_cython__, __Pyx_METH_FASTCALL|METH_KEYWORDS, 0},
+  {"__setstate_cython__", (PyCFunction)(void*)(__Pyx_PyCFunction_FastCallWithKeywords)__pyx_pw___pyx_MemviewEnum_3__setstate_cython__, __Pyx_METH_FASTCALL|METH_KEYWORDS, 0},
+  {0, 0, 0, 0} /* all-zero sentinel terminating the table */
+};
+#if CYTHON_USE_TYPE_SPECS
+static PyType_Slot __pyx_type___pyx_MemviewEnum_slots[] = { /* {slot id, implementation} pairs for the Enum type; referenced by __pyx_type___pyx_MemviewEnum_spec and compiled only when CYTHON_USE_TYPE_SPECS is true */
+  {Py_tp_dealloc, (void *)__pyx_tp_dealloc_Enum},
+  {Py_tp_repr, (void *)__pyx_MemviewEnum___repr__},
+  {Py_tp_traverse, (void *)__pyx_tp_traverse_Enum},
+  {Py_tp_clear, (void *)__pyx_tp_clear_Enum},
+  {Py_tp_methods, (void *)__pyx_methods_Enum},
+  {Py_tp_init, (void *)__pyx_MemviewEnum___init__},
+  {Py_tp_new, (void *)__pyx_tp_new_Enum},
+  {0, 0}, /* sentinel terminating the slot list */
+};
+static PyType_Spec __pyx_type___pyx_MemviewEnum_spec = { /* Type specification of the Enum type (type-spec build variant); PyType_Spec fields are positional */
+  "fairseq.data.token_block_utils_fast.Enum", /* name */
+  sizeof(struct __pyx_MemviewEnum_obj), /* basicsize */
+  0, /* itemsize */
+  Py_TPFLAGS_DEFAULT|Py_TPFLAGS_HAVE_VERSION_TAG|Py_TPFLAGS_CHECKTYPES|Py_TPFLAGS_HAVE_NEWBUFFER|Py_TPFLAGS_BASETYPE|Py_TPFLAGS_HAVE_GC, /* flags */
+  __pyx_type___pyx_MemviewEnum_slots, /* slots */
+};
+#else
+
+static PyTypeObject __pyx_type___pyx_MemviewEnum = { /* Statically initialised type object of the Enum type; compiled only when CYTHON_USE_TYPE_SPECS is false. Fields are positional, so the order and the version guards below must mirror the PyTypeObject layout */
+  PyVarObject_HEAD_INIT(0, 0)
+  "fairseq.data.token_block_utils_fast.""Enum", /*tp_name*/
+  sizeof(struct __pyx_MemviewEnum_obj), /*tp_basicsize*/
+  0, /*tp_itemsize*/
+  __pyx_tp_dealloc_Enum, /*tp_dealloc*/
+  #if PY_VERSION_HEX < 0x030800b4
+  0, /*tp_print*/
+  #endif
+  #if PY_VERSION_HEX >= 0x030800b4
+  0, /*tp_vectorcall_offset*/
+  #endif
+  0, /*tp_getattr*/
+  0, /*tp_setattr*/
+  #if PY_MAJOR_VERSION < 3
+  0, /*tp_compare*/
+  #endif
+  #if PY_MAJOR_VERSION >= 3
+  0, /*tp_as_async*/
+  #endif
+  __pyx_MemviewEnum___repr__, /*tp_repr*/
+  0, /*tp_as_number*/
+  0, /*tp_as_sequence*/
+  0, /*tp_as_mapping*/
+  0, /*tp_hash*/
+  0, /*tp_call*/
+  0, /*tp_str*/
+  0, /*tp_getattro*/
+  0, /*tp_setattro*/
+  0, /*tp_as_buffer*/
+  Py_TPFLAGS_DEFAULT|Py_TPFLAGS_HAVE_VERSION_TAG|Py_TPFLAGS_CHECKTYPES|Py_TPFLAGS_HAVE_NEWBUFFER|Py_TPFLAGS_BASETYPE|Py_TPFLAGS_HAVE_GC, /*tp_flags*/
+  0, /*tp_doc*/
+  __pyx_tp_traverse_Enum, /*tp_traverse*/
+  __pyx_tp_clear_Enum, /*tp_clear*/
+  0, /*tp_richcompare*/
+  0, /*tp_weaklistoffset*/
+  0, /*tp_iter*/
+  0, /*tp_iternext*/
+  __pyx_methods_Enum, /*tp_methods*/
+  0, /*tp_members*/
+  0, /*tp_getset*/
+  0, /*tp_base*/
+  0, /*tp_dict*/
+  0, /*tp_descr_get*/
+  0, /*tp_descr_set*/
+  #if !CYTHON_USE_TYPE_SPECS /* always true here: this initializer sits in the !CYTHON_USE_TYPE_SPECS branch */
+  0, /*tp_dictoffset*/
+  #endif
+  __pyx_MemviewEnum___init__, /*tp_init*/
+  0, /*tp_alloc*/
+  __pyx_tp_new_Enum, /*tp_new*/
+  0, /*tp_free*/
+  0, /*tp_is_gc*/
+  0, /*tp_bases*/
+  0, /*tp_mro*/
+  0, /*tp_cache*/
+  0, /*tp_subclasses*/
+  0, /*tp_weaklist*/
+  0, /*tp_del*/
+  0, /*tp_version_tag*/
+  #if PY_VERSION_HEX >= 0x030400a1
+  #if CYTHON_USE_TP_FINALIZE
+  0, /*tp_finalize*/
+  #else
+  NULL, /*tp_finalize*/
+  #endif
+  #endif
+  #if PY_VERSION_HEX >= 0x030800b1 && (!CYTHON_COMPILING_IN_PYPY || PYPY_VERSION_NUM >= 0x07030800)
+  0, /*tp_vectorcall*/
+  #endif
+  #if __PYX_NEED_TP_PRINT_SLOT == 1
+  0, /*tp_print*/
+  #endif
+  #if PY_VERSION_HEX >= 0x030C0000
+  0, /*tp_watched*/
+  #endif
+  #if PY_VERSION_HEX >= 0x030d00A4
+  0, /*tp_versions_used*/
+  #endif
+  #if CYTHON_COMPILING_IN_PYPY && PY_VERSION_HEX >= 0x03090000 && PY_VERSION_HEX < 0x030a0000
+  0, /*tp_pypy_flags*/
+  #endif
+};
+#endif
+static struct __pyx_vtabstruct_memoryview __pyx_vtable_memoryview; /* static storage for the memoryview vtable struct; only declared here. NOTE(review): presumably filled in and pointed to by __pyx_vtabptr_memoryview during module init - confirm */
+
+static PyObject *__pyx_tp_new_memoryview(PyTypeObject *t, PyObject *a, PyObject *k) {
+  /* tp_new slot of the Cython memoryview type.
+   * Allocates an instance of type `t`, installs the vtable pointer, sets the
+   * object fields (`obj`, `_size`, `_array_interface`) to None and `view.obj`
+   * to NULL, then runs __cinit__ with the positional args `a` and keyword
+   * args `k`. Returns a new reference, or NULL when allocation or __cinit__
+   * fails. */
+  struct __pyx_memoryview_obj *p;
+  PyObject *o;
+  #if CYTHON_COMPILING_IN_LIMITED_API
+  allocfunc alloc_func = (allocfunc)PyType_GetSlot(t, Py_tp_alloc);
+  o = alloc_func(t, 0);
+  #else
+  if (likely(!__Pyx_PyType_HasFeature(t, Py_TPFLAGS_IS_ABSTRACT))) {
+    o = (*t->tp_alloc)(t, 0);
+  } else {
+    /* Abstract types are routed through object.__new__ instead of tp_alloc. */
+    o = (PyObject *) PyBaseObject_Type.tp_new(t, __pyx_empty_tuple, 0);
+  }
+  #endif
+  /* The allocation result must be checked on BOTH preprocessor paths; the check
+   * used to live inside the #else branch only, so a failed allocation under the
+   * limited API fell through to the field initialisation below with o == NULL. */
+  if (unlikely(!o)) return 0;
+  p = ((struct __pyx_memoryview_obj *)o);
+  p->__pyx_vtab = __pyx_vtabptr_memoryview;
+  p->obj = Py_None; Py_INCREF(Py_None);
+  p->_size = Py_None; Py_INCREF(Py_None);
+  p->_array_interface = Py_None; Py_INCREF(Py_None);
+  /* Every field is valid before __cinit__ runs, so the Py_DECREF on the error
+   * path below can safely tear the half-constructed object down. */
+  p->view.obj = NULL;
+  if (unlikely(__pyx_memoryview___cinit__(o, a, k) < 0)) goto bad;
+  return o;
+  bad:
+  Py_DECREF(o); o = 0;
+  return NULL;
+}
+
+static void __pyx_tp_dealloc_memoryview(PyObject *o) { /* tp_dealloc of the memoryview type: optional finalizer, GC untrack, call of __dealloc__, release of the owned references, then tp_free */
+  struct __pyx_memoryview_obj *p = (struct __pyx_memoryview_obj *)o;
+  #if CYTHON_USE_TP_FINALIZE
+  if (unlikely((PY_VERSION_HEX >= 0x03080000 || __Pyx_PyType_HasFeature(Py_TYPE(o), Py_TPFLAGS_HAVE_FINALIZE)) && __Pyx_PyObject_GetSlot(o, tp_finalize, destructor)) && !__Pyx_PyObject_GC_IsFinalized(o)) { /* a tp_finalize slot is present and the object has not been finalized yet */
+    if (__Pyx_PyObject_GetSlot(o, tp_dealloc, destructor) == __pyx_tp_dealloc_memoryview) { /* only when the object's tp_dealloc slot is this very function, i.e. not when called from __pyx_tp_dealloc__memoryviewslice */
+      if (PyObject_CallFinalizerFromDealloc(o)) return; /* non-zero result: stop here and do not free the object (CPython documents this as "object was resurrected") */
+    }
+  }
+  #endif
+  PyObject_GC_UnTrack(o); /* stop GC tracking before the fields are torn down */
+  {
+    PyObject *etype, *eval, *etb;
+    PyErr_Fetch(&etype, &eval, &etb); /* set aside any pending exception while __dealloc__ runs */
+    __Pyx_SET_REFCNT(o, Py_REFCNT(o) + 1); /* temporary +1 around __dealloc__; presumably guards against re-entering deallocation if `o` is incref'd/decref'd inside - TODO confirm */
+    __pyx_memoryview___dealloc__(o);
+    __Pyx_SET_REFCNT(o, Py_REFCNT(o) - 1); /* undo the temporary +1 */
+    PyErr_Restore(etype, eval, etb); /* reinstate the exception that was set aside */
+  }
+  Py_CLEAR(p->obj);
+  Py_CLEAR(p->_size);
+  Py_CLEAR(p->_array_interface);
+  #if CYTHON_USE_TYPE_SLOTS || CYTHON_COMPILING_IN_PYPY
+  (*Py_TYPE(o)->tp_free)(o); /* direct struct access to the tp_free slot */
+  #else
+  {
+    freefunc tp_free = (freefunc)PyType_GetSlot(Py_TYPE(o), Py_tp_free); /* slot looked up through the API instead of the struct */
+    if (tp_free) tp_free(o);
+  }
+  #endif
+}
+
+static int __pyx_tp_traverse_memoryview(PyObject *o, visitproc v, void *a) {
+  /* tp_traverse of the memoryview type: offers each non-NULL owned reference
+   * (obj, _size, _array_interface, view.obj - in that order) to the GC visitor
+   * `v` and stops at the first non-zero visitor result, which is returned. */
+  struct __pyx_memoryview_obj *self = (struct __pyx_memoryview_obj *)o;
+  int rc = 0;
+  if (self->obj) rc = (*v)(self->obj, a);
+  if (rc == 0 && self->_size) rc = (*v)(self->_size, a);
+  if (rc == 0 && self->_array_interface) rc = (*v)(self->_array_interface, a);
+  if (rc == 0 && self->view.obj) rc = (*v)(self->view.obj, a);
+  return rc;
+}
+
+static int __pyx_tp_clear_memoryview(PyObject *o) {
+  /* tp_clear of the memoryview type: each object field is first pointed at
+   * None and only then is its previous value released; view.obj is cleared to
+   * NULL. Always returns 0. */
+  struct __pyx_memoryview_obj *self = (struct __pyx_memoryview_obj *)o;
+  PyObject *previous;
+
+  previous = (PyObject *)self->obj;
+  Py_INCREF(Py_None);
+  self->obj = Py_None;
+  Py_XDECREF(previous);
+
+  previous = (PyObject *)self->_size;
+  Py_INCREF(Py_None);
+  self->_size = Py_None;
+  Py_XDECREF(previous);
+
+  previous = (PyObject *)self->_array_interface;
+  Py_INCREF(Py_None);
+  self->_array_interface = Py_None;
+  Py_XDECREF(previous);
+
+  Py_CLEAR(self->view.obj);
+  return 0;
+}
+static PyObject *__pyx_sq_item_memoryview(PyObject *o, Py_ssize_t i) {
+  /* sq_item of the memoryview type: converts the C index `i` to a Python
+   * integer and forwards to the mp_subscript slot of the object's type.
+   * Returns whatever mp_subscript returns, or NULL if the index object could
+   * not be created. */
+  PyObject *result;
+  PyObject *index = PyInt_FromSsize_t(i);
+  if (!index) return 0;
+  result = Py_TYPE(o)->tp_as_mapping->mp_subscript(o, index);
+  Py_DECREF(index);
+  return result;
+}
+
+static int __pyx_mp_ass_subscript_memoryview(PyObject *o, PyObject *i, PyObject *v) {
+  /* mp_ass_subscript of the memoryview type. A non-NULL value `v` is an
+   * assignment and is forwarded to __setitem__; a NULL value is a deletion,
+   * which raises NotImplementedError and returns -1. */
+  __Pyx_TypeName type_name;
+  if (v) return __pyx_memoryview___setitem__(o, i, v);
+  type_name = __Pyx_PyType_GetName(Py_TYPE(o));
+  PyErr_Format(PyExc_NotImplementedError,
+    "Subscript deletion not supported by " __Pyx_FMT_TYPENAME, type_name);
+  __Pyx_DECREF_TypeName(type_name);
+  return -1;
+}
+
+static PyObject *__pyx_getprop___pyx_memoryview_T(PyObject *o, CYTHON_UNUSED void *x) {
+  /* Getter listed for the "T" attribute in __pyx_getsets_memoryview; the
+   * closure pointer `x` is ignored. */
+  PyObject *value = __pyx_pw_15View_dot_MemoryView_10memoryview_1T_1__get__(o);
+  return value;
+}
+
+static PyObject *__pyx_getprop___pyx_memoryview_base(PyObject *o, CYTHON_UNUSED void *x) {
+  /* Getter listed for the "base" attribute in __pyx_getsets_memoryview; the
+   * closure pointer `x` is ignored. */
+  PyObject *value = __pyx_pw_15View_dot_MemoryView_10memoryview_4base_1__get__(o);
+  return value;
+}
+
+static PyObject *__pyx_getprop___pyx_memoryview_shape(PyObject *o, CYTHON_UNUSED void *x) {
+  /* Getter listed for the "shape" attribute in __pyx_getsets_memoryview; the
+   * closure pointer `x` is ignored. */
+  PyObject *value = __pyx_pw_15View_dot_MemoryView_10memoryview_5shape_1__get__(o);
+  return value;
+}
+
+static PyObject *__pyx_getprop___pyx_memoryview_strides(PyObject *o, CYTHON_UNUSED void *x) {
+  /* Getter listed for the "strides" attribute in __pyx_getsets_memoryview;
+   * the closure pointer `x` is ignored. */
+  PyObject *value = __pyx_pw_15View_dot_MemoryView_10memoryview_7strides_1__get__(o);
+  return value;
+}
+
+static PyObject *__pyx_getprop___pyx_memoryview_suboffsets(PyObject *o, CYTHON_UNUSED void *x) {
+  /* Getter listed for the "suboffsets" attribute in __pyx_getsets_memoryview;
+   * the closure pointer `x` is ignored. */
+  PyObject *value = __pyx_pw_15View_dot_MemoryView_10memoryview_10suboffsets_1__get__(o);
+  return value;
+}
+
+static PyObject *__pyx_getprop___pyx_memoryview_ndim(PyObject *o, CYTHON_UNUSED void *x) {
+  /* Getter listed for the "ndim" attribute in __pyx_getsets_memoryview; the
+   * closure pointer `x` is ignored. */
+  PyObject *value = __pyx_pw_15View_dot_MemoryView_10memoryview_4ndim_1__get__(o);
+  return value;
+}
+
+static PyObject *__pyx_getprop___pyx_memoryview_itemsize(PyObject *o, CYTHON_UNUSED void *x) {
+  /* Getter listed for the "itemsize" attribute in __pyx_getsets_memoryview;
+   * the closure pointer `x` is ignored. */
+  PyObject *value = __pyx_pw_15View_dot_MemoryView_10memoryview_8itemsize_1__get__(o);
+  return value;
+}
+
+static PyObject *__pyx_getprop___pyx_memoryview_nbytes(PyObject *o, CYTHON_UNUSED void *x) {
+  /* Getter listed for the "nbytes" attribute in __pyx_getsets_memoryview; the
+   * closure pointer `x` is ignored. */
+  PyObject *value = __pyx_pw_15View_dot_MemoryView_10memoryview_6nbytes_1__get__(o);
+  return value;
+}
+
+static PyObject *__pyx_getprop___pyx_memoryview_size(PyObject *o, CYTHON_UNUSED void *x) {
+  /* Getter listed for the "size" attribute in __pyx_getsets_memoryview; the
+   * closure pointer `x` is ignored. */
+  PyObject *value = __pyx_pw_15View_dot_MemoryView_10memoryview_4size_1__get__(o);
+  return value;
+}
+
+static PyObject *__pyx_specialmethod___pyx_memoryview___repr__(PyObject *self, CYTHON_UNUSED PyObject *arg) {
+  /* Two-argument adapter that lets __pyx_memoryview___repr__ be listed in a
+   * PyMethodDef table; the second argument is ignored. */
+  PyObject *result = __pyx_memoryview___repr__(self);
+  return result;
+}
+
+static PyMethodDef __pyx_methods_memoryview[] = { /* Method table of the memoryview type; each entry is {name, C function, calling-convention flags, docstring (0 = none)} */
+  {"__repr__", (PyCFunction)__pyx_specialmethod___pyx_memoryview___repr__, METH_NOARGS|METH_COEXIST, 0}, /* exposed as a regular method in addition to the tp_repr slot */
+  {"is_c_contig", (PyCFunction)(void*)(__Pyx_PyCFunction_FastCallWithKeywords)__pyx_memoryview_is_c_contig, __Pyx_METH_FASTCALL|METH_KEYWORDS, 0},
+  {"is_f_contig", (PyCFunction)(void*)(__Pyx_PyCFunction_FastCallWithKeywords)__pyx_memoryview_is_f_contig, __Pyx_METH_FASTCALL|METH_KEYWORDS, 0},
+  {"copy", (PyCFunction)(void*)(__Pyx_PyCFunction_FastCallWithKeywords)__pyx_memoryview_copy, __Pyx_METH_FASTCALL|METH_KEYWORDS, 0},
+  {"copy_fortran", (PyCFunction)(void*)(__Pyx_PyCFunction_FastCallWithKeywords)__pyx_memoryview_copy_fortran, __Pyx_METH_FASTCALL|METH_KEYWORDS, 0},
+  {"__reduce_cython__", (PyCFunction)(void*)(__Pyx_PyCFunction_FastCallWithKeywords)__pyx_pw___pyx_memoryview_1__reduce_cython__, __Pyx_METH_FASTCALL|METH_KEYWORDS, 0},
+  {"__setstate_cython__", (PyCFunction)(void*)(__Pyx_PyCFunction_FastCallWithKeywords)__pyx_pw___pyx_memoryview_3__setstate_cython__, __Pyx_METH_FASTCALL|METH_KEYWORDS, 0},
+  {0, 0, 0, 0} /* all-zero sentinel terminating the table */
+};
+
+static struct PyGetSetDef __pyx_getsets_memoryview[] = { /* Attribute table of the memoryview type; each entry is {name, getter, setter, doc, closure}. Every setter is 0, so no attribute here has a setter */
+  {(char *)"T", __pyx_getprop___pyx_memoryview_T, 0, (char *)0, 0},
+  {(char *)"base", __pyx_getprop___pyx_memoryview_base, 0, (char *)0, 0},
+  {(char *)"shape", __pyx_getprop___pyx_memoryview_shape, 0, (char *)0, 0},
+  {(char *)"strides", __pyx_getprop___pyx_memoryview_strides, 0, (char *)0, 0},
+  {(char *)"suboffsets", __pyx_getprop___pyx_memoryview_suboffsets, 0, (char *)0, 0},
+  {(char *)"ndim", __pyx_getprop___pyx_memoryview_ndim, 0, (char *)0, 0},
+  {(char *)"itemsize", __pyx_getprop___pyx_memoryview_itemsize, 0, (char *)0, 0},
+  {(char *)"nbytes", __pyx_getprop___pyx_memoryview_nbytes, 0, (char *)0, 0},
+  {(char *)"size", __pyx_getprop___pyx_memoryview_size, 0, (char *)0, 0},
+  {0, 0, 0, 0, 0} /* all-zero sentinel terminating the table */
+};
+#if CYTHON_USE_TYPE_SPECS
+#if !CYTHON_COMPILING_IN_LIMITED_API
+
+static PyBufferProcs __pyx_tp_as_buffer_memoryview = { /* Buffer-protocol table for the type-spec build when not compiling against the limited API. NOTE(review): not referenced in the visible slot table; presumably attached to the created type elsewhere - confirm */
+  #if PY_MAJOR_VERSION < 3
+  0, /*bf_getreadbuffer*/
+  #endif
+  #if PY_MAJOR_VERSION < 3
+  0, /*bf_getwritebuffer*/
+  #endif
+  #if PY_MAJOR_VERSION < 3
+  0, /*bf_getsegcount*/
+  #endif
+  #if PY_MAJOR_VERSION < 3
+  0, /*bf_getcharbuffer*/
+  #endif
+  __pyx_memoryview_getbuffer, /*bf_getbuffer*/
+  0, /*bf_releasebuffer*/
+};
+#endif
+static PyType_Slot __pyx_type___pyx_memoryview_slots[] = { /* {slot id, implementation} pairs for the memoryview type; referenced by __pyx_type___pyx_memoryview_spec and compiled only when CYTHON_USE_TYPE_SPECS is true */
+  {Py_tp_dealloc, (void *)__pyx_tp_dealloc_memoryview},
+  {Py_tp_repr, (void *)__pyx_memoryview___repr__},
+  {Py_sq_length, (void *)__pyx_memoryview___len__}, /* the same __len__ serves the sequence and the mapping protocol */
+  {Py_sq_item, (void *)__pyx_sq_item_memoryview},
+  {Py_mp_length, (void *)__pyx_memoryview___len__},
+  {Py_mp_subscript, (void *)__pyx_memoryview___getitem__},
+  {Py_mp_ass_subscript, (void *)__pyx_mp_ass_subscript_memoryview},
+  {Py_tp_str, (void *)__pyx_memoryview___str__},
+  #if defined(Py_bf_getbuffer) /* the buffer slot is listed only when the headers define this slot id */
+  {Py_bf_getbuffer, (void *)__pyx_memoryview_getbuffer},
+  #endif
+  {Py_tp_traverse, (void *)__pyx_tp_traverse_memoryview},
+  {Py_tp_clear, (void *)__pyx_tp_clear_memoryview},
+  {Py_tp_methods, (void *)__pyx_methods_memoryview},
+  {Py_tp_getset, (void *)__pyx_getsets_memoryview},
+  {Py_tp_new, (void *)__pyx_tp_new_memoryview},
+  {0, 0}, /* sentinel terminating the slot list */
+};
+static PyType_Spec __pyx_type___pyx_memoryview_spec = { /* Type specification of the memoryview type (type-spec build variant); PyType_Spec fields are positional */
+  "fairseq.data.token_block_utils_fast.memoryview", /* name */
+  sizeof(struct __pyx_memoryview_obj), /* basicsize */
+  0, /* itemsize */
+  Py_TPFLAGS_DEFAULT|Py_TPFLAGS_HAVE_VERSION_TAG|Py_TPFLAGS_CHECKTYPES|Py_TPFLAGS_HAVE_NEWBUFFER|Py_TPFLAGS_BASETYPE|Py_TPFLAGS_HAVE_GC, /* flags */
+  __pyx_type___pyx_memoryview_slots, /* slots */
+};
+#else
+
+static PySequenceMethods __pyx_tp_as_sequence_memoryview = { /* Sequence-protocol table of the memoryview type (static-type build variant); only length and item access are provided */
+  __pyx_memoryview___len__, /*sq_length*/
+  0, /*sq_concat*/
+  0, /*sq_repeat*/
+  __pyx_sq_item_memoryview, /*sq_item*/
+  0, /*sq_slice*/
+  0, /*sq_ass_item*/
+  0, /*sq_ass_slice*/
+  0, /*sq_contains*/
+  0, /*sq_inplace_concat*/
+  0, /*sq_inplace_repeat*/
+};
+
+static PyMappingMethods __pyx_tp_as_mapping_memoryview = { /* Mapping-protocol table of the memoryview type (static-type build variant) */
+  __pyx_memoryview___len__, /*mp_length*/
+  __pyx_memoryview___getitem__, /*mp_subscript*/
+  __pyx_mp_ass_subscript_memoryview, /*mp_ass_subscript*/
+};
+
+static PyBufferProcs __pyx_tp_as_buffer_memoryview = { /* Buffer-protocol table of the memoryview type (static-type build variant); wired into tp_as_buffer of __pyx_type___pyx_memoryview */
+  #if PY_MAJOR_VERSION < 3
+  0, /*bf_getreadbuffer*/
+  #endif
+  #if PY_MAJOR_VERSION < 3
+  0, /*bf_getwritebuffer*/
+  #endif
+  #if PY_MAJOR_VERSION < 3
+  0, /*bf_getsegcount*/
+  #endif
+  #if PY_MAJOR_VERSION < 3
+  0, /*bf_getcharbuffer*/
+  #endif
+  __pyx_memoryview_getbuffer, /*bf_getbuffer*/
+  0, /*bf_releasebuffer*/
+};
+
+static PyTypeObject __pyx_type___pyx_memoryview = { /* Statically initialised type object of the memoryview type; compiled only when CYTHON_USE_TYPE_SPECS is false. Fields are positional, so the order and the version guards below must mirror the PyTypeObject layout */
+  PyVarObject_HEAD_INIT(0, 0)
+  "fairseq.data.token_block_utils_fast.""memoryview", /*tp_name*/
+  sizeof(struct __pyx_memoryview_obj), /*tp_basicsize*/
+  0, /*tp_itemsize*/
+  __pyx_tp_dealloc_memoryview, /*tp_dealloc*/
+  #if PY_VERSION_HEX < 0x030800b4
+  0, /*tp_print*/
+  #endif
+  #if PY_VERSION_HEX >= 0x030800b4
+  0, /*tp_vectorcall_offset*/
+  #endif
+  0, /*tp_getattr*/
+  0, /*tp_setattr*/
+  #if PY_MAJOR_VERSION < 3
+  0, /*tp_compare*/
+  #endif
+  #if PY_MAJOR_VERSION >= 3
+  0, /*tp_as_async*/
+  #endif
+  __pyx_memoryview___repr__, /*tp_repr*/
+  0, /*tp_as_number*/
+  &__pyx_tp_as_sequence_memoryview, /*tp_as_sequence*/
+  &__pyx_tp_as_mapping_memoryview, /*tp_as_mapping*/
+  0, /*tp_hash*/
+  0, /*tp_call*/
+  __pyx_memoryview___str__, /*tp_str*/
+  0, /*tp_getattro*/
+  0, /*tp_setattro*/
+  &__pyx_tp_as_buffer_memoryview, /*tp_as_buffer*/
+  Py_TPFLAGS_DEFAULT|Py_TPFLAGS_HAVE_VERSION_TAG|Py_TPFLAGS_CHECKTYPES|Py_TPFLAGS_HAVE_NEWBUFFER|Py_TPFLAGS_BASETYPE|Py_TPFLAGS_HAVE_GC, /*tp_flags*/
+  0, /*tp_doc*/
+  __pyx_tp_traverse_memoryview, /*tp_traverse*/
+  __pyx_tp_clear_memoryview, /*tp_clear*/
+  0, /*tp_richcompare*/
+  0, /*tp_weaklistoffset*/
+  0, /*tp_iter*/
+  0, /*tp_iternext*/
+  __pyx_methods_memoryview, /*tp_methods*/
+  0, /*tp_members*/
+  __pyx_getsets_memoryview, /*tp_getset*/
+  0, /*tp_base*/
+  0, /*tp_dict*/
+  0, /*tp_descr_get*/
+  0, /*tp_descr_set*/
+  #if !CYTHON_USE_TYPE_SPECS /* always true here: this initializer sits in the !CYTHON_USE_TYPE_SPECS branch */
+  0, /*tp_dictoffset*/
+  #endif
+  0, /*tp_init*/
+  0, /*tp_alloc*/
+  __pyx_tp_new_memoryview, /*tp_new*/
+  0, /*tp_free*/
+  0, /*tp_is_gc*/
+  0, /*tp_bases*/
+  0, /*tp_mro*/
+  0, /*tp_cache*/
+  0, /*tp_subclasses*/
+  0, /*tp_weaklist*/
+  0, /*tp_del*/
+  0, /*tp_version_tag*/
+  #if PY_VERSION_HEX >= 0x030400a1
+  #if CYTHON_USE_TP_FINALIZE
+  0, /*tp_finalize*/
+  #else
+  NULL, /*tp_finalize*/
+  #endif
+  #endif
+  #if PY_VERSION_HEX >= 0x030800b1 && (!CYTHON_COMPILING_IN_PYPY || PYPY_VERSION_NUM >= 0x07030800)
+  0, /*tp_vectorcall*/
+  #endif
+  #if __PYX_NEED_TP_PRINT_SLOT == 1
+  0, /*tp_print*/
+  #endif
+  #if PY_VERSION_HEX >= 0x030C0000
+  0, /*tp_watched*/
+  #endif
+  #if PY_VERSION_HEX >= 0x030d00A4
+  0, /*tp_versions_used*/
+  #endif
+  #if CYTHON_COMPILING_IN_PYPY && PY_VERSION_HEX >= 0x03090000 && PY_VERSION_HEX < 0x030a0000
+  0, /*tp_pypy_flags*/
+  #endif
+};
+#endif
+static struct __pyx_vtabstruct__memoryviewslice __pyx_vtable__memoryviewslice; /* static storage for the _memoryviewslice vtable struct; only declared here. NOTE(review): presumably filled in and pointed to by __pyx_vtabptr__memoryviewslice during module init - confirm */
+
+static PyObject *__pyx_tp_new__memoryviewslice(PyTypeObject *t, PyObject *a, PyObject *k) {
+  /* tp_new slot of the _memoryviewslice type. Construction is delegated to
+   * the base memoryview tp_new (which also receives `a` and `k`); afterwards
+   * the vtable pointer is replaced with the subclass vtable and the
+   * subclass-only fields are initialised. Returns a new reference, or NULL
+   * when the base constructor fails. */
+  struct __pyx_memoryviewslice_obj *slice;
+  PyObject *o = __pyx_tp_new_memoryview(t, a, k);
+  if (unlikely(!o)) return 0;
+  slice = (struct __pyx_memoryviewslice_obj *)o;
+  slice->__pyx_base.__pyx_vtab = (struct __pyx_vtabstruct_memoryview*)__pyx_vtabptr__memoryviewslice;
+  Py_INCREF(Py_None);
+  slice->from_object = Py_None;
+  /* from_slice is a C++ object living in raw allocator memory, so it is
+   * constructed in place before its memview pointer is reset. */
+  new((void*)&(slice->from_slice)) __Pyx_memviewslice();
+  slice->from_slice.memview = NULL;
+  return o;
+}
+
+static void __pyx_tp_dealloc__memoryviewslice(PyObject *o) { /* tp_dealloc of the _memoryviewslice type: optional finalizer, GC untrack, subclass __dealloc__, teardown of the subclass fields, then hand-off to the base memoryview dealloc */
+  struct __pyx_memoryviewslice_obj *p = (struct __pyx_memoryviewslice_obj *)o;
+  #if CYTHON_USE_TP_FINALIZE
+  if (unlikely((PY_VERSION_HEX >= 0x03080000 || __Pyx_PyType_HasFeature(Py_TYPE(o), Py_TPFLAGS_HAVE_FINALIZE)) && __Pyx_PyObject_GetSlot(o, tp_finalize, destructor)) && !__Pyx_PyObject_GC_IsFinalized(o)) { /* a tp_finalize slot is present and the object has not been finalized yet */
+    if (__Pyx_PyObject_GetSlot(o, tp_dealloc, destructor) == __pyx_tp_dealloc__memoryviewslice) { /* only when the object's tp_dealloc slot is this very function */
+      if (PyObject_CallFinalizerFromDealloc(o)) return; /* non-zero result: stop here and do not free the object (CPython documents this as "object was resurrected") */
+    }
+  }
+  #endif
+  PyObject_GC_UnTrack(o); /* stop GC tracking before the fields are torn down */
+  {
+    PyObject *etype, *eval, *etb;
+    PyErr_Fetch(&etype, &eval, &etb); /* set aside any pending exception while __dealloc__ runs */
+    __Pyx_SET_REFCNT(o, Py_REFCNT(o) + 1); /* temporary +1 around __dealloc__; presumably guards against re-entering deallocation if `o` is incref'd/decref'd inside - TODO confirm */
+    __pyx_memoryviewslice___dealloc__(o);
+    __Pyx_SET_REFCNT(o, Py_REFCNT(o) - 1); /* undo the temporary +1 */
+    PyErr_Restore(etype, eval, etb); /* reinstate the exception that was set aside */
+  }
+  __Pyx_call_destructor(p->from_slice); /* counterpart of the placement-new performed in __pyx_tp_new__memoryviewslice */
+  Py_CLEAR(p->from_object);
+  PyObject_GC_Track(o); /* re-track: the base dealloc called next starts with its own PyObject_GC_UnTrack */
+  __pyx_tp_dealloc_memoryview(o); /* base class releases its fields and frees the object */
+}
+
+static int __pyx_tp_traverse__memoryviewslice(PyObject *o, visitproc v, void *a) {
+  /* tp_traverse of the _memoryviewslice type: first visits everything the
+   * base memoryview owns, then the subclass field `from_object`. The first
+   * non-zero visitor result is returned immediately. */
+  struct __pyx_memoryviewslice_obj *self = (struct __pyx_memoryviewslice_obj *)o;
+  int rc = __pyx_tp_traverse_memoryview(o, v, a);
+  if (rc == 0 && self->from_object) rc = (*v)(self->from_object, a);
+  return rc;
+}
+
+static int __pyx_tp_clear__memoryviewslice(PyObject *o) {
+  /* tp_clear of the _memoryviewslice type: clears the base memoryview fields,
+   * swaps `from_object` for None before releasing its previous value, and
+   * finally releases the memoryview held by `from_slice`. Always returns 0. */
+  struct __pyx_memoryviewslice_obj *self = (struct __pyx_memoryviewslice_obj *)o;
+  PyObject *previous;
+  __pyx_tp_clear_memoryview(o);
+  previous = (PyObject *)self->from_object;
+  Py_INCREF(Py_None);
+  self->from_object = Py_None;
+  Py_XDECREF(previous);
+  __PYX_XCLEAR_MEMVIEW(&self->from_slice, 1);
+  return 0;
+}
+
+static PyMethodDef __pyx_methods__memoryviewslice[] = { /* Method table of the _memoryviewslice type; it lists only the two pickling-related helpers */
+  {"__reduce_cython__", (PyCFunction)(void*)(__Pyx_PyCFunction_FastCallWithKeywords)__pyx_pw___pyx_memoryviewslice_1__reduce_cython__, __Pyx_METH_FASTCALL|METH_KEYWORDS, 0},
+  {"__setstate_cython__", (PyCFunction)(void*)(__Pyx_PyCFunction_FastCallWithKeywords)__pyx_pw___pyx_memoryviewslice_3__setstate_cython__, __Pyx_METH_FASTCALL|METH_KEYWORDS, 0},
+  {0, 0, 0, 0} /* all-zero sentinel terminating the table */
+};
+#if CYTHON_USE_TYPE_SPECS
+static PyType_Slot __pyx_type___pyx_memoryviewslice_slots[] = { /* {slot id, implementation} pairs for the _memoryviewslice type; referenced by __pyx_type___pyx_memoryviewslice_spec and compiled only when CYTHON_USE_TYPE_SPECS is true */
+  {Py_tp_dealloc, (void *)__pyx_tp_dealloc__memoryviewslice},
+  {Py_tp_doc, (void *)PyDoc_STR("Internal class for passing memoryview slices to Python")},
+  {Py_tp_traverse, (void *)__pyx_tp_traverse__memoryviewslice},
+  {Py_tp_clear, (void *)__pyx_tp_clear__memoryviewslice},
+  {Py_tp_methods, (void *)__pyx_methods__memoryviewslice},
+  {Py_tp_new, (void *)__pyx_tp_new__memoryviewslice},
+  {0, 0}, /* sentinel terminating the slot list */
+};
+static PyType_Spec __pyx_type___pyx_memoryviewslice_spec = { /* Type specification of the _memoryviewslice type (type-spec build variant); PyType_Spec fields are positional */
+  "fairseq.data.token_block_utils_fast._memoryviewslice", /* name */
+  sizeof(struct __pyx_memoryviewslice_obj), /* basicsize */
+  0, /* itemsize */
+  Py_TPFLAGS_DEFAULT|Py_TPFLAGS_HAVE_VERSION_TAG|Py_TPFLAGS_CHECKTYPES|Py_TPFLAGS_HAVE_NEWBUFFER|Py_TPFLAGS_BASETYPE|Py_TPFLAGS_HAVE_GC|Py_TPFLAGS_SEQUENCE, /* flags; unlike the memoryview spec this one also sets Py_TPFLAGS_SEQUENCE */
+  __pyx_type___pyx_memoryviewslice_slots, /* slots */
+};
+#else
+
+static PyTypeObject __pyx_type___pyx_memoryviewslice = { /* Statically initialised type object of the _memoryviewslice type; compiled only when CYTHON_USE_TYPE_SPECS is false. Fields are positional, so the order and the version guards below must mirror the PyTypeObject layout */
+  PyVarObject_HEAD_INIT(0, 0)
+  "fairseq.data.token_block_utils_fast.""_memoryviewslice", /*tp_name*/
+  sizeof(struct __pyx_memoryviewslice_obj), /*tp_basicsize*/
+  0, /*tp_itemsize*/
+  __pyx_tp_dealloc__memoryviewslice, /*tp_dealloc*/
+  #if PY_VERSION_HEX < 0x030800b4
+  0, /*tp_print*/
+  #endif
+  #if PY_VERSION_HEX >= 0x030800b4
+  0, /*tp_vectorcall_offset*/
+  #endif
+  0, /*tp_getattr*/
+  0, /*tp_setattr*/
+  #if PY_MAJOR_VERSION < 3
+  0, /*tp_compare*/
+  #endif
+  #if PY_MAJOR_VERSION >= 3
+  0, /*tp_as_async*/
+  #endif
+  #if CYTHON_COMPILING_IN_PYPY || 0 /* on PyPy the base memoryview repr is set explicitly; otherwise the slot stays 0 (presumably inherited from the base type - confirm) */
+  __pyx_memoryview___repr__, /*tp_repr*/
+  #else
+  0, /*tp_repr*/
+  #endif
+  0, /*tp_as_number*/
+  0, /*tp_as_sequence*/
+  0, /*tp_as_mapping*/
+  0, /*tp_hash*/
+  0, /*tp_call*/
+  #if CYTHON_COMPILING_IN_PYPY || 0 /* same PyPy-only treatment as tp_repr above */
+  __pyx_memoryview___str__, /*tp_str*/
+  #else
+  0, /*tp_str*/
+  #endif
+  0, /*tp_getattro*/
+  0, /*tp_setattro*/
+  0, /*tp_as_buffer*/
+  Py_TPFLAGS_DEFAULT|Py_TPFLAGS_HAVE_VERSION_TAG|Py_TPFLAGS_CHECKTYPES|Py_TPFLAGS_HAVE_NEWBUFFER|Py_TPFLAGS_BASETYPE|Py_TPFLAGS_HAVE_GC|Py_TPFLAGS_SEQUENCE, /*tp_flags*/
+  PyDoc_STR("Internal class for passing memoryview slices to Python"), /*tp_doc*/
+  __pyx_tp_traverse__memoryviewslice, /*tp_traverse*/
+  __pyx_tp_clear__memoryviewslice, /*tp_clear*/
+  0, /*tp_richcompare*/
+  0, /*tp_weaklistoffset*/
+  0, /*tp_iter*/
+  0, /*tp_iternext*/
+  __pyx_methods__memoryviewslice, /*tp_methods*/
+  0, /*tp_members*/
+  0, /*tp_getset*/
+  0, /*tp_base*/
+  0, /*tp_dict*/
+  0, /*tp_descr_get*/
+  0, /*tp_descr_set*/
+  #if !CYTHON_USE_TYPE_SPECS /* always true here: this initializer sits in the !CYTHON_USE_TYPE_SPECS branch */
+  0, /*tp_dictoffset*/
+  #endif
+  0, /*tp_init*/
+  0, /*tp_alloc*/
+  __pyx_tp_new__memoryviewslice, /*tp_new*/
+  0, /*tp_free*/
+  0, /*tp_is_gc*/
+  0, /*tp_bases*/
+  0, /*tp_mro*/
+  0, /*tp_cache*/
+  0, /*tp_subclasses*/
+  0, /*tp_weaklist*/
+  0, /*tp_del*/
+  0, /*tp_version_tag*/
+  #if PY_VERSION_HEX >= 0x030400a1
+  #if CYTHON_USE_TP_FINALIZE
+  0, /*tp_finalize*/
+  #else
+  NULL, /*tp_finalize*/
+  #endif
+  #endif
+  #if PY_VERSION_HEX >= 0x030800b1 && (!CYTHON_COMPILING_IN_PYPY || PYPY_VERSION_NUM >= 0x07030800)
+  0, /*tp_vectorcall*/
+  #endif
+  #if __PYX_NEED_TP_PRINT_SLOT == 1
+  0, /*tp_print*/
+  #endif
+  #if PY_VERSION_HEX >= 0x030C0000
+  0, /*tp_watched*/
+  #endif
+  #if PY_VERSION_HEX >= 0x030d00A4
+  0, /*tp_versions_used*/
+  #endif
+  #if CYTHON_COMPILING_IN_PYPY && PY_VERSION_HEX >= 0x03090000 && PY_VERSION_HEX < 0x030a0000
+  0, /*tp_pypy_flags*/
+  #endif
+};
+#endif
+
+static PyMethodDef __pyx_methods[] = { /* PyMethodDef table named __pyx_methods; it holds no entries, only the terminator */
+  {0, 0, 0, 0} /* all-zero sentinel terminating the table */
+};
+#ifndef CYTHON_SMALL_CODE /* provide a default only when the build has not already defined CYTHON_SMALL_CODE */
+#if defined(__clang__)
+    #define CYTHON_SMALL_CODE /* clang: expands to nothing */
+#elif defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)) /* GCC 4.3 or newer */
+    #define CYTHON_SMALL_CODE __attribute__((cold))
+#else
+    #define CYTHON_SMALL_CODE /* any other compiler: expands to nothing */
+#endif
+#endif
+/* #### Code section: pystring_table ### */
+
+static int __Pyx_CreateStringTabAndInitStrings(void) {
+  __Pyx_StringTabEntry __pyx_string_tab[] = {
+    {&__pyx_kp_u_, __pyx_k_, sizeof(__pyx_k_), 0, 1, 0, 0},
+    {&__pyx_n_s_ASCII, __pyx_k_ASCII, sizeof(__pyx_k_ASCII), 0, 0, 1, 1},
+    {&__pyx_kp_s_All_dimensions_preceding_dimensi, __pyx_k_All_dimensions_preceding_dimensi, sizeof(__pyx_k_All_dimensions_preceding_dimensi), 0, 0, 1, 0},
+    {&__pyx_n_s_AssertionError, __pyx_k_AssertionError, sizeof(__pyx_k_AssertionError), 0, 0, 1, 1},
+    {&__pyx_kp_s_Buffer_view_does_not_expose_stri, __pyx_k_Buffer_view_does_not_expose_stri, sizeof(__pyx_k_Buffer_view_does_not_expose_stri), 0, 0, 1, 0},
+    {&__pyx_kp_s_Can_only_create_a_buffer_that_is, __pyx_k_Can_only_create_a_buffer_that_is, sizeof(__pyx_k_Can_only_create_a_buffer_that_is), 0, 0, 1, 0},
+    {&__pyx_kp_s_Cannot_assign_to_read_only_memor, __pyx_k_Cannot_assign_to_read_only_memor, sizeof(__pyx_k_Cannot_assign_to_read_only_memor), 0, 0, 1, 0},
+    {&__pyx_kp_s_Cannot_create_writable_memory_vi, __pyx_k_Cannot_create_writable_memory_vi, sizeof(__pyx_k_Cannot_create_writable_memory_vi), 0, 0, 1, 0},
+    {&__pyx_kp_u_Cannot_index_with_type, __pyx_k_Cannot_index_with_type, sizeof(__pyx_k_Cannot_index_with_type), 0, 1, 0, 0},
+    {&__pyx_kp_s_Cannot_transpose_memoryview_with, __pyx_k_Cannot_transpose_memoryview_with, sizeof(__pyx_k_Cannot_transpose_memoryview_with), 0, 0, 1, 0},
+    {&__pyx_n_s_DTYPE, __pyx_k_DTYPE, sizeof(__pyx_k_DTYPE), 0, 0, 1, 1},
+    {&__pyx_n_s_DatasetSearcher, __pyx_k_DatasetSearcher, sizeof(__pyx_k_DatasetSearcher), 0, 0, 1, 1},
+    {&__pyx_n_s_DatasetSearcher___reduce_cython, __pyx_k_DatasetSearcher___reduce_cython, sizeof(__pyx_k_DatasetSearcher___reduce_cython), 0, 0, 1, 1},
+    {&__pyx_n_s_DatasetSearcher___setstate_cytho, __pyx_k_DatasetSearcher___setstate_cytho, sizeof(__pyx_k_DatasetSearcher___setstate_cytho), 0, 0, 1, 1},
+    {&__pyx_kp_s_Dimension_d_is_not_direct, __pyx_k_Dimension_d_is_not_direct, sizeof(__pyx_k_Dimension_d_is_not_direct), 0, 0, 1, 0},
+    {&__pyx_n_s_Ellipsis, __pyx_k_Ellipsis, sizeof(__pyx_k_Ellipsis), 0, 0, 1, 1},
+    {&__pyx_kp_s_Empty_shape_tuple_for_cython_arr, __pyx_k_Empty_shape_tuple_for_cython_arr, sizeof(__pyx_k_Empty_shape_tuple_for_cython_arr), 0, 0, 1, 0},
+    {&__pyx_n_s_ImportError, __pyx_k_ImportError, sizeof(__pyx_k_ImportError), 0, 0, 1, 1},
+    {&__pyx_kp_s_Incompatible_checksums_0x_x_vs_0, __pyx_k_Incompatible_checksums_0x_x_vs_0, sizeof(__pyx_k_Incompatible_checksums_0x_x_vs_0), 0, 0, 1, 0},
+    {&__pyx_kp_s_Incompatible_checksums_0x_x_vs_0_2, __pyx_k_Incompatible_checksums_0x_x_vs_0_2, sizeof(__pyx_k_Incompatible_checksums_0x_x_vs_0_2), 0, 0, 1, 0},
+    {&__pyx_n_s_IndexError, __pyx_k_IndexError, sizeof(__pyx_k_IndexError), 0, 0, 1, 1},
+    {&__pyx_kp_s_Index_out_of_bounds_axis_d, __pyx_k_Index_out_of_bounds_axis_d, sizeof(__pyx_k_Index_out_of_bounds_axis_d), 0, 0, 1, 0},
+    {&__pyx_kp_s_Indirect_dimensions_not_supporte, __pyx_k_Indirect_dimensions_not_supporte, sizeof(__pyx_k_Indirect_dimensions_not_supporte), 0, 0, 1, 0},
+    {&__pyx_kp_u_Invalid_break_mode, __pyx_k_Invalid_break_mode, sizeof(__pyx_k_Invalid_break_mode), 0, 1, 0, 0},
+    {&__pyx_kp_u_Invalid_mode_expected_c_or_fortr, __pyx_k_Invalid_mode_expected_c_or_fortr, sizeof(__pyx_k_Invalid_mode_expected_c_or_fortr), 0, 1, 0, 0},
+    {&__pyx_kp_u_Invalid_shape_in_axis, __pyx_k_Invalid_shape_in_axis, sizeof(__pyx_k_Invalid_shape_in_axis), 0, 1, 0, 0},
+    {&__pyx_n_s_MemoryError, __pyx_k_MemoryError, sizeof(__pyx_k_MemoryError), 0, 0, 1, 1},
+    {&__pyx_kp_s_MemoryView_of_r_at_0x_x, __pyx_k_MemoryView_of_r_at_0x_x, sizeof(__pyx_k_MemoryView_of_r_at_0x_x), 0, 0, 1, 0},
+    {&__pyx_kp_s_MemoryView_of_r_object, __pyx_k_MemoryView_of_r_object, sizeof(__pyx_k_MemoryView_of_r_object), 0, 0, 1, 0},
+    {&__pyx_n_b_O, __pyx_k_O, sizeof(__pyx_k_O), 0, 0, 0, 1},
+    {&__pyx_kp_u_Out_of_bounds_on_buffer_access_a, __pyx_k_Out_of_bounds_on_buffer_access_a, sizeof(__pyx_k_Out_of_bounds_on_buffer_access_a), 0, 1, 0, 0},
+    {&__pyx_n_s_PickleError, __pyx_k_PickleError, sizeof(__pyx_k_PickleError), 0, 0, 1, 1},
+    {&__pyx_n_s_Sequence, __pyx_k_Sequence, sizeof(__pyx_k_Sequence), 0, 0, 1, 1},
+    {&__pyx_kp_s_Step_may_not_be_zero_axis_d, __pyx_k_Step_may_not_be_zero_axis_d, sizeof(__pyx_k_Step_may_not_be_zero_axis_d), 0, 0, 1, 0},
+    {&__pyx_n_s_TypeError, __pyx_k_TypeError, sizeof(__pyx_k_TypeError), 0, 0, 1, 1},
+    {&__pyx_kp_s_Unable_to_convert_item_to_object, __pyx_k_Unable_to_convert_item_to_object, sizeof(__pyx_k_Unable_to_convert_item_to_object), 0, 0, 1, 0},
+    {&__pyx_n_s_ValueError, __pyx_k_ValueError, sizeof(__pyx_k_ValueError), 0, 0, 1, 1},
+    {&__pyx_n_s_View_MemoryView, __pyx_k_View_MemoryView, sizeof(__pyx_k_View_MemoryView), 0, 0, 1, 1},
+    {&__pyx_kp_u__2, __pyx_k__2, sizeof(__pyx_k__2), 0, 1, 0, 0},
+    {&__pyx_n_s__3, __pyx_k__3, sizeof(__pyx_k__3), 0, 0, 1, 1},
+    {&__pyx_n_s__35, __pyx_k__35, sizeof(__pyx_k__35), 0, 0, 1, 1},
+    {&__pyx_kp_u__6, __pyx_k__6, sizeof(__pyx_k__6), 0, 1, 0, 0},
+    {&__pyx_kp_u__7, __pyx_k__7, sizeof(__pyx_k__7), 0, 1, 0, 0},
+    {&__pyx_n_s_abc, __pyx_k_abc, sizeof(__pyx_k_abc), 0, 0, 1, 1},
+    {&__pyx_n_s_allocate_buffer, __pyx_k_allocate_buffer, sizeof(__pyx_k_allocate_buffer), 0, 0, 1, 1},
+    {&__pyx_kp_u_and, __pyx_k_and, sizeof(__pyx_k_and), 0, 1, 0, 0},
+    {&__pyx_n_s_asyncio_coroutines, __pyx_k_asyncio_coroutines, sizeof(__pyx_k_asyncio_coroutines), 0, 0, 1, 1},
+    {&__pyx_n_s_axis, __pyx_k_axis, sizeof(__pyx_k_axis), 0, 0, 1, 1},
+    {&__pyx_n_s_base, __pyx_k_base, sizeof(__pyx_k_base), 0, 0, 1, 1},
+    {&__pyx_n_s_block_size, __pyx_k_block_size, sizeof(__pyx_k_block_size), 0, 0, 1, 1},
+    {&__pyx_n_s_break_mode, __pyx_k_break_mode, sizeof(__pyx_k_break_mode), 0, 0, 1, 1},
+    {&__pyx_n_s_c, __pyx_k_c, sizeof(__pyx_k_c), 0, 0, 1, 1},
+    {&__pyx_n_u_c, __pyx_k_c, sizeof(__pyx_k_c), 0, 1, 0, 1},
+    {&__pyx_n_s_chain, __pyx_k_chain, sizeof(__pyx_k_chain), 0, 0, 1, 1},
+    {&__pyx_n_s_class, __pyx_k_class, sizeof(__pyx_k_class), 0, 0, 1, 1},
+    {&__pyx_n_s_class_getitem, __pyx_k_class_getitem, sizeof(__pyx_k_class_getitem), 0, 0, 1, 1},
+    {&__pyx_n_s_cline_in_traceback, __pyx_k_cline_in_traceback, sizeof(__pyx_k_cline_in_traceback), 0, 0, 1, 1},
+    {&__pyx_n_s_collections, __pyx_k_collections, sizeof(__pyx_k_collections), 0, 0, 1, 1},
+    {&__pyx_kp_s_collections_abc, __pyx_k_collections_abc, sizeof(__pyx_k_collections_abc), 0, 0, 1, 0},
+    {&__pyx_n_u_complete, __pyx_k_complete, sizeof(__pyx_k_complete), 0, 1, 0, 1},
+    {&__pyx_n_u_complete_doc, __pyx_k_complete_doc, sizeof(__pyx_k_complete_doc), 0, 1, 0, 1},
+    {&__pyx_kp_s_contiguous_and_direct, __pyx_k_contiguous_and_direct, sizeof(__pyx_k_contiguous_and_direct), 0, 0, 1, 0},
+    {&__pyx_kp_s_contiguous_and_indirect, __pyx_k_contiguous_and_indirect, sizeof(__pyx_k_contiguous_and_indirect), 0, 0, 1, 0},
+    {&__pyx_n_s_count, __pyx_k_count, sizeof(__pyx_k_count), 0, 0, 1, 1},
+    {&__pyx_n_s_cumsum, __pyx_k_cumsum, sizeof(__pyx_k_cumsum), 0, 0, 1, 1},
+    {&__pyx_n_s_dict, __pyx_k_dict, sizeof(__pyx_k_dict), 0, 0, 1, 1},
+    {&__pyx_n_s_dict_2, __pyx_k_dict_2, sizeof(__pyx_k_dict_2), 0, 0, 1, 1},
+    {&__pyx_kp_u_disable, __pyx_k_disable, sizeof(__pyx_k_disable), 0, 1, 0, 0},
+    {&__pyx_n_s_document_sep_len, __pyx_k_document_sep_len, sizeof(__pyx_k_document_sep_len), 0, 0, 1, 1},
+    {&__pyx_n_s_dtype, __pyx_k_dtype, sizeof(__pyx_k_dtype), 0, 0, 1, 1},
+    {&__pyx_n_s_dtype_is_object, __pyx_k_dtype_is_object, sizeof(__pyx_k_dtype_is_object), 0, 0, 1, 1},
+    {&__pyx_kp_u_enable, __pyx_k_enable, sizeof(__pyx_k_enable), 0, 1, 0, 0},
+    {&__pyx_n_s_encode, __pyx_k_encode, sizeof(__pyx_k_encode), 0, 0, 1, 1},
+    {&__pyx_n_s_enumerate, __pyx_k_enumerate, sizeof(__pyx_k_enumerate), 0, 0, 1, 1},
+    {&__pyx_n_u_eos, __pyx_k_eos, sizeof(__pyx_k_eos), 0, 1, 0, 1},
+    {&__pyx_n_s_error, __pyx_k_error, sizeof(__pyx_k_error), 0, 0, 1, 1},
+    {&__pyx_kp_s_fairseq_data_token_block_utils_f, __pyx_k_fairseq_data_token_block_utils_f, sizeof(__pyx_k_fairseq_data_token_block_utils_f), 0, 0, 1, 0},
+    {&__pyx_n_s_fairseq_data_token_block_utils_f_2, __pyx_k_fairseq_data_token_block_utils_f_2, sizeof(__pyx_k_fairseq_data_token_block_utils_f_2), 0, 0, 1, 1},
+    {&__pyx_n_s_flags, __pyx_k_flags, sizeof(__pyx_k_flags), 0, 0, 1, 1},
+    {&__pyx_n_s_format, __pyx_k_format, sizeof(__pyx_k_format), 0, 0, 1, 1},
+    {&__pyx_n_s_fortran, __pyx_k_fortran, sizeof(__pyx_k_fortran), 0, 0, 1, 1},
+    {&__pyx_n_u_fortran, __pyx_k_fortran, sizeof(__pyx_k_fortran), 0, 1, 0, 1},
+    {&__pyx_n_s_from_iterable, __pyx_k_from_iterable, sizeof(__pyx_k_from_iterable), 0, 0, 1, 1},
+    {&__pyx_n_s_fromiter, __pyx_k_fromiter, sizeof(__pyx_k_fromiter), 0, 0, 1, 1},
+    {&__pyx_kp_u_gc, __pyx_k_gc, sizeof(__pyx_k_gc), 0, 1, 0, 0},
+    {&__pyx_n_s_get_block_to_dataset_index_fast, __pyx_k_get_block_to_dataset_index_fast, sizeof(__pyx_k_get_block_to_dataset_index_fast), 0, 0, 1, 1},
+    {&__pyx_n_s_get_slice_indices_fast, __pyx_k_get_slice_indices_fast, sizeof(__pyx_k_get_slice_indices_fast), 0, 0, 1, 1},
+    {&__pyx_n_s_getstate, __pyx_k_getstate, sizeof(__pyx_k_getstate), 0, 0, 1, 1},
+    {&__pyx_kp_u_got, __pyx_k_got, sizeof(__pyx_k_got), 0, 1, 0, 0},
+    {&__pyx_kp_u_got_differing_extents_in_dimensi, __pyx_k_got_differing_extents_in_dimensi, sizeof(__pyx_k_got_differing_extents_in_dimensi), 0, 1, 0, 0},
+    {&__pyx_n_s_id, __pyx_k_id, sizeof(__pyx_k_id), 0, 0, 1, 1},
+    {&__pyx_n_s_import, __pyx_k_import, sizeof(__pyx_k_import), 0, 0, 1, 1},
+    {&__pyx_n_s_index, __pyx_k_index, sizeof(__pyx_k_index), 0, 0, 1, 1},
+    {&__pyx_n_s_initializing, __pyx_k_initializing, sizeof(__pyx_k_initializing), 0, 0, 1, 1},
+    {&__pyx_n_s_int64, __pyx_k_int64, sizeof(__pyx_k_int64), 0, 0, 1, 1},
+    {&__pyx_n_s_is_coroutine, __pyx_k_is_coroutine, sizeof(__pyx_k_is_coroutine), 0, 0, 1, 1},
+    {&__pyx_kp_u_isenabled, __pyx_k_isenabled, sizeof(__pyx_k_isenabled), 0, 1, 0, 0},
+    {&__pyx_n_s_itemsize, __pyx_k_itemsize, sizeof(__pyx_k_itemsize), 0, 0, 1, 1},
+    {&__pyx_kp_s_itemsize_0_for_cython_array, __pyx_k_itemsize_0_for_cython_array, sizeof(__pyx_k_itemsize_0_for_cython_array), 0, 0, 1, 0},
+    {&__pyx_n_s_itertools, __pyx_k_itertools, sizeof(__pyx_k_itertools), 0, 0, 1, 1},
+    {&__pyx_n_s_main, __pyx_k_main, sizeof(__pyx_k_main), 0, 0, 1, 1},
+    {&__pyx_n_s_memview, __pyx_k_memview, sizeof(__pyx_k_memview), 0, 0, 1, 1},
+    {&__pyx_n_s_mode, __pyx_k_mode, sizeof(__pyx_k_mode), 0, 0, 1, 1},
+    {&__pyx_n_s_name, __pyx_k_name, sizeof(__pyx_k_name), 0, 0, 1, 1},
+    {&__pyx_n_s_name_2, __pyx_k_name_2, sizeof(__pyx_k_name_2), 0, 0, 1, 1},
+    {&__pyx_n_s_ndim, __pyx_k_ndim, sizeof(__pyx_k_ndim), 0, 0, 1, 1},
+    {&__pyx_n_s_new, __pyx_k_new, sizeof(__pyx_k_new), 0, 0, 1, 1},
+    {&__pyx_kp_s_no_default___reduce___due_to_non, __pyx_k_no_default___reduce___due_to_non, sizeof(__pyx_k_no_default___reduce___due_to_non), 0, 0, 1, 0},
+    {&__pyx_n_u_none, __pyx_k_none, sizeof(__pyx_k_none), 0, 1, 0, 1},
+    {&__pyx_n_s_np, __pyx_k_np, sizeof(__pyx_k_np), 0, 0, 1, 1},
+    {&__pyx_n_s_numpy, __pyx_k_numpy, sizeof(__pyx_k_numpy), 0, 0, 1, 1},
+    {&__pyx_kp_u_numpy__core_multiarray_failed_to, __pyx_k_numpy__core_multiarray_failed_to, sizeof(__pyx_k_numpy__core_multiarray_failed_to), 0, 1, 0, 0},
+    {&__pyx_kp_u_numpy__core_umath_failed_to_impo, __pyx_k_numpy__core_umath_failed_to_impo, sizeof(__pyx_k_numpy__core_umath_failed_to_impo), 0, 1, 0, 0},
+    {&__pyx_n_s_obj, __pyx_k_obj, sizeof(__pyx_k_obj), 0, 0, 1, 1},
+    {&__pyx_n_s_pack, __pyx_k_pack, sizeof(__pyx_k_pack), 0, 0, 1, 1},
+    {&__pyx_n_s_pickle, __pyx_k_pickle, sizeof(__pyx_k_pickle), 0, 0, 1, 1},
+    {&__pyx_n_s_pyx_PickleError, __pyx_k_pyx_PickleError, sizeof(__pyx_k_pyx_PickleError), 0, 0, 1, 1},
+    {&__pyx_n_s_pyx_checksum, __pyx_k_pyx_checksum, sizeof(__pyx_k_pyx_checksum), 0, 0, 1, 1},
+    {&__pyx_n_s_pyx_result, __pyx_k_pyx_result, sizeof(__pyx_k_pyx_result), 0, 0, 1, 1},
+    {&__pyx_n_s_pyx_state, __pyx_k_pyx_state, sizeof(__pyx_k_pyx_state), 0, 0, 1, 1},
+    {&__pyx_n_s_pyx_type, __pyx_k_pyx_type, sizeof(__pyx_k_pyx_type), 0, 0, 1, 1},
+    {&__pyx_n_s_pyx_unpickle_DatasetSearcher, __pyx_k_pyx_unpickle_DatasetSearcher, sizeof(__pyx_k_pyx_unpickle_DatasetSearcher), 0, 0, 1, 1},
+    {&__pyx_n_s_pyx_unpickle_Enum, __pyx_k_pyx_unpickle_Enum, sizeof(__pyx_k_pyx_unpickle_Enum), 0, 0, 1, 1},
+    {&__pyx_n_s_pyx_vtable, __pyx_k_pyx_vtable, sizeof(__pyx_k_pyx_vtable), 0, 0, 1, 1},
+    {&__pyx_n_s_range, __pyx_k_range, sizeof(__pyx_k_range), 0, 0, 1, 1},
+    {&__pyx_n_s_reduce, __pyx_k_reduce, sizeof(__pyx_k_reduce), 0, 0, 1, 1},
+    {&__pyx_n_s_reduce_cython, __pyx_k_reduce_cython, sizeof(__pyx_k_reduce_cython), 0, 0, 1, 1},
+    {&__pyx_n_s_reduce_ex, __pyx_k_reduce_ex, sizeof(__pyx_k_reduce_ex), 0, 0, 1, 1},
+    {&__pyx_n_s_register, __pyx_k_register, sizeof(__pyx_k_register), 0, 0, 1, 1},
+    {&__pyx_n_s_reshape, __pyx_k_reshape, sizeof(__pyx_k_reshape), 0, 0, 1, 1},
+    {&__pyx_n_s_self, __pyx_k_self, sizeof(__pyx_k_self), 0, 0, 1, 1},
+    {&__pyx_n_s_setstate, __pyx_k_setstate, sizeof(__pyx_k_setstate), 0, 0, 1, 1},
+    {&__pyx_n_s_setstate_cython, __pyx_k_setstate_cython, sizeof(__pyx_k_setstate_cython), 0, 0, 1, 1},
+    {&__pyx_n_s_shape, __pyx_k_shape, sizeof(__pyx_k_shape), 0, 0, 1, 1},
+    {&__pyx_n_s_size, __pyx_k_size, sizeof(__pyx_k_size), 0, 0, 1, 1},
+    {&__pyx_n_s_sizes, __pyx_k_sizes, sizeof(__pyx_k_sizes), 0, 0, 1, 1},
+    {&__pyx_n_s_slice_indices, __pyx_k_slice_indices, sizeof(__pyx_k_slice_indices), 0, 0, 1, 1},
+    {&__pyx_n_s_spec, __pyx_k_spec, sizeof(__pyx_k_spec), 0, 0, 1, 1},
+    {&__pyx_n_s_start, __pyx_k_start, sizeof(__pyx_k_start), 0, 0, 1, 1},
+    {&__pyx_n_s_state, __pyx_k_state, sizeof(__pyx_k_state), 0, 0, 1, 1},
+    {&__pyx_n_s_step, __pyx_k_step, sizeof(__pyx_k_step), 0, 0, 1, 1},
+    {&__pyx_n_s_stop, __pyx_k_stop, sizeof(__pyx_k_stop), 0, 0, 1, 1},
+    {&__pyx_kp_s_strided_and_direct, __pyx_k_strided_and_direct, sizeof(__pyx_k_strided_and_direct), 0, 0, 1, 0},
+    {&__pyx_kp_s_strided_and_direct_or_indirect, __pyx_k_strided_and_direct_or_indirect, sizeof(__pyx_k_strided_and_direct_or_indirect), 0, 0, 1, 0},
+    {&__pyx_kp_s_strided_and_indirect, __pyx_k_strided_and_indirect, sizeof(__pyx_k_strided_and_indirect), 0, 0, 1, 0},
+    {&__pyx_kp_s_stringsource, __pyx_k_stringsource, sizeof(__pyx_k_stringsource), 0, 0, 1, 0},
+    {&__pyx_n_s_struct, __pyx_k_struct, sizeof(__pyx_k_struct), 0, 0, 1, 1},
+    {&__pyx_n_s_sum, __pyx_k_sum, sizeof(__pyx_k_sum), 0, 0, 1, 1},
+    {&__pyx_n_s_sys, __pyx_k_sys, sizeof(__pyx_k_sys), 0, 0, 1, 1},
+    {&__pyx_n_s_test, __pyx_k_test, sizeof(__pyx_k_test), 0, 0, 1, 1},
+    {&__pyx_n_s_torch, __pyx_k_torch, sizeof(__pyx_k_torch), 0, 0, 1, 1},
+    {&__pyx_kp_s_unable_to_allocate_array_data, __pyx_k_unable_to_allocate_array_data, sizeof(__pyx_k_unable_to_allocate_array_data), 0, 0, 1, 0},
+    {&__pyx_kp_s_unable_to_allocate_shape_and_str, __pyx_k_unable_to_allocate_shape_and_str, sizeof(__pyx_k_unable_to_allocate_shape_and_str), 0, 0, 1, 0},
+    {&__pyx_n_s_unpack, __pyx_k_unpack, sizeof(__pyx_k_unpack), 0, 0, 1, 1},
+    {&__pyx_n_s_update, __pyx_k_update, sizeof(__pyx_k_update), 0, 0, 1, 1},
+    {&__pyx_n_s_use_setstate, __pyx_k_use_setstate, sizeof(__pyx_k_use_setstate), 0, 0, 1, 1},
+    {&__pyx_n_s_version_info, __pyx_k_version_info, sizeof(__pyx_k_version_info), 0, 0, 1, 1},
+    {&__pyx_n_s_zeros, __pyx_k_zeros, sizeof(__pyx_k_zeros), 0, 0, 1, 1},
+    {0, 0, 0, 0, 0, 0, 0}
+  };
+  return __Pyx_InitStrings(__pyx_string_tab);
+}
+/* #### Code section: cached_builtins ### */
+static CYTHON_SMALL_CODE int __Pyx_InitCachedBuiltins(void) { /* Looks up each Python builtin listed below by name and stores the result in the matching __pyx_builtin_* variable. Returns 0 when every lookup succeeds, -1 otherwise. */
+  __pyx_builtin_range = __Pyx_GetBuiltinName(__pyx_n_s_range); if (!__pyx_builtin_range) __PYX_ERR(0, 30, __pyx_L1_error) /* NOTE(review): the (file index, line) arguments appear to name a Cython source location; index 0 looks like the .pyx file, 1 the memoryview utility code, 2 the numpy .pxd -- confirm against the __PYX_ERR definition */
+  __pyx_builtin_ValueError = __Pyx_GetBuiltinName(__pyx_n_s_ValueError); if (!__pyx_builtin_ValueError) __PYX_ERR(0, 102, __pyx_L1_error)
+  __pyx_builtin_AssertionError = __Pyx_GetBuiltinName(__pyx_n_s_AssertionError); if (!__pyx_builtin_AssertionError) __PYX_ERR(0, 171, __pyx_L1_error)
+  __pyx_builtin___import__ = __Pyx_GetBuiltinName(__pyx_n_s_import); if (!__pyx_builtin___import__) __PYX_ERR(1, 100, __pyx_L1_error)
+  __pyx_builtin_MemoryError = __Pyx_GetBuiltinName(__pyx_n_s_MemoryError); if (!__pyx_builtin_MemoryError) __PYX_ERR(1, 156, __pyx_L1_error)
+  __pyx_builtin_enumerate = __Pyx_GetBuiltinName(__pyx_n_s_enumerate); if (!__pyx_builtin_enumerate) __PYX_ERR(1, 159, __pyx_L1_error)
+  __pyx_builtin_TypeError = __Pyx_GetBuiltinName(__pyx_n_s_TypeError); if (!__pyx_builtin_TypeError) __PYX_ERR(1, 2, __pyx_L1_error)
+  __pyx_builtin_Ellipsis = __Pyx_GetBuiltinName(__pyx_n_s_Ellipsis); if (!__pyx_builtin_Ellipsis) __PYX_ERR(1, 408, __pyx_L1_error)
+  __pyx_builtin_id = __Pyx_GetBuiltinName(__pyx_n_s_id); if (!__pyx_builtin_id) __PYX_ERR(1, 618, __pyx_L1_error)
+  __pyx_builtin_IndexError = __Pyx_GetBuiltinName(__pyx_n_s_IndexError); if (!__pyx_builtin_IndexError) __PYX_ERR(1, 914, __pyx_L1_error)
+  __pyx_builtin_ImportError = __Pyx_GetBuiltinName(__pyx_n_s_ImportError); if (!__pyx_builtin_ImportError) __PYX_ERR(2, 1043, __pyx_L1_error)
+  return 0; /* all eleven builtins resolved */
+  __pyx_L1_error:; /* target label handed to every __PYX_ERR above */
+  return -1;
+}
+/* #### Code section: cached_constants ### */
+
+static CYTHON_SMALL_CODE int __Pyx_InitCachedConstants(void) {
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__Pyx_InitCachedConstants", 0);
+  /* Builds the tuple, slice and code-object constants referenced by the module and stores each in a __pyx_tuple__N / __pyx_slice__N / __pyx_codeobj__N variable. Returns 0 on success; any NULL result is routed to __pyx_L1_error and the function returns -1. The block comments below quote the Cython source line each constant belongs to. */
+  /* "View.MemoryView":582
+ *     def suboffsets(self):
+ *         if self.view.suboffsets == NULL:
+ *             return (-1,) * self.view.ndim             # <<<<<<<<<<<<<<
+ * 
+ *         return tuple([suboffset for suboffset in self.view.suboffsets[:self.view.ndim]])
+ */
+  __pyx_tuple__4 = PyTuple_New(1); if (unlikely(!__pyx_tuple__4)) __PYX_ERR(1, 582, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_tuple__4);
+  __Pyx_INCREF(__pyx_int_neg_1);
+  __Pyx_GIVEREF(__pyx_int_neg_1);
+  if (__Pyx_PyTuple_SET_ITEM(__pyx_tuple__4, 0, __pyx_int_neg_1)) __PYX_ERR(1, 582, __pyx_L1_error);
+  __Pyx_GIVEREF(__pyx_tuple__4);
+
+  /* "View.MemoryView":679
+ *     tup = <tuple>index if isinstance(index, tuple) else (index,)
+ * 
+ *     result = [slice(None)] * ndim             # <<<<<<<<<<<<<<
+ *     have_slices = False
+ *     seen_ellipsis = False
+ */
+  __pyx_slice__5 = PySlice_New(Py_None, Py_None, Py_None); if (unlikely(!__pyx_slice__5)) __PYX_ERR(1, 679, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_slice__5);
+  __Pyx_GIVEREF(__pyx_slice__5);
+
+  /* "(tree fragment)":4
+ *     cdef object __pyx_PickleError
+ *     cdef object __pyx_result
+ *     if __pyx_checksum not in (0x82a3537, 0x6ae9995, 0xb068931):             # <<<<<<<<<<<<<<
+ *         from pickle import PickleError as __pyx_PickleError
+ *         raise __pyx_PickleError, "Incompatible checksums (0x%x vs (0x82a3537, 0x6ae9995, 0xb068931) = (name))" % __pyx_checksum
+ */
+  __pyx_tuple__8 = PyTuple_Pack(3, __pyx_int_136983863, __pyx_int_112105877, __pyx_int_184977713); if (unlikely(!__pyx_tuple__8)) __PYX_ERR(1, 4, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_tuple__8);
+  __Pyx_GIVEREF(__pyx_tuple__8);
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1043
+ *         __pyx_import_array()
+ *     except Exception:
+ *         raise ImportError("numpy._core.multiarray failed to import")             # <<<<<<<<<<<<<<
+ * 
+ * cdef inline int import_umath() except -1:
+ */
+  __pyx_tuple__9 = PyTuple_Pack(1, __pyx_kp_u_numpy__core_multiarray_failed_to); if (unlikely(!__pyx_tuple__9)) __PYX_ERR(2, 1043, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_tuple__9);
+  __Pyx_GIVEREF(__pyx_tuple__9);
+
+  /* "../../../../tmp/pip-build-env-7z019gw3/normal/lib/python3.10/site-packages/numpy/__init__.cython-30.pxd":1049
+ *         _import_umath()
+ *     except Exception:
+ *         raise ImportError("numpy._core.umath failed to import")             # <<<<<<<<<<<<<<
+ * 
+ * cdef inline int import_ufunc() except -1:
+ */
+  __pyx_tuple__10 = PyTuple_Pack(1, __pyx_kp_u_numpy__core_umath_failed_to_impo); if (unlikely(!__pyx_tuple__10)) __PYX_ERR(2, 1049, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_tuple__10);
+  __Pyx_GIVEREF(__pyx_tuple__10);
+
+  /* "fairseq/data/token_block_utils_fast.pyx":99
+ *         slice_indices = np.zeros((len(sizes), 2), dtype=DTYPE)
+ *         cumsum = sizes.cumsum(axis=0)
+ *         slice_indices[1:, 0] = cumsum[:cumsum.shape[0] - 1]             # <<<<<<<<<<<<<<
+ *         slice_indices[:, 1] = cumsum
+ *     else:
+ */
+  __pyx_slice__11 = PySlice_New(__pyx_int_1, Py_None, Py_None); if (unlikely(!__pyx_slice__11)) __PYX_ERR(0, 99, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_slice__11);
+  __Pyx_GIVEREF(__pyx_slice__11);
+  __pyx_tuple__12 = PyTuple_Pack(2, __pyx_slice__11, __pyx_int_0); if (unlikely(!__pyx_tuple__12)) __PYX_ERR(0, 99, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_tuple__12);
+  __Pyx_GIVEREF(__pyx_tuple__12);
+
+  /* "fairseq/data/token_block_utils_fast.pyx":100
+ *         cumsum = sizes.cumsum(axis=0)
+ *         slice_indices[1:, 0] = cumsum[:cumsum.shape[0] - 1]
+ *         slice_indices[:, 1] = cumsum             # <<<<<<<<<<<<<<
+ *     else:
+ *         raise ValueError('Invalid break_mode: ' + break_mode)
+ */
+  __pyx_tuple__13 = PyTuple_Pack(2, __pyx_slice__5, __pyx_int_1); if (unlikely(!__pyx_tuple__13)) __PYX_ERR(0, 100, __pyx_L1_error) /* reuses __pyx_slice__5, the slice(None, None, None) constant created earlier in this function */
+  __Pyx_GOTREF(__pyx_tuple__13);
+  __Pyx_GIVEREF(__pyx_tuple__13);
+
+  /* "(tree fragment)":4
+ *     cdef object __pyx_PickleError
+ *     cdef object __pyx_result
+ *     if __pyx_checksum not in (0x8c67b45, 0x2e2dd22, 0x6632805):             # <<<<<<<<<<<<<<
+ *         from pickle import PickleError as __pyx_PickleError
+ *         raise __pyx_PickleError, "Incompatible checksums (0x%x vs (0x8c67b45, 0x2e2dd22, 0x6632805) = (current_i, current_index, current_offset, sizes))" % __pyx_checksum
+ */
+  __pyx_tuple__14 = PyTuple_Pack(3, __pyx_int_147225413, __pyx_int_48422178, __pyx_int_107161605); if (unlikely(!__pyx_tuple__14)) __PYX_ERR(1, 4, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_tuple__14);
+  __Pyx_GIVEREF(__pyx_tuple__14);
+
+  /* "View.MemoryView":100
+ * cdef object __pyx_collections_abc_Sequence "__pyx_collections_abc_Sequence"
+ * try:
+ *     if __import__("sys").version_info >= (3, 3):             # <<<<<<<<<<<<<<
+ *         __pyx_collections_abc_Sequence = __import__("collections.abc").abc.Sequence
+ *     else:
+ */
+  __pyx_tuple__15 = PyTuple_Pack(1, __pyx_n_s_sys); if (unlikely(!__pyx_tuple__15)) __PYX_ERR(1, 100, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_tuple__15);
+  __Pyx_GIVEREF(__pyx_tuple__15);
+  __pyx_tuple__16 = PyTuple_Pack(2, __pyx_int_3, __pyx_int_3); if (unlikely(!__pyx_tuple__16)) __PYX_ERR(1, 100, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_tuple__16);
+  __Pyx_GIVEREF(__pyx_tuple__16);
+
+  /* "View.MemoryView":101
+ * try:
+ *     if __import__("sys").version_info >= (3, 3):
+ *         __pyx_collections_abc_Sequence = __import__("collections.abc").abc.Sequence             # <<<<<<<<<<<<<<
+ *     else:
+ *         __pyx_collections_abc_Sequence = __import__("collections").Sequence
+ */
+  __pyx_tuple__17 = PyTuple_Pack(1, __pyx_kp_s_collections_abc); if (unlikely(!__pyx_tuple__17)) __PYX_ERR(1, 101, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_tuple__17);
+  __Pyx_GIVEREF(__pyx_tuple__17);
+
+  /* "View.MemoryView":103
+ *         __pyx_collections_abc_Sequence = __import__("collections.abc").abc.Sequence
+ *     else:
+ *         __pyx_collections_abc_Sequence = __import__("collections").Sequence             # <<<<<<<<<<<<<<
+ * except:
+ * 
+ */
+  __pyx_tuple__18 = PyTuple_Pack(1, __pyx_n_s_collections); if (unlikely(!__pyx_tuple__18)) __PYX_ERR(1, 103, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_tuple__18);
+  __Pyx_GIVEREF(__pyx_tuple__18);
+
+  /* "View.MemoryView":309
+ *         return self.name
+ * 
+ * cdef generic = Enum("<strided and direct or indirect>")             # <<<<<<<<<<<<<<
+ * cdef strided = Enum("<strided and direct>") # default
+ * cdef indirect = Enum("<strided and indirect>")
+ */
+  __pyx_tuple__19 = PyTuple_Pack(1, __pyx_kp_s_strided_and_direct_or_indirect); if (unlikely(!__pyx_tuple__19)) __PYX_ERR(1, 309, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_tuple__19);
+  __Pyx_GIVEREF(__pyx_tuple__19);
+
+  /* "View.MemoryView":310
+ * 
+ * cdef generic = Enum("<strided and direct or indirect>")
+ * cdef strided = Enum("<strided and direct>") # default             # <<<<<<<<<<<<<<
+ * cdef indirect = Enum("<strided and indirect>")
+ * 
+ */
+  __pyx_tuple__20 = PyTuple_Pack(1, __pyx_kp_s_strided_and_direct); if (unlikely(!__pyx_tuple__20)) __PYX_ERR(1, 310, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_tuple__20);
+  __Pyx_GIVEREF(__pyx_tuple__20);
+
+  /* "View.MemoryView":311
+ * cdef generic = Enum("<strided and direct or indirect>")
+ * cdef strided = Enum("<strided and direct>") # default
+ * cdef indirect = Enum("<strided and indirect>")             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  __pyx_tuple__21 = PyTuple_Pack(1, __pyx_kp_s_strided_and_indirect); if (unlikely(!__pyx_tuple__21)) __PYX_ERR(1, 311, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_tuple__21);
+  __Pyx_GIVEREF(__pyx_tuple__21);
+
+  /* "View.MemoryView":314
+ * 
+ * 
+ * cdef contiguous = Enum("<contiguous and direct>")             # <<<<<<<<<<<<<<
+ * cdef indirect_contiguous = Enum("<contiguous and indirect>")
+ * 
+ */
+  __pyx_tuple__22 = PyTuple_Pack(1, __pyx_kp_s_contiguous_and_direct); if (unlikely(!__pyx_tuple__22)) __PYX_ERR(1, 314, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_tuple__22);
+  __Pyx_GIVEREF(__pyx_tuple__22);
+
+  /* "View.MemoryView":315
+ * 
+ * cdef contiguous = Enum("<contiguous and direct>")
+ * cdef indirect_contiguous = Enum("<contiguous and indirect>")             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  __pyx_tuple__23 = PyTuple_Pack(1, __pyx_kp_s_contiguous_and_indirect); if (unlikely(!__pyx_tuple__23)) __PYX_ERR(1, 315, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_tuple__23);
+  __Pyx_GIVEREF(__pyx_tuple__23);
+
+  /* "(tree fragment)":1
+ * def __pyx_unpickle_Enum(__pyx_type, long __pyx_checksum, __pyx_state):             # <<<<<<<<<<<<<<
+ *     cdef object __pyx_PickleError
+ *     cdef object __pyx_result
+ */
+  __pyx_tuple__24 = PyTuple_Pack(5, __pyx_n_s_pyx_type, __pyx_n_s_pyx_checksum, __pyx_n_s_pyx_state, __pyx_n_s_pyx_PickleError, __pyx_n_s_pyx_result); if (unlikely(!__pyx_tuple__24)) __PYX_ERR(1, 1, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_tuple__24);
+  __Pyx_GIVEREF(__pyx_tuple__24);
+  __pyx_codeobj__25 = (PyObject*)__Pyx_PyCode_New(3, 0, 0, 5, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__24, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_stringsource, __pyx_n_s_pyx_unpickle_Enum, 1, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__25)) __PYX_ERR(1, 1, __pyx_L1_error)
+
+  /* "fairseq/data/token_block_utils_fast.pyx":50
+ * @cython.wraparound(False)
+ * @cython.nonecheck(False)
+ * cpdef np.ndarray[DTYPE_t, ndim=2] _get_slice_indices_fast(np.ndarray[DTYPE_t, ndim=1] sizes, str break_mode, int block_size, int document_sep_len):             # <<<<<<<<<<<<<<
+ *     cdef DTYPE_t tok_idx = 0
+ *     cdef DTYPE_t sz_idx = 0
+ */
+  __pyx_tuple__26 = PyTuple_Pack(4, __pyx_n_s_sizes, __pyx_n_s_break_mode, __pyx_n_s_block_size, __pyx_n_s_document_sep_len); if (unlikely(!__pyx_tuple__26)) __PYX_ERR(0, 50, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_tuple__26);
+  __Pyx_GIVEREF(__pyx_tuple__26);
+  __pyx_codeobj__27 = (PyObject*)__Pyx_PyCode_New(4, 0, 0, 4, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__26, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_fairseq_data_token_block_utils_f, __pyx_n_s_get_slice_indices_fast, 50, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__27)) __PYX_ERR(0, 50, __pyx_L1_error)
+
+  /* "fairseq/data/token_block_utils_fast.pyx":109
+ * @cython.wraparound(False)
+ * @cython.nonecheck(False)
+ * cpdef np.ndarray[DTYPE_t, ndim=2] _get_block_to_dataset_index_fast(np.ndarray[DTYPE_t, ndim=1] sizes, np.ndarray[DTYPE_t, ndim=2] slice_indices):             # <<<<<<<<<<<<<<
+ *     cdef DTYPE_t start_ds_idx
+ *     cdef DTYPE_t start_offset
+ */
+  __pyx_tuple__28 = PyTuple_Pack(2, __pyx_n_s_sizes, __pyx_n_s_slice_indices); if (unlikely(!__pyx_tuple__28)) __PYX_ERR(0, 109, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_tuple__28);
+  __Pyx_GIVEREF(__pyx_tuple__28);
+  __pyx_codeobj__29 = (PyObject*)__Pyx_PyCode_New(2, 0, 0, 2, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__28, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_fairseq_data_token_block_utils_f, __pyx_n_s_get_block_to_dataset_index_fast, 109, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__29)) __PYX_ERR(0, 109, __pyx_L1_error)
+
+  /* "(tree fragment)":1
+ * def __reduce_cython__(self):             # <<<<<<<<<<<<<<
+ *     cdef tuple state
+ *     cdef object _dict
+ */
+  __pyx_tuple__30 = PyTuple_Pack(4, __pyx_n_s_self, __pyx_n_s_state, __pyx_n_s_dict_2, __pyx_n_s_use_setstate); if (unlikely(!__pyx_tuple__30)) __PYX_ERR(1, 1, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_tuple__30);
+  __Pyx_GIVEREF(__pyx_tuple__30);
+  __pyx_codeobj__31 = (PyObject*)__Pyx_PyCode_New(1, 0, 0, 4, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__30, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_stringsource, __pyx_n_s_reduce_cython, 1, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__31)) __PYX_ERR(1, 1, __pyx_L1_error)
+
+  /* "(tree fragment)":16
+ *     else:
+ *         return __pyx_unpickle_DatasetSearcher, (type(self), 0x8c67b45, state)
+ * def __setstate_cython__(self, __pyx_state):             # <<<<<<<<<<<<<<
+ *     __pyx_unpickle_DatasetSearcher__set_state(self, __pyx_state)
+ */
+  __pyx_tuple__32 = PyTuple_Pack(2, __pyx_n_s_self, __pyx_n_s_pyx_state); if (unlikely(!__pyx_tuple__32)) __PYX_ERR(1, 16, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_tuple__32);
+  __Pyx_GIVEREF(__pyx_tuple__32);
+  __pyx_codeobj__33 = (PyObject*)__Pyx_PyCode_New(2, 0, 0, 2, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__32, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_stringsource, __pyx_n_s_setstate_cython, 16, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__33)) __PYX_ERR(1, 16, __pyx_L1_error)
+
+  /* "(tree fragment)":1
+ * def __pyx_unpickle_DatasetSearcher(__pyx_type, long __pyx_checksum, __pyx_state):             # <<<<<<<<<<<<<<
+ *     cdef object __pyx_PickleError
+ *     cdef object __pyx_result
+ */
+  __pyx_codeobj__34 = (PyObject*)__Pyx_PyCode_New(3, 0, 0, 5, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__24, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_stringsource, __pyx_n_s_pyx_unpickle_DatasetSearcher, 1, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__34)) __PYX_ERR(1, 1, __pyx_L1_error) /* no new names tuple here: shares __pyx_tuple__24, built above for __pyx_unpickle_Enum */
+  __Pyx_RefNannyFinishContext();
+  return 0;
+  __pyx_L1_error:;
+  __Pyx_RefNannyFinishContext();
+  return -1;
+}
+/* #### Code section: init_constants ### */
+
+static CYTHON_SMALL_CODE int __Pyx_InitConstants(void) { /* Runs the string-table initialiser, then creates the integer constant objects stored in the __pyx_int_* variables. Returns 0 on success, -1 if any step fails. */
+  if (__Pyx_CreateStringTabAndInitStrings() < 0) __PYX_ERR(0, 1, __pyx_L1_error);
+  __pyx_int_0 = PyInt_FromLong(0); if (unlikely(!__pyx_int_0)) __PYX_ERR(0, 1, __pyx_L1_error)
+  __pyx_int_1 = PyInt_FromLong(1); if (unlikely(!__pyx_int_1)) __PYX_ERR(0, 1, __pyx_L1_error)
+  __pyx_int_2 = PyInt_FromLong(2); if (unlikely(!__pyx_int_2)) __PYX_ERR(0, 1, __pyx_L1_error)
+  __pyx_int_3 = PyInt_FromLong(3); if (unlikely(!__pyx_int_3)) __PYX_ERR(0, 1, __pyx_L1_error)
+  __pyx_int_48422178 = PyInt_FromLong(48422178L); if (unlikely(!__pyx_int_48422178)) __PYX_ERR(0, 1, __pyx_L1_error) /* this and the five values below are the decimal forms of the pickle checksums packed into __pyx_tuple__8 and __pyx_tuple__14 (e.g. 48422178 == 0x2e2dd22) */
+  __pyx_int_107161605 = PyInt_FromLong(107161605L); if (unlikely(!__pyx_int_107161605)) __PYX_ERR(0, 1, __pyx_L1_error)
+  __pyx_int_112105877 = PyInt_FromLong(112105877L); if (unlikely(!__pyx_int_112105877)) __PYX_ERR(0, 1, __pyx_L1_error)
+  __pyx_int_136983863 = PyInt_FromLong(136983863L); if (unlikely(!__pyx_int_136983863)) __PYX_ERR(0, 1, __pyx_L1_error)
+  __pyx_int_147225413 = PyInt_FromLong(147225413L); if (unlikely(!__pyx_int_147225413)) __PYX_ERR(0, 1, __pyx_L1_error)
+  __pyx_int_184977713 = PyInt_FromLong(184977713L); if (unlikely(!__pyx_int_184977713)) __PYX_ERR(0, 1, __pyx_L1_error)
+  __pyx_int_neg_1 = PyInt_FromLong(-1); if (unlikely(!__pyx_int_neg_1)) __PYX_ERR(0, 1, __pyx_L1_error)
+  return 0;
+  __pyx_L1_error:; /* target label handed to every __PYX_ERR above */
+  return -1;
+}
+/* #### Code section: init_globals ### */
+
+static CYTHON_SMALL_CODE int __Pyx_InitGlobals(void) { /* Runs two start-up steps: the assertions-enabled initialiser and, when the NumPy headers are in use, _import_array(). Returns 0 on success, -1 if a Python error is pending after either step. */
+  /* AssertionsEnabled.init */
+  if (likely(__Pyx_init_assertions_enabled() == 0)); else
+  /* The empty `if` body and trailing `else` above are deliberate: the `else` binds to the `if` on the next line, so PyErr_Occurred() is only consulted when the initialiser returned non-zero. */
+if (unlikely(PyErr_Occurred())) __PYX_ERR(0, 1, __pyx_L1_error)
+
+  /* NumpyImportArray.init */
+  /*
+ * Cython has automatically inserted a call to _import_array since
+ * you didn't include one when you cimported numpy. To disable this
+ * add the line
+ *   <void>numpy._import_array
+ */
+#ifdef NPY_FEATURE_VERSION
+#ifndef NO_IMPORT_ARRAY
+if (unlikely(_import_array() == -1)) {
+    PyErr_SetString(PyExc_ImportError, "numpy.core.multiarray failed to import "
+    "(auto-generated because you didn't call 'numpy.import_array()' after cimporting numpy; "
+    "use '<void>numpy._import_array' to disable if you are certain you don't need it).");
+}
+#endif
+#endif
+  /* The failure branch above only sets the ImportError; it does not jump. The unconditional PyErr_Occurred() check below is what sends a failed import to the error path. */
+if (unlikely(PyErr_Occurred())) __PYX_ERR(0, 1, __pyx_L1_error)
+
+  return 0;
+  __pyx_L1_error:;
+  return -1;
+}
+/* #### Code section: init_module ### */
+
+static CYTHON_SMALL_CODE int __Pyx_modinit_global_init_code(void); /*proto: first of seven forward declarations for the __Pyx_modinit_* helpers; each takes no arguments and returns int*/
+static CYTHON_SMALL_CODE int __Pyx_modinit_variable_export_code(void); /*proto*/
+static CYTHON_SMALL_CODE int __Pyx_modinit_function_export_code(void); /*proto*/
+static CYTHON_SMALL_CODE int __Pyx_modinit_type_init_code(void); /*proto*/
+static CYTHON_SMALL_CODE int __Pyx_modinit_type_import_code(void); /*proto*/
+static CYTHON_SMALL_CODE int __Pyx_modinit_variable_import_code(void); /*proto*/
+static CYTHON_SMALL_CODE int __Pyx_modinit_function_import_code(void); /*proto*/
+
+static int __Pyx_modinit_global_init_code(void) { /* Points six object globals at Py_None, taking one new reference to None for each. Always returns 0. */
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__Pyx_modinit_global_init_code", 0);
+  /*--- Global init code ---*/
+  __pyx_collections_abc_Sequence = Py_None; Py_INCREF(Py_None); /* NOTE(review): presumably a placeholder until the module body assigns the real object -- confirm in the module exec code */
+  generic = Py_None; Py_INCREF(Py_None);
+  strided = Py_None; Py_INCREF(Py_None);
+  indirect = Py_None; Py_INCREF(Py_None);
+  contiguous = Py_None; Py_INCREF(Py_None);
+  indirect_contiguous = Py_None; Py_INCREF(Py_None);
+  __Pyx_RefNannyFinishContext();
+  return 0;
+}
+
+static int __Pyx_modinit_variable_export_code(void) { /* Nothing is emitted in the variable-export section below, so this only opens and closes the RefNanny context. Always returns 0. */
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__Pyx_modinit_variable_export_code", 0);
+  /*--- Variable export code ---*/
+  __Pyx_RefNannyFinishContext();
+  return 0;
+}
+
+static int __Pyx_modinit_function_export_code(void) { /* Nothing is emitted in the function-export section below, so this only opens and closes the RefNanny context. Always returns 0. */
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__Pyx_modinit_function_export_code", 0);
+  /*--- Function export code ---*/
+  __Pyx_RefNannyFinishContext();
+  return 0;
+}
+
+static int __Pyx_modinit_type_init_code(void) {
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("__Pyx_modinit_type_init_code", 0);
+  /*--- Type init code ---*/
+  __pyx_vtabptr_7fairseq_4data_22token_block_utils_fast_DatasetSearcher = &__pyx_vtable_7fairseq_4data_22token_block_utils_fast_DatasetSearcher;
+  __pyx_vtable_7fairseq_4data_22token_block_utils_fast_DatasetSearcher.reset = (PyObject *(*)(struct __pyx_obj_7fairseq_4data_22token_block_utils_fast_DatasetSearcher *))__pyx_f_7fairseq_4data_22token_block_utils_fast_15DatasetSearcher_reset;
+  __pyx_vtable_7fairseq_4data_22token_block_utils_fast_DatasetSearcher.step = (int (*)(struct __pyx_obj_7fairseq_4data_22token_block_utils_fast_DatasetSearcher *, __pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t))__pyx_f_7fairseq_4data_22token_block_utils_fast_15DatasetSearcher_step;
+  __pyx_vtable_7fairseq_4data_22token_block_utils_fast_DatasetSearcher.seek = (PyObject *(*)(struct __pyx_obj_7fairseq_4data_22token_block_utils_fast_DatasetSearcher *, __pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t))__pyx_f_7fairseq_4data_22token_block_utils_fast_15DatasetSearcher_seek;
+  #if CYTHON_USE_TYPE_SPECS
+  __pyx_ptype_7fairseq_4data_22token_block_utils_fast_DatasetSearcher = (PyTypeObject *) __Pyx_PyType_FromModuleAndSpec(__pyx_m, &__pyx_type_7fairseq_4data_22token_block_utils_fast_DatasetSearcher_spec, NULL); if (unlikely(!__pyx_ptype_7fairseq_4data_22token_block_utils_fast_DatasetSearcher)) __PYX_ERR(0, 139, __pyx_L1_error)
+  if (__Pyx_fix_up_extension_type_from_spec(&__pyx_type_7fairseq_4data_22token_block_utils_fast_DatasetSearcher_spec, __pyx_ptype_7fairseq_4data_22token_block_utils_fast_DatasetSearcher) < 0) __PYX_ERR(0, 139, __pyx_L1_error)
+  #else
+  __pyx_ptype_7fairseq_4data_22token_block_utils_fast_DatasetSearcher = &__pyx_type_7fairseq_4data_22token_block_utils_fast_DatasetSearcher;
+  #endif
+  #if !CYTHON_COMPILING_IN_LIMITED_API
+  #endif
+  #if !CYTHON_USE_TYPE_SPECS
+  if (__Pyx_PyType_Ready(__pyx_ptype_7fairseq_4data_22token_block_utils_fast_DatasetSearcher) < 0) __PYX_ERR(0, 139, __pyx_L1_error)
+  #endif
+  #if PY_MAJOR_VERSION < 3
+  __pyx_ptype_7fairseq_4data_22token_block_utils_fast_DatasetSearcher->tp_print = 0;
+  #endif
+  #if !CYTHON_COMPILING_IN_LIMITED_API
+  if ((CYTHON_USE_TYPE_SLOTS && CYTHON_USE_PYTYPE_LOOKUP) && likely(!__pyx_ptype_7fairseq_4data_22token_block_utils_fast_DatasetSearcher->tp_dictoffset && __pyx_ptype_7fairseq_4data_22token_block_utils_fast_DatasetSearcher->tp_getattro == PyObject_GenericGetAttr)) {
+    __pyx_ptype_7fairseq_4data_22token_block_utils_fast_DatasetSearcher->tp_getattro = __Pyx_PyObject_GenericGetAttr;
+  }
+  #endif
+  if (__Pyx_SetVtable(__pyx_ptype_7fairseq_4data_22token_block_utils_fast_DatasetSearcher, __pyx_vtabptr_7fairseq_4data_22token_block_utils_fast_DatasetSearcher) < 0) __PYX_ERR(0, 139, __pyx_L1_error)
+  #if !CYTHON_COMPILING_IN_LIMITED_API
+  if (__Pyx_MergeVtables(__pyx_ptype_7fairseq_4data_22token_block_utils_fast_DatasetSearcher) < 0) __PYX_ERR(0, 139, __pyx_L1_error)
+  #endif
+  if (PyObject_SetAttr(__pyx_m, __pyx_n_s_DatasetSearcher, (PyObject *) __pyx_ptype_7fairseq_4data_22token_block_utils_fast_DatasetSearcher) < 0) __PYX_ERR(0, 139, __pyx_L1_error)
+  #if !CYTHON_COMPILING_IN_LIMITED_API
+  if (__Pyx_setup_reduce((PyObject *) __pyx_ptype_7fairseq_4data_22token_block_utils_fast_DatasetSearcher) < 0) __PYX_ERR(0, 139, __pyx_L1_error)
+  #endif
+  __pyx_vtabptr_array = &__pyx_vtable_array;
+  __pyx_vtable_array.get_memview = (PyObject *(*)(struct __pyx_array_obj *))__pyx_array_get_memview;
+  #if CYTHON_USE_TYPE_SPECS
+  __pyx_array_type = (PyTypeObject *) __Pyx_PyType_FromModuleAndSpec(__pyx_m, &__pyx_type___pyx_array_spec, NULL); if (unlikely(!__pyx_array_type)) __PYX_ERR(1, 114, __pyx_L1_error)
+  #if !CYTHON_COMPILING_IN_LIMITED_API
+  __pyx_array_type->tp_as_buffer = &__pyx_tp_as_buffer_array;
+  if (!__pyx_array_type->tp_as_buffer->bf_releasebuffer && __pyx_array_type->tp_base->tp_as_buffer && __pyx_array_type->tp_base->tp_as_buffer->bf_releasebuffer) {
+    __pyx_array_type->tp_as_buffer->bf_releasebuffer = __pyx_array_type->tp_base->tp_as_buffer->bf_releasebuffer;
+  }
+  #elif defined(Py_bf_getbuffer) && defined(Py_bf_releasebuffer)
+  /* PY_VERSION_HEX >= 0x03090000 || Py_LIMITED_API >= 0x030B0000 */
+  #elif defined(_MSC_VER)
+  #pragma message ("The buffer protocol is not supported in the Limited C-API < 3.11.")
+  #else
+  #warning "The buffer protocol is not supported in the Limited C-API < 3.11."
+  #endif
+  if (__Pyx_fix_up_extension_type_from_spec(&__pyx_type___pyx_array_spec, __pyx_array_type) < 0) __PYX_ERR(1, 114, __pyx_L1_error)
+  #else
+  __pyx_array_type = &__pyx_type___pyx_array;
+  #endif
+  #if !CYTHON_COMPILING_IN_LIMITED_API
+  #endif
+  #if !CYTHON_USE_TYPE_SPECS
+  if (__Pyx_PyType_Ready(__pyx_array_type) < 0) __PYX_ERR(1, 114, __pyx_L1_error)
+  #endif
+  #if PY_MAJOR_VERSION < 3
+  __pyx_array_type->tp_print = 0;
+  #endif
+  if (__Pyx_SetVtable(__pyx_array_type, __pyx_vtabptr_array) < 0) __PYX_ERR(1, 114, __pyx_L1_error)
+  #if !CYTHON_COMPILING_IN_LIMITED_API
+  if (__Pyx_MergeVtables(__pyx_array_type) < 0) __PYX_ERR(1, 114, __pyx_L1_error)
+  #endif
+  #if !CYTHON_COMPILING_IN_LIMITED_API
+  if (__Pyx_setup_reduce((PyObject *) __pyx_array_type) < 0) __PYX_ERR(1, 114, __pyx_L1_error)
+  #endif
+  #if CYTHON_USE_TYPE_SPECS
+  __pyx_MemviewEnum_type = (PyTypeObject *) __Pyx_PyType_FromModuleAndSpec(__pyx_m, &__pyx_type___pyx_MemviewEnum_spec, NULL); if (unlikely(!__pyx_MemviewEnum_type)) __PYX_ERR(1, 302, __pyx_L1_error)
+  if (__Pyx_fix_up_extension_type_from_spec(&__pyx_type___pyx_MemviewEnum_spec, __pyx_MemviewEnum_type) < 0) __PYX_ERR(1, 302, __pyx_L1_error)
+  #else
+  __pyx_MemviewEnum_type = &__pyx_type___pyx_MemviewEnum;
+  #endif
+  #if !CYTHON_COMPILING_IN_LIMITED_API
+  #endif
+  #if !CYTHON_USE_TYPE_SPECS
+  if (__Pyx_PyType_Ready(__pyx_MemviewEnum_type) < 0) __PYX_ERR(1, 302, __pyx_L1_error)
+  #endif
+  #if PY_MAJOR_VERSION < 3
+  __pyx_MemviewEnum_type->tp_print = 0;
+  #endif
+  #if !CYTHON_COMPILING_IN_LIMITED_API
+  if ((CYTHON_USE_TYPE_SLOTS && CYTHON_USE_PYTYPE_LOOKUP) && likely(!__pyx_MemviewEnum_type->tp_dictoffset && __pyx_MemviewEnum_type->tp_getattro == PyObject_GenericGetAttr)) {
+    __pyx_MemviewEnum_type->tp_getattro = __Pyx_PyObject_GenericGetAttr;
+  }
+  #endif
+  #if !CYTHON_COMPILING_IN_LIMITED_API
+  if (__Pyx_setup_reduce((PyObject *) __pyx_MemviewEnum_type) < 0) __PYX_ERR(1, 302, __pyx_L1_error)
+  #endif
+  __pyx_vtabptr_memoryview = &__pyx_vtable_memoryview;
+  __pyx_vtable_memoryview.get_item_pointer = (char *(*)(struct __pyx_memoryview_obj *, PyObject *))__pyx_memoryview_get_item_pointer;
+  __pyx_vtable_memoryview.is_slice = (PyObject *(*)(struct __pyx_memoryview_obj *, PyObject *))__pyx_memoryview_is_slice;
+  __pyx_vtable_memoryview.setitem_slice_assignment = (PyObject *(*)(struct __pyx_memoryview_obj *, PyObject *, PyObject *))__pyx_memoryview_setitem_slice_assignment;
+  __pyx_vtable_memoryview.setitem_slice_assign_scalar = (PyObject *(*)(struct __pyx_memoryview_obj *, struct __pyx_memoryview_obj *, PyObject *))__pyx_memoryview_setitem_slice_assign_scalar;
+  __pyx_vtable_memoryview.setitem_indexed = (PyObject *(*)(struct __pyx_memoryview_obj *, PyObject *, PyObject *))__pyx_memoryview_setitem_indexed;
+  __pyx_vtable_memoryview.convert_item_to_object = (PyObject *(*)(struct __pyx_memoryview_obj *, char *))__pyx_memoryview_convert_item_to_object;
+  __pyx_vtable_memoryview.assign_item_from_object = (PyObject *(*)(struct __pyx_memoryview_obj *, char *, PyObject *))__pyx_memoryview_assign_item_from_object;
+  __pyx_vtable_memoryview._get_base = (PyObject *(*)(struct __pyx_memoryview_obj *))__pyx_memoryview__get_base;
+  #if CYTHON_USE_TYPE_SPECS
+  __pyx_memoryview_type = (PyTypeObject *) __Pyx_PyType_FromModuleAndSpec(__pyx_m, &__pyx_type___pyx_memoryview_spec, NULL); if (unlikely(!__pyx_memoryview_type)) __PYX_ERR(1, 337, __pyx_L1_error)
+  #if !CYTHON_COMPILING_IN_LIMITED_API
+  __pyx_memoryview_type->tp_as_buffer = &__pyx_tp_as_buffer_memoryview;
+  if (!__pyx_memoryview_type->tp_as_buffer->bf_releasebuffer && __pyx_memoryview_type->tp_base->tp_as_buffer && __pyx_memoryview_type->tp_base->tp_as_buffer->bf_releasebuffer) {
+    __pyx_memoryview_type->tp_as_buffer->bf_releasebuffer = __pyx_memoryview_type->tp_base->tp_as_buffer->bf_releasebuffer;
+  }
+  #elif defined(Py_bf_getbuffer) && defined(Py_bf_releasebuffer)
+  /* PY_VERSION_HEX >= 0x03090000 || Py_LIMITED_API >= 0x030B0000 */
+  #elif defined(_MSC_VER)
+  #pragma message ("The buffer protocol is not supported in the Limited C-API < 3.11.")
+  #else
+  #warning "The buffer protocol is not supported in the Limited C-API < 3.11."
+  #endif
+  if (__Pyx_fix_up_extension_type_from_spec(&__pyx_type___pyx_memoryview_spec, __pyx_memoryview_type) < 0) __PYX_ERR(1, 337, __pyx_L1_error)
+  #else
+  __pyx_memoryview_type = &__pyx_type___pyx_memoryview;
+  #endif
+  #if !CYTHON_COMPILING_IN_LIMITED_API
+  #endif
+  #if !CYTHON_USE_TYPE_SPECS
+  if (__Pyx_PyType_Ready(__pyx_memoryview_type) < 0) __PYX_ERR(1, 337, __pyx_L1_error)
+  #endif
+  #if PY_MAJOR_VERSION < 3
+  __pyx_memoryview_type->tp_print = 0;
+  #endif
+  #if !CYTHON_COMPILING_IN_LIMITED_API
+  if ((CYTHON_USE_TYPE_SLOTS && CYTHON_USE_PYTYPE_LOOKUP) && likely(!__pyx_memoryview_type->tp_dictoffset && __pyx_memoryview_type->tp_getattro == PyObject_GenericGetAttr)) {
+    __pyx_memoryview_type->tp_getattro = __Pyx_PyObject_GenericGetAttr;
+  }
+  #endif
+  if (__Pyx_SetVtable(__pyx_memoryview_type, __pyx_vtabptr_memoryview) < 0) __PYX_ERR(1, 337, __pyx_L1_error)
+  #if !CYTHON_COMPILING_IN_LIMITED_API
+  if (__Pyx_MergeVtables(__pyx_memoryview_type) < 0) __PYX_ERR(1, 337, __pyx_L1_error)
+  #endif
+  #if !CYTHON_COMPILING_IN_LIMITED_API
+  if (__Pyx_setup_reduce((PyObject *) __pyx_memoryview_type) < 0) __PYX_ERR(1, 337, __pyx_L1_error)
+  #endif
+  __pyx_vtabptr__memoryviewslice = &__pyx_vtable__memoryviewslice;
+  __pyx_vtable__memoryviewslice.__pyx_base = *__pyx_vtabptr_memoryview;
+  __pyx_vtable__memoryviewslice.__pyx_base.convert_item_to_object = (PyObject *(*)(struct __pyx_memoryview_obj *, char *))__pyx_memoryviewslice_convert_item_to_object;
+  __pyx_vtable__memoryviewslice.__pyx_base.assign_item_from_object = (PyObject *(*)(struct __pyx_memoryview_obj *, char *, PyObject *))__pyx_memoryviewslice_assign_item_from_object;
+  __pyx_vtable__memoryviewslice.__pyx_base._get_base = (PyObject *(*)(struct __pyx_memoryview_obj *))__pyx_memoryviewslice__get_base;
+  #if CYTHON_USE_TYPE_SPECS
+  __pyx_t_1 = PyTuple_Pack(1, (PyObject *)__pyx_memoryview_type); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 952, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_memoryviewslice_type = (PyTypeObject *) __Pyx_PyType_FromModuleAndSpec(__pyx_m, &__pyx_type___pyx_memoryviewslice_spec, __pyx_t_1);
+  __Pyx_XDECREF(__pyx_t_1); __pyx_t_1 = 0;
+  if (unlikely(!__pyx_memoryviewslice_type)) __PYX_ERR(1, 952, __pyx_L1_error)
+  if (__Pyx_fix_up_extension_type_from_spec(&__pyx_type___pyx_memoryviewslice_spec, __pyx_memoryviewslice_type) < 0) __PYX_ERR(1, 952, __pyx_L1_error)
+  #else
+  __pyx_memoryviewslice_type = &__pyx_type___pyx_memoryviewslice;
+  #endif
+  #if !CYTHON_COMPILING_IN_LIMITED_API
+  __pyx_memoryviewslice_type->tp_base = __pyx_memoryview_type;
+  #endif
+  #if !CYTHON_USE_TYPE_SPECS
+  if (__Pyx_PyType_Ready(__pyx_memoryviewslice_type) < 0) __PYX_ERR(1, 952, __pyx_L1_error)
+  #endif
+  #if PY_MAJOR_VERSION < 3
+  __pyx_memoryviewslice_type->tp_print = 0;
+  #endif
+  #if !CYTHON_COMPILING_IN_LIMITED_API
+  if ((CYTHON_USE_TYPE_SLOTS && CYTHON_USE_PYTYPE_LOOKUP) && likely(!__pyx_memoryviewslice_type->tp_dictoffset && __pyx_memoryviewslice_type->tp_getattro == PyObject_GenericGetAttr)) {
+    __pyx_memoryviewslice_type->tp_getattro = __Pyx_PyObject_GenericGetAttr;
+  }
+  #endif
+  if (__Pyx_SetVtable(__pyx_memoryviewslice_type, __pyx_vtabptr__memoryviewslice) < 0) __PYX_ERR(1, 952, __pyx_L1_error)
+  #if !CYTHON_COMPILING_IN_LIMITED_API
+  if (__Pyx_MergeVtables(__pyx_memoryviewslice_type) < 0) __PYX_ERR(1, 952, __pyx_L1_error)
+  #endif
+  #if !CYTHON_COMPILING_IN_LIMITED_API
+  if (__Pyx_setup_reduce((PyObject *) __pyx_memoryviewslice_type) < 0) __PYX_ERR(1, 952, __pyx_L1_error)
+  #endif
+  __Pyx_RefNannyFinishContext();
+  return 0;
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_RefNannyFinishContext();
+  return -1;
+}
+
+/* Imports the extension types this module refers to from other modules and
+ * stores the resulting type pointers in the __pyx_ptype_* variables:
+ * "type" from the module named by __Pyx_BUILTIN_MODULE_NAME, followed by a
+ * series of types from "numpy".
+ *
+ * Returns 0 on success. On any failure, jumps to __pyx_L1_error, releases the
+ * temporary module reference held in __pyx_t_1 (if any) and returns -1.
+ */
+static int __Pyx_modinit_type_import_code(void) {
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("__Pyx_modinit_type_import_code", 0);
+  /*--- Type import code ---*/
+  __pyx_t_1 = PyImport_ImportModule(__Pyx_BUILTIN_MODULE_NAME); if (unlikely(!__pyx_t_1)) __PYX_ERR(3, 9, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  /* The expected struct size/alignment passed for "type" depends on the build:
+   * PyTypeObject for old PyPy and for the Limited API, PyHeapTypeObject
+   * otherwise. */
+  __pyx_ptype_7cpython_4type_type = __Pyx_ImportType_3_0_12(__pyx_t_1, __Pyx_BUILTIN_MODULE_NAME, "type", 
+  #if defined(PYPY_VERSION_NUM) && PYPY_VERSION_NUM < 0x050B0000
+  sizeof(PyTypeObject), __PYX_GET_STRUCT_ALIGNMENT_3_0_12(PyTypeObject),
+  #elif CYTHON_COMPILING_IN_LIMITED_API
+  sizeof(PyTypeObject), __PYX_GET_STRUCT_ALIGNMENT_3_0_12(PyTypeObject),
+  #else
+  sizeof(PyHeapTypeObject), __PYX_GET_STRUCT_ALIGNMENT_3_0_12(PyHeapTypeObject),
+  #endif
+  __Pyx_ImportType_CheckSize_Warn_3_0_12); if (!__pyx_ptype_7cpython_4type_type) __PYX_ERR(3, 9, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+  /* __pyx_t_1 is reused for the numpy module and stays alive until every numpy
+   * type below has been imported. */
+  __pyx_t_1 = PyImport_ImportModule("numpy"); if (unlikely(!__pyx_t_1)) __PYX_ERR(2, 272, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  /* Types backed by a dedicated numpy struct are imported with
+   * CheckSize_Ignore; the scalar-hierarchy types further down are declared
+   * with sizeof(PyObject) and imported with CheckSize_Warn.
+   * NOTE(review): the check-size mode presumably selects how a size mismatch
+   * against the runtime type is reported - confirm in __Pyx_ImportType. */
+  __pyx_ptype_5numpy_dtype = __Pyx_ImportType_3_0_12(__pyx_t_1, "numpy", "dtype", sizeof(PyArray_Descr), __PYX_GET_STRUCT_ALIGNMENT_3_0_12(PyArray_Descr),__Pyx_ImportType_CheckSize_Ignore_3_0_12); if (!__pyx_ptype_5numpy_dtype) __PYX_ERR(2, 272, __pyx_L1_error)
+  __pyx_ptype_5numpy_flatiter = __Pyx_ImportType_3_0_12(__pyx_t_1, "numpy", "flatiter", sizeof(PyArrayIterObject), __PYX_GET_STRUCT_ALIGNMENT_3_0_12(PyArrayIterObject),__Pyx_ImportType_CheckSize_Ignore_3_0_12); if (!__pyx_ptype_5numpy_flatiter) __PYX_ERR(2, 317, __pyx_L1_error)
+  __pyx_ptype_5numpy_broadcast = __Pyx_ImportType_3_0_12(__pyx_t_1, "numpy", "broadcast", sizeof(PyArrayMultiIterObject), __PYX_GET_STRUCT_ALIGNMENT_3_0_12(PyArrayMultiIterObject),__Pyx_ImportType_CheckSize_Ignore_3_0_12); if (!__pyx_ptype_5numpy_broadcast) __PYX_ERR(2, 321, __pyx_L1_error)
+  __pyx_ptype_5numpy_ndarray = __Pyx_ImportType_3_0_12(__pyx_t_1, "numpy", "ndarray", sizeof(PyArrayObject), __PYX_GET_STRUCT_ALIGNMENT_3_0_12(PyArrayObject),__Pyx_ImportType_CheckSize_Ignore_3_0_12); if (!__pyx_ptype_5numpy_ndarray) __PYX_ERR(2, 360, __pyx_L1_error)
+  __pyx_ptype_5numpy_generic = __Pyx_ImportType_3_0_12(__pyx_t_1, "numpy", "generic", sizeof(PyObject), __PYX_GET_STRUCT_ALIGNMENT_3_0_12(PyObject),__Pyx_ImportType_CheckSize_Warn_3_0_12); if (!__pyx_ptype_5numpy_generic) __PYX_ERR(2, 865, __pyx_L1_error)
+  __pyx_ptype_5numpy_number = __Pyx_ImportType_3_0_12(__pyx_t_1, "numpy", "number", sizeof(PyObject), __PYX_GET_STRUCT_ALIGNMENT_3_0_12(PyObject),__Pyx_ImportType_CheckSize_Warn_3_0_12); if (!__pyx_ptype_5numpy_number) __PYX_ERR(2, 867, __pyx_L1_error)
+  __pyx_ptype_5numpy_integer = __Pyx_ImportType_3_0_12(__pyx_t_1, "numpy", "integer", sizeof(PyObject), __PYX_GET_STRUCT_ALIGNMENT_3_0_12(PyObject),__Pyx_ImportType_CheckSize_Warn_3_0_12); if (!__pyx_ptype_5numpy_integer) __PYX_ERR(2, 869, __pyx_L1_error)
+  __pyx_ptype_5numpy_signedinteger = __Pyx_ImportType_3_0_12(__pyx_t_1, "numpy", "signedinteger", sizeof(PyObject), __PYX_GET_STRUCT_ALIGNMENT_3_0_12(PyObject),__Pyx_ImportType_CheckSize_Warn_3_0_12); if (!__pyx_ptype_5numpy_signedinteger) __PYX_ERR(2, 871, __pyx_L1_error)
+  __pyx_ptype_5numpy_unsignedinteger = __Pyx_ImportType_3_0_12(__pyx_t_1, "numpy", "unsignedinteger", sizeof(PyObject), __PYX_GET_STRUCT_ALIGNMENT_3_0_12(PyObject),__Pyx_ImportType_CheckSize_Warn_3_0_12); if (!__pyx_ptype_5numpy_unsignedinteger) __PYX_ERR(2, 873, __pyx_L1_error)
+  __pyx_ptype_5numpy_inexact = __Pyx_ImportType_3_0_12(__pyx_t_1, "numpy", "inexact", sizeof(PyObject), __PYX_GET_STRUCT_ALIGNMENT_3_0_12(PyObject),__Pyx_ImportType_CheckSize_Warn_3_0_12); if (!__pyx_ptype_5numpy_inexact) __PYX_ERR(2, 875, __pyx_L1_error)
+  __pyx_ptype_5numpy_floating = __Pyx_ImportType_3_0_12(__pyx_t_1, "numpy", "floating", sizeof(PyObject), __PYX_GET_STRUCT_ALIGNMENT_3_0_12(PyObject),__Pyx_ImportType_CheckSize_Warn_3_0_12); if (!__pyx_ptype_5numpy_floating) __PYX_ERR(2, 877, __pyx_L1_error)
+  __pyx_ptype_5numpy_complexfloating = __Pyx_ImportType_3_0_12(__pyx_t_1, "numpy", "complexfloating", sizeof(PyObject), __PYX_GET_STRUCT_ALIGNMENT_3_0_12(PyObject),__Pyx_ImportType_CheckSize_Warn_3_0_12); if (!__pyx_ptype_5numpy_complexfloating) __PYX_ERR(2, 879, __pyx_L1_error)
+  __pyx_ptype_5numpy_flexible = __Pyx_ImportType_3_0_12(__pyx_t_1, "numpy", "flexible", sizeof(PyObject), __PYX_GET_STRUCT_ALIGNMENT_3_0_12(PyObject),__Pyx_ImportType_CheckSize_Warn_3_0_12); if (!__pyx_ptype_5numpy_flexible) __PYX_ERR(2, 881, __pyx_L1_error)
+  __pyx_ptype_5numpy_character = __Pyx_ImportType_3_0_12(__pyx_t_1, "numpy", "character", sizeof(PyObject), __PYX_GET_STRUCT_ALIGNMENT_3_0_12(PyObject),__Pyx_ImportType_CheckSize_Warn_3_0_12); if (!__pyx_ptype_5numpy_character) __PYX_ERR(2, 883, __pyx_L1_error)
+  __pyx_ptype_5numpy_ufunc = __Pyx_ImportType_3_0_12(__pyx_t_1, "numpy", "ufunc", sizeof(PyUFuncObject), __PYX_GET_STRUCT_ALIGNMENT_3_0_12(PyUFuncObject),__Pyx_ImportType_CheckSize_Ignore_3_0_12); if (!__pyx_ptype_5numpy_ufunc) __PYX_ERR(2, 947, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+  __Pyx_RefNannyFinishContext();
+  return 0;
+  /* Shared error exit: drop the module reference if one is still held. */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_RefNannyFinishContext();
+  return -1;
+}
+
+/* Variable-import stage of module initialisation. This module imports no
+ * variables, so the body only opens and closes the RefNanny context.
+ * Always returns 0.
+ */
+static int __Pyx_modinit_variable_import_code(void) {
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__Pyx_modinit_variable_import_code", 0);
+  /*--- Variable import code ---*/
+  __Pyx_RefNannyFinishContext();
+  return 0;
+}
+
+/* Function-import stage of module initialisation. This module imports no
+ * functions, so the body only opens and closes the RefNanny context.
+ * Always returns 0.
+ */
+static int __Pyx_modinit_function_import_code(void) {
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__Pyx_modinit_function_import_code", 0);
+  /*--- Function import code ---*/
+  __Pyx_RefNannyFinishContext();
+  return 0;
+}
+
+
+/* Python 3 only: the PyModuleDef describing "token_block_utils_fast" and,
+ * when multi-phase initialisation is enabled, its slot table. */
+#if PY_MAJOR_VERSION >= 3
+#if CYTHON_PEP489_MULTI_PHASE_INIT
+static PyObject* __pyx_pymod_create(PyObject *spec, PyModuleDef *def); /*proto*/
+static int __pyx_pymod_exec_token_block_utils_fast(PyObject* module); /*proto*/
+/* Slot table: Py_mod_create -> __pyx_pymod_create,
+ * Py_mod_exec -> __pyx_pymod_exec_token_block_utils_fast; {0, NULL} terminates. */
+static PyModuleDef_Slot __pyx_moduledef_slots[] = {
+  {Py_mod_create, (void*)__pyx_pymod_create},
+  {Py_mod_exec, (void*)__pyx_pymod_exec_token_block_utils_fast},
+  {0, NULL}
+};
+#endif
+
+/* In C++ the definition is placed in an anonymous namespace; in C it is
+ * declared static instead. */
+#ifdef __cplusplus
+namespace {
+  struct PyModuleDef __pyx_moduledef =
+  #else
+  static struct PyModuleDef __pyx_moduledef =
+  #endif
+  {
+      PyModuleDef_HEAD_INIT,
+      "token_block_utils_fast",
+      0, /* m_doc */
+    /* m_size: 0 with multi-phase init, sizeof(__pyx_mstate) when module state
+     * is used, -1 otherwise. */
+    #if CYTHON_PEP489_MULTI_PHASE_INIT
+      0, /* m_size */
+    #elif CYTHON_USE_MODULE_STATE
+      sizeof(__pyx_mstate), /* m_size */
+    #else
+      -1, /* m_size */
+    #endif
+      __pyx_methods /* m_methods */,
+    /* The same struct position holds the slot table under multi-phase init
+     * and NULL otherwise. */
+    #if CYTHON_PEP489_MULTI_PHASE_INIT
+      __pyx_moduledef_slots, /* m_slots */
+    #else
+      NULL, /* m_reload */
+    #endif
+    /* Traverse/clear hooks are only supplied when module state is used. */
+    #if CYTHON_USE_MODULE_STATE
+      __pyx_m_traverse, /* m_traverse */
+      __pyx_m_clear, /* m_clear */
+      NULL /* m_free */
+    #else
+      NULL, /* m_traverse */
+      NULL, /* m_clear */
+      NULL /* m_free */
+    #endif
+  };
+  #ifdef __cplusplus
+} /* anonymous namespace */
+#endif
+#endif
+
+/* Chooses the declaration prefix used for the module init function.
+ * Unless CYTHON_NO_PYINIT_EXPORT is defined, this is PyMODINIT_FUNC.
+ * Otherwise it is a bare return type - void on Python 2, PyObject * on
+ * Python 3 - prefixed with extern "C" when compiling as C++. */
+#ifndef CYTHON_NO_PYINIT_EXPORT
+#define __Pyx_PyMODINIT_FUNC PyMODINIT_FUNC
+#elif PY_MAJOR_VERSION < 3
+#ifdef __cplusplus
+#define __Pyx_PyMODINIT_FUNC extern "C" void
+#else
+#define __Pyx_PyMODINIT_FUNC void
+#endif
+#else
+#ifdef __cplusplus
+#define __Pyx_PyMODINIT_FUNC extern "C" PyObject *
+#else
+#define __Pyx_PyMODINIT_FUNC PyObject *
+#endif
+#endif
+
+
+/* Module entry point: inittoken_block_utils_fast on Python 2,
+ * PyInit_token_block_utils_fast on Python 3.
+ *
+ * With CYTHON_PEP489_MULTI_PHASE_INIT the Python 3 entry point is complete
+ * here and only returns PyModuleDef_Init(&__pyx_moduledef). In the other
+ * configurations only the function header is emitted at this point; its body
+ * is the brace block that follows the matching #endif lines further down and
+ * is shared with __pyx_pymod_exec_token_block_utils_fast.
+ */
+#if PY_MAJOR_VERSION < 3
+__Pyx_PyMODINIT_FUNC inittoken_block_utils_fast(void) CYTHON_SMALL_CODE; /*proto*/
+__Pyx_PyMODINIT_FUNC inittoken_block_utils_fast(void)
+#else
+__Pyx_PyMODINIT_FUNC PyInit_token_block_utils_fast(void) CYTHON_SMALL_CODE; /*proto*/
+__Pyx_PyMODINIT_FUNC PyInit_token_block_utils_fast(void)
+#if CYTHON_PEP489_MULTI_PHASE_INIT
+{
+  return PyModuleDef_Init(&__pyx_moduledef);
+}
+/* Guards against loading this module into more than one interpreter.
+ *
+ * The first call remembers the current interpreter in a function-static
+ * variable (its ID on Python >= 3.7, the PyInterpreterState pointer before
+ * that). Later calls compare against the remembered value.
+ *
+ * Returns 0 when the interpreter matches (or was just recorded), and -1 with
+ * ImportError set when a different interpreter is detected. On Python >= 3.7
+ * the first call also returns -1 if the reported interpreter ID is -1; no
+ * exception is set by this function on that path.
+ * NOTE(review): presumably PyInterpreterState_GetID has set one - confirm.
+ */
+static CYTHON_SMALL_CODE int __Pyx_check_single_interpreter(void) {
+    #if PY_VERSION_HEX >= 0x030700A1
+    static PY_INT64_T main_interpreter_id = -1;
+    PY_INT64_T current_id = PyInterpreterState_GetID(PyThreadState_Get()->interp);
+    if (main_interpreter_id == -1) {
+        main_interpreter_id = current_id;
+        return (unlikely(current_id == -1)) ? -1 : 0;
+    } else if (unlikely(main_interpreter_id != current_id))
+    #else
+    static PyInterpreterState *main_interpreter = NULL;
+    PyInterpreterState *current_interpreter = PyThreadState_Get()->interp;
+    if (!main_interpreter) {
+        main_interpreter = current_interpreter;
+    } else if (unlikely(main_interpreter != current_interpreter))
+    #endif
+    /* Shared body of whichever "else if" head was compiled in above. */
+    {
+        PyErr_SetString(
+            PyExc_ImportError,
+            "Interpreter change detected - this module can only be loaded into one interpreter per process.");
+        return -1;
+    }
+    return 0;
+}
+/* Copies one attribute of the import spec onto the new module.
+ *
+ * spec       - the module spec object to read from.
+ * module /
+ * moddict    - the destination: the module object itself in Limited API
+ *              builds, the module's dict otherwise.
+ * from_name  - attribute name to read from `spec`.
+ * to_name    - name to store the value under on the destination.
+ * allow_none - when 0, a value of None is skipped instead of being stored.
+ *
+ * Returns the result of the store call (0 on success, negative on failure),
+ * 0 when the attribute is missing (the AttributeError is cleared) or skipped,
+ * and -1 when reading the attribute failed with any other exception.
+ */
+#if CYTHON_COMPILING_IN_LIMITED_API
+static CYTHON_SMALL_CODE int __Pyx_copy_spec_to_module(PyObject *spec, PyObject *module, const char* from_name, const char* to_name, int allow_none)
+#else
+static CYTHON_SMALL_CODE int __Pyx_copy_spec_to_module(PyObject *spec, PyObject *moddict, const char* from_name, const char* to_name, int allow_none)
+#endif
+{
+    PyObject *value = PyObject_GetAttrString(spec, from_name);
+    int result = 0;
+    if (likely(value)) {
+        if (allow_none || value != Py_None) {
+#if CYTHON_COMPILING_IN_LIMITED_API
+            /* PyModule_AddObject steals a reference to `value` on success
+             * only. Hand it a reference of its own so that the unconditional
+             * Py_DECREF below always releases exactly the reference obtained
+             * from PyObject_GetAttrString; give the extra one back on
+             * failure, where nothing was stolen. */
+            Py_INCREF(value);
+            result = PyModule_AddObject(module, to_name, value);
+            if (unlikely(result < 0)) Py_DECREF(value);
+#else
+            /* PyDict_SetItemString does not steal; the dict takes its own
+             * reference. */
+            result = PyDict_SetItemString(moddict, to_name, value);
+#endif
+        }
+        Py_DECREF(value);
+    } else if (PyErr_ExceptionMatches(PyExc_AttributeError)) {
+        /* A spec without this attribute is not an error. */
+        PyErr_Clear();
+    } else {
+        result = -1;
+    }
+    return result;
+}
+/* Module creation callback registered under Py_mod_create.
+ *
+ * spec - the import spec; its "name" attribute names the new module and its
+ *        loader / origin / parent / submodule_search_locations attributes are
+ *        copied to __loader__ / __file__ / __package__ / __path__.
+ * def  - unused.
+ *
+ * Returns a new reference to the module. If __pyx_m is already set, a new
+ * reference to that existing module is returned instead of creating another.
+ * Returns NULL on failure, including when the single-interpreter check fails.
+ */
+static CYTHON_SMALL_CODE PyObject* __pyx_pymod_create(PyObject *spec, PyModuleDef *def) {
+    PyObject *module = NULL, *moddict, *modname;
+    CYTHON_UNUSED_VAR(def);
+    if (__Pyx_check_single_interpreter())
+        return NULL;
+    if (__pyx_m)
+        return __Pyx_NewRef(__pyx_m);
+    modname = PyObject_GetAttrString(spec, "name");
+    if (unlikely(!modname)) goto bad;
+    module = PyModule_NewObject(modname);
+    Py_DECREF(modname);
+    if (unlikely(!module)) goto bad;
+#if CYTHON_COMPILING_IN_LIMITED_API
+    /* In this configuration __Pyx_copy_spec_to_module takes the module
+     * object itself rather than its dict. */
+    moddict = module;
+#else
+    moddict = PyModule_GetDict(module);
+    if (unlikely(!moddict)) goto bad;
+#endif
+    if (unlikely(__Pyx_copy_spec_to_module(spec, moddict, "loader", "__loader__", 1) < 0)) goto bad;
+    if (unlikely(__Pyx_copy_spec_to_module(spec, moddict, "origin", "__file__", 1) < 0)) goto bad;
+    if (unlikely(__Pyx_copy_spec_to_module(spec, moddict, "parent", "__package__", 1) < 0)) goto bad;
+    /* allow_none is 0 here: __path__ is not set when the spec's
+     * submodule_search_locations is None. */
+    if (unlikely(__Pyx_copy_spec_to_module(spec, moddict, "submodule_search_locations", "__path__", 0) < 0)) goto bad;
+    return module;
+bad:
+    /* module may still be NULL here, hence Py_XDECREF. */
+    Py_XDECREF(module);
+    return NULL;
+}
+
+
+static CYTHON_SMALL_CODE int __pyx_pymod_exec_token_block_utils_fast(PyObject *__pyx_pyinit_module)
+#endif
+#endif
+{
+  int stringtab_initialized = 0;
+  #if CYTHON_USE_MODULE_STATE
+  int pystate_addmodule_run = 0;
+  #endif
+  PyObject *__pyx_t_1 = NULL;
+  PyObject *__pyx_t_2 = NULL;
+  PyObject *__pyx_t_3 = NULL;
+  PyObject *__pyx_t_4 = NULL;
+  PyObject *__pyx_t_5 = NULL;
+  int __pyx_t_6;
+  PyObject *__pyx_t_7 = NULL;
+  static PyThread_type_lock __pyx_t_8[8];
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannyDeclarations
+  #if CYTHON_PEP489_MULTI_PHASE_INIT
+  if (__pyx_m) {
+    if (__pyx_m == __pyx_pyinit_module) return 0;
+    PyErr_SetString(PyExc_RuntimeError, "Module 'token_block_utils_fast' has already been imported. Re-initialisation is not supported.");
+    return -1;
+  }
+  #elif PY_MAJOR_VERSION >= 3
+  if (__pyx_m) return __Pyx_NewRef(__pyx_m);
+  #endif
+  /*--- Module creation code ---*/
+  #if CYTHON_PEP489_MULTI_PHASE_INIT
+  __pyx_m = __pyx_pyinit_module;
+  Py_INCREF(__pyx_m);
+  #else
+  #if PY_MAJOR_VERSION < 3
+  __pyx_m = Py_InitModule4("token_block_utils_fast", __pyx_methods, 0, 0, PYTHON_API_VERSION); Py_XINCREF(__pyx_m);
+  if (unlikely(!__pyx_m)) __PYX_ERR(0, 1, __pyx_L1_error)
+  #elif CYTHON_USE_MODULE_STATE
+  __pyx_t_1 = PyModule_Create(&__pyx_moduledef); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 1, __pyx_L1_error)
+  {
+    int add_module_result = PyState_AddModule(__pyx_t_1, &__pyx_moduledef);
+    __pyx_t_1 = 0; /* transfer ownership from __pyx_t_1 to "token_block_utils_fast" pseudovariable */
+    if (unlikely((add_module_result < 0))) __PYX_ERR(0, 1, __pyx_L1_error)
+    pystate_addmodule_run = 1;
+  }
+  #else
+  __pyx_m = PyModule_Create(&__pyx_moduledef);
+  if (unlikely(!__pyx_m)) __PYX_ERR(0, 1, __pyx_L1_error)
+  #endif
+  #endif
+  CYTHON_UNUSED_VAR(__pyx_t_1);
+  __pyx_d = PyModule_GetDict(__pyx_m); if (unlikely(!__pyx_d)) __PYX_ERR(0, 1, __pyx_L1_error)
+  Py_INCREF(__pyx_d);
+  __pyx_b = __Pyx_PyImport_AddModuleRef(__Pyx_BUILTIN_MODULE_NAME); if (unlikely(!__pyx_b)) __PYX_ERR(0, 1, __pyx_L1_error)
+  __pyx_cython_runtime = __Pyx_PyImport_AddModuleRef((const char *) "cython_runtime"); if (unlikely(!__pyx_cython_runtime)) __PYX_ERR(0, 1, __pyx_L1_error)
+  if (PyObject_SetAttrString(__pyx_m, "__builtins__", __pyx_b) < 0) __PYX_ERR(0, 1, __pyx_L1_error)
+  #if CYTHON_REFNANNY
+__Pyx_RefNanny = __Pyx_RefNannyImportAPI("refnanny");
+if (!__Pyx_RefNanny) {
+  PyErr_Clear();
+  __Pyx_RefNanny = __Pyx_RefNannyImportAPI("Cython.Runtime.refnanny");
+  if (!__Pyx_RefNanny)
+      Py_FatalError("failed to import 'refnanny' module");
+}
+#endif
+  __Pyx_RefNannySetupContext("__Pyx_PyMODINIT_FUNC PyInit_token_block_utils_fast(void)", 0);
+  if (__Pyx_check_binary_version(__PYX_LIMITED_VERSION_HEX, __Pyx_get_runtime_version(), CYTHON_COMPILING_IN_LIMITED_API) < 0) __PYX_ERR(0, 1, __pyx_L1_error)
+  #ifdef __Pxy_PyFrame_Initialize_Offsets
+  __Pxy_PyFrame_Initialize_Offsets();
+  #endif
+  __pyx_empty_tuple = PyTuple_New(0); if (unlikely(!__pyx_empty_tuple)) __PYX_ERR(0, 1, __pyx_L1_error)
+  __pyx_empty_bytes = PyBytes_FromStringAndSize("", 0); if (unlikely(!__pyx_empty_bytes)) __PYX_ERR(0, 1, __pyx_L1_error)
+  __pyx_empty_unicode = PyUnicode_FromStringAndSize("", 0); if (unlikely(!__pyx_empty_unicode)) __PYX_ERR(0, 1, __pyx_L1_error)
+  #ifdef __Pyx_CyFunction_USED
+  if (__pyx_CyFunction_init(__pyx_m) < 0) __PYX_ERR(0, 1, __pyx_L1_error)
+  #endif
+  #ifdef __Pyx_FusedFunction_USED
+  if (__pyx_FusedFunction_init(__pyx_m) < 0) __PYX_ERR(0, 1, __pyx_L1_error)
+  #endif
+  #ifdef __Pyx_Coroutine_USED
+  if (__pyx_Coroutine_init(__pyx_m) < 0) __PYX_ERR(0, 1, __pyx_L1_error)
+  #endif
+  #ifdef __Pyx_Generator_USED
+  if (__pyx_Generator_init(__pyx_m) < 0) __PYX_ERR(0, 1, __pyx_L1_error)
+  #endif
+  #ifdef __Pyx_AsyncGen_USED
+  if (__pyx_AsyncGen_init(__pyx_m) < 0) __PYX_ERR(0, 1, __pyx_L1_error)
+  #endif
+  #ifdef __Pyx_StopAsyncIteration_USED
+  if (__pyx_StopAsyncIteration_init(__pyx_m) < 0) __PYX_ERR(0, 1, __pyx_L1_error)
+  #endif
+  /*--- Library function declarations ---*/
+  /*--- Threads initialization code ---*/
+  #if defined(WITH_THREAD) && PY_VERSION_HEX < 0x030700F0 && defined(__PYX_FORCE_INIT_THREADS) && __PYX_FORCE_INIT_THREADS
+  PyEval_InitThreads();
+  #endif
+  /*--- Initialize various global constants etc. ---*/
+  if (__Pyx_InitConstants() < 0) __PYX_ERR(0, 1, __pyx_L1_error)
+  stringtab_initialized = 1;
+  if (__Pyx_InitGlobals() < 0) __PYX_ERR(0, 1, __pyx_L1_error)
+  #if PY_MAJOR_VERSION < 3 && (__PYX_DEFAULT_STRING_ENCODING_IS_ASCII || __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT)
+  if (__Pyx_init_sys_getdefaultencoding_params() < 0) __PYX_ERR(0, 1, __pyx_L1_error)
+  #endif
+  if (__pyx_module_is_main_fairseq__data__token_block_utils_fast) {
+    if (PyObject_SetAttr(__pyx_m, __pyx_n_s_name_2, __pyx_n_s_main) < 0) __PYX_ERR(0, 1, __pyx_L1_error)
+  }
+  #if PY_MAJOR_VERSION >= 3
+  {
+    PyObject *modules = PyImport_GetModuleDict(); if (unlikely(!modules)) __PYX_ERR(0, 1, __pyx_L1_error)
+    if (!PyDict_GetItemString(modules, "fairseq.data.token_block_utils_fast")) {
+      if (unlikely((PyDict_SetItemString(modules, "fairseq.data.token_block_utils_fast", __pyx_m) < 0))) __PYX_ERR(0, 1, __pyx_L1_error)
+    }
+  }
+  #endif
+  /*--- Builtin init code ---*/
+  if (__Pyx_InitCachedBuiltins() < 0) __PYX_ERR(0, 1, __pyx_L1_error)
+  /*--- Constants init code ---*/
+  if (__Pyx_InitCachedConstants() < 0) __PYX_ERR(0, 1, __pyx_L1_error)
+  /*--- Global type/function init code ---*/
+  (void)__Pyx_modinit_global_init_code();
+  (void)__Pyx_modinit_variable_export_code();
+  (void)__Pyx_modinit_function_export_code();
+  if (unlikely((__Pyx_modinit_type_init_code() < 0))) __PYX_ERR(0, 1, __pyx_L1_error)
+  if (unlikely((__Pyx_modinit_type_import_code() < 0))) __PYX_ERR(0, 1, __pyx_L1_error)
+  (void)__Pyx_modinit_variable_import_code();
+  (void)__Pyx_modinit_function_import_code();
+  /*--- Execution code ---*/
+  #if defined(__Pyx_Generator_USED) || defined(__Pyx_Coroutine_USED)
+  if (__Pyx_patch_abc() < 0) __PYX_ERR(0, 1, __pyx_L1_error)
+  #endif
+
+  /* "View.MemoryView":99
+ * 
+ * cdef object __pyx_collections_abc_Sequence "__pyx_collections_abc_Sequence"
+ * try:             # <<<<<<<<<<<<<<
+ *     if __import__("sys").version_info >= (3, 3):
+ *         __pyx_collections_abc_Sequence = __import__("collections.abc").abc.Sequence
+ */
+  {
+    __Pyx_PyThreadState_declare
+    __Pyx_PyThreadState_assign
+    __Pyx_ExceptionSave(&__pyx_t_1, &__pyx_t_2, &__pyx_t_3);
+    __Pyx_XGOTREF(__pyx_t_1);
+    __Pyx_XGOTREF(__pyx_t_2);
+    __Pyx_XGOTREF(__pyx_t_3);
+    /*try:*/ {
+
+      /* "View.MemoryView":100
+ * cdef object __pyx_collections_abc_Sequence "__pyx_collections_abc_Sequence"
+ * try:
+ *     if __import__("sys").version_info >= (3, 3):             # <<<<<<<<<<<<<<
+ *         __pyx_collections_abc_Sequence = __import__("collections.abc").abc.Sequence
+ *     else:
+ */
+      __pyx_t_4 = __Pyx_PyObject_Call(__pyx_builtin___import__, __pyx_tuple__15, NULL); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 100, __pyx_L2_error)
+      __Pyx_GOTREF(__pyx_t_4);
+      __pyx_t_5 = __Pyx_PyObject_GetAttrStr(__pyx_t_4, __pyx_n_s_version_info); if (unlikely(!__pyx_t_5)) __PYX_ERR(1, 100, __pyx_L2_error)
+      __Pyx_GOTREF(__pyx_t_5);
+      __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+      __pyx_t_4 = PyObject_RichCompare(__pyx_t_5, __pyx_tuple__16, Py_GE); __Pyx_XGOTREF(__pyx_t_4); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 100, __pyx_L2_error)
+      __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
+      __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_4); if (unlikely((__pyx_t_6 < 0))) __PYX_ERR(1, 100, __pyx_L2_error)
+      __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+      if (__pyx_t_6) {
+
+        /* "View.MemoryView":101
+ * try:
+ *     if __import__("sys").version_info >= (3, 3):
+ *         __pyx_collections_abc_Sequence = __import__("collections.abc").abc.Sequence             # <<<<<<<<<<<<<<
+ *     else:
+ *         __pyx_collections_abc_Sequence = __import__("collections").Sequence
+ */
+        __pyx_t_4 = __Pyx_PyObject_Call(__pyx_builtin___import__, __pyx_tuple__17, NULL); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 101, __pyx_L2_error)
+        __Pyx_GOTREF(__pyx_t_4);
+        __pyx_t_5 = __Pyx_PyObject_GetAttrStr(__pyx_t_4, __pyx_n_s_abc); if (unlikely(!__pyx_t_5)) __PYX_ERR(1, 101, __pyx_L2_error)
+        __Pyx_GOTREF(__pyx_t_5);
+        __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+        __pyx_t_4 = __Pyx_PyObject_GetAttrStr(__pyx_t_5, __pyx_n_s_Sequence); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 101, __pyx_L2_error)
+        __Pyx_GOTREF(__pyx_t_4);
+        __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
+        __Pyx_XGOTREF(__pyx_collections_abc_Sequence);
+        __Pyx_DECREF_SET(__pyx_collections_abc_Sequence, __pyx_t_4);
+        __Pyx_GIVEREF(__pyx_t_4);
+        __pyx_t_4 = 0;
+
+        /* "View.MemoryView":100
+ * cdef object __pyx_collections_abc_Sequence "__pyx_collections_abc_Sequence"
+ * try:
+ *     if __import__("sys").version_info >= (3, 3):             # <<<<<<<<<<<<<<
+ *         __pyx_collections_abc_Sequence = __import__("collections.abc").abc.Sequence
+ *     else:
+ */
+        goto __pyx_L8;
+      }
+
+      /* "View.MemoryView":103
+ *         __pyx_collections_abc_Sequence = __import__("collections.abc").abc.Sequence
+ *     else:
+ *         __pyx_collections_abc_Sequence = __import__("collections").Sequence             # <<<<<<<<<<<<<<
+ * except:
+ * 
+ */
+      /*else*/ {
+        __pyx_t_4 = __Pyx_PyObject_Call(__pyx_builtin___import__, __pyx_tuple__18, NULL); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 103, __pyx_L2_error)
+        __Pyx_GOTREF(__pyx_t_4);
+        __pyx_t_5 = __Pyx_PyObject_GetAttrStr(__pyx_t_4, __pyx_n_s_Sequence); if (unlikely(!__pyx_t_5)) __PYX_ERR(1, 103, __pyx_L2_error)
+        __Pyx_GOTREF(__pyx_t_5);
+        __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+        __Pyx_XGOTREF(__pyx_collections_abc_Sequence);
+        __Pyx_DECREF_SET(__pyx_collections_abc_Sequence, __pyx_t_5);
+        __Pyx_GIVEREF(__pyx_t_5);
+        __pyx_t_5 = 0;
+      }
+      __pyx_L8:;
+
+      /* "View.MemoryView":99
+ * 
+ * cdef object __pyx_collections_abc_Sequence "__pyx_collections_abc_Sequence"
+ * try:             # <<<<<<<<<<<<<<
+ *     if __import__("sys").version_info >= (3, 3):
+ *         __pyx_collections_abc_Sequence = __import__("collections.abc").abc.Sequence
+ */
+    }
+    __Pyx_XDECREF(__pyx_t_1); __pyx_t_1 = 0;
+    __Pyx_XDECREF(__pyx_t_2); __pyx_t_2 = 0;
+    __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0;
+    goto __pyx_L7_try_end;
+    __pyx_L2_error:;
+    __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0;
+    __Pyx_XDECREF(__pyx_t_5); __pyx_t_5 = 0;
+
+    /* "View.MemoryView":104
+ *     else:
+ *         __pyx_collections_abc_Sequence = __import__("collections").Sequence
+ * except:             # <<<<<<<<<<<<<<
+ * 
+ *     __pyx_collections_abc_Sequence = None
+ */
+    /*except:*/ {
+      __Pyx_AddTraceback("View.MemoryView", __pyx_clineno, __pyx_lineno, __pyx_filename);
+      if (__Pyx_GetException(&__pyx_t_5, &__pyx_t_4, &__pyx_t_7) < 0) __PYX_ERR(1, 104, __pyx_L4_except_error)
+      __Pyx_XGOTREF(__pyx_t_5);
+      __Pyx_XGOTREF(__pyx_t_4);
+      __Pyx_XGOTREF(__pyx_t_7);
+
+      /* "View.MemoryView":106
+ * except:
+ * 
+ *     __pyx_collections_abc_Sequence = None             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+      __Pyx_INCREF(Py_None);
+      __Pyx_XGOTREF(__pyx_collections_abc_Sequence);
+      __Pyx_DECREF_SET(__pyx_collections_abc_Sequence, Py_None);
+      __Pyx_GIVEREF(Py_None);
+      __Pyx_XDECREF(__pyx_t_5); __pyx_t_5 = 0;
+      __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0;
+      __Pyx_XDECREF(__pyx_t_7); __pyx_t_7 = 0;
+      goto __pyx_L3_exception_handled;
+    }
+
+    /* "View.MemoryView":99
+ * 
+ * cdef object __pyx_collections_abc_Sequence "__pyx_collections_abc_Sequence"
+ * try:             # <<<<<<<<<<<<<<
+ *     if __import__("sys").version_info >= (3, 3):
+ *         __pyx_collections_abc_Sequence = __import__("collections.abc").abc.Sequence
+ */
+    __pyx_L4_except_error:;
+    __Pyx_XGIVEREF(__pyx_t_1);
+    __Pyx_XGIVEREF(__pyx_t_2);
+    __Pyx_XGIVEREF(__pyx_t_3);
+    __Pyx_ExceptionReset(__pyx_t_1, __pyx_t_2, __pyx_t_3);
+    goto __pyx_L1_error;
+    __pyx_L3_exception_handled:;
+    __Pyx_XGIVEREF(__pyx_t_1);
+    __Pyx_XGIVEREF(__pyx_t_2);
+    __Pyx_XGIVEREF(__pyx_t_3);
+    __Pyx_ExceptionReset(__pyx_t_1, __pyx_t_2, __pyx_t_3);
+    __pyx_L7_try_end:;
+  }
+
+  /* "View.MemoryView":241
+ * 
+ * 
+ *     try:             # <<<<<<<<<<<<<<
+ *         count = __pyx_collections_abc_Sequence.count
+ *         index = __pyx_collections_abc_Sequence.index
+ */
+  {
+    __Pyx_PyThreadState_declare
+    __Pyx_PyThreadState_assign
+    __Pyx_ExceptionSave(&__pyx_t_3, &__pyx_t_2, &__pyx_t_1);
+    __Pyx_XGOTREF(__pyx_t_3);
+    __Pyx_XGOTREF(__pyx_t_2);
+    __Pyx_XGOTREF(__pyx_t_1);
+    /*try:*/ {
+
+      /* "View.MemoryView":242
+ * 
+ *     try:
+ *         count = __pyx_collections_abc_Sequence.count             # <<<<<<<<<<<<<<
+ *         index = __pyx_collections_abc_Sequence.index
+ *     except:
+ */
+      __pyx_t_7 = __Pyx_PyObject_GetAttrStr(__pyx_collections_abc_Sequence, __pyx_n_s_count); if (unlikely(!__pyx_t_7)) __PYX_ERR(1, 242, __pyx_L11_error)
+      __Pyx_GOTREF(__pyx_t_7);
+      if (__Pyx_SetItemOnTypeDict(__pyx_array_type, __pyx_n_s_count, __pyx_t_7) < 0) __PYX_ERR(1, 242, __pyx_L11_error)
+      __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0;
+      PyType_Modified(__pyx_array_type);
+
+      /* "View.MemoryView":243
+ *     try:
+ *         count = __pyx_collections_abc_Sequence.count
+ *         index = __pyx_collections_abc_Sequence.index             # <<<<<<<<<<<<<<
+ *     except:
+ *         pass
+ */
+      __pyx_t_7 = __Pyx_PyObject_GetAttrStr(__pyx_collections_abc_Sequence, __pyx_n_s_index); if (unlikely(!__pyx_t_7)) __PYX_ERR(1, 243, __pyx_L11_error)
+      __Pyx_GOTREF(__pyx_t_7);
+      if (__Pyx_SetItemOnTypeDict(__pyx_array_type, __pyx_n_s_index, __pyx_t_7) < 0) __PYX_ERR(1, 243, __pyx_L11_error)
+      __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0;
+      PyType_Modified(__pyx_array_type);
+
+      /* "View.MemoryView":241
+ * 
+ * 
+ *     try:             # <<<<<<<<<<<<<<
+ *         count = __pyx_collections_abc_Sequence.count
+ *         index = __pyx_collections_abc_Sequence.index
+ */
+    }
+    __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0;
+    __Pyx_XDECREF(__pyx_t_2); __pyx_t_2 = 0;
+    __Pyx_XDECREF(__pyx_t_1); __pyx_t_1 = 0;
+    goto __pyx_L16_try_end;
+    __pyx_L11_error:;
+    __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0;
+    __Pyx_XDECREF(__pyx_t_5); __pyx_t_5 = 0;
+    __Pyx_XDECREF(__pyx_t_7); __pyx_t_7 = 0;
+
+    /* "View.MemoryView":244
+ *         count = __pyx_collections_abc_Sequence.count
+ *         index = __pyx_collections_abc_Sequence.index
+ *     except:             # <<<<<<<<<<<<<<
+ *         pass
+ * 
+ */
+    /*except:*/ {
+      __Pyx_ErrRestore(0,0,0);
+      goto __pyx_L12_exception_handled;
+    }
+    __pyx_L12_exception_handled:;
+    __Pyx_XGIVEREF(__pyx_t_3);
+    __Pyx_XGIVEREF(__pyx_t_2);
+    __Pyx_XGIVEREF(__pyx_t_1);
+    __Pyx_ExceptionReset(__pyx_t_3, __pyx_t_2, __pyx_t_1);
+    __pyx_L16_try_end:;
+  }
+
+  /* "View.MemoryView":309
+ *         return self.name
+ * 
+ * cdef generic = Enum("<strided and direct or indirect>")             # <<<<<<<<<<<<<<
+ * cdef strided = Enum("<strided and direct>") # default
+ * cdef indirect = Enum("<strided and indirect>")
+ */
+  __pyx_t_7 = __Pyx_PyObject_Call(((PyObject *)__pyx_MemviewEnum_type), __pyx_tuple__19, NULL); if (unlikely(!__pyx_t_7)) __PYX_ERR(1, 309, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_7);
+  __Pyx_XGOTREF(generic);
+  __Pyx_DECREF_SET(generic, __pyx_t_7);
+  __Pyx_GIVEREF(__pyx_t_7);
+  __pyx_t_7 = 0;
+
+  /* "View.MemoryView":310
+ * 
+ * cdef generic = Enum("<strided and direct or indirect>")
+ * cdef strided = Enum("<strided and direct>") # default             # <<<<<<<<<<<<<<
+ * cdef indirect = Enum("<strided and indirect>")
+ * 
+ */
+  __pyx_t_7 = __Pyx_PyObject_Call(((PyObject *)__pyx_MemviewEnum_type), __pyx_tuple__20, NULL); if (unlikely(!__pyx_t_7)) __PYX_ERR(1, 310, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_7);
+  __Pyx_XGOTREF(strided);
+  __Pyx_DECREF_SET(strided, __pyx_t_7);
+  __Pyx_GIVEREF(__pyx_t_7);
+  __pyx_t_7 = 0;
+
+  /* "View.MemoryView":311
+ * cdef generic = Enum("<strided and direct or indirect>")
+ * cdef strided = Enum("<strided and direct>") # default
+ * cdef indirect = Enum("<strided and indirect>")             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  __pyx_t_7 = __Pyx_PyObject_Call(((PyObject *)__pyx_MemviewEnum_type), __pyx_tuple__21, NULL); if (unlikely(!__pyx_t_7)) __PYX_ERR(1, 311, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_7);
+  __Pyx_XGOTREF(indirect);
+  __Pyx_DECREF_SET(indirect, __pyx_t_7);
+  __Pyx_GIVEREF(__pyx_t_7);
+  __pyx_t_7 = 0;
+
+  /* "View.MemoryView":314
+ * 
+ * 
+ * cdef contiguous = Enum("<contiguous and direct>")             # <<<<<<<<<<<<<<
+ * cdef indirect_contiguous = Enum("<contiguous and indirect>")
+ * 
+ */
+  __pyx_t_7 = __Pyx_PyObject_Call(((PyObject *)__pyx_MemviewEnum_type), __pyx_tuple__22, NULL); if (unlikely(!__pyx_t_7)) __PYX_ERR(1, 314, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_7);
+  __Pyx_XGOTREF(contiguous);
+  __Pyx_DECREF_SET(contiguous, __pyx_t_7);
+  __Pyx_GIVEREF(__pyx_t_7);
+  __pyx_t_7 = 0;
+
+  /* "View.MemoryView":315
+ * 
+ * cdef contiguous = Enum("<contiguous and direct>")
+ * cdef indirect_contiguous = Enum("<contiguous and indirect>")             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  __pyx_t_7 = __Pyx_PyObject_Call(((PyObject *)__pyx_MemviewEnum_type), __pyx_tuple__23, NULL); if (unlikely(!__pyx_t_7)) __PYX_ERR(1, 315, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_7);
+  __Pyx_XGOTREF(indirect_contiguous);
+  __Pyx_DECREF_SET(indirect_contiguous, __pyx_t_7);
+  __Pyx_GIVEREF(__pyx_t_7);
+  __pyx_t_7 = 0;
+
+  /* "View.MemoryView":323
+ * 
+ * 
+ * cdef int __pyx_memoryview_thread_locks_used = 0             # <<<<<<<<<<<<<<
+ * cdef PyThread_type_lock[8] __pyx_memoryview_thread_locks = [
+ *     PyThread_allocate_lock(),
+ */
+  __pyx_memoryview_thread_locks_used = 0;
+
+  /* "View.MemoryView":324
+ * 
+ * cdef int __pyx_memoryview_thread_locks_used = 0
+ * cdef PyThread_type_lock[8] __pyx_memoryview_thread_locks = [             # <<<<<<<<<<<<<<
+ *     PyThread_allocate_lock(),
+ *     PyThread_allocate_lock(),
+ */
+  __pyx_t_8[0] = PyThread_allocate_lock();
+  __pyx_t_8[1] = PyThread_allocate_lock();
+  __pyx_t_8[2] = PyThread_allocate_lock();
+  __pyx_t_8[3] = PyThread_allocate_lock();
+  __pyx_t_8[4] = PyThread_allocate_lock();
+  __pyx_t_8[5] = PyThread_allocate_lock();
+  __pyx_t_8[6] = PyThread_allocate_lock();
+  __pyx_t_8[7] = PyThread_allocate_lock();
+  memcpy(&(__pyx_memoryview_thread_locks[0]), __pyx_t_8, sizeof(__pyx_memoryview_thread_locks[0]) * (8));
+
+  /* "View.MemoryView":982
+ * 
+ * 
+ *     try:             # <<<<<<<<<<<<<<
+ *         count = __pyx_collections_abc_Sequence.count
+ *         index = __pyx_collections_abc_Sequence.index
+ */
+  {
+    __Pyx_PyThreadState_declare
+    __Pyx_PyThreadState_assign
+    __Pyx_ExceptionSave(&__pyx_t_1, &__pyx_t_2, &__pyx_t_3);
+    __Pyx_XGOTREF(__pyx_t_1);
+    __Pyx_XGOTREF(__pyx_t_2);
+    __Pyx_XGOTREF(__pyx_t_3);
+    /*try:*/ {
+
+      /* "View.MemoryView":983
+ * 
+ *     try:
+ *         count = __pyx_collections_abc_Sequence.count             # <<<<<<<<<<<<<<
+ *         index = __pyx_collections_abc_Sequence.index
+ *     except:
+ */
+      __pyx_t_7 = __Pyx_PyObject_GetAttrStr(__pyx_collections_abc_Sequence, __pyx_n_s_count); if (unlikely(!__pyx_t_7)) __PYX_ERR(1, 983, __pyx_L17_error)
+      __Pyx_GOTREF(__pyx_t_7);
+      if (__Pyx_SetItemOnTypeDict(__pyx_memoryviewslice_type, __pyx_n_s_count, __pyx_t_7) < 0) __PYX_ERR(1, 983, __pyx_L17_error)
+      __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0;
+      PyType_Modified(__pyx_memoryviewslice_type);
+
+      /* "View.MemoryView":984
+ *     try:
+ *         count = __pyx_collections_abc_Sequence.count
+ *         index = __pyx_collections_abc_Sequence.index             # <<<<<<<<<<<<<<
+ *     except:
+ *         pass
+ */
+      __pyx_t_7 = __Pyx_PyObject_GetAttrStr(__pyx_collections_abc_Sequence, __pyx_n_s_index); if (unlikely(!__pyx_t_7)) __PYX_ERR(1, 984, __pyx_L17_error)
+      __Pyx_GOTREF(__pyx_t_7);
+      if (__Pyx_SetItemOnTypeDict(__pyx_memoryviewslice_type, __pyx_n_s_index, __pyx_t_7) < 0) __PYX_ERR(1, 984, __pyx_L17_error)
+      __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0;
+      PyType_Modified(__pyx_memoryviewslice_type);
+
+      /* "View.MemoryView":982
+ * 
+ * 
+ *     try:             # <<<<<<<<<<<<<<
+ *         count = __pyx_collections_abc_Sequence.count
+ *         index = __pyx_collections_abc_Sequence.index
+ */
+    }
+    __Pyx_XDECREF(__pyx_t_1); __pyx_t_1 = 0;
+    __Pyx_XDECREF(__pyx_t_2); __pyx_t_2 = 0;
+    __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0;
+    goto __pyx_L22_try_end;
+    __pyx_L17_error:;
+    __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0;
+    __Pyx_XDECREF(__pyx_t_5); __pyx_t_5 = 0;
+    __Pyx_XDECREF(__pyx_t_7); __pyx_t_7 = 0;
+
+    /* "View.MemoryView":985
+ *         count = __pyx_collections_abc_Sequence.count
+ *         index = __pyx_collections_abc_Sequence.index
+ *     except:             # <<<<<<<<<<<<<<
+ *         pass
+ * 
+ */
+    /*except:*/ {
+      __Pyx_ErrRestore(0,0,0);
+      goto __pyx_L18_exception_handled;
+    }
+    __pyx_L18_exception_handled:;
+    __Pyx_XGIVEREF(__pyx_t_1);
+    __Pyx_XGIVEREF(__pyx_t_2);
+    __Pyx_XGIVEREF(__pyx_t_3);
+    __Pyx_ExceptionReset(__pyx_t_1, __pyx_t_2, __pyx_t_3);
+    __pyx_L22_try_end:;
+  }
+
+  /* "View.MemoryView":988
+ *         pass
+ * 
+ * try:             # <<<<<<<<<<<<<<
+ *     if __pyx_collections_abc_Sequence:
+ * 
+ */
+  {
+    __Pyx_PyThreadState_declare
+    __Pyx_PyThreadState_assign
+    __Pyx_ExceptionSave(&__pyx_t_3, &__pyx_t_2, &__pyx_t_1);
+    __Pyx_XGOTREF(__pyx_t_3);
+    __Pyx_XGOTREF(__pyx_t_2);
+    __Pyx_XGOTREF(__pyx_t_1);
+    /*try:*/ {
+
+      /* "View.MemoryView":989
+ * 
+ * try:
+ *     if __pyx_collections_abc_Sequence:             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+      __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_collections_abc_Sequence); if (unlikely((__pyx_t_6 < 0))) __PYX_ERR(1, 989, __pyx_L23_error)
+      if (__pyx_t_6) {
+
+        /* "View.MemoryView":993
+ * 
+ * 
+ *         __pyx_collections_abc_Sequence.register(_memoryviewslice)             # <<<<<<<<<<<<<<
+ *         __pyx_collections_abc_Sequence.register(array)
+ * except:
+ */
+        __pyx_t_7 = __Pyx_PyObject_GetAttrStr(__pyx_collections_abc_Sequence, __pyx_n_s_register); if (unlikely(!__pyx_t_7)) __PYX_ERR(1, 993, __pyx_L23_error)
+        __Pyx_GOTREF(__pyx_t_7);
+        __pyx_t_4 = __Pyx_PyObject_CallOneArg(__pyx_t_7, ((PyObject *)__pyx_memoryviewslice_type)); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 993, __pyx_L23_error)
+        __Pyx_GOTREF(__pyx_t_4);
+        __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0;
+        __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+
+        /* "View.MemoryView":994
+ * 
+ *         __pyx_collections_abc_Sequence.register(_memoryviewslice)
+ *         __pyx_collections_abc_Sequence.register(array)             # <<<<<<<<<<<<<<
+ * except:
+ *     pass  # ignore failure, it's a minor issue
+ */
+        __pyx_t_4 = __Pyx_PyObject_GetAttrStr(__pyx_collections_abc_Sequence, __pyx_n_s_register); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 994, __pyx_L23_error)
+        __Pyx_GOTREF(__pyx_t_4);
+        __pyx_t_7 = __Pyx_PyObject_CallOneArg(__pyx_t_4, ((PyObject *)__pyx_array_type)); if (unlikely(!__pyx_t_7)) __PYX_ERR(1, 994, __pyx_L23_error)
+        __Pyx_GOTREF(__pyx_t_7);
+        __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+        __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0;
+
+        /* "View.MemoryView":989
+ * 
+ * try:
+ *     if __pyx_collections_abc_Sequence:             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+      }
+
+      /* "View.MemoryView":988
+ *         pass
+ * 
+ * try:             # <<<<<<<<<<<<<<
+ *     if __pyx_collections_abc_Sequence:
+ * 
+ */
+    }
+    __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0;
+    __Pyx_XDECREF(__pyx_t_2); __pyx_t_2 = 0;
+    __Pyx_XDECREF(__pyx_t_1); __pyx_t_1 = 0;
+    goto __pyx_L28_try_end;
+    __pyx_L23_error:;
+    __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0;
+    __Pyx_XDECREF(__pyx_t_5); __pyx_t_5 = 0;
+    __Pyx_XDECREF(__pyx_t_7); __pyx_t_7 = 0;
+
+    /* "View.MemoryView":995
+ *         __pyx_collections_abc_Sequence.register(_memoryviewslice)
+ *         __pyx_collections_abc_Sequence.register(array)
+ * except:             # <<<<<<<<<<<<<<
+ *     pass  # ignore failure, it's a minor issue
+ * 
+ */
+    /*except:*/ {
+      __Pyx_ErrRestore(0,0,0);
+      goto __pyx_L24_exception_handled;
+    }
+    __pyx_L24_exception_handled:;
+    __Pyx_XGIVEREF(__pyx_t_3);
+    __Pyx_XGIVEREF(__pyx_t_2);
+    __Pyx_XGIVEREF(__pyx_t_1);
+    __Pyx_ExceptionReset(__pyx_t_3, __pyx_t_2, __pyx_t_1);
+    __pyx_L28_try_end:;
+  }
+
+  /* "(tree fragment)":1
+ * def __pyx_unpickle_Enum(__pyx_type, long __pyx_checksum, __pyx_state):             # <<<<<<<<<<<<<<
+ *     cdef object __pyx_PickleError
+ *     cdef object __pyx_result
+ */
+  __pyx_t_7 = PyCFunction_NewEx(&__pyx_mdef_15View_dot_MemoryView_1__pyx_unpickle_Enum, NULL, __pyx_n_s_View_MemoryView); if (unlikely(!__pyx_t_7)) __PYX_ERR(1, 1, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_7);
+  if (PyDict_SetItem(__pyx_d, __pyx_n_s_pyx_unpickle_Enum, __pyx_t_7) < 0) __PYX_ERR(1, 1, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0;
+
+  /* "fairseq/data/token_block_utils_fast.pyx":7
+ * # LICENSE file in the root directory of this source tree.
+ * 
+ * import numpy as np             # <<<<<<<<<<<<<<
+ * import torch
+ * from itertools import chain
+ */
+  __pyx_t_7 = __Pyx_ImportDottedModule(__pyx_n_s_numpy, NULL); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 7, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_7);
+  if (PyDict_SetItem(__pyx_d, __pyx_n_s_np, __pyx_t_7) < 0) __PYX_ERR(0, 7, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0;
+
+  /* "fairseq/data/token_block_utils_fast.pyx":8
+ * 
+ * import numpy as np
+ * import torch             # <<<<<<<<<<<<<<
+ * from itertools import chain
+ * from libc.math cimport ceil
+ */
+  __pyx_t_7 = __Pyx_ImportDottedModule(__pyx_n_s_torch, NULL); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 8, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_7);
+  if (PyDict_SetItem(__pyx_d, __pyx_n_s_torch, __pyx_t_7) < 0) __PYX_ERR(0, 8, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0;
+
+  /* "fairseq/data/token_block_utils_fast.pyx":9
+ * import numpy as np
+ * import torch
+ * from itertools import chain             # <<<<<<<<<<<<<<
+ * from libc.math cimport ceil
+ * 
+ */
+  __pyx_t_7 = PyList_New(1); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 9, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_7);
+  __Pyx_INCREF(__pyx_n_s_chain);
+  __Pyx_GIVEREF(__pyx_n_s_chain);
+  if (__Pyx_PyList_SET_ITEM(__pyx_t_7, 0, __pyx_n_s_chain)) __PYX_ERR(0, 9, __pyx_L1_error);
+  __pyx_t_4 = __Pyx_Import(__pyx_n_s_itertools, __pyx_t_7, 0); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 9, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_4);
+  __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0;
+  __pyx_t_7 = __Pyx_ImportFrom(__pyx_t_4, __pyx_n_s_chain); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 9, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_7);
+  if (PyDict_SetItem(__pyx_d, __pyx_n_s_chain, __pyx_t_7) < 0) __PYX_ERR(0, 9, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0;
+  __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+
+  /* "fairseq/data/token_block_utils_fast.pyx":15
+ * cimport numpy as np
+ * 
+ * DTYPE = np.int64             # <<<<<<<<<<<<<<
+ * ctypedef np.int64_t DTYPE_t
+ * 
+ */
+  __Pyx_GetModuleGlobalName(__pyx_t_4, __pyx_n_s_np); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 15, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_4);
+  __pyx_t_7 = __Pyx_PyObject_GetAttrStr(__pyx_t_4, __pyx_n_s_int64); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 15, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_7);
+  __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+  if (PyDict_SetItem(__pyx_d, __pyx_n_s_DTYPE, __pyx_t_7) < 0) __PYX_ERR(0, 15, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0;
+
+  /* "fairseq/data/token_block_utils_fast.pyx":50
+ * @cython.wraparound(False)
+ * @cython.nonecheck(False)
+ * cpdef np.ndarray[DTYPE_t, ndim=2] _get_slice_indices_fast(np.ndarray[DTYPE_t, ndim=1] sizes, str break_mode, int block_size, int document_sep_len):             # <<<<<<<<<<<<<<
+ *     cdef DTYPE_t tok_idx = 0
+ *     cdef DTYPE_t sz_idx = 0
+ */
+  __pyx_t_7 = __Pyx_CyFunction_New(&__pyx_mdef_7fairseq_4data_22token_block_utils_fast_1_get_slice_indices_fast, 0, __pyx_n_s_get_slice_indices_fast, NULL, __pyx_n_s_fairseq_data_token_block_utils_f_2, __pyx_d, ((PyObject *)__pyx_codeobj__27)); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 50, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_7);
+  if (PyDict_SetItem(__pyx_d, __pyx_n_s_get_slice_indices_fast, __pyx_t_7) < 0) __PYX_ERR(0, 50, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0;
+
+  /* "fairseq/data/token_block_utils_fast.pyx":109
+ * @cython.wraparound(False)
+ * @cython.nonecheck(False)
+ * cpdef np.ndarray[DTYPE_t, ndim=2] _get_block_to_dataset_index_fast(np.ndarray[DTYPE_t, ndim=1] sizes, np.ndarray[DTYPE_t, ndim=2] slice_indices):             # <<<<<<<<<<<<<<
+ *     cdef DTYPE_t start_ds_idx
+ *     cdef DTYPE_t start_offset
+ */
+  __pyx_t_7 = __Pyx_CyFunction_New(&__pyx_mdef_7fairseq_4data_22token_block_utils_fast_3_get_block_to_dataset_index_fast, 0, __pyx_n_s_get_block_to_dataset_index_fast, NULL, __pyx_n_s_fairseq_data_token_block_utils_f_2, __pyx_d, ((PyObject *)__pyx_codeobj__29)); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 109, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_7);
+  if (PyDict_SetItem(__pyx_d, __pyx_n_s_get_block_to_dataset_index_fast, __pyx_t_7) < 0) __PYX_ERR(0, 109, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0;
+
+  /* "(tree fragment)":1
+ * def __reduce_cython__(self):             # <<<<<<<<<<<<<<
+ *     cdef tuple state
+ *     cdef object _dict
+ */
+  __pyx_t_7 = __Pyx_CyFunction_New(&__pyx_mdef_7fairseq_4data_22token_block_utils_fast_15DatasetSearcher_3__reduce_cython__, __Pyx_CYFUNCTION_CCLASS, __pyx_n_s_DatasetSearcher___reduce_cython, NULL, __pyx_n_s_fairseq_data_token_block_utils_f_2, __pyx_d, ((PyObject *)__pyx_codeobj__31)); if (unlikely(!__pyx_t_7)) __PYX_ERR(1, 1, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_7);
+  if (__Pyx_SetItemOnTypeDict((PyObject *)__pyx_ptype_7fairseq_4data_22token_block_utils_fast_DatasetSearcher, __pyx_n_s_reduce_cython, __pyx_t_7) < 0) __PYX_ERR(1, 1, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0;
+  PyType_Modified(__pyx_ptype_7fairseq_4data_22token_block_utils_fast_DatasetSearcher);
+
+  /* "(tree fragment)":16
+ *     else:
+ *         return __pyx_unpickle_DatasetSearcher, (type(self), 0x8c67b45, state)
+ * def __setstate_cython__(self, __pyx_state):             # <<<<<<<<<<<<<<
+ *     __pyx_unpickle_DatasetSearcher__set_state(self, __pyx_state)
+ */
+  __pyx_t_7 = __Pyx_CyFunction_New(&__pyx_mdef_7fairseq_4data_22token_block_utils_fast_15DatasetSearcher_5__setstate_cython__, __Pyx_CYFUNCTION_CCLASS, __pyx_n_s_DatasetSearcher___setstate_cytho, NULL, __pyx_n_s_fairseq_data_token_block_utils_f_2, __pyx_d, ((PyObject *)__pyx_codeobj__33)); if (unlikely(!__pyx_t_7)) __PYX_ERR(1, 16, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_7);
+  if (__Pyx_SetItemOnTypeDict((PyObject *)__pyx_ptype_7fairseq_4data_22token_block_utils_fast_DatasetSearcher, __pyx_n_s_setstate_cython, __pyx_t_7) < 0) __PYX_ERR(1, 16, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0;
+  PyType_Modified(__pyx_ptype_7fairseq_4data_22token_block_utils_fast_DatasetSearcher);
+
+  /* "(tree fragment)":1
+ * def __pyx_unpickle_DatasetSearcher(__pyx_type, long __pyx_checksum, __pyx_state):             # <<<<<<<<<<<<<<
+ *     cdef object __pyx_PickleError
+ *     cdef object __pyx_result
+ */
+  __pyx_t_7 = __Pyx_CyFunction_New(&__pyx_mdef_7fairseq_4data_22token_block_utils_fast_5__pyx_unpickle_DatasetSearcher, 0, __pyx_n_s_pyx_unpickle_DatasetSearcher, NULL, __pyx_n_s_fairseq_data_token_block_utils_f_2, __pyx_d, ((PyObject *)__pyx_codeobj__34)); if (unlikely(!__pyx_t_7)) __PYX_ERR(1, 1, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_7);
+  if (PyDict_SetItem(__pyx_d, __pyx_n_s_pyx_unpickle_DatasetSearcher, __pyx_t_7) < 0) __PYX_ERR(1, 1, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0;
+
+  /* "fairseq/data/token_block_utils_fast.pyx":1
+ * # cython: language_level=3             # <<<<<<<<<<<<<<
+ * # Copyright (c) Facebook, Inc. and its affiliates.
+ * #
+ */
+  __pyx_t_7 = __Pyx_PyDict_NewPresized(0); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 1, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_7);
+  if (PyDict_SetItem(__pyx_d, __pyx_n_s_test, __pyx_t_7) < 0) __PYX_ERR(0, 1, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0;
+
+  /*--- Wrapped vars code ---*/
+
+  goto __pyx_L0;
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_4);
+  __Pyx_XDECREF(__pyx_t_5);
+  __Pyx_XDECREF(__pyx_t_7);
+  if (__pyx_m) {
+    if (__pyx_d && stringtab_initialized) {
+      __Pyx_AddTraceback("init fairseq.data.token_block_utils_fast", __pyx_clineno, __pyx_lineno, __pyx_filename);
+    }
+    #if !CYTHON_USE_MODULE_STATE
+    Py_CLEAR(__pyx_m);
+    #else
+    Py_DECREF(__pyx_m);
+    if (pystate_addmodule_run) {
+      PyObject *tp, *value, *tb;
+      PyErr_Fetch(&tp, &value, &tb);
+      PyState_RemoveModule(&__pyx_moduledef);
+      PyErr_Restore(tp, value, tb);
+    }
+    #endif
+  } else if (!PyErr_Occurred()) {
+    PyErr_SetString(PyExc_ImportError, "init fairseq.data.token_block_utils_fast");
+  }
+  __pyx_L0:;
+  __Pyx_RefNannyFinishContext();
+  #if CYTHON_PEP489_MULTI_PHASE_INIT
+  return (__pyx_m != NULL) ? 0 : -1;
+  #elif PY_MAJOR_VERSION >= 3
+  return __pyx_m;
+  #else
+  return;
+  #endif
+}
+/* #### Code section: cleanup_globals ### */
+/* #### Code section: cleanup_module ### */
+/* #### Code section: main_method ### */
+/* #### Code section: utility_code_pragmas ### */
+#ifdef _MSC_VER
+#pragma warning( push )
+/* Warning 4127: conditional expression is constant
+ * Cython uses constant conditional expressions to allow inline functions to be optimized at
+ * compile-time, so this warning is not useful
+ */
+#pragma warning( disable : 4127 )
+#endif
+
+
+
+/* #### Code section: utility_code_def ### */
+
+/* --- Runtime support code --- */
+/* Refnanny */
+#if CYTHON_REFNANNY
+/* Import the module `modname`, read its "RefNannyAPI" attribute and convert
+ * that attribute to a raw pointer with PyLong_AsVoidPtr().
+ * Returns NULL when the import or the attribute lookup fails; otherwise the
+ * result of the conversion is returned as-is. Both temporaries are released
+ * before returning. */
+static __Pyx_RefNannyAPIStruct *__Pyx_RefNannyImportAPI(const char *modname) {
+    void *api_ptr = NULL;
+    PyObject *api_obj = NULL;
+    PyObject *module = PyImport_ImportModule(modname);
+    if (module) {
+        api_obj = PyObject_GetAttrString(module, "RefNannyAPI");
+        if (api_obj) {
+            api_ptr = PyLong_AsVoidPtr(api_obj);
+        }
+    }
+    Py_XDECREF(api_obj);
+    Py_XDECREF(module);
+    return (__Pyx_RefNannyAPIStruct *)api_ptr;
+}
+#endif
+
+/* PyErrExceptionMatches */
+#if CYTHON_FAST_THREAD_STATE
+/* Return 1 if `exc_type` matches any entry of `tuple`, otherwise 0. */
+static int __Pyx_PyErr_ExceptionMatchesTuple(PyObject *exc_type, PyObject *tuple) {
+    Py_ssize_t pos;
+    const Py_ssize_t count = PyTuple_GET_SIZE(tuple);
+#if PY_MAJOR_VERSION >= 3
+    /* First pass: plain pointer identity against every entry. */
+    for (pos = 0; pos < count; ++pos) {
+        PyObject *candidate = PyTuple_GET_ITEM(tuple, pos);
+        if (candidate == exc_type) return 1;
+    }
+#endif
+    /* Second pass: full exception-match test against every entry. */
+    pos = 0;
+    while (pos < count) {
+        if (__Pyx_PyErr_GivenExceptionMatches(exc_type, PyTuple_GET_ITEM(tuple, pos))) return 1;
+        ++pos;
+    }
+    return 0;
+}
+static CYTHON_INLINE int __Pyx_PyErr_ExceptionMatchesInState(PyThreadState* tstate, PyObject* err) {  /*
+ * Test whether the exception pending on `tstate` matches `err`.
+ *
+ * `err` may be a single object or a tuple of candidates. Returns 1 on a
+ * match and 0 otherwise (including when nothing is pending and `err` is not
+ * NULL). The pending exception is only inspected, never cleared.
+ */
+    int result;
+    PyObject *exc_type;
+#if PY_VERSION_HEX >= 0x030C00A6  /* thread state holds one exception object; its type is derived from it */
+    PyObject *current_exception = tstate->current_exception;
+    if (unlikely(!current_exception)) return 0;  /* nothing pending */
+    exc_type = (PyObject*) Py_TYPE(current_exception);
+    if (exc_type == err) return 1;  /* identity fast path */
+#else  /* thread state stores the exception type directly */
+    exc_type = tstate->curexc_type;
+    if (exc_type == err) return 1;  /* identity fast path; note it runs before the NULL test below */
+    if (unlikely(!exc_type)) return 0;  /* nothing pending */
+#endif
+    #if CYTHON_AVOID_BORROWED_REFS
+    Py_INCREF(exc_type);  /* hold our own reference for the duration of the match calls */
+    #endif
+    if (unlikely(PyTuple_Check(err))) {
+        result = __Pyx_PyErr_ExceptionMatchesTuple(exc_type, err);  /* match against each tuple entry */
+    } else {
+        result = __Pyx_PyErr_GivenExceptionMatches(exc_type, err);
+    }
+    #if CYTHON_AVOID_BORROWED_REFS
+    Py_DECREF(exc_type);  /* pairs with the Py_INCREF above */
+    #endif
+    return result;
+}
+#endif
+
+/* PyErrFetchRestore */
+#if CYTHON_FAST_THREAD_STATE
+static CYTHON_INLINE void __Pyx_ErrRestoreInState(PyThreadState *tstate, PyObject *type, PyObject *value, PyObject *tb) {  /*
+ * Install (type, value, tb) as the pending exception of `tstate`, replacing
+ * whatever was pending before. Passing all three as NULL leaves nothing
+ * pending.
+ *
+ * The passed references are consumed: each one is either stored in the
+ * thread state or released here; none is INCREF'ed.
+ */
+#if PY_VERSION_HEX >= 0x030C00A6  /* thread state keeps a single exception object */
+    PyObject *tmp_value;
+    assert(type == NULL || (value != NULL && type == (PyObject*) Py_TYPE(value)));  /* only `value` is stored below, so `type` must agree with it */
+    if (value) {
+        #if CYTHON_COMPILING_IN_CPYTHON
+        if (unlikely(((PyBaseExceptionObject*) value)->traceback != tb))  /* skip the setter when `tb` is already attached */
+        #endif
+            PyException_SetTraceback(value, tb);
+    }
+    tmp_value = tstate->current_exception;
+    tstate->current_exception = value;
+    Py_XDECREF(tmp_value);  /* old exception is released only after the new one is stored */
+    Py_XDECREF(type);  /* not stored in this branch, so drop the consumed reference */
+    Py_XDECREF(tb);  /* likewise not stored directly in the thread state */
+#else  /* thread state keeps separate type/value/traceback slots */
+    PyObject *tmp_type, *tmp_value, *tmp_tb;
+    tmp_type = tstate->curexc_type;
+    tmp_value = tstate->curexc_value;
+    tmp_tb = tstate->curexc_traceback;
+    tstate->curexc_type = type;
+    tstate->curexc_value = value;
+    tstate->curexc_traceback = tb;
+    Py_XDECREF(tmp_type);  /* old triple is released only after the new one is stored */
+    Py_XDECREF(tmp_value);
+    Py_XDECREF(tmp_tb);
+#endif
+}
+static CYTHON_INLINE void __Pyx_ErrFetchInState(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb) {  /*
+ * Move the pending exception out of `tstate` into the three out-parameters
+ * and leave the thread state with nothing pending. When no exception was
+ * pending, all three outputs are set to NULL.
+ */
+#if PY_VERSION_HEX >= 0x030C00A6  /* thread state keeps a single exception object */
+    PyObject* exc_value;
+    exc_value = tstate->current_exception;
+    tstate->current_exception = 0;
+    *value = exc_value;  /* the reference formerly held by the thread state is handed to the caller */
+    *type = NULL;
+    *tb = NULL;
+    if (exc_value) {
+        *type = (PyObject*) Py_TYPE(exc_value);
+        Py_INCREF(*type);  /* derived from the value, so take a reference for the caller */
+        #if CYTHON_COMPILING_IN_CPYTHON
+        *tb = ((PyBaseExceptionObject*) exc_value)->traceback;  /* read the struct field directly; may be NULL */
+        Py_XINCREF(*tb);
+        #else
+        *tb = PyException_GetTraceback(exc_value);
+        #endif
+    }
+#else  /* thread state keeps separate slots: move all three out and zero them */
+    *type = tstate->curexc_type;
+    *value = tstate->curexc_value;
+    *tb = tstate->curexc_traceback;
+    tstate->curexc_type = 0;
+    tstate->curexc_value = 0;
+    tstate->curexc_traceback = 0;
+#endif
+}
+#endif
+
+/* PyObjectGetAttrStr */
+#if CYTHON_USE_TYPE_SLOTS
+/* Look up `attr_name` on `obj` by calling the type's tp_getattro slot
+ * directly when it is set. Without that slot, Python 2 builds try the
+ * C-string tp_getattr slot next; the final fallback is PyObject_GetAttr(). */
+static CYTHON_INLINE PyObject* __Pyx_PyObject_GetAttrStr(PyObject* obj, PyObject* attr_name) {
+    PyTypeObject* obj_type = Py_TYPE(obj);
+    if (unlikely(!obj_type->tp_getattro)) {
+#if PY_MAJOR_VERSION < 3
+        if (likely(obj_type->tp_getattr)) {
+            return obj_type->tp_getattr(obj, PyString_AS_STRING(attr_name));
+        }
+#endif
+        return PyObject_GetAttr(obj, attr_name);
+    }
+    return obj_type->tp_getattro(obj, attr_name);
+}
+#endif
+
+/* PyObjectGetAttrStrNoError */
+#if __PYX_LIMITED_VERSION_HEX < 0x030d00A1  /* only compiled when the fallback path of __Pyx_PyObject_GetAttrStrNoError is compiled */
+static void __Pyx_PyObject_GetAttrStr_ClearAttributeError(void) {  /*
+ * Clear the pending exception, but only if it matches AttributeError.
+ * Any other pending exception is left untouched.
+ */
+    __Pyx_PyThreadState_declare
+    __Pyx_PyThreadState_assign
+    if (likely(__Pyx_PyErr_ExceptionMatches(PyExc_AttributeError)))
+        __Pyx_PyErr_Clear();
+}
+#endif
+static CYTHON_INLINE PyObject* __Pyx_PyObject_GetAttrStrNoError(PyObject* obj, PyObject* attr_name) {  /*
+ * Attribute lookup that does not leave an AttributeError pending.
+ *
+ * Returns the attribute, or NULL when the lookup failed. On the fallback
+ * path a pending AttributeError is cleared, while any other exception stays
+ * set, so callers must check PyErr_Occurred() to tell "missing" from "error".
+ */
+    PyObject *result;
+#if __PYX_LIMITED_VERSION_HEX >= 0x030d00A1
+    (void) PyObject_GetOptionalAttr(obj, attr_name, &result);  /* status code deliberately ignored; only `result` is returned */
+    return result;
+#else
+#if CYTHON_COMPILING_IN_CPYTHON && CYTHON_USE_TYPE_SLOTS && PY_VERSION_HEX >= 0x030700B1
+    PyTypeObject* tp = Py_TYPE(obj);
+    if (likely(tp->tp_getattro == PyObject_GenericGetAttr)) {  /* fast path for types using the generic getattr */
+        return _PyObject_GenericGetAttrWithDict(obj, attr_name, NULL, 1);  /* NOTE(review): trailing 1 is presumably the "suppress AttributeError" flag - confirm against CPython */
+    }
+#endif
+    result = __Pyx_PyObject_GetAttrStr(obj, attr_name);
+    if (unlikely(!result)) {
+        __Pyx_PyObject_GetAttrStr_ClearAttributeError();  /* swallow AttributeError only */
+    }
+    return result;
+#endif
+}
+
+/* GetBuiltinName */
+/* Fetch `name` from the builtins object `__pyx_b`.
+ * Returns the attribute on success. On failure returns NULL; if the lookup
+ * left no exception pending, a NameError naming `name` is raised. */
+static PyObject *__Pyx_GetBuiltinName(PyObject *name) {
+    PyObject* builtin = __Pyx_PyObject_GetAttrStrNoError(__pyx_b, name);
+    if (likely(builtin) || PyErr_Occurred()) {
+        /* Either found, or failed with an exception that is already set. */
+        return builtin;
+    }
+    PyErr_Format(PyExc_NameError,
+#if PY_MAJOR_VERSION >= 3
+        "name '%U' is not defined", name);
+#else
+        "name '%.200s' is not defined", PyString_AS_STRING(name));
+#endif
+    return builtin;
+}
+
+/* TupleAndListFromArray */
+#if CYTHON_COMPILING_IN_CPYTHON
+/* Copy `length` object pointers from `src` to `dest`, taking a new
+ * reference to every copied object. */
+static CYTHON_INLINE void __Pyx_copy_object_array(PyObject *const *CYTHON_RESTRICT src, PyObject** CYTHON_RESTRICT dest, Py_ssize_t length) {
+    Py_ssize_t idx = 0;
+    while (idx < length) {
+        PyObject *item = src[idx];
+        dest[idx] = item;
+        Py_INCREF(item);
+        ++idx;
+    }
+}
+/* Build a tuple from the first `n` entries of `src`.
+ * For n <= 0 a new reference to the shared empty tuple is returned.
+ * Returns NULL if the tuple cannot be allocated. */
+static CYTHON_INLINE PyObject *
+__Pyx_PyTuple_FromArray(PyObject *const *src, Py_ssize_t n)
+{
+    PyObject *tuple;
+    if (n > 0) {
+        tuple = PyTuple_New(n);
+        if (likely(tuple != NULL)) {
+            /* Fill the item storage directly, one new reference per item. */
+            __Pyx_copy_object_array(src, ((PyTupleObject*)tuple)->ob_item, n);
+        }
+        return tuple;
+    }
+    Py_INCREF(__pyx_empty_tuple);
+    return __pyx_empty_tuple;
+}
+/* Build a list from the first `n` entries of `src`.
+ * For n <= 0 a fresh empty list is returned.
+ * Returns NULL if the list cannot be allocated. */
+static CYTHON_INLINE PyObject *
+__Pyx_PyList_FromArray(PyObject *const *src, Py_ssize_t n)
+{
+    PyObject *list;
+    if (n <= 0) {
+        list = PyList_New(0);
+    } else {
+        list = PyList_New(n);
+        if (likely(list != NULL)) {
+            /* Fill the item storage directly, one new reference per item. */
+            __Pyx_copy_object_array(src, ((PyListObject*)list)->ob_item, n);
+        }
+    }
+    return list;
+}
+#endif
+
+/* BytesEquals */
+static CYTHON_INLINE int __Pyx_PyBytes_Equals(PyObject* s1, PyObject* s2, int equals) {  /*
+ * Compare `s1` and `s2` for (in)equality, with fast paths for exact bytes.
+ *
+ * `equals` is the comparison to evaluate; the fast paths handle it as
+ * Py_EQ or Py_NE. Returns 1 if the requested comparison holds, 0 if it does
+ * not, and -1 when the generic rich comparison fails.
+ */
+#if CYTHON_COMPILING_IN_PYPY || CYTHON_COMPILING_IN_LIMITED_API
+    return PyObject_RichCompareBool(s1, s2, equals);  /* no access to object internals here */
+#else
+    if (s1 == s2) {
+        return (equals == Py_EQ);  /* same object: equal */
+    } else if (PyBytes_CheckExact(s1) & PyBytes_CheckExact(s2)) {
+        const char *ps1, *ps2;
+        Py_ssize_t length = PyBytes_GET_SIZE(s1);
+        if (length != PyBytes_GET_SIZE(s2))
+            return (equals == Py_NE);  /* different lengths: not equal */
+        ps1 = PyBytes_AS_STRING(s1);
+        ps2 = PyBytes_AS_STRING(s2);
+        if (ps1[0] != ps2[0]) {
+            return (equals == Py_NE);  /* first byte differs: not equal */
+        } else if (length == 1) {
+            return (equals == Py_EQ);  /* single byte already compared above */
+        } else {
+            int result;
+#if CYTHON_USE_UNICODE_INTERNALS && (PY_VERSION_HEX < 0x030B0000)
+            Py_hash_t hash1, hash2;
+            hash1 = ((PyBytesObject*)s1)->ob_shash;
+            hash2 = ((PyBytesObject*)s2)->ob_shash;
+            if (hash1 != hash2 && hash1 != -1 && hash2 != -1) {  /* NOTE(review): -1 presumably marks a hash that has not been computed yet - confirm */
+                return (equals == Py_NE);
+            }
+#endif
+            result = memcmp(ps1, ps2, (size_t)length);  /* full content comparison */
+            return (equals == Py_EQ) ? (result == 0) : (result != 0);
+        }
+    } else if ((s1 == Py_None) & PyBytes_CheckExact(s2)) {
+        return (equals == Py_NE);  /* None vs. exact bytes: not equal */
+    } else if ((s2 == Py_None) & PyBytes_CheckExact(s1)) {
+        return (equals == Py_NE);
+    } else {
+        int result;  /* anything else: defer to the generic rich comparison */
+        PyObject* py_result = PyObject_RichCompare(s1, s2, equals);
+        if (!py_result)
+            return -1;
+        result = __Pyx_PyObject_IsTrue(py_result);
+        Py_DECREF(py_result);
+        return result;
+    }
+#endif
+}
+
+/* UnicodeEquals */
+static CYTHON_INLINE int __Pyx_PyUnicode_Equals(PyObject* s1, PyObject* s2, int equals) {  /*
+ * Compare `s1` and `s2` for (in)equality, with fast paths for exact unicode.
+ *
+ * `equals` is the comparison to evaluate; the fast paths handle it as
+ * Py_EQ or Py_NE. Returns 1 if the requested comparison holds, 0 if it does
+ * not, and -1 on error (failed conversion, failed "ready" step, or failed
+ * generic rich comparison).
+ */
+#if CYTHON_COMPILING_IN_PYPY || CYTHON_COMPILING_IN_LIMITED_API
+    return PyObject_RichCompareBool(s1, s2, equals);  /* no access to object internals here */
+#else
+#if PY_MAJOR_VERSION < 3
+    PyObject* owned_ref = NULL;  /* temporary unicode copy of a Py2 str operand; released on every exit after it is set */
+#endif
+    int s1_is_unicode, s2_is_unicode;
+    if (s1 == s2) {
+        goto return_eq;  /* same object: equal */
+    }
+    s1_is_unicode = PyUnicode_CheckExact(s1);
+    s2_is_unicode = PyUnicode_CheckExact(s2);
+#if PY_MAJOR_VERSION < 3  /* Py2 only: convert a str operand to unicode, or delegate str-vs-str to the bytes comparison */
+    if ((s1_is_unicode & (!s2_is_unicode)) && PyString_CheckExact(s2)) {
+        owned_ref = PyUnicode_FromObject(s2);
+        if (unlikely(!owned_ref))
+            return -1;
+        s2 = owned_ref;
+        s2_is_unicode = 1;
+    } else if ((s2_is_unicode & (!s1_is_unicode)) && PyString_CheckExact(s1)) {
+        owned_ref = PyUnicode_FromObject(s1);
+        if (unlikely(!owned_ref))
+            return -1;
+        s1 = owned_ref;
+        s1_is_unicode = 1;
+    } else if (((!s2_is_unicode) & (!s1_is_unicode))) {
+        return __Pyx_PyBytes_Equals(s1, s2, equals);
+    }
+#endif
+    if (s1_is_unicode & s2_is_unicode) {
+        Py_ssize_t length;
+        int kind;
+        void *data1, *data2;
+        if (unlikely(__Pyx_PyUnicode_READY(s1) < 0) || unlikely(__Pyx_PyUnicode_READY(s2) < 0))
+            return -1;
+        length = __Pyx_PyUnicode_GET_LENGTH(s1);
+        if (length != __Pyx_PyUnicode_GET_LENGTH(s2)) {
+            goto return_ne;  /* different lengths: not equal */
+        }
+#if CYTHON_USE_UNICODE_INTERNALS
+        {
+            Py_hash_t hash1, hash2;
+        #if CYTHON_PEP393_ENABLED
+            hash1 = ((PyASCIIObject*)s1)->hash;
+            hash2 = ((PyASCIIObject*)s2)->hash;
+        #else
+            hash1 = ((PyUnicodeObject*)s1)->hash;
+            hash2 = ((PyUnicodeObject*)s2)->hash;
+        #endif
+            if (hash1 != hash2 && hash1 != -1 && hash2 != -1) {  /* NOTE(review): -1 presumably marks a hash that has not been computed yet - confirm */
+                goto return_ne;
+            }
+        }
+#endif
+        kind = __Pyx_PyUnicode_KIND(s1);
+        if (kind != __Pyx_PyUnicode_KIND(s2)) {
+            goto return_ne;  /* different storage kinds are treated as not equal */
+        }
+        data1 = __Pyx_PyUnicode_DATA(s1);
+        data2 = __Pyx_PyUnicode_DATA(s2);
+        if (__Pyx_PyUnicode_READ(kind, data1, 0) != __Pyx_PyUnicode_READ(kind, data2, 0)) {
+            goto return_ne;  /* first character differs: not equal */
+        } else if (length == 1) {
+            goto return_eq;  /* single character already compared above */
+        } else {
+            int result = memcmp(data1, data2, (size_t)(length * kind));  /* byte count is length * kind */
+            #if PY_MAJOR_VERSION < 3
+            Py_XDECREF(owned_ref);
+            #endif
+            return (equals == Py_EQ) ? (result == 0) : (result != 0);
+        }
+    } else if ((s1 == Py_None) & s2_is_unicode) {
+        goto return_ne;  /* None vs. exact unicode: not equal */
+    } else if ((s2 == Py_None) & s1_is_unicode) {
+        goto return_ne;
+    } else {
+        int result;  /* anything else: defer to the generic rich comparison */
+        PyObject* py_result = PyObject_RichCompare(s1, s2, equals);
+        #if PY_MAJOR_VERSION < 3
+        Py_XDECREF(owned_ref);
+        #endif
+        if (!py_result)
+            return -1;
+        result = __Pyx_PyObject_IsTrue(py_result);
+        Py_DECREF(py_result);
+        return result;
+    }
+return_eq:  /* operands are equal: result is true only for Py_EQ */
+    #if PY_MAJOR_VERSION < 3
+    Py_XDECREF(owned_ref);
+    #endif
+    return (equals == Py_EQ);
+return_ne:  /* operands differ: result is true only for Py_NE */
+    #if PY_MAJOR_VERSION < 3
+    Py_XDECREF(owned_ref);
+    #endif
+    return (equals == Py_NE);
+#endif
+}
+
+/* fastcall */
+#if CYTHON_METH_FASTCALL
+/*
+ * Find keyword `s` in the `kwnames` tuple and return the entry of `kwvalues`
+ * at the same index. The returned pointer is handed back as stored in
+ * `kwvalues` (no INCREF is performed here).
+ * Returns NULL when no name matches, or when a string comparison fails.
+ */
+static CYTHON_INLINE PyObject * __Pyx_GetKwValue_FASTCALL(PyObject *kwnames, PyObject *const *kwvalues, PyObject *s)
+{
+    const Py_ssize_t count = PyTuple_GET_SIZE(kwnames);
+    Py_ssize_t idx;
+    /* First pass: pointer identity only, no comparison calls. */
+    for (idx = 0; idx < count; ++idx) {
+        if (PyTuple_GET_ITEM(kwnames, idx) == s)
+            return kwvalues[idx];
+    }
+    /* Second pass: compare by value; a negative result signals an error. */
+    idx = 0;
+    while (idx < count) {
+        int cmp = __Pyx_PyUnicode_Equals(s, PyTuple_GET_ITEM(kwnames, idx), Py_EQ);
+        if (unlikely(cmp < 0))
+            return NULL;
+        if (unlikely(cmp > 0))
+            return kwvalues[idx];
+        idx++;
+    }
+    return NULL;
+}
+#if CYTHON_COMPILING_IN_CPYTHON && PY_VERSION_HEX >= 0x030d0000
+/*
+ * Build a new dict mapping each name in the `kwnames` tuple to the value at
+ * the same index of `kwvalues`.
+ * Returns the new dict, or NULL if the dict could not be created or an
+ * insertion failed (the partially filled dict is released in that case).
+ */
+CYTHON_UNUSED static PyObject *__Pyx_KwargsAsDict_FASTCALL(PyObject *kwnames, PyObject *const *kwvalues) {
+    const Py_ssize_t total = PyTuple_GET_SIZE(kwnames);
+    Py_ssize_t idx = 0;
+    PyObject *result = PyDict_New();
+    if (unlikely(result == NULL))
+        return NULL;
+    while (idx < total) {
+        if (unlikely(PyDict_SetItem(result, PyTuple_GET_ITEM(kwnames, idx), kwvalues[idx]) < 0)) {
+            Py_DECREF(result);
+            return NULL;
+        }
+        ++idx;
+    }
+    return result;
+}
+#endif
+#endif
+
+/* RaiseArgTupleInvalid */
+/*
+ * Raise TypeError reporting a wrong number of positional arguments.
+ *
+ * func_name  name shown in the message.
+ * exact      non-zero to word the message as "exactly N".
+ * num_min    lower bound; reported when `num_found` is below it.
+ * num_max    upper bound; reported otherwise.
+ * num_found  number of positional arguments actually received.
+ */
+static void __Pyx_RaiseArgtupleInvalid(
+    const char* func_name,
+    int exact,
+    Py_ssize_t num_min,
+    Py_ssize_t num_max,
+    Py_ssize_t num_found)
+{
+    const int too_few = (num_found < num_min);
+    const Py_ssize_t num_expected = too_few ? num_min : num_max;
+    /* "exactly" overrides the at-least / at-most wording. */
+    const char *quantifier = exact ? "exactly" : (too_few ? "at least" : "at most");
+    const char *plural_suffix = (num_expected == 1) ? "" : "s";
+    PyErr_Format(PyExc_TypeError,
+                 "%.200s() takes %.8s %" CYTHON_FORMAT_SSIZE_T "d positional argument%.1s (%" CYTHON_FORMAT_SSIZE_T "d given)",
+                 func_name, quantifier, num_expected,
+                 plural_suffix, num_found);
+}
+
+/* RaiseDoubleKeywords */
+/*
+ * Raise TypeError stating that keyword `kw_name` received multiple values
+ * in a call to `func_name`.
+ */
+static void __Pyx_RaiseDoubleKeywordsError(
+    const char* func_name,
+    PyObject* kw_name)
+{
+#if PY_MAJOR_VERSION >= 3
+    /* %U formats the name object directly. */
+    PyErr_Format(PyExc_TypeError,
+        "%s() got multiple values for keyword argument '%U'",
+        func_name, kw_name);
+#else
+    PyErr_Format(PyExc_TypeError,
+        "%s() got multiple values for keyword argument '%s'",
+        func_name, PyString_AsString(kw_name));
+#endif
+}
+
+/* ParseKeywords */
+/*
+ * Match keyword arguments against the NULL-terminated `argnames` table and
+ * store each matched value in the corresponding slot of `values`.
+ *
+ * kwds           either a dict, or (when CYTHON_METH_FASTCALL is enabled and
+ *                it passes PyTuple_Check) a tuple of names whose values are
+ *                read from `kwvalues` at the same index.
+ * kwvalues       value array indexed in parallel with a tuple `kwds`; not
+ *                read on the dict path.
+ * argnames       table of pointers to argument-name objects, terminated by a
+ *                NULL entry; entries before index `num_pos_args` are only
+ *                consulted to detect an argument passed twice.
+ * kwds2          optional dict receiving keywords that match no name; when
+ *                NULL such a keyword raises TypeError.
+ * values         output array indexed like `argnames`.
+ * function_name  used only in error messages.
+ *
+ * Returns 0 on success and -1 on failure (the failure paths visible here
+ * either raise TypeError or follow a failed C-API call).
+ */
+static int __Pyx_ParseOptionalKeywords(
+    PyObject *kwds,
+    PyObject *const *kwvalues,
+    PyObject **argnames[],
+    PyObject *kwds2,
+    PyObject *values[],
+    Py_ssize_t num_pos_args,
+    const char* function_name)
+{
+    PyObject *key = 0, *value = 0;
+    Py_ssize_t pos = 0;
+    PyObject*** name;
+    PyObject*** first_kw_arg = argnames + num_pos_args;
+    int kwds_is_tuple = CYTHON_METH_FASTCALL && likely(PyTuple_Check(kwds));
+    while (1) {
+        /* Release whatever references the previous iteration left in
+           key/value (both are NULL when nothing is owned). */
+        Py_XDECREF(key); key = NULL;
+        Py_XDECREF(value); value = NULL;
+        if (kwds_is_tuple) {
+            /* Tuple path: names come from `kwds`, values from `kwvalues`. */
+            Py_ssize_t size;
+#if CYTHON_ASSUME_SAFE_MACROS
+            size = PyTuple_GET_SIZE(kwds);
+#else
+            size = PyTuple_Size(kwds);
+            if (size < 0) goto bad;
+#endif
+            if (pos >= size) break;
+#if CYTHON_AVOID_BORROWED_REFS
+            key = __Pyx_PySequence_ITEM(kwds, pos);
+            if (!key) goto bad;
+#elif CYTHON_ASSUME_SAFE_MACROS
+            key = PyTuple_GET_ITEM(kwds, pos);
+#else
+            key = PyTuple_GetItem(kwds, pos);
+            if (!key) goto bad;
+#endif
+            value = kwvalues[pos];
+            pos++;
+        }
+        else
+        {
+            /* Dict path: PyDict_Next advances `pos` and returns 0 at the end. */
+            if (!PyDict_Next(kwds, &pos, &key, &value)) break;
+#if CYTHON_AVOID_BORROWED_REFS
+            Py_INCREF(key);
+#endif
+        }
+        /* Fast match: compare the key by pointer identity against the
+           keyword-capable names. */
+        name = first_kw_arg;
+        while (*name && (**name != key)) name++;
+        if (*name) {
+            values[name-argnames] = value;
+#if CYTHON_AVOID_BORROWED_REFS
+            Py_INCREF(value);
+            Py_DECREF(key);
+#endif
+            /* Clear both so the XDECREFs at the loop top are no-ops. */
+            key = NULL;
+            value = NULL;
+            continue;
+        }
+        /* Slow match: from here on key and value are held as owned
+           references until the loop top or the exit labels release them. */
+#if !CYTHON_AVOID_BORROWED_REFS
+        Py_INCREF(key);
+#endif
+        Py_INCREF(value);
+        name = first_kw_arg;
+        #if PY_MAJOR_VERSION < 3
+        if (likely(PyString_Check(key))) {
+            /* Python 2 byte-string keys: compare by size, then content. */
+            while (*name) {
+                if ((CYTHON_COMPILING_IN_PYPY || PyString_GET_SIZE(**name) == PyString_GET_SIZE(key))
+                        && _PyString_Eq(**name, key)) {
+                    values[name-argnames] = value;
+#if CYTHON_AVOID_BORROWED_REFS
+                    value = NULL;
+#endif
+                    break;
+                }
+                name++;
+            }
+            if (*name) continue;
+            else {
+                /* No keyword slot matched: check whether the key names an
+                   argument in the positional range. */
+                PyObject*** argname = argnames;
+                while (argname != first_kw_arg) {
+                    if ((**argname == key) || (
+                            (CYTHON_COMPILING_IN_PYPY || PyString_GET_SIZE(**argname) == PyString_GET_SIZE(key))
+                             && _PyString_Eq(**argname, key))) {
+                        goto arg_passed_twice;
+                    }
+                    argname++;
+                }
+            }
+        } else
+        #endif
+        if (likely(PyUnicode_Check(key))) {
+            while (*name) {
+                /* A length mismatch short-circuits to "not equal" (1)
+                   without calling PyUnicode_Compare. */
+                int cmp = (
+                #if !CYTHON_COMPILING_IN_PYPY && PY_MAJOR_VERSION >= 3
+                    (__Pyx_PyUnicode_GET_LENGTH(**name) != __Pyx_PyUnicode_GET_LENGTH(key)) ? 1 :
+                #endif
+                    PyUnicode_Compare(**name, key)
+                );
+                /* A negative result is only an error if an exception is set. */
+                if (cmp < 0 && unlikely(PyErr_Occurred())) goto bad;
+                if (cmp == 0) {
+                    values[name-argnames] = value;
+#if CYTHON_AVOID_BORROWED_REFS
+                    value = NULL;
+#endif
+                    break;
+                }
+                name++;
+            }
+            if (*name) continue;
+            else {
+                /* No keyword slot matched: check whether the key names an
+                   argument in the positional range. */
+                PyObject*** argname = argnames;
+                while (argname != first_kw_arg) {
+                    int cmp = (**argname == key) ? 0 :
+                    #if !CYTHON_COMPILING_IN_PYPY && PY_MAJOR_VERSION >= 3
+                        (__Pyx_PyUnicode_GET_LENGTH(**argname) != __Pyx_PyUnicode_GET_LENGTH(key)) ? 1 :
+                    #endif
+                        PyUnicode_Compare(**argname, key);
+                    if (cmp < 0 && unlikely(PyErr_Occurred())) goto bad;
+                    if (cmp == 0) goto arg_passed_twice;
+                    argname++;
+                }
+            }
+        } else
+            goto invalid_keyword_type;
+        /* Unknown keyword: collect it in kwds2 when provided, else reject. */
+        if (kwds2) {
+            if (unlikely(PyDict_SetItem(kwds2, key, value))) goto bad;
+        } else {
+            goto invalid_keyword;
+        }
+    }
+    Py_XDECREF(key);
+    Py_XDECREF(value);
+    return 0;
+arg_passed_twice:
+    __Pyx_RaiseDoubleKeywordsError(function_name, key);
+    goto bad;
+invalid_keyword_type:
+    PyErr_Format(PyExc_TypeError,
+        "%.200s() keywords must be strings", function_name);
+    goto bad;
+invalid_keyword:
+    #if PY_MAJOR_VERSION < 3
+    PyErr_Format(PyExc_TypeError,
+        "%.200s() got an unexpected keyword argument '%.200s'",
+        function_name, PyString_AsString(key));
+    #else
+    PyErr_Format(PyExc_TypeError,
+        "%s() got an unexpected keyword argument '%U'",
+        function_name, key);
+    #endif
+    /* Falls through into the shared cleanup below. */
+bad:
+    Py_XDECREF(key);
+    Py_XDECREF(value);
+    return -1;
+}
+
+/* ArgTypeTest */
+/*
+ * Check that argument `obj` is acceptable for `type`.
+ *
+ * Returns 1 when the check passes. Otherwise returns 0 with an exception
+ * set: SystemError if `type` is NULL, else TypeError naming the argument
+ * `name` together with the expected and actual type names.
+ * When `exact` is non-zero no subtype check is made here; only the
+ * Python 2 basestring special case can succeed.
+ */
+static int __Pyx__ArgTypeTest(PyObject *obj, PyTypeObject *type, const char *name, int exact)
+{
+    __Pyx_TypeName expected_name;
+    __Pyx_TypeName actual_name;
+    if (unlikely(type == NULL)) {
+        PyErr_SetString(PyExc_SystemError, "Missing type object");
+        return 0;
+    }
+    if (!exact) {
+        if (likely(__Pyx_TypeCheck(obj, type)))
+            return 1;
+    }
+#if PY_MAJOR_VERSION == 2
+    else if ((type == &PyBaseString_Type) && likely(__Pyx_PyBaseString_CheckExact(obj))) {
+        return 1;
+    }
+#endif
+    /* Mismatch: report both type names, then release them. */
+    expected_name = __Pyx_PyType_GetName(type);
+    actual_name = __Pyx_PyType_GetName(Py_TYPE(obj));
+    PyErr_Format(PyExc_TypeError,
+        "Argument '%.200s' has incorrect type (expected " __Pyx_FMT_TYPENAME
+        ", got " __Pyx_FMT_TYPENAME ")", name, expected_name, actual_name);
+    __Pyx_DECREF_TypeName(expected_name);
+    __Pyx_DECREF_TypeName(actual_name);
+    return 0;
+}
+
+/* RaiseException */
+#if PY_MAJOR_VERSION < 3
+/*
+ * Python 2 variant: set the current exception from (type, value, tb).
+ * `cause` is accepted for signature compatibility and ignored.
+ * `value` and `tb` may be NULL or Py_None, both meaning "absent".
+ * If the arguments are invalid a TypeError is set instead.
+ */
+static void __Pyx_Raise(PyObject *type, PyObject *value, PyObject *tb, PyObject *cause) {
+    __Pyx_PyThreadState_declare
+    CYTHON_UNUSED_VAR(cause);
+    /* Take our own references; they are either handed to
+       __Pyx_ErrRestore or dropped at raise_error. */
+    Py_XINCREF(type);
+    if (!value || value == Py_None)
+        value = NULL;
+    else
+        Py_INCREF(value);
+    if (!tb || tb == Py_None)
+        tb = NULL;
+    else {
+        Py_INCREF(tb);
+        if (!PyTraceBack_Check(tb)) {
+            PyErr_SetString(PyExc_TypeError,
+                "raise: arg 3 must be a traceback or None");
+            goto raise_error;
+        }
+    }
+    if (PyType_Check(type)) {
+        /* `type` is a class: let CPython normalize type/value/tb. */
+#if CYTHON_COMPILING_IN_PYPY
+        if (!value) {
+            Py_INCREF(Py_None);
+            value = Py_None;
+        }
+#endif
+        PyErr_NormalizeException(&type, &value, &tb);
+    } else {
+        /* `type` is an instance: it becomes the value and its class the
+           type; a separate value is not allowed. */
+        if (value) {
+            PyErr_SetString(PyExc_TypeError,
+                "instance exception may not have a separate value");
+            goto raise_error;
+        }
+        value = type;
+        type = (PyObject*) Py_TYPE(type);
+        Py_INCREF(type);
+        if (!PyType_IsSubtype((PyTypeObject *)type, (PyTypeObject *)PyExc_BaseException)) {
+            PyErr_SetString(PyExc_TypeError,
+                "raise: exception class must be a subclass of BaseException");
+            goto raise_error;
+        }
+    }
+    __Pyx_PyThreadState_assign
+    __Pyx_ErrRestore(type, value, tb);
+    return;
+raise_error:
+    Py_XDECREF(value);
+    Py_XDECREF(type);
+    Py_XDECREF(tb);
+    return;
+}
+#else
+/*
+ * Python 3 variant: set the current exception from (type, value, tb, cause).
+ *
+ * type   an exception instance or an exception class.
+ * value  NULL/Py_None, an exception instance, or constructor argument(s)
+ *        used to instantiate `type`.
+ * tb     NULL/Py_None or a traceback object.
+ * cause  NULL (leave the cause untouched), Py_None (clear it), or an
+ *        exception class/instance to attach via PyException_SetCause.
+ *
+ * On invalid input a TypeError is set instead of the requested exception.
+ */
+static void __Pyx_Raise(PyObject *type, PyObject *value, PyObject *tb, PyObject *cause) {
+    /* Holds the instance created below, if any, so `bad` can release it. */
+    PyObject* owned_instance = NULL;
+    if (tb == Py_None) {
+        tb = 0;
+    } else if (tb && !PyTraceBack_Check(tb)) {
+        PyErr_SetString(PyExc_TypeError,
+            "raise: arg 3 must be a traceback or None");
+        goto bad;
+    }
+    if (value == Py_None)
+        value = 0;
+    if (PyExceptionInstance_Check(type)) {
+        /* Raising an instance: it is the value, its class the type. */
+        if (value) {
+            PyErr_SetString(PyExc_TypeError,
+                "instance exception may not have a separate value");
+            goto bad;
+        }
+        value = type;
+        type = (PyObject*) Py_TYPE(value);
+    } else if (PyExceptionClass_Check(type)) {
+        PyObject *instance_class = NULL;
+        if (value && PyExceptionInstance_Check(value)) {
+            /* `value` is already an instance: use it as-is if its class is
+               `type` or a subclass of it (then raise with the subclass). */
+            instance_class = (PyObject*) Py_TYPE(value);
+            if (instance_class != type) {
+                int is_subclass = PyObject_IsSubclass(instance_class, type);
+                if (!is_subclass) {
+                    instance_class = NULL;
+                } else if (unlikely(is_subclass == -1)) {
+                    goto bad;
+                } else {
+                    type = instance_class;
+                }
+            }
+        }
+        if (!instance_class) {
+            /* Instantiate `type`: no args, the tuple `value` as args, or
+               `value` as the single argument. */
+            PyObject *args;
+            if (!value)
+                args = PyTuple_New(0);
+            else if (PyTuple_Check(value)) {
+                Py_INCREF(value);
+                args = value;
+            } else
+                args = PyTuple_Pack(1, value);
+            if (!args)
+                goto bad;
+            owned_instance = PyObject_Call(type, args, NULL);
+            Py_DECREF(args);
+            if (!owned_instance)
+                goto bad;
+            value = owned_instance;
+            if (!PyExceptionInstance_Check(value)) {
+                PyErr_Format(PyExc_TypeError,
+                             "calling %R should have returned an instance of "
+                             "BaseException, not %R",
+                             type, Py_TYPE(value));
+                goto bad;
+            }
+        }
+    } else {
+        PyErr_SetString(PyExc_TypeError,
+            "raise: exception class must be a subclass of BaseException");
+        goto bad;
+    }
+    if (cause) {
+        /* Normalize `cause` to NULL or an owned exception instance. */
+        PyObject *fixed_cause;
+        if (cause == Py_None) {
+            fixed_cause = NULL;
+        } else if (PyExceptionClass_Check(cause)) {
+            fixed_cause = PyObject_CallObject(cause, NULL);
+            if (fixed_cause == NULL)
+                goto bad;
+        } else if (PyExceptionInstance_Check(cause)) {
+            fixed_cause = cause;
+            Py_INCREF(fixed_cause);
+        } else {
+            PyErr_SetString(PyExc_TypeError,
+                            "exception causes must derive from "
+                            "BaseException");
+            goto bad;
+        }
+        /* The owned reference in fixed_cause is not released in this
+           function; PyException_SetCause is documented to steal it. */
+        PyException_SetCause(value, fixed_cause);
+    }
+    PyErr_SetObject(type, value);
+    if (tb) {
+        /* Attach the caller-supplied traceback to the exception just set;
+           the mechanism depends on the Python version / build options. */
+      #if PY_VERSION_HEX >= 0x030C00A6
+        PyException_SetTraceback(value, tb);
+      #elif CYTHON_FAST_THREAD_STATE
+        PyThreadState *tstate = __Pyx_PyThreadState_Current;
+        PyObject* tmp_tb = tstate->curexc_traceback;
+        if (tb != tmp_tb) {
+            Py_INCREF(tb);
+            tstate->curexc_traceback = tb;
+            Py_XDECREF(tmp_tb);
+        }
+#else
+        PyObject *tmp_type, *tmp_value, *tmp_tb;
+        PyErr_Fetch(&tmp_type, &tmp_value, &tmp_tb);
+        Py_INCREF(tb);
+        PyErr_Restore(tmp_type, tmp_value, tb);
+        Py_XDECREF(tmp_tb);
+#endif
+    }
+    /* Reached on success as well as on error. */
+bad:
+    Py_XDECREF(owned_instance);
+    return;
+}
+#endif
+
+/* PyFunctionFastCall */
+#if CYTHON_FAST_PYCALL && !CYTHON_VECTORCALL
+/*
+ * Run code object `co` in a fresh frame with `na` positional arguments and
+ * no keyword arguments.
+ *
+ * co       code object to execute.
+ * args     array of `na` argument objects; each is INCREF'd and stored in
+ *          the frame's fast-locals slots in order.
+ * na       number of entries read from `args`.
+ * globals  globals passed to PyFrame_New; must not be NULL (asserted).
+ *
+ * Returns the result of PyEval_EvalFrameEx, or NULL if the frame could not
+ * be created.
+ */
+static PyObject* __Pyx_PyFunction_FastCallNoKw(PyCodeObject *co, PyObject **args, Py_ssize_t na,
+                                               PyObject *globals) {
+    PyFrameObject *f;
+    PyThreadState *tstate = __Pyx_PyThreadState_Current;
+    PyObject **fastlocals;
+    Py_ssize_t i;
+    PyObject *result;
+    assert(globals != NULL);
+    /* XXX Perhaps we should create a specialized
+       PyFrame_New() that doesn't take locals, but does
+       take builtins without sanity checking them.
+       */
+    assert(tstate != NULL);
+    f = PyFrame_New(tstate, co, globals, NULL);
+    if (f == NULL) {
+        return NULL;
+    }
+    fastlocals = __Pyx_PyFrame_GetLocalsplus(f);
+    for (i = 0; i < na; i++) {
+        Py_INCREF(*args);
+        fastlocals[i] = *args++;
+    }
+    result = PyEval_EvalFrameEx(f,0);
+    /* The recursion depth is raised only for the duration of the frame
+       release. NOTE(review): presumably mirrors CPython's own fast
+       function-call path; confirm against the targeted CPython version. */
+    ++tstate->recursion_depth;
+    Py_DECREF(f);
+    --tstate->recursion_depth;
+    return result;
+}
+/*
+ * Call the Python function object `func` with `nargs` positional arguments
+ * from `args` and optional keyword arguments in the dict `kwargs`.
+ *
+ * Takes the frame-based shortcut (__Pyx_PyFunction_FastCallNoKw) when there
+ * are no keyword arguments, no keyword-only parameters (Python 3), and the
+ * code flags are exactly CO_OPTIMIZED | CO_NEWLOCALS | CO_NOFREE; otherwise
+ * falls back to PyEval_EvalCodeEx.
+ *
+ * Returns the call result, or NULL on failure (recursion limit reached,
+ * allocation failure, or an error from the call itself).
+ */
+static PyObject *__Pyx_PyFunction_FastCallDict(PyObject *func, PyObject **args, Py_ssize_t nargs, PyObject *kwargs) {
+    PyCodeObject *co = (PyCodeObject *)PyFunction_GET_CODE(func);
+    PyObject *globals = PyFunction_GET_GLOBALS(func);
+    PyObject *argdefs = PyFunction_GET_DEFAULTS(func);
+    PyObject *closure;
+#if PY_MAJOR_VERSION >= 3
+    PyObject *kwdefs;
+#endif
+    PyObject *kwtuple, **k;
+    PyObject **d;
+    Py_ssize_t nd;
+    Py_ssize_t nk;
+    PyObject *result;
+    assert(kwargs == NULL || PyDict_Check(kwargs));
+    nk = kwargs ? PyDict_Size(kwargs) : 0;
+    /* The two branches differ only in the cast of the message argument. */
+    #if PY_MAJOR_VERSION < 3
+    if (unlikely(Py_EnterRecursiveCall((char*)" while calling a Python object"))) {
+        return NULL;
+    }
+    #else
+    if (unlikely(Py_EnterRecursiveCall(" while calling a Python object"))) {
+        return NULL;
+    }
+    #endif
+    if (
+#if PY_MAJOR_VERSION >= 3
+            co->co_kwonlyargcount == 0 &&
+#endif
+            likely(kwargs == NULL || nk == 0) &&
+            co->co_flags == (CO_OPTIMIZED | CO_NEWLOCALS | CO_NOFREE)) {
+        if (argdefs == NULL && co->co_argcount == nargs) {
+            /* Exactly the declared number of arguments, no defaults. */
+            result = __Pyx_PyFunction_FastCallNoKw(co, args, nargs, globals);
+            goto done;
+        }
+        else if (nargs == 0 && argdefs != NULL
+                 && co->co_argcount == Py_SIZE(argdefs)) {
+            /* function called with no arguments, but all parameters have
+               a default value: use default values as arguments .*/
+            args = &PyTuple_GET_ITEM(argdefs, 0);
+            result =__Pyx_PyFunction_FastCallNoKw(co, args, Py_SIZE(argdefs), globals);
+            goto done;
+        }
+    }
+    if (kwargs != NULL) {
+        /* Flatten the kwargs dict into a tuple laid out as
+           key0, value0, key1, value1, ... for PyEval_EvalCodeEx. */
+        Py_ssize_t pos, i;
+        kwtuple = PyTuple_New(2 * nk);
+        if (kwtuple == NULL) {
+            result = NULL;
+            goto done;
+        }
+        k = &PyTuple_GET_ITEM(kwtuple, 0);
+        pos = i = 0;
+        while (PyDict_Next(kwargs, &pos, &k[i], &k[i+1])) {
+            /* PyDict_Next yields borrowed references; the tuple slots
+               need owned ones. */
+            Py_INCREF(k[i]);
+            Py_INCREF(k[i+1]);
+            i += 2;
+        }
+        nk = i / 2;
+    }
+    else {
+        kwtuple = NULL;
+        k = NULL;
+    }
+    closure = PyFunction_GET_CLOSURE(func);
+#if PY_MAJOR_VERSION >= 3
+    kwdefs = PyFunction_GET_KW_DEFAULTS(func);
+#endif
+    if (argdefs != NULL) {
+        d = &PyTuple_GET_ITEM(argdefs, 0);
+        nd = Py_SIZE(argdefs);
+    }
+    else {
+        d = NULL;
+        nd = 0;
+    }
+#if PY_MAJOR_VERSION >= 3
+    result = PyEval_EvalCodeEx((PyObject*)co, globals, (PyObject *)NULL,
+                               args, (int)nargs,
+                               k, (int)nk,
+                               d, (int)nd, kwdefs, closure);
+#else
+    result = PyEval_EvalCodeEx(co, globals, (PyObject *)NULL,
+                               args, (int)nargs,
+                               k, (int)nk,
+                               d, (int)nd, closure);
+#endif
+    Py_XDECREF(kwtuple);
+    /* Every path that passed Py_EnterRecursiveCall leaves through here. */
+done:
+    Py_LeaveRecursiveCall();
+    return result;
+}
+#endif
+
+/* PyObjectCall */
+#if CYTHON_COMPILING_IN_CPYTHON
+/*
+ * Call `func` with the positional tuple `arg` and keyword dict `kw` by
+ * invoking the type's tp_call slot directly under recursion accounting.
+ * Types without tp_call are delegated to PyObject_Call.
+ * Returns the call result; a NULL result that comes without an exception
+ * is turned into SystemError.
+ */
+static CYTHON_INLINE PyObject* __Pyx_PyObject_Call(PyObject *func, PyObject *arg, PyObject *kw) {
+    PyObject *outcome;
+    ternaryfunc slot = Py_TYPE(func)->tp_call;
+    if (unlikely(slot == NULL)) {
+        return PyObject_Call(func, arg, kw);
+    }
+#if PY_MAJOR_VERSION < 3
+    if (unlikely(Py_EnterRecursiveCall((char*)" while calling a Python object"))) {
+        return NULL;
+    }
+#else
+    if (unlikely(Py_EnterRecursiveCall(" while calling a Python object"))) {
+        return NULL;
+    }
+#endif
+    outcome = slot(func, arg, kw);
+    Py_LeaveRecursiveCall();
+    if (unlikely(outcome == NULL)) {
+        /* A callee must not fail silently; make the failure visible. */
+        if (unlikely(!PyErr_Occurred())) {
+            PyErr_SetString(
+                PyExc_SystemError,
+                "NULL result without error in PyObject_Call");
+        }
+    }
+    return outcome;
+}
+#endif
+
+/* PyObjectCallMethO */
+#if CYTHON_COMPILING_IN_CPYTHON
+/*
+ * Invoke the C function wrapped by `func` as `cfunc(self, arg)` under
+ * recursion accounting.
+ * Returns the call result; a NULL result that comes without an exception
+ * is turned into SystemError.
+ */
+static CYTHON_INLINE PyObject* __Pyx_PyObject_CallMethO(PyObject *func, PyObject *arg) {
+    PyObject *outcome;
+    PyCFunction target = __Pyx_CyOrPyCFunction_GET_FUNCTION(func);
+    PyObject *bound_self = __Pyx_CyOrPyCFunction_GET_SELF(func);
+#if PY_MAJOR_VERSION < 3
+    if (unlikely(Py_EnterRecursiveCall((char*)" while calling a Python object"))) {
+        return NULL;
+    }
+#else
+    if (unlikely(Py_EnterRecursiveCall(" while calling a Python object"))) {
+        return NULL;
+    }
+#endif
+    outcome = target(bound_self, arg);
+    Py_LeaveRecursiveCall();
+    if (unlikely(outcome == NULL)) {
+        /* A callee must not fail silently; make the failure visible. */
+        if (unlikely(!PyErr_Occurred())) {
+            PyErr_SetString(
+                PyExc_SystemError,
+                "NULL result without error in PyObject_Call");
+        }
+    }
+    return outcome;
+}
+#endif
+
+/* PyObjectFastCall */
+#if PY_VERSION_HEX < 0x03090000 || CYTHON_COMPILING_IN_LIMITED_API
+/*
+ * Generic call path: pack the `nargs` objects of `args` into a new tuple
+ * and call `func` with it and `kwargs` through __Pyx_PyObject_Call.
+ * Returns the call result, or NULL if the tuple could not be built or the
+ * call failed.
+ */
+static PyObject* __Pyx_PyObject_FastCall_fallback(PyObject *func, PyObject **args, size_t nargs, PyObject *kwargs) {
+    PyObject *outcome = 0;
+    size_t idx;
+    PyObject *packed = PyTuple_New((Py_ssize_t)nargs);
+    if (unlikely(packed == NULL))
+        return NULL;
+    for (idx = 0; idx < nargs; ++idx) {
+        /* Each tuple slot gets its own reference to the argument. */
+        Py_INCREF(args[idx]);
+        if (__Pyx_PyTuple_SET_ITEM(packed, (Py_ssize_t)idx, args[idx]) < 0)
+            break;
+    }
+    /* Only call when every slot was filled successfully. */
+    if (idx == nargs)
+        outcome = __Pyx_PyObject_Call(func, packed, kwargs);
+    Py_DECREF(packed);
+    return outcome;
+}
+#endif
+/*
+ * Call `func` with positional arguments taken from the C array `args` and
+ * optional keyword dict `kwargs`, choosing the cheapest available call
+ * mechanism for the build configuration.
+ *
+ * func    callable object.
+ * args    array of positional arguments.
+ * _nargs  argument count as passed by the caller; the effective count is
+ *         extracted with __Pyx_PyVectorcall_NARGS.
+ * kwargs  keyword dict or NULL.
+ *
+ * Returns the result of whichever call path is taken.
+ */
+static CYTHON_INLINE PyObject* __Pyx_PyObject_FastCallDict(PyObject *func, PyObject **args, size_t _nargs, PyObject *kwargs) {
+    Py_ssize_t nargs = __Pyx_PyVectorcall_NARGS(_nargs);
+#if CYTHON_COMPILING_IN_CPYTHON
+    /* Direct C call for METH_NOARGS / METH_O functions without kwargs. */
+    if (nargs == 0 && kwargs == NULL) {
+        if (__Pyx_CyOrPyCFunction_Check(func) && likely( __Pyx_CyOrPyCFunction_GET_FLAGS(func) & METH_NOARGS))
+            return __Pyx_PyObject_CallMethO(func, NULL);
+    }
+    else if (nargs == 1 && kwargs == NULL) {
+        if (__Pyx_CyOrPyCFunction_Check(func) && likely( __Pyx_CyOrPyCFunction_GET_FLAGS(func) & METH_O))
+            return __Pyx_PyObject_CallMethO(func, args[0]);
+    }
+#endif
+    /* Before Python 3.8.0b1: use the private fast-call entry points. */
+    #if PY_VERSION_HEX < 0x030800B1
+    #if CYTHON_FAST_PYCCALL
+    if (PyCFunction_Check(func)) {
+        if (kwargs) {
+            return _PyCFunction_FastCallDict(func, args, nargs, kwargs);
+        } else {
+            return _PyCFunction_FastCallKeywords(func, args, nargs, NULL);
+        }
+    }
+    #if PY_VERSION_HEX >= 0x030700A1
+    if (!kwargs && __Pyx_IS_TYPE(func, &PyMethodDescr_Type)) {
+        return _PyMethodDescr_FastCallKeywords(func, args, nargs, NULL);
+    }
+    #endif
+    #endif
+    #if CYTHON_FAST_PYCALL
+    if (PyFunction_Check(func)) {
+        return __Pyx_PyFunction_FastCallDict(func, args, nargs, kwargs);
+    }
+    #endif
+    #endif
+    if (kwargs == NULL) {
+        /* Without kwargs, try the vectorcall slot (or Cython's backport). */
+        #if CYTHON_VECTORCALL
+        #if PY_VERSION_HEX < 0x03090000
+        vectorcallfunc f = _PyVectorcall_Function(func);
+        #else
+        vectorcallfunc f = PyVectorcall_Function(func);
+        #endif
+        if (f) {
+            return f(func, args, (size_t)nargs, NULL);
+        }
+        #elif defined(__Pyx_CyFunction_USED) && CYTHON_BACKPORT_VECTORCALL
+        if (__Pyx_CyFunction_CheckExact(func)) {
+            __pyx_vectorcallfunc f = __Pyx_CyFunction_func_vectorcall(func);
+            if (f) return f(func, args, (size_t)nargs, NULL);
+        }
+        #endif
+    }
+    if (nargs == 0) {
+        /* No positional arguments: pass the shared empty tuple instead of
+           building a new one. */
+        return __Pyx_PyObject_Call(func, __pyx_empty_tuple, kwargs);
+    }
+    #if PY_VERSION_HEX >= 0x03090000 && !CYTHON_COMPILING_IN_LIMITED_API
+    return PyObject_VectorcallDict(func, args, (size_t)nargs, kwargs);
+    #else
+    return __Pyx_PyObject_FastCall_fallback(func, args, (size_t)nargs, kwargs);
+    #endif
+}
+
+/* RaiseUnexpectedTypeError */
+/*
+ * Raise TypeError "Expected <expected>, got <type name of obj>".
+ * Always returns 0.
+ */
+static int
+__Pyx_RaiseUnexpectedTypeError(const char *expected, PyObject *obj)
+{
+    __Pyx_TypeName actual_name;
+    actual_name = __Pyx_PyType_GetName(Py_TYPE(obj));
+    PyErr_Format(PyExc_TypeError,
+                 "Expected %s, got " __Pyx_FMT_TYPENAME,
+                 expected,
+                 actual_name);
+    /* The type name is only needed for the message. */
+    __Pyx_DECREF_TypeName(actual_name);
+    return 0;
+}
+
+/* CIntToDigits */
+/* All two-character decimal strings "00".."99" laid end to end; the pair
+   for value v (0..99) starts at offset 2*v. */
+static const char DIGIT_PAIRS_10[2*10*10+1] = {
+    "00010203040506070809"
+    "10111213141516171819"
+    "20212223242526272829"
+    "30313233343536373839"
+    "40414243444546474849"
+    "50515253545556575859"
+    "60616263646566676869"
+    "70717273747576777879"
+    "80818283848586878889"
+    "90919293949596979899"
+};
+/* All two-character octal strings "00".."77" laid end to end; the pair for
+   value v (0..63) starts at offset 2*v. */
+static const char DIGIT_PAIRS_8[2*8*8+1] = {
+    "0001020304050607"
+    "1011121314151617"
+    "2021222324252627"
+    "3031323334353637"
+    "4041424344454647"
+    "5051525354555657"
+    "6061626364656667"
+    "7071727374757677"
+};
+/* Hex digit alphabets: lowercase at offset 0, uppercase at offset 16. */
+static const char DIGITS_HEX[2*16+1] = {
+    "0123456789abcdef"
+    "0123456789ABCDEF"
+};
+
+/* BuildPyUnicode */
+/*
+ * Build a unicode string of `ulength` characters from the `clength`
+ * characters in `chars`, placed at the end and preceded by an optional '-'
+ * sign and `padding_char` fill.
+ *
+ * ulength       total length of the result.
+ * chars         source characters (not required to be NUL-terminated here;
+ *               exactly `clength` of them are read).
+ * clength       number of characters taken from `chars`.
+ * prepend_sign  non-zero to put '-' in front of the padding.
+ * padding_char  character used to fill the gap before `chars`.
+ *
+ * Returns a new unicode object, or NULL on failure.
+ */
+static PyObject* __Pyx_PyUnicode_BuildFromAscii(Py_ssize_t ulength, char* chars, int clength,
+                                                int prepend_sign, char padding_char) {
+    PyObject *uval;
+    /* Number of leading characters (sign and/or padding) before `chars`. */
+    Py_ssize_t uoffset = ulength - clength;
+#if CYTHON_USE_UNICODE_INTERNALS
+    /* Allocate the result once and write characters straight into it. */
+    Py_ssize_t i;
+#if CYTHON_PEP393_ENABLED
+    void *udata;
+    uval = PyUnicode_New(ulength, 127);
+    if (unlikely(!uval)) return NULL;
+    udata = PyUnicode_DATA(uval);
+#else
+    Py_UNICODE *udata;
+    uval = PyUnicode_FromUnicode(NULL, ulength);
+    if (unlikely(!uval)) return NULL;
+    udata = PyUnicode_AS_UNICODE(uval);
+#endif
+    if (uoffset > 0) {
+        i = 0;
+        if (prepend_sign) {
+            __Pyx_PyUnicode_WRITE(PyUnicode_1BYTE_KIND, udata, 0, '-');
+            i++;
+        }
+        for (; i < uoffset; i++) {
+            __Pyx_PyUnicode_WRITE(PyUnicode_1BYTE_KIND, udata, i, padding_char);
+        }
+    }
+    for (i=0; i < clength; i++) {
+        __Pyx_PyUnicode_WRITE(PyUnicode_1BYTE_KIND, udata, uoffset+i, chars[i]);
+    }
+#else
+    /* No access to unicode internals: assemble sign + padding + digits
+       through object-level operations. */
+    {
+        PyObject *sign = NULL, *padding = NULL;
+        uval = NULL;
+        if (uoffset > 0) {
+            prepend_sign = !!prepend_sign;
+            if (uoffset > prepend_sign) {
+                padding = PyUnicode_FromOrdinal(padding_char);
+                if (likely(padding) && uoffset > prepend_sign + 1) {
+                    /* More than one padding character: repeat it. */
+                    PyObject *tmp;
+                    PyObject *repeat = PyInt_FromSsize_t(uoffset - prepend_sign);
+                    if (unlikely(!repeat)) goto done_or_error;
+                    tmp = PyNumber_Multiply(padding, repeat);
+                    Py_DECREF(repeat);
+                    Py_DECREF(padding);
+                    padding = tmp;
+                }
+                if (unlikely(!padding)) goto done_or_error;
+            }
+            if (prepend_sign) {
+                sign = PyUnicode_FromOrdinal('-');
+                if (unlikely(!sign)) goto done_or_error;
+            }
+        }
+        uval = PyUnicode_DecodeASCII(chars, clength, NULL);
+        if (likely(uval) && padding) {
+            PyObject *tmp = PyNumber_Add(padding, uval);
+            Py_DECREF(uval);
+            uval = tmp;
+        }
+        if (likely(uval) && sign) {
+            PyObject *tmp = PyNumber_Add(sign, uval);
+            Py_DECREF(uval);
+            uval = tmp;
+        }
+        /* Shared exit: uval is NULL here if any step above failed. */
+done_or_error:
+        Py_XDECREF(padding);
+        Py_XDECREF(sign);
+    }
+#endif
+    return uval;
+}
+
+/* CIntToPyUnicode */
+/*
+ * Format the C `int` `value` as a unicode string.
+ *
+ * value         number to format.
+ * width         minimum length of the result; shorter output is padded on
+ *               the left.
+ * padding_char  fill character used for padding.
+ * format_char   'd' (decimal), 'o' (octal), 'x' or 'X' (lower/upper hex);
+ *               any other value hits assert(0).
+ *
+ * Returns a new unicode object, or NULL on failure.
+ */
+static CYTHON_INLINE PyObject* __Pyx_PyUnicode_From_int(int value, Py_ssize_t width, char padding_char, char format_char) {
+    /* Scratch buffer, filled from its end towards its start. */
+    char digits[sizeof(int)*3+2];
+    char *dpos, *end = digits + sizeof(int)*3+2;
+    const char *hex_digits = DIGITS_HEX;
+    Py_ssize_t length, ulength;
+    int prepend_sign, last_one_off;
+    int remaining;
+#ifdef __Pyx_HAS_GCC_DIAGNOSTIC
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wconversion"
+#endif
+    const int neg_one = (int) -1, const_zero = (int) 0;
+#ifdef __Pyx_HAS_GCC_DIAGNOSTIC
+#pragma GCC diagnostic pop
+#endif
+    /* Signedness probe: true only if -1 converted to the type is positive. */
+    const int is_unsigned = neg_one > const_zero;
+    if (format_char == 'X') {
+        /* Switch to the uppercase alphabet and reuse the 'x' path. */
+        hex_digits += 16;
+        format_char = 'x';
+    }
+    remaining = value;
+    last_one_off = 0;
+    dpos = end;
+    do {
+        int digit_pos;
+        switch (format_char) {
+        case 'o':
+            /* Two octal digits per iteration via the pair table. */
+            digit_pos = abs((int)(remaining % (8*8)));
+            remaining = (int) (remaining / (8*8));
+            dpos -= 2;
+            memcpy(dpos, DIGIT_PAIRS_8 + digit_pos * 2, 2);
+            /* Pair value below 8 means its first character is '0'. */
+            last_one_off = (digit_pos < 8);
+            break;
+        case 'd':
+            /* Two decimal digits per iteration via the pair table. */
+            digit_pos = abs((int)(remaining % (10*10)));
+            remaining = (int) (remaining / (10*10));
+            dpos -= 2;
+            memcpy(dpos, DIGIT_PAIRS_10 + digit_pos * 2, 2);
+            /* Pair value below 10 means its first character is '0'. */
+            last_one_off = (digit_pos < 10);
+            break;
+        case 'x':
+            /* One hex digit per iteration. */
+            *(--dpos) = hex_digits[abs((int)(remaining % 16))];
+            remaining = (int) (remaining / 16);
+            break;
+        default:
+            assert(0);
+            break;
+        }
+    } while (unlikely(remaining != 0));
+    /* Skip the leading '0' of the most significant pair, if there is one. */
+    assert(!last_one_off || *dpos == '0');
+    dpos += last_one_off;
+    length = end - dpos;
+    ulength = length;
+    prepend_sign = 0;
+    if (!is_unsigned && value <= neg_one) {
+        if (padding_char == ' ' || width <= length + 1) {
+            /* Sign sits directly in front of the digits in the buffer. */
+            *(--dpos) = '-';
+            ++length;
+        } else {
+            /* Padding goes between sign and digits; the builder emits '-'. */
+            prepend_sign = 1;
+        }
+        ++ulength;
+    }
+    if (width > ulength) {
+        ulength = width;
+    }
+    if (ulength == 1) {
+        /* Single-character result: no buffer assembly needed. */
+        return PyUnicode_FromOrdinal(*dpos);
+    }
+    return __Pyx_PyUnicode_BuildFromAscii(ulength, dpos, (int) length, prepend_sign, padding_char);
+}
+
+/* CIntToPyUnicode */
+/*
+ * Format the `Py_ssize_t` `value` as a unicode string.
+ *
+ * value         number to format.
+ * width         minimum length of the result; shorter output is padded on
+ *               the left.
+ * padding_char  fill character used for padding.
+ * format_char   'd' (decimal), 'o' (octal), 'x' or 'X' (lower/upper hex);
+ *               any other value hits assert(0).
+ *
+ * Returns a new unicode object, or NULL on failure.
+ */
+static CYTHON_INLINE PyObject* __Pyx_PyUnicode_From_Py_ssize_t(Py_ssize_t value, Py_ssize_t width, char padding_char, char format_char) {
+    /* Scratch buffer, filled from its end towards its start. */
+    char digits[sizeof(Py_ssize_t)*3+2];
+    char *dpos, *end = digits + sizeof(Py_ssize_t)*3+2;
+    const char *hex_digits = DIGITS_HEX;
+    Py_ssize_t length, ulength;
+    int prepend_sign, last_one_off;
+    Py_ssize_t remaining;
+#ifdef __Pyx_HAS_GCC_DIAGNOSTIC
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wconversion"
+#endif
+    const Py_ssize_t neg_one = (Py_ssize_t) -1, const_zero = (Py_ssize_t) 0;
+#ifdef __Pyx_HAS_GCC_DIAGNOSTIC
+#pragma GCC diagnostic pop
+#endif
+    /* Signedness probe: true only if -1 converted to the type is positive. */
+    const int is_unsigned = neg_one > const_zero;
+    if (format_char == 'X') {
+        /* Switch to the uppercase alphabet and reuse the 'x' path. */
+        hex_digits += 16;
+        format_char = 'x';
+    }
+    remaining = value;
+    last_one_off = 0;
+    dpos = end;
+    do {
+        int digit_pos;
+        switch (format_char) {
+        case 'o':
+            /* Two octal digits per iteration via the pair table. */
+            digit_pos = abs((int)(remaining % (8*8)));
+            remaining = (Py_ssize_t) (remaining / (8*8));
+            dpos -= 2;
+            memcpy(dpos, DIGIT_PAIRS_8 + digit_pos * 2, 2);
+            /* Pair value below 8 means its first character is '0'. */
+            last_one_off = (digit_pos < 8);
+            break;
+        case 'd':
+            /* Two decimal digits per iteration via the pair table. */
+            digit_pos = abs((int)(remaining % (10*10)));
+            remaining = (Py_ssize_t) (remaining / (10*10));
+            dpos -= 2;
+            memcpy(dpos, DIGIT_PAIRS_10 + digit_pos * 2, 2);
+            /* Pair value below 10 means its first character is '0'. */
+            last_one_off = (digit_pos < 10);
+            break;
+        case 'x':
+            /* One hex digit per iteration. */
+            *(--dpos) = hex_digits[abs((int)(remaining % 16))];
+            remaining = (Py_ssize_t) (remaining / 16);
+            break;
+        default:
+            assert(0);
+            break;
+        }
+    } while (unlikely(remaining != 0));
+    /* Skip the leading '0' of the most significant pair, if there is one. */
+    assert(!last_one_off || *dpos == '0');
+    dpos += last_one_off;
+    length = end - dpos;
+    ulength = length;
+    prepend_sign = 0;
+    if (!is_unsigned && value <= neg_one) {
+        if (padding_char == ' ' || width <= length + 1) {
+            /* Sign sits directly in front of the digits in the buffer. */
+            *(--dpos) = '-';
+            ++length;
+        } else {
+            /* Padding goes between sign and digits; the builder emits '-'. */
+            prepend_sign = 1;
+        }
+        ++ulength;
+    }
+    if (width > ulength) {
+        ulength = width;
+    }
+    if (ulength == 1) {
+        /* Single-character result: no buffer assembly needed. */
+        return PyUnicode_FromOrdinal(*dpos);
+    }
+    return __Pyx_PyUnicode_BuildFromAscii(ulength, dpos, (int) length, prepend_sign, padding_char);
+}
+
+/* JoinPyUnicode */
+/*
+ * Concatenate the first `value_count` items of `value_tuple` (treated as
+ * unicode objects) into one new string.
+ *
+ * value_tuple:     tuple holding the pieces to join.
+ * value_count:     number of leading tuple slots to copy.
+ * result_ulength:  character length allocated for the result.
+ * max_char:        largest code point; selects the result's storage kind.
+ *
+ * Returns a new reference, or NULL on failure.  When the fast path is
+ * disabled by the build flags, the work is delegated to PyUnicode_Join()
+ * with __pyx_empty_unicode as the separator.
+ */
+static PyObject* __Pyx_PyUnicode_Join(PyObject* value_tuple, Py_ssize_t value_count, Py_ssize_t result_ulength,
+                                      Py_UCS4 max_char) {
+#if CYTHON_USE_UNICODE_INTERNALS && CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS
+    PyObject *result_uval;
+    int result_ukind, kind_shift;
+    Py_ssize_t i, char_pos;
+    void *result_udata;
+    CYTHON_MAYBE_UNUSED_VAR(max_char);
+#if CYTHON_PEP393_ENABLED
+    result_uval = PyUnicode_New(result_ulength, max_char);
+    if (unlikely(!result_uval)) return NULL;
+    result_ukind = (max_char <= 255) ? PyUnicode_1BYTE_KIND : (max_char <= 65535) ? PyUnicode_2BYTE_KIND : PyUnicode_4BYTE_KIND;
+    /* kind_shift converts a character count into a byte count (see memcpy below) */
+    kind_shift = (result_ukind == PyUnicode_4BYTE_KIND) ? 2 : result_ukind - 1;
+    result_udata = PyUnicode_DATA(result_uval);
+#else
+    result_uval = PyUnicode_FromUnicode(NULL, result_ulength);
+    if (unlikely(!result_uval)) return NULL;
+    result_ukind = sizeof(Py_UNICODE);
+    kind_shift = (result_ukind == 4) ? 2 : result_ukind - 1;
+    result_udata = PyUnicode_AS_UNICODE(result_uval);
+#endif
+    assert(kind_shift == 2 || kind_shift == 1 || kind_shift == 0);
+    /* char_pos is the write cursor into the result, counted in characters */
+    char_pos = 0;
+    for (i=0; i < value_count; i++) {
+        int ukind;
+        Py_ssize_t ulength;
+        void *udata;
+        PyObject *uval = PyTuple_GET_ITEM(value_tuple, i);
+        if (unlikely(__Pyx_PyUnicode_READY(uval)))
+            goto bad;
+        ulength = __Pyx_PyUnicode_GET_LENGTH(uval);
+        /* empty pieces contribute nothing */
+        if (unlikely(!ulength))
+            continue;
+        /* make sure (char_pos + ulength) << kind_shift cannot exceed PY_SSIZE_T_MAX */
+        if (unlikely((PY_SSIZE_T_MAX >> kind_shift) - ulength < char_pos))
+            goto overflow;
+        ukind = __Pyx_PyUnicode_KIND(uval);
+        udata = __Pyx_PyUnicode_DATA(uval);
+        if (!CYTHON_PEP393_ENABLED || ukind == result_ukind) {
+            /* identical storage kind: a raw byte copy is enough */
+            memcpy((char *)result_udata + (char_pos << kind_shift), udata, (size_t) (ulength << kind_shift));
+        } else {
+            /* storage kinds differ: copy via the unicode API or character by character */
+            #if PY_VERSION_HEX >= 0x030d0000
+            if (unlikely(PyUnicode_CopyCharacters(result_uval, char_pos, uval, 0, ulength) < 0)) goto bad;
+            #elif CYTHON_COMPILING_IN_CPYTHON && PY_VERSION_HEX >= 0x030300F0 || defined(_PyUnicode_FastCopyCharacters)
+            _PyUnicode_FastCopyCharacters(result_uval, char_pos, uval, 0, ulength);
+            #else
+            Py_ssize_t j;
+            for (j=0; j < ulength; j++) {
+                Py_UCS4 uchar = __Pyx_PyUnicode_READ(ukind, udata, j);
+                __Pyx_PyUnicode_WRITE(result_ukind, result_udata, char_pos+j, uchar);
+            }
+            #endif
+        }
+        char_pos += ulength;
+    }
+    return result_uval;
+overflow:
+    PyErr_SetString(PyExc_OverflowError, "join() result is too long for a Python string");
+    /* falls through to release the partially filled result */
+bad:
+    Py_DECREF(result_uval);
+    return NULL;
+#else
+    CYTHON_UNUSED_VAR(max_char);
+    CYTHON_UNUSED_VAR(result_ulength);
+    CYTHON_UNUSED_VAR(value_count);
+    return PyUnicode_Join(__pyx_empty_unicode, value_tuple);
+#endif
+}
+
+/* GetAttr */
+/*
+ * Look up attribute `n` on object `o`.
+ *
+ * With CYTHON_USE_TYPE_SLOTS, a string name (unicode on Python 3, PyString
+ * on Python 2) is routed through __Pyx_PyObject_GetAttrStr(); every other
+ * case uses PyObject_GetAttr().  Returns whatever the chosen lookup returns.
+ */
+static CYTHON_INLINE PyObject *__Pyx_GetAttr(PyObject *o, PyObject *n) {
+#if CYTHON_USE_TYPE_SLOTS
+#if PY_MAJOR_VERSION >= 3
+    if (likely(PyUnicode_Check(n)))
+#else
+    if (likely(PyString_Check(n)))
+#endif
+        return __Pyx_PyObject_GetAttrStr(o, n);
+#endif
+    return PyObject_GetAttr(o, n);
+}
+
+/* GetItemInt */
+/*
+ * Subscript `o` with the index object `j` via PyObject_GetItem().
+ *
+ * The reference to `j` is consumed.  A NULL `j` (e.g. a failed integer
+ * conversion by the caller) yields NULL without touching `o`.
+ */
+static PyObject *__Pyx_GetItemInt_Generic(PyObject *o, PyObject* j) {
+    PyObject *item = NULL;
+    if (likely(j)) {
+        item = PyObject_GetItem(o, j);
+        Py_DECREF(j);
+    }
+    return item;
+}
+/*
+ * Fetch item `i` of list `o`.
+ *
+ * wraparound:  when non-zero, a negative `i` is offset by the list size.
+ * boundscheck: when zero, the (wrapped) index is used without validation.
+ *
+ * Returns a new reference.  If the bounds check fails, the lookup is retried
+ * through __Pyx_GetItemInt_Generic() using the original, unwrapped index.
+ * Builds without safe macros / borrowed refs use PySequence_GetItem().
+ */
+static CYTHON_INLINE PyObject *__Pyx_GetItemInt_List_Fast(PyObject *o, Py_ssize_t i,
+                                                              CYTHON_NCP_UNUSED int wraparound,
+                                                              CYTHON_NCP_UNUSED int boundscheck) {
+#if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS
+    Py_ssize_t wrapped_i = i;
+    /* note the bitwise '&': both operands are always evaluated */
+    if (wraparound & unlikely(i < 0)) {
+        wrapped_i += PyList_GET_SIZE(o);
+    }
+    if ((!boundscheck) || likely(__Pyx_is_valid_index(wrapped_i, PyList_GET_SIZE(o)))) {
+        /* PyList_GET_ITEM hands back a borrowed reference; own it before returning */
+        PyObject *r = PyList_GET_ITEM(o, wrapped_i);
+        Py_INCREF(r);
+        return r;
+    }
+    return __Pyx_GetItemInt_Generic(o, PyInt_FromSsize_t(i));
+#else
+    return PySequence_GetItem(o, i);
+#endif
+}
+/*
+ * Fetch item `i` of tuple `o`.
+ *
+ * wraparound:  when non-zero, a negative `i` is offset by the tuple size.
+ * boundscheck: when zero, the (wrapped) index is used without validation.
+ *
+ * Returns a new reference.  If the bounds check fails, the lookup is retried
+ * through __Pyx_GetItemInt_Generic() using the original, unwrapped index.
+ * Builds without safe macros / borrowed refs use PySequence_GetItem().
+ */
+static CYTHON_INLINE PyObject *__Pyx_GetItemInt_Tuple_Fast(PyObject *o, Py_ssize_t i,
+                                                              CYTHON_NCP_UNUSED int wraparound,
+                                                              CYTHON_NCP_UNUSED int boundscheck) {
+#if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS
+    Py_ssize_t wrapped_i = i;
+    /* note the bitwise '&': both operands are always evaluated */
+    if (wraparound & unlikely(i < 0)) {
+        wrapped_i += PyTuple_GET_SIZE(o);
+    }
+    if ((!boundscheck) || likely(__Pyx_is_valid_index(wrapped_i, PyTuple_GET_SIZE(o)))) {
+        /* PyTuple_GET_ITEM hands back a borrowed reference; own it before returning */
+        PyObject *r = PyTuple_GET_ITEM(o, wrapped_i);
+        Py_INCREF(r);
+        return r;
+    }
+    return __Pyx_GetItemInt_Generic(o, PyInt_FromSsize_t(i));
+#else
+    return PySequence_GetItem(o, i);
+#endif
+}
+/*
+ * Subscript an arbitrary object `o` with the C integer `i`.
+ *
+ * is_list:     caller asserts that `o` is a list (skips the exact-type check).
+ * wraparound:  when non-zero, negative indices are offset by the length.
+ * boundscheck: when zero, list/tuple indices are used without validation.
+ *
+ * Dispatch order on the type-slot build: exact list, exact tuple, the type's
+ * mp_subscript slot, then its sq_item slot.  Anything not handled there
+ * (including a failed list/tuple bounds check) ends in
+ * __Pyx_GetItemInt_Generic().  Returns a new reference or NULL.
+ */
+static CYTHON_INLINE PyObject *__Pyx_GetItemInt_Fast(PyObject *o, Py_ssize_t i, int is_list,
+                                                     CYTHON_NCP_UNUSED int wraparound,
+                                                     CYTHON_NCP_UNUSED int boundscheck) {
+#if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS && CYTHON_USE_TYPE_SLOTS
+    if (is_list || PyList_CheckExact(o)) {
+        /* n is the index after optional wraparound */
+        Py_ssize_t n = ((!wraparound) | likely(i >= 0)) ? i : i + PyList_GET_SIZE(o);
+        if ((!boundscheck) || (likely(__Pyx_is_valid_index(n, PyList_GET_SIZE(o))))) {
+            PyObject *r = PyList_GET_ITEM(o, n);
+            Py_INCREF(r);
+            return r;
+        }
+    }
+    else if (PyTuple_CheckExact(o)) {
+        Py_ssize_t n = ((!wraparound) | likely(i >= 0)) ? i : i + PyTuple_GET_SIZE(o);
+        if ((!boundscheck) || likely(__Pyx_is_valid_index(n, PyTuple_GET_SIZE(o)))) {
+            PyObject *r = PyTuple_GET_ITEM(o, n);
+            Py_INCREF(r);
+            return r;
+        }
+    } else {
+        PyMappingMethods *mm = Py_TYPE(o)->tp_as_mapping;
+        PySequenceMethods *sm = Py_TYPE(o)->tp_as_sequence;
+        /* the mapping slot takes precedence and needs a Python int key */
+        if (mm && mm->mp_subscript) {
+            PyObject *r, *key = PyInt_FromSsize_t(i);
+            if (unlikely(!key)) return NULL;
+            r = mm->mp_subscript(o, key);
+            Py_DECREF(key);
+            return r;
+        }
+        if (likely(sm && sm->sq_item)) {
+            if (wraparound && unlikely(i < 0) && likely(sm->sq_length)) {
+                Py_ssize_t l = sm->sq_length(o);
+                if (likely(l >= 0)) {
+                    i += l;
+                } else {
+                    /* sq_length failed: only an OverflowError is ignored, and
+                       the index is then passed on unadjusted */
+                    if (!PyErr_ExceptionMatches(PyExc_OverflowError))
+                        return NULL;
+                    PyErr_Clear();
+                }
+            }
+            return sm->sq_item(o, i);
+        }
+    }
+#else
+    if (is_list || !PyMapping_Check(o)) {
+        return PySequence_GetItem(o, i);
+    }
+#endif
+    return __Pyx_GetItemInt_Generic(o, PyInt_FromSsize_t(i));
+}
+
+/* PyObjectCallOneArg */
+/*
+ * Call `func` with the single positional argument `arg` and return the
+ * result of __Pyx_PyObject_FastCall().
+ *
+ * The argument vector is handed over starting at its second slot, together
+ * with the __Pyx_PY_VECTORCALL_ARGUMENTS_OFFSET flag; the unused first slot
+ * is initialised to NULL.
+ */
+static CYTHON_INLINE PyObject* __Pyx_PyObject_CallOneArg(PyObject *func, PyObject *arg) {
+    PyObject *call_args[2];
+    call_args[0] = NULL;
+    call_args[1] = arg;
+    return __Pyx_PyObject_FastCall(func, &call_args[1], 1 | __Pyx_PY_VECTORCALL_ARGUMENTS_OFFSET);
+}
+
+/* ObjectGetItem */
+#if CYTHON_USE_TYPE_SLOTS
+/*
+ * Subscript `obj` with an index-like object by first converting `index` to
+ * a Py_ssize_t.
+ *
+ * On successful conversion the lookup goes through __Pyx_GetItemInt_Fast()
+ * with wraparound and bounds checking enabled.  If the conversion raised
+ * OverflowError, it is replaced by an IndexError naming the index type; any
+ * other conversion error is left in place.  Returns NULL on error.
+ */
+static PyObject *__Pyx_PyObject_GetIndex(PyObject *obj, PyObject *index) {
+    PyObject *pending = NULL;
+    Py_ssize_t position = __Pyx_PyIndex_AsSsize_t(index);
+    /* -1 is ambiguous: it is either a real index or the error marker */
+    if (unlikely(position == -1)) {
+        pending = PyErr_Occurred();
+    }
+    if (likely(!pending)) {
+        return __Pyx_GetItemInt_Fast(obj, position, 0, 1, 1);
+    }
+    if (PyErr_GivenExceptionMatches(pending, PyExc_OverflowError)) {
+        __Pyx_TypeName index_type_name = __Pyx_PyType_GetName(Py_TYPE(index));
+        PyErr_Clear();
+        PyErr_Format(PyExc_IndexError,
+            "cannot fit '" __Pyx_FMT_TYPENAME "' into an index-sized integer", index_type_name);
+        __Pyx_DECREF_TypeName(index_type_name);
+    }
+    return NULL;
+}
+/*
+ * Last-resort subscription for objects that have neither a mapping nor a
+ * sequence item slot.
+ *
+ * If `obj` is a type that exposes the attribute named by
+ * __pyx_n_s_class_getitem, that attribute is called with `key` and its
+ * result returned.  Otherwise a TypeError ("... object is not
+ * subscriptable") is set and NULL is returned.
+ */
+static PyObject *__Pyx_PyObject_GetItem_Slow(PyObject *obj, PyObject *key) {
+    __Pyx_TypeName obj_type_name;
+    if (likely(PyType_Check(obj))) {
+        PyObject *class_getitem = __Pyx_PyObject_GetAttrStrNoError(obj, __pyx_n_s_class_getitem);
+        if (class_getitem) {
+            PyObject *outcome = __Pyx_PyObject_CallOneArg(class_getitem, key);
+            Py_DECREF(class_getitem);
+            return outcome;
+        }
+        /* discard whatever the failed lookup may have left behind */
+        PyErr_Clear();
+    }
+    obj_type_name = __Pyx_PyType_GetName(Py_TYPE(obj));
+    PyErr_Format(PyExc_TypeError,
+        "'" __Pyx_FMT_TYPENAME "' object is not subscriptable", obj_type_name);
+    __Pyx_DECREF_TypeName(obj_type_name);
+    return NULL;
+}
+/*
+ * Evaluate `obj[key]` by dispatching directly on the type's slots:
+ * mp_subscript first, then sq_item (through __Pyx_PyObject_GetIndex), and
+ * finally the slow path in __Pyx_PyObject_GetItem_Slow().
+ */
+static PyObject *__Pyx_PyObject_GetItem(PyObject *obj, PyObject *key) {
+    PyTypeObject *obj_type = Py_TYPE(obj);
+    PyMappingMethods *as_mapping = obj_type->tp_as_mapping;
+    PySequenceMethods *as_sequence;
+    if (likely(as_mapping && as_mapping->mp_subscript)) {
+        return as_mapping->mp_subscript(obj, key);
+    }
+    as_sequence = obj_type->tp_as_sequence;
+    if (likely(as_sequence && as_sequence->sq_item)) {
+        return __Pyx_PyObject_GetIndex(obj, key);
+    }
+    return __Pyx_PyObject_GetItem_Slow(obj, key);
+}
+#endif
+
+/* KeywordStringCheck */
+/*
+ * Validate the keyword arguments passed to `function_name`.
+ *
+ * kw:            keyword container; handled either as a tuple of names
+ *                (when CYTHON_METH_FASTCALL is set and kw is a tuple) or
+ *                iterated with PyDict_Next().
+ * function_name: used only in error messages.
+ * kw_allowed:    when zero, the presence of any keyword is an error.
+ *
+ * Returns 1 when the keywords are acceptable, 0 otherwise.  The
+ * invalid_keyword / invalid_keyword_type exits set a TypeError first.
+ */
+static int __Pyx_CheckKeywordStrings(
+    PyObject *kw,
+    const char* function_name,
+    int kw_allowed)
+{
+    PyObject* key = 0;
+    Py_ssize_t pos = 0;
+#if CYTHON_COMPILING_IN_PYPY
+    /* PyPy: only check whether a keyword exists at all; key types are not inspected */
+    if (!kw_allowed && PyDict_Next(kw, &pos, &key, 0))
+        goto invalid_keyword;
+    return 1;
+#else
+    if (CYTHON_METH_FASTCALL && likely(PyTuple_Check(kw))) {
+        Py_ssize_t kwsize;
+#if CYTHON_ASSUME_SAFE_MACROS
+        kwsize = PyTuple_GET_SIZE(kw);
+#else
+        kwsize = PyTuple_Size(kw);
+        if (kwsize < 0) return 0;
+#endif
+        if (unlikely(kwsize == 0))
+            return 1;
+        if (!kw_allowed) {
+            /* report the first keyword name as the unexpected one */
+#if CYTHON_ASSUME_SAFE_MACROS
+            key = PyTuple_GET_ITEM(kw, 0);
+#else
+            key = PyTuple_GetItem(kw, pos);
+            if (!key) return 0;
+#endif
+            goto invalid_keyword;
+        }
+#if PY_VERSION_HEX < 0x03090000
+        /* the per-name type check is only compiled for Python < 3.9 */
+        for (pos = 0; pos < kwsize; pos++) {
+#if CYTHON_ASSUME_SAFE_MACROS
+            key = PyTuple_GET_ITEM(kw, pos);
+#else
+            key = PyTuple_GetItem(kw, pos);
+            if (!key) return 0;
+#endif
+            if (unlikely(!PyUnicode_Check(key)))
+                goto invalid_keyword_type;
+        }
+#endif
+        return 1;
+    }
+    /* dict path: every key must be a string (str or unicode on Python 2) */
+    while (PyDict_Next(kw, &pos, &key, 0)) {
+        #if PY_MAJOR_VERSION < 3
+        if (unlikely(!PyString_Check(key)))
+        #endif
+            if (unlikely(!PyUnicode_Check(key)))
+                goto invalid_keyword_type;
+    }
+    /* key is still NULL here only if the loop above never produced an entry */
+    if (!kw_allowed && unlikely(key))
+        goto invalid_keyword;
+    return 1;
+invalid_keyword_type:
+    PyErr_Format(PyExc_TypeError,
+        "%.200s() keywords must be strings", function_name);
+    return 0;
+#endif
+invalid_keyword:
+    #if PY_MAJOR_VERSION < 3
+    PyErr_Format(PyExc_TypeError,
+        "%.200s() got an unexpected keyword argument '%.200s'",
+        function_name, PyString_AsString(key));
+    #else
+    PyErr_Format(PyExc_TypeError,
+        "%s() got an unexpected keyword argument '%U'",
+        function_name, key);
+    #endif
+    return 0;
+}
+
+/* DivInt[Py_ssize_t] */
+/*
+ * Floor division of `a` by `b` for Py_ssize_t operands.
+ *
+ * C's `/` truncates toward zero; when the division is inexact and the
+ * remainder's sign differs from the divisor's, the truncated quotient is one
+ * too large and is decremented.  `b` must be non-zero (not checked here).
+ */
+static CYTHON_INLINE Py_ssize_t __Pyx_div_Py_ssize_t(Py_ssize_t a, Py_ssize_t b) {
+    Py_ssize_t quotient = a / b;
+    Py_ssize_t remainder = a - quotient * b;
+    /* (remainder ^ b) < 0 holds exactly when the two signs differ */
+    if ((remainder != 0) && ((remainder ^ b) < 0)) {
+        quotient -= 1;
+    }
+    return quotient;
+}
+
+/* GetAttr3 */
+#if __PYX_LIMITED_VERSION_HEX < 0x030d00A1
+/*
+ * Error-path helper for __Pyx_GetAttr3(): if the pending exception matches
+ * AttributeError, clear it and return a new reference to the default `d`;
+ * otherwise leave the exception in place and return NULL.
+ */
+static PyObject *__Pyx_GetAttr3Default(PyObject *d) {
+    __Pyx_PyThreadState_declare
+    __Pyx_PyThreadState_assign
+    if (unlikely(!__Pyx_PyErr_ExceptionMatches(PyExc_AttributeError)))
+        return NULL;
+    __Pyx_PyErr_Clear();
+    Py_INCREF(d);
+    return d;
+}
+#endif
+/*
+ * Three-argument attribute lookup: return attribute `n` of `o`, or a new
+ * reference to the default `d` when the attribute is missing.
+ *
+ * From (limited-API) version 0x030d00A1 on, this is PyObject_GetOptionalAttr().
+ * Earlier versions try the no-error string lookup first (type-slot builds)
+ * and otherwise call PyObject_GetAttr(), handing failures to
+ * __Pyx_GetAttr3Default().  Returns NULL when an error other than the
+ * missing attribute is pending.
+ */
+static CYTHON_INLINE PyObject *__Pyx_GetAttr3(PyObject *o, PyObject *n, PyObject *d) {
+    PyObject *r;
+#if __PYX_LIMITED_VERSION_HEX >= 0x030d00A1
+    /* res == 0 selects the default; for any other res, r is returned as stored */
+    int res = PyObject_GetOptionalAttr(o, n, &r);
+    return (res != 0) ? r : __Pyx_NewRef(d);
+#else
+  #if CYTHON_USE_TYPE_SLOTS
+    if (likely(PyString_Check(n))) {
+        r = __Pyx_PyObject_GetAttrStrNoError(o, n);
+        /* NULL without a pending error is treated as "attribute missing" */
+        if (unlikely(!r) && likely(!PyErr_Occurred())) {
+            r = __Pyx_NewRef(d);
+        }
+        return r;
+    }
+  #endif
+    r = PyObject_GetAttr(o, n);
+    return (likely(r)) ? r : __Pyx_GetAttr3Default(d);
+#endif
+}
+
+/* PyDictVersioning */
+#if CYTHON_USE_DICT_VERSIONS && CYTHON_USE_TYPE_SLOTS
+/*
+ * Return the dict version of the tp_dict belonging to the type of `obj`,
+ * or 0 when that type has no tp_dict.
+ */
+static CYTHON_INLINE PY_UINT64_T __Pyx_get_tp_dict_version(PyObject *obj) {
+    PyObject *type_dict = Py_TYPE(obj)->tp_dict;
+    if (unlikely(!type_dict)) {
+        return 0;
+    }
+    return __PYX_GET_DICT_VERSION(type_dict);
+}
+/*
+ * Return the dict version of the instance dict of `obj`, or 0 when the type
+ * declares no dict offset or the dict slot is currently empty.
+ */
+static CYTHON_INLINE PY_UINT64_T __Pyx_get_object_dict_version(PyObject *obj) {
+    PyObject **dictptr = NULL;
+    Py_ssize_t offset = Py_TYPE(obj)->tp_dictoffset;
+    if (offset) {
+#if CYTHON_COMPILING_IN_CPYTHON
+        /* positive offsets are applied directly; others go through _PyObject_GetDictPtr() */
+        dictptr = (likely(offset > 0)) ? (PyObject **) ((char *)obj + offset) : _PyObject_GetDictPtr(obj);
+#else
+        dictptr = _PyObject_GetDictPtr(obj);
+#endif
+    }
+    return (dictptr && *dictptr) ? __PYX_GET_DICT_VERSION(*dictptr) : 0;
+}
+/*
+ * Report whether both cached dict versions are still current for `obj`.
+ *
+ * Returns non-zero only if the type has a tp_dict whose version equals
+ * `tp_dict_version` and the instance dict version equals `obj_dict_version`.
+ */
+static CYTHON_INLINE int __Pyx_object_dict_version_matches(PyObject* obj, PY_UINT64_T tp_dict_version, PY_UINT64_T obj_dict_version) {
+    PyObject *type_dict = Py_TYPE(obj)->tp_dict;
+    if (likely(type_dict) && likely(tp_dict_version == __PYX_GET_DICT_VERSION(type_dict))) {
+        return obj_dict_version == __Pyx_get_object_dict_version(obj);
+    }
+    return 0;
+}
+#endif
+
+/* GetModuleGlobalName */
+/*
+ * Resolve the global `name`: look in the module namespace first and fall
+ * back to __Pyx_GetBuiltinName() when it is not found there.
+ *
+ * With CYTHON_USE_DICT_VERSIONS the signature carries two extra out
+ * parameters that are passed to __PYX_UPDATE_DICT_CACHE.
+ * Returns a new reference, or NULL on error.
+ */
+#if CYTHON_USE_DICT_VERSIONS
+static PyObject *__Pyx__GetModuleGlobalName(PyObject *name, PY_UINT64_T *dict_version, PyObject **dict_cached_value)
+#else
+static CYTHON_INLINE PyObject *__Pyx__GetModuleGlobalName(PyObject *name)
+#endif
+{
+    PyObject *result;
+#if !CYTHON_AVOID_BORROWED_REFS
+#if CYTHON_COMPILING_IN_CPYTHON && PY_VERSION_HEX >= 0x030500A1 && PY_VERSION_HEX < 0x030d0000
+    /* reuses the hash stored in the name object instead of rehashing */
+    result = _PyDict_GetItem_KnownHash(__pyx_d, name, ((PyASCIIObject *) name)->hash);
+    __PYX_UPDATE_DICT_CACHE(__pyx_d, result, *dict_cached_value, *dict_version)
+    if (likely(result)) {
+        return __Pyx_NewRef(result);
+    } else if (unlikely(PyErr_Occurred())) {
+        return NULL;
+    }
+#elif CYTHON_COMPILING_IN_LIMITED_API
+    if (unlikely(!__pyx_m)) {
+        return NULL;
+    }
+    result = PyObject_GetAttr(__pyx_m, name);
+    if (likely(result)) {
+        return result;
+    }
+#else
+    result = PyDict_GetItem(__pyx_d, name);
+    __PYX_UPDATE_DICT_CACHE(__pyx_d, result, *dict_cached_value, *dict_version)
+    if (likely(result)) {
+        return __Pyx_NewRef(result);
+    }
+#endif
+#else
+    /* NOTE(review): PyObject_GetItem() already returns a new reference, so the
+       extra __Pyx_NewRef() below looks like it leaks one - confirm upstream. */
+    result = PyObject_GetItem(__pyx_d, name);
+    __PYX_UPDATE_DICT_CACHE(__pyx_d, result, *dict_cached_value, *dict_version)
+    if (likely(result)) {
+        return __Pyx_NewRef(result);
+    }
+    PyErr_Clear();
+#endif
+    return __Pyx_GetBuiltinName(name);
+}
+
+/* RaiseTooManyValuesToUnpack */
+/*
+ * Set a ValueError carrying the "too many values to unpack" message, with
+ * `expected` reported as the expected item count.
+ */
+static CYTHON_INLINE void __Pyx_RaiseTooManyValuesError(Py_ssize_t expected) {
+    PyObject *exc_class = PyExc_ValueError;
+    PyErr_Format(
+        exc_class,
+        "too many values to unpack (expected %" CYTHON_FORMAT_SSIZE_T "d)",
+        expected);
+}
+
+/* RaiseNeedMoreValuesToUnpack */
+/*
+ * Set a ValueError carrying the "need more than N value(s) to unpack"
+ * message, where N is `index`; the noun is singular only for N == 1.
+ */
+static CYTHON_INLINE void __Pyx_RaiseNeedMoreValuesError(Py_ssize_t index) {
+    const char *plural_suffix = "s";
+    if (index == 1) {
+        plural_suffix = "";
+    }
+    PyErr_Format(PyExc_ValueError,
+                 "need more than %" CYTHON_FORMAT_SSIZE_T "d value%.1s to unpack",
+                 index, plural_suffix);
+}
+
+/* RaiseNoneIterError */
+/* Set the TypeError used when None is found where an iterable is required. */
+static CYTHON_INLINE void __Pyx_RaiseNoneNotIterableError(void) {
+    const char *message = "'NoneType' object is not iterable";
+    PyErr_SetString(PyExc_TypeError, message);
+}
+
+/* ExtTypeTest */
+/*
+ * Check that `obj` passes __Pyx_TypeCheck() against `type`.
+ *
+ * Returns 1 on success.  Returns 0 with an exception set otherwise:
+ * SystemError when `type` is NULL, TypeError ("Cannot convert X to Y") when
+ * the object is of the wrong type.
+ */
+static CYTHON_INLINE int __Pyx_TypeTest(PyObject *obj, PyTypeObject *type) {
+    __Pyx_TypeName actual_name;
+    __Pyx_TypeName wanted_name;
+    if (likely(type)) {
+        if (likely(__Pyx_TypeCheck(obj, type)))
+            return 1;
+        actual_name = __Pyx_PyType_GetName(Py_TYPE(obj));
+        wanted_name = __Pyx_PyType_GetName(type);
+        PyErr_Format(PyExc_TypeError,
+                     "Cannot convert " __Pyx_FMT_TYPENAME " to " __Pyx_FMT_TYPENAME,
+                     actual_name, wanted_name);
+        __Pyx_DECREF_TypeName(actual_name);
+        __Pyx_DECREF_TypeName(wanted_name);
+        return 0;
+    }
+    PyErr_SetString(PyExc_SystemError, "Missing type object");
+    return 0;
+}
+
+/* GetTopmostException */
+#if CYTHON_USE_EXC_INFO_STACK && CYTHON_FAST_THREAD_STATE
+/*
+ * Walk the thread's exc_info chain and return the first entry whose
+ * exc_value is neither NULL nor None.  If no such entry exists, the last
+ * entry of the chain (the one without a previous_item) is returned.
+ */
+static _PyErr_StackItem *
+__Pyx_PyErr_GetTopmostException(PyThreadState *tstate)
+{
+    _PyErr_StackItem *entry = tstate->exc_info;
+    for (;;) {
+        PyObject *active = entry->exc_value;
+        if (active != NULL && active != Py_None)
+            break;
+        if (entry->previous_item == NULL)
+            break;
+        entry = entry->previous_item;
+    }
+    return entry;
+}
+#endif
+
+/* SaveResetException */
+#if CYTHON_FAST_THREAD_STATE
+/*
+ * Copy the thread's handled-exception state (exc_info / exc_* fields, not
+ * the pending error) into *type, *value and *tb.
+ *
+ * Each output is either NULL or a reference the caller now owns.  On
+ * Python >= 0x030B00a4 with the exc_info stack, only exc_value is stored by
+ * the interpreter, so type and traceback are derived from it.
+ */
+static CYTHON_INLINE void __Pyx__ExceptionSave(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb) {
+  #if CYTHON_USE_EXC_INFO_STACK && PY_VERSION_HEX >= 0x030B00a4
+    _PyErr_StackItem *exc_info = __Pyx_PyErr_GetTopmostException(tstate);
+    PyObject *exc_value = exc_info->exc_value;
+    if (exc_value == NULL || exc_value == Py_None) {
+        /* nothing is being handled: report all three as NULL */
+        *value = NULL;
+        *type = NULL;
+        *tb = NULL;
+    } else {
+        *value = exc_value;
+        Py_INCREF(*value);
+        *type = (PyObject*) Py_TYPE(exc_value);
+        Py_INCREF(*type);
+        *tb = PyException_GetTraceback(exc_value);
+    }
+  #elif CYTHON_USE_EXC_INFO_STACK
+    _PyErr_StackItem *exc_info = __Pyx_PyErr_GetTopmostException(tstate);
+    *type = exc_info->exc_type;
+    *value = exc_info->exc_value;
+    *tb = exc_info->exc_traceback;
+    Py_XINCREF(*type);
+    Py_XINCREF(*value);
+    Py_XINCREF(*tb);
+  #else
+    *type = tstate->exc_type;
+    *value = tstate->exc_value;
+    *tb = tstate->exc_traceback;
+    Py_XINCREF(*type);
+    Py_XINCREF(*value);
+    Py_XINCREF(*tb);
+  #endif
+}
+/*
+ * Install (type, value, tb) as the thread's handled-exception state and
+ * release whatever was stored there before.
+ *
+ * The references passed in are consumed: they are either stored in the
+ * thread state or, on Python >= 0x030B00a4 where only exc_value is kept,
+ * dropped (type and tb).
+ */
+static CYTHON_INLINE void __Pyx__ExceptionReset(PyThreadState *tstate, PyObject *type, PyObject *value, PyObject *tb) {
+  #if CYTHON_USE_EXC_INFO_STACK && PY_VERSION_HEX >= 0x030B00a4
+    _PyErr_StackItem *exc_info = tstate->exc_info;
+    PyObject *tmp_value = exc_info->exc_value;
+    exc_info->exc_value = value;
+    Py_XDECREF(tmp_value);
+    Py_XDECREF(type);
+    Py_XDECREF(tb);
+  #else
+    PyObject *tmp_type, *tmp_value, *tmp_tb;
+    #if CYTHON_USE_EXC_INFO_STACK
+    _PyErr_StackItem *exc_info = tstate->exc_info;
+    tmp_type = exc_info->exc_type;
+    tmp_value = exc_info->exc_value;
+    tmp_tb = exc_info->exc_traceback;
+    exc_info->exc_type = type;
+    exc_info->exc_value = value;
+    exc_info->exc_traceback = tb;
+    #else
+    tmp_type = tstate->exc_type;
+    tmp_value = tstate->exc_value;
+    tmp_tb = tstate->exc_traceback;
+    tstate->exc_type = type;
+    tstate->exc_value = value;
+    tstate->exc_traceback = tb;
+    #endif
+    /* the old values are released only after the new ones are in place */
+    Py_XDECREF(tmp_type);
+    Py_XDECREF(tmp_value);
+    Py_XDECREF(tmp_tb);
+  #endif
+}
+#endif
+
+/* GetException */
+/*
+ * Take the pending exception out of the thread state, normalize it, make it
+ * the currently handled exception, and hand it out through *type, *value
+ * and *tb.
+ *
+ * Returns 0 on success; the three outputs then hold references owned by the
+ * caller.  Returns -1 on failure (normalization raised, or the traceback
+ * could not be attached) with all three outputs set to 0.
+ */
+#if CYTHON_FAST_THREAD_STATE
+static int __Pyx__GetException(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb)
+#else
+static int __Pyx_GetException(PyObject **type, PyObject **value, PyObject **tb)
+#endif
+{
+    PyObject *local_type = NULL, *local_value, *local_tb = NULL;
+#if CYTHON_FAST_THREAD_STATE
+    PyObject *tmp_type, *tmp_value, *tmp_tb;
+  #if PY_VERSION_HEX >= 0x030C00A6
+    /* only the exception instance is stored; derive type and traceback from it */
+    local_value = tstate->current_exception;
+    tstate->current_exception = 0;
+    if (likely(local_value)) {
+        local_type = (PyObject*) Py_TYPE(local_value);
+        Py_INCREF(local_type);
+        local_tb = PyException_GetTraceback(local_value);
+    }
+  #else
+    local_type = tstate->curexc_type;
+    local_value = tstate->curexc_value;
+    local_tb = tstate->curexc_traceback;
+    tstate->curexc_type = 0;
+    tstate->curexc_value = 0;
+    tstate->curexc_traceback = 0;
+  #endif
+#else
+    PyErr_Fetch(&local_type, &local_value, &local_tb);
+#endif
+    PyErr_NormalizeException(&local_type, &local_value, &local_tb);
+    /* a pending error at this point means normalization itself raised */
+#if CYTHON_FAST_THREAD_STATE && PY_VERSION_HEX >= 0x030C00A6
+    if (unlikely(tstate->current_exception))
+#elif CYTHON_FAST_THREAD_STATE
+    if (unlikely(tstate->curexc_type))
+#else
+    if (unlikely(PyErr_Occurred()))
+#endif
+        goto bad;
+    #if PY_MAJOR_VERSION >= 3
+    if (local_tb) {
+        if (unlikely(PyException_SetTraceback(local_value, local_tb) < 0))
+            goto bad;
+    }
+    #endif
+    /* one extra reference each: one set goes to the caller, one to the thread state */
+    Py_XINCREF(local_tb);
+    Py_XINCREF(local_type);
+    Py_XINCREF(local_value);
+    *type = local_type;
+    *value = local_value;
+    *tb = local_tb;
+#if CYTHON_FAST_THREAD_STATE
+    #if CYTHON_USE_EXC_INFO_STACK
+    {
+        _PyErr_StackItem *exc_info = tstate->exc_info;
+      #if PY_VERSION_HEX >= 0x030B00a4
+        /* only exc_value is stored here, so the spare type/tb references are dropped */
+        tmp_value = exc_info->exc_value;
+        exc_info->exc_value = local_value;
+        tmp_type = NULL;
+        tmp_tb = NULL;
+        Py_XDECREF(local_type);
+        Py_XDECREF(local_tb);
+      #else
+        tmp_type = exc_info->exc_type;
+        tmp_value = exc_info->exc_value;
+        tmp_tb = exc_info->exc_traceback;
+        exc_info->exc_type = local_type;
+        exc_info->exc_value = local_value;
+        exc_info->exc_traceback = local_tb;
+      #endif
+    }
+    #else
+    tmp_type = tstate->exc_type;
+    tmp_value = tstate->exc_value;
+    tmp_tb = tstate->exc_traceback;
+    tstate->exc_type = local_type;
+    tstate->exc_value = local_value;
+    tstate->exc_traceback = local_tb;
+    #endif
+    /* release the previously handled exception */
+    Py_XDECREF(tmp_type);
+    Py_XDECREF(tmp_value);
+    Py_XDECREF(tmp_tb);
+#else
+    PyErr_SetExcInfo(local_type, local_value, local_tb);
+#endif
+    return 0;
+bad:
+    *type = 0;
+    *value = 0;
+    *tb = 0;
+    Py_XDECREF(local_type);
+    Py_XDECREF(local_value);
+    Py_XDECREF(local_tb);
+    return -1;
+}
+
+/* SwapException */
+#if CYTHON_FAST_THREAD_STATE
+/*
+ * Exchange the thread's handled-exception state with the triple held in
+ * *type, *value and *tb (fast thread-state variant).
+ *
+ * After the call the thread state holds the values passed in and the three
+ * pointers hold what was previously stored.  On Python >= 0x030B00a4 only
+ * *value is written into the thread state; the returned type and traceback
+ * are derived from the old value, and an old value of NULL/None is reported
+ * as three NULLs.
+ */
+static CYTHON_INLINE void __Pyx__ExceptionSwap(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb) {
+    PyObject *tmp_type, *tmp_value, *tmp_tb;
+  #if CYTHON_USE_EXC_INFO_STACK && PY_VERSION_HEX >= 0x030B00a4
+    _PyErr_StackItem *exc_info = tstate->exc_info;
+    tmp_value = exc_info->exc_value;
+    exc_info->exc_value = *value;
+    if (tmp_value == NULL || tmp_value == Py_None) {
+        /* drop the reference to None (if any) and report "no exception" */
+        Py_XDECREF(tmp_value);
+        tmp_value = NULL;
+        tmp_type = NULL;
+        tmp_tb = NULL;
+    } else {
+        tmp_type = (PyObject*) Py_TYPE(tmp_value);
+        Py_INCREF(tmp_type);
+        #if CYTHON_COMPILING_IN_CPYTHON
+        tmp_tb = ((PyBaseExceptionObject*) tmp_value)->traceback;
+        Py_XINCREF(tmp_tb);
+        #else
+        tmp_tb = PyException_GetTraceback(tmp_value);
+        #endif
+    }
+  #elif CYTHON_USE_EXC_INFO_STACK
+    _PyErr_StackItem *exc_info = tstate->exc_info;
+    tmp_type = exc_info->exc_type;
+    tmp_value = exc_info->exc_value;
+    tmp_tb = exc_info->exc_traceback;
+    exc_info->exc_type = *type;
+    exc_info->exc_value = *value;
+    exc_info->exc_traceback = *tb;
+  #else
+    tmp_type = tstate->exc_type;
+    tmp_value = tstate->exc_value;
+    tmp_tb = tstate->exc_traceback;
+    tstate->exc_type = *type;
+    tstate->exc_value = *value;
+    tstate->exc_traceback = *tb;
+  #endif
+    *type = tmp_type;
+    *value = tmp_value;
+    *tb = tmp_tb;
+}
+#else
+/*
+ * Exchange the handled-exception state with the triple held in *type,
+ * *value and *tb, using only the public PyErr_GetExcInfo() /
+ * PyErr_SetExcInfo() API.  On return the pointers hold the previous state.
+ */
+static CYTHON_INLINE void __Pyx_ExceptionSwap(PyObject **type, PyObject **value, PyObject **tb) {
+    PyObject *previous_type;
+    PyObject *previous_value;
+    PyObject *previous_tb;
+    PyErr_GetExcInfo(&previous_type, &previous_value, &previous_tb);
+    PyErr_SetExcInfo(*type, *value, *tb);
+    *tb = previous_tb;
+    *value = previous_value;
+    *type = previous_type;
+}
+#endif
+
+/* Import */
+/*
+ * Import module `name` with the module globals (__pyx_d) as context.
+ *
+ * name:      module name object.
+ * from_list: names to import from the module, or NULL.
+ * level:     import level.  -1 means: if this module's own name contains a
+ *            dot, try a level-1 import first, and retry with level 0 when
+ *            that raises ImportError.
+ *
+ * Returns a new reference to the module, or NULL with an exception set.
+ * On Python 2 the import is done by calling the builtin named by
+ * __pyx_n_s_import; on Python 3 by PyImport_ImportModuleLevelObject().
+ */
+static PyObject *__Pyx_Import(PyObject *name, PyObject *from_list, int level) {
+    PyObject *module = 0;
+    PyObject *empty_dict = 0;
+    PyObject *empty_list = 0;
+    #if PY_MAJOR_VERSION < 3
+    PyObject *py_import;
+    py_import = __Pyx_PyObject_GetAttrStr(__pyx_b, __pyx_n_s_import);
+    if (unlikely(!py_import))
+        goto bad;
+    if (!from_list) {
+        empty_list = PyList_New(0);
+        if (unlikely(!empty_list))
+            goto bad;
+        from_list = empty_list;
+    }
+    #endif
+    /* passed as the "locals" argument of the import call */
+    empty_dict = PyDict_New();
+    if (unlikely(!empty_dict))
+        goto bad;
+    {
+        #if PY_MAJOR_VERSION >= 3
+        if (level == -1) {
+            if (strchr(__Pyx_MODULE_NAME, '.') != NULL) {
+                module = PyImport_ImportModuleLevelObject(
+                    name, __pyx_d, empty_dict, from_list, 1);
+                if (unlikely(!module)) {
+                    /* only ImportError triggers the level-0 retry below */
+                    if (unlikely(!PyErr_ExceptionMatches(PyExc_ImportError)))
+                        goto bad;
+                    PyErr_Clear();
+                }
+            }
+            level = 0;
+        }
+        #endif
+        if (!module) {
+            #if PY_MAJOR_VERSION < 3
+            PyObject *py_level = PyInt_FromLong(level);
+            if (unlikely(!py_level))
+                goto bad;
+            module = PyObject_CallFunctionObjArgs(py_import,
+                name, __pyx_d, empty_dict, from_list, py_level, (PyObject *)NULL);
+            Py_DECREF(py_level);
+            #else
+            module = PyImport_ImportModuleLevelObject(
+                name, __pyx_d, empty_dict, from_list, level);
+            #endif
+        }
+    }
+    /* shared exit: reached on success as well as on failure */
+bad:
+    Py_XDECREF(empty_dict);
+    Py_XDECREF(empty_list);
+    #if PY_MAJOR_VERSION < 3
+    Py_XDECREF(py_import);
+    #endif
+    return module;
+}
+
+/* ImportDottedModule */
+#if PY_MAJOR_VERSION >= 3
+/*
+ * Raise "No module named '<partial name>'" for a dotted import that failed
+ * part-way through.
+ *
+ * name:        the full dotted module name (borrowed reference).
+ * parts_tuple: tuple of the individual name components.
+ * count:       number of leading components to include in the message.
+ *
+ * Any pending exception is discarded first.  When `count` covers all
+ * components the full `name` is reported, otherwise the first `count`
+ * components are joined with ".".  Always returns NULL.
+ */
+static PyObject *__Pyx__ImportDottedModule_Error(PyObject *name, PyObject *parts_tuple, Py_ssize_t count) {
+    PyObject *partial_name = NULL, *slice = NULL, *sep = NULL;
+    if (unlikely(PyErr_Occurred())) {
+        PyErr_Clear();
+    }
+    if (likely(PyTuple_GET_SIZE(parts_tuple) == count)) {
+        /* `name` is borrowed from the caller: take our own reference so the
+           unconditional Py_XDECREF() at the exit does not steal the caller's. */
+        partial_name = name;
+        Py_INCREF(partial_name);
+    } else {
+        slice = PySequence_GetSlice(parts_tuple, 0, count);
+        if (unlikely(!slice))
+            goto bad;
+        sep = PyUnicode_FromStringAndSize(".", 1);
+        if (unlikely(!sep))
+            goto bad;
+        partial_name = PyUnicode_Join(sep, slice);
+        /* never hand a NULL object to the '%U' format below */
+        if (unlikely(!partial_name))
+            goto bad;
+    }
+    PyErr_Format(
+#if PY_MAJOR_VERSION < 3
+        PyExc_ImportError,
+        "No module named '%s'", PyString_AS_STRING(partial_name));
+#else
+#if PY_VERSION_HEX >= 0x030600B1
+        PyExc_ModuleNotFoundError,
+#else
+        PyExc_ImportError,
+#endif
+        "No module named '%U'", partial_name);
+#endif
+bad:
+    Py_XDECREF(sep);
+    Py_XDECREF(slice);
+    Py_XDECREF(partial_name);
+    return NULL;
+}
+#endif
+#if PY_MAJOR_VERSION >= 3
+/*
+ * Look up an already imported module by `name` without importing anything.
+ *
+ * Uses PyImport_GetModule() where the version guard allows it; otherwise it
+ * reads the dict returned by PyImport_GetModuleDict() directly.
+ * Returns a new reference, or NULL when the module is not found.
+ */
+static PyObject *__Pyx__ImportDottedModule_Lookup(PyObject *name) {
+    PyObject *imported_module;
+#if PY_VERSION_HEX < 0x030700A1 || (CYTHON_COMPILING_IN_PYPY && PYPY_VERSION_NUM  < 0x07030400)
+    PyObject *modules = PyImport_GetModuleDict();
+    if (unlikely(!modules))
+        return NULL;
+    imported_module = __Pyx_PyDict_GetItemStr(modules, name);
+    /* the result is INCREF'd so that both branches return an owned reference */
+    Py_XINCREF(imported_module);
+#else
+    imported_module = PyImport_GetModule(name);
+#endif
+    return imported_module;
+}
+#endif
+#if PY_MAJOR_VERSION >= 3
+/*
+ * Resolve a dotted module name by attribute access, starting from the
+ * top-level package `module` and following parts_tuple[1:].
+ *
+ * The reference to `module` is consumed.  Returns a new reference to the
+ * innermost submodule, or NULL after raising through
+ * __Pyx__ImportDottedModule_Error() when one of the lookups fails.
+ */
+static PyObject *__Pyx_ImportDottedModule_WalkParts(PyObject *module, PyObject *name, PyObject *parts_tuple) {
+    Py_ssize_t i, nparts;
+    nparts = PyTuple_GET_SIZE(parts_tuple);
+    /* part 0 is skipped: `module` is the starting point of the walk */
+    for (i=1; i < nparts && module; i++) {
+        PyObject *part, *submodule;
+#if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS
+        part = PyTuple_GET_ITEM(parts_tuple, i);
+#else
+        part = PySequence_ITEM(parts_tuple, i);
+#endif
+        submodule = __Pyx_PyObject_GetAttrStrNoError(module, part);
+#if !(CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS)
+        Py_DECREF(part);
+#endif
+        Py_DECREF(module);
+        module = submodule;
+    }
+    if (unlikely(!module)) {
+        /* i was already advanced past the failing part, so the first i parts
+           include the one that could not be resolved */
+        return __Pyx__ImportDottedModule_Error(name, parts_tuple, i);
+    }
+    return module;
+}
+#endif
+/*
+ * Import the dotted module `name` and return the innermost module.
+ *
+ * Python 2: imports with a one-element from-list (__pyx_n_s__3);
+ * `parts_tuple` is unused.
+ * Python 3: imports `name` with no from-list, then prefers the entry found
+ * by __Pyx__ImportDottedModule_Lookup(); if that lookup fails, the parts
+ * are resolved by attribute access.  With a NULL `parts_tuple` the import
+ * result is returned as is.
+ *
+ * Returns a new reference or NULL on error.
+ */
+static PyObject *__Pyx__ImportDottedModule(PyObject *name, PyObject *parts_tuple) {
+#if PY_MAJOR_VERSION < 3
+    PyObject *module, *from_list, *star = __pyx_n_s__3;
+    CYTHON_UNUSED_VAR(parts_tuple);
+    from_list = PyList_New(1);
+    if (unlikely(!from_list))
+        return NULL;
+    /* PyList_SET_ITEM steals a reference, hence the INCREF */
+    Py_INCREF(star);
+    PyList_SET_ITEM(from_list, 0, star);
+    module = __Pyx_Import(name, from_list, 0);
+    Py_DECREF(from_list);
+    return module;
+#else
+    PyObject *imported_module;
+    PyObject *module = __Pyx_Import(name, NULL, 0);
+    if (!parts_tuple || unlikely(!module))
+        return module;
+    imported_module = __Pyx__ImportDottedModule_Lookup(name);
+    if (likely(imported_module)) {
+        Py_DECREF(module);
+        return imported_module;
+    }
+    /* lookup failed: drop its error and walk the attributes instead */
+    PyErr_Clear();
+    return __Pyx_ImportDottedModule_WalkParts(module, name, parts_tuple);
+#endif
+}
+/*
+ * Entry point for importing a dotted module name.
+ *
+ * On CPython >= 0x030400B1 an already imported module is returned directly,
+ * unless the attribute named by __pyx_n_s_initializing on its spec is true;
+ * in that case, and whenever the lookup finds nothing, the call falls
+ * through to the regular import in __Pyx__ImportDottedModule().
+ *
+ * Returns a new reference or NULL on error.
+ */
+static PyObject *__Pyx_ImportDottedModule(PyObject *name, PyObject *parts_tuple) {
+#if CYTHON_COMPILING_IN_CPYTHON && PY_VERSION_HEX >= 0x030400B1
+    PyObject *module = __Pyx__ImportDottedModule_Lookup(name);
+    if (likely(module)) {
+        PyObject *spec = __Pyx_PyObject_GetAttrStrNoError(module, __pyx_n_s_spec);
+        if (likely(spec)) {
+            PyObject *unsafe = __Pyx_PyObject_GetAttrStrNoError(spec, __pyx_n_s_initializing);
+            /* spec is reset to NULL to signal "module is safe to use" */
+            if (likely(!unsafe || !__Pyx_PyObject_IsTrue(unsafe))) {
+                Py_DECREF(spec);
+                spec = NULL;
+            }
+            Py_XDECREF(unsafe);
+        }
+        if (likely(!spec)) {
+            /* discard any error left behind by the attribute lookups above */
+            PyErr_Clear();
+            return module;
+        }
+        Py_DECREF(spec);
+        Py_DECREF(module);
+    } else if (PyErr_Occurred()) {
+        PyErr_Clear();
+    }
+#endif
+    return __Pyx__ImportDottedModule(name, parts_tuple);
+}
+
+/* FastTypeChecks */
+#if CYTHON_COMPILING_IN_CPYTHON
+/*
+ * Follow the tp_base chain upwards from `a` and return 1 as soon as a base
+ * equal to `b` is reached (`a` itself is not compared).  When the chain
+ * ends without a match, the result is 1 only if `b` is PyBaseObject_Type.
+ */
+static int __Pyx_InBases(PyTypeObject *a, PyTypeObject *b) {
+    PyTypeObject *current = a;
+    while (current != NULL) {
+        current = __Pyx_PyType_GetSlot(current, tp_base, PyTypeObject*);
+        if (current == b) {
+            return 1;
+        }
+    }
+    return (b == &PyBaseObject_Type) ? 1 : 0;
+}
+/*
+ * Return 1 if type `a` is `b` or has `b` in its MRO, 0 otherwise.
+ * Types without a tp_mro tuple are checked through their tp_base chain
+ * via __Pyx_InBases().
+ */
+static CYTHON_INLINE int __Pyx_IsSubtype(PyTypeObject *a, PyTypeObject *b) {
+    PyObject *mro;
+    Py_ssize_t idx, count;
+    if (a == b) return 1;
+    mro = a->tp_mro;
+    if (unlikely(!mro)) {
+        return __Pyx_InBases(a, b);
+    }
+    count = PyTuple_GET_SIZE(mro);
+    for (idx = 0; idx < count; idx++) {
+        if (PyTuple_GET_ITEM(mro, idx) == (PyObject *)b) {
+            return 1;
+        }
+    }
+    return 0;
+}
+/*
+ * Return 1 if type `cls` is, or has in its MRO, either `a` or `b`;
+ * 0 otherwise.  Types without a tp_mro tuple are checked through their
+ * tp_base chain via __Pyx_InBases().
+ */
+static CYTHON_INLINE int __Pyx_IsAnySubtype2(PyTypeObject *cls, PyTypeObject *a, PyTypeObject *b) {
+    PyObject *mro;
+    Py_ssize_t idx, count;
+    if (cls == a || cls == b) return 1;
+    mro = cls->tp_mro;
+    if (unlikely(!mro)) {
+        return __Pyx_InBases(cls, a) || __Pyx_InBases(cls, b);
+    }
+    count = PyTuple_GET_SIZE(mro);
+    for (idx = 0; idx < count; idx++) {
+        PyObject *entry = PyTuple_GET_ITEM(mro, idx);
+        if (entry == (PyObject *)a) return 1;
+        if (entry == (PyObject *)b) return 1;
+    }
+    return 0;
+}
+#if PY_MAJOR_VERSION == 2
+/*
+ * Python 2 variant: test whether exception class `err` is a subclass of
+ * `exc_type1` (skipped when NULL) or of `exc_type2`, using
+ * PyObject_IsSubclass().
+ *
+ * The pending exception state is saved before and restored after the
+ * checks.  An error raised by a subclass check is reported through
+ * PyErr_WriteUnraisable() and counted as "no match".
+ * Returns 1 on a match, 0 otherwise.
+ */
+static int __Pyx_inner_PyErr_GivenExceptionMatches2(PyObject *err, PyObject* exc_type1, PyObject* exc_type2) {
+    PyObject *exception, *value, *tb;
+    int res;
+    __Pyx_PyThreadState_declare
+    __Pyx_PyThreadState_assign
+    __Pyx_ErrFetch(&exception, &value, &tb);
+    res = exc_type1 ? PyObject_IsSubclass(err, exc_type1) : 0;
+    if (unlikely(res == -1)) {
+        PyErr_WriteUnraisable(err);
+        res = 0;
+    }
+    if (!res) {
+        res = PyObject_IsSubclass(err, exc_type2);
+        if (unlikely(res == -1)) {
+            PyErr_WriteUnraisable(err);
+            res = 0;
+        }
+    }
+    __Pyx_ErrRestore(exception, value, tb);
+    return res;
+}
+#else
+/*
+ * Python 3 variant: test whether exception class `err` is a subtype of
+ * `exc_type1` or `exc_type2` by inspecting the type hierarchy directly.
+ * A NULL `exc_type1` restricts the test to `exc_type2`.
+ */
+static CYTHON_INLINE int __Pyx_inner_PyErr_GivenExceptionMatches2(PyObject *err, PyObject* exc_type1, PyObject *exc_type2) {
+    PyTypeObject *err_type = (PyTypeObject*)err;
+    if (!exc_type1) {
+        return __Pyx_IsSubtype(err_type, (PyTypeObject*)exc_type2);
+    }
+    return __Pyx_IsAnySubtype2(err_type, (PyTypeObject*)exc_type1, (PyTypeObject*)exc_type2);
+}
+#endif
+static int __Pyx_PyErr_GivenExceptionMatchesTuple(PyObject *exc_type, PyObject *tuple) { /*
+     * Return 1 if exception class `exc_type` matches any entry of `tuple`,
+     * else 0. `exc_type` is asserted to be an exception class. On
+     * Python 3 a first pass looks for an identical entry before any subclass
+     * test runs; on Python 2 the identity test happens inside the single
+     * loop.
+     */
+    Py_ssize_t i, n;
+    assert(PyExceptionClass_Check(exc_type));
+    n = PyTuple_GET_SIZE(tuple);
+#if PY_MAJOR_VERSION >= 3
+    for (i=0; i<n; i++) {
+        if (exc_type == PyTuple_GET_ITEM(tuple, i)) return 1;
+    }
+#endif
+    for (i=0; i<n; i++) {
+        PyObject *t = PyTuple_GET_ITEM(tuple, i);
+        #if PY_MAJOR_VERSION < 3
+        if (likely(exc_type == t)) return 1;
+        #endif
+        if (likely(PyExceptionClass_Check(t))) {
+            if (__Pyx_inner_PyErr_GivenExceptionMatches2(exc_type, NULL, t)) return 1;
+        } else { /* entries that are not exception classes are skipped here */
+        }
+    }
+    return 0;
+}
+/* Fast replacement for PyErr_GivenExceptionMatches(err, exc_type).
+ * Identical objects match immediately. When `err` is an exception class,
+ * a class `exc_type` is tested with the subtype helper and a tuple
+ * `exc_type` with the tuple helper. Every other combination is delegated
+ * to PyErr_GivenExceptionMatches(). */
+static CYTHON_INLINE int __Pyx_PyErr_GivenExceptionMatches(PyObject *err, PyObject* exc_type) {
+    if (likely(err == exc_type)) return 1;
+    if (unlikely(!PyExceptionClass_Check(err))) {
+        return PyErr_GivenExceptionMatches(err, exc_type);
+    }
+    if (likely(PyExceptionClass_Check(exc_type))) {
+        return __Pyx_inner_PyErr_GivenExceptionMatches2(err, NULL, exc_type);
+    }
+    if (likely(PyTuple_Check(exc_type))) {
+        return __Pyx_PyErr_GivenExceptionMatchesTuple(err, exc_type);
+    }
+    return PyErr_GivenExceptionMatches(err, exc_type);
+}
+/* Return 1 if `err` matches `exc_type1` or `exc_type2`, else 0.
+ * Both candidate types are asserted to be exception classes. An `err` that
+ * is itself an exception class is decided by the subtype helper; anything
+ * else is delegated to PyErr_GivenExceptionMatches(), first type first. */
+static CYTHON_INLINE int __Pyx_PyErr_GivenExceptionMatches2(PyObject *err, PyObject *exc_type1, PyObject *exc_type2) {
+    assert(PyExceptionClass_Check(exc_type1));
+    assert(PyExceptionClass_Check(exc_type2));
+    if (likely(err == exc_type1 || err == exc_type2)) {
+        return 1;
+    }
+    if (likely(PyExceptionClass_Check(err))) {
+        return __Pyx_inner_PyErr_GivenExceptionMatches2(err, exc_type1, exc_type2);
+    }
+    if (PyErr_GivenExceptionMatches(err, exc_type1)) {
+        return 1;
+    }
+    return PyErr_GivenExceptionMatches(err, exc_type2) ? 1 : 0;
+}
+#endif
+
+/* PySequenceMultiply */
+/* Generic `seq * mul`: box `mul` as a Python integer and multiply through
+ * PyNumber_Multiply(). Returns the result of that call, or NULL if the
+ * integer object could not be created. */
+static PyObject* __Pyx_PySequence_Multiply_Generic(PyObject *seq, Py_ssize_t mul) {
+    PyObject *product;
+    PyObject *factor = PyInt_FromSsize_t(mul);
+    if (likely(factor)) {
+        product = PyNumber_Multiply(seq, factor);
+        Py_DECREF(factor);
+    } else {
+        product = NULL;
+    }
+    return product;
+}
+static CYTHON_INLINE PyObject* __Pyx_PySequence_Multiply(PyObject *seq, Py_ssize_t mul) { /*
+     * Repeat `seq` `mul` times. With CYTHON_USE_TYPE_SLOTS enabled and a
+     * type that provides sq_repeat, that slot is called directly; in every
+     * other case the generic helper performs the multiplication.
+     */
+#if CYTHON_USE_TYPE_SLOTS
+    PyTypeObject *type = Py_TYPE(seq);
+    if (likely(type->tp_as_sequence && type->tp_as_sequence->sq_repeat)) {
+        return type->tp_as_sequence->sq_repeat(seq, mul);
+    } else /* the braces after the #endif form this else-branch */
+#endif
+    {
+        return __Pyx_PySequence_Multiply_Generic(seq, mul);
+    }
+}
+
+/* SetItemInt */
+/* Perform `o[j] = v` through PyObject_SetItem() and release the reference
+ * to the key `j`. A NULL `j` yields -1 without touching `o`; otherwise the
+ * result of PyObject_SetItem() is returned. */
+static int __Pyx_SetItemInt_Generic(PyObject *o, PyObject *j, PyObject *v) {
+    int status = -1;
+    if (likely(j)) {
+        status = PyObject_SetItem(o, j, v);
+        Py_DECREF(j);
+    }
+    return status;
+}
+static CYTHON_INLINE int __Pyx_SetItemInt_Fast(PyObject *o, Py_ssize_t i, PyObject *v, int is_list,
+                                               CYTHON_NCP_UNUSED int wraparound, CYTHON_NCP_UNUSED int boundscheck) { /*
+     * Store `v` at integer index `i` of `o`.
+     *
+     * Fast build (first preprocessor branch):
+     *   - lists are written in place; a negative index is wrapped by the
+     *     list size when `wraparound` is set, and the index is validated
+     *     when `boundscheck` is set. A successful store returns 1.
+     *   - other objects use mp_ass_subscript if present, else sq_ass_item.
+     * Other builds use PySequence_SetItem() for lists and for objects that
+     * PyMapping_Check() rejects.
+     * Anything not handled above ends in the generic PyObject_SetItem()
+     * path on the last line.
+     */
+#if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS && CYTHON_USE_TYPE_SLOTS
+    if (is_list || PyList_CheckExact(o)) {
+        Py_ssize_t n = (!wraparound) ? i : ((likely(i >= 0)) ? i : i + PyList_GET_SIZE(o));
+        if ((!boundscheck) || likely(__Pyx_is_valid_index(n, PyList_GET_SIZE(o)))) {
+            PyObject* old = PyList_GET_ITEM(o, n);
+            Py_INCREF(v);
+            PyList_SET_ITEM(o, n, v);
+            Py_DECREF(old); /* release the replaced item only after the new one is stored */
+            return 1;
+        } /* a failed bounds check falls through to the generic call at the end */
+    } else {
+        PyMappingMethods *mm = Py_TYPE(o)->tp_as_mapping;
+        PySequenceMethods *sm = Py_TYPE(o)->tp_as_sequence;
+        if (mm && mm->mp_ass_subscript) {
+            int r;
+            PyObject *key = PyInt_FromSsize_t(i);
+            if (unlikely(!key)) return -1;
+            r = mm->mp_ass_subscript(o, key, v);
+            Py_DECREF(key);
+            return r;
+        }
+        if (likely(sm && sm->sq_ass_item)) {
+            if (wraparound && unlikely(i < 0) && likely(sm->sq_length)) {
+                Py_ssize_t l = sm->sq_length(o);
+                if (likely(l >= 0)) {
+                    i += l;
+                } else {
+                    if (!PyErr_ExceptionMatches(PyExc_OverflowError))
+                        return -1;
+                    PyErr_Clear(); /* OverflowError from sq_length: keep the unadjusted index */
+                }
+            }
+            return sm->sq_ass_item(o, i, v);
+        }
+    }
+#else
+    if (is_list || !PyMapping_Check(o))
+    {
+        return PySequence_SetItem(o, i, v);
+    }
+#endif
+    return __Pyx_SetItemInt_Generic(o, PyInt_FromSsize_t(i), v);
+}
+
+/* RaiseUnboundLocalError */
+/* Set an UnboundLocalError naming the local variable `varname`. */
+static CYTHON_INLINE void __Pyx_RaiseUnboundLocalError(const char *varname) {
+    PyErr_Format(
+        PyExc_UnboundLocalError,
+        "local variable '%s' referenced before assignment",
+        varname);
+}
+
+/* DivInt[long] */
+/* Floor division of `a` by `b`: take the C quotient and lower it by one
+ * when the remainder is non-zero and its sign differs from the sign of
+ * `b`. `b` is not checked for zero here. */
+static CYTHON_INLINE long __Pyx_div_long(long a, long b) {
+    long quot = a / b;
+    long rem = a - quot*b;
+    if ((rem != 0) && ((rem ^ b) < 0)) {
+        quot -= 1;
+    }
+    return quot;
+}
+
+/* ImportFrom */
+static PyObject* __Pyx_ImportFrom(PyObject* module, PyObject* name) { /*
+     * Fetch attribute `name` from `module`. If that lookup fails with
+     * AttributeError, the error is cleared and a second attempt looks up
+     * the key "<module name>" + __pyx_kp_u__2 + name among the imported
+     * modules (__pyx_kp_u__2 is presumably the "." separator; confirm
+     * against the string table). Returns the value found, or NULL with
+     * ImportError set.
+     */
+    PyObject* value = __Pyx_PyObject_GetAttrStr(module, name);
+    if (unlikely(!value) && PyErr_ExceptionMatches(PyExc_AttributeError)) {
+        const char* module_name_str = 0;
+        PyObject* module_name = 0;
+        PyObject* module_dot = 0;
+        PyObject* full_name = 0;
+        PyErr_Clear();
+        module_name_str = PyModule_GetName(module);
+        if (unlikely(!module_name_str)) { goto modbad; }
+        module_name = PyUnicode_FromString(module_name_str);
+        if (unlikely(!module_name)) { goto modbad; }
+        module_dot = PyUnicode_Concat(module_name, __pyx_kp_u__2);
+        if (unlikely(!module_dot)) { goto modbad; }
+        full_name = PyUnicode_Concat(module_dot, name);
+        if (unlikely(!full_name)) { goto modbad; }
+        #if PY_VERSION_HEX < 0x030700A1 || (CYTHON_COMPILING_IN_PYPY && PYPY_VERSION_NUM  < 0x07030400)
+        {
+            PyObject *modules = PyImport_GetModuleDict();
+            if (unlikely(!modules))
+                goto modbad;
+            value = PyObject_GetItem(modules, full_name);
+        }
+        #else
+        value = PyImport_GetModule(full_name);
+        #endif
+      modbad: /* shared cleanup; the temporaries may still be NULL, hence Py_XDECREF */
+        Py_XDECREF(full_name);
+        Py_XDECREF(module_dot);
+        Py_XDECREF(module_name);
+    }
+    if (unlikely(!value)) { /* reached for every failure, including non-AttributeError lookups */
+        PyErr_Format(PyExc_ImportError,
+        #if PY_MAJOR_VERSION < 3
+            "cannot import name %.230s", PyString_AS_STRING(name));
+        #else
+            "cannot import name %S", name);
+        #endif
+    }
+    return value;
+}
+
+/* HasAttr */
+/* hasattr(o, n) helper. Returns -1 with TypeError set when `n` is not a
+ * string, 1 when the attribute lookup succeeds, and 0 when it fails; in the
+ * failing case whatever error the lookup raised is cleared. */
+static CYTHON_INLINE int __Pyx_HasAttr(PyObject *o, PyObject *n) {
+    PyObject *attr;
+    if (unlikely(!__Pyx_PyBaseString_Check(n))) {
+        PyErr_SetString(PyExc_TypeError,
+                        "hasattr(): attribute name must be string");
+        return -1;
+    }
+    attr = __Pyx_GetAttr(o, n);
+    if (attr) {
+        Py_DECREF(attr);
+        return 1;
+    }
+    PyErr_Clear();
+    return 0;
+}
+
+/* IsLittleEndian */
+/* Return 1 when the lowest-addressed byte of a 32-bit word holds its least
+ * significant byte, 0 otherwise. */
+static CYTHON_INLINE int __Pyx_Is_Little_Endian(void)
+{
+  union {
+    uint32_t word;
+    uint8_t bytes[4];
+  } probe;
+  probe.word = 0x01020304;
+  return (probe.bytes[0] == 4) ? 1 : 0;
+}
+
+/* BufferFormatCheck */
+/* Prepare `ctx` for checking a buffer format string against `type`.
+ * The root pseudo-field ("buffer dtype") becomes the bottom entry of
+ * `stack`, all parser state is reset, and for every leading struct level
+ * (typegroup 'S') the first field is pushed so that ctx->head ends up at
+ * the first non-struct field. */
+static void __Pyx_BufFmt_Init(__Pyx_BufFmt_Context* ctx,
+                              __Pyx_BufFmt_StackElem* stack,
+                              __Pyx_TypeInfo* type) {
+  __Pyx_TypeInfo *cur;
+  /* Root pseudo-field describing the whole dtype. */
+  ctx->root.type = type;
+  ctx->root.name = "buffer dtype";
+  ctx->root.offset = 0;
+  /* Bottom stack entry refers to the root. */
+  ctx->head = stack;
+  ctx->head->field = &ctx->root;
+  ctx->head->parent_offset = 0;
+  /* Parser state. */
+  ctx->fmt_offset = 0;
+  ctx->new_packmode = '@';
+  ctx->enc_packmode = '@';
+  ctx->new_count = 1;
+  ctx->enc_count = 0;
+  ctx->enc_type = 0;
+  ctx->is_complex = 0;
+  ctx->is_valid_array = 0;
+  ctx->struct_alignment = 0;
+  /* Descend through nested structs to their first field. */
+  for (cur = type; cur->typegroup == 'S'; cur = cur->fields->type) {
+    ++ctx->head;
+    ctx->head->field = cur->fields;
+    ctx->head->parent_offset = 0;
+  }
+}
+/* Parse an unsigned decimal number at *ts.
+ *
+ * On success the value is returned and *ts is advanced past the digits.
+ * Returns -1, leaving *ts untouched, when *ts does not start with a digit
+ * or when the digits describe a value that does not fit in an int. The
+ * range check replaces the signed overflow that a long digit run used to
+ * cause. */
+static int __Pyx_BufFmt_ParseNumber(const char** ts) {
+    /* Largest accepted value; assumes int and unsigned int have the same
+       width, which makes this equal to INT_MAX. */
+    const unsigned int max_count = ((unsigned int)-1) >> 1;
+    unsigned int count = 0;
+    const char* t = *ts;
+    if (*t < '0' || *t > '9') {
+        return -1;
+    }
+    while (*t >= '0' && *t <= '9') {
+        unsigned int digit = (unsigned int)(*t++ - '0');
+        /* Equivalent to count * 10 + digit > max_count, without overflow. */
+        if (count > (max_count - digit) / 10) {
+            return -1;
+        }
+        count = count * 10 + digit;
+    }
+    *ts = t;
+    return (int)count;
+}
+/* Parse a number at *ts via __Pyx_BufFmt_ParseNumber(). When that reports
+ * -1, a ValueError quoting the character at *ts is set and -1 is
+ * returned; otherwise the parsed value is returned. */
+static int __Pyx_BufFmt_ExpectNumber(const char **ts) {
+    int parsed = __Pyx_BufFmt_ParseNumber(ts);
+    if (parsed != -1) {
+        return parsed;
+    }
+    PyErr_Format(PyExc_ValueError,
+                 "Does not understand character buffer dtype format string ('%c')", **ts);
+    return -1;
+}
+/* Set a ValueError that reports `ch` as an unexpected format character. */
+static void __Pyx_BufFmt_RaiseUnexpectedChar(char ch) {
+  PyErr_Format(
+      PyExc_ValueError,
+      "Unexpected format string character: '%c'",
+      ch);
+}
+static const char* __Pyx_BufFmt_DescribeTypeChar(char ch, int is_complex) {
+  switch (ch) {
+    case '?': return "'bool'";
+    case 'c': return "'char'";
+    case 'b': return "'signed char'";
+    case 'B': return "'unsigned char'";
+    case 'h': return "'short'";
+    case 'H': return "'unsigned short'";
+    case 'i': return "'int'";
+    case 'I': return "'unsigned int'";
+    case 'l': return "'long'";
+    case 'L': return "'unsigned long'";
+    case 'q': return "'long long'";
+    case 'Q': return "'unsigned long long'";
+    case 'f': return (is_complex ? "'complex float'" : "'float'");
+    case 'd': return (is_complex ? "'complex double'" : "'double'");
+    case 'g': return (is_complex ? "'complex long double'" : "'long double'");
+    case 'T': return "a struct";
+    case 'O': return "Python object";
+    case 'P': return "a pointer";
+    case 's': case 'p': return "a string";
+    case 0: return "end";
+    default: return "unparsable format string";
+  }
+}
+/* Size in bytes of format character `ch` under the fixed "standard" sizes
+ * listed below; complex floats count twice. Returns 0 with ValueError set
+ * for 'g' (no standard size) and for characters not listed. */
+static size_t __Pyx_BufFmt_TypeCharToStandardSize(char ch, int is_complex) {
+  size_t size = 0;
+  switch (ch) {
+    case '?': case 'c': case 'b': case 'B': case 's': case 'p':
+      size = 1;
+      break;
+    case 'h': case 'H':
+      size = 2;
+      break;
+    case 'i': case 'I': case 'l': case 'L':
+      size = 4;
+      break;
+    case 'q': case 'Q':
+      size = 8;
+      break;
+    case 'f':
+      size = is_complex ? 8 : 4;
+      break;
+    case 'd':
+      size = is_complex ? 16 : 8;
+      break;
+    case 'O': case 'P':
+      size = sizeof(void*);
+      break;
+    case 'g':
+      PyErr_SetString(PyExc_ValueError, "Python does not define a standard format string size for long double ('g')..");
+      break;
+    default:
+      __Pyx_BufFmt_RaiseUnexpectedChar(ch);
+      break;
+  }
+  return size;
+}
+/* Size in bytes of format character `ch` as the compiler lays out the
+ * matching C type; complex floats count twice. 'q'/'Q' are only known
+ * when HAVE_LONG_LONG is defined. Returns 0 with ValueError set for
+ * characters not listed. */
+static size_t __Pyx_BufFmt_TypeCharToNativeSize(char ch, int is_complex) {
+  const size_t parts = is_complex ? 2 : 1;
+  size_t size = 0;
+  switch (ch) {
+    case '?': case 'c': case 'b': case 'B': case 's': case 'p':
+      size = 1;
+      break;
+    case 'h': case 'H':
+      size = sizeof(short);
+      break;
+    case 'i': case 'I':
+      size = sizeof(int);
+      break;
+    case 'l': case 'L':
+      size = sizeof(long);
+      break;
+    #ifdef HAVE_LONG_LONG
+    case 'q': case 'Q':
+      size = sizeof(PY_LONG_LONG);
+      break;
+    #endif
+    case 'f':
+      size = sizeof(float) * parts;
+      break;
+    case 'd':
+      size = sizeof(double) * parts;
+      break;
+    case 'g':
+      size = sizeof(long double) * parts;
+      break;
+    case 'O': case 'P':
+      size = sizeof(void*);
+      break;
+    default:
+      __Pyx_BufFmt_RaiseUnexpectedChar(ch);
+      break;
+  }
+  return size;
+}
+typedef struct { char c; short x; } __Pyx_st_short; /*
+   * Probe structs: a single char followed by one member of the type being
+   * probed. The function below subtracts the member size from the struct
+   * size, i.e. it measures the bytes the compiler adds for the leading
+   * char plus padding.
+   */
+typedef struct { char c; int x; } __Pyx_st_int;
+typedef struct { char c; long x; } __Pyx_st_long;
+typedef struct { char c; float x; } __Pyx_st_float;
+typedef struct { char c; double x; } __Pyx_st_double;
+typedef struct { char c; long double x; } __Pyx_st_longdouble;
+typedef struct { char c; void *x; } __Pyx_st_void_p;
+#ifdef HAVE_LONG_LONG
+typedef struct { char c; PY_LONG_LONG x; } __Pyx_st_longlong;
+#endif
+static size_t __Pyx_BufFmt_TypeCharToAlignment(char ch, int is_complex) { /*
+   * Alignment value for format character `ch`, taken from the probe
+   * structs above; `is_complex` is unused. Character-sized codes return 1.
+   * Returns 0 with ValueError set for characters not listed.
+   */
+  CYTHON_UNUSED_VAR(is_complex);
+  switch (ch) {
+    case '?': case 'c': case 'b': case 'B': case 's': case 'p': return 1;
+    case 'h': case 'H': return sizeof(__Pyx_st_short) - sizeof(short);
+    case 'i': case 'I': return sizeof(__Pyx_st_int) - sizeof(int);
+    case 'l': case 'L': return sizeof(__Pyx_st_long) - sizeof(long);
+#ifdef HAVE_LONG_LONG
+    case 'q': case 'Q': return sizeof(__Pyx_st_longlong) - sizeof(PY_LONG_LONG);
+#endif
+    case 'f': return sizeof(__Pyx_st_float) - sizeof(float);
+    case 'd': return sizeof(__Pyx_st_double) - sizeof(double);
+    case 'g': return sizeof(__Pyx_st_longdouble) - sizeof(long double);
+    case 'P': case 'O': return sizeof(__Pyx_st_void_p) - sizeof(void*);
+    default:
+      __Pyx_BufFmt_RaiseUnexpectedChar(ch);
+      return 0;
+    }
+}
+/* These are for computing the padding at the end of the struct to align
+   on the first member of the struct. This will probably be the same as above,
+   but we don't have any guarantees.
+ */
+/* Probe structs: one member of the probed type followed by a single char. */
+typedef struct { short x; char c; } __Pyx_pad_short;
+typedef struct { int x; char c; } __Pyx_pad_int;
+typedef struct { long x; char c; } __Pyx_pad_long;
+typedef struct { float x; char c; } __Pyx_pad_float;
+typedef struct { double x; char c; } __Pyx_pad_double;
+typedef struct { long double x; char c; } __Pyx_pad_longdouble;
+typedef struct { void *x; char c; } __Pyx_pad_void_p;
+#ifdef HAVE_LONG_LONG
+typedef struct { PY_LONG_LONG x; char c; } __Pyx_pad_longlong;
+#endif
+/* Padding value for format character `ch`: the size of the matching probe
+ * struct minus the size of its first member. `is_complex` is unused.
+ * Character-sized codes yield 1. Returns 0 with ValueError set for
+ * characters not listed. */
+static size_t __Pyx_BufFmt_TypeCharToPadding(char ch, int is_complex) {
+  size_t pad = 0;
+  CYTHON_UNUSED_VAR(is_complex);
+  switch (ch) {
+    case '?': case 'c': case 'b': case 'B': case 's': case 'p':
+      pad = 1;
+      break;
+    case 'h': case 'H':
+      pad = sizeof(__Pyx_pad_short) - sizeof(short);
+      break;
+    case 'i': case 'I':
+      pad = sizeof(__Pyx_pad_int) - sizeof(int);
+      break;
+    case 'l': case 'L':
+      pad = sizeof(__Pyx_pad_long) - sizeof(long);
+      break;
+#ifdef HAVE_LONG_LONG
+    case 'q': case 'Q':
+      pad = sizeof(__Pyx_pad_longlong) - sizeof(PY_LONG_LONG);
+      break;
+#endif
+    case 'f':
+      pad = sizeof(__Pyx_pad_float) - sizeof(float);
+      break;
+    case 'd':
+      pad = sizeof(__Pyx_pad_double) - sizeof(double);
+      break;
+    case 'g':
+      pad = sizeof(__Pyx_pad_longdouble) - sizeof(long double);
+      break;
+    case 'P': case 'O':
+      pad = sizeof(__Pyx_pad_void_p) - sizeof(void*);
+      break;
+    default:
+      __Pyx_BufFmt_RaiseUnexpectedChar(ch);
+      break;
+  }
+  return pad;
+}
+/* Classify format character `ch` into a type group code:
+ *   'H' for 'c'; 'I' for the signed integer codes and 's'/'p';
+ *   'U' for '?' and the unsigned integer codes;
+ *   'C' or 'R' for floating point codes depending on `is_complex`;
+ *   'O' and 'P' map to themselves.
+ * Returns 0 with ValueError set for characters not listed. */
+static char __Pyx_BufFmt_TypeCharToGroup(char ch, int is_complex) {
+  char group = 0;
+  switch (ch) {
+    case 'c':
+      group = 'H';
+      break;
+    case 'b': case 'h': case 'i':
+    case 'l': case 'q': case 's': case 'p':
+      group = 'I';
+      break;
+    case '?': case 'B': case 'H': case 'I': case 'L': case 'Q':
+      group = 'U';
+      break;
+    case 'f': case 'd': case 'g':
+      group = is_complex ? 'C' : 'R';
+      break;
+    case 'O': case 'P':
+      group = ch;
+      break;
+    default:
+      __Pyx_BufFmt_RaiseUnexpectedChar(ch);
+      break;
+  }
+  return group;
+}
+/* Set the "Buffer dtype mismatch" ValueError for the current parser state.
+ * With no field left (ctx->head == NULL) the expected item is "end"; at
+ * the root field it is the dtype name in quotes; inside a struct the
+ * message also names the enclosing type and the field. */
+static void __Pyx_BufFmt_RaiseExpected(__Pyx_BufFmt_Context* ctx) {
+  const char *got = __Pyx_BufFmt_DescribeTypeChar(ctx->enc_type, ctx->is_complex);
+  if (ctx->head == NULL) {
+    PyErr_Format(PyExc_ValueError,
+                 "Buffer dtype mismatch, expected %s%s%s but got %s",
+                 "", "end", "", got);
+    return;
+  }
+  if (ctx->head->field == &ctx->root) {
+    PyErr_Format(PyExc_ValueError,
+                 "Buffer dtype mismatch, expected %s%s%s but got %s",
+                 "'", ctx->head->field->type->name, "'", got);
+    return;
+  }
+  {
+    __Pyx_StructField* wanted = ctx->head->field;
+    __Pyx_StructField* owner = (ctx->head - 1)->field;
+    PyErr_Format(PyExc_ValueError,
+                 "Buffer dtype mismatch, expected '%s' but got %s in '%s.%s'",
+                 wanted->type->name, got,
+                 owner->type->name, wanted->name);
+  }
+}
+/* Check the pending run of ctx->enc_count items of type ctx->enc_type
+ * against the expected field(s), starting at ctx->head, and advance the
+ * expected-field cursor and ctx->fmt_offset accordingly.
+ *
+ * Returns 0 when nothing is pending (enc_type == 0) or the run matched,
+ * and -1 with ValueError set on any mismatch. On success the pending type
+ * is cleared. ctx->head becomes NULL once the root field has been
+ * consumed; a later pending run is then reported as a mismatch instead of
+ * dereferencing the NULL cursor. */
+static int __Pyx_BufFmt_ProcessTypeChunk(__Pyx_BufFmt_Context* ctx) {
+  char group;
+  size_t size, offset, arraysize = 1;
+  if (ctx->enc_type == 0) return 0;
+  /* The whole expected dtype was already matched: more input is an error. */
+  if (unlikely(ctx->head == NULL)) {
+    __Pyx_BufFmt_RaiseExpected(ctx);
+    return -1;
+  }
+  if (ctx->head->field->type->arraysize[0]) {
+    /* The expected field is an array: validate the dimensions seen. */
+    int i, ndim = 0;
+    if (ctx->enc_type == 's' || ctx->enc_type == 'p') {
+        ctx->is_valid_array = ctx->head->field->type->ndim == 1;
+        ndim = 1;
+        if (ctx->enc_count != ctx->head->field->type->arraysize[0]) {
+            PyErr_Format(PyExc_ValueError,
+                         "Expected a dimension of size %zu, got %zu",
+                         ctx->head->field->type->arraysize[0], ctx->enc_count);
+            return -1;
+        }
+    }
+    if (!ctx->is_valid_array) {
+      PyErr_Format(PyExc_ValueError, "Expected %d dimensions, got %d",
+                   ctx->head->field->type->ndim, ndim);
+      return -1;
+    }
+    for (i = 0; i < ctx->head->field->type->ndim; i++) {
+      arraysize *= ctx->head->field->type->arraysize[i];
+    }
+    ctx->is_valid_array = 0;
+    ctx->enc_count = 1;
+  }
+  group = __Pyx_BufFmt_TypeCharToGroup(ctx->enc_type, ctx->is_complex);
+  do {
+    __Pyx_StructField* field = ctx->head->field;
+    __Pyx_TypeInfo* type = field->type;
+    /* '@' and '^' use native sizes, every other pack mode standard sizes. */
+    if (ctx->enc_packmode == '@' || ctx->enc_packmode == '^') {
+      size = __Pyx_BufFmt_TypeCharToNativeSize(ctx->enc_type, ctx->is_complex);
+    } else {
+      size = __Pyx_BufFmt_TypeCharToStandardSize(ctx->enc_type, ctx->is_complex);
+    }
+    if (ctx->enc_packmode == '@') {
+      /* Only '@' applies alignment: round fmt_offset up to align_at. */
+      size_t align_at = __Pyx_BufFmt_TypeCharToAlignment(ctx->enc_type, ctx->is_complex);
+      size_t align_mod_offset;
+      if (align_at == 0) return -1;
+      align_mod_offset = ctx->fmt_offset % align_at;
+      if (align_mod_offset > 0) ctx->fmt_offset += align_at - align_mod_offset;
+      if (ctx->struct_alignment == 0)
+          ctx->struct_alignment = __Pyx_BufFmt_TypeCharToPadding(ctx->enc_type,
+                                                                 ctx->is_complex);
+    }
+    if (type->size != size || type->typegroup != group) {
+      if (type->typegroup == 'C' && type->fields != NULL) {
+        /* Expected a complex type with fields: descend and retry. */
+        size_t parent_offset = ctx->head->parent_offset + field->offset;
+        ++ctx->head;
+        ctx->head->field = type->fields;
+        ctx->head->parent_offset = parent_offset;
+        continue;
+      }
+      if ((type->typegroup == 'H' || group == 'H') && type->size == size) {
+        /* Group 'H' on either side is accepted when the sizes agree. */
+      } else {
+          __Pyx_BufFmt_RaiseExpected(ctx);
+          return -1;
+      }
+    }
+    offset = ctx->head->parent_offset + field->offset;
+    if (ctx->fmt_offset != offset) {
+      PyErr_Format(PyExc_ValueError,
+                   "Buffer dtype mismatch; next field is at offset %" CYTHON_FORMAT_SSIZE_T "d but %" CYTHON_FORMAT_SSIZE_T "d expected",
+                   (Py_ssize_t)ctx->fmt_offset, (Py_ssize_t)offset);
+      return -1;
+    }
+    ctx->fmt_offset += size;
+    if (arraysize)
+      ctx->fmt_offset += (arraysize - 1) * size;
+    --ctx->enc_count;
+    /* Move the cursor to the next expected field. */
+    while (1) {
+      if (field == &ctx->root) {
+        /* The root was just consumed: nothing more is expected. */
+        ctx->head = NULL;
+        if (ctx->enc_count != 0) {
+          __Pyx_BufFmt_RaiseExpected(ctx);
+          return -1;
+        }
+        break;
+      }
+      ctx->head->field = ++field;
+      if (field->type == NULL) {
+        /* End of this struct's field list: pop to the parent. */
+        --ctx->head;
+        field = ctx->head->field;
+        continue;
+      } else if (field->type->typegroup == 'S') {
+        /* Next field is a struct: push its first field, skipping empty structs. */
+        size_t parent_offset = ctx->head->parent_offset + field->offset;
+        if (field->type->fields->type == NULL) continue;
+        field = field->type->fields;
+        ++ctx->head;
+        ctx->head->field = field;
+        ctx->head->parent_offset = parent_offset;
+        break;
+      } else {
+        break;
+      }
+    }
+  } while (ctx->enc_count);
+  ctx->enc_type = 0;
+  ctx->is_complex = 0;
+  return 0;
+}
+/* Parse an array specification "(d0,d1,...)" at *tsp and check it against
+ * the dimensions of the currently expected field.
+ *
+ * *tsp must point at the opening '('. On success ctx->is_valid_array is
+ * set, *tsp is advanced past the closing ')' and 0 is returned. Returns -1
+ * with ValueError set for a repeat count in front of the array, a
+ * dimension mismatch, a malformed list or a missing ')'.
+ *
+ * Whitespace in front of a dimension is skipped; the loop has to advance
+ * past it explicitly, otherwise it would never terminate. */
+static int
+__pyx_buffmt_parse_array(__Pyx_BufFmt_Context* ctx, const char** tsp)
+{
+    const char *ts = *tsp;
+    int i = 0, number, ndim;
+    ++ts;
+    if (ctx->new_count != 1) {
+        PyErr_SetString(PyExc_ValueError,
+                        "Cannot handle repeated arrays in format string");
+        return -1;
+    }
+    if (__Pyx_BufFmt_ProcessTypeChunk(ctx) == -1) return -1;
+    /* Processing the pending chunk may have consumed the whole expected
+       dtype, which leaves no field to compare the dimensions with. */
+    if (unlikely(ctx->head == NULL)) {
+        PyErr_SetString(PyExc_ValueError,
+                        "Buffer dtype mismatch, unexpected array dimensions after end of dtype");
+        return -1;
+    }
+    ndim = ctx->head->field->type->ndim;
+    while (*ts && *ts != ')') {
+        switch (*ts) {
+            case ' ': case '\f': case '\r': case '\n': case '\t': case '\v':
+                ++ts;
+                continue;
+            default:  break;
+        }
+        number = __Pyx_BufFmt_ExpectNumber(&ts);
+        if (number == -1) return -1;
+        if (i < ndim && (size_t) number != ctx->head->field->type->arraysize[i]) {
+            PyErr_Format(PyExc_ValueError,
+                        "Expected a dimension of size %zu, got %d",
+                        ctx->head->field->type->arraysize[i], number);
+            return -1;
+        }
+        if (*ts != ',' && *ts != ')') {
+            PyErr_Format(PyExc_ValueError,
+                                "Expected a comma in format string, got '%c'", *ts);
+            return -1;
+        }
+        if (*ts == ',') ts++;
+        i++;
+    }
+    if (i != ndim) {
+        PyErr_Format(PyExc_ValueError, "Expected %d dimension(s), got %d",
+                            ctx->head->field->type->ndim, i);
+        return -1;
+    }
+    if (!*ts) {
+        PyErr_SetString(PyExc_ValueError,
+                        "Unexpected end of format string, expected ')'");
+        return -1;
+    }
+    ctx->is_valid_array = 1;
+    ctx->new_count = 1;
+    *tsp = ++ts;
+    return 0;
+}
+/* Validate the buffer format string `ts` against the dtype that `ctx` was
+ * initialised with.
+ *
+ * Returns a pointer to the position where parsing stopped (the NUL
+ * terminator, or just past a closing '}' when called recursively for a
+ * struct body), or NULL with ValueError set on any mismatch or syntax
+ * error.
+ *
+ * Type characters are collected into runs (ctx->enc_type / enc_count) and
+ * handed to __Pyx_BufFmt_ProcessTypeChunk() whenever the type, pack mode
+ * or complex flag changes. A ':name:' field label that lacks its closing
+ * ':' is reported as an error rather than scanned past the end of the
+ * string. */
+static const char* __Pyx_BufFmt_CheckString(__Pyx_BufFmt_Context* ctx, const char* ts) {
+  int got_Z = 0;
+  while (1) {
+    switch(*ts) {
+      case 0:
+        /* End of input: flush the pending run; nothing may remain expected. */
+        if (ctx->enc_type != 0 && ctx->head == NULL) {
+          __Pyx_BufFmt_RaiseExpected(ctx);
+          return NULL;
+        }
+        if (__Pyx_BufFmt_ProcessTypeChunk(ctx) == -1) return NULL;
+        if (ctx->head != NULL) {
+          __Pyx_BufFmt_RaiseExpected(ctx);
+          return NULL;
+        }
+        return ts;
+      case ' ':
+      case '\r':
+      case '\n':
+        ++ts;
+        break;
+      case '<':
+        if (!__Pyx_Is_Little_Endian()) {
+          PyErr_SetString(PyExc_ValueError, "Little-endian buffer not supported on big-endian compiler");
+          return NULL;
+        }
+        ctx->new_packmode = '=';
+        ++ts;
+        break;
+      case '>':
+      case '!':
+        if (__Pyx_Is_Little_Endian()) {
+          PyErr_SetString(PyExc_ValueError, "Big-endian buffer not supported on little-endian compiler");
+          return NULL;
+        }
+        ctx->new_packmode = '=';
+        ++ts;
+        break;
+      case '=':
+      case '@':
+      case '^':
+        ctx->new_packmode = *ts++;
+        break;
+      case 'T':
+        {
+          /* Struct: "T{...}" is checked recursively, once per repeat count. */
+          const char* ts_after_sub;
+          size_t i, struct_count = ctx->new_count;
+          size_t struct_alignment = ctx->struct_alignment;
+          ctx->new_count = 1;
+          ++ts;
+          if (*ts != '{') {
+            PyErr_SetString(PyExc_ValueError, "Buffer acquisition: Expected '{' after 'T'");
+            return NULL;
+          }
+          if (__Pyx_BufFmt_ProcessTypeChunk(ctx) == -1) return NULL;
+          ctx->enc_type = 0;
+          ctx->enc_count = 0;
+          ctx->struct_alignment = 0;
+          ++ts;
+          ts_after_sub = ts;
+          for (i = 0; i != struct_count; ++i) {
+            ts_after_sub = __Pyx_BufFmt_CheckString(ctx, ts);
+            if (!ts_after_sub) return NULL;
+          }
+          ts = ts_after_sub;
+          if (struct_alignment) ctx->struct_alignment = struct_alignment;
+        }
+        break;
+      case '}':
+        {
+          /* End of a struct body: flush, then pad to the struct alignment. */
+          size_t alignment = ctx->struct_alignment;
+          ++ts;
+          if (__Pyx_BufFmt_ProcessTypeChunk(ctx) == -1) return NULL;
+          ctx->enc_type = 0;
+          if (alignment && ctx->fmt_offset % alignment) {
+            ctx->fmt_offset += alignment - (ctx->fmt_offset % alignment);
+          }
+        }
+        return ts;
+      case 'x':
+        /* Pad bytes: flush the pending run and skip new_count bytes. */
+        if (__Pyx_BufFmt_ProcessTypeChunk(ctx) == -1) return NULL;
+        ctx->fmt_offset += ctx->new_count;
+        ctx->new_count = 1;
+        ctx->enc_count = 0;
+        ctx->enc_type = 0;
+        ctx->enc_packmode = ctx->new_packmode;
+        ++ts;
+        break;
+      case 'Z':
+        /* Complex prefix: must be followed by a floating point code. */
+        got_Z = 1;
+        ++ts;
+        if (*ts != 'f' && *ts != 'd' && *ts != 'g') {
+          __Pyx_BufFmt_RaiseUnexpectedChar('Z');
+          return NULL;
+        }
+        CYTHON_FALLTHROUGH;
+      case '?': case 'c': case 'b': case 'B': case 'h': case 'H': case 'i': case 'I':
+      case 'l': case 'L': case 'q': case 'Q':
+      case 'f': case 'd': case 'g':
+      case 'O': case 'p':
+        /* Same type, complexness and pack mode: extend the pending run. */
+        if ((ctx->enc_type == *ts) && (got_Z == ctx->is_complex) &&
+            (ctx->enc_packmode == ctx->new_packmode) && (!ctx->is_valid_array)) {
+          ctx->enc_count += ctx->new_count;
+          ctx->new_count = 1;
+          got_Z = 0;
+          ++ts;
+          break;
+        }
+        CYTHON_FALLTHROUGH;
+      case 's':
+        /* 's', or a type that starts a new run: flush the previous one. */
+        if (__Pyx_BufFmt_ProcessTypeChunk(ctx) == -1) return NULL;
+        ctx->enc_count = ctx->new_count;
+        ctx->enc_packmode = ctx->new_packmode;
+        ctx->enc_type = *ts;
+        ctx->is_complex = got_Z;
+        ++ts;
+        ctx->new_count = 1;
+        got_Z = 0;
+        break;
+      case ':':
+        /* Skip a ':name:' field label; stop at the terminator as well so a
+           label without its closing ':' cannot run past the string. */
+        ++ts;
+        while (*ts && *ts != ':') ++ts;
+        if (!*ts) {
+          PyErr_SetString(PyExc_ValueError,
+                          "Unexpected end of format string, expected ':'");
+          return NULL;
+        }
+        ++ts;
+        break;
+      case '(':
+        if (__pyx_buffmt_parse_array(ctx, &ts) < 0) return NULL;
+        break;
+      default:
+        {
+          /* Anything else has to be a repeat count for the next item. */
+          int number = __Pyx_BufFmt_ExpectNumber(&ts);
+          if (number == -1) return NULL;
+          ctx->new_count = (size_t)number;
+        }
+    }
+  }
+}
+
+/* BufferGetAndValidate */
+  /* Release `info` unless its buf pointer is NULL. A suboffsets pointer
+   * that still refers to the __Pyx_minusones placeholder is reset to NULL
+   * before the buffer is released. */
+  static CYTHON_INLINE void __Pyx_SafeReleaseBuffer(Py_buffer* info) {
+  if (likely(info->buf != NULL)) {
+    if (info->suboffsets == __Pyx_minusones) {
+      info->suboffsets = NULL;
+    }
+    __Pyx_ReleaseBuffer(info);
+  }
+}
+/* Put `buf` into the "no buffer held" state: NULL data and owner pointers,
+ * with shape/strides pointing at __Pyx_zeros and suboffsets at
+ * __Pyx_minusones. */
+static void __Pyx_ZeroBuffer(Py_buffer* buf) {
+  buf->obj = NULL;
+  buf->buf = NULL;
+  buf->shape = __Pyx_zeros;
+  buf->strides = __Pyx_zeros;
+  buf->suboffsets = __Pyx_minusones;
+}
+static int __Pyx__GetBufferAndValidate(
+        Py_buffer* buf, PyObject* obj,  __Pyx_TypeInfo* dtype, int flags,
+        int nd, int cast, __Pyx_BufFmt_StackElem* stack)
+{ /*
+   * Acquire a buffer from `obj` into `buf` and validate it: the number of
+   * dimensions must equal `nd`, the format string must match `dtype`
+   * (skipped when `cast` is non-zero) and the item size must equal
+   * dtype->size. Returns 0 on success. On failure returns -1 with an error
+   * set; a buffer that was already acquired is released first.
+   * NOTE(review): buf->format is handed to the format checker without a
+   * NULL test; confirm that `flags` always requests a format string.
+   */
+  buf->buf = NULL;
+  if (unlikely(__Pyx_GetBuffer(obj, buf, flags) == -1)) {
+    __Pyx_ZeroBuffer(buf); /* nothing was acquired, so there is nothing to release */
+    return -1;
+  }
+  if (unlikely(buf->ndim != nd)) {
+    PyErr_Format(PyExc_ValueError,
+                 "Buffer has wrong number of dimensions (expected %d, got %d)",
+                 nd, buf->ndim);
+    goto fail;
+  }
+  if (!cast) {
+    __Pyx_BufFmt_Context ctx;
+    __Pyx_BufFmt_Init(&ctx, stack, dtype);
+    if (!__Pyx_BufFmt_CheckString(&ctx, buf->format)) goto fail;
+  }
+  if (unlikely((size_t)buf->itemsize != dtype->size)) {
+    PyErr_Format(PyExc_ValueError,
+      "Item size of buffer (%" CYTHON_FORMAT_SSIZE_T "d byte%s) does not match size of '%s' (%" CYTHON_FORMAT_SSIZE_T "d byte%s)",
+      buf->itemsize, (buf->itemsize > 1) ? "s" : "",
+      dtype->name, (Py_ssize_t)dtype->size, (dtype->size > 1) ? "s" : "");
+    goto fail;
+  }
+  if (buf->suboffsets == NULL) buf->suboffsets = __Pyx_minusones; /* placeholder that __Pyx_SafeReleaseBuffer resets to NULL */
+  return 0;
+fail:;
+  __Pyx_SafeReleaseBuffer(buf);
+  return -1;
+}
+
+/* BufferFallbackError */
+  /* Set the ValueError used when a buffer assignment failed and the
+   * previous buffer could not be reacquired either. */
+  static void __Pyx_RaiseBufferFallbackError(void) {
+  PyErr_SetString(
+     PyExc_ValueError,
+     "Buffer acquisition failed on assignment; and then reacquiring the old buffer failed too!");
+}
+
+/* PyIntBinop */
+  #if !CYTHON_COMPILING_IN_PYPY
+static PyObject* __Pyx_PyInt_SubtractObjC(PyObject *op1, PyObject *op2, long intval, int inplace, int zerodivision_check) {
+    CYTHON_MAYBE_UNUSED_VAR(intval);
+    CYTHON_MAYBE_UNUSED_VAR(inplace);
+    CYTHON_UNUSED_VAR(zerodivision_check);
+    #if PY_MAJOR_VERSION < 3
+    if (likely(PyInt_CheckExact(op1))) {
+        const long b = intval;
+        long x;
+        long a = PyInt_AS_LONG(op1);
+        
+            x = (long)((unsigned long)a - (unsigned long)b);
+            if (likely((x^a) >= 0 || (x^~b) >= 0))
+                return PyInt_FromLong(x);
+            return PyLong_Type.tp_as_number->nb_subtract(op1, op2);
+    }
+    #endif
+    #if CYTHON_USE_PYLONG_INTERNALS
+    if (likely(PyLong_CheckExact(op1))) {
+        const long b = intval;
+        long a, x;
+#ifdef HAVE_LONG_LONG
+        const PY_LONG_LONG llb = intval;
+        PY_LONG_LONG lla, llx;
+#endif
+        if (unlikely(__Pyx_PyLong_IsZero(op1))) {
+            return PyLong_FromLong(-intval);
+        }
+        if (likely(__Pyx_PyLong_IsCompact(op1))) {
+            a = __Pyx_PyLong_CompactValue(op1);
+        } else {
+            const digit* digits = __Pyx_PyLong_Digits(op1);
+            const Py_ssize_t size = __Pyx_PyLong_SignedDigitCount(op1);
+            switch (size) {
+                case -2:
+                    if (8 * sizeof(long) - 1 > 2 * PyLong_SHIFT) {
+                        a = -(long) (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]));
+                        break;
+                    #ifdef HAVE_LONG_LONG
+                    } else if (8 * sizeof(PY_LONG_LONG) - 1 > 2 * PyLong_SHIFT) {
+                        lla = -(PY_LONG_LONG) (((((unsigned PY_LONG_LONG)digits[1]) << PyLong_SHIFT) | (unsigned PY_LONG_LONG)digits[0]));
+                        goto long_long;
+                    #endif
+                    }
+                    CYTHON_FALLTHROUGH;
+                case 2:
+                    if (8 * sizeof(long) - 1 > 2 * PyLong_SHIFT) {
+                        a = (long) (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]));
+                        break;
+                    #ifdef HAVE_LONG_LONG
+                    } else if (8 * sizeof(PY_LONG_LONG) - 1 > 2 * PyLong_SHIFT) {
+                        lla = (PY_LONG_LONG) (((((unsigned PY_LONG_LONG)digits[1]) << PyLong_SHIFT) | (unsigned PY_LONG_LONG)digits[0]));
+                        goto long_long;
+                    #endif
+                    }
+                    CYTHON_FALLTHROUGH;
+                case -3:
+                    if (8 * sizeof(long) - 1 > 3 * PyLong_SHIFT) {
+                        a = -(long) (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]));
+                        break;
+                    #ifdef HAVE_LONG_LONG
+                    } else if (8 * sizeof(PY_LONG_LONG) - 1 > 3 * PyLong_SHIFT) {
+                        lla = -(PY_LONG_LONG) (((((((unsigned PY_LONG_LONG)digits[2]) << PyLong_SHIFT) | (unsigned PY_LONG_LONG)digits[1]) << PyLong_SHIFT) | (unsigned PY_LONG_LONG)digits[0]));
+                        goto long_long;
+                    #endif
+                    }
+                    CYTHON_FALLTHROUGH;
+                case 3:
+                    if (8 * sizeof(long) - 1 > 3 * PyLong_SHIFT) {
+                        a = (long) (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]));
+                        break;
+                    #ifdef HAVE_LONG_LONG
+                    } else if (8 * sizeof(PY_LONG_LONG) - 1 > 3 * PyLong_SHIFT) {
+                        lla = (PY_LONG_LONG) (((((((unsigned PY_LONG_LONG)digits[2]) << PyLong_SHIFT) | (unsigned PY_LONG_LONG)digits[1]) << PyLong_SHIFT) | (unsigned PY_LONG_LONG)digits[0]));
+                        goto long_long;
+                    #endif
+                    }
+                    CYTHON_FALLTHROUGH;
+                case -4:
+                    if (8 * sizeof(long) - 1 > 4 * PyLong_SHIFT) {
+                        a = -(long) (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]));
+                        break;
+                    #ifdef HAVE_LONG_LONG
+                    } else if (8 * sizeof(PY_LONG_LONG) - 1 > 4 * PyLong_SHIFT) {
+                        lla = -(PY_LONG_LONG) (((((((((unsigned PY_LONG_LONG)digits[3]) << PyLong_SHIFT) | (unsigned PY_LONG_LONG)digits[2]) << PyLong_SHIFT) | (unsigned PY_LONG_LONG)digits[1]) << PyLong_SHIFT) | (unsigned PY_LONG_LONG)digits[0]));
+                        goto long_long;
+                    #endif
+                    }
+                    CYTHON_FALLTHROUGH;
+                case 4:
+                    if (8 * sizeof(long) - 1 > 4 * PyLong_SHIFT) {
+                        a = (long) (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]));
+                        break;
+                    #ifdef HAVE_LONG_LONG
+                    } else if (8 * sizeof(PY_LONG_LONG) - 1 > 4 * PyLong_SHIFT) {
+                        lla = (PY_LONG_LONG) (((((((((unsigned PY_LONG_LONG)digits[3]) << PyLong_SHIFT) | (unsigned PY_LONG_LONG)digits[2]) << PyLong_SHIFT) | (unsigned PY_LONG_LONG)digits[1]) << PyLong_SHIFT) | (unsigned PY_LONG_LONG)digits[0]));
+                        goto long_long;
+                    #endif
+                    }
+                    CYTHON_FALLTHROUGH;
+                default: return PyLong_Type.tp_as_number->nb_subtract(op1, op2);
+            }
+        }
+                x = a - b;
+            return PyLong_FromLong(x);
+#ifdef HAVE_LONG_LONG
+        long_long:
+                llx = lla - llb;
+            return PyLong_FromLongLong(llx);
+#endif
+        
+        
+    }
+    #endif
+    if (PyFloat_CheckExact(op1)) {
+        const long b = intval;
+#if CYTHON_COMPILING_IN_LIMITED_API
+        double a = __pyx_PyFloat_AsDouble(op1);
+#else
+        double a = PyFloat_AS_DOUBLE(op1);
+#endif
+            double result;
+            
+            PyFPE_START_PROTECT("subtract", return NULL)
+            result = ((double)a) - (double)b;
+            PyFPE_END_PROTECT(result)
+            return PyFloat_FromDouble(result);
+    }
+    return (inplace ? PyNumber_InPlaceSubtract : PyNumber_Subtract)(op1, op2);
+}
+#endif
+
+/* SliceObject: subscript obj with a slice whose bounds come from C integers (cstart/cstop), Python objects (*_py_start/*_py_stop) or a prebuilt slice (*_py_slice); returns the subscript result, or NULL on failure. */
+  static CYTHON_INLINE PyObject* __Pyx_PyObject_GetSlice(PyObject* obj,
+        Py_ssize_t cstart, Py_ssize_t cstop,
+        PyObject** _py_start, PyObject** _py_stop, PyObject** _py_slice,
+        int has_cstart, int has_cstop, int wraparound) {
+    __Pyx_TypeName obj_type_name;
+#if CYTHON_USE_TYPE_SLOTS
+    PyMappingMethods* mp;
+#if PY_MAJOR_VERSION < 3
+    PySequenceMethods* ms = Py_TYPE(obj)->tp_as_sequence;
+    if (likely(ms && ms->sq_slice)) {  /* Python 2 only: use the sequence slice slot, which takes integer bounds */
+        if (!has_cstart) {  /* no C start given: convert *_py_start, or default to 0 */
+            if (_py_start && (*_py_start != Py_None)) {
+                cstart = __Pyx_PyIndex_AsSsize_t(*_py_start);
+                if ((cstart == (Py_ssize_t)-1) && PyErr_Occurred()) goto bad;
+            } else
+                cstart = 0;
+        }
+        if (!has_cstop) {  /* no C stop given: convert *_py_stop, or default to PY_SSIZE_T_MAX */
+            if (_py_stop && (*_py_stop != Py_None)) {
+                cstop = __Pyx_PyIndex_AsSsize_t(*_py_stop);
+                if ((cstop == (Py_ssize_t)-1) && PyErr_Occurred()) goto bad;
+            } else
+                cstop = PY_SSIZE_T_MAX;
+        }
+        if (wraparound && unlikely((cstart < 0) | (cstop < 0)) && likely(ms->sq_length)) {  /* negative bounds are made relative to the length */
+            Py_ssize_t l = ms->sq_length(obj);
+            if (likely(l >= 0)) {
+                if (cstop < 0) {
+                    cstop += l;
+                    if (cstop < 0) cstop = 0;  /* clamp at 0 */
+                }
+                if (cstart < 0) {
+                    cstart += l;
+                    if (cstart < 0) cstart = 0;  /* clamp at 0 */
+                }
+            } else {
+                if (!PyErr_ExceptionMatches(PyExc_OverflowError))
+                    goto bad;
+                PyErr_Clear();  /* an OverflowError from sq_length is dropped and the bounds are passed on unadjusted */
+            }
+        }
+        return ms->sq_slice(obj, cstart, cstop);
+    }
+#else
+    CYTHON_UNUSED_VAR(wraparound);
+#endif
+    mp = Py_TYPE(obj)->tp_as_mapping;
+    if (likely(mp && mp->mp_subscript))  /* this `if` guards the brace block below */
+#else
+    CYTHON_UNUSED_VAR(wraparound);
+#endif
+    {  /* generic path: subscript obj with a slice object */
+        PyObject* result;
+        PyObject *py_slice, *py_start, *py_stop;
+        if (_py_slice) {
+            py_slice = *_py_slice;  /* caller-supplied slice; not released here */
+        } else {
+            PyObject* owned_start = NULL;  /* owned_* track bounds created here so they can be released below */
+            PyObject* owned_stop = NULL;
+            if (_py_start) {
+                py_start = *_py_start;
+            } else {
+                if (has_cstart) {
+                    owned_start = py_start = PyInt_FromSsize_t(cstart);
+                    if (unlikely(!py_start)) goto bad;
+                } else
+                    py_start = Py_None;
+            }
+            if (_py_stop) {
+                py_stop = *_py_stop;
+            } else {
+                if (has_cstop) {
+                    owned_stop = py_stop = PyInt_FromSsize_t(cstop);
+                    if (unlikely(!py_stop)) {
+                        Py_XDECREF(owned_start);
+                        goto bad;
+                    }
+                } else
+                    py_stop = Py_None;
+            }
+            py_slice = PySlice_New(py_start, py_stop, Py_None);  /* step is always None */
+            Py_XDECREF(owned_start);
+            Py_XDECREF(owned_stop);
+            if (unlikely(!py_slice)) goto bad;
+        }
+#if CYTHON_USE_TYPE_SLOTS
+        result = mp->mp_subscript(obj, py_slice);
+#else
+        result = PyObject_GetItem(obj, py_slice);
+#endif
+        if (!_py_slice) {  /* release the slice only if it was created in this function */
+            Py_DECREF(py_slice);
+        }
+        return result;
+    }
+    obj_type_name = __Pyx_PyType_GetName(Py_TYPE(obj));  /* reached only when the mp_subscript check above failed */
+    PyErr_Format(PyExc_TypeError,
+        "'" __Pyx_FMT_TYPENAME "' object is unsliceable", obj_type_name);
+    __Pyx_DECREF_TypeName(obj_type_name);
+bad:
+    return NULL;
+}
+
+/* PyObject_GenericGetAttrNoDict: attribute lookup that searches only the type (the type is asserted to have no tp_dictoffset); compiled only for Python < 3.7 with type slots and _PyType_Lookup available. */
+  #if CYTHON_USE_TYPE_SLOTS && CYTHON_USE_PYTYPE_LOOKUP && PY_VERSION_HEX < 0x03070000
+static PyObject *__Pyx_RaiseGenericGetAttributeError(PyTypeObject *tp, PyObject *attr_name) {  /* sets AttributeError and always returns NULL */
+    __Pyx_TypeName type_name = __Pyx_PyType_GetName(tp);
+    PyErr_Format(PyExc_AttributeError,
+#if PY_MAJOR_VERSION >= 3
+                 "'" __Pyx_FMT_TYPENAME "' object has no attribute '%U'",
+                 type_name, attr_name);
+#else
+                 "'" __Pyx_FMT_TYPENAME "' object has no attribute '%.400s'",
+                 type_name, PyString_AS_STRING(attr_name));
+#endif
+    __Pyx_DECREF_TypeName(type_name);
+    return NULL;
+}
+static CYTHON_INLINE PyObject* __Pyx_PyObject_GenericGetAttrNoDict(PyObject* obj, PyObject* attr_name) {
+    PyObject *descr;
+    PyTypeObject *tp = Py_TYPE(obj);
+    if (unlikely(!PyString_Check(attr_name))) {  /* names failing PyString_Check are left to the generic implementation */
+        return PyObject_GenericGetAttr(obj, attr_name);
+    }
+    assert(!tp->tp_dictoffset);  /* callers must only use this for types without an instance dict */
+    descr = _PyType_Lookup(tp, attr_name);
+    if (unlikely(!descr)) {
+        return __Pyx_RaiseGenericGetAttributeError(tp, attr_name);
+    }
+    Py_INCREF(descr);  /* hold a reference while the descriptor is used / returned */
+    #if PY_MAJOR_VERSION < 3
+    if (likely(PyType_HasFeature(Py_TYPE(descr), Py_TPFLAGS_HAVE_CLASS)))
+    #endif
+    {
+        descrgetfunc f = Py_TYPE(descr)->tp_descr_get;
+        if (unlikely(f)) {  /* descriptor with a tp_descr_get slot: return what it yields for obj */
+            PyObject *res = f(descr, obj, (PyObject *)tp);
+            Py_DECREF(descr);
+            return res;
+        }
+    }
+    return descr;  /* plain class attribute, returned as found */
+}
+#endif
+
+/* PyObject_GenericGetAttr */
+  #if CYTHON_USE_TYPE_SLOTS && CYTHON_USE_PYTYPE_LOOKUP && PY_VERSION_HEX < 0x03070000
+/* Attribute lookup: use the dict-free fast path unless the type declares an instance dict offset. */
+static PyObject* __Pyx_PyObject_GenericGetAttr(PyObject* obj, PyObject* attr_name) {
+    const int has_instance_dict = (Py_TYPE(obj)->tp_dictoffset != 0);
+    return unlikely(has_instance_dict) ? PyObject_GenericGetAttr(obj, attr_name)
+                                       : __Pyx_PyObject_GenericGetAttrNoDict(obj, attr_name);
+}
+#endif
+
+/* FixUpExtensionType: copy the special "__...__" entries of the spec's Py_tp_members slot into the type object; compiles to a no-op when PY_VERSION_HEX > 0x030900B1 or under the Limited API. Returns 0, or -1 if installing the __module__ descriptor fails. */
+  #if CYTHON_USE_TYPE_SPECS
+static int __Pyx_fix_up_extension_type_from_spec(PyType_Spec *spec, PyTypeObject *type) {
+#if PY_VERSION_HEX > 0x030900B1 || CYTHON_COMPILING_IN_LIMITED_API
+    CYTHON_UNUSED_VAR(spec);
+    CYTHON_UNUSED_VAR(type);
+#else
+    const PyType_Slot *slot = spec->slots;
+    while (slot && slot->slot && slot->slot != Py_tp_members)  /* scan the slot array for the Py_tp_members entry */
+        slot++;
+    if (slot && slot->slot == Py_tp_members) {
+        int changed = 0;  /* set when the type was modified, so PyType_Modified() is called once at the end */
+#if !(PY_VERSION_HEX <= 0x030900b1 && CYTHON_COMPILING_IN_CPYTHON)
+        const
+#endif
+            PyMemberDef *memb = (PyMemberDef*) slot->pfunc;
+        while (memb && memb->name) {
+            if (memb->name[0] == '_' && memb->name[1] == '_') {  /* only names starting with two underscores are considered */
+#if PY_VERSION_HEX < 0x030900b1
+                if (strcmp(memb->name, "__weaklistoffset__") == 0) {
+                    assert(memb->type == T_PYSSIZET);
+                    assert(memb->flags == READONLY);
+                    type->tp_weaklistoffset = memb->offset;
+                    changed = 1;
+                }
+                else if (strcmp(memb->name, "__dictoffset__") == 0) {
+                    assert(memb->type == T_PYSSIZET);
+                    assert(memb->flags == READONLY);
+                    type->tp_dictoffset = memb->offset;
+                    changed = 1;
+                }
+#if CYTHON_METH_FASTCALL
+                else if (strcmp(memb->name, "__vectorcalloffset__") == 0) {
+                    assert(memb->type == T_PYSSIZET);
+                    assert(memb->flags == READONLY);
+#if PY_VERSION_HEX >= 0x030800b4
+                    type->tp_vectorcall_offset = memb->offset;
+#else
+                    type->tp_print = (printfunc) memb->offset;  /* before 3.8.0b4 the offset is stored in the tp_print field */
+#endif
+                    changed = 1;
+                }
+#endif
+#else
+                if ((0));  /* empty placeholder so the `else if` below has an `if` to attach to */
+#endif
+#if PY_VERSION_HEX <= 0x030900b1 && CYTHON_COMPILING_IN_CPYTHON
+                else if (strcmp(memb->name, "__module__") == 0) {
+                    PyObject *descr;
+                    assert(memb->type == T_OBJECT);
+                    assert(memb->flags == 0 || memb->flags == READONLY);
+                    descr = PyDescr_NewMember(type, memb);  /* install a member descriptor for __module__ in the type dict */
+                    if (unlikely(!descr))
+                        return -1;
+                    if (unlikely(PyDict_SetItem(type->tp_dict, PyDescr_NAME(descr), descr) < 0)) {
+                        Py_DECREF(descr);
+                        return -1;
+                    }
+                    Py_DECREF(descr);
+                    changed = 1;
+                }
+#endif
+            }
+            memb++;
+        }
+        if (changed)
+            PyType_Modified(type);
+    }
+#endif
+    return 0;
+}
+#endif
+
+/* PyObjectCallNoArg: invoke func with an empty argument list. */
+  static CYTHON_INLINE PyObject* __Pyx_PyObject_CallNoArg(PyObject *func) {
+    PyObject *call_slots[2] = {NULL, NULL};  /* slot 0 is the spare entry in front of the (empty) argument vector */
+    return __Pyx_PyObject_FastCall(func, &call_slots[1], 0 | __Pyx_PY_VECTORCALL_ARGUMENTS_OFFSET);
+}
+
+/* PyObjectGetMethod: look up `name` on obj. Returns 1 with *method set to a callable that must be called with obj as its first argument; otherwise returns 0 with *method set to the attribute value, or left/set to NULL when the lookup failed. */
+  static int __Pyx_PyObject_GetMethod(PyObject *obj, PyObject *name, PyObject **method) {
+    PyObject *attr;
+#if CYTHON_UNPACK_METHODS && CYTHON_COMPILING_IN_CPYTHON && CYTHON_USE_PYTYPE_LOOKUP
+    __Pyx_TypeName type_name;
+    PyTypeObject *tp = Py_TYPE(obj);
+    PyObject *descr;
+    descrgetfunc f = NULL;
+    PyObject **dictptr, *dict;
+    int meth_found = 0;
+    assert (*method == NULL);  /* callers must pass in a NULL-initialised *method */
+    if (unlikely(tp->tp_getattro != PyObject_GenericGetAttr)) {  /* custom tp_getattro: use the regular attribute lookup */
+        attr = __Pyx_PyObject_GetAttrStr(obj, name);
+        goto try_unpack;
+    }
+    if (unlikely(tp->tp_dict == NULL) && unlikely(PyType_Ready(tp) < 0)) {
+        return 0;  /* *method is still NULL here */
+    }
+    descr = _PyType_Lookup(tp, name);
+    if (likely(descr != NULL)) {
+        Py_INCREF(descr);
+#if defined(Py_TPFLAGS_METHOD_DESCRIPTOR) && Py_TPFLAGS_METHOD_DESCRIPTOR
+        if (__Pyx_PyType_HasFeature(Py_TYPE(descr), Py_TPFLAGS_METHOD_DESCRIPTOR))
+#elif PY_MAJOR_VERSION >= 3
+        #ifdef __Pyx_CyFunction_USED
+        if (likely(PyFunction_Check(descr) || __Pyx_IS_TYPE(descr, &PyMethodDescr_Type) || __Pyx_CyFunction_Check(descr)))
+        #else
+        if (likely(PyFunction_Check(descr) || __Pyx_IS_TYPE(descr, &PyMethodDescr_Type)))
+        #endif
+#else
+        #ifdef __Pyx_CyFunction_USED
+        if (likely(PyFunction_Check(descr) || __Pyx_CyFunction_Check(descr)))
+        #else
+        if (likely(PyFunction_Check(descr)))
+        #endif
+#endif
+        {
+            meth_found = 1;  /* descr is one of the method kinds tested above; returned unbound below */
+        } else {
+            f = Py_TYPE(descr)->tp_descr_get;
+            if (f != NULL && PyDescr_IsData(descr)) {  /* data descriptors are resolved before the instance dict is consulted */
+                attr = f(descr, obj, (PyObject *)Py_TYPE(obj));
+                Py_DECREF(descr);
+                goto try_unpack;
+            }
+        }
+    }
+    dictptr = _PyObject_GetDictPtr(obj);
+    if (dictptr != NULL && (dict = *dictptr) != NULL) {  /* next: an entry in the instance dict wins over descr */
+        Py_INCREF(dict);
+        attr = __Pyx_PyDict_GetItemStr(dict, name);
+        if (attr != NULL) {
+            Py_INCREF(attr);
+            Py_DECREF(dict);
+            Py_XDECREF(descr);
+            goto try_unpack;
+        }
+        Py_DECREF(dict);
+    }
+    if (meth_found) {
+        *method = descr;
+        return 1;
+    }
+    if (f != NULL) {  /* remaining descriptor with a tp_descr_get slot: return what it yields for obj */
+        attr = f(descr, obj, (PyObject *)Py_TYPE(obj));
+        Py_DECREF(descr);
+        goto try_unpack;
+    }
+    if (likely(descr != NULL)) {  /* class attribute without tp_descr_get: returned as is */
+        *method = descr;
+        return 0;
+    }
+    type_name = __Pyx_PyType_GetName(tp);  /* nothing found: raise AttributeError, *method stays NULL */
+    PyErr_Format(PyExc_AttributeError,
+#if PY_MAJOR_VERSION >= 3
+                 "'" __Pyx_FMT_TYPENAME "' object has no attribute '%U'",
+                 type_name, name);
+#else
+                 "'" __Pyx_FMT_TYPENAME "' object has no attribute '%.400s'",
+                 type_name, PyString_AS_STRING(name));
+#endif
+    __Pyx_DECREF_TypeName(type_name);
+    return 0;
+#else
+    attr = __Pyx_PyObject_GetAttrStr(obj, name);
+    goto try_unpack;
+#endif
+try_unpack:
+#if CYTHON_UNPACK_METHODS
+    if (likely(attr) && PyMethod_Check(attr) && likely(PyMethod_GET_SELF(attr) == obj)) {  /* bound method on obj itself: hand back its underlying function */
+        PyObject *function = PyMethod_GET_FUNCTION(attr);
+        Py_INCREF(function);
+        Py_DECREF(attr);
+        *method = function;
+        return 1;
+    }
+#endif
+    *method = attr;  /* may be NULL if the lookup above failed */
+    return 0;
+}
+
+/* PyObjectCallMethod0 */
+  static PyObject* __Pyx_PyObject_CallMethod0(PyObject* obj, PyObject* method_name) {
+    PyObject *callee = NULL;
+    PyObject *retval = NULL;
+    /* A nonzero return means `callee` must be invoked with `obj` passed explicitly. */
+    if (likely(__Pyx_PyObject_GetMethod(obj, method_name, &callee))) {
+        retval = __Pyx_PyObject_CallOneArg(callee, obj);
+    } else if (likely(callee)) {
+        /* Otherwise the looked-up attribute is called without arguments. */
+        retval = __Pyx_PyObject_CallNoArg(callee);
+    }
+    /* callee is NULL only when the lookup failed; retval is then still NULL. */
+    Py_XDECREF(callee);
+    return retval;
+}
+
+/* ValidateBasesTuple: check every entry of `bases` after the first: each must be a heap type, and when dictoffset is 0 none of them may have a non-zero dict offset. Returns 0 on success, -1 with an exception set on failure. */
+  #if CYTHON_COMPILING_IN_CPYTHON || CYTHON_COMPILING_IN_LIMITED_API || CYTHON_USE_TYPE_SPECS
+static int __Pyx_validate_bases_tuple(const char *type_name, Py_ssize_t dictoffset, PyObject *bases) {
+    Py_ssize_t i, n;
+#if CYTHON_ASSUME_SAFE_MACROS
+    n = PyTuple_GET_SIZE(bases);
+#else
+    n = PyTuple_Size(bases);
+    if (n < 0) return -1;
+#endif
+    for (i = 1; i < n; i++)  /* starts at 1: bases[0] is not examined here */
+    {
+#if CYTHON_AVOID_BORROWED_REFS
+        PyObject *b0 = PySequence_GetItem(bases, i);  /* owned reference in this configuration; released on every exit path below */
+        if (!b0) return -1;
+#elif CYTHON_ASSUME_SAFE_MACROS
+        PyObject *b0 = PyTuple_GET_ITEM(bases, i);
+#else
+        PyObject *b0 = PyTuple_GetItem(bases, i);
+        if (!b0) return -1;
+#endif
+        PyTypeObject *b;
+#if PY_MAJOR_VERSION < 3
+        if (PyClass_Check(b0))
+        {
+            PyErr_Format(PyExc_TypeError, "base class '%.200s' is an old-style class",
+                         PyString_AS_STRING(((PyClassObject*)b0)->cl_name));
+#if CYTHON_AVOID_BORROWED_REFS
+            Py_DECREF(b0);
+#endif
+            return -1;
+        }
+#endif
+        b = (PyTypeObject*) b0;
+        if (!__Pyx_PyType_HasFeature(b, Py_TPFLAGS_HEAPTYPE))
+        {
+            __Pyx_TypeName b_name = __Pyx_PyType_GetName(b);
+            PyErr_Format(PyExc_TypeError,
+                "base class '" __Pyx_FMT_TYPENAME "' is not a heap type", b_name);
+            __Pyx_DECREF_TypeName(b_name);
+#if CYTHON_AVOID_BORROWED_REFS
+            Py_DECREF(b0);
+#endif
+            return -1;
+        }
+        if (dictoffset == 0)  /* the new type has no dict slot, so no secondary base may have one either */
+        {
+            Py_ssize_t b_dictoffset = 0;
+#if CYTHON_USE_TYPE_SLOTS || CYTHON_COMPILING_IN_PYPY
+            b_dictoffset = b->tp_dictoffset;
+#else
+            PyObject *py_b_dictoffset = PyObject_GetAttrString((PyObject*)b, "__dictoffset__");  /* without slot access, read the offset through the attribute */
+            if (!py_b_dictoffset) goto dictoffset_return;
+            b_dictoffset = PyLong_AsSsize_t(py_b_dictoffset);
+            Py_DECREF(py_b_dictoffset);
+            if (b_dictoffset == -1 && PyErr_Occurred()) goto dictoffset_return;
+#endif
+            if (b_dictoffset) {
+                {
+                    __Pyx_TypeName b_name = __Pyx_PyType_GetName(b);
+                    PyErr_Format(PyExc_TypeError,
+                        "extension type '%.200s' has no __dict__ slot, "
+                        "but base type '" __Pyx_FMT_TYPENAME "' has: "
+                        "either add 'cdef dict __dict__' to the extension type "
+                        "or add '__slots__ = [...]' to the base type",
+                        type_name, b_name);
+                    __Pyx_DECREF_TypeName(b_name);
+                }
+#if !(CYTHON_USE_TYPE_SLOTS || CYTHON_COMPILING_IN_PYPY)
+              dictoffset_return:
+#endif
+#if CYTHON_AVOID_BORROWED_REFS
+                Py_DECREF(b0);  /* shared failure exit, also reached via the dictoffset_return label */
+#endif
+                return -1;
+            }
+        }
+#if CYTHON_AVOID_BORROWED_REFS
+        Py_DECREF(b0);
+#endif
+    }
+    return 0;
+}
+#endif
+
+/* PyType_Ready: wrapper around PyType_Ready(). In the non-type-spec CPython / Limited API build it first validates the bases tuple and then, on Python >= 3.5, runs PyType_Ready() with the GC disabled and Py_TPFLAGS_HEAPTYPE temporarily set. Returns PyType_Ready()'s result, or -1 on failure. */
+  static int __Pyx_PyType_Ready(PyTypeObject *t) {
+#if CYTHON_USE_TYPE_SPECS || !(CYTHON_COMPILING_IN_CPYTHON || CYTHON_COMPILING_IN_LIMITED_API) || defined(PYSTON_MAJOR_VERSION)
+    (void)__Pyx_PyObject_CallMethod0;  /* helper not called in this configuration; presumably referenced to avoid an unused-function warning */
+#if CYTHON_USE_TYPE_SPECS
+    (void)__Pyx_validate_bases_tuple;
+#endif
+    return PyType_Ready(t);
+#else
+    int r;
+    PyObject *bases = __Pyx_PyType_GetSlot(t, tp_bases, PyObject*);
+    if (bases && unlikely(__Pyx_validate_bases_tuple(t->tp_name, t->tp_dictoffset, bases) == -1))
+        return -1;
+#if PY_VERSION_HEX >= 0x03050000 && !defined(PYSTON_MAJOR_VERSION)
+    {
+        int gc_was_enabled;
+    #if PY_VERSION_HEX >= 0x030A00b1
+        gc_was_enabled = PyGC_Disable();
+        (void)__Pyx_PyObject_CallMethod0;
+    #else
+        PyObject *ret, *py_status;  /* older Pythons: query and disable the collector through the gc module */
+        PyObject *gc = NULL;
+        #if PY_VERSION_HEX >= 0x030700a1 && (!CYTHON_COMPILING_IN_PYPY || PYPY_VERSION_NUM+0 >= 0x07030400)
+        gc = PyImport_GetModule(__pyx_kp_u_gc);
+        #endif
+        if (unlikely(!gc)) gc = PyImport_Import(__pyx_kp_u_gc);
+        if (unlikely(!gc)) return -1;
+        py_status = __Pyx_PyObject_CallMethod0(gc, __pyx_kp_u_isenabled);
+        if (unlikely(!py_status)) {
+            Py_DECREF(gc);
+            return -1;
+        }
+        gc_was_enabled = __Pyx_PyObject_IsTrue(py_status);
+        Py_DECREF(py_status);
+        if (gc_was_enabled > 0) {
+            ret = __Pyx_PyObject_CallMethod0(gc, __pyx_kp_u_disable);
+            if (unlikely(!ret)) {
+                Py_DECREF(gc);
+                return -1;
+            }
+            Py_DECREF(ret);
+        } else if (unlikely(gc_was_enabled == -1)) {  /* the truth test itself failed */
+            Py_DECREF(gc);
+            return -1;
+        }
+    #endif
+        t->tp_flags |= Py_TPFLAGS_HEAPTYPE;  /* set only for the PyType_Ready() call; cleared again right after it */
+#if PY_VERSION_HEX >= 0x030A0000
+        t->tp_flags |= Py_TPFLAGS_IMMUTABLETYPE;  /* unlike HEAPTYPE, this flag is not cleared afterwards */
+#endif
+#else
+        (void)__Pyx_PyObject_CallMethod0;
+#endif
+    r = PyType_Ready(t);
+#if PY_VERSION_HEX >= 0x03050000 && !defined(PYSTON_MAJOR_VERSION)
+        t->tp_flags &= ~Py_TPFLAGS_HEAPTYPE;
+    #if PY_VERSION_HEX >= 0x030A00b1
+        if (gc_was_enabled)
+            PyGC_Enable();
+    #else
+        if (gc_was_enabled) {
+            PyObject *tp, *v, *tb;
+            PyErr_Fetch(&tp, &v, &tb);  /* set aside any pending exception while the enable method is called */
+            ret = __Pyx_PyObject_CallMethod0(gc, __pyx_kp_u_enable);
+            if (likely(ret || r == -1)) {  /* re-enable succeeded, or PyType_Ready() already failed: keep the original exception state */
+                Py_XDECREF(ret);
+                PyErr_Restore(tp, v, tb);
+            } else {  /* only the re-enable call failed: report that failure instead */
+                Py_XDECREF(tp);
+                Py_XDECREF(v);
+                Py_XDECREF(tb);
+                r = -1;
+            }
+        }
+        Py_DECREF(gc);
+    #endif
+    }
+#endif
+    return r;
+#endif
+}
+
+/* SetVTable */
+  static int __Pyx_SetVtable(PyTypeObject *type, void *vtable) {
+    /* Wrap the raw vtable pointer in an unnamed capsule and store it on the type. */
+    int status;
+    PyObject *capsule = PyCapsule_New(vtable, 0, 0);
+    if (unlikely(capsule == NULL))
+        return -1;
+#if CYTHON_COMPILING_IN_LIMITED_API
+    status = PyObject_SetAttr((PyObject *) type, __pyx_n_s_pyx_vtable, capsule);
+#else
+    status = PyDict_SetItem(type->tp_dict, __pyx_n_s_pyx_vtable, capsule);
+#endif
+    /* Our own reference is dropped whether or not the store succeeded. */
+    Py_DECREF(capsule);
+    if (unlikely(status < 0)) return -1;
+    return 0;
+}
+
+/* GetVTable */
+  static void* __Pyx_GetVtable(PyTypeObject *type) {
+    /* Fetch the capsule stored under the vtable key and unwrap its pointer. */
+    void *vtable_ptr = NULL;
+#if CYTHON_COMPILING_IN_LIMITED_API
+    PyObject *capsule = PyObject_GetAttr((PyObject *)type, __pyx_n_s_pyx_vtable);
+#else
+    PyObject *capsule = PyObject_GetItem(type->tp_dict, __pyx_n_s_pyx_vtable);
+#endif
+    if (capsule != NULL) {
+        vtable_ptr = PyCapsule_GetPointer(capsule, 0);
+        if (vtable_ptr == NULL && !PyErr_Occurred()) {
+            /* A NULL pointer without a pending exception is still reported as an error. */
+            PyErr_SetString(PyExc_RuntimeError, "invalid vtable found for imported type");
+        }
+        Py_DECREF(capsule);
+    }
+    return vtable_ptr;
+}
+
+/* MergeVTables: verify that the vtable of every secondary base (bases[1:]) that has one also occurs in the vtable chain of type->tp_base. */
+  #if !CYTHON_COMPILING_IN_LIMITED_API
+static int __Pyx_MergeVtables(PyTypeObject *type) {  /* returns 0 if compatible; -1 with TypeError on a conflict or MemoryError if the scratch array cannot be allocated */
+    int i;
+    void** base_vtables;
+    __Pyx_TypeName tp_base_name;
+    __Pyx_TypeName base_name;
+    void* unknown = (void*)-1;  /* sentinel: this entry of the tp_base chain has not been looked up yet */
+    PyObject* bases = type->tp_bases;
+    int base_depth = 0;
+    PyTypeObject* ancestor;
+    for (ancestor = type->tp_base; ancestor; ancestor = ancestor->tp_base)
+        base_depth += 1;  /* length of the single-inheritance chain */
+    base_vtables = (void**) malloc(sizeof(void*) * (size_t)(base_depth + 1));
+    if (unlikely(!base_vtables)) {  /* allocation can fail: raise MemoryError instead of writing through NULL */
+        PyErr_NoMemory();
+        return -1;
+    }
+    base_vtables[0] = unknown;
+    for (i = 1; i < PyTuple_GET_SIZE(bases); i++) {
+        void* base_vtable = __Pyx_GetVtable(((PyTypeObject*)PyTuple_GET_ITEM(bases, i)));
+        if (base_vtable != NULL) {
+            int j;
+            PyTypeObject* base = type->tp_base;
+            for (j = 0; j < base_depth; j++) {
+                if (base_vtables[j] == unknown) {  /* fill the chain lazily, one level at a time */
+                    base_vtables[j] = __Pyx_GetVtable(base);
+                    base_vtables[j + 1] = unknown;
+                }
+                if (base_vtables[j] == base_vtable) {
+                    break;
+                } else if (base_vtables[j] == NULL) {  /* a chain level without a vtable was reached before a match */
+                    goto bad;
+                }
+                base = base->tp_base;
+            }
+        }
+    }
+    PyErr_Clear();  /* discard errors left behind by __Pyx_GetVtable() lookups that found no vtable */
+    free(base_vtables);
+    return 0;
+bad:
+    tp_base_name = __Pyx_PyType_GetName(type->tp_base);
+    base_name = __Pyx_PyType_GetName((PyTypeObject*)PyTuple_GET_ITEM(bases, i));
+    PyErr_Format(PyExc_TypeError,
+        "multiple bases have vtable conflict: '" __Pyx_FMT_TYPENAME "' and '" __Pyx_FMT_TYPENAME "'", tp_base_name, base_name);
+    __Pyx_DECREF_TypeName(tp_base_name);
+    __Pyx_DECREF_TypeName(base_name);
+    free(base_vtables);
+    return -1;
+}
+#endif
+
+/* SetupReduce */
+  #if !CYTHON_COMPILING_IN_LIMITED_API
+static int __Pyx_setup_reduce_is_named(PyObject* meth, PyObject* name) {
+  /* Returns 1 if the name attribute fetched from `meth` compares equal to `name`, else 0; never leaves an error set. */
+  int matches = -1;  /* stays -1 when the attribute is missing */
+  PyObject *meth_name;
+  meth_name = __Pyx_PyObject_GetAttrStrNoError(meth, __pyx_n_s_name_2);
+  if (likely(meth_name != NULL)) {
+      matches = PyObject_RichCompareBool(meth_name, name, Py_EQ);
+  }
+  if (unlikely(matches < 0)) {
+      /* Missing attribute or failed comparison: clear any error and report "no match". */
+      PyErr_Clear();
+      matches = 0;
+  }
+  Py_XDECREF(meth_name);
+  return matches;
+}
+static int __Pyx_setup_reduce(PyObject* type_obj) {  /* installs the type's *_cython reduce/setstate entries under the plain reduce/setstate keys of its tp_dict unless the type customises pickling; returns 0, or -1 with an exception set */
+    int ret = 0;
+    PyObject *object_reduce = NULL;
+    PyObject *object_getstate = NULL;
+    PyObject *object_reduce_ex = NULL;
+    PyObject *reduce = NULL;
+    PyObject *reduce_ex = NULL;
+    PyObject *reduce_cython = NULL;
+    PyObject *setstate = NULL;
+    PyObject *setstate_cython = NULL;
+    PyObject *getstate = NULL;
+#if CYTHON_USE_PYTYPE_LOOKUP
+    getstate = _PyType_Lookup((PyTypeObject*)type_obj, __pyx_n_s_getstate);
+#else
+    getstate = __Pyx_PyObject_GetAttrStrNoError(type_obj, __pyx_n_s_getstate);
+    if (!getstate && PyErr_Occurred()) {
+        goto __PYX_BAD;
+    }
+#endif
+    if (getstate) {
+#if CYTHON_USE_PYTYPE_LOOKUP
+        object_getstate = _PyType_Lookup(&PyBaseObject_Type, __pyx_n_s_getstate);
+#else
+        object_getstate = __Pyx_PyObject_GetAttrStrNoError((PyObject*)&PyBaseObject_Type, __pyx_n_s_getstate);
+        if (!object_getstate && PyErr_Occurred()) {
+            goto __PYX_BAD;
+        }
+#endif
+        if (object_getstate != getstate) {  /* the type's getstate differs from the base object type's: leave everything untouched */
+            goto __PYX_GOOD;
+        }
+    }
+#if CYTHON_USE_PYTYPE_LOOKUP
+    object_reduce_ex = _PyType_Lookup(&PyBaseObject_Type, __pyx_n_s_reduce_ex); if (!object_reduce_ex) goto __PYX_BAD;
+#else
+    object_reduce_ex = __Pyx_PyObject_GetAttrStr((PyObject*)&PyBaseObject_Type, __pyx_n_s_reduce_ex); if (!object_reduce_ex) goto __PYX_BAD;
+#endif
+    reduce_ex = __Pyx_PyObject_GetAttrStr(type_obj, __pyx_n_s_reduce_ex); if (unlikely(!reduce_ex)) goto __PYX_BAD;
+    if (reduce_ex == object_reduce_ex) {  /* proceed only if reduce_ex is still the base object type's */
+#if CYTHON_USE_PYTYPE_LOOKUP
+        object_reduce = _PyType_Lookup(&PyBaseObject_Type, __pyx_n_s_reduce); if (!object_reduce) goto __PYX_BAD;
+#else
+        object_reduce = __Pyx_PyObject_GetAttrStr((PyObject*)&PyBaseObject_Type, __pyx_n_s_reduce); if (!object_reduce) goto __PYX_BAD;
+#endif
+        reduce = __Pyx_PyObject_GetAttrStr(type_obj, __pyx_n_s_reduce); if (unlikely(!reduce)) goto __PYX_BAD;
+        if (reduce == object_reduce || __Pyx_setup_reduce_is_named(reduce, __pyx_n_s_reduce_cython)) {  /* reduce is the base object type's, or is named like the reduce_cython entry */
+            reduce_cython = __Pyx_PyObject_GetAttrStrNoError(type_obj, __pyx_n_s_reduce_cython);
+            if (likely(reduce_cython)) {
+                ret = PyDict_SetItem(((PyTypeObject*)type_obj)->tp_dict, __pyx_n_s_reduce, reduce_cython); if (unlikely(ret < 0)) goto __PYX_BAD;  /* install under the reduce key ... */
+                ret = PyDict_DelItem(((PyTypeObject*)type_obj)->tp_dict, __pyx_n_s_reduce_cython); if (unlikely(ret < 0)) goto __PYX_BAD;  /* ... and remove the reduce_cython key */
+            } else if (reduce == object_reduce || PyErr_Occurred()) {
+                goto __PYX_BAD;
+            }
+            setstate = __Pyx_PyObject_GetAttrStrNoError(type_obj, __pyx_n_s_setstate);
+            if (!setstate) PyErr_Clear();
+            if (!setstate || __Pyx_setup_reduce_is_named(setstate, __pyx_n_s_setstate_cython)) {  /* same treatment for setstate */
+                setstate_cython = __Pyx_PyObject_GetAttrStrNoError(type_obj, __pyx_n_s_setstate_cython);
+                if (likely(setstate_cython)) {
+                    ret = PyDict_SetItem(((PyTypeObject*)type_obj)->tp_dict, __pyx_n_s_setstate, setstate_cython); if (unlikely(ret < 0)) goto __PYX_BAD;
+                    ret = PyDict_DelItem(((PyTypeObject*)type_obj)->tp_dict, __pyx_n_s_setstate_cython); if (unlikely(ret < 0)) goto __PYX_BAD;
+                } else if (!setstate || PyErr_Occurred()) {
+                    goto __PYX_BAD;
+                }
+            }
+            PyType_Modified((PyTypeObject*)type_obj);  /* the type dict was changed */
+        }
+    }
+    goto __PYX_GOOD;
+__PYX_BAD:
+    if (!PyErr_Occurred()) {  /* make sure a failure always carries an exception */
+        __Pyx_TypeName type_obj_name =
+            __Pyx_PyType_GetName((PyTypeObject*)type_obj);
+        PyErr_Format(PyExc_RuntimeError,
+            "Unable to initialize pickling for " __Pyx_FMT_TYPENAME, type_obj_name);
+        __Pyx_DECREF_TypeName(type_obj_name);
+    }
+    ret = -1;
+__PYX_GOOD:
+#if !CYTHON_USE_PYTYPE_LOOKUP
+    Py_XDECREF(object_reduce);  /* these four are released only when they were fetched via attribute lookup rather than _PyType_Lookup() */
+    Py_XDECREF(object_reduce_ex);
+    Py_XDECREF(object_getstate);
+    Py_XDECREF(getstate);
+#endif
+    Py_XDECREF(reduce);
+    Py_XDECREF(reduce_ex);
+    Py_XDECREF(reduce_cython);
+    Py_XDECREF(setstate);
+    Py_XDECREF(setstate_cython);
+    return ret;
+}
+#endif
+
+/* TypeImport: fetch `class_name` from `module`, check that it is a type and that its runtime size is compatible with the compile-time `size`; returns the type object, or NULL with an exception set. */
+  #ifndef __PYX_HAVE_RT_ImportType_3_0_12
+#define __PYX_HAVE_RT_ImportType_3_0_12
+static PyTypeObject *__Pyx_ImportType_3_0_12(PyObject *module, const char *module_name, const char *class_name,
+    size_t size, size_t alignment, enum __Pyx_ImportType_CheckSize_3_0_12 check_size)
+{
+    PyObject *result = 0;
+    char warning[200];
+    Py_ssize_t basicsize;
+    Py_ssize_t itemsize;
+#if CYTHON_COMPILING_IN_LIMITED_API
+    PyObject *py_basicsize;
+    PyObject *py_itemsize;
+#endif
+    result = PyObject_GetAttrString(module, class_name);
+    if (!result)
+        goto bad;
+    if (!PyType_Check(result)) {
+        PyErr_Format(PyExc_TypeError,
+            "%.200s.%.200s is not a type object",
+            module_name, class_name);
+        goto bad;
+    }
+#if !CYTHON_COMPILING_IN_LIMITED_API
+    basicsize = ((PyTypeObject *)result)->tp_basicsize;
+    itemsize = ((PyTypeObject *)result)->tp_itemsize;
+#else
+    py_basicsize = PyObject_GetAttrString(result, "__basicsize__");  /* Limited API: read the sizes through attributes instead of struct fields */
+    if (!py_basicsize)
+        goto bad;
+    basicsize = PyLong_AsSsize_t(py_basicsize);
+    Py_DECREF(py_basicsize);
+    py_basicsize = 0;
+    if (basicsize == (Py_ssize_t)-1 && PyErr_Occurred())
+        goto bad;
+    py_itemsize = PyObject_GetAttrString(result, "__itemsize__");
+    if (!py_itemsize)
+        goto bad;
+    itemsize = PyLong_AsSsize_t(py_itemsize);
+    Py_DECREF(py_itemsize);
+    py_itemsize = 0;
+    if (itemsize == (Py_ssize_t)-1 && PyErr_Occurred())
+        goto bad;
+#endif
+    if (itemsize) {  /* non-zero item size: raise itemsize to at least `alignment` (or size % alignment when that is non-zero) before the checks below */
+        if (size % alignment) {
+            alignment = size % alignment;
+        }
+        if (itemsize < (Py_ssize_t)alignment)
+            itemsize = (Py_ssize_t)alignment;
+    }
+    if ((size_t)(basicsize + itemsize) < size) {  /* runtime object is smaller than expected: an error in every check mode */
+        PyErr_Format(PyExc_ValueError,
+            "%.200s.%.200s size changed, may indicate binary incompatibility. "
+            "Expected %zd from C header, got %zd from PyObject",
+            module_name, class_name, size, basicsize+itemsize);
+        goto bad;
+    }
+    if (check_size == __Pyx_ImportType_CheckSize_Error_3_0_12 &&
+            ((size_t)basicsize > size || (size_t)(basicsize + itemsize) < size)) {  /* Error mode: a larger runtime object is rejected as well */
+        PyErr_Format(PyExc_ValueError,
+            "%.200s.%.200s size changed, may indicate binary incompatibility. "
+            "Expected %zd from C header, got %zd-%zd from PyObject",
+            module_name, class_name, size, basicsize, basicsize+itemsize);
+        goto bad;
+    }
+    else if (check_size == __Pyx_ImportType_CheckSize_Warn_3_0_12 && (size_t)basicsize > size) {  /* Warn mode: a larger runtime object only triggers a warning */
+        PyOS_snprintf(warning, sizeof(warning),
+            "%s.%s size changed, may indicate binary incompatibility. "
+            "Expected %zd from C header, got %zd from PyObject",
+            module_name, class_name, size, basicsize);
+        if (PyErr_WarnEx(NULL, warning, 0) < 0) goto bad;  /* fails if the warning is turned into an exception */
+    }
+    return (PyTypeObject *)result;
+bad:
+    Py_XDECREF(result);
+    return NULL;
+}
+#endif
+
+/* FetchSharedCythonModule: return whatever __Pyx_PyImport_AddModuleRef() yields for the module named by __PYX_ABI_MODULE_NAME. */
+  static PyObject *__Pyx_FetchSharedCythonABIModule(void) {
+    return __Pyx_PyImport_AddModuleRef((char*) __PYX_ABI_MODULE_NAME);  /* callers in this file release the result with Py_DECREF */
+}
+
+/* FetchCommonType */
+  static int __Pyx_VerifyCachedType(PyObject *cached_type,
+                               const char *name,
+                               Py_ssize_t basicsize,
+                               Py_ssize_t expected_basicsize) {
+    /* A cached shared type is accepted only if it is a type object of the expected basic size. */
+    int status = -1;
+    if (!PyType_Check(cached_type)) {
+        PyErr_Format(PyExc_TypeError,
+            "Shared Cython type %.200s is not a type object", name);
+    } else if (basicsize != expected_basicsize) {
+        PyErr_Format(PyExc_TypeError,
+            "Shared Cython type %.200s has the wrong size, try recompiling", name);
+    } else {
+        status = 0;
+    }
+    return status;
+}
+#if !CYTHON_USE_TYPE_SPECS
+/* Return a new reference to the shared copy of `type` stored in the Cython ABI module under the last dotted component of type->tp_name; if none exists yet, ready `type` and register it. Returns NULL with an exception set on failure. */
+static PyTypeObject* __Pyx_FetchCommonType(PyTypeObject* type) {
+    PyObject* abi_module;
+    const char* object_name;
+    PyTypeObject *cached_type = NULL;
+    Py_ssize_t cached_basicsize;
+    abi_module = __Pyx_FetchSharedCythonABIModule();
+    if (!abi_module) return NULL;
+    object_name = strrchr(type->tp_name, '.');
+    object_name = object_name ? object_name+1 : type->tp_name;
+    cached_type = (PyTypeObject*) PyObject_GetAttrString(abi_module, object_name);
+    if (cached_type) {
+        /* Read tp_basicsize only once the object is known to be a type; the -1 placeholder is never compared because non-types are rejected first. */
+        cached_basicsize = likely(PyType_Check((PyObject *)cached_type)) ? cached_type->tp_basicsize : -1;
+        if (__Pyx_VerifyCachedType((PyObject *)cached_type, object_name,
+                                   cached_basicsize, type->tp_basicsize) < 0) {
+            goto bad;
+        }
+        goto done;
+    }
+    if (!PyErr_ExceptionMatches(PyExc_AttributeError)) goto bad;  /* only "attribute missing" means "not registered yet" */
+    PyErr_Clear();
+    if (PyType_Ready(type) < 0) goto bad;
+    if (PyObject_SetAttrString(abi_module, object_name, (PyObject *)type) < 0) goto bad;
+    Py_INCREF(type);
+    cached_type = type;
+done:
+    Py_DECREF(abi_module);
+    return cached_type;
+bad:
+    Py_XDECREF(cached_type);
+    cached_type = NULL;
+    goto done;
+}
+#else
+/* Type-spec variant of the shared-type lookup.
+ *
+ * Looks up the last dotted component of spec->name on the Cython ABI module.
+ * An existing attribute is verified against spec->basicsize and returned;
+ * when the lookup fails with AttributeError, a new type is created from
+ * `spec`/`bases`, fixed up, stored on the ABI module and returned.
+ * `module` is unused.  Returns NULL with an exception set on failure. */
+static PyTypeObject *__Pyx_FetchCommonTypeFromSpec(PyObject *module, PyType_Spec *spec, PyObject *bases) {
+    PyObject *abi_module, *cached_type = NULL;
+    const char* object_name = strrchr(spec->name, '.');
+    object_name = object_name ? object_name+1 : spec->name;
+    abi_module = __Pyx_FetchSharedCythonABIModule();
+    if (!abi_module) return NULL;
+    cached_type = PyObject_GetAttrString(abi_module, object_name);
+    if (cached_type) {
+        Py_ssize_t basicsize;
+#if CYTHON_COMPILING_IN_LIMITED_API
+        /* The struct field is not accessible here, so read __basicsize__ instead. */
+        PyObject *py_basicsize;
+        py_basicsize = PyObject_GetAttrString(cached_type, "__basicsize__");
+        if (unlikely(!py_basicsize)) goto bad;
+        basicsize = PyLong_AsSsize_t(py_basicsize);
+        Py_DECREF(py_basicsize);
+        py_basicsize = 0;
+        if (unlikely(basicsize == (Py_ssize_t)-1) && PyErr_Occurred()) goto bad;
+#else
+        /* -1 for a non-type object; __Pyx_VerifyCachedType rejects it below. */
+        basicsize = likely(PyType_Check(cached_type)) ? ((PyTypeObject*) cached_type)->tp_basicsize : -1;
+#endif
+        if (__Pyx_VerifyCachedType(
+              cached_type,
+              object_name,
+              basicsize,
+              spec->basicsize) < 0) {
+            goto bad;
+        }
+        goto done;
+    }
+    /* Only a missing attribute means "not registered yet"; other errors propagate. */
+    if (!PyErr_ExceptionMatches(PyExc_AttributeError)) goto bad;
+    PyErr_Clear();
+    CYTHON_UNUSED_VAR(module);
+    cached_type = __Pyx_PyType_FromModuleAndSpec(abi_module, spec, bases);
+    if (unlikely(!cached_type)) goto bad;
+    if (unlikely(__Pyx_fix_up_extension_type_from_spec(spec, (PyTypeObject *) cached_type) < 0)) goto bad;
+    if (PyObject_SetAttrString(abi_module, object_name, cached_type) < 0) goto bad;
+done:
+    Py_DECREF(abi_module);
+    assert(cached_type == NULL || PyType_Check(cached_type));
+    return (PyTypeObject *) cached_type;
+bad:
+    /* Drop whatever was fetched or created and fall through to the shared exit path. */
+    Py_XDECREF(cached_type);
+    cached_type = NULL;
+    goto done;
+}
+#endif
+
+/* PyVectorcallFastCallDict */
+  #if CYTHON_METH_FASTCALL
+/* Call vectorcall function `vc` with keyword arguments supplied as a dict.
+ *
+ * Builds a temporary argument array holding the `nargs` positional arguments
+ * followed by the dict values, plus a tuple of the dict keys, and passes both
+ * to `vc`.  Raises TypeError if any key's type lacks
+ * Py_TPFLAGS_UNICODE_SUBCLASS.  Returns the call result, or NULL on error. */
+static PyObject *__Pyx_PyVectorcall_FastCallDict_kw(PyObject *func, __pyx_vectorcallfunc vc, PyObject *const *args, size_t nargs, PyObject *kw)
+{
+    PyObject *res = NULL;
+    PyObject *kwnames;
+    PyObject **newargs;
+    PyObject **kwvalues;
+    Py_ssize_t i, pos;
+    size_t j;
+    PyObject *key, *value;
+    unsigned long keys_are_strings;
+    Py_ssize_t nkw = PyDict_GET_SIZE(kw);
+    newargs = (PyObject **)PyMem_Malloc((nargs + (size_t)nkw) * sizeof(args[0]));
+    if (unlikely(newargs == NULL)) {
+        PyErr_NoMemory();
+        return NULL;
+    }
+    /* Positional arguments are copied as borrowed pointers. */
+    for (j = 0; j < nargs; j++) newargs[j] = args[j];
+    kwnames = PyTuple_New(nkw);
+    if (unlikely(kwnames == NULL)) {
+        PyMem_Free(newargs);
+        return NULL;
+    }
+    kwvalues = newargs + nargs;
+    pos = i = 0;
+    /* AND-ing the flag across all keys leaves it set only if every key is a str (sub)class. */
+    keys_are_strings = Py_TPFLAGS_UNICODE_SUBCLASS;
+    while (PyDict_Next(kw, &pos, &key, &value)) {
+        keys_are_strings &= Py_TYPE(key)->tp_flags;
+        Py_INCREF(key);
+        Py_INCREF(value);
+        PyTuple_SET_ITEM(kwnames, i, key);
+        kwvalues[i] = value;
+        i++;
+    }
+    if (unlikely(!keys_are_strings)) {
+        PyErr_SetString(PyExc_TypeError, "keywords must be strings");
+        goto cleanup;
+    }
+    res = vc(func, newargs, nargs, kwnames);
+cleanup:
+    /* Release the references taken on keys (via the tuple) and on values. */
+    Py_DECREF(kwnames);
+    for (i = 0; i < nkw; i++)
+        Py_DECREF(kwvalues[i]);
+    PyMem_Free(newargs);
+    return res;
+}
+/* Dispatch a vectorcall: take the dict-unpacking slow path only when a
+ * non-empty keyword dict is given, otherwise call `vc` without kwnames. */
+static CYTHON_INLINE PyObject *__Pyx_PyVectorcall_FastCallDict(PyObject *func, __pyx_vectorcallfunc vc, PyObject *const *args, size_t nargs, PyObject *kw)
+{
+    if (unlikely(kw != NULL) && PyDict_GET_SIZE(kw) != 0) {
+        return __Pyx_PyVectorcall_FastCallDict_kw(func, vc, args, nargs, kw);
+    }
+    return vc(func, args, nargs, NULL);
+}
+#endif
+
+/* CythonFunctionShared */
+  #if CYTHON_COMPILING_IN_LIMITED_API
+/* Limited-API variant: report whether `func` (a CyFunction or a builtin
+ * C function) is implemented by the C function pointer `cfunc`. */
+static CYTHON_INLINE int __Pyx__IsSameCyOrCFunction(PyObject *func, void *cfunc) {
+    PyCFunction implementation;
+    if (__Pyx_CyFunction_Check(func)) {
+        implementation = PyCFunction_GetFunction(((__pyx_CyFunctionObject*)func)->func);
+    } else if (PyCFunction_Check(func)) {
+        implementation = PyCFunction_GetFunction(func);
+    } else {
+        return 0;
+    }
+    return implementation == (PyCFunction) cfunc;
+}
+#else
+/* Report whether `func` is a CyFunction / C function object whose
+ * underlying C function pointer equals `cfunc`. */
+static CYTHON_INLINE int __Pyx__IsSameCyOrCFunction(PyObject *func, void *cfunc) {
+    if (!__Pyx_CyOrPyCFunction_Check(func)) {
+        return 0;
+    }
+    return __Pyx_CyOrPyCFunction_GET_FUNCTION(func) == (PyCFunction) cfunc;
+}
+#endif
+/* Replace the class object stored on CyFunction `f` with a new reference to
+ * `classobj` (or NULL when `classobj` is NULL).  The storage slot used
+ * depends on the Python version / limited-API setting. */
+static CYTHON_INLINE void __Pyx__CyFunction_SetClassObj(__pyx_CyFunctionObject* f, PyObject* classobj) {
+#if PY_VERSION_HEX < 0x030900B1 || CYTHON_COMPILING_IN_LIMITED_API
+    __Pyx_Py_XDECREF_SET(
+        __Pyx_CyFunction_GetClassObj(f),
+            ((classobj) ? __Pyx_NewRef(classobj) : NULL));
+#else
+    /* Here the class is stored in the PyCMethodObject's mm_class field. */
+    __Pyx_Py_XDECREF_SET(
+        ((PyCMethodObject *) (f))->mm_class,
+        (PyTypeObject*)((classobj) ? __Pyx_NewRef(classobj) : NULL));
+#endif
+}
+/* Getter for func_doc / __doc__.
+ *
+ * The docstring object is created on first access and cached in
+ * op->func_doc.  When the method definition has no ml_doc (non-limited API),
+ * None is returned and nothing is cached.  Returns a new reference, or NULL
+ * with an exception set. */
+static PyObject *
+__Pyx_CyFunction_get_doc(__pyx_CyFunctionObject *op, void *closure)
+{
+    CYTHON_UNUSED_VAR(closure);
+    if (unlikely(op->func_doc == NULL)) {
+#if CYTHON_COMPILING_IN_LIMITED_API
+        op->func_doc = PyObject_GetAttrString(op->func, "__doc__");
+        if (unlikely(!op->func_doc)) return NULL;
+#else
+        if (((PyCFunctionObject*)op)->m_ml->ml_doc) {
+#if PY_MAJOR_VERSION >= 3
+            op->func_doc = PyUnicode_FromString(((PyCFunctionObject*)op)->m_ml->ml_doc);
+#else
+            op->func_doc = PyString_FromString(((PyCFunctionObject*)op)->m_ml->ml_doc);
+#endif
+            if (unlikely(op->func_doc == NULL))
+                return NULL;
+        } else {
+            Py_INCREF(Py_None);
+            return Py_None;
+        }
+#endif
+    }
+    Py_INCREF(op->func_doc);
+    return op->func_doc;
+}
+/* Setter for func_doc / __doc__: stores a new reference to `value`;
+ * deleting the attribute (value == NULL) stores None.  Always returns 0. */
+static int
+__Pyx_CyFunction_set_doc(__pyx_CyFunctionObject *op, PyObject *value, void *context)
+{
+    PyObject *new_doc = (value != NULL) ? value : Py_None;
+    CYTHON_UNUSED_VAR(context);
+    Py_INCREF(new_doc);
+    __Pyx_Py_XDECREF_SET(op->func_doc, new_doc);
+    return 0;
+}
+/* Getter for func_name / __name__.
+ *
+ * The name object is created on first access (from op->func's __name__ in
+ * the limited API, otherwise interned from the method definition's ml_name)
+ * and cached in op->func_name.  Returns a new reference, or NULL with an
+ * exception set. */
+static PyObject *
+__Pyx_CyFunction_get_name(__pyx_CyFunctionObject *op, void *context)
+{
+    CYTHON_UNUSED_VAR(context);
+    if (unlikely(op->func_name == NULL)) {
+#if CYTHON_COMPILING_IN_LIMITED_API
+        op->func_name = PyObject_GetAttrString(op->func, "__name__");
+#elif PY_MAJOR_VERSION >= 3
+        op->func_name = PyUnicode_InternFromString(((PyCFunctionObject*)op)->m_ml->ml_name);
+#else
+        op->func_name = PyString_InternFromString(((PyCFunctionObject*)op)->m_ml->ml_name);
+#endif
+        if (unlikely(op->func_name == NULL))
+            return NULL;
+    }
+    Py_INCREF(op->func_name);
+    return op->func_name;
+}
+/* Setter for func_name / __name__.
+ *
+ * Rejects deletion (value == NULL) and non-string values with TypeError,
+ * returning -1; otherwise stores a new reference to `value` and returns 0. */
+static int
+__Pyx_CyFunction_set_name(__pyx_CyFunctionObject *op, PyObject *value, void *context)
+{
+    CYTHON_UNUSED_VAR(context);
+#if PY_MAJOR_VERSION >= 3
+    if (unlikely(value == NULL || !PyUnicode_Check(value)))
+#else
+    if (unlikely(value == NULL || !PyString_Check(value)))
+#endif
+    {
+        PyErr_SetString(PyExc_TypeError,
+                        "__name__ must be set to a string object");
+        return -1;
+    }
+    Py_INCREF(value);
+    __Pyx_Py_XDECREF_SET(op->func_name, value);
+    return 0;
+}
+/* Getter for __qualname__: returns a new reference to op->func_qualname. */
+static PyObject *
+__Pyx_CyFunction_get_qualname(__pyx_CyFunctionObject *op, void *context)
+{
+    PyObject *qualname = op->func_qualname;
+    CYTHON_UNUSED_VAR(context);
+    Py_INCREF(qualname);
+    return qualname;
+}
+/* Setter for __qualname__.
+ *
+ * Rejects deletion (value == NULL) and non-string values with TypeError,
+ * returning -1; otherwise stores a new reference to `value` and returns 0. */
+static int
+__Pyx_CyFunction_set_qualname(__pyx_CyFunctionObject *op, PyObject *value, void *context)
+{
+    CYTHON_UNUSED_VAR(context);
+#if PY_MAJOR_VERSION >= 3
+    if (unlikely(value == NULL || !PyUnicode_Check(value)))
+#else
+    if (unlikely(value == NULL || !PyString_Check(value)))
+#endif
+    {
+        PyErr_SetString(PyExc_TypeError,
+                        "__qualname__ must be set to a string object");
+        return -1;
+    }
+    Py_INCREF(value);
+    __Pyx_Py_XDECREF_SET(op->func_qualname, value);
+    return 0;
+}
+/* Getter for func_dict / __dict__: creates the dict on first access,
+ * caches it in op->func_dict and returns a new reference (NULL on
+ * allocation failure). */
+static PyObject *
+__Pyx_CyFunction_get_dict(__pyx_CyFunctionObject *op, void *context)
+{
+    PyObject *attrs = op->func_dict;
+    CYTHON_UNUSED_VAR(context);
+    if (unlikely(attrs == NULL)) {
+        attrs = PyDict_New();
+        op->func_dict = attrs;
+        if (unlikely(attrs == NULL)) {
+            return NULL;
+        }
+    }
+    Py_INCREF(attrs);
+    return attrs;
+}
+/* Setter for func_dict / __dict__: refuses deletion and non-dict values
+ * with TypeError (-1); otherwise stores a new reference and returns 0. */
+static int
+__Pyx_CyFunction_set_dict(__pyx_CyFunctionObject *op, PyObject *value, void *context)
+{
+    const char *problem = NULL;
+    CYTHON_UNUSED_VAR(context);
+    if (unlikely(value == NULL)) {
+        problem = "function's dictionary may not be deleted";
+    } else if (unlikely(!PyDict_Check(value))) {
+        problem = "setting function's dictionary to a non-dict";
+    }
+    if (unlikely(problem != NULL)) {
+        PyErr_SetString(PyExc_TypeError, problem);
+        return -1;
+    }
+    Py_INCREF(value);
+    __Pyx_Py_XDECREF_SET(op->func_dict, value);
+    return 0;
+}
+/* Getter for func_globals / __globals__: returns a new reference to
+ * op->func_globals. */
+static PyObject *
+__Pyx_CyFunction_get_globals(__pyx_CyFunctionObject *op, void *context)
+{
+    PyObject *globals_obj = op->func_globals;
+    CYTHON_UNUSED_VAR(context);
+    Py_INCREF(globals_obj);
+    return globals_obj;
+}
+/* Getter for func_closure / __closure__: always reports None. */
+static PyObject *
+__Pyx_CyFunction_get_closure(__pyx_CyFunctionObject *op, void *context)
+{
+    CYTHON_UNUSED_VAR(context);
+    CYTHON_UNUSED_VAR(op);
+    Py_RETURN_NONE;
+}
+/* Getter for func_code / __code__: returns a new reference to the stored
+ * code object, or None when none is set. */
+static PyObject *
+__Pyx_CyFunction_get_code(__pyx_CyFunctionObject *op, void *context)
+{
+    PyObject *code = op->func_code;
+    CYTHON_UNUSED_VAR(context);
+    if (code == NULL) {
+        code = Py_None;
+    }
+    Py_INCREF(code);
+    return code;
+}
+/* Populate op->defaults_tuple and op->defaults_kwdict by calling
+ * op->defaults_getter.
+ *
+ * Items 0 and 1 of the getter's result are stored (with their own
+ * references) as the positional defaults and the keyword defaults.
+ * Returns 0 on success, -1 with an exception set on failure. */
+static int
+__Pyx_CyFunction_init_defaults(__pyx_CyFunctionObject *op) {
+    int result = 0;
+    PyObject *res = op->defaults_getter((PyObject *) op);
+    if (unlikely(!res))
+        return -1;
+    #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS
+    /* Borrowed items: take an explicit reference for each stored field. */
+    op->defaults_tuple = PyTuple_GET_ITEM(res, 0);
+    Py_INCREF(op->defaults_tuple);
+    op->defaults_kwdict = PyTuple_GET_ITEM(res, 1);
+    Py_INCREF(op->defaults_kwdict);
+    #else
+    op->defaults_tuple = __Pyx_PySequence_ITEM(res, 0);
+    if (unlikely(!op->defaults_tuple)) result = -1;
+    else {
+        op->defaults_kwdict = __Pyx_PySequence_ITEM(res, 1);
+        if (unlikely(!op->defaults_kwdict)) result = -1;
+    }
+    #endif
+    Py_DECREF(res);
+    return result;
+}
+/* Setter for func_defaults / __defaults__.
+ *
+ * Accepts a tuple or None; deletion (value == NULL) stores None.  Any other
+ * value raises TypeError.  A RuntimeWarning is issued because the stored
+ * value does not affect the defaults used in calls.  If that warning is
+ * raised as an exception, the setter fails with -1 and leaves the stored
+ * value untouched instead of returning success with an error pending.
+ * Returns 0 on success. */
+static int
+__Pyx_CyFunction_set_defaults(__pyx_CyFunctionObject *op, PyObject* value, void *context) {
+    CYTHON_UNUSED_VAR(context);
+    if (!value) {
+        value = Py_None;
+    } else if (unlikely(value != Py_None && !PyTuple_Check(value))) {
+        PyErr_SetString(PyExc_TypeError,
+                        "__defaults__ must be set to a tuple object");
+        return -1;
+    }
+    /* PyErr_WarnEx returns -1 with an exception set when the warning is
+     * turned into an error; propagate that to the caller. */
+    if (unlikely(PyErr_WarnEx(PyExc_RuntimeWarning, "changes to cyfunction.__defaults__ will not "
+                 "currently affect the values used in function calls", 1) < 0)) {
+        return -1;
+    }
+    Py_INCREF(value);
+    __Pyx_Py_XDECREF_SET(op->defaults_tuple, value);
+    return 0;
+}
+/* Getter for func_defaults / __defaults__.
+ *
+ * Returns a new reference to the cached tuple; if nothing is cached it is
+ * computed through the defaults getter when one exists, otherwise None is
+ * returned.  Returns NULL with an exception set if the computation fails. */
+static PyObject *
+__Pyx_CyFunction_get_defaults(__pyx_CyFunctionObject *op, void *context) {
+    PyObject *defaults;
+    CYTHON_UNUSED_VAR(context);
+    defaults = op->defaults_tuple;
+    if (unlikely(defaults == NULL)) {
+        if (!op->defaults_getter) {
+            defaults = Py_None;
+        } else if (unlikely(__Pyx_CyFunction_init_defaults(op) < 0)) {
+            return NULL;
+        } else {
+            defaults = op->defaults_tuple;
+        }
+    }
+    Py_INCREF(defaults);
+    return defaults;
+}
+/* Setter for __kwdefaults__.
+ *
+ * Accepts a dict or None; deletion (value == NULL) stores None.  Any other
+ * value raises TypeError.  A RuntimeWarning is issued because the stored
+ * value does not affect the defaults used in calls.  If that warning is
+ * raised as an exception, the setter fails with -1 and leaves the stored
+ * value untouched instead of returning success with an error pending.
+ * Returns 0 on success. */
+static int
+__Pyx_CyFunction_set_kwdefaults(__pyx_CyFunctionObject *op, PyObject* value, void *context) {
+    CYTHON_UNUSED_VAR(context);
+    if (!value) {
+        value = Py_None;
+    } else if (unlikely(value != Py_None && !PyDict_Check(value))) {
+        PyErr_SetString(PyExc_TypeError,
+                        "__kwdefaults__ must be set to a dict object");
+        return -1;
+    }
+    /* PyErr_WarnEx returns -1 with an exception set when the warning is
+     * turned into an error; propagate that to the caller. */
+    if (unlikely(PyErr_WarnEx(PyExc_RuntimeWarning, "changes to cyfunction.__kwdefaults__ will not "
+                 "currently affect the values used in function calls", 1) < 0)) {
+        return -1;
+    }
+    Py_INCREF(value);
+    __Pyx_Py_XDECREF_SET(op->defaults_kwdict, value);
+    return 0;
+}
+/* Getter for __kwdefaults__.
+ *
+ * Returns a new reference to the cached dict; if nothing is cached it is
+ * computed through the defaults getter when one exists, otherwise None is
+ * returned.  Returns NULL with an exception set if the computation fails. */
+static PyObject *
+__Pyx_CyFunction_get_kwdefaults(__pyx_CyFunctionObject *op, void *context) {
+    PyObject *kwdefaults;
+    CYTHON_UNUSED_VAR(context);
+    kwdefaults = op->defaults_kwdict;
+    if (unlikely(kwdefaults == NULL)) {
+        if (!op->defaults_getter) {
+            kwdefaults = Py_None;
+        } else if (unlikely(__Pyx_CyFunction_init_defaults(op) < 0)) {
+            return NULL;
+        } else {
+            kwdefaults = op->defaults_kwdict;
+        }
+    }
+    Py_INCREF(kwdefaults);
+    return kwdefaults;
+}
+/* Setter for __annotations__: a dict is stored with a new reference;
+ * deletion or None clears the stored value; anything else raises
+ * TypeError and returns -1. */
+static int
+__Pyx_CyFunction_set_annotations(__pyx_CyFunctionObject *op, PyObject* value, void *context) {
+    PyObject *replacement = NULL;
+    CYTHON_UNUSED_VAR(context);
+    if (value && value != Py_None) {
+        if (unlikely(!PyDict_Check(value))) {
+            PyErr_SetString(PyExc_TypeError,
+                            "__annotations__ must be set to a dict object");
+            return -1;
+        }
+        replacement = value;
+        Py_INCREF(replacement);
+    }
+    __Pyx_Py_XDECREF_SET(op->func_annotations, replacement);
+    return 0;
+}
+/* Getter for __annotations__: creates and caches an empty dict on first
+ * access and returns a new reference (NULL on allocation failure). */
+static PyObject *
+__Pyx_CyFunction_get_annotations(__pyx_CyFunctionObject *op, void *context) {
+    PyObject *annotations = op->func_annotations;
+    CYTHON_UNUSED_VAR(context);
+    if (unlikely(annotations == NULL)) {
+        annotations = PyDict_New();
+        if (unlikely(annotations == NULL)) {
+            return NULL;
+        }
+        op->func_annotations = annotations;
+    }
+    Py_INCREF(annotations);
+    return annotations;
+}
+/* Getter for _is_coroutine.
+ *
+ * Returns the cached op->func_is_coroutine when present.  Otherwise, for
+ * functions flagged __Pyx_CYFUNCTION_COROUTINE, it tries to import the
+ * marker attribute named by __pyx_n_s_is_coroutine from the module named by
+ * __pyx_n_s_asyncio_coroutines and caches it; if the import or the attribute
+ * lookup fails, the error is cleared and a bool of the flag is cached instead.
+ * Returns a new reference. */
+static PyObject *
+__Pyx_CyFunction_get_is_coroutine(__pyx_CyFunctionObject *op, void *context) {
+    int is_coroutine;
+    CYTHON_UNUSED_VAR(context);
+    if (op->func_is_coroutine) {
+        return __Pyx_NewRef(op->func_is_coroutine);
+    }
+    is_coroutine = op->flags & __Pyx_CYFUNCTION_COROUTINE;
+#if PY_VERSION_HEX >= 0x03050000
+    if (is_coroutine) {
+        PyObject *module, *fromlist, *marker = __pyx_n_s_is_coroutine;
+        fromlist = PyList_New(1);
+        if (unlikely(!fromlist)) return NULL;
+        /* The list item slot takes over this reference to marker. */
+        Py_INCREF(marker);
+#if CYTHON_ASSUME_SAFE_MACROS
+        PyList_SET_ITEM(fromlist, 0, marker);
+#else
+        if (unlikely(PyList_SetItem(fromlist, 0, marker) < 0)) {
+            Py_DECREF(marker);
+            Py_DECREF(fromlist);
+            return NULL;
+        }
+#endif
+        module = PyImport_ImportModuleLevelObject(__pyx_n_s_asyncio_coroutines, NULL, NULL, fromlist, 0);
+        Py_DECREF(fromlist);
+        if (unlikely(!module)) goto ignore;
+        op->func_is_coroutine = __Pyx_PyObject_GetAttrStr(module, marker);
+        Py_DECREF(module);
+        if (likely(op->func_is_coroutine)) {
+            return __Pyx_NewRef(op->func_is_coroutine);
+        }
+ignore:
+        /* Best effort: fall back to the plain boolean below. */
+        PyErr_Clear();
+    }
+#endif
+    op->func_is_coroutine = __Pyx_PyBool_FromLong(is_coroutine);
+    return __Pyx_NewRef(op->func_is_coroutine);
+}
+#if CYTHON_COMPILING_IN_LIMITED_API
+/* Limited-API getter for __module__: forwards to the wrapped function
+ * object's attribute. */
+static PyObject *
+__Pyx_CyFunction_get_module(__pyx_CyFunctionObject *op, void *context) {
+    PyObject *wrapped = op->func;
+    CYTHON_UNUSED_VAR(context);
+    return PyObject_GetAttrString(wrapped, "__module__");
+}
+/* Limited-API setter for __module__: forwards to the wrapped function
+ * object's attribute and returns that call's status. */
+static int
+__Pyx_CyFunction_set_module(__pyx_CyFunctionObject *op, PyObject* value, void *context) {
+    PyObject *wrapped = op->func;
+    CYTHON_UNUSED_VAR(context);
+    return PyObject_SetAttrString(wrapped, "__module__", value);
+}
+#endif
+/* Attribute table for the CyFunction type: each entry maps an attribute name
+ * to its getter and (optional) setter.  Most attributes appear twice, under
+ * both the legacy func_* name and the dunder name, sharing the same
+ * accessors.  Entries with a 0 setter are read-only. */
+static PyGetSetDef __pyx_CyFunction_getsets[] = {
+    {(char *) "func_doc", (getter)__Pyx_CyFunction_get_doc, (setter)__Pyx_CyFunction_set_doc, 0, 0},
+    {(char *) "__doc__",  (getter)__Pyx_CyFunction_get_doc, (setter)__Pyx_CyFunction_set_doc, 0, 0},
+    {(char *) "func_name", (getter)__Pyx_CyFunction_get_name, (setter)__Pyx_CyFunction_set_name, 0, 0},
+    {(char *) "__name__", (getter)__Pyx_CyFunction_get_name, (setter)__Pyx_CyFunction_set_name, 0, 0},
+    {(char *) "__qualname__", (getter)__Pyx_CyFunction_get_qualname, (setter)__Pyx_CyFunction_set_qualname, 0, 0},
+    {(char *) "func_dict", (getter)__Pyx_CyFunction_get_dict, (setter)__Pyx_CyFunction_set_dict, 0, 0},
+    {(char *) "__dict__", (getter)__Pyx_CyFunction_get_dict, (setter)__Pyx_CyFunction_set_dict, 0, 0},
+    {(char *) "func_globals", (getter)__Pyx_CyFunction_get_globals, 0, 0, 0},
+    {(char *) "__globals__", (getter)__Pyx_CyFunction_get_globals, 0, 0, 0},
+    {(char *) "func_closure", (getter)__Pyx_CyFunction_get_closure, 0, 0, 0},
+    {(char *) "__closure__", (getter)__Pyx_CyFunction_get_closure, 0, 0, 0},
+    {(char *) "func_code", (getter)__Pyx_CyFunction_get_code, 0, 0, 0},
+    {(char *) "__code__", (getter)__Pyx_CyFunction_get_code, 0, 0, 0},
+    {(char *) "func_defaults", (getter)__Pyx_CyFunction_get_defaults, (setter)__Pyx_CyFunction_set_defaults, 0, 0},
+    {(char *) "__defaults__", (getter)__Pyx_CyFunction_get_defaults, (setter)__Pyx_CyFunction_set_defaults, 0, 0},
+    {(char *) "__kwdefaults__", (getter)__Pyx_CyFunction_get_kwdefaults, (setter)__Pyx_CyFunction_set_kwdefaults, 0, 0},
+    {(char *) "__annotations__", (getter)__Pyx_CyFunction_get_annotations, (setter)__Pyx_CyFunction_set_annotations, 0, 0},
+    {(char *) "_is_coroutine", (getter)__Pyx_CyFunction_get_is_coroutine, 0, 0, 0},
+#if CYTHON_COMPILING_IN_LIMITED_API
+    /* In the limited API __module__ is a getset; otherwise it is a member entry. */
+    {"__module__", (getter)__Pyx_CyFunction_get_module, (setter)__Pyx_CyFunction_set_module, 0, 0},
+#endif
+    {0, 0, 0, 0, 0}
+};
+/* Member table for the CyFunction type.  Besides __module__ (non-limited
+ * API only), the type-spec build lists the special offset members
+ * (__dictoffset__, __vectorcalloffset__, __weaklistoffset__) with the struct
+ * offsets of the corresponding fields. */
+static PyMemberDef __pyx_CyFunction_members[] = {
+#if !CYTHON_COMPILING_IN_LIMITED_API
+    {(char *) "__module__", T_OBJECT, offsetof(PyCFunctionObject, m_module), 0, 0},
+#endif
+#if CYTHON_USE_TYPE_SPECS
+    {(char *) "__dictoffset__", T_PYSSIZET, offsetof(__pyx_CyFunctionObject, func_dict), READONLY, 0},
+#if CYTHON_METH_FASTCALL
+#if CYTHON_BACKPORT_VECTORCALL
+    {(char *) "__vectorcalloffset__", T_PYSSIZET, offsetof(__pyx_CyFunctionObject, func_vectorcall), READONLY, 0},
+#else
+#if !CYTHON_COMPILING_IN_LIMITED_API
+    {(char *) "__vectorcalloffset__", T_PYSSIZET, offsetof(PyCFunctionObject, vectorcall), READONLY, 0},
+#endif
+#endif
+#endif
+#if PY_VERSION_HEX < 0x030500A0 || CYTHON_COMPILING_IN_LIMITED_API
+    {(char *) "__weaklistoffset__", T_PYSSIZET, offsetof(__pyx_CyFunctionObject, func_weakreflist), READONLY, 0},
+#else
+    {(char *) "__weaklistoffset__", T_PYSSIZET, offsetof(PyCFunctionObject, m_weakreflist), READONLY, 0},
+#endif
+#endif
+    {0, 0, 0,  0, 0}
+};
+/* __reduce__ implementation: on Python 3 returns a new reference to the
+ * function's qualified name; on Python 2 returns a new string built from the
+ * method definition's ml_name.  `args` is ignored. */
+static PyObject *
+__Pyx_CyFunction_reduce(__pyx_CyFunctionObject *m, PyObject *args)
+{
+    CYTHON_UNUSED_VAR(args);
+#if PY_MAJOR_VERSION >= 3
+    Py_INCREF(m->func_qualname);
+    return m->func_qualname;
+#else
+    return PyString_FromString(((PyCFunctionObject*)m)->m_ml->ml_name);
+#endif
+}
+/* Method table for the CyFunction type; only __reduce__ is defined. */
+static PyMethodDef __pyx_CyFunction_methods[] = {
+    {"__reduce__", (PyCFunction)__Pyx_CyFunction_reduce, METH_VARARGS, 0},
+    {0, 0, 0, 0}
+};
+/* Lvalue accessor for a CyFunction's weak-reference list: uses the
+ * CyFunction's own field on old Pythons / the limited API, otherwise the
+ * m_weakreflist field of the underlying PyCFunctionObject. */
+#if PY_VERSION_HEX < 0x030500A0 || CYTHON_COMPILING_IN_LIMITED_API
+#define __Pyx_CyFunction_weakreflist(cyfunc) ((cyfunc)->func_weakreflist)
+#else
+#define __Pyx_CyFunction_weakreflist(cyfunc) (((PyCFunctionObject*)cyfunc)->m_weakreflist)
+#endif
+/* Fill in the fields of CyFunction object `op`.
+ *
+ * Takes new references to `closure`, `module` (non-limited API), `qualname`,
+ * `globals` and `code`; lazily computed fields (name, doc, dict, defaults,
+ * annotations, is_coroutine) are reset to NULL.  With CYTHON_METH_FASTCALL the
+ * vectorcall entry point is chosen from ml->ml_flags.
+ *
+ * Returns `op` as a PyObject*, or NULL if `op` is NULL, if the wrapped
+ * function cannot be created (limited API), or if the flag combination is
+ * unsupported (SystemError; `op` is released in that case). */
+static PyObject *__Pyx_CyFunction_Init(__pyx_CyFunctionObject *op, PyMethodDef *ml, int flags, PyObject* qualname,
+                                       PyObject *closure, PyObject *module, PyObject* globals, PyObject* code) {
+#if !CYTHON_COMPILING_IN_LIMITED_API
+    PyCFunctionObject *cf = (PyCFunctionObject*) op;
+#endif
+    if (unlikely(op == NULL))
+        return NULL;
+#if CYTHON_COMPILING_IN_LIMITED_API
+    op->func = PyCFunction_NewEx(ml, (PyObject*)op, module);
+    if (unlikely(!op->func)) return NULL;
+#endif
+    op->flags = flags;
+    __Pyx_CyFunction_weakreflist(op) = NULL;
+#if !CYTHON_COMPILING_IN_LIMITED_API
+    cf->m_ml = ml;
+    /* The function object acts as its own m_self; no reference is taken here. */
+    cf->m_self = (PyObject *) op;
+#endif
+    Py_XINCREF(closure);
+    op->func_closure = closure;
+#if !CYTHON_COMPILING_IN_LIMITED_API
+    Py_XINCREF(module);
+    cf->m_module = module;
+#endif
+    op->func_dict = NULL;
+    op->func_name = NULL;
+    Py_INCREF(qualname);
+    op->func_qualname = qualname;
+    op->func_doc = NULL;
+#if PY_VERSION_HEX < 0x030900B1 || CYTHON_COMPILING_IN_LIMITED_API
+    op->func_classobj = NULL;
+#else
+    ((PyCMethodObject*)op)->mm_class = NULL;
+#endif
+    op->func_globals = globals;
+    Py_INCREF(op->func_globals);
+    Py_XINCREF(code);
+    op->func_code = code;
+    op->defaults_pyobjects = 0;
+    op->defaults_size = 0;
+    op->defaults = NULL;
+    op->defaults_tuple = NULL;
+    op->defaults_kwdict = NULL;
+    op->defaults_getter = NULL;
+    op->func_annotations = NULL;
+    op->func_is_coroutine = NULL;
+#if CYTHON_METH_FASTCALL
+    /* Pick the vectorcall implementation matching the calling convention. */
+    switch (ml->ml_flags & (METH_VARARGS | METH_FASTCALL | METH_NOARGS | METH_O | METH_KEYWORDS | METH_METHOD)) {
+    case METH_NOARGS:
+        __Pyx_CyFunction_func_vectorcall(op) = __Pyx_CyFunction_Vectorcall_NOARGS;
+        break;
+    case METH_O:
+        __Pyx_CyFunction_func_vectorcall(op) = __Pyx_CyFunction_Vectorcall_O;
+        break;
+    case METH_METHOD | METH_FASTCALL | METH_KEYWORDS:
+        __Pyx_CyFunction_func_vectorcall(op) = __Pyx_CyFunction_Vectorcall_FASTCALL_KEYWORDS_METHOD;
+        break;
+    case METH_FASTCALL | METH_KEYWORDS:
+        __Pyx_CyFunction_func_vectorcall(op) = __Pyx_CyFunction_Vectorcall_FASTCALL_KEYWORDS;
+        break;
+    case METH_VARARGS | METH_KEYWORDS:
+        /* No vectorcall entry point for tuple/dict style functions. */
+        __Pyx_CyFunction_func_vectorcall(op) = NULL;
+        break;
+    default:
+        PyErr_SetString(PyExc_SystemError, "Bad call flags for CyFunction");
+        Py_DECREF(op);
+        return NULL;
+    }
+#endif
+    return (PyObject *) op;
+}
+/* tp_clear-style helper: drops every object reference held by CyFunction
+ * `m` and frees its dynamic defaults storage.  Always returns 0. */
+static int
+__Pyx_CyFunction_clear(__pyx_CyFunctionObject *m)
+{
+    Py_CLEAR(m->func_closure);
+#if CYTHON_COMPILING_IN_LIMITED_API
+    Py_CLEAR(m->func);
+#else
+    Py_CLEAR(((PyCFunctionObject*)m)->m_module);
+#endif
+    Py_CLEAR(m->func_dict);
+    Py_CLEAR(m->func_name);
+    Py_CLEAR(m->func_qualname);
+    Py_CLEAR(m->func_doc);
+    Py_CLEAR(m->func_globals);
+    Py_CLEAR(m->func_code);
+#if !CYTHON_COMPILING_IN_LIMITED_API
+#if PY_VERSION_HEX < 0x030900B1
+    Py_CLEAR(__Pyx_CyFunction_GetClassObj(m));
+#else
+    {
+        /* Same pattern as Py_CLEAR: null the slot before releasing the object. */
+        PyObject *cls = (PyObject*) ((PyCMethodObject *) (m))->mm_class;
+        ((PyCMethodObject *) (m))->mm_class = NULL;
+        Py_XDECREF(cls);
+    }
+#endif
+#endif
+    Py_CLEAR(m->defaults_tuple);
+    Py_CLEAR(m->defaults_kwdict);
+    Py_CLEAR(m->func_annotations);
+    Py_CLEAR(m->func_is_coroutine);
+    if (m->defaults) {
+        /* The first defaults_pyobjects slots of the defaults block hold object references. */
+        PyObject **pydefaults = __Pyx_CyFunction_Defaults(PyObject *, m);
+        int i;
+        for (i = 0; i < m->defaults_pyobjects; i++)
+            Py_XDECREF(pydefaults[i]);
+        PyObject_Free(m->defaults);
+        m->defaults = NULL;
+    }
+    return 0;
+}
+/* Release a CyFunction: clear pending weak references (if any), drop all
+ * held references, then free the object itself. */
+static void __Pyx__CyFunction_dealloc(__pyx_CyFunctionObject *m)
+{
+    PyObject *as_object = (PyObject *) m;
+    if (__Pyx_CyFunction_weakreflist(m) != NULL) {
+        PyObject_ClearWeakRefs(as_object);
+    }
+    __Pyx_CyFunction_clear(m);
+    __Pyx_PyHeapTypeObject_GC_Del(m);
+}
+/* Deallocator entry point: stop GC tracking of the object first, then run
+ * the shared teardown routine. */
+static void __Pyx_CyFunction_dealloc(__pyx_CyFunctionObject *m)
+{
+    PyObject_GC_UnTrack((void *) m);
+    __Pyx__CyFunction_dealloc(m);
+}
+/* tp_traverse-style helper: reports every object reference owned by
+ * CyFunction `m` to the garbage collector via `visit`.
+ *
+ * The set of visited fields mirrors what __Pyx_CyFunction_clear releases,
+ * including func_annotations, so that reference cycles running through the
+ * annotations dict can be detected and collected.  Returns 0, or the first
+ * non-zero value returned by `visit` (through Py_VISIT). */
+static int __Pyx_CyFunction_traverse(__pyx_CyFunctionObject *m, visitproc visit, void *arg)
+{
+    Py_VISIT(m->func_closure);
+#if CYTHON_COMPILING_IN_LIMITED_API
+    Py_VISIT(m->func);
+#else
+    Py_VISIT(((PyCFunctionObject*)m)->m_module);
+#endif
+    Py_VISIT(m->func_dict);
+    Py_VISIT(m->func_name);
+    Py_VISIT(m->func_qualname);
+    Py_VISIT(m->func_doc);
+    Py_VISIT(m->func_globals);
+    Py_VISIT(m->func_code);
+#if !CYTHON_COMPILING_IN_LIMITED_API
+    Py_VISIT(__Pyx_CyFunction_GetClassObj(m));
+#endif
+    Py_VISIT(m->defaults_tuple);
+    Py_VISIT(m->defaults_kwdict);
+    /* Owned reference (set by the __annotations__ accessors, released in clear). */
+    Py_VISIT(m->func_annotations);
+    Py_VISIT(m->func_is_coroutine);
+    if (m->defaults) {
+        /* The first defaults_pyobjects slots of the defaults block hold object references. */
+        PyObject **pydefaults = __Pyx_CyFunction_Defaults(PyObject *, m);
+        int i;
+        for (i = 0; i < m->defaults_pyobjects; i++)
+            Py_VISIT(pydefaults[i]);
+    }
+    return 0;
+}
+/* repr() implementation: "<cyfunction QUALNAME at ADDRESS>", built from the
+ * stored qualified name and the object's address. */
+static PyObject*
+__Pyx_CyFunction_repr(__pyx_CyFunctionObject *op)
+{
+#if PY_MAJOR_VERSION >= 3
+    return PyUnicode_FromFormat("<cyfunction %U at %p>",
+                                op->func_qualname, (void *)op);
+#else
+    return PyString_FromFormat("<cyfunction %s at %p>",
+                               PyString_AsString(op->func_qualname), (void *)op);
+#endif
+}
+/* Invoke the C implementation behind CyFunction `func` with an explicit
+ * `self`, a positional-argument tuple `arg` and an optional keyword dict `kw`.
+ *
+ * The call is dispatched on the METH_* calling convention of the underlying
+ * method:
+ *   - METH_VARARGS:               meth(self, arg); keywords not allowed
+ *   - METH_VARARGS|METH_KEYWORDS: meth(self, arg, kw)
+ *   - METH_NOARGS:                meth(self, NULL); requires an empty tuple
+ *   - METH_O:                     meth(self, arg[0]); requires exactly one item
+ * Any other combination raises SystemError.  A non-empty `kw` for a
+ * convention without keyword support falls out of the switch and raises
+ * TypeError.  Returns the call result or NULL with an exception set. */
+static PyObject * __Pyx_CyFunction_CallMethod(PyObject *func, PyObject *self, PyObject *arg, PyObject *kw) {
+#if CYTHON_COMPILING_IN_LIMITED_API
+    PyObject *f = ((__pyx_CyFunctionObject*)func)->func;
+    PyObject *py_name = NULL;
+    PyCFunction meth;
+    int flags;
+    meth = PyCFunction_GetFunction(f);
+    if (unlikely(!meth)) return NULL;
+    flags = PyCFunction_GetFlags(f);
+    if (unlikely(flags < 0)) return NULL;
+#else
+    PyCFunctionObject* f = (PyCFunctionObject*)func;
+    PyCFunction meth = f->m_ml->ml_meth;
+    int flags = f->m_ml->ml_flags;
+#endif
+    Py_ssize_t size;
+    switch (flags & (METH_VARARGS | METH_KEYWORDS | METH_NOARGS | METH_O)) {
+    case METH_VARARGS:
+        if (likely(kw == NULL || PyDict_Size(kw) == 0))
+            return (*meth)(self, arg);
+        break;
+    case METH_VARARGS | METH_KEYWORDS:
+        return (*(PyCFunctionWithKeywords)(void*)meth)(self, arg, kw);
+    case METH_NOARGS:
+        if (likely(kw == NULL || PyDict_Size(kw) == 0)) {
+#if CYTHON_ASSUME_SAFE_MACROS
+            size = PyTuple_GET_SIZE(arg);
+#else
+            size = PyTuple_Size(arg);
+            if (unlikely(size < 0)) return NULL;
+#endif
+            if (likely(size == 0))
+                return (*meth)(self, NULL);
+#if CYTHON_COMPILING_IN_LIMITED_API
+            py_name = __Pyx_CyFunction_get_name((__pyx_CyFunctionObject*)func, NULL);
+            if (!py_name) return NULL;
+            PyErr_Format(PyExc_TypeError,
+                "%.200S() takes no arguments (%" CYTHON_FORMAT_SSIZE_T "d given)",
+                py_name, size);
+            Py_DECREF(py_name);
+#else
+            PyErr_Format(PyExc_TypeError,
+                "%.200s() takes no arguments (%" CYTHON_FORMAT_SSIZE_T "d given)",
+                f->m_ml->ml_name, size);
+#endif
+            return NULL;
+        }
+        break;
+    case METH_O:
+        if (likely(kw == NULL || PyDict_Size(kw) == 0)) {
+#if CYTHON_ASSUME_SAFE_MACROS
+            size = PyTuple_GET_SIZE(arg);
+#else
+            size = PyTuple_Size(arg);
+            if (unlikely(size < 0)) return NULL;
+#endif
+            if (likely(size == 1)) {
+                PyObject *result, *arg0;
+                #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS
+                arg0 = PyTuple_GET_ITEM(arg, 0);
+                #else
+                arg0 = __Pyx_PySequence_ITEM(arg, 0); if (unlikely(!arg0)) return NULL;
+                #endif
+                result = (*meth)(self, arg0);
+                /* Only the owned-reference path above needs to release arg0. */
+                #if !(CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS)
+                Py_DECREF(arg0);
+                #endif
+                return result;
+            }
+#if CYTHON_COMPILING_IN_LIMITED_API
+            py_name = __Pyx_CyFunction_get_name((__pyx_CyFunctionObject*)func, NULL);
+            if (!py_name) return NULL;
+            PyErr_Format(PyExc_TypeError,
+                "%.200S() takes exactly one argument (%" CYTHON_FORMAT_SSIZE_T "d given)",
+                py_name, size);
+            Py_DECREF(py_name);
+#else
+            PyErr_Format(PyExc_TypeError,
+                "%.200s() takes exactly one argument (%" CYTHON_FORMAT_SSIZE_T "d given)",
+                f->m_ml->ml_name, size);
+#endif
+            return NULL;
+        }
+        break;
+    default:
+        PyErr_SetString(PyExc_SystemError, "Bad call flags for CyFunction");
+        return NULL;
+    }
+    /* Reached only via `break`: keywords were passed to a convention that takes none. */
+#if CYTHON_COMPILING_IN_LIMITED_API
+    py_name = __Pyx_CyFunction_get_name((__pyx_CyFunctionObject*)func, NULL);
+    if (!py_name) return NULL;
+    PyErr_Format(PyExc_TypeError, "%.200S() takes no keyword arguments",
+                 py_name);
+    Py_DECREF(py_name);
+#else
+    PyErr_Format(PyExc_TypeError, "%.200s() takes no keyword arguments",
+                 f->m_ml->ml_name);
+#endif
+    return NULL;
+}
+/* Call CyFunction `func` using the `self` object stored on the function
+ * itself, forwarding `arg` and `kw` unchanged. */
+static CYTHON_INLINE PyObject *__Pyx_CyFunction_Call(PyObject *func, PyObject *arg, PyObject *kw) {
+    PyObject *bound_self;
+#if CYTHON_COMPILING_IN_LIMITED_API
+    bound_self = PyCFunction_GetSelf(((__pyx_CyFunctionObject*)func)->func);
+    if (unlikely(!bound_self) && PyErr_Occurred()) return NULL;
+#else
+    bound_self = ((PyCFunctionObject*)func)->m_self;
+#endif
+    return __Pyx_CyFunction_CallMethod(func, bound_self, arg, kw);
+}
+/* tp_call-style entry point for CyFunction objects.
+ *
+ * When a vectorcall implementation is installed it is used directly.
+ * Otherwise, for functions flagged as cdef-class methods (and not static
+ * methods) the first positional argument is split off and passed as `self`;
+ * all other functions are called with the `self` stored on the function.
+ *
+ * Returns the call result, or NULL with an exception set. */
+static PyObject *__Pyx_CyFunction_CallAsMethod(PyObject *func, PyObject *args, PyObject *kw) {
+    PyObject *result;
+    __pyx_CyFunctionObject *cyfunc = (__pyx_CyFunctionObject *) func;
+#if CYTHON_METH_FASTCALL
+     __pyx_vectorcallfunc vc = __Pyx_CyFunction_func_vectorcall(cyfunc);
+    if (vc) {
+#if CYTHON_ASSUME_SAFE_MACROS
+        return __Pyx_PyVectorcall_FastCallDict(func, vc, &PyTuple_GET_ITEM(args, 0), (size_t)PyTuple_GET_SIZE(args), kw);
+#else
+        /* Reference the helper so it is not reported as unused. */
+        (void) &__Pyx_PyVectorcall_FastCallDict;
+        return PyVectorcall_Call(func, args, kw);
+#endif
+    }
+#endif
+    if ((cyfunc->flags & __Pyx_CYFUNCTION_CCLASS) && !(cyfunc->flags & __Pyx_CYFUNCTION_STATICMETHOD)) {
+        Py_ssize_t argc;
+        PyObject *new_args;
+        PyObject *self;
+#if CYTHON_ASSUME_SAFE_MACROS
+        argc = PyTuple_GET_SIZE(args);
+#else
+        argc = PyTuple_Size(args);
+        /* PyTuple_Size reports failure with a negative size. */
+        if (unlikely(argc < 0)) return NULL;
+#endif
+        new_args = PyTuple_GetSlice(args, 1, argc);
+        if (unlikely(!new_args))
+            return NULL;
+        self = PyTuple_GetItem(args, 0);
+        if (unlikely(!self)) {
+            Py_DECREF(new_args);
+#if PY_MAJOR_VERSION > 2
+            PyErr_Format(PyExc_TypeError,
+                         "unbound method %.200S() needs an argument",
+                         cyfunc->func_qualname);
+#else
+            PyErr_SetString(PyExc_TypeError,
+                            "unbound method needs an argument");
+#endif
+            return NULL;
+        }
+        result = __Pyx_CyFunction_CallMethod(func, self, new_args, kw);
+        Py_DECREF(new_args);
+    } else {
+        result = __Pyx_CyFunction_Call(func, args, kw);
+    }
+    return result;
+}
+#if CYTHON_METH_FASTCALL
+/* Validate the arguments of a vectorcall that accepts no keywords.
+ *
+ * Returns 1 when the first positional argument must be consumed as `self`
+ * (cdef-class method that is not a static method), 0 when the stored self
+ * is to be used, and -1 with TypeError set when the required self argument
+ * is missing or keyword names were passed. */
+static CYTHON_INLINE int __Pyx_CyFunction_Vectorcall_CheckArgs(__pyx_CyFunctionObject *cyfunc, Py_ssize_t nargs, PyObject *kwnames)
+{
+    const int self_from_args =
+        (cyfunc->flags & __Pyx_CYFUNCTION_CCLASS) && !(cyfunc->flags & __Pyx_CYFUNCTION_STATICMETHOD);
+    if (self_from_args && unlikely(nargs < 1)) {
+        PyErr_Format(PyExc_TypeError, "%.200s() needs an argument",
+                     ((PyCFunctionObject*)cyfunc)->m_ml->ml_name);
+        return -1;
+    }
+    if (unlikely(kwnames) && unlikely(PyTuple_GET_SIZE(kwnames))) {
+        PyErr_Format(PyExc_TypeError,
+                     "%.200s() takes no keyword arguments", ((PyCFunctionObject*)cyfunc)->m_ml->ml_name);
+        return -1;
+    }
+    return self_from_args ? 1 : 0;
+}
+/* Vectorcall entry point for METH_NOARGS CyFunctions.
+ *
+ * After argument validation, `self` is either the first positional argument
+ * (which is then consumed) or the self stored on the function.  Any remaining
+ * positional argument raises TypeError.  Returns the result of
+ * ml_meth(self, NULL), or NULL with an exception set. */
+static PyObject * __Pyx_CyFunction_Vectorcall_NOARGS(PyObject *func, PyObject *const *args, size_t nargsf, PyObject *kwnames)
+{
+    __pyx_CyFunctionObject *cyfunc = (__pyx_CyFunctionObject *)func;
+    PyMethodDef* def = ((PyCFunctionObject*)cyfunc)->m_ml;
+#if CYTHON_BACKPORT_VECTORCALL
+    Py_ssize_t nargs = (Py_ssize_t)nargsf;
+#else
+    Py_ssize_t nargs = PyVectorcall_NARGS(nargsf);
+#endif
+    PyObject *self;
+    switch (__Pyx_CyFunction_Vectorcall_CheckArgs(cyfunc, nargs, kwnames)) {
+    case 1:
+        /* self is passed as the first positional argument. */
+        self = args[0];
+        args += 1;
+        nargs -= 1;
+        break;
+    case 0:
+        self = ((PyCFunctionObject*)cyfunc)->m_self;
+        break;
+    default:
+        return NULL;
+    }
+    if (unlikely(nargs != 0)) {
+        PyErr_Format(PyExc_TypeError,
+            "%.200s() takes no arguments (%" CYTHON_FORMAT_SSIZE_T "d given)",
+            def->ml_name, nargs);
+        return NULL;
+    }
+    return def->ml_meth(self, NULL);
+}
+/* Vectorcall entry point that requires exactly one positional argument
+ * (after an optional leading `self`) and no keywords; it invokes the
+ * wrapped ml_meth as ml_meth(self, arg).
+ */
+static PyObject * __Pyx_CyFunction_Vectorcall_O(PyObject *func, PyObject *const *args, size_t nargsf, PyObject *kwnames)
+{
+    __pyx_CyFunctionObject *cyfunc = (__pyx_CyFunctionObject *)func;
+    PyMethodDef* method = ((PyCFunctionObject*)cyfunc)->m_ml;
+    PyObject *self;
+    int self_mode;
+#if CYTHON_BACKPORT_VECTORCALL
+    Py_ssize_t nargs = (Py_ssize_t)nargsf;
+#else
+    Py_ssize_t nargs = PyVectorcall_NARGS(nargsf);
+#endif
+    self_mode = __Pyx_CyFunction_Vectorcall_CheckArgs(cyfunc, nargs, kwnames);
+    if (self_mode == 1) {
+        /* `self` travels as the first positional argument. */
+        self = args[0];
+        args += 1;
+        nargs -= 1;
+    } else if (self_mode == 0) {
+        self = ((PyCFunctionObject*)cyfunc)->m_self;
+    } else {
+        return NULL;
+    }
+    if (unlikely(nargs != 1)) {
+        PyErr_Format(PyExc_TypeError,
+            "%.200s() takes exactly one argument (%" CYTHON_FORMAT_SSIZE_T "d given)",
+            method->ml_name, nargs);
+        return NULL;
+    }
+    return method->ml_meth(self, args[0]);
+}
+/* Vectorcall entry point that forwards positional arguments and `kwnames`
+ * to the wrapped ml_meth, called through the
+ * __Pyx_PyCFunctionFastWithKeywords signature. The argument check is given
+ * NULL for kwnames, so keywords are not rejected here.
+ */
+static PyObject * __Pyx_CyFunction_Vectorcall_FASTCALL_KEYWORDS(PyObject *func, PyObject *const *args, size_t nargsf, PyObject *kwnames)
+{
+    __pyx_CyFunctionObject *cyfunc = (__pyx_CyFunctionObject *)func;
+    PyMethodDef* method = ((PyCFunctionObject*)cyfunc)->m_ml;
+    PyObject *self;
+    int self_mode;
+#if CYTHON_BACKPORT_VECTORCALL
+    Py_ssize_t nargs = (Py_ssize_t)nargsf;
+#else
+    Py_ssize_t nargs = PyVectorcall_NARGS(nargsf);
+#endif
+    self_mode = __Pyx_CyFunction_Vectorcall_CheckArgs(cyfunc, nargs, NULL);
+    if (self_mode == 1) {
+        /* `self` travels as the first positional argument. */
+        self = args[0];
+        args += 1;
+        nargs -= 1;
+    } else if (self_mode == 0) {
+        self = ((PyCFunctionObject*)cyfunc)->m_self;
+    } else {
+        return NULL;
+    }
+    return ((__Pyx_PyCFunctionFastWithKeywords)(void(*)(void))method->ml_meth)(self, args, nargs, kwnames);
+}
+/* Vectorcall entry point that forwards positional arguments and `kwnames`
+ * to the wrapped ml_meth, called through the __Pyx_PyCMethod signature.
+ * The class object obtained from __Pyx_CyFunction_GetClassObj is passed as
+ * an extra argument. The argument check is given NULL for kwnames, so
+ * keywords are not rejected here.
+ */
+static PyObject * __Pyx_CyFunction_Vectorcall_FASTCALL_KEYWORDS_METHOD(PyObject *func, PyObject *const *args, size_t nargsf, PyObject *kwnames)
+{
+    __pyx_CyFunctionObject *cyfunc = (__pyx_CyFunctionObject *)func;
+    PyMethodDef* method = ((PyCFunctionObject*)cyfunc)->m_ml;
+    PyTypeObject *defining_class = (PyTypeObject *) __Pyx_CyFunction_GetClassObj(cyfunc);
+    PyObject *self;
+    int self_mode;
+#if CYTHON_BACKPORT_VECTORCALL
+    Py_ssize_t nargs = (Py_ssize_t)nargsf;
+#else
+    Py_ssize_t nargs = PyVectorcall_NARGS(nargsf);
+#endif
+    self_mode = __Pyx_CyFunction_Vectorcall_CheckArgs(cyfunc, nargs, NULL);
+    if (self_mode == 1) {
+        /* `self` travels as the first positional argument. */
+        self = args[0];
+        args += 1;
+        nargs -= 1;
+    } else if (self_mode == 0) {
+        self = ((PyCFunctionObject*)cyfunc)->m_self;
+    } else {
+        return NULL;
+    }
+    return ((__Pyx_PyCMethod)(void(*)(void))method->ml_meth)(self, defining_class, args, (size_t)nargs, kwnames);
+}
+#endif
+#if CYTHON_USE_TYPE_SPECS
+/* Slot table for the CyFunction type; referenced by
+ * __pyx_CyFunctionType_spec. Each entry pairs a Py_tp_* slot id with the
+ * implementing function or table, and the list ends with a {0, 0} entry.
+ */
+static PyType_Slot __pyx_CyFunctionType_slots[] = {
+    {Py_tp_dealloc, (void *)__Pyx_CyFunction_dealloc},
+    {Py_tp_repr, (void *)__Pyx_CyFunction_repr},
+    {Py_tp_call, (void *)__Pyx_CyFunction_CallAsMethod},
+    {Py_tp_traverse, (void *)__Pyx_CyFunction_traverse},
+    {Py_tp_clear, (void *)__Pyx_CyFunction_clear},
+    {Py_tp_methods, (void *)__pyx_CyFunction_methods},
+    {Py_tp_members, (void *)__pyx_CyFunction_members},
+    {Py_tp_getset, (void *)__pyx_CyFunction_getsets},
+    {Py_tp_descr_get, (void *)__Pyx_PyMethod_New},
+    {0, 0},
+};
+/* Type spec for the CyFunction type; passed to
+ * __Pyx_FetchCommonTypeFromSpec by __pyx_CyFunction_init. The initializer
+ * is positional: the field names in the comments follow CPython's
+ * documented PyType_Spec layout (name, basicsize, itemsize, flags, slots).
+ */
+static PyType_Spec __pyx_CyFunctionType_spec = {
+    __PYX_TYPE_MODULE_PREFIX "cython_function_or_method",
+    sizeof(__pyx_CyFunctionObject),
+    0,
+    /* flags: optional bits are OR-ed in only when the headers define them */
+#ifdef Py_TPFLAGS_METHOD_DESCRIPTOR
+    Py_TPFLAGS_METHOD_DESCRIPTOR |
+#endif
+#if (defined(_Py_TPFLAGS_HAVE_VECTORCALL) && CYTHON_METH_FASTCALL)
+    _Py_TPFLAGS_HAVE_VECTORCALL |
+#endif
+    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC | Py_TPFLAGS_BASETYPE,
+    __pyx_CyFunctionType_slots
+};
+#else
+/* Static type object for the CyFunction type, used when
+ * CYTHON_USE_TYPE_SPECS is disabled. The initializer is positional, so the
+ * order of entries is significant. The slot names in the trailing comments
+ * follow the field order of CPython's PyTypeObject as documented in the
+ * C API reference; the version-conditional entries at the end exist so the
+ * initializer matches the struct layout of each supported interpreter.
+ */
+static PyTypeObject __pyx_CyFunctionType_type = {
+    PyVarObject_HEAD_INIT(0, 0)
+    __PYX_TYPE_MODULE_PREFIX "cython_function_or_method", /* tp_name */
+    sizeof(__pyx_CyFunctionObject), /* tp_basicsize */
+    0, /* tp_itemsize */
+    (destructor) __Pyx_CyFunction_dealloc, /* tp_dealloc */
+    /* next slot: tp_print on old versions, tp_vectorcall_offset on newer ones */
+#if !CYTHON_METH_FASTCALL
+    0,
+#elif CYTHON_BACKPORT_VECTORCALL
+    (printfunc)offsetof(__pyx_CyFunctionObject, func_vectorcall),
+#else
+    offsetof(PyCFunctionObject, vectorcall),
+#endif
+    0, /* tp_getattr */
+    0, /* tp_setattr */
+#if PY_MAJOR_VERSION < 3
+    0, /* tp_compare */
+#else
+    0, /* tp_as_async */
+#endif
+    (reprfunc) __Pyx_CyFunction_repr, /* tp_repr */
+    0, /* tp_as_number */
+    0, /* tp_as_sequence */
+    0, /* tp_as_mapping */
+    0, /* tp_hash */
+    __Pyx_CyFunction_CallAsMethod, /* tp_call */
+    0, /* tp_str */
+    0, /* tp_getattro */
+    0, /* tp_setattro */
+    0, /* tp_as_buffer */
+    /* tp_flags: optional bits are OR-ed in only when the headers define them */
+#ifdef Py_TPFLAGS_METHOD_DESCRIPTOR
+    Py_TPFLAGS_METHOD_DESCRIPTOR |
+#endif
+#if defined(_Py_TPFLAGS_HAVE_VECTORCALL) && CYTHON_METH_FASTCALL
+    _Py_TPFLAGS_HAVE_VECTORCALL |
+#endif
+    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC | Py_TPFLAGS_BASETYPE,
+    0, /* tp_doc */
+    (traverseproc) __Pyx_CyFunction_traverse, /* tp_traverse */
+    (inquiry) __Pyx_CyFunction_clear, /* tp_clear */
+    0, /* tp_richcompare */
+    /* tp_weaklistoffset */
+#if PY_VERSION_HEX < 0x030500A0
+    offsetof(__pyx_CyFunctionObject, func_weakreflist),
+#else
+    offsetof(PyCFunctionObject, m_weakreflist),
+#endif
+    0, /* tp_iter */
+    0, /* tp_iternext */
+    __pyx_CyFunction_methods, /* tp_methods */
+    __pyx_CyFunction_members, /* tp_members */
+    __pyx_CyFunction_getsets, /* tp_getset */
+    0, /* tp_base */
+    0, /* tp_dict */
+    __Pyx_PyMethod_New, /* tp_descr_get */
+    0, /* tp_descr_set */
+    offsetof(__pyx_CyFunctionObject, func_dict), /* tp_dictoffset */
+    0, /* tp_init */
+    0, /* tp_alloc */
+    0, /* tp_new */
+    0, /* tp_free */
+    0, /* tp_is_gc */
+    0, /* tp_bases */
+    0, /* tp_mro */
+    0, /* tp_cache */
+    0, /* tp_subclasses */
+    0, /* tp_weaklist */
+    0, /* tp_del */
+    0, /* tp_version_tag */
+#if PY_VERSION_HEX >= 0x030400a1
+    0, /* tp_finalize */
+#endif
+#if PY_VERSION_HEX >= 0x030800b1 && (!CYTHON_COMPILING_IN_PYPY || PYPY_VERSION_NUM >= 0x07030800)
+    0, /* tp_vectorcall */
+#endif
+#if __PYX_NEED_TP_PRINT_SLOT
+    0, /* tp_print */
+#endif
+#if PY_VERSION_HEX >= 0x030C0000
+    0, /* tp_watched */
+#endif
+#if PY_VERSION_HEX >= 0x030d00A4
+    0, /* tp_versions_used */
+#endif
+#if CYTHON_COMPILING_IN_PYPY && PY_VERSION_HEX >= 0x03090000 && PY_VERSION_HEX < 0x030a0000
+    0, /* PyPy-only trailing field -- TODO confirm exact name against PyPy headers */
+#endif
+};
+#endif
+/* Create or fetch the shared CyFunction type and store it in
+ * __pyx_CyFunctionType. Returns 0 on success, -1 if no type was obtained.
+ */
+static int __pyx_CyFunction_init(PyObject *module) {
+#if CYTHON_USE_TYPE_SPECS
+    __pyx_CyFunctionType = __Pyx_FetchCommonTypeFromSpec(module, &__pyx_CyFunctionType_spec, NULL);
+#else
+    CYTHON_UNUSED_VAR(module);
+    __pyx_CyFunctionType = __Pyx_FetchCommonType(&__pyx_CyFunctionType_type);
+#endif
+    return unlikely(__pyx_CyFunctionType == NULL) ? -1 : 0;
+}
+/* Allocate a zero-filled defaults buffer of `size` bytes for `func` and
+ * record its size and the `pyobjects` count on the function object.
+ * Returns the buffer, or the result of PyErr_NoMemory() when allocation
+ * fails (in which case the defaults pointer is left NULL).
+ */
+static CYTHON_INLINE void *__Pyx_CyFunction_InitDefaults(PyObject *func, size_t size, int pyobjects) {
+    __pyx_CyFunctionObject *cyfunc = (__pyx_CyFunctionObject *) func;
+
+    cyfunc->defaults = PyObject_Malloc(size);
+    if (likely(cyfunc->defaults)) {
+        memset(cyfunc->defaults, 0, size);
+        cyfunc->defaults_pyobjects = pyobjects;
+        cyfunc->defaults_size = size;
+        return cyfunc->defaults;
+    }
+    return PyErr_NoMemory();
+}
+/* Store `tuple` as the function's defaults tuple, taking a new reference.
+ * Any previously stored value is overwritten without being released.
+ */
+static CYTHON_INLINE void __Pyx_CyFunction_SetDefaultsTuple(PyObject *func, PyObject *tuple) {
+    Py_INCREF(tuple);
+    ((__pyx_CyFunctionObject *) func)->defaults_tuple = tuple;
+}
+/* Store `dict` as the function's keyword-defaults dict, taking a new
+ * reference. Any previously stored value is overwritten without being
+ * released.
+ */
+static CYTHON_INLINE void __Pyx_CyFunction_SetDefaultsKwDict(PyObject *func, PyObject *dict) {
+    Py_INCREF(dict);
+    ((__pyx_CyFunctionObject *) func)->defaults_kwdict = dict;
+}
+/* Store `dict` as the function's annotations dict, taking a new reference.
+ * Any previously stored value is overwritten without being released.
+ */
+static CYTHON_INLINE void __Pyx_CyFunction_SetAnnotationsDict(PyObject *func, PyObject *dict) {
+    Py_INCREF(dict);
+    ((__pyx_CyFunctionObject *) func)->func_annotations = dict;
+}
+
+/* CythonFunction */
+/* Allocate a GC-managed CyFunction object, initialise it through
+ * __Pyx_CyFunction_Init and, if that yields an object, register it with
+ * the garbage collector. Returns whatever __Pyx_CyFunction_Init returned.
+ */
+static PyObject *__Pyx_CyFunction_New(PyMethodDef *ml, int flags, PyObject* qualname,
+                                      PyObject *closure, PyObject *module, PyObject* globals, PyObject* code) {
+    __pyx_CyFunctionObject *fresh = PyObject_GC_New(__pyx_CyFunctionObject, __pyx_CyFunctionType);
+    PyObject *op = __Pyx_CyFunction_Init(fresh, ml, flags, qualname, closure, module, globals, code);
+
+    if (unlikely(!op)) {
+        return op;
+    }
+    PyObject_GC_Track(op);
+    return op;
+}
+
+/* CLineInTraceback */
+  #ifndef CYTHON_CLINE_IN_TRACEBACK
+/* Decide whether the C source line should be reported in a traceback.
+ *
+ * Returns `c_line` unchanged when the `cline_in_traceback` attribute of
+ * the __pyx_cython_runtime object is truthy (or when that object is not
+ * available), and 0 otherwise. When the attribute is missing it is set to
+ * False on the runtime object. The currently raised exception is saved
+ * before the lookup and restored afterwards.
+ */
+static int __Pyx_CLineForTraceback(PyThreadState *tstate, int c_line) {
+    PyObject *use_cline;
+    PyObject *ptype, *pvalue, *ptraceback;
+#if CYTHON_COMPILING_IN_CPYTHON
+    PyObject **cython_runtime_dict;
+#endif
+    CYTHON_MAYBE_UNUSED_VAR(tstate);
+    if (unlikely(!__pyx_cython_runtime)) {
+        return c_line;
+    }
+    /* Stash the pending exception so the lookups below cannot disturb it. */
+    __Pyx_ErrFetchInState(tstate, &ptype, &pvalue, &ptraceback);
+#if CYTHON_COMPILING_IN_CPYTHON
+    /* On CPython, read the attribute straight from the object's dict when one exists. */
+    cython_runtime_dict = _PyObject_GetDictPtr(__pyx_cython_runtime);
+    if (likely(cython_runtime_dict)) {
+        __PYX_PY_DICT_LOOKUP_IF_MODIFIED(
+            use_cline, *cython_runtime_dict,
+            __Pyx_PyDict_GetItemStr(*cython_runtime_dict, __pyx_n_s_cline_in_traceback))
+    } else
+#endif
+    {
+      /* Generic path: attribute lookup, reduced to Py_True / Py_False / NULL. */
+      PyObject *use_cline_obj = __Pyx_PyObject_GetAttrStrNoError(__pyx_cython_runtime, __pyx_n_s_cline_in_traceback);
+      if (use_cline_obj) {
+        use_cline = PyObject_Not(use_cline_obj) ? Py_False : Py_True;
+        Py_DECREF(use_cline_obj);
+      } else {
+        PyErr_Clear();
+        use_cline = NULL;
+      }
+    }
+    if (!use_cline) {
+        /* Attribute absent: suppress the C line and store False on the runtime object. */
+        c_line = 0;
+        (void) PyObject_SetAttr(__pyx_cython_runtime, __pyx_n_s_cline_in_traceback, Py_False);
+    }
+    else if (use_cline == Py_False || (use_cline != Py_True && PyObject_Not(use_cline) != 0)) {
+        c_line = 0;
+    }
+    __Pyx_ErrRestoreInState(tstate, ptype, pvalue, ptraceback);
+    return c_line;
+}
+#endif
+
+/* CodeObjectCache */
+  #if !CYTHON_COMPILING_IN_LIMITED_API
+/* Binary search over `entries[0..count)`, which the cache keeps ordered by
+ * code_line. Returns the index of the entry whose code_line equals
+ * `code_line` if the search hits one, otherwise the index at which such an
+ * entry would be inserted (`count` when it is greater than the last entry).
+ */
+static int __pyx_bisect_code_objects(__Pyx_CodeObjectCacheEntry* entries, int count, int code_line) {
+    int lo = 0, hi = count - 1, probe = 0;
+
+    if (hi >= 0 && entries[hi].code_line < code_line) {
+        return count;
+    }
+    while (lo < hi) {
+        probe = lo + (hi - lo) / 2;
+        if (entries[probe].code_line > code_line) {
+            hi = probe;
+            continue;
+        }
+        if (entries[probe].code_line < code_line) {
+            lo = probe + 1;
+            continue;
+        }
+        return probe;
+    }
+    return (code_line <= entries[probe].code_line) ? probe : probe + 1;
+}
+/* Look up the cached code object registered for `code_line`.
+ * Returns a new reference on a hit, or NULL when `code_line` is 0, the
+ * cache is empty, or no entry matches.
+ */
+static PyCodeObject *__pyx_find_code_object(int code_line) {
+    int idx;
+
+    if (unlikely(!code_line) || unlikely(!__pyx_code_cache.entries)) {
+        return NULL;
+    }
+    idx = __pyx_bisect_code_objects(__pyx_code_cache.entries, __pyx_code_cache.count, code_line);
+    if (likely(idx < __pyx_code_cache.count) && likely(__pyx_code_cache.entries[idx].code_line == code_line)) {
+        PyCodeObject* hit = __pyx_code_cache.entries[idx].code_object;
+        Py_INCREF(hit);
+        return hit;
+    }
+    return NULL;
+}
+/* Register `code_object` in the code-object cache under `code_line`.
+ *
+ * The cache owns one reference to every object it stores. The table is
+ * allocated with 64 slots on first use and grown by 64 slots whenever it
+ * is full; entries are kept ordered by code_line. A `code_line` of 0 is
+ * ignored, and allocation failures leave the cache unchanged (the function
+ * reports no error).
+ */
+static void __pyx_insert_code_object(int code_line, PyCodeObject* code_object) {
+    int pos, i;
+    __Pyx_CodeObjectCacheEntry* entries = __pyx_code_cache.entries;
+    if (unlikely(!code_line)) {
+        return;
+    }
+    if (unlikely(!entries)) {
+        /* First insertion: create the table and store the single entry. */
+        entries = (__Pyx_CodeObjectCacheEntry*)PyMem_Malloc(64*sizeof(__Pyx_CodeObjectCacheEntry));
+        if (likely(entries)) {
+            __pyx_code_cache.entries = entries;
+            __pyx_code_cache.max_count = 64;
+            __pyx_code_cache.count = 1;
+            entries[0].code_line = code_line;
+            entries[0].code_object = code_object;
+            Py_INCREF(code_object);
+        }
+        return;
+    }
+    pos = __pyx_bisect_code_objects(__pyx_code_cache.entries, __pyx_code_cache.count, code_line);
+    if ((pos < __pyx_code_cache.count) && unlikely(__pyx_code_cache.entries[pos].code_line == code_line)) {
+        /* Replace an existing entry. The cache must own a reference to the
+           new object just as in the other branches; take it before
+           releasing the old one in case both are the same object. */
+        PyCodeObject* tmp = entries[pos].code_object;
+        Py_INCREF(code_object);
+        entries[pos].code_object = code_object;
+        Py_DECREF(tmp);
+        return;
+    }
+    if (__pyx_code_cache.count == __pyx_code_cache.max_count) {
+        int new_max = __pyx_code_cache.max_count + 64;
+        entries = (__Pyx_CodeObjectCacheEntry*)PyMem_Realloc(
+            __pyx_code_cache.entries, ((size_t)new_max) * sizeof(__Pyx_CodeObjectCacheEntry));
+        if (unlikely(!entries)) {
+            return;
+        }
+        __pyx_code_cache.entries = entries;
+        __pyx_code_cache.max_count = new_max;
+    }
+    /* Shift the tail up by one slot to open a gap at `pos`. */
+    for (i=__pyx_code_cache.count; i>pos; i--) {
+        entries[i] = entries[i-1];
+    }
+    entries[pos].code_line = code_line;
+    entries[pos].code_object = code_object;
+    __pyx_code_cache.count++;
+    Py_INCREF(code_object);
+}
+#endif
+
+/* AddTraceback */
+  #include "compile.h"
+#include "frameobject.h"
+#include "traceback.h"
+#if PY_VERSION_HEX >= 0x030b00a6 && !CYTHON_COMPILING_IN_LIMITED_API && !defined(PYPY_VERSION)
+  #ifndef Py_BUILD_CORE
+    #define Py_BUILD_CORE 1
+  #endif
+  #include "internal/pycore_frame.h"
+#endif
+#if CYTHON_COMPILING_IN_LIMITED_API
+/* Return a copy of `code` with co_firstlineno and co_name replaced.
+ *
+ * `scratch_dict` is used as working storage: the two replacement values
+ * are stored in it and it is passed as the keyword dict to code.replace().
+ * When `code` has no `replace` attribute, interpreters older than 3.8 fall
+ * back to rebuilding the code object by evaluating a small Python snippet
+ * in `scratch_dict`.
+ *
+ * Returns a new reference, or NULL on failure. On the fallback paths NULL
+ * may be returned without an exception set (the error is cleared or
+ * printed).
+ */
+static PyObject *__Pyx_PyCode_Replace_For_AddTraceback(PyObject *code, PyObject *scratch_dict,
+                                                       PyObject *firstlineno, PyObject *name) {
+    PyObject *replace = NULL;
+    if (unlikely(PyDict_SetItemString(scratch_dict, "co_firstlineno", firstlineno))) return NULL;
+    if (unlikely(PyDict_SetItemString(scratch_dict, "co_name", name))) return NULL;
+    replace = PyObject_GetAttrString(code, "replace");
+    if (likely(replace)) {
+        PyObject *result;
+        result = PyObject_Call(replace, __pyx_empty_tuple, scratch_dict);
+        Py_DECREF(replace);
+        return result;
+    }
+    PyErr_Clear();
+    /* The fallback is only needed before Python 3.8, which introduced
+       code.replace(). The guard previously read 0x030780000, a nine-digit
+       typo that is true for every version. */
+    #if __PYX_LIMITED_VERSION_HEX < 0x03080000
+    {
+        PyObject *compiled = NULL, *result = NULL;
+        if (unlikely(PyDict_SetItemString(scratch_dict, "code", code))) return NULL;
+        if (unlikely(PyDict_SetItemString(scratch_dict, "type", (PyObject*)(&PyType_Type)))) return NULL;
+        compiled = Py_CompileString(
+            "out = type(code)(\n"
+            "  code.co_argcount, code.co_kwonlyargcount, code.co_nlocals, code.co_stacksize,\n"
+            "  code.co_flags, code.co_code, code.co_consts, code.co_names,\n"
+            "  code.co_varnames, code.co_filename, co_name, co_firstlineno,\n"
+            "  code.co_lnotab)\n", "<dummy>", Py_file_input);
+        if (!compiled) return NULL;
+        result = PyEval_EvalCode(compiled, scratch_dict, scratch_dict);
+        Py_DECREF(compiled);
+        if (!result) {
+            /* Evaluation failed: report it and bail out. The old code
+               went on to Py_DECREF the NULL result, which dereferences
+               a NULL pointer. */
+            PyErr_Print();
+            return NULL;
+        }
+        Py_DECREF(result);
+        /* PyDict_GetItemString returns a borrowed reference. */
+        result = PyDict_GetItemString(scratch_dict, "out");
+        if (result) Py_INCREF(result);
+        return result;
+    }
+    #else
+    return NULL;
+    #endif
+}
+/* Limited-API variant: append a traceback entry for `funcname` at
+ * `py_line` of `filename` to the currently raised exception.
+ *
+ * A frame is obtained by compiling the expression "_getframe()" with the
+ * given filename, rewriting the code object's name and first line number,
+ * and evaluating it with sys._getframe bound in a scratch dict. The
+ * pending exception is fetched before this work and restored before the
+ * traceback entry is added. Failures are silent: no entry is added.
+ */
+static void __Pyx_AddTraceback(const char *funcname, int c_line,
+                               int py_line, const char *filename) {
+    PyObject *code_object = NULL, *py_py_line = NULL, *py_funcname = NULL, *dict = NULL;
+    /* NOTE(review): `replace` is never assigned after initialisation in this function. */
+    PyObject *replace = NULL, *getframe = NULL, *frame = NULL;
+    PyObject *exc_type, *exc_value, *exc_traceback;
+    int success = 0;
+    if (c_line) {
+        /* The C line is evaluated but its result is discarded in this variant. */
+        (void) __pyx_cfilenm;
+        (void) __Pyx_CLineForTraceback(__Pyx_PyThreadState_Current, c_line);
+    }
+    PyErr_Fetch(&exc_type, &exc_value, &exc_traceback);
+    code_object = Py_CompileString("_getframe()", filename, Py_eval_input);
+    if (unlikely(!code_object)) goto bad;
+    py_py_line = PyLong_FromLong(py_line);
+    if (unlikely(!py_py_line)) goto bad;
+    py_funcname = PyUnicode_FromString(funcname);
+    if (unlikely(!py_funcname)) goto bad;
+    dict = PyDict_New();
+    if (unlikely(!dict)) goto bad;
+    {
+        /* Swap in the rewritten code object and release the compiled one. */
+        PyObject *old_code_object = code_object;
+        code_object = __Pyx_PyCode_Replace_For_AddTraceback(code_object, dict, py_py_line, py_funcname);
+        Py_DECREF(old_code_object);
+    }
+    if (unlikely(!code_object)) goto bad;
+    /* `getframe` is not released below, unlike the other temporaries. */
+    getframe = PySys_GetObject("_getframe");
+    if (unlikely(!getframe)) goto bad;
+    if (unlikely(PyDict_SetItemString(dict, "_getframe", getframe))) goto bad;
+    frame = PyEval_EvalCode(code_object, dict, dict);
+    if (unlikely(!frame) || frame == Py_None) goto bad;
+    success = 1;
+  bad:
+    /* Restore the original exception first; this also discards any error raised above. */
+    PyErr_Restore(exc_type, exc_value, exc_traceback);
+    Py_XDECREF(code_object);
+    Py_XDECREF(py_py_line);
+    Py_XDECREF(py_funcname);
+    Py_XDECREF(dict);
+    Py_XDECREF(replace);
+    if (success) {
+        PyTraceBack_Here(
+            (struct _frame*)frame);
+    }
+    Py_XDECREF(frame);
+}
+#else
+/* Build an empty code object that carries `filename`, `py_line` and a
+ * function name for use in a synthetic traceback frame.
+ *
+ * When `c_line` is non-zero the name is formatted as
+ * "funcname (cfile:c_line)" using __pyx_cfilenm; otherwise `funcname` is
+ * used as given. Returns a new reference, or NULL on failure.
+ */
+static PyCodeObject* __Pyx_CreateCodeObjectForTraceback(
+            const char *funcname, int c_line,
+            int py_line, const char *filename) {
+    PyCodeObject *py_code = NULL;
+    PyObject *py_funcname = NULL;
+    #if PY_MAJOR_VERSION < 3
+    PyObject *py_srcfile = NULL;
+    py_srcfile = PyString_FromString(filename);
+    if (!py_srcfile) goto bad;
+    #endif
+    if (c_line) {
+        #if PY_MAJOR_VERSION < 3
+        py_funcname = PyString_FromFormat( "%s (%s:%d)", funcname, __pyx_cfilenm, c_line);
+        if (!py_funcname) goto bad;
+        #else
+        py_funcname = PyUnicode_FromFormat( "%s (%s:%d)", funcname, __pyx_cfilenm, c_line);
+        if (!py_funcname) goto bad;
+        /* Re-point funcname at the formatted text; py_funcname is kept
+           until after the code object has been created below. */
+        funcname = PyUnicode_AsUTF8(py_funcname);
+        if (!funcname) goto bad;
+        #endif
+    }
+    else {
+        #if PY_MAJOR_VERSION < 3
+        py_funcname = PyString_FromString(funcname);
+        if (!py_funcname) goto bad;
+        #endif
+    }
+    #if PY_MAJOR_VERSION < 3
+    py_code = __Pyx_PyCode_New(
+        0,
+        0,
+        0,
+        0,
+        0,
+        0,
+        __pyx_empty_bytes, /*PyObject *code,*/
+        __pyx_empty_tuple, /*PyObject *consts,*/
+        __pyx_empty_tuple, /*PyObject *names,*/
+        __pyx_empty_tuple, /*PyObject *varnames,*/
+        __pyx_empty_tuple, /*PyObject *freevars,*/
+        __pyx_empty_tuple, /*PyObject *cellvars,*/
+        py_srcfile,   /*PyObject *filename,*/
+        py_funcname,  /*PyObject *name,*/
+        py_line,
+        __pyx_empty_bytes  /*PyObject *lnotab*/
+    );
+    Py_DECREF(py_srcfile);
+    #else
+    py_code = PyCode_NewEmpty(filename, funcname, py_line);
+    #endif
+    Py_XDECREF(py_funcname);
+    return py_code;
+bad:
+    Py_XDECREF(py_funcname);
+    #if PY_MAJOR_VERSION < 3
+    Py_XDECREF(py_srcfile);
+    #endif
+    return NULL;
+}
+/* Append a traceback entry for `funcname` at `py_line` of `filename` to
+ * the currently raised exception.
+ *
+ * Code objects are cached, keyed by `-c_line` when a C line is reported
+ * and by `py_line` otherwise. A frame is created for the code object with
+ * __pyx_d as globals, its line number is set to `py_line`, and it is
+ * handed to PyTraceBack_Here. Failures are silent apart from any
+ * exception they leave set.
+ */
+static void __Pyx_AddTraceback(const char *funcname, int c_line,
+                               int py_line, const char *filename) {
+    PyCodeObject *py_code = 0;
+    PyFrameObject *py_frame = 0;
+    PyThreadState *tstate = __Pyx_PyThreadState_Current;
+    PyObject *ptype, *pvalue, *ptraceback;
+    if (c_line) {
+        /* May reset c_line to 0 depending on the cline_in_traceback setting. */
+        c_line = __Pyx_CLineForTraceback(tstate, c_line);
+    }
+    py_code = __pyx_find_code_object(c_line ? -c_line : py_line);
+    if (!py_code) {
+        /* Cache miss: build the code object with the pending exception set aside. */
+        __Pyx_ErrFetchInState(tstate, &ptype, &pvalue, &ptraceback);
+        py_code = __Pyx_CreateCodeObjectForTraceback(
+            funcname, c_line, py_line, filename);
+        if (!py_code) {
+            /* If the code object creation fails, then we should clear the
+               fetched exception references and propagate the new exception */
+            Py_XDECREF(ptype);
+            Py_XDECREF(pvalue);
+            Py_XDECREF(ptraceback);
+            goto bad;
+        }
+        __Pyx_ErrRestoreInState(tstate, ptype, pvalue, ptraceback);
+        __pyx_insert_code_object(c_line ? -c_line : py_line, py_code);
+    }
+    py_frame = PyFrame_New(
+        tstate,            /*PyThreadState *tstate,*/
+        py_code,           /*PyCodeObject *code,*/
+        __pyx_d,    /*PyObject *globals,*/
+        0                  /*PyObject *locals*/
+    );
+    if (!py_frame) goto bad;
+    __Pyx_PyFrame_SetLineNumber(py_frame, py_line);
+    PyTraceBack_Here(py_frame);
+bad:
+    /* Drop this function's own references to the code object and frame. */
+    Py_XDECREF(py_code);
+    Py_XDECREF(py_frame);
+}
+#endif
+
+#if PY_MAJOR_VERSION < 3
+/* Python 2 only: fill `view` from `obj`, trying the generic buffer
+ * protocol first and then the Cython array and memoryview types.
+ * Returns the result of the selected getbuffer call, or -1 with a
+ * TypeError set when `obj` matches none of them.
+ */
+static int __Pyx_GetBuffer(PyObject *obj, Py_buffer *view, int flags) {
+    __Pyx_TypeName type_name;
+
+    if (PyObject_CheckBuffer(obj)) {
+        return PyObject_GetBuffer(obj, view, flags);
+    }
+    if (__Pyx_TypeCheck(obj, __pyx_array_type)) {
+        return __pyx_array_getbuffer(obj, view, flags);
+    }
+    if (__Pyx_TypeCheck(obj, __pyx_memoryview_type)) {
+        return __pyx_memoryview_getbuffer(obj, view, flags);
+    }
+
+    type_name = __Pyx_PyType_GetName(Py_TYPE(obj));
+    PyErr_Format(PyExc_TypeError,
+                 "'" __Pyx_FMT_TYPENAME "' does not have the buffer interface",
+                 type_name);
+    __Pyx_DECREF_TypeName(type_name);
+    return -1;
+}
+/* Python 2 only: release `view`. Objects supporting the buffer protocol
+ * are released through PyBuffer_Release; for any other owner the view is
+ * detached and the reference to the owner is dropped. A view without an
+ * owner is left untouched.
+ */
+static void __Pyx_ReleaseBuffer(Py_buffer *view) {
+    PyObject *owner = view->obj;
+
+    if (owner == NULL) {
+        return;
+    }
+    if (!PyObject_CheckBuffer(owner)) {
+        view->obj = NULL;
+        Py_DECREF(owner);
+        return;
+    }
+    PyBuffer_Release(view);
+}
+#endif
+
+
+  /* MemviewSliceIsContig */
+/* Return 1 if the first `ndim` dimensions of `mvs` are laid out
+ * contiguously in the requested order, 0 otherwise. With order 'F' the
+ * dimensions are walked first to last; with any other order they are
+ * walked last to first. Every dimension must have a negative suboffset
+ * and a stride equal to the item size times the product of the extents
+ * already visited.
+ */
+static int
+__pyx_memviewslice_is_contig(const __Pyx_memviewslice mvs, char order, int ndim)
+{
+    const int fortran_order = (order == 'F');
+    const int first_dim = fortran_order ? 0 : ndim - 1;
+    const int direction = fortran_order ? 1 : -1;
+    Py_ssize_t expected_stride = mvs.memview->view.itemsize;
+    int k;
+
+    for (k = 0; k < ndim; k++) {
+        const int dim = first_dim + direction * k;
+        if (mvs.suboffsets[dim] >= 0)
+            return 0;
+        if (mvs.strides[dim] != expected_stride)
+            return 0;
+        expected_stride *= mvs.shape[dim];
+    }
+    return 1;
+}
+
+/* OverlappingSlices */
+/* Compute the half-open address range [*out_start, *out_end) touched by
+ * the first `ndim` dimensions of `slice`. Positive strides extend the
+ * upper bound and non-positive strides extend the lower bound; the upper
+ * bound additionally includes one item of `itemsize` bytes. As soon as a
+ * dimension with extent 0 is met, both outputs are set to the lower bound
+ * accumulated so far and the function returns.
+ */
+static void
+__pyx_get_array_memory_extents(__Pyx_memviewslice *slice,
+                               void **out_start, void **out_end,
+                               int ndim, size_t itemsize)
+{
+    char *low = slice->data;
+    char *high = slice->data;
+    int dim;
+
+    for (dim = 0; dim < ndim; dim++) {
+        const Py_ssize_t stride = slice->strides[dim];
+        const Py_ssize_t extent = slice->shape[dim];
+
+        if (extent == 0) {
+            *out_start = *out_end = low;
+            return;
+        }
+        if (stride > 0) {
+            high += stride * (extent - 1);
+        } else {
+            low += stride * (extent - 1);
+        }
+    }
+    *out_start = low;
+    *out_end = high + itemsize;
+}
+/* Return non-zero when the address ranges computed by
+ * __pyx_get_array_memory_extents for the two slices intersect.
+ */
+static int
+__pyx_slices_overlap(__Pyx_memviewslice *slice1,
+                     __Pyx_memviewslice *slice2,
+                     int ndim, size_t itemsize)
+{
+    void *low1, *high1, *low2, *high2;
+
+    __pyx_get_array_memory_extents(slice1, &low1, &high1, ndim, itemsize);
+    __pyx_get_array_memory_extents(slice2, &low2, &high2, ndim, itemsize);
+    if (!(low1 < high2)) {
+        return 0;
+    }
+    return low2 < high1;
+}
+
+/* CIntFromPyVerify */
+/* Helper macros for integer conversion functions.
+ *
+ * __PYX__VERIFY_RETURN_INT evaluates `func_value` as `func_type` and
+ * returns it from the enclosing function cast to `target_type`. When
+ * `target_type` is narrower than `func_type` and the value does not
+ * survive the round trip, it jumps to the `raise_neg_overflow` label (if
+ * `is_unsigned` is set and the value is negative) or to `raise_overflow`.
+ * With `exc` set, a value of -1 combined with a pending exception returns
+ * (target_type) -1 instead.
+ *
+ * The expanding function must therefore provide `is_unsigned` and both
+ * labels. __PYX_VERIFY_RETURN_INT passes exc = 0 and
+ * __PYX_VERIFY_RETURN_INT_EXC passes exc = 1.
+ */
+  #define __PYX_VERIFY_RETURN_INT(target_type, func_type, func_value)\
+    __PYX__VERIFY_RETURN_INT(target_type, func_type, func_value, 0)
+#define __PYX_VERIFY_RETURN_INT_EXC(target_type, func_type, func_value)\
+    __PYX__VERIFY_RETURN_INT(target_type, func_type, func_value, 1)
+#define __PYX__VERIFY_RETURN_INT(target_type, func_type, func_value, exc)\
+    {\
+        func_type value = func_value;\
+        if (sizeof(target_type) < sizeof(func_type)) {\
+            if (unlikely(value != (func_type) (target_type) value)) {\
+                func_type zero = 0;\
+                if (exc && unlikely(value == (func_type)-1 && PyErr_Occurred()))\
+                    return (target_type) -1;\
+                if (is_unsigned && unlikely(value < zero))\
+                    goto raise_neg_overflow;\
+                else\
+                    goto raise_overflow;\
+            }\
+        }\
+        return (target_type) value;\
+    }
+
+/* MemviewDtypeToObject */
+/* Read one DTYPE_t item at `itemp` and return it as a Python integer
+ * built by __Pyx_PyInt_From_npy_int64.
+ */
+static CYTHON_INLINE PyObject *__pyx_memview_get_nn___pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t(const char *itemp) {
+    __pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t item =
+        *(__pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t *) itemp;
+    return (PyObject *) __Pyx_PyInt_From_npy_int64(item);
+}
+/* Convert `obj` with __Pyx_PyInt_As_npy_int64 and store the result at
+ * `itemp`. Returns 1 on success and 0 when the conversion raised (in
+ * which case nothing is written).
+ */
+static CYTHON_INLINE int __pyx_memview_set_nn___pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t(const char *itemp, PyObject *obj) {
+    __pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t converted = __Pyx_PyInt_As_npy_int64(obj);
+
+    /* -1 is both a valid value and the error marker, so check for a pending exception. */
+    if (unlikely((converted == ((npy_int64)-1)) && PyErr_Occurred())) {
+        return 0;
+    }
+    *(__pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t *) itemp = converted;
+    return 1;
+}
+
+/* TypeInfoCompare */
+/* Compare two type descriptors; returns 1 when they are considered
+ * compatible and 0 otherwise.
+ *
+ * NULL never matches and identical pointers always match. When size,
+ * typegroup, signedness or ndim differ, the result is a plain size
+ * comparison if either side has typegroup 'H', and 0 otherwise. Beyond
+ * that the array sizes must agree, and for typegroup 'S' the flags and the
+ * field lists (offsets and, recursively, field types) must agree as well.
+ */
+static int
+__pyx_typeinfo_cmp(__Pyx_TypeInfo *a, __Pyx_TypeInfo *b)
+{
+    int idx;
+
+    if (a == NULL || b == NULL)
+        return 0;
+    if (a == b)
+        return 1;
+
+    if (a->size != b->size || a->typegroup != b->typegroup ||
+            a->is_unsigned != b->is_unsigned || a->ndim != b->ndim) {
+        if (a->typegroup == 'H' || b->typegroup == 'H')
+            return a->size == b->size;
+        return 0;
+    }
+
+    for (idx = 0; idx < a->ndim; idx++) {
+        if (a->arraysize[idx] != b->arraysize[idx])
+            return 0;
+    }
+
+    if (a->typegroup != 'S')
+        return 1;
+    if (a->flags != b->flags)
+        return 0;
+    if (!a->fields && !b->fields)
+        return 1;
+    if (!a->fields || !b->fields)
+        return 0;
+
+    /* Walk both field lists in lockstep until either one ends. */
+    for (idx = 0; a->fields[idx].type && b->fields[idx].type; idx++) {
+        __Pyx_StructField *lhs = a->fields + idx;
+        __Pyx_StructField *rhs = b->fields + idx;
+        if (lhs->offset != rhs->offset)
+            return 0;
+        if (!__pyx_typeinfo_cmp(lhs->type, rhs->type))
+            return 0;
+    }
+    /* Equal only if both lists ended at the same position. */
+    return !a->fields[idx].type && !b->fields[idx].type;
+}
+
+/* MemviewSliceValidateAndInit */
+/* Check that dimension `dim` of `buf` satisfies the stride requirements
+ * encoded in the axis spec bits `spec`. Returns 1 when the dimension is
+ * acceptable and 0 with a ValueError set otherwise. Dimensions of extent
+ * 0 or 1 are always accepted.
+ */
+  static int
+__pyx_check_strides(Py_buffer *buf, int dim, int ndim, int spec)
+{
+    if (buf->shape[dim] <= 1)
+        return 1;
+    if (buf->strides) {
+        if (spec & __Pyx_MEMVIEW_CONTIG) {
+            if (spec & (__Pyx_MEMVIEW_PTR|__Pyx_MEMVIEW_FULL)) {
+                /* With PTR/FULL access the stride must equal the size of a pointer. */
+                if (unlikely(buf->strides[dim] != sizeof(void *))) {
+                    PyErr_Format(PyExc_ValueError,
+                                 "Buffer is not indirectly contiguous "
+                                 "in dimension %d.", dim);
+                    goto fail;
+                }
+            } else if (unlikely(buf->strides[dim] != buf->itemsize)) {
+                PyErr_SetString(PyExc_ValueError,
+                                "Buffer and memoryview are not contiguous "
+                                "in the same dimension.");
+                goto fail;
+            }
+        }
+        if (spec & __Pyx_MEMVIEW_FOLLOW) {
+            /* The absolute stride must be at least one item wide. */
+            Py_ssize_t stride = buf->strides[dim];
+            if (stride < 0)
+                stride = -stride;
+            if (unlikely(stride < buf->itemsize)) {
+                PyErr_SetString(PyExc_ValueError,
+                                "Buffer and memoryview are not contiguous "
+                                "in the same dimension.");
+                goto fail;
+            }
+        }
+    } else {
+        /* The buffer supplies no strides array. */
+        if (unlikely(spec & __Pyx_MEMVIEW_CONTIG && dim != ndim - 1)) {
+            PyErr_Format(PyExc_ValueError,
+                         "C-contiguous buffer is not contiguous in "
+                         "dimension %d", dim);
+            goto fail;
+        } else if (unlikely(spec & (__Pyx_MEMVIEW_PTR))) {
+            PyErr_Format(PyExc_ValueError,
+                         "C-contiguous buffer is not indirect in "
+                         "dimension %d", dim);
+            goto fail;
+        } else if (unlikely(buf->suboffsets)) {
+            PyErr_SetString(PyExc_ValueError,
+                            "Buffer exposes suboffsets but no strides");
+            goto fail;
+        }
+    }
+    return 1;
+fail:
+    return 0;
+}
+/* Check the suboffset of dimension `dim` of `buf` against the access mode
+ * bits in `spec`: direct access requires that no non-negative suboffset
+ * is present, pointer access requires a non-negative one. Returns 1 when
+ * the dimension is acceptable and 0 with a ValueError set otherwise.
+ */
+static int
+__pyx_check_suboffsets(Py_buffer *buf, int dim, int ndim, int spec)
+{
+    CYTHON_UNUSED_VAR(ndim);
+
+    if ((spec & __Pyx_MEMVIEW_DIRECT) &&
+            unlikely(buf->suboffsets && buf->suboffsets[dim] >= 0)) {
+        PyErr_Format(PyExc_ValueError,
+                     "Buffer not compatible with direct access "
+                     "in dimension %d.", dim);
+        return 0;
+    }
+    if ((spec & __Pyx_MEMVIEW_PTR) &&
+            unlikely(!buf->suboffsets || (buf->suboffsets[dim] < 0))) {
+        PyErr_Format(PyExc_ValueError,
+                     "Buffer is not indirectly accessible "
+                     "in dimension %d.", dim);
+        return 0;
+    }
+    return 1;
+}
+/* Verify the overall contiguity of `buf` as requested by `c_or_f_flag`.
+ * For Fortran order the dimensions are walked first to last, for C order
+ * last to first; each dimension with extent greater than 1 must have a
+ * stride equal to the item size times the product of the extents already
+ * visited. Returns 1 on success (also when neither flag is set) and 0
+ * with a ValueError set otherwise.
+ */
+static int
+__pyx_verify_contig(Py_buffer *buf, int ndim, int c_or_f_flag)
+{
+    Py_ssize_t expected = 1;
+    int dim;
+
+    if (c_or_f_flag & __Pyx_IS_F_CONTIG) {
+        for (dim = 0; dim < ndim; dim++) {
+            if (unlikely(expected * buf->itemsize != buf->strides[dim]  &&  buf->shape[dim] > 1)) {
+                PyErr_SetString(PyExc_ValueError,
+                    "Buffer not fortran contiguous.");
+                return 0;
+            }
+            expected = expected * buf->shape[dim];
+        }
+        return 1;
+    }
+    if (c_or_f_flag & __Pyx_IS_C_CONTIG) {
+        for (dim = ndim - 1; dim >= 0; dim--) {
+            if (unlikely(expected * buf->itemsize != buf->strides[dim]  &&  buf->shape[dim] > 1)) {
+                PyErr_SetString(PyExc_ValueError,
+                    "Buffer not C contiguous.");
+                return 0;
+            }
+            expected = expected * buf->shape[dim];
+        }
+    }
+    return 1;
+}
+/* Validate `original_obj` against the expected memoryview layout and
+ * initialise `memviewslice` from it.
+ *
+ * If `original_obj` is already a Cython memoryview whose typeinfo matches
+ * `dtype`, it is used directly; otherwise a new memoryview is created from
+ * it with `buf_flags`. The buffer must have `ndim` dimensions, an item
+ * size equal to dtype->size and, for newly created memoryviews, a format
+ * string accepted by __Pyx_BufFmt_CheckString. For non-empty buffers each
+ * axis is checked against its entry in `axes_specs`, and overall
+ * contiguity against `c_or_f_flag` when strides are present.
+ *
+ * Returns 0 on success and -1 on failure; on failure a memoryview created
+ * here is released.
+ */
+static int __Pyx_ValidateAndInit_memviewslice(
+                int *axes_specs,
+                int c_or_f_flag,
+                int buf_flags,
+                int ndim,
+                __Pyx_TypeInfo *dtype,
+                __Pyx_BufFmt_StackElem stack[],
+                __Pyx_memviewslice *memviewslice,
+                PyObject *original_obj)
+{
+    struct __pyx_memoryview_obj *memview, *new_memview;
+    __Pyx_RefNannyDeclarations
+    Py_buffer *buf;
+    int i, spec = 0, retval = -1;
+    __Pyx_BufFmt_Context ctx;
+    int from_memoryview = __pyx_memoryview_check(original_obj);
+    __Pyx_RefNannySetupContext("ValidateAndInit_memviewslice", 0);
+    /* new_memview stays NULL when the caller's memoryview is reused, so
+       the failure path only releases objects created in this function. */
+    if (from_memoryview && __pyx_typeinfo_cmp(dtype, ((struct __pyx_memoryview_obj *)
+                                                            original_obj)->typeinfo)) {
+        memview = (struct __pyx_memoryview_obj *) original_obj;
+        new_memview = NULL;
+    } else {
+        memview = (struct __pyx_memoryview_obj *) __pyx_memoryview_new(
+                                            original_obj, buf_flags, 0, dtype);
+        new_memview = memview;
+        if (unlikely(!memview))
+            goto fail;
+    }
+    buf = &memview->view;
+    if (unlikely(buf->ndim != ndim)) {
+        PyErr_Format(PyExc_ValueError,
+                "Buffer has wrong number of dimensions (expected %d, got %d)",
+                ndim, buf->ndim);
+        goto fail;
+    }
+    /* The format string is only checked for memoryviews created above. */
+    if (new_memview) {
+        __Pyx_BufFmt_Init(&ctx, stack, dtype);
+        if (unlikely(!__Pyx_BufFmt_CheckString(&ctx, buf->format))) goto fail;
+    }
+    if (unlikely((unsigned) buf->itemsize != dtype->size)) {
+        PyErr_Format(PyExc_ValueError,
+                     "Item size of buffer (%" CYTHON_FORMAT_SSIZE_T "u byte%s) "
+                     "does not match size of '%s' (%" CYTHON_FORMAT_SSIZE_T "u byte%s)",
+                     buf->itemsize,
+                     (buf->itemsize > 1) ? "s" : "",
+                     dtype->name,
+                     dtype->size,
+                     (dtype->size > 1) ? "s" : "");
+        goto fail;
+    }
+    /* Layout checks are skipped entirely for empty buffers. */
+    if (buf->len > 0) {
+        for (i = 0; i < ndim; i++) {
+            spec = axes_specs[i];
+            if (unlikely(!__pyx_check_strides(buf, i, ndim, spec)))
+                goto fail;
+            if (unlikely(!__pyx_check_suboffsets(buf, i, ndim, spec)))
+                goto fail;
+        }
+        if (unlikely(buf->strides && !__pyx_verify_contig(buf, ndim, c_or_f_flag)))
+            goto fail;
+    }
+    if (unlikely(__Pyx_init_memviewslice(memview, ndim, memviewslice,
+                                         new_memview != NULL) == -1)) {
+        goto fail;
+    }
+    retval = 0;
+    goto no_fail;
+fail:
+    Py_XDECREF(new_memview);
+    retval = -1;
+no_fail:
+    __Pyx_RefNannyFinishContext();
+    return retval;
+}
+
+/* ObjectToMemviewSlice */
+  static CYTHON_INLINE __Pyx_memviewslice __Pyx_PyObject_to_MemoryviewSlice_ds_nn___pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t(PyObject *obj, int writable_flag) {
+    /* Convert obj into a one-dimensional DTYPE_t memoryview slice with a
+     * direct, strided axis. None is passed through as a slice whose memview
+     * is Py_None. On validation failure the returned slice has memview and
+     * data set to NULL. */
+    __Pyx_memviewslice slice = { 0, 0, { 0 }, { 0 }, { 0 } };
+    __Pyx_BufFmt_StackElem fmt_stack[1];
+    int axis_flags[] = { (__Pyx_MEMVIEW_DIRECT | __Pyx_MEMVIEW_STRIDED) };
+    int status;
+
+    if (obj == Py_None) {
+        slice.memview = (struct __pyx_memoryview_obj *) Py_None;
+        return slice;
+    }
+
+    status = __Pyx_ValidateAndInit_memviewslice(
+        axis_flags, 0,
+        PyBUF_RECORDS_RO | writable_flag, 1,
+        &__Pyx_TypeInfo_nn___pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t,
+        fmt_stack, &slice, obj);
+    if (unlikely(status == -1)) {
+        /* Failure: hand back an explicitly empty slice. */
+        slice.memview = NULL;
+        slice.data = NULL;
+    }
+    return slice;
+}
+
+/* ObjectToMemviewSlice */
+  static CYTHON_INLINE __Pyx_memviewslice __Pyx_PyObject_to_MemoryviewSlice_dsds_nn___pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t(PyObject *obj, int writable_flag) {
+    /* Convert obj into a two-dimensional DTYPE_t memoryview slice whose axes
+     * are both direct and strided. None is passed through as a slice whose
+     * memview is Py_None. On validation failure the returned slice has
+     * memview and data set to NULL. */
+    __Pyx_memviewslice slice = { 0, 0, { 0 }, { 0 }, { 0 } };
+    __Pyx_BufFmt_StackElem fmt_stack[1];
+    int axis_flags[] = {
+        (__Pyx_MEMVIEW_DIRECT | __Pyx_MEMVIEW_STRIDED),
+        (__Pyx_MEMVIEW_DIRECT | __Pyx_MEMVIEW_STRIDED)
+    };
+    int status;
+
+    if (obj == Py_None) {
+        slice.memview = (struct __pyx_memoryview_obj *) Py_None;
+        return slice;
+    }
+
+    status = __Pyx_ValidateAndInit_memviewslice(
+        axis_flags, 0,
+        PyBUF_RECORDS_RO | writable_flag, 2,
+        &__Pyx_TypeInfo_nn___pyx_t_7fairseq_4data_22token_block_utils_fast_DTYPE_t,
+        fmt_stack, &slice, obj);
+    if (unlikely(status == -1)) {
+        /* Failure: hand back an explicitly empty slice. */
+        slice.memview = NULL;
+        slice.data = NULL;
+    }
+    return slice;
+}
+
+/* Declarations
+ * Build a float complex value from a real part x and an imaginary part y.
+ * The preprocessor selects one of three definitions: std::complex under
+ * C++, the `_Complex_I` expression in C when the CYTHON_CCOMPLEX condition
+ * holds, and otherwise a struct whose .real/.imag fields are assigned.
+ */
+  #if CYTHON_CCOMPLEX && (1) && (!0 || __cplusplus)
+  #ifdef __cplusplus
+    static CYTHON_INLINE __pyx_t_float_complex __pyx_t_float_complex_from_parts(float x, float y) {
+      return ::std::complex< float >(x, y);
+    }
+  #else
+    static CYTHON_INLINE __pyx_t_float_complex __pyx_t_float_complex_from_parts(float x, float y) {
+      return x + y*(__pyx_t_float_complex)_Complex_I;
+    }
+  #endif
+#else
+    static CYTHON_INLINE __pyx_t_float_complex __pyx_t_float_complex_from_parts(float x, float y) {
+      __pyx_t_float_complex z;
+      z.real = x;
+      z.imag = y;
+      return z;
+    }
+#endif
+
+/* Arithmetic
+ * Struct-based float complex helpers (eq, sum, diff, prod, quot, neg,
+ * is_zero, conj, abs, pow). They are compiled only in the #else branch,
+ * i.e. when the CYTHON_CCOMPLEX condition below is false and the value is
+ * accessed through its .real/.imag fields.
+ */
+  #if CYTHON_CCOMPLEX && (1) && (!0 || __cplusplus)
+#else
+    static CYTHON_INLINE int __Pyx_c_eq_float(__pyx_t_float_complex a, __pyx_t_float_complex b) { /* component-wise equality */
+       return (a.real == b.real) && (a.imag == b.imag);
+    }
+    static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_sum_float(__pyx_t_float_complex a, __pyx_t_float_complex b) { /* a + b */
+        __pyx_t_float_complex z;
+        z.real = a.real + b.real;
+        z.imag = a.imag + b.imag;
+        return z;
+    }
+    static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_diff_float(__pyx_t_float_complex a, __pyx_t_float_complex b) { /* a - b */
+        __pyx_t_float_complex z;
+        z.real = a.real - b.real;
+        z.imag = a.imag - b.imag;
+        return z;
+    }
+    static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_prod_float(__pyx_t_float_complex a, __pyx_t_float_complex b) { /* a * b */
+        __pyx_t_float_complex z;
+        z.real = a.real * b.real - a.imag * b.imag;
+        z.imag = a.real * b.imag + a.imag * b.real;
+        return z;
+    }
+    #if 1
+    static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_quot_float(__pyx_t_float_complex a, __pyx_t_float_complex b) { /* a / b, scaled by the ratio of
+         * b's components (smaller over larger magnitude) instead of forming
+         * b.real^2 + b.imag^2 directly */
+        if (b.imag == 0) { /* real divisor: divide both components by b.real */
+            return __pyx_t_float_complex_from_parts(a.real / b.real, a.imag / b.real);
+        } else if (fabsf(b.real) >= fabsf(b.imag)) {
+            if (b.real == 0 && b.imag == 0) { /* NOTE(review): b.imag == 0 already returned above, so this looks unreachable */
+                return __pyx_t_float_complex_from_parts(a.real / b.real, a.imag / b.imag);
+            } else {
+                float r = b.imag / b.real;
+                float s = (float)(1.0) / (b.real + b.imag * r);
+                return __pyx_t_float_complex_from_parts(
+                    (a.real + a.imag * r) * s, (a.imag - a.real * r) * s);
+            }
+        } else {
+            float r = b.real / b.imag;
+            float s = (float)(1.0) / (b.imag + b.real * r);
+            return __pyx_t_float_complex_from_parts(
+                (a.real * r + a.imag) * s, (a.imag * r - a.real) * s);
+        }
+    }
+    #else
+    static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_quot_float(__pyx_t_float_complex a, __pyx_t_float_complex b) { /* disabled by the `#if 1` above:
+         * direct form dividing by b.real^2 + b.imag^2 */
+        if (b.imag == 0) {
+            return __pyx_t_float_complex_from_parts(a.real / b.real, a.imag / b.real);
+        } else {
+            float denom = b.real * b.real + b.imag * b.imag;
+            return __pyx_t_float_complex_from_parts(
+                (a.real * b.real + a.imag * b.imag) / denom,
+                (a.imag * b.real - a.real * b.imag) / denom);
+        }
+    }
+    #endif
+    static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_neg_float(__pyx_t_float_complex a) { /* -a */
+        __pyx_t_float_complex z;
+        z.real = -a.real;
+        z.imag = -a.imag;
+        return z;
+    }
+    static CYTHON_INLINE int __Pyx_c_is_zero_float(__pyx_t_float_complex a) { /* true when both components compare equal to 0 */
+       return (a.real == 0) && (a.imag == 0);
+    }
+    static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_conj_float(__pyx_t_float_complex a) { /* complex conjugate */
+        __pyx_t_float_complex z;
+        z.real =  a.real;
+        z.imag = -a.imag;
+        return z;
+    }
+    #if 1
+        static CYTHON_INLINE float __Pyx_c_abs_float(__pyx_t_float_complex z) { /* modulus; hypotf is used only when HAVE_HYPOT is defined and _MSC_VER is not */
+          #if !defined(HAVE_HYPOT) || defined(_MSC_VER)
+            return sqrtf(z.real*z.real + z.imag*z.imag);
+          #else
+            return hypotf(z.real, z.imag);
+          #endif
+        }
+        static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_pow_float(__pyx_t_float_complex a, __pyx_t_float_complex b) { /* a raised to the power b */
+            __pyx_t_float_complex z;
+            float r, lnr, theta, z_r, z_theta;
+            if (b.imag == 0 && b.real == (int)b.real) { /* exponent is a real value equal to its int truncation */
+                if (b.real < 0) { /* negative exponent: replace a by 1/a and negate the exponent */
+                    float denom = a.real * a.real + a.imag * a.imag;
+                    a.real = a.real / denom;
+                    a.imag = -a.imag / denom;
+                    b.real = -b.real;
+                }
+                switch ((int)b.real) { /* exponents 0..4 use repeated multiplication; any other value falls through to the code below */
+                    case 0:
+                        z.real = 1;
+                        z.imag = 0;
+                        return z;
+                    case 1:
+                        return a;
+                    case 2:
+                        return __Pyx_c_prod_float(a, a);
+                    case 3:
+                        z = __Pyx_c_prod_float(a, a);
+                        return __Pyx_c_prod_float(z, a);
+                    case 4:
+                        z = __Pyx_c_prod_float(a, a);
+                        return __Pyx_c_prod_float(z, z);
+                }
+            }
+            if (a.imag == 0) {
+                if (a.real == 0) { /* zero base is returned unchanged */
+                    return a;
+                } else if ((b.imag == 0) && (a.real >= 0)) { /* positive real base with real exponent: plain powf */
+                    z.real = powf(a.real, b.real);
+                    z.imag = 0;
+                    return z;
+                } else if (a.real > 0) {
+                    r = a.real;
+                    theta = 0;
+                } else {
+                    r = -a.real;
+                    theta = atan2f(0.0, -1.0); /* angle of the negative real axis (pi) */
+                }
+            } else {
+                r = __Pyx_c_abs_float(a);
+                theta = atan2f(a.imag, a.real);
+            }
+            lnr = logf(r); /* general case: polar form with modulus r and angle theta */
+            z_r = expf(lnr * b.real - theta * b.imag);
+            z_theta = theta * b.real + lnr * b.imag;
+            z.real = z_r * cosf(z_theta);
+            z.imag = z_r * sinf(z_theta);
+            return z;
+        }
+    #endif
+#endif
+
+/* Declarations
+ * Build a double complex value from a real part x and an imaginary part y.
+ * The preprocessor selects one of three definitions: std::complex under
+ * C++, the `_Complex_I` expression in C when the CYTHON_CCOMPLEX condition
+ * holds, and otherwise a struct whose .real/.imag fields are assigned.
+ */
+  #if CYTHON_CCOMPLEX && (1) && (!0 || __cplusplus)
+  #ifdef __cplusplus
+    static CYTHON_INLINE __pyx_t_double_complex __pyx_t_double_complex_from_parts(double x, double y) {
+      return ::std::complex< double >(x, y);
+    }
+  #else
+    static CYTHON_INLINE __pyx_t_double_complex __pyx_t_double_complex_from_parts(double x, double y) {
+      return x + y*(__pyx_t_double_complex)_Complex_I;
+    }
+  #endif
+#else
+    static CYTHON_INLINE __pyx_t_double_complex __pyx_t_double_complex_from_parts(double x, double y) {
+      __pyx_t_double_complex z;
+      z.real = x;
+      z.imag = y;
+      return z;
+    }
+#endif
+
+/* Arithmetic
+ * Struct-based double complex helpers (eq, sum, diff, prod, quot, neg,
+ * is_zero, conj, abs, pow). They are compiled only in the #else branch,
+ * i.e. when the CYTHON_CCOMPLEX condition below is false and the value is
+ * accessed through its .real/.imag fields.
+ */
+  #if CYTHON_CCOMPLEX && (1) && (!0 || __cplusplus)
+#else
+    static CYTHON_INLINE int __Pyx_c_eq_double(__pyx_t_double_complex a, __pyx_t_double_complex b) { /* component-wise equality */
+       return (a.real == b.real) && (a.imag == b.imag);
+    }
+    static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_sum_double(__pyx_t_double_complex a, __pyx_t_double_complex b) { /* a + b */
+        __pyx_t_double_complex z;
+        z.real = a.real + b.real;
+        z.imag = a.imag + b.imag;
+        return z;
+    }
+    static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_diff_double(__pyx_t_double_complex a, __pyx_t_double_complex b) { /* a - b */
+        __pyx_t_double_complex z;
+        z.real = a.real - b.real;
+        z.imag = a.imag - b.imag;
+        return z;
+    }
+    static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_prod_double(__pyx_t_double_complex a, __pyx_t_double_complex b) { /* a * b */
+        __pyx_t_double_complex z;
+        z.real = a.real * b.real - a.imag * b.imag;
+        z.imag = a.real * b.imag + a.imag * b.real;
+        return z;
+    }
+    #if 1
+    static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_quot_double(__pyx_t_double_complex a, __pyx_t_double_complex b) { /* a / b, scaled by the ratio of
+         * b's components (smaller over larger magnitude) instead of forming
+         * b.real^2 + b.imag^2 directly */
+        if (b.imag == 0) { /* real divisor: divide both components by b.real */
+            return __pyx_t_double_complex_from_parts(a.real / b.real, a.imag / b.real);
+        } else if (fabs(b.real) >= fabs(b.imag)) {
+            if (b.real == 0 && b.imag == 0) { /* NOTE(review): b.imag == 0 already returned above, so this looks unreachable */
+                return __pyx_t_double_complex_from_parts(a.real / b.real, a.imag / b.imag);
+            } else {
+                double r = b.imag / b.real;
+                double s = (double)(1.0) / (b.real + b.imag * r);
+                return __pyx_t_double_complex_from_parts(
+                    (a.real + a.imag * r) * s, (a.imag - a.real * r) * s);
+            }
+        } else {
+            double r = b.real / b.imag;
+            double s = (double)(1.0) / (b.imag + b.real * r);
+            return __pyx_t_double_complex_from_parts(
+                (a.real * r + a.imag) * s, (a.imag * r - a.real) * s);
+        }
+    }
+    #else
+    static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_quot_double(__pyx_t_double_complex a, __pyx_t_double_complex b) { /* disabled by the `#if 1` above:
+         * direct form dividing by b.real^2 + b.imag^2 */
+        if (b.imag == 0) {
+            return __pyx_t_double_complex_from_parts(a.real / b.real, a.imag / b.real);
+        } else {
+            double denom = b.real * b.real + b.imag * b.imag;
+            return __pyx_t_double_complex_from_parts(
+                (a.real * b.real + a.imag * b.imag) / denom,
+                (a.imag * b.real - a.real * b.imag) / denom);
+        }
+    }
+    #endif
+    static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_neg_double(__pyx_t_double_complex a) { /* -a */
+        __pyx_t_double_complex z;
+        z.real = -a.real;
+        z.imag = -a.imag;
+        return z;
+    }
+    static CYTHON_INLINE int __Pyx_c_is_zero_double(__pyx_t_double_complex a) { /* true when both components compare equal to 0 */
+       return (a.real == 0) && (a.imag == 0);
+    }
+    static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_conj_double(__pyx_t_double_complex a) { /* complex conjugate */
+        __pyx_t_double_complex z;
+        z.real =  a.real;
+        z.imag = -a.imag;
+        return z;
+    }
+    #if 1
+        static CYTHON_INLINE double __Pyx_c_abs_double(__pyx_t_double_complex z) { /* modulus; hypot is used only when HAVE_HYPOT is defined and _MSC_VER is not */
+          #if !defined(HAVE_HYPOT) || defined(_MSC_VER)
+            return sqrt(z.real*z.real + z.imag*z.imag);
+          #else
+            return hypot(z.real, z.imag);
+          #endif
+        }
+        static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_pow_double(__pyx_t_double_complex a, __pyx_t_double_complex b) { /* a raised to the power b */
+            __pyx_t_double_complex z;
+            double r, lnr, theta, z_r, z_theta;
+            if (b.imag == 0 && b.real == (int)b.real) { /* exponent is a real value equal to its int truncation */
+                if (b.real < 0) { /* negative exponent: replace a by 1/a and negate the exponent */
+                    double denom = a.real * a.real + a.imag * a.imag;
+                    a.real = a.real / denom;
+                    a.imag = -a.imag / denom;
+                    b.real = -b.real;
+                }
+                switch ((int)b.real) { /* exponents 0..4 use repeated multiplication; any other value falls through to the code below */
+                    case 0:
+                        z.real = 1;
+                        z.imag = 0;
+                        return z;
+                    case 1:
+                        return a;
+                    case 2:
+                        return __Pyx_c_prod_double(a, a);
+                    case 3:
+                        z = __Pyx_c_prod_double(a, a);
+                        return __Pyx_c_prod_double(z, a);
+                    case 4:
+                        z = __Pyx_c_prod_double(a, a);
+                        return __Pyx_c_prod_double(z, z);
+                }
+            }
+            if (a.imag == 0) {
+                if (a.real == 0) { /* zero base is returned unchanged */
+                    return a;
+                } else if ((b.imag == 0) && (a.real >= 0)) { /* positive real base with real exponent: plain pow */
+                    z.real = pow(a.real, b.real);
+                    z.imag = 0;
+                    return z;
+                } else if (a.real > 0) {
+                    r = a.real;
+                    theta = 0;
+                } else {
+                    r = -a.real;
+                    theta = atan2(0.0, -1.0); /* angle of the negative real axis (pi) */
+                }
+            } else {
+                r = __Pyx_c_abs_double(a);
+                theta = atan2(a.imag, a.real);
+            }
+            lnr = log(r); /* general case: polar form with modulus r and angle theta */
+            z_r = exp(lnr * b.real - theta * b.imag);
+            z_theta = theta * b.real + lnr * b.imag;
+            z.real = z_r * cos(z_theta);
+            z.imag = z_r * sin(z_theta);
+            return z;
+        }
+    #endif
+#endif
+
+/* Declarations
+ * Build a long double complex value from a real part x and an imaginary
+ * part y. The preprocessor selects one of three definitions: std::complex
+ * under C++, the `_Complex_I` expression in C when the CYTHON_CCOMPLEX
+ * condition holds, and otherwise a struct whose .real/.imag fields are
+ * assigned.
+ */
+  #if CYTHON_CCOMPLEX && (1) && (!0 || __cplusplus)
+  #ifdef __cplusplus
+    static CYTHON_INLINE __pyx_t_long_double_complex __pyx_t_long_double_complex_from_parts(long double x, long double y) {
+      return ::std::complex< long double >(x, y);
+    }
+  #else
+    static CYTHON_INLINE __pyx_t_long_double_complex __pyx_t_long_double_complex_from_parts(long double x, long double y) {
+      return x + y*(__pyx_t_long_double_complex)_Complex_I;
+    }
+  #endif
+#else
+    static CYTHON_INLINE __pyx_t_long_double_complex __pyx_t_long_double_complex_from_parts(long double x, long double y) {
+      __pyx_t_long_double_complex z;
+      z.real = x;
+      z.imag = y;
+      return z;
+    }
+#endif
+
+/* Arithmetic
+ * Struct-based long double complex helpers (eq, sum, diff, prod, quot, neg,
+ * is_zero, conj, abs, pow). They are compiled only in the #else branch,
+ * i.e. when the CYTHON_CCOMPLEX condition below is false and the value is
+ * accessed through its .real/.imag fields.
+ */
+  #if CYTHON_CCOMPLEX && (1) && (!0 || __cplusplus)
+#else
+    static CYTHON_INLINE int __Pyx_c_eq_long__double(__pyx_t_long_double_complex a, __pyx_t_long_double_complex b) { /* component-wise equality */
+       return (a.real == b.real) && (a.imag == b.imag);
+    }
+    static CYTHON_INLINE __pyx_t_long_double_complex __Pyx_c_sum_long__double(__pyx_t_long_double_complex a, __pyx_t_long_double_complex b) { /* a + b */
+        __pyx_t_long_double_complex z;
+        z.real = a.real + b.real;
+        z.imag = a.imag + b.imag;
+        return z;
+    }
+    static CYTHON_INLINE __pyx_t_long_double_complex __Pyx_c_diff_long__double(__pyx_t_long_double_complex a, __pyx_t_long_double_complex b) { /* a - b */
+        __pyx_t_long_double_complex z;
+        z.real = a.real - b.real;
+        z.imag = a.imag - b.imag;
+        return z;
+    }
+    static CYTHON_INLINE __pyx_t_long_double_complex __Pyx_c_prod_long__double(__pyx_t_long_double_complex a, __pyx_t_long_double_complex b) { /* a * b */
+        __pyx_t_long_double_complex z;
+        z.real = a.real * b.real - a.imag * b.imag;
+        z.imag = a.real * b.imag + a.imag * b.real;
+        return z;
+    }
+    #if 1
+    static CYTHON_INLINE __pyx_t_long_double_complex __Pyx_c_quot_long__double(__pyx_t_long_double_complex a, __pyx_t_long_double_complex b) { /* a / b, scaled by the ratio of
+         * b's components (smaller over larger magnitude) instead of forming
+         * b.real^2 + b.imag^2 directly */
+        if (b.imag == 0) { /* real divisor: divide both components by b.real */
+            return __pyx_t_long_double_complex_from_parts(a.real / b.real, a.imag / b.real);
+        } else if (fabsl(b.real) >= fabsl(b.imag)) {
+            if (b.real == 0 && b.imag == 0) { /* NOTE(review): b.imag == 0 already returned above, so this looks unreachable */
+                return __pyx_t_long_double_complex_from_parts(a.real / b.real, a.imag / b.imag);
+            } else {
+                long double r = b.imag / b.real;
+                long double s = (long double)(1.0) / (b.real + b.imag * r);
+                return __pyx_t_long_double_complex_from_parts(
+                    (a.real + a.imag * r) * s, (a.imag - a.real * r) * s);
+            }
+        } else {
+            long double r = b.real / b.imag;
+            long double s = (long double)(1.0) / (b.imag + b.real * r);
+            return __pyx_t_long_double_complex_from_parts(
+                (a.real * r + a.imag) * s, (a.imag * r - a.real) * s);
+        }
+    }
+    #else
+    static CYTHON_INLINE __pyx_t_long_double_complex __Pyx_c_quot_long__double(__pyx_t_long_double_complex a, __pyx_t_long_double_complex b) { /* disabled by the `#if 1` above:
+         * direct form dividing by b.real^2 + b.imag^2 */
+        if (b.imag == 0) {
+            return __pyx_t_long_double_complex_from_parts(a.real / b.real, a.imag / b.real);
+        } else {
+            long double denom = b.real * b.real + b.imag * b.imag;
+            return __pyx_t_long_double_complex_from_parts(
+                (a.real * b.real + a.imag * b.imag) / denom,
+                (a.imag * b.real - a.real * b.imag) / denom);
+        }
+    }
+    #endif
+    static CYTHON_INLINE __pyx_t_long_double_complex __Pyx_c_neg_long__double(__pyx_t_long_double_complex a) { /* -a */
+        __pyx_t_long_double_complex z;
+        z.real = -a.real;
+        z.imag = -a.imag;
+        return z;
+    }
+    static CYTHON_INLINE int __Pyx_c_is_zero_long__double(__pyx_t_long_double_complex a) { /* true when both components compare equal to 0 */
+       return (a.real == 0) && (a.imag == 0);
+    }
+    static CYTHON_INLINE __pyx_t_long_double_complex __Pyx_c_conj_long__double(__pyx_t_long_double_complex a) { /* complex conjugate */
+        __pyx_t_long_double_complex z;
+        z.real =  a.real;
+        z.imag = -a.imag;
+        return z;
+    }
+    #if 1
+        static CYTHON_INLINE long double __Pyx_c_abs_long__double(__pyx_t_long_double_complex z) { /* modulus; hypotl is used only when HAVE_HYPOT is defined and _MSC_VER is not */
+          #if !defined(HAVE_HYPOT) || defined(_MSC_VER)
+            return sqrtl(z.real*z.real + z.imag*z.imag);
+          #else
+            return hypotl(z.real, z.imag);
+          #endif
+        }
+        static CYTHON_INLINE __pyx_t_long_double_complex __Pyx_c_pow_long__double(__pyx_t_long_double_complex a, __pyx_t_long_double_complex b) { /* a raised to the power b */
+            __pyx_t_long_double_complex z;
+            long double r, lnr, theta, z_r, z_theta;
+            if (b.imag == 0 && b.real == (int)b.real) { /* exponent is a real value equal to its int truncation */
+                if (b.real < 0) { /* negative exponent: replace a by 1/a and negate the exponent */
+                    long double denom = a.real * a.real + a.imag * a.imag;
+                    a.real = a.real / denom;
+                    a.imag = -a.imag / denom;
+                    b.real = -b.real;
+                }
+                switch ((int)b.real) { /* exponents 0..4 use repeated multiplication; any other value falls through to the code below */
+                    case 0:
+                        z.real = 1;
+                        z.imag = 0;
+                        return z;
+                    case 1:
+                        return a;
+                    case 2:
+                        return __Pyx_c_prod_long__double(a, a);
+                    case 3:
+                        z = __Pyx_c_prod_long__double(a, a);
+                        return __Pyx_c_prod_long__double(z, a);
+                    case 4:
+                        z = __Pyx_c_prod_long__double(a, a);
+                        return __Pyx_c_prod_long__double(z, z);
+                }
+            }
+            if (a.imag == 0) {
+                if (a.real == 0) { /* zero base is returned unchanged */
+                    return a;
+                } else if ((b.imag == 0) && (a.real >= 0)) { /* positive real base with real exponent: plain powl */
+                    z.real = powl(a.real, b.real);
+                    z.imag = 0;
+                    return z;
+                } else if (a.real > 0) {
+                    r = a.real;
+                    theta = 0;
+                } else {
+                    r = -a.real;
+                    theta = atan2l(0.0, -1.0); /* angle of the negative real axis (pi) */
+                }
+            } else {
+                r = __Pyx_c_abs_long__double(a);
+                theta = atan2l(a.imag, a.real);
+            }
+            lnr = logl(r); /* general case: polar form with modulus r and angle theta */
+            z_r = expl(lnr * b.real - theta * b.imag);
+            z_theta = theta * b.real + lnr * b.imag;
+            z.real = z_r * cosl(z_theta);
+            z.imag = z_r * sinl(z_theta);
+            return z;
+        }
+    #endif
+#endif
+
+/* MemviewSliceCopyTemplate
+ * Copy the slice *from_mvs into a newly created array and return a slice
+ * over that array.
+ *
+ * The new array is created by __pyx_array_new from a tuple of the source
+ * shape, sizeof_dtype, the source buffer's format string and mode. It is
+ * then wrapped by __pyx_memoryview_new (using contig_flag and
+ * dtype_is_object) and filled by __pyx_memoryview_copy_contents.
+ *
+ * A source slice with any suboffset >= 0 among its first ndim axes is
+ * rejected with ValueError. On any failure the returned slice has memview
+ * and data set to NULL.
+ */
+  static __Pyx_memviewslice
+__pyx_memoryview_copy_new_contig(const __Pyx_memviewslice *from_mvs,
+                                 const char *mode, int ndim,
+                                 size_t sizeof_dtype, int contig_flag,
+                                 int dtype_is_object)
+{
+    __Pyx_RefNannyDeclarations
+    int i;
+    __Pyx_memviewslice new_mvs = { 0, 0, { 0 }, { 0 }, { 0 } };
+    struct __pyx_memoryview_obj *from_memview = from_mvs->memview;
+    Py_buffer *buf = &from_memview->view;
+    PyObject *shape_tuple = NULL;
+    PyObject *temp_int = NULL;
+    struct __pyx_array_obj *array_obj = NULL;
+    struct __pyx_memoryview_obj *memview_obj = NULL;
+    __Pyx_RefNannySetupContext("__pyx_memoryview_copy_new_contig", 0);
+    for (i = 0; i < ndim; i++) {
+        if (unlikely(from_mvs->suboffsets[i] >= 0)) {
+            PyErr_Format(PyExc_ValueError, "Cannot copy memoryview slice with "
+                                           "indirect dimensions (axis %d)", i);
+            goto fail;
+        }
+    }
+    shape_tuple = PyTuple_New(ndim);
+    if (unlikely(!shape_tuple)) {
+        goto fail;
+    }
+    __Pyx_GOTREF(shape_tuple);
+    for(i = 0; i < ndim; i++) {
+        temp_int = PyInt_FromSsize_t(from_mvs->shape[i]);
+        if(unlikely(!temp_int)) {
+            goto fail;
+        } else {
+            PyTuple_SET_ITEM(shape_tuple, i, temp_int);
+            temp_int = NULL; /* reset so the shared cleanup below does not release it again */
+        }
+    }
+    array_obj = __pyx_array_new(shape_tuple, sizeof_dtype, buf->format, (char *) mode, NULL);
+    if (unlikely(!array_obj)) {
+        goto fail;
+    }
+    __Pyx_GOTREF(array_obj);
+    memview_obj = (struct __pyx_memoryview_obj *) __pyx_memoryview_new(
+                                    (PyObject *) array_obj, contig_flag,
+                                    dtype_is_object,
+                                    from_mvs->memview->typeinfo);
+    if (unlikely(!memview_obj))
+        goto fail;
+    if (unlikely(__Pyx_init_memviewslice(memview_obj, ndim, &new_mvs, 1) < 0))
+        goto fail;
+    if (unlikely(__pyx_memoryview_copy_contents(*from_mvs, new_mvs, ndim, ndim,
+                                                dtype_is_object) < 0))
+        goto fail;
+    goto no_fail;
+fail:
+    __Pyx_XDECREF(new_mvs.memview);
+    new_mvs.memview = NULL;
+    new_mvs.data = NULL;
+no_fail: /* cleanup shared by the success and failure paths */
+    __Pyx_XDECREF(shape_tuple);
+    __Pyx_XDECREF(temp_int);
+    __Pyx_XDECREF(array_obj);
+    __Pyx_RefNannyFinishContext();
+    return new_mvs;
+}
+
+/* MemviewSliceInit
+ * Fill *memviewslice (shape, strides, suboffsets, data pointer and owning
+ * memview) from memview->view for the first ndim axes.
+ *
+ * Fails with ValueError if the slice already has a memview or data pointer.
+ * When the buffer carries no strides array, strides are derived from
+ * itemsize and shape, walking from the last axis to the first. When it
+ * carries no suboffsets array, every suboffset is set to -1.
+ *
+ * The memview's acquisition count is incremented. If the previous count
+ * was 0 and memview_is_new_reference is false, a Python reference is taken
+ * as well.
+ *
+ * Returns 0 on success and -1 on failure.
+ */
+  static int
+__Pyx_init_memviewslice(struct __pyx_memoryview_obj *memview,
+                        int ndim,
+                        __Pyx_memviewslice *memviewslice,
+                        int memview_is_new_reference)
+{
+    __Pyx_RefNannyDeclarations
+    int i, retval=-1;
+    Py_buffer *buf = &memview->view;
+    __Pyx_RefNannySetupContext("init_memviewslice", 0);
+    if (unlikely(memviewslice->memview || memviewslice->data)) {
+        PyErr_SetString(PyExc_ValueError,
+            "memviewslice is already initialized!");
+        goto fail;
+    }
+    if (buf->strides) {
+        for (i = 0; i < ndim; i++) {
+            memviewslice->strides[i] = buf->strides[i];
+        }
+    } else {
+        Py_ssize_t stride = buf->itemsize; /* the last axis gets a stride of one item */
+        for (i = ndim - 1; i >= 0; i--) {
+            memviewslice->strides[i] = stride;
+            stride *= buf->shape[i];
+        }
+    }
+    for (i = 0; i < ndim; i++) {
+        memviewslice->shape[i]   = buf->shape[i];
+        if (buf->suboffsets) {
+            memviewslice->suboffsets[i] = buf->suboffsets[i];
+        } else {
+            memviewslice->suboffsets[i] = -1;
+        }
+    }
+    memviewslice->memview = memview;
+    memviewslice->data = (char *)buf->buf;
+    if (__pyx_add_acquisition_count(memview) == 0 && !memview_is_new_reference) { /* first acquisition of a borrowed memview */
+        Py_INCREF(memview);
+    }
+    retval = 0;
+    goto no_fail;
+fail:
+    memviewslice->memview = 0;
+    memviewslice->data = 0;
+    retval = -1;
+no_fail:
+    __Pyx_RefNannyFinishContext();
+    return retval;
+}
+#ifndef Py_NO_RETURN
+#define Py_NO_RETURN
+#endif
+static void __pyx_fatalerror(const char *fmt, ...) Py_NO_RETURN { /* Format the
+     * printf-style message into a 200-byte stack buffer with vsnprintf and
+     * pass it to Py_FatalError. Py_NO_RETURN is defined as empty above when
+     * nothing else has defined it. */
+    va_list vargs;
+    char msg[200];
+#if PY_VERSION_HEX >= 0x030A0000 || defined(HAVE_STDARG_PROTOTYPES)
+    va_start(vargs, fmt);
+#else
+    va_start(vargs);
+#endif
+    vsnprintf(msg, 200, fmt, vargs);
+    va_end(vargs);
+    Py_FatalError(msg);
+}
+static CYTHON_INLINE int
+__pyx_add_acquisition_count_locked(__pyx_atomic_int_type *acquisition_count,
+                                   PyThread_type_lock lock)
+{   /* Post-increment *acquisition_count while holding lock and return the
+     * value it had before the increment. */
+    int result;
+    PyThread_acquire_lock(lock, 1);
+    result = (*acquisition_count)++;
+    PyThread_release_lock(lock);
+    return result;
+}
+static CYTHON_INLINE int
+__pyx_sub_acquisition_count_locked(__pyx_atomic_int_type *acquisition_count,
+                                   PyThread_type_lock lock)
+{   /* Post-decrement *acquisition_count while holding lock and return the
+     * value it had before the decrement. */
+    int result;
+    PyThread_acquire_lock(lock, 1);
+    result = (*acquisition_count)--;
+    PyThread_release_lock(lock);
+    return result;
+}
+static CYTHON_INLINE void
+__Pyx_INC_MEMVIEW(__Pyx_memviewslice *memslice, int have_gil, int lineno)
+{   /* Register one more acquisition of memslice->memview.
+     *
+     * Does nothing when the memview pointer is NULL or Py_None. When the
+     * acquisition count was 0 before the increment, a Python reference is
+     * taken too, acquiring the GIL first if have_gil is false. A negative
+     * previous count is reported through __pyx_fatalerror together with
+     * lineno. */
+    __pyx_nonatomic_int_type old_acquisition_count;
+    struct __pyx_memoryview_obj *memview = memslice->memview;
+    if (unlikely(!memview || (PyObject *) memview == Py_None)) {
+        return;
+    }
+    old_acquisition_count = __pyx_add_acquisition_count(memview);
+    if (unlikely(old_acquisition_count <= 0)) {
+        if (likely(old_acquisition_count == 0)) {
+            if (have_gil) {
+                Py_INCREF((PyObject *) memview);
+            } else {
+                PyGILState_STATE _gilstate = PyGILState_Ensure();
+                Py_INCREF((PyObject *) memview);
+                PyGILState_Release(_gilstate);
+            }
+        } else {
+            __pyx_fatalerror("Acquisition count is %d (line %d)",
+                             old_acquisition_count+1, lineno);
+        }
+    }
+}
+static CYTHON_INLINE void __Pyx_XCLEAR_MEMVIEW(__Pyx_memviewslice *memslice,
+                                             int have_gil, int lineno) { /* Release one
+     * acquisition of memslice->memview and clear the slice's pointers.
+     *
+     * A NULL or Py_None memview only has the pointer reset to NULL. Otherwise
+     * the acquisition count is decremented and data is set to NULL. If the
+     * previous count was above 1, only the memview pointer is cleared. If it
+     * was exactly 1, Py_CLEAR drops the Python reference, acquiring the GIL
+     * first if have_gil is false. Any other previous count is reported
+     * through __pyx_fatalerror together with lineno. */
+    __pyx_nonatomic_int_type old_acquisition_count;
+    struct __pyx_memoryview_obj *memview = memslice->memview;
+    if (unlikely(!memview || (PyObject *) memview == Py_None)) {
+        memslice->memview = NULL;
+        return;
+    }
+    old_acquisition_count = __pyx_sub_acquisition_count(memview);
+    memslice->data = NULL;
+    if (likely(old_acquisition_count > 1)) {
+        memslice->memview = NULL;
+    } else if (likely(old_acquisition_count == 1)) {
+        if (have_gil) {
+            Py_CLEAR(memslice->memview);
+        } else {
+            PyGILState_STATE _gilstate = PyGILState_Ensure();
+            Py_CLEAR(memslice->memview);
+            PyGILState_Release(_gilstate);
+        }
+    } else {
+        __pyx_fatalerror("Acquisition count is %d (line %d)",
+                         old_acquisition_count-1, lineno);
+    }
+}
+
+/* CIntToPy
+ * Convert an npy_int64 value to a Python integer object.
+ *
+ * Signedness is derived by comparing (npy_int64)-1 with 0. sizeof
+ * comparisons then pick a direct C-API constructor that can hold the value
+ * (long, unsigned long, or the long long variants when HAVE_LONG_LONG is
+ * defined). If none of those branches returns, the value is converted from
+ * its raw bytes, using one of three preprocessor-selected implementations.
+ *
+ * Returns the result of the chosen constructor. The from_bytes fallback
+ * returns NULL when any intermediate step fails.
+ */
+  static CYTHON_INLINE PyObject* __Pyx_PyInt_From_npy_int64(npy_int64 value) {
+#ifdef __Pyx_HAS_GCC_DIAGNOSTIC
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wconversion"
+#endif
+    const npy_int64 neg_one = (npy_int64) -1, const_zero = (npy_int64) 0;
+#ifdef __Pyx_HAS_GCC_DIAGNOSTIC
+#pragma GCC diagnostic pop
+#endif
+    const int is_unsigned = neg_one > const_zero; /* true only if -1 converted to the type compares above 0 */
+    if (is_unsigned) {
+        if (sizeof(npy_int64) < sizeof(long)) {
+            return PyInt_FromLong((long) value);
+        } else if (sizeof(npy_int64) <= sizeof(unsigned long)) {
+            return PyLong_FromUnsignedLong((unsigned long) value);
+#ifdef HAVE_LONG_LONG
+        } else if (sizeof(npy_int64) <= sizeof(unsigned PY_LONG_LONG)) {
+            return PyLong_FromUnsignedLongLong((unsigned PY_LONG_LONG) value);
+#endif
+        }
+    } else {
+        if (sizeof(npy_int64) <= sizeof(long)) {
+            return PyInt_FromLong((long) value);
+#ifdef HAVE_LONG_LONG
+        } else if (sizeof(npy_int64) <= sizeof(PY_LONG_LONG)) {
+            return PyLong_FromLongLong((PY_LONG_LONG) value);
+#endif
+        }
+    }
+    { /* fallback: build the integer from the raw bytes of value */
+        unsigned char *bytes = (unsigned char *)&value;
+#if !CYTHON_COMPILING_IN_LIMITED_API && PY_VERSION_HEX >= 0x030d00A4
+        if (is_unsigned) {
+            return PyLong_FromUnsignedNativeBytes(bytes, sizeof(value), -1);
+        } else {
+            return PyLong_FromNativeBytes(bytes, sizeof(value), -1);
+        }
+#elif !CYTHON_COMPILING_IN_LIMITED_API && PY_VERSION_HEX < 0x030d0000
+        int one = 1; int little = (int)*(unsigned char *)&one; /* byte-order probe: lowest-addressed byte of the int 1 */
+        return _PyLong_FromByteArray(bytes, sizeof(npy_int64),
+                                     little, !is_unsigned);
+#else
+        int one = 1; int little = (int)*(unsigned char *)&one; /* byte-order probe: lowest-addressed byte of the int 1 */
+        PyObject *from_bytes, *result = NULL;
+        PyObject *py_bytes = NULL, *arg_tuple = NULL, *kwds = NULL, *order_str = NULL;
+        from_bytes = PyObject_GetAttrString((PyObject*)&PyLong_Type, "from_bytes");
+        if (!from_bytes) return NULL;
+        py_bytes = PyBytes_FromStringAndSize((char*)bytes, sizeof(npy_int64));
+        if (!py_bytes) goto limited_bad;
+        order_str = PyUnicode_FromString(little ? "little" : "big");
+        if (!order_str) goto limited_bad;
+        arg_tuple = PyTuple_Pack(2, py_bytes, order_str);
+        if (!arg_tuple) goto limited_bad;
+        if (!is_unsigned) {
+            kwds = PyDict_New();
+            if (!kwds) goto limited_bad;
+            if (PyDict_SetItemString(kwds, "signed", __Pyx_NewRef(Py_True))) goto limited_bad; /* NOTE(review): how the __Pyx_NewRef reference is balanced is not visible here - confirm */
+        }
+        result = PyObject_Call(from_bytes, arg_tuple, kwds);
+        limited_bad: /* shared cleanup; also reached on success, with result set */
+        Py_XDECREF(kwds);
+        Py_XDECREF(arg_tuple);
+        Py_XDECREF(order_str);
+        Py_XDECREF(py_bytes);
+        Py_XDECREF(from_bytes);
+        return result;
+#endif
+    }
+}
+
+/* CIntFromPy
+ * __Pyx_PyInt_As_npy_int64: convert the Python object x to a C npy_int64.
+ * Objects that fail PyLong_Check are first coerced with
+ * __Pyx_PyNumber_IntOrLong and the function recurses on the result.
+ * The error paths visible here return (npy_int64) -1; the raise_overflow and
+ * raise_neg_overflow labels set OverflowError before doing so.
+ * NOTE(review): -1 is also a representable result for a signed type, so
+ * callers presumably have to check PyErr_Occurred() - confirm at call sites.
+ */
+  static CYTHON_INLINE npy_int64 __Pyx_PyInt_As_npy_int64(PyObject *x) {
+#ifdef __Pyx_HAS_GCC_DIAGNOSTIC
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wconversion"
+#endif
+    const npy_int64 neg_one = (npy_int64) -1, const_zero = (npy_int64) 0;
+#ifdef __Pyx_HAS_GCC_DIAGNOSTIC
+#pragma GCC diagnostic pop
+#endif
+    const int is_unsigned = neg_one > const_zero; /* nonzero only if casting -1 to the target type gives a value above zero, i.e. the type is unsigned */
+#if PY_MAJOR_VERSION < 3
+    if (likely(PyInt_Check(x))) { /* Python 2 only: plain int objects */
+        if ((sizeof(npy_int64) < sizeof(long))) {
+            __PYX_VERIFY_RETURN_INT(npy_int64, long, PyInt_AS_LONG(x))
+        } else {
+            long val = PyInt_AS_LONG(x);
+            if (is_unsigned && unlikely(val < 0)) {
+                goto raise_neg_overflow;
+            }
+            return (npy_int64) val;
+        }
+    }
+#endif
+    if (unlikely(!PyLong_Check(x))) { /* not an int object: coerce, then recurse on the coerced object */
+        npy_int64 val;
+        PyObject *tmp = __Pyx_PyNumber_IntOrLong(x);
+        if (!tmp) return (npy_int64) -1;
+        val = __Pyx_PyInt_As_npy_int64(tmp);
+        Py_DECREF(tmp); /* tmp is released once the recursive call has produced val */
+        return val;
+    }
+    if (is_unsigned) { /* unsigned target: negative inputs are rejected */
+#if CYTHON_USE_PYLONG_INTERNALS
+        if (unlikely(__Pyx_PyLong_IsNeg(x))) {
+            goto raise_neg_overflow;
+        } else if (__Pyx_PyLong_IsCompact(x)) {
+            __PYX_VERIFY_RETURN_INT(npy_int64, __Pyx_compact_upylong, __Pyx_PyLong_CompactValueUnsigned(x))
+        } else {
+            const digit* digits = __Pyx_PyLong_Digits(x);
+            assert(__Pyx_PyLong_DigitCount(x) > 1);
+            switch (__Pyx_PyLong_DigitCount(x)) { /* fast path: assemble the value from 2-4 internal digits when the sizeof checks allow it */
+                case 2:
+                    if ((8 * sizeof(npy_int64) > 1 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 2 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(npy_int64, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(npy_int64) >= 2 * PyLong_SHIFT)) {
+                            return (npy_int64) (((((npy_int64)digits[1]) << PyLong_SHIFT) | (npy_int64)digits[0]));
+                        }
+                    }
+                    break; /* not handled here: the generic C-API conversion after the switch takes over */
+                case 3:
+                    if ((8 * sizeof(npy_int64) > 2 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 3 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(npy_int64, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(npy_int64) >= 3 * PyLong_SHIFT)) {
+                            return (npy_int64) (((((((npy_int64)digits[2]) << PyLong_SHIFT) | (npy_int64)digits[1]) << PyLong_SHIFT) | (npy_int64)digits[0]));
+                        }
+                    }
+                    break;
+                case 4:
+                    if ((8 * sizeof(npy_int64) > 3 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 4 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(npy_int64, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(npy_int64) >= 4 * PyLong_SHIFT)) {
+                            return (npy_int64) (((((((((npy_int64)digits[3]) << PyLong_SHIFT) | (npy_int64)digits[2]) << PyLong_SHIFT) | (npy_int64)digits[1]) << PyLong_SHIFT) | (npy_int64)digits[0]));
+                        }
+                    }
+                    break;
+            }
+        }
+#endif
+#if CYTHON_COMPILING_IN_CPYTHON && PY_VERSION_HEX < 0x030C00A7
+        if (unlikely(Py_SIZE(x) < 0)) { /* sign test via the object's size field */
+            goto raise_neg_overflow;
+        }
+#else
+        {
+            int result = PyObject_RichCompareBool(x, Py_False, Py_LT); /* sign test through the comparison protocol: x < False */
+            if (unlikely(result < 0))
+                return (npy_int64) -1;
+            if (unlikely(result == 1))
+                goto raise_neg_overflow;
+        }
+#endif
+        if ((sizeof(npy_int64) <= sizeof(unsigned long))) {
+            __PYX_VERIFY_RETURN_INT_EXC(npy_int64, unsigned long, PyLong_AsUnsignedLong(x))
+#ifdef HAVE_LONG_LONG
+        } else if ((sizeof(npy_int64) <= sizeof(unsigned PY_LONG_LONG))) {
+            __PYX_VERIFY_RETURN_INT_EXC(npy_int64, unsigned PY_LONG_LONG, PyLong_AsUnsignedLongLong(x))
+#endif
+        }
+    } else { /* signed target */
+#if CYTHON_USE_PYLONG_INTERNALS
+        if (__Pyx_PyLong_IsCompact(x)) {
+            __PYX_VERIFY_RETURN_INT(npy_int64, __Pyx_compact_pylong, __Pyx_PyLong_CompactValue(x))
+        } else {
+            const digit* digits = __Pyx_PyLong_Digits(x);
+            assert(__Pyx_PyLong_DigitCount(x) > 1);
+            switch (__Pyx_PyLong_SignedDigitCount(x)) { /* negative case labels negate the assembled magnitude, positive ones use it as is */
+                case -2:
+                    if ((8 * sizeof(npy_int64) - 1 > 1 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 2 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(npy_int64, long, -(long) (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(npy_int64) - 1 > 2 * PyLong_SHIFT)) {
+                            return (npy_int64) (((npy_int64)-1)*(((((npy_int64)digits[1]) << PyLong_SHIFT) | (npy_int64)digits[0])));
+                        }
+                    }
+                    break;
+                case 2:
+                    if ((8 * sizeof(npy_int64) > 1 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 2 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(npy_int64, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(npy_int64) - 1 > 2 * PyLong_SHIFT)) {
+                            return (npy_int64) ((((((npy_int64)digits[1]) << PyLong_SHIFT) | (npy_int64)digits[0])));
+                        }
+                    }
+                    break;
+                case -3:
+                    if ((8 * sizeof(npy_int64) - 1 > 2 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 3 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(npy_int64, long, -(long) (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(npy_int64) - 1 > 3 * PyLong_SHIFT)) {
+                            return (npy_int64) (((npy_int64)-1)*(((((((npy_int64)digits[2]) << PyLong_SHIFT) | (npy_int64)digits[1]) << PyLong_SHIFT) | (npy_int64)digits[0])));
+                        }
+                    }
+                    break;
+                case 3:
+                    if ((8 * sizeof(npy_int64) > 2 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 3 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(npy_int64, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(npy_int64) - 1 > 3 * PyLong_SHIFT)) {
+                            return (npy_int64) ((((((((npy_int64)digits[2]) << PyLong_SHIFT) | (npy_int64)digits[1]) << PyLong_SHIFT) | (npy_int64)digits[0])));
+                        }
+                    }
+                    break;
+                case -4:
+                    if ((8 * sizeof(npy_int64) - 1 > 3 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 4 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(npy_int64, long, -(long) (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(npy_int64) - 1 > 4 * PyLong_SHIFT)) {
+                            return (npy_int64) (((npy_int64)-1)*(((((((((npy_int64)digits[3]) << PyLong_SHIFT) | (npy_int64)digits[2]) << PyLong_SHIFT) | (npy_int64)digits[1]) << PyLong_SHIFT) | (npy_int64)digits[0])));
+                        }
+                    }
+                    break;
+                case 4:
+                    if ((8 * sizeof(npy_int64) > 3 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 4 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(npy_int64, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(npy_int64) - 1 > 4 * PyLong_SHIFT)) {
+                            return (npy_int64) ((((((((((npy_int64)digits[3]) << PyLong_SHIFT) | (npy_int64)digits[2]) << PyLong_SHIFT) | (npy_int64)digits[1]) << PyLong_SHIFT) | (npy_int64)digits[0])));
+                        }
+                    }
+                    break;
+            }
+        }
+#endif
+        if ((sizeof(npy_int64) <= sizeof(long))) {
+            __PYX_VERIFY_RETURN_INT_EXC(npy_int64, long, PyLong_AsLong(x))
+#ifdef HAVE_LONG_LONG
+        } else if ((sizeof(npy_int64) <= sizeof(PY_LONG_LONG))) {
+            __PYX_VERIFY_RETURN_INT_EXC(npy_int64, PY_LONG_LONG, PyLong_AsLongLong(x))
+#endif
+        }
+    }
+    { /* generic fallback, reached when none of the paths above returned */
+        npy_int64 val;
+        int ret = -1; /* nonzero means failure; checked after the #endif below */
+#if PY_VERSION_HEX >= 0x030d00A6 && !CYTHON_COMPILING_IN_LIMITED_API
+        Py_ssize_t bytes_copied = PyLong_AsNativeBytes(
+            x, &val, sizeof(val), Py_ASNATIVEBYTES_NATIVE_ENDIAN | (is_unsigned ? Py_ASNATIVEBYTES_UNSIGNED_BUFFER | Py_ASNATIVEBYTES_REJECT_NEGATIVE : 0));
+        if (unlikely(bytes_copied == -1)) { /* call failed: ret stays -1 */
+        } else if (unlikely(bytes_copied > (Py_ssize_t) sizeof(val))) { /* more bytes reported than val can hold */
+            goto raise_overflow;
+        } else {
+            ret = 0;
+        }
+#elif PY_VERSION_HEX < 0x030d0000 && !(CYTHON_COMPILING_IN_PYPY || CYTHON_COMPILING_IN_LIMITED_API) || defined(_PyLong_AsByteArray)
+        int one = 1; int is_little = (int)*(unsigned char *)&one; /* runtime endianness probe: first byte of the int 1 */
+        unsigned char *bytes = (unsigned char *)&val;
+        ret = _PyLong_AsByteArray((PyLongObject *)x,
+                                    bytes, sizeof(val),
+                                    is_little, !is_unsigned);
+#else
+        PyObject *v;
+        PyObject *stepval = NULL, *mask = NULL, *shift = NULL;
+        int bits, remaining_bits, is_negative = 0;
+        int chunk_size = (sizeof(long) < 8) ? 30 : 62; /* number of bits extracted per loop iteration */
+        if (likely(PyLong_CheckExact(x))) {
+            v = __Pyx_NewRef(x);
+        } else {
+            v = PyNumber_Long(x);
+            if (unlikely(!v)) return (npy_int64) -1;
+            assert(PyLong_CheckExact(v));
+        }
+        {
+            int result = PyObject_RichCompareBool(v, Py_False, Py_LT);
+            if (unlikely(result < 0)) {
+                Py_DECREF(v);
+                return (npy_int64) -1;
+            }
+            is_negative = result == 1;
+        }
+        if (is_unsigned && unlikely(is_negative)) {
+            Py_DECREF(v);
+            goto raise_neg_overflow;
+        } else if (is_negative) {
+            stepval = PyNumber_Invert(v); /* work on ~v; undone by val = ~val further down */
+            Py_DECREF(v);
+            if (unlikely(!stepval))
+                return (npy_int64) -1;
+        } else {
+            stepval = v; /* stepval takes over the reference held by v */
+        }
+        v = NULL;
+        val = (npy_int64) 0;
+        mask = PyLong_FromLong((1L << chunk_size) - 1); if (unlikely(!mask)) goto done;
+        shift = PyLong_FromLong(chunk_size); if (unlikely(!shift)) goto done;
+        for (bits = 0; bits < (int) sizeof(npy_int64) * 8 - chunk_size; bits += chunk_size) { /* copy chunk_size-bit chunks until at most chunk_size bits of the target width remain */
+            PyObject *tmp, *digit;
+            long idigit;
+            digit = PyNumber_And(stepval, mask);
+            if (unlikely(!digit)) goto done;
+            idigit = PyLong_AsLong(digit);
+            Py_DECREF(digit);
+            if (unlikely(idigit < 0)) goto done;
+            val |= ((npy_int64) idigit) << bits;
+            tmp = PyNumber_Rshift(stepval, shift);
+            if (unlikely(!tmp)) goto done;
+            Py_DECREF(stepval); stepval = tmp;
+        }
+        Py_DECREF(shift); shift = NULL;
+        Py_DECREF(mask); mask = NULL;
+        {
+            long idigit = PyLong_AsLong(stepval);
+            if (unlikely(idigit < 0)) goto done;
+            remaining_bits = ((int) sizeof(npy_int64) * 8) - bits - (is_unsigned ? 0 : 1); /* bits still free in val; one fewer when the type is signed */
+            if (unlikely(idigit >= (1L << remaining_bits)))
+                goto raise_overflow; /* NOTE(review): this jump bypasses Py_XDECREF(stepval) under done: - looks like a leaked reference on this path; confirm */
+            val |= ((npy_int64) idigit) << bits;
+        }
+        if (!is_unsigned) {
+            if (unlikely(val & (((npy_int64) 1) << (sizeof(npy_int64) * 8 - 1))))
+                goto raise_overflow; /* NOTE(review): same as above, stepval is not released before this jump; confirm */
+            if (is_negative)
+                val = ~val;
+        }
+        ret = 0;
+    done: /* shared cleanup for this branch; ret is still -1 when reached through a goto */
+        Py_XDECREF(shift);
+        Py_XDECREF(mask);
+        Py_XDECREF(stepval);
+#endif
+        if (unlikely(ret))
+            return (npy_int64) -1;
+        return val;
+    }
+raise_overflow:
+    PyErr_SetString(PyExc_OverflowError,
+        "value too large to convert to npy_int64");
+    return (npy_int64) -1;
+raise_neg_overflow:
+    PyErr_SetString(PyExc_OverflowError,
+        "can't convert negative value to npy_int64");
+    return (npy_int64) -1;
+}
+
+/* CIntFromPy
+ * __Pyx_PyInt_As_int: convert the Python object x to a C int.
+ * Objects that fail PyLong_Check are first coerced with
+ * __Pyx_PyNumber_IntOrLong and the function recurses on the result.
+ * The error paths visible here return (int) -1; the raise_overflow and
+ * raise_neg_overflow labels set OverflowError before doing so.
+ * NOTE(review): -1 is also a representable result for a signed type, so
+ * callers presumably have to check PyErr_Occurred() - confirm at call sites.
+ */
+  static CYTHON_INLINE int __Pyx_PyInt_As_int(PyObject *x) {
+#ifdef __Pyx_HAS_GCC_DIAGNOSTIC
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wconversion"
+#endif
+    const int neg_one = (int) -1, const_zero = (int) 0;
+#ifdef __Pyx_HAS_GCC_DIAGNOSTIC
+#pragma GCC diagnostic pop
+#endif
+    const int is_unsigned = neg_one > const_zero; /* nonzero only if casting -1 to the target type gives a value above zero, i.e. the type is unsigned */
+#if PY_MAJOR_VERSION < 3
+    if (likely(PyInt_Check(x))) { /* Python 2 only: plain int objects */
+        if ((sizeof(int) < sizeof(long))) {
+            __PYX_VERIFY_RETURN_INT(int, long, PyInt_AS_LONG(x))
+        } else {
+            long val = PyInt_AS_LONG(x);
+            if (is_unsigned && unlikely(val < 0)) {
+                goto raise_neg_overflow;
+            }
+            return (int) val;
+        }
+    }
+#endif
+    if (unlikely(!PyLong_Check(x))) { /* not an int object: coerce, then recurse on the coerced object */
+        int val;
+        PyObject *tmp = __Pyx_PyNumber_IntOrLong(x);
+        if (!tmp) return (int) -1;
+        val = __Pyx_PyInt_As_int(tmp);
+        Py_DECREF(tmp); /* tmp is released once the recursive call has produced val */
+        return val;
+    }
+    if (is_unsigned) { /* unsigned target: negative inputs are rejected */
+#if CYTHON_USE_PYLONG_INTERNALS
+        if (unlikely(__Pyx_PyLong_IsNeg(x))) {
+            goto raise_neg_overflow;
+        } else if (__Pyx_PyLong_IsCompact(x)) {
+            __PYX_VERIFY_RETURN_INT(int, __Pyx_compact_upylong, __Pyx_PyLong_CompactValueUnsigned(x))
+        } else {
+            const digit* digits = __Pyx_PyLong_Digits(x);
+            assert(__Pyx_PyLong_DigitCount(x) > 1);
+            switch (__Pyx_PyLong_DigitCount(x)) { /* fast path: assemble the value from 2-4 internal digits when the sizeof checks allow it */
+                case 2:
+                    if ((8 * sizeof(int) > 1 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 2 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(int) >= 2 * PyLong_SHIFT)) {
+                            return (int) (((((int)digits[1]) << PyLong_SHIFT) | (int)digits[0]));
+                        }
+                    }
+                    break; /* not handled here: the generic C-API conversion after the switch takes over */
+                case 3:
+                    if ((8 * sizeof(int) > 2 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 3 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(int) >= 3 * PyLong_SHIFT)) {
+                            return (int) (((((((int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0]));
+                        }
+                    }
+                    break;
+                case 4:
+                    if ((8 * sizeof(int) > 3 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 4 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(int) >= 4 * PyLong_SHIFT)) {
+                            return (int) (((((((((int)digits[3]) << PyLong_SHIFT) | (int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0]));
+                        }
+                    }
+                    break;
+            }
+        }
+#endif
+#if CYTHON_COMPILING_IN_CPYTHON && PY_VERSION_HEX < 0x030C00A7
+        if (unlikely(Py_SIZE(x) < 0)) { /* sign test via the object's size field */
+            goto raise_neg_overflow;
+        }
+#else
+        {
+            int result = PyObject_RichCompareBool(x, Py_False, Py_LT); /* sign test through the comparison protocol: x < False */
+            if (unlikely(result < 0))
+                return (int) -1;
+            if (unlikely(result == 1))
+                goto raise_neg_overflow;
+        }
+#endif
+        if ((sizeof(int) <= sizeof(unsigned long))) {
+            __PYX_VERIFY_RETURN_INT_EXC(int, unsigned long, PyLong_AsUnsignedLong(x))
+#ifdef HAVE_LONG_LONG
+        } else if ((sizeof(int) <= sizeof(unsigned PY_LONG_LONG))) {
+            __PYX_VERIFY_RETURN_INT_EXC(int, unsigned PY_LONG_LONG, PyLong_AsUnsignedLongLong(x))
+#endif
+        }
+    } else { /* signed target */
+#if CYTHON_USE_PYLONG_INTERNALS
+        if (__Pyx_PyLong_IsCompact(x)) {
+            __PYX_VERIFY_RETURN_INT(int, __Pyx_compact_pylong, __Pyx_PyLong_CompactValue(x))
+        } else {
+            const digit* digits = __Pyx_PyLong_Digits(x);
+            assert(__Pyx_PyLong_DigitCount(x) > 1);
+            switch (__Pyx_PyLong_SignedDigitCount(x)) { /* negative case labels negate the assembled magnitude, positive ones use it as is */
+                case -2:
+                    if ((8 * sizeof(int) - 1 > 1 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 2 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(int, long, -(long) (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(int) - 1 > 2 * PyLong_SHIFT)) {
+                            return (int) (((int)-1)*(((((int)digits[1]) << PyLong_SHIFT) | (int)digits[0])));
+                        }
+                    }
+                    break;
+                case 2:
+                    if ((8 * sizeof(int) > 1 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 2 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(int) - 1 > 2 * PyLong_SHIFT)) {
+                            return (int) ((((((int)digits[1]) << PyLong_SHIFT) | (int)digits[0])));
+                        }
+                    }
+                    break;
+                case -3:
+                    if ((8 * sizeof(int) - 1 > 2 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 3 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(int, long, -(long) (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(int) - 1 > 3 * PyLong_SHIFT)) {
+                            return (int) (((int)-1)*(((((((int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0])));
+                        }
+                    }
+                    break;
+                case 3:
+                    if ((8 * sizeof(int) > 2 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 3 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(int) - 1 > 3 * PyLong_SHIFT)) {
+                            return (int) ((((((((int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0])));
+                        }
+                    }
+                    break;
+                case -4:
+                    if ((8 * sizeof(int) - 1 > 3 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 4 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(int, long, -(long) (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(int) - 1 > 4 * PyLong_SHIFT)) {
+                            return (int) (((int)-1)*(((((((((int)digits[3]) << PyLong_SHIFT) | (int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0])));
+                        }
+                    }
+                    break;
+                case 4:
+                    if ((8 * sizeof(int) > 3 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 4 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(int) - 1 > 4 * PyLong_SHIFT)) {
+                            return (int) ((((((((((int)digits[3]) << PyLong_SHIFT) | (int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0])));
+                        }
+                    }
+                    break;
+            }
+        }
+#endif
+        if ((sizeof(int) <= sizeof(long))) {
+            __PYX_VERIFY_RETURN_INT_EXC(int, long, PyLong_AsLong(x))
+#ifdef HAVE_LONG_LONG
+        } else if ((sizeof(int) <= sizeof(PY_LONG_LONG))) {
+            __PYX_VERIFY_RETURN_INT_EXC(int, PY_LONG_LONG, PyLong_AsLongLong(x))
+#endif
+        }
+    }
+    { /* generic fallback, reached when none of the paths above returned */
+        int val;
+        int ret = -1; /* nonzero means failure; checked after the #endif below */
+#if PY_VERSION_HEX >= 0x030d00A6 && !CYTHON_COMPILING_IN_LIMITED_API
+        Py_ssize_t bytes_copied = PyLong_AsNativeBytes(
+            x, &val, sizeof(val), Py_ASNATIVEBYTES_NATIVE_ENDIAN | (is_unsigned ? Py_ASNATIVEBYTES_UNSIGNED_BUFFER | Py_ASNATIVEBYTES_REJECT_NEGATIVE : 0));
+        if (unlikely(bytes_copied == -1)) { /* call failed: ret stays -1 */
+        } else if (unlikely(bytes_copied > (Py_ssize_t) sizeof(val))) { /* more bytes reported than val can hold */
+            goto raise_overflow;
+        } else {
+            ret = 0;
+        }
+#elif PY_VERSION_HEX < 0x030d0000 && !(CYTHON_COMPILING_IN_PYPY || CYTHON_COMPILING_IN_LIMITED_API) || defined(_PyLong_AsByteArray)
+        int one = 1; int is_little = (int)*(unsigned char *)&one; /* runtime endianness probe: first byte of the int 1 */
+        unsigned char *bytes = (unsigned char *)&val;
+        ret = _PyLong_AsByteArray((PyLongObject *)x,
+                                    bytes, sizeof(val),
+                                    is_little, !is_unsigned);
+#else
+        PyObject *v;
+        PyObject *stepval = NULL, *mask = NULL, *shift = NULL;
+        int bits, remaining_bits, is_negative = 0;
+        int chunk_size = (sizeof(long) < 8) ? 30 : 62; /* number of bits extracted per loop iteration */
+        if (likely(PyLong_CheckExact(x))) {
+            v = __Pyx_NewRef(x);
+        } else {
+            v = PyNumber_Long(x);
+            if (unlikely(!v)) return (int) -1;
+            assert(PyLong_CheckExact(v));
+        }
+        {
+            int result = PyObject_RichCompareBool(v, Py_False, Py_LT);
+            if (unlikely(result < 0)) {
+                Py_DECREF(v);
+                return (int) -1;
+            }
+            is_negative = result == 1;
+        }
+        if (is_unsigned && unlikely(is_negative)) {
+            Py_DECREF(v);
+            goto raise_neg_overflow;
+        } else if (is_negative) {
+            stepval = PyNumber_Invert(v); /* work on ~v; undone by val = ~val further down */
+            Py_DECREF(v);
+            if (unlikely(!stepval))
+                return (int) -1;
+        } else {
+            stepval = v; /* stepval takes over the reference held by v */
+        }
+        v = NULL;
+        val = (int) 0;
+        mask = PyLong_FromLong((1L << chunk_size) - 1); if (unlikely(!mask)) goto done;
+        shift = PyLong_FromLong(chunk_size); if (unlikely(!shift)) goto done;
+        for (bits = 0; bits < (int) sizeof(int) * 8 - chunk_size; bits += chunk_size) { /* copy chunk_size-bit chunks until at most chunk_size bits of the target width remain */
+            PyObject *tmp, *digit;
+            long idigit;
+            digit = PyNumber_And(stepval, mask);
+            if (unlikely(!digit)) goto done;
+            idigit = PyLong_AsLong(digit);
+            Py_DECREF(digit);
+            if (unlikely(idigit < 0)) goto done;
+            val |= ((int) idigit) << bits;
+            tmp = PyNumber_Rshift(stepval, shift);
+            if (unlikely(!tmp)) goto done;
+            Py_DECREF(stepval); stepval = tmp;
+        }
+        Py_DECREF(shift); shift = NULL;
+        Py_DECREF(mask); mask = NULL;
+        {
+            long idigit = PyLong_AsLong(stepval);
+            if (unlikely(idigit < 0)) goto done;
+            remaining_bits = ((int) sizeof(int) * 8) - bits - (is_unsigned ? 0 : 1); /* bits still free in val; one fewer when the type is signed */
+            if (unlikely(idigit >= (1L << remaining_bits)))
+                goto raise_overflow; /* NOTE(review): this jump bypasses Py_XDECREF(stepval) under done: - looks like a leaked reference on this path; confirm */
+            val |= ((int) idigit) << bits;
+        }
+        if (!is_unsigned) {
+            if (unlikely(val & (((int) 1) << (sizeof(int) * 8 - 1))))
+                goto raise_overflow; /* NOTE(review): same as above, stepval is not released before this jump; confirm */
+            if (is_negative)
+                val = ~val;
+        }
+        ret = 0;
+    done: /* shared cleanup for this branch; ret is still -1 when reached through a goto */
+        Py_XDECREF(shift);
+        Py_XDECREF(mask);
+        Py_XDECREF(stepval);
+#endif
+        if (unlikely(ret))
+            return (int) -1;
+        return val;
+    }
+raise_overflow:
+    PyErr_SetString(PyExc_OverflowError,
+        "value too large to convert to int");
+    return (int) -1;
+raise_neg_overflow:
+    PyErr_SetString(PyExc_OverflowError,
+        "can't convert negative value to int");
+    return (int) -1;
+}
+
+/* CIntFromPy */
+  static CYTHON_INLINE long __Pyx_PyInt_As_long(PyObject *x) {
+#ifdef __Pyx_HAS_GCC_DIAGNOSTIC
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wconversion"
+#endif
+    const long neg_one = (long) -1, const_zero = (long) 0;
+#ifdef __Pyx_HAS_GCC_DIAGNOSTIC
+#pragma GCC diagnostic pop
+#endif
+    const int is_unsigned = neg_one > const_zero;
+#if PY_MAJOR_VERSION < 3
+    if (likely(PyInt_Check(x))) {
+        if ((sizeof(long) < sizeof(long))) {
+            __PYX_VERIFY_RETURN_INT(long, long, PyInt_AS_LONG(x))
+        } else {
+            long val = PyInt_AS_LONG(x);
+            if (is_unsigned && unlikely(val < 0)) {
+                goto raise_neg_overflow;
+            }
+            return (long) val;
+        }
+    }
+#endif
+    if (unlikely(!PyLong_Check(x))) {
+        long val;
+        PyObject *tmp = __Pyx_PyNumber_IntOrLong(x);
+        if (!tmp) return (long) -1;
+        val = __Pyx_PyInt_As_long(tmp);
+        Py_DECREF(tmp);
+        return val;
+    }
+    if (is_unsigned) {
+#if CYTHON_USE_PYLONG_INTERNALS
+        if (unlikely(__Pyx_PyLong_IsNeg(x))) {
+            goto raise_neg_overflow;
+        } else if (__Pyx_PyLong_IsCompact(x)) {
+            __PYX_VERIFY_RETURN_INT(long, __Pyx_compact_upylong, __Pyx_PyLong_CompactValueUnsigned(x))
+        } else {
+            const digit* digits = __Pyx_PyLong_Digits(x);
+            assert(__Pyx_PyLong_DigitCount(x) > 1);
+            switch (__Pyx_PyLong_DigitCount(x)) {
+                case 2:
+                    if ((8 * sizeof(long) > 1 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 2 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(long, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(long) >= 2 * PyLong_SHIFT)) {
+                            return (long) (((((long)digits[1]) << PyLong_SHIFT) | (long)digits[0]));
+                        }
+                    }
+                    break;
+                case 3:
+                    if ((8 * sizeof(long) > 2 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 3 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(long, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(long) >= 3 * PyLong_SHIFT)) {
+                            return (long) (((((((long)digits[2]) << PyLong_SHIFT) | (long)digits[1]) << PyLong_SHIFT) | (long)digits[0]));
+                        }
+                    }
+                    break;
+                case 4:
+                    if ((8 * sizeof(long) > 3 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 4 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(long, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(long) >= 4 * PyLong_SHIFT)) {
+                            return (long) (((((((((long)digits[3]) << PyLong_SHIFT) | (long)digits[2]) << PyLong_SHIFT) | (long)digits[1]) << PyLong_SHIFT) | (long)digits[0]));
+                        }
+                    }
+                    break;
+            }
+        }
+#endif
+#if CYTHON_COMPILING_IN_CPYTHON && PY_VERSION_HEX < 0x030C00A7
+        if (unlikely(Py_SIZE(x) < 0)) {
+            goto raise_neg_overflow;
+        }
+#else
+        {
+            int result = PyObject_RichCompareBool(x, Py_False, Py_LT);
+            if (unlikely(result < 0))
+                return (long) -1;
+            if (unlikely(result == 1))
+                goto raise_neg_overflow;
+        }
+#endif
+        if ((sizeof(long) <= sizeof(unsigned long))) {
+            __PYX_VERIFY_RETURN_INT_EXC(long, unsigned long, PyLong_AsUnsignedLong(x))
+#ifdef HAVE_LONG_LONG
+        } else if ((sizeof(long) <= sizeof(unsigned PY_LONG_LONG))) {
+            __PYX_VERIFY_RETURN_INT_EXC(long, unsigned PY_LONG_LONG, PyLong_AsUnsignedLongLong(x))
+#endif
+        }
+    } else {
+#if CYTHON_USE_PYLONG_INTERNALS
+        if (__Pyx_PyLong_IsCompact(x)) {
+            __PYX_VERIFY_RETURN_INT(long, __Pyx_compact_pylong, __Pyx_PyLong_CompactValue(x))
+        } else {
+            const digit* digits = __Pyx_PyLong_Digits(x);
+            assert(__Pyx_PyLong_DigitCount(x) > 1);
+            switch (__Pyx_PyLong_SignedDigitCount(x)) {
+                case -2:
+                    if ((8 * sizeof(long) - 1 > 1 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 2 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(long, long, -(long) (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(long) - 1 > 2 * PyLong_SHIFT)) {
+                            return (long) (((long)-1)*(((((long)digits[1]) << PyLong_SHIFT) | (long)digits[0])));
+                        }
+                    }
+                    break;
+                case 2:
+                    if ((8 * sizeof(long) > 1 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 2 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(long, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(long) - 1 > 2 * PyLong_SHIFT)) {
+                            return (long) ((((((long)digits[1]) << PyLong_SHIFT) | (long)digits[0])));
+                        }
+                    }
+                    break;
+                case -3:
+                    if ((8 * sizeof(long) - 1 > 2 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 3 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(long, long, -(long) (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(long) - 1 > 3 * PyLong_SHIFT)) {
+                            return (long) (((long)-1)*(((((((long)digits[2]) << PyLong_SHIFT) | (long)digits[1]) << PyLong_SHIFT) | (long)digits[0])));
+                        }
+                    }
+                    break;
+                case 3:
+                    if ((8 * sizeof(long) > 2 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 3 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(long, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(long) - 1 > 3 * PyLong_SHIFT)) {
+                            return (long) ((((((((long)digits[2]) << PyLong_SHIFT) | (long)digits[1]) << PyLong_SHIFT) | (long)digits[0])));
+                        }
+                    }
+                    break;
+                case -4:
+                    if ((8 * sizeof(long) - 1 > 3 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 4 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(long, long, -(long) (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(long) - 1 > 4 * PyLong_SHIFT)) {
+                            return (long) (((long)-1)*(((((((((long)digits[3]) << PyLong_SHIFT) | (long)digits[2]) << PyLong_SHIFT) | (long)digits[1]) << PyLong_SHIFT) | (long)digits[0])));
+                        }
+                    }
+                    break;
+                case 4:
+                    if ((8 * sizeof(long) > 3 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 4 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(long, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(long) - 1 > 4 * PyLong_SHIFT)) {
+                            return (long) ((((((((((long)digits[3]) << PyLong_SHIFT) | (long)digits[2]) << PyLong_SHIFT) | (long)digits[1]) << PyLong_SHIFT) | (long)digits[0])));
+                        }
+                    }
+                    break;
+            }
+        }
+#endif
+        if ((sizeof(long) <= sizeof(long))) {
+            __PYX_VERIFY_RETURN_INT_EXC(long, long, PyLong_AsLong(x))
+#ifdef HAVE_LONG_LONG
+        } else if ((sizeof(long) <= sizeof(PY_LONG_LONG))) {
+            __PYX_VERIFY_RETURN_INT_EXC(long, PY_LONG_LONG, PyLong_AsLongLong(x))
+#endif
+        }
+    }
+    {
+        long val;
+        int ret = -1;
+#if PY_VERSION_HEX >= 0x030d00A6 && !CYTHON_COMPILING_IN_LIMITED_API
+        Py_ssize_t bytes_copied = PyLong_AsNativeBytes(
+            x, &val, sizeof(val), Py_ASNATIVEBYTES_NATIVE_ENDIAN | (is_unsigned ? Py_ASNATIVEBYTES_UNSIGNED_BUFFER | Py_ASNATIVEBYTES_REJECT_NEGATIVE : 0));
+        if (unlikely(bytes_copied == -1)) {
+        } else if (unlikely(bytes_copied > (Py_ssize_t) sizeof(val))) {
+            goto raise_overflow;
+        } else {
+            ret = 0;
+        }
+#elif PY_VERSION_HEX < 0x030d0000 && !(CYTHON_COMPILING_IN_PYPY || CYTHON_COMPILING_IN_LIMITED_API) || defined(_PyLong_AsByteArray)
+        int one = 1; int is_little = (int)*(unsigned char *)&one;
+        unsigned char *bytes = (unsigned char *)&val;
+        ret = _PyLong_AsByteArray((PyLongObject *)x,
+                                    bytes, sizeof(val),
+                                    is_little, !is_unsigned);
+#else
+        PyObject *v;
+        PyObject *stepval = NULL, *mask = NULL, *shift = NULL;
+        int bits, remaining_bits, is_negative = 0;
+        int chunk_size = (sizeof(long) < 8) ? 30 : 62;
+        if (likely(PyLong_CheckExact(x))) {
+            v = __Pyx_NewRef(x);
+        } else {
+            v = PyNumber_Long(x);
+            if (unlikely(!v)) return (long) -1;
+            assert(PyLong_CheckExact(v));
+        }
+        {
+            int result = PyObject_RichCompareBool(v, Py_False, Py_LT);
+            if (unlikely(result < 0)) {
+                Py_DECREF(v);
+                return (long) -1;
+            }
+            is_negative = result == 1;
+        }
+        if (is_unsigned && unlikely(is_negative)) {
+            Py_DECREF(v);
+            goto raise_neg_overflow;
+        } else if (is_negative) {
+            stepval = PyNumber_Invert(v);
+            Py_DECREF(v);
+            if (unlikely(!stepval))
+                return (long) -1;
+        } else {
+            stepval = v;
+        }
+        v = NULL;
+        val = (long) 0;
+        mask = PyLong_FromLong((1L << chunk_size) - 1); if (unlikely(!mask)) goto done;
+        shift = PyLong_FromLong(chunk_size); if (unlikely(!shift)) goto done;
+        for (bits = 0; bits < (int) sizeof(long) * 8 - chunk_size; bits += chunk_size) {
+            PyObject *tmp, *digit;
+            long idigit;
+            digit = PyNumber_And(stepval, mask);
+            if (unlikely(!digit)) goto done;
+            idigit = PyLong_AsLong(digit);
+            Py_DECREF(digit);
+            if (unlikely(idigit < 0)) goto done;
+            val |= ((long) idigit) << bits;
+            tmp = PyNumber_Rshift(stepval, shift);
+            if (unlikely(!tmp)) goto done;
+            Py_DECREF(stepval); stepval = tmp;
+        }
+        Py_DECREF(shift); shift = NULL;
+        Py_DECREF(mask); mask = NULL;
+        {
+            long idigit = PyLong_AsLong(stepval);
+            if (unlikely(idigit < 0)) goto done;
+            remaining_bits = ((int) sizeof(long) * 8) - bits - (is_unsigned ? 0 : 1);
+            if (unlikely(idigit >= (1L << remaining_bits)))
+                goto raise_overflow;
+            val |= ((long) idigit) << bits;
+        }
+        if (!is_unsigned) {
+            if (unlikely(val & (((long) 1) << (sizeof(long) * 8 - 1))))
+                goto raise_overflow;
+            if (is_negative)
+                val = ~val;
+        }
+        ret = 0;
+    done:
+        Py_XDECREF(shift);
+        Py_XDECREF(mask);
+        Py_XDECREF(stepval);
+#endif
+        if (unlikely(ret))
+            return (long) -1;
+        return val;
+    }
+raise_overflow:
+    PyErr_SetString(PyExc_OverflowError,
+        "value too large to convert to long");
+    return (long) -1;
+raise_neg_overflow:
+    PyErr_SetString(PyExc_OverflowError,
+        "can't convert negative value to long");
+    return (long) -1;
+}
+
+/* CIntToPy: build a Python int object from a C long; every path returns the result of a CPython constructor call (NULL on failure). */
+  static CYTHON_INLINE PyObject* __Pyx_PyInt_From_long(long value) {
+#ifdef __Pyx_HAS_GCC_DIAGNOSTIC
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wconversion"
+#endif
+    const long neg_one = (long) -1, const_zero = (long) 0;
+#ifdef __Pyx_HAS_GCC_DIAGNOSTIC
+#pragma GCC diagnostic pop
+#endif
+    const int is_unsigned = neg_one > const_zero;  /* true only when -1 converted to the source type compares greater than 0 */
+    if (is_unsigned) {
+        if (sizeof(long) < sizeof(long)) {
+            return PyInt_FromLong((long) value);
+        } else if (sizeof(long) <= sizeof(unsigned long)) {
+            return PyLong_FromUnsignedLong((unsigned long) value);
+#ifdef HAVE_LONG_LONG
+        } else if (sizeof(long) <= sizeof(unsigned PY_LONG_LONG)) {
+            return PyLong_FromUnsignedLongLong((unsigned PY_LONG_LONG) value);
+#endif
+        }
+    } else {
+        if (sizeof(long) <= sizeof(long)) {
+            return PyInt_FromLong((long) value);
+#ifdef HAVE_LONG_LONG
+        } else if (sizeof(long) <= sizeof(PY_LONG_LONG)) {
+            return PyLong_FromLongLong((PY_LONG_LONG) value);
+#endif
+        }
+    }
+    {
+        unsigned char *bytes = (unsigned char *)&value;  /* raw native-endian bytes of the value, used when no size test above returned */
+#if !CYTHON_COMPILING_IN_LIMITED_API && PY_VERSION_HEX >= 0x030d00A4
+        if (is_unsigned) {
+            return PyLong_FromUnsignedNativeBytes(bytes, sizeof(value), -1);
+        } else {
+            return PyLong_FromNativeBytes(bytes, sizeof(value), -1);
+        }
+#elif !CYTHON_COMPILING_IN_LIMITED_API && PY_VERSION_HEX < 0x030d0000
+        int one = 1; int little = (int)*(unsigned char *)&one;  /* runtime endianness probe: first byte of the int 1 */
+        return _PyLong_FromByteArray(bytes, sizeof(long),
+                                     little, !is_unsigned);
+#else
+        int one = 1; int little = (int)*(unsigned char *)&one;  /* runtime endianness probe: first byte of the int 1 */
+        PyObject *from_bytes, *result = NULL;
+        PyObject *py_bytes = NULL, *arg_tuple = NULL, *kwds = NULL, *order_str = NULL;
+        from_bytes = PyObject_GetAttrString((PyObject*)&PyLong_Type, "from_bytes");  /* remaining builds: call int.from_bytes(bytes, byteorder[, signed=True]) */
+        if (!from_bytes) return NULL;
+        py_bytes = PyBytes_FromStringAndSize((char*)bytes, sizeof(long));
+        if (!py_bytes) goto limited_bad;
+        order_str = PyUnicode_FromString(little ? "little" : "big");
+        if (!order_str) goto limited_bad;
+        arg_tuple = PyTuple_Pack(2, py_bytes, order_str);
+        if (!arg_tuple) goto limited_bad;
+        if (!is_unsigned) {
+            kwds = PyDict_New();
+            if (!kwds) goto limited_bad;
+            if (PyDict_SetItemString(kwds, "signed", Py_True)) goto limited_bad;  /* the dict takes its own reference; passing an extra new reference here leaked it */
+        }
+        result = PyObject_Call(from_bytes, arg_tuple, kwds);
+        limited_bad:  /* shared cleanup for success and failure; result is still NULL if any step above failed */
+        Py_XDECREF(kwds);
+        Py_XDECREF(arg_tuple);
+        Py_XDECREF(order_str);
+        Py_XDECREF(py_bytes);
+        Py_XDECREF(from_bytes);
+        return result;
+#endif
+    }
+}
+
+/* CIntToPy: build a Python int object from a C int; every path returns the result of a CPython constructor call (NULL on failure). */
+  static CYTHON_INLINE PyObject* __Pyx_PyInt_From_int(int value) {
+#ifdef __Pyx_HAS_GCC_DIAGNOSTIC
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wconversion"
+#endif
+    const int neg_one = (int) -1, const_zero = (int) 0;
+#ifdef __Pyx_HAS_GCC_DIAGNOSTIC
+#pragma GCC diagnostic pop
+#endif
+    const int is_unsigned = neg_one > const_zero;  /* true only when -1 converted to the source type compares greater than 0 */
+    if (is_unsigned) {
+        if (sizeof(int) < sizeof(long)) {
+            return PyInt_FromLong((long) value);
+        } else if (sizeof(int) <= sizeof(unsigned long)) {
+            return PyLong_FromUnsignedLong((unsigned long) value);
+#ifdef HAVE_LONG_LONG
+        } else if (sizeof(int) <= sizeof(unsigned PY_LONG_LONG)) {
+            return PyLong_FromUnsignedLongLong((unsigned PY_LONG_LONG) value);
+#endif
+        }
+    } else {
+        if (sizeof(int) <= sizeof(long)) {
+            return PyInt_FromLong((long) value);
+#ifdef HAVE_LONG_LONG
+        } else if (sizeof(int) <= sizeof(PY_LONG_LONG)) {
+            return PyLong_FromLongLong((PY_LONG_LONG) value);
+#endif
+        }
+    }
+    {
+        unsigned char *bytes = (unsigned char *)&value;  /* raw native-endian bytes of the value, used when no size test above returned */
+#if !CYTHON_COMPILING_IN_LIMITED_API && PY_VERSION_HEX >= 0x030d00A4
+        if (is_unsigned) {
+            return PyLong_FromUnsignedNativeBytes(bytes, sizeof(value), -1);
+        } else {
+            return PyLong_FromNativeBytes(bytes, sizeof(value), -1);
+        }
+#elif !CYTHON_COMPILING_IN_LIMITED_API && PY_VERSION_HEX < 0x030d0000
+        int one = 1; int little = (int)*(unsigned char *)&one;  /* runtime endianness probe: first byte of the int 1 */
+        return _PyLong_FromByteArray(bytes, sizeof(int),
+                                     little, !is_unsigned);
+#else
+        int one = 1; int little = (int)*(unsigned char *)&one;  /* runtime endianness probe: first byte of the int 1 */
+        PyObject *from_bytes, *result = NULL;
+        PyObject *py_bytes = NULL, *arg_tuple = NULL, *kwds = NULL, *order_str = NULL;
+        from_bytes = PyObject_GetAttrString((PyObject*)&PyLong_Type, "from_bytes");  /* remaining builds: call int.from_bytes(bytes, byteorder[, signed=True]) */
+        if (!from_bytes) return NULL;
+        py_bytes = PyBytes_FromStringAndSize((char*)bytes, sizeof(int));
+        if (!py_bytes) goto limited_bad;
+        order_str = PyUnicode_FromString(little ? "little" : "big");
+        if (!order_str) goto limited_bad;
+        arg_tuple = PyTuple_Pack(2, py_bytes, order_str);
+        if (!arg_tuple) goto limited_bad;
+        if (!is_unsigned) {
+            kwds = PyDict_New();
+            if (!kwds) goto limited_bad;
+            if (PyDict_SetItemString(kwds, "signed", Py_True)) goto limited_bad;  /* the dict takes its own reference; passing an extra new reference here leaked it */
+        }
+        result = PyObject_Call(from_bytes, arg_tuple, kwds);
+        limited_bad:  /* shared cleanup for success and failure; result is still NULL if any step above failed */
+        Py_XDECREF(kwds);
+        Py_XDECREF(arg_tuple);
+        Py_XDECREF(order_str);
+        Py_XDECREF(py_bytes);
+        Py_XDECREF(from_bytes);
+        return result;
+#endif
+    }
+}
+
+/* CIntFromPy: convert a Python object to a C char. Failure paths return (char) -1; the raise_* labels at the end set OverflowError. */
+  static CYTHON_INLINE char __Pyx_PyInt_As_char(PyObject *x) {
+#ifdef __Pyx_HAS_GCC_DIAGNOSTIC
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wconversion"
+#endif
+    const char neg_one = (char) -1, const_zero = (char) 0;
+#ifdef __Pyx_HAS_GCC_DIAGNOSTIC
+#pragma GCC diagnostic pop
+#endif
+    const int is_unsigned = neg_one > const_zero;  /* true only when (char) -1 compares greater than 0, i.e. plain char is unsigned here */
+#if PY_MAJOR_VERSION < 3
+    if (likely(PyInt_Check(x))) {
+        if ((sizeof(char) < sizeof(long))) {
+            __PYX_VERIFY_RETURN_INT(char, long, PyInt_AS_LONG(x))
+        } else {
+            long val = PyInt_AS_LONG(x);
+            if (is_unsigned && unlikely(val < 0)) {
+                goto raise_neg_overflow;
+            }
+            return (char) val;
+        }
+    }
+#endif
+    if (unlikely(!PyLong_Check(x))) {  /* not an int object: coerce with __Pyx_PyNumber_IntOrLong, then recurse on the result */
+        char val;
+        PyObject *tmp = __Pyx_PyNumber_IntOrLong(x);
+        if (!tmp) return (char) -1;
+        val = __Pyx_PyInt_As_char(tmp);
+        Py_DECREF(tmp);
+        return val;
+    }
+    if (is_unsigned) {
+#if CYTHON_USE_PYLONG_INTERNALS
+        if (unlikely(__Pyx_PyLong_IsNeg(x))) {
+            goto raise_neg_overflow;
+        } else if (__Pyx_PyLong_IsCompact(x)) {
+            __PYX_VERIFY_RETURN_INT(char, __Pyx_compact_upylong, __Pyx_PyLong_CompactValueUnsigned(x))
+        } else {
+            const digit* digits = __Pyx_PyLong_Digits(x);  /* assemble the value directly from the int's internal digits for 2..4 digit counts */
+            assert(__Pyx_PyLong_DigitCount(x) > 1);
+            switch (__Pyx_PyLong_DigitCount(x)) {
+                case 2:
+                    if ((8 * sizeof(char) > 1 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 2 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(char, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(char) >= 2 * PyLong_SHIFT)) {
+                            return (char) (((((char)digits[1]) << PyLong_SHIFT) | (char)digits[0]));
+                        }
+                    }
+                    break;
+                case 3:
+                    if ((8 * sizeof(char) > 2 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 3 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(char, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(char) >= 3 * PyLong_SHIFT)) {
+                            return (char) (((((((char)digits[2]) << PyLong_SHIFT) | (char)digits[1]) << PyLong_SHIFT) | (char)digits[0]));
+                        }
+                    }
+                    break;
+                case 4:
+                    if ((8 * sizeof(char) > 3 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 4 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(char, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(char) >= 4 * PyLong_SHIFT)) {
+                            return (char) (((((((((char)digits[3]) << PyLong_SHIFT) | (char)digits[2]) << PyLong_SHIFT) | (char)digits[1]) << PyLong_SHIFT) | (char)digits[0]));
+                        }
+                    }
+                    break;
+            }
+        }
+#endif
+#if CYTHON_COMPILING_IN_CPYTHON && PY_VERSION_HEX < 0x030C00A7
+        if (unlikely(Py_SIZE(x) < 0)) {  /* negative value cannot go into an unsigned target */
+            goto raise_neg_overflow;
+        }
+#else
+        {
+            int result = PyObject_RichCompareBool(x, Py_False, Py_LT);  /* sign test through the public API: x < False */
+            if (unlikely(result < 0))
+                return (char) -1;
+            if (unlikely(result == 1))
+                goto raise_neg_overflow;
+        }
+#endif
+        if ((sizeof(char) <= sizeof(unsigned long))) {
+            __PYX_VERIFY_RETURN_INT_EXC(char, unsigned long, PyLong_AsUnsignedLong(x))
+#ifdef HAVE_LONG_LONG
+        } else if ((sizeof(char) <= sizeof(unsigned PY_LONG_LONG))) {
+            __PYX_VERIFY_RETURN_INT_EXC(char, unsigned PY_LONG_LONG, PyLong_AsUnsignedLongLong(x))
+#endif
+        }
+    } else {
+#if CYTHON_USE_PYLONG_INTERNALS
+        if (__Pyx_PyLong_IsCompact(x)) {
+            __PYX_VERIFY_RETURN_INT(char, __Pyx_compact_pylong, __Pyx_PyLong_CompactValue(x))
+        } else {
+            const digit* digits = __Pyx_PyLong_Digits(x);  /* signed variant: the switch key carries the sign, so negative counts negate the assembled value */
+            assert(__Pyx_PyLong_DigitCount(x) > 1);
+            switch (__Pyx_PyLong_SignedDigitCount(x)) {
+                case -2:
+                    if ((8 * sizeof(char) - 1 > 1 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 2 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(char, long, -(long) (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(char) - 1 > 2 * PyLong_SHIFT)) {
+                            return (char) (((char)-1)*(((((char)digits[1]) << PyLong_SHIFT) | (char)digits[0])));
+                        }
+                    }
+                    break;
+                case 2:
+                    if ((8 * sizeof(char) > 1 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 2 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(char, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(char) - 1 > 2 * PyLong_SHIFT)) {
+                            return (char) ((((((char)digits[1]) << PyLong_SHIFT) | (char)digits[0])));
+                        }
+                    }
+                    break;
+                case -3:
+                    if ((8 * sizeof(char) - 1 > 2 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 3 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(char, long, -(long) (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(char) - 1 > 3 * PyLong_SHIFT)) {
+                            return (char) (((char)-1)*(((((((char)digits[2]) << PyLong_SHIFT) | (char)digits[1]) << PyLong_SHIFT) | (char)digits[0])));
+                        }
+                    }
+                    break;
+                case 3:
+                    if ((8 * sizeof(char) > 2 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 3 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(char, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(char) - 1 > 3 * PyLong_SHIFT)) {
+                            return (char) ((((((((char)digits[2]) << PyLong_SHIFT) | (char)digits[1]) << PyLong_SHIFT) | (char)digits[0])));
+                        }
+                    }
+                    break;
+                case -4:
+                    if ((8 * sizeof(char) - 1 > 3 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 4 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(char, long, -(long) (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(char) - 1 > 4 * PyLong_SHIFT)) {
+                            return (char) (((char)-1)*(((((((((char)digits[3]) << PyLong_SHIFT) | (char)digits[2]) << PyLong_SHIFT) | (char)digits[1]) << PyLong_SHIFT) | (char)digits[0])));
+                        }
+                    }
+                    break;
+                case 4:
+                    if ((8 * sizeof(char) > 3 * PyLong_SHIFT)) {
+                        if ((8 * sizeof(unsigned long) > 4 * PyLong_SHIFT)) {
+                            __PYX_VERIFY_RETURN_INT(char, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if ((8 * sizeof(char) - 1 > 4 * PyLong_SHIFT)) {
+                            return (char) ((((((((((char)digits[3]) << PyLong_SHIFT) | (char)digits[2]) << PyLong_SHIFT) | (char)digits[1]) << PyLong_SHIFT) | (char)digits[0])));
+                        }
+                    }
+                    break;
+            }
+        }
+#endif
+        if ((sizeof(char) <= sizeof(long))) {
+            __PYX_VERIFY_RETURN_INT_EXC(char, long, PyLong_AsLong(x))
+#ifdef HAVE_LONG_LONG
+        } else if ((sizeof(char) <= sizeof(PY_LONG_LONG))) {
+            __PYX_VERIFY_RETURN_INT_EXC(char, PY_LONG_LONG, PyLong_AsLongLong(x))
+#endif
+        }
+    }
+    {
+        char val;
+        int ret = -1;  /* stays non-zero unless one of the conversion paths below succeeds */
+#if PY_VERSION_HEX >= 0x030d00A6 && !CYTHON_COMPILING_IN_LIMITED_API
+        Py_ssize_t bytes_copied = PyLong_AsNativeBytes(
+            x, &val, sizeof(val), Py_ASNATIVEBYTES_NATIVE_ENDIAN | (is_unsigned ? Py_ASNATIVEBYTES_UNSIGNED_BUFFER | Py_ASNATIVEBYTES_REJECT_NEGATIVE : 0));
+        if (unlikely(bytes_copied == -1)) {  /* intentionally empty: ret stays -1, so (char) -1 is returned below */
+        } else if (unlikely(bytes_copied > (Py_ssize_t) sizeof(val))) {
+            goto raise_overflow;
+        } else {
+            ret = 0;
+        }
+#elif PY_VERSION_HEX < 0x030d0000 && !(CYTHON_COMPILING_IN_PYPY || CYTHON_COMPILING_IN_LIMITED_API) || defined(_PyLong_AsByteArray)
+        int one = 1; int is_little = (int)*(unsigned char *)&one;  /* runtime endianness probe: first byte of the int 1 */
+        unsigned char *bytes = (unsigned char *)&val;
+        ret = _PyLong_AsByteArray((PyLongObject *)x,
+                                    bytes, sizeof(val),
+                                    is_little, !is_unsigned);
+#else
+        PyObject *v;
+        PyObject *stepval = NULL, *mask = NULL, *shift = NULL;
+        int bits, remaining_bits, is_negative = 0;
+        int chunk_size = (sizeof(long) < 8) ? 30 : 62;  /* number of bits masked off and shifted out per loop iteration */
+        if (likely(PyLong_CheckExact(x))) {
+            v = __Pyx_NewRef(x);
+        } else {
+            v = PyNumber_Long(x);
+            if (unlikely(!v)) return (char) -1;
+            assert(PyLong_CheckExact(v));
+        }
+        {
+            int result = PyObject_RichCompareBool(v, Py_False, Py_LT);
+            if (unlikely(result < 0)) {
+                Py_DECREF(v);
+                return (char) -1;
+            }
+            is_negative = result == 1;
+        }
+        if (is_unsigned && unlikely(is_negative)) {
+            Py_DECREF(v);
+            goto raise_neg_overflow;
+        } else if (is_negative) {
+            stepval = PyNumber_Invert(v);  /* work on ~v for negatives; val is inverted back with ~val further down */
+            Py_DECREF(v);
+            if (unlikely(!stepval))
+                return (char) -1;
+        } else {
+            stepval = v;  /* ownership of the reference moves from v to stepval */
+        }
+        v = NULL;
+        val = (char) 0;
+        mask = PyLong_FromLong((1L << chunk_size) - 1); if (unlikely(!mask)) goto done;
+        shift = PyLong_FromLong(chunk_size); if (unlikely(!shift)) goto done;
+        for (bits = 0; bits < (int) sizeof(char) * 8 - chunk_size; bits += chunk_size) {
+            PyObject *tmp, *digit;
+            long idigit;
+            digit = PyNumber_And(stepval, mask);
+            if (unlikely(!digit)) goto done;
+            idigit = PyLong_AsLong(digit);
+            Py_DECREF(digit);
+            if (unlikely(idigit < 0)) goto done;
+            val |= ((char) idigit) << bits;
+            tmp = PyNumber_Rshift(stepval, shift);
+            if (unlikely(!tmp)) goto done;
+            Py_DECREF(stepval); stepval = tmp;
+        }
+        Py_DECREF(shift); shift = NULL;
+        Py_DECREF(mask); mask = NULL;
+        {
+            long idigit = PyLong_AsLong(stepval);
+            if (unlikely(idigit < 0)) goto done;
+            remaining_bits = ((int) sizeof(char) * 8) - bits - (is_unsigned ? 0 : 1);
+            if (unlikely(idigit >= (1L << remaining_bits)))
+                goto raise_overflow;  /* NOTE(review): this jump skips the done: cleanup, so stepval is not released on this path */
+            val |= ((char) idigit) << bits;
+        }
+        if (!is_unsigned) {
+            if (unlikely(val & (((char) 1) << (sizeof(char) * 8 - 1))))
+                goto raise_overflow;  /* NOTE(review): same as above, stepval is still held when jumping from here */
+            if (is_negative)
+                val = ~val;  /* undo the PyNumber_Invert applied before the extraction */
+        }
+        ret = 0;
+    done:  /* releases whatever temporaries are still held; ret tells success (0) from failure (-1) */
+        Py_XDECREF(shift);
+        Py_XDECREF(mask);
+        Py_XDECREF(stepval);
+#endif
+        if (unlikely(ret))
+            return (char) -1;
+        return val;
+    }
+raise_overflow:
+    PyErr_SetString(PyExc_OverflowError,
+        "value too large to convert to char");
+    return (char) -1;
+raise_neg_overflow:
+    PyErr_SetString(PyExc_OverflowError,
+        "can't convert negative value to char");
+    return (char) -1;
+}
+
+/* FormatTypeName: Limited-API helper that looks up the attribute named by __pyx_n_s_name_2 on a type and returns it as a new reference, or a new reference to __pyx_n_s__35 when the lookup fails or is not a str. */
+  #if CYTHON_COMPILING_IN_LIMITED_API
+static __Pyx_TypeName
+__Pyx_PyType_GetName(PyTypeObject* tp)
+{
+    PyObject *type_name;
+    type_name = __Pyx_PyObject_GetAttrStr((PyObject *)tp, __pyx_n_s_name_2);
+    if (likely(type_name != NULL) && likely(PyUnicode_Check(type_name)))
+        return type_name;
+    /* Lookup failed or produced a non-str: clear any pending error, drop the unusable result, and hand back the fallback string. */
+    PyErr_Clear();
+    Py_XDECREF(type_name);
+    return __Pyx_NewRef(__pyx_n_s__35);
+}
+#endif
+
+/* CheckBinaryVersion: return the running interpreter's version packed one byte per dotted numeric component, most significant byte first. */
+  static unsigned long __Pyx_get_runtime_version(void) {
+#if __PYX_LIMITED_VERSION_HEX >= 0x030B00A4
+    return Py_Version & ~0xFFUL;  /* runtime-provided packed version with its low byte cleared */
+#else
+    /* No Py_Version on this target: parse the leading dotted number of Py_GetVersion() by hand. */
+    const char *cursor = Py_GetVersion();
+    unsigned long packed = 0;
+    unsigned long weight;
+    /* weight is the place value of the current component; it shifts down to 0 after four components, which ends the loop. */
+    for (weight = 0x01000000UL; weight != 0; weight >>= 8) {
+        unsigned int component = 0;
+        for (; *cursor >= '0' && *cursor <= '9'; ++cursor) {
+            component = component * 10 + (unsigned int) (*cursor - '0');
+        }
+        packed += weight * component;
+        if (*cursor != '.') {
+            /* Anything other than a dot ends the numeric prefix. */
+            break;
+        }
+        ++cursor;  /* step over the dot to the next component */
+    }
+    return packed;
+#endif
+}
+/* Compare compile-time and runtime versions on their upper 16 bits (major.minor): 0 when equal, 1 when allow_newer is set and the runtime is newer, otherwise the return value of PyErr_WarnEx(). */
+static int __Pyx_check_binary_version(unsigned long ct_version, unsigned long rt_version, int allow_newer) {
+    const unsigned long ct_major_minor = ct_version & 0xFFFF0000UL;
+    const unsigned long rt_major_minor = rt_version & 0xFFFF0000UL;
+    const char *relation = (allow_newer) ? "was newer than" : "does not match";  /* wording used only in the warning below */
+    char message[200];
+    if (rt_major_minor == ct_major_minor)
+        return 0;
+    if (likely(allow_newer && rt_major_minor > ct_major_minor))
+        return 1;
+    PyOS_snprintf(message, sizeof(message),
+                  "compile time Python version %d.%d "
+                  "of module '%.100s' "
+                  "%s "
+                  "runtime version %d.%d",
+                   (int) (ct_version >> 24), (int) ((ct_version >> 16) & 0xFF),
+                   __Pyx_MODULE_NAME,
+                   relation,
+                   (int) (rt_version >> 24), (int) ((rt_version >> 16) & 0xFF));
+    return PyErr_WarnEx(NULL, message, 1);
+}
+
+/* InitStrings: create the Python object for one string-table entry, store it in *str and hash it once; returns 0 on success, -1 if creation or hashing fails. */
+  #if PY_MAJOR_VERSION >= 3
+static int __Pyx_InitString(__Pyx_StringTabEntry t, PyObject **str) {
+    PyObject *created;
+    if (!(t.is_unicode | t.is_str)) {
+        created = PyBytes_FromStringAndSize(t.s, t.n - 1);  /* neither flag set: plain bytes object */
+    } else if (t.intern) {
+        created = PyUnicode_InternFromString(t.s);
+    } else if (t.encoding) {
+        created = PyUnicode_Decode(t.s, t.n - 1, t.encoding, NULL);
+    } else {
+        created = PyUnicode_FromStringAndSize(t.s, t.n - 1);
+    }
+    *str = created;  /* the slot is written even on failure, exactly as each branch assigned it before */
+    if (!created)
+        return -1;
+    if (PyObject_Hash(created) == -1)
+        return -1;
+    return 0;
+}
+#endif
+static int __Pyx_InitStrings(__Pyx_StringTabEntry *t) {  /* populate every string-table slot; returns 0 on success, -1 on the first failure */
+    while (t->p) {  /* the table ends at the first entry whose target pointer is NULL */
+        #if PY_MAJOR_VERSION >= 3
+        if (unlikely(__Pyx_InitString(*t, t->p) < 0)) return -1;  /* propagate creation/hash failures; the result used to be ignored, leaving a NULL slot and a pending exception */
+        #else
+        if (t->is_unicode) {
+            *t->p = PyUnicode_DecodeUTF8(t->s, t->n - 1, NULL);
+        } else if (t->intern) {
+            *t->p = PyString_InternFromString(t->s);
+        } else {
+            *t->p = PyString_FromStringAndSize(t->s, t->n - 1);
+        }
+        if (!*t->p)
+            return -1;
+        if (PyObject_Hash(*t->p) == -1)  /* hash eagerly so a hashing error surfaces here */
+            return -1;
+        #endif
+        ++t;
+    }
+    return 0;
+}
+
+#include <string.h>
+static CYTHON_INLINE Py_ssize_t __Pyx_ssize_strlen(const char *s) {
+    const size_t byte_count = strlen(s);  /* strlen() as a Py_ssize_t, or -1 with OverflowError set when it does not fit */
+    if (likely(byte_count <= (size_t) PY_SSIZE_T_MAX))
+        return (Py_ssize_t) byte_count;
+    /* The length cannot be represented as a Py_ssize_t. */
+    PyErr_SetString(PyExc_OverflowError, "byte string is too long");
+    return -1;
+}
+static CYTHON_INLINE PyObject* __Pyx_PyUnicode_FromString(const char* c_str) {
+    /* Measure the NUL-terminated string first; a negative length means the helper already reported an error, so return NULL. */
+    const Py_ssize_t n_bytes = __Pyx_ssize_strlen(c_str);
+    return likely(n_bytes >= 0) ? __Pyx_PyUnicode_FromStringAndSize(c_str, n_bytes) : NULL;
+}
+static CYTHON_INLINE PyObject* __Pyx_PyByteArray_FromString(const char* c_str) {
+    /* Build a bytearray from a NUL-terminated C string; NULL is returned when the length helper reports failure. */
+    const Py_ssize_t size = __Pyx_ssize_strlen(c_str);
+    return unlikely(size < 0) ? NULL : PyByteArray_FromStringAndSize(c_str, size);
+}
+static CYTHON_INLINE const char* __Pyx_PyObject_AsString(PyObject* o) {
+    Py_ssize_t unused_length;  /* out-parameter required by the sized variant; the length itself is discarded */
+    return __Pyx_PyObject_AsStringAndSize(o, &unused_length);
+}
+#if __PYX_DEFAULT_STRING_ENCODING_IS_ASCII || __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT
+#if !CYTHON_PEP393_ENABLED
+static const char* __Pyx_PyUnicode_AsStringAndSize(PyObject* o, Py_ssize_t *length) {  /* variant for builds without PEP 393: go through the default-encoded bytes object */
+    char* defenc_c;
+    PyObject* defenc = _PyUnicode_AsDefaultEncodedString(o, NULL);
+    if (!defenc) return NULL;
+    defenc_c = PyBytes_AS_STRING(defenc);
+#if __PYX_DEFAULT_STRING_ENCODING_IS_ASCII
+    {
+        char* end = defenc_c + PyBytes_GET_SIZE(defenc);
+        char* c;
+        for (c = defenc_c; c < end; c++) {  /* scan for any byte outside the 7-bit range */
+            if ((unsigned char) (*c) >= 128) {
+                PyUnicode_AsASCIIString(o);  /* result is discarded; presumably called only so the ASCII encode error gets raised */
+                return NULL;
+            }
+        }
+    }
+#endif
+    *length = PyBytes_GET_SIZE(defenc);
+    return defenc_c;  /* points into defenc's buffer, which is not released here */
+}
+#else
+static CYTHON_INLINE const char* __Pyx_PyUnicode_AsStringAndSize(PyObject* o, Py_ssize_t *length) {  /* PEP 393 variant: hand out the str object's UTF-8 buffer */
+    if (unlikely(__Pyx_PyUnicode_READY(o) == -1)) return NULL;
+#if __PYX_DEFAULT_STRING_ENCODING_IS_ASCII
+    if (likely(PyUnicode_IS_ASCII(o))) {
+        *length = PyUnicode_GET_LENGTH(o);  /* for an all-ASCII str the character count is used as the byte length */
+        return PyUnicode_AsUTF8(o);
+    } else {
+        PyUnicode_AsASCIIString(o);  /* result is discarded; presumably called only so the ASCII encode error gets raised */
+        return NULL;
+    }
+#else
+    return PyUnicode_AsUTF8AndSize(o, length);
+#endif
+}
+#endif
+#endif
+static CYTHON_INLINE const char* __Pyx_PyObject_AsStringAndSize(PyObject* o, Py_ssize_t *length) {  /* return a char buffer for o and store its size in *length; NULL on failure */
+#if __PYX_DEFAULT_STRING_ENCODING_IS_ASCII || __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT
+    if (
+#if PY_MAJOR_VERSION < 3 && __PYX_DEFAULT_STRING_ENCODING_IS_ASCII
+            __Pyx_sys_getdefaultencoding_not_ascii &&
+#endif
+            PyUnicode_Check(o)) {  /* str input is only handled when one of the default-encoding options above is enabled */
+        return __Pyx_PyUnicode_AsStringAndSize(o, length);
+    } else
+#endif
+#if (!CYTHON_COMPILING_IN_PYPY && !CYTHON_COMPILING_IN_LIMITED_API) || (defined(PyByteArray_AS_STRING) && defined(PyByteArray_GET_SIZE))
+    if (PyByteArray_Check(o)) {  /* bytearray: expose its internal buffer directly */
+        *length = PyByteArray_GET_SIZE(o);
+        return PyByteArray_AS_STRING(o);
+    } else
+#endif
+    {
+        char* result;
+        int r = PyBytes_AsStringAndSize(o, &result, length);  /* final fallback: everything else goes through the bytes API */
+        if (unlikely(r < 0)) {
+            return NULL;
+        } else {
+            return result;
+        }
+    }
+}
+static CYTHON_INLINE int __Pyx_PyObject_IsTrue(PyObject* x) {  /* truth test with a shortcut for True / False / None */
+   int is_true = x == Py_True;
+   if (is_true | (x == Py_False) | (x == Py_None)) return is_true;  /* singleton: answered by pointer comparison alone */
+   else return PyObject_IsTrue(x);  /* general objects go through the full protocol */
+}
+static CYTHON_INLINE int __Pyx_PyObject_IsTrueAndDecref(PyObject* x) {  /* truth test that also drops one reference to x */
+    int retval;
+    if (unlikely(!x)) return -1;  /* NULL input is passed through as the error value */
+    retval = __Pyx_PyObject_IsTrue(x);
+    Py_DECREF(x);
+    return retval;
+}
+static PyObject* __Pyx_PyNumber_IntOrLongWrongResultType(PyObject* result, const char* type_name) {  /* handles a conversion result of unexpected type */
+    __Pyx_TypeName result_type_name = __Pyx_PyType_GetName(Py_TYPE(result));
+#if PY_MAJOR_VERSION >= 3
+    if (PyLong_Check(result)) {  /* an int subclass: only warn, still return it */
+        if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
+                "__int__ returned non-int (type " __Pyx_FMT_TYPENAME ").  "
+                "The ability to return an instance of a strict subclass of int is deprecated, "
+                "and may be removed in a future version of Python.",
+                result_type_name)) {
+            __Pyx_DECREF_TypeName(result_type_name);
+            Py_DECREF(result);  /* warning was turned into an error: drop the result */
+            return NULL;
+        }
+        __Pyx_DECREF_TypeName(result_type_name);
+        return result;
+    }
+#endif
+    PyErr_Format(PyExc_TypeError,
+                 "__%.4s__ returned non-%.4s (type " __Pyx_FMT_TYPENAME ")",
+                 type_name, type_name, result_type_name);
+    __Pyx_DECREF_TypeName(result_type_name);
+    Py_DECREF(result);  /* result is consumed on the error path as well */
+    return NULL;
+}
+static CYTHON_INLINE PyObject* __Pyx_PyNumber_IntOrLong(PyObject* x) {  /* returns a new reference to x as an integer object, or NULL with an error set */
+#if CYTHON_USE_TYPE_SLOTS
+  PyNumberMethods *m;
+#endif
+  const char *name = NULL;
+  PyObject *res = NULL;
+#if PY_MAJOR_VERSION < 3
+  if (likely(PyInt_Check(x) || PyLong_Check(x)))
+#else
+  if (likely(PyLong_Check(x)))
+#endif
+    return __Pyx_NewRef(x);  /* already an integer: no conversion needed */
+#if CYTHON_USE_TYPE_SLOTS
+  m = Py_TYPE(x)->tp_as_number;
+  #if PY_MAJOR_VERSION < 3
+  if (m && m->nb_int) {
+    name = "int";
+    res = m->nb_int(x);
+  }
+  else if (m && m->nb_long) {
+    name = "long";
+    res = m->nb_long(x);
+  }
+  #else
+  if (likely(m && m->nb_int)) {
+    name = "int";
+    res = m->nb_int(x);
+  }
+  #endif
+#else
+  if (!PyBytes_CheckExact(x) && !PyUnicode_CheckExact(x)) {  /* exact bytes/str are never converted here */
+    res = PyNumber_Int(x);
+  }
+#endif
+  if (likely(res)) {
+#if PY_MAJOR_VERSION < 3
+    if (unlikely(!PyInt_Check(res) && !PyLong_Check(res))) {
+#else
+    if (unlikely(!PyLong_CheckExact(res))) {
+#endif
+        return __Pyx_PyNumber_IntOrLongWrongResultType(res, name);
+    }
+  }
+  else if (!PyErr_Occurred()) {  /* no conversion was attempted: raise the generic TypeError */
+    PyErr_SetString(PyExc_TypeError,
+                    "an integer is required");
+  }
+  return res;
+}
+static CYTHON_INLINE Py_ssize_t __Pyx_PyIndex_AsSsize_t(PyObject* b) {  /* converts an index-like object to Py_ssize_t */
+  Py_ssize_t ival;
+  PyObject *x;
+#if PY_MAJOR_VERSION < 3
+  if (likely(PyInt_CheckExact(b))) {
+    if (sizeof(Py_ssize_t) >= sizeof(long))
+        return PyInt_AS_LONG(b);
+    else
+        return PyInt_AsSsize_t(b);
+  }
+#endif
+  if (likely(PyLong_CheckExact(b))) {
+    #if CYTHON_USE_PYLONG_INTERNALS
+    if (likely(__Pyx_PyLong_IsCompact(b))) {
+        return __Pyx_PyLong_CompactValue(b);
+    } else {
+      const digit* digits = __Pyx_PyLong_Digits(b);
+      const Py_ssize_t size = __Pyx_PyLong_SignedDigitCount(b);
+      switch (size) {  /* each case assembles the value from digits when it fits in Py_ssize_t; otherwise falls through to PyLong_AsSsize_t */
+         case 2:
+           if (8 * sizeof(Py_ssize_t) > 2 * PyLong_SHIFT) {
+             return (Py_ssize_t) (((((size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0]));
+           }
+           break;
+         case -2:
+           if (8 * sizeof(Py_ssize_t) > 2 * PyLong_SHIFT) {
+             return -(Py_ssize_t) (((((size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0]));
+           }
+           break;
+         case 3:
+           if (8 * sizeof(Py_ssize_t) > 3 * PyLong_SHIFT) {
+             return (Py_ssize_t) (((((((size_t)digits[2]) << PyLong_SHIFT) | (size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0]));
+           }
+           break;
+         case -3:
+           if (8 * sizeof(Py_ssize_t) > 3 * PyLong_SHIFT) {
+             return -(Py_ssize_t) (((((((size_t)digits[2]) << PyLong_SHIFT) | (size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0]));
+           }
+           break;
+         case 4:
+           if (8 * sizeof(Py_ssize_t) > 4 * PyLong_SHIFT) {
+             return (Py_ssize_t) (((((((((size_t)digits[3]) << PyLong_SHIFT) | (size_t)digits[2]) << PyLong_SHIFT) | (size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0]));
+           }
+           break;
+         case -4:
+           if (8 * sizeof(Py_ssize_t) > 4 * PyLong_SHIFT) {
+             return -(Py_ssize_t) (((((((((size_t)digits[3]) << PyLong_SHIFT) | (size_t)digits[2]) << PyLong_SHIFT) | (size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0]));
+           }
+           break;
+      }
+    }
+    #endif
+    return PyLong_AsSsize_t(b);
+  }
+  x = PyNumber_Index(b);  /* generic path for objects that are not exact ints */
+  if (!x) return -1;
+  ival = PyInt_AsSsize_t(x);
+  Py_DECREF(x);
+  return ival;
+}
+static CYTHON_INLINE Py_hash_t __Pyx_PyIndex_AsHash_t(PyObject* o) {  /* converts an index-like object to Py_hash_t */
+  if (sizeof(Py_hash_t) == sizeof(Py_ssize_t)) {  /* same width: reuse the Py_ssize_t conversion */
+    return (Py_hash_t) __Pyx_PyIndex_AsSsize_t(o);
+#if PY_MAJOR_VERSION < 3
+  } else if (likely(PyInt_CheckExact(o))) {
+    return PyInt_AS_LONG(o);
+#endif
+  } else {
+    Py_ssize_t ival;
+    PyObject *x;
+    x = PyNumber_Index(o);
+    if (!x) return -1;
+    ival = PyInt_AsLong(x);
+    Py_DECREF(x);
+    return ival;
+  }
+}
+static CYTHON_INLINE PyObject * __Pyx_PyBool_FromLong(long b) {  /* maps non-zero to Py_True, zero to Py_False */
+  return b ? __Pyx_NewRef(Py_True) : __Pyx_NewRef(Py_False);
+}
+static CYTHON_INLINE PyObject * __Pyx_PyInt_FromSize_t(size_t ival) {  /* thin wrapper around PyInt_FromSize_t */
+    return PyInt_FromSize_t(ival);
+}
+
+
+/* #### Code section: utility_code_pragmas_end ### */
+#ifdef _MSC_VER
+#pragma warning( pop )
+#endif
+
+
+
+/* #### Code section: end ### */
+#endif /* Py_PYTHON_H */
diff --git a/fairseq/data/token_block_utils_fast.cpython-310-x86_64-linux-gnu.so b/fairseq/data/token_block_utils_fast.cpython-310-x86_64-linux-gnu.so
new file mode 100644
index 0000000000000000000000000000000000000000..2b4f9ea5610821a608fdb09e64c4618d3d88e1cf
--- /dev/null
+++ b/fairseq/data/token_block_utils_fast.cpython-310-x86_64-linux-gnu.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:105d14894186afdca051690cd766de67eb6156fa362d0356b978a58d830a33c0
+size 287008
diff --git a/fairseq/data/token_block_utils_fast.pyx b/fairseq/data/token_block_utils_fast.pyx
new file mode 100644
index 0000000000000000000000000000000000000000..5563b973e9da2b9560e7660a370f9ef30a1d7ce6
--- /dev/null
+++ b/fairseq/data/token_block_utils_fast.pyx
@@ -0,0 +1,185 @@
+# cython: language_level=3
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+import torch
+from itertools import chain
+from libc.math cimport ceil
+
+cimport cython
+cimport numpy as np
+
+DTYPE = np.int64
+ctypedef np.int64_t DTYPE_t
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.nonecheck(False)
+cdef np.ndarray[DTYPE_t, ndim=2] _get_slice_indices_none_mode(np.ndarray[DTYPE_t, ndim=1] sizes, int block_size):
+    cdef DTYPE_t total_size = sizes.sum()  # sum of all entries of sizes
+    cdef DTYPE_t length = <DTYPE_t> ceil(total_size / <double> block_size)  # number of blocks; the last one may be shorter
+    cdef np.ndarray[DTYPE_t, ndim=2] slice_indices = np.zeros([length, 2], dtype=DTYPE)  # one (start, end) row per block
+    cdef DTYPE_t[:, :] slice_indices_view = slice_indices  # typed view used for the writes below
+    cdef DTYPE_t i
+    cdef DTYPE_t start
+    cdef DTYPE_t end
+    for i in range(length):
+        start = i * block_size
+        end = min(start + block_size, total_size)  # clamp the final block to total_size
+        slice_indices_view[i][0] = start
+        slice_indices_view[i][1] = end
+    return slice_indices
+
+
+cdef np.ndarray[DTYPE_t, ndim=2] _fast_convert_to_np_array(list list_of_list):
+    """
+    Faster function to convert a list of equal-length DTYPE_t rows into a 2-D array; only fast when there
+    are many rows and few columns. NOTE(review): an empty list likely makes reshape((0, -1)) raise -- confirm.
+    """
+    cdef np.ndarray[DTYPE_t, ndim=1] flat = np.fromiter(chain.from_iterable(list_of_list), DTYPE, -1)  # flatten in one pass
+    return flat.reshape((len(list_of_list), -1))  # one output row per input row; column count inferred
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.nonecheck(False)
+cpdef np.ndarray[DTYPE_t, ndim=2] _get_slice_indices_fast(np.ndarray[DTYPE_t, ndim=1] sizes, str break_mode, int block_size, int document_sep_len):
+    cdef DTYPE_t tok_idx = 0  # flat start offset of the block being built
+    cdef DTYPE_t sz_idx = 0  # index of the next entry of sizes to consume
+    cdef DTYPE_t curr_size = 0  # accumulated size of the block being built
+    cdef DTYPE_t i = 0  # declared but not used in this function
+    cdef DTYPE_t length  # declared but not used in this function
+    cdef DTYPE_t total_size  # declared but not used in this function
+    cdef DTYPE_t[:] sizes_view = sizes
+    cdef np.ndarray[DTYPE_t, ndim=2] slice_indices
+    cdef list slice_indices_list = []  # (start, end) tuples collected by the 'complete*' modes
+
+    if break_mode is None or break_mode == 'none':
+        slice_indices = _get_slice_indices_none_mode(sizes, block_size)  # fixed-size blocks that ignore entry boundaries
+    elif break_mode == 'complete':
+        while sz_idx < len(sizes_view):
+            if curr_size + sizes_view[sz_idx] <= block_size or curr_size == 0:  # an entry is always accepted into an empty block
+                curr_size += sizes_view[sz_idx]
+                sz_idx += 1
+            else:
+                slice_indices_list.append((tok_idx, tok_idx + curr_size))  # close the block; sz_idx is retried next iteration
+                tok_idx += curr_size
+                curr_size = 0
+        if curr_size > 0:
+            slice_indices_list.append((tok_idx, tok_idx + curr_size))  # flush the trailing partial block
+        slice_indices = _fast_convert_to_np_array(slice_indices_list)
+    elif break_mode == 'complete_doc':
+        while sz_idx < len(sizes_view):
+            if (
+                (curr_size + sizes_view[sz_idx] <= block_size or curr_size == 0)
+                # an empty sentence indicates end-of-document:
+                and sizes_view[sz_idx] != document_sep_len
+            ):
+                curr_size += sizes_view[sz_idx]
+                sz_idx += 1
+            else:
+                # Only keep non-empty documents.
+                if curr_size > 1:
+                    slice_indices_list.append((tok_idx, tok_idx + curr_size))
+                tok_idx += curr_size
+                curr_size = 0
+                if sizes_view[sz_idx] == document_sep_len:  # skip over the separator entry itself
+                    tok_idx += sizes_view[sz_idx]
+                    sz_idx += 1
+        if curr_size > 1:
+            slice_indices_list.append((tok_idx, tok_idx + curr_size))  # flush the trailing block (same > 1 threshold)
+        slice_indices = _fast_convert_to_np_array(slice_indices_list)
+    elif break_mode == 'eos':
+        slice_indices = np.zeros((len(sizes), 2), dtype=DTYPE)  # exactly one block per entry of sizes
+        cumsum = sizes.cumsum(axis=0)
+        slice_indices[1:, 0] = cumsum[:cumsum.shape[0] - 1]  # each start is the previous end; the first start stays 0
+        slice_indices[:, 1] = cumsum
+    else:
+        raise ValueError('Invalid break_mode: ' + break_mode)
+    return slice_indices
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.nonecheck(False)
+cpdef np.ndarray[DTYPE_t, ndim=2] _get_block_to_dataset_index_fast(np.ndarray[DTYPE_t, ndim=1] sizes, np.ndarray[DTYPE_t, ndim=2] slice_indices):
+    cdef DTYPE_t start_ds_idx
+    cdef DTYPE_t start_offset
+    cdef DTYPE_t end_ds_idx
+    cdef DTYPE_t i
+    cdef DTYPE_t s
+    cdef DTYPE_t e
+    cdef DatasetSearcher ds = DatasetSearcher(sizes)  # stateful cursor shared across all rows
+    cdef np.ndarray[DTYPE_t, ndim=2] block_to_dataset_index = np.zeros([len(slice_indices), 3], dtype=DTYPE)  # one 3-column row per slice
+    cdef DTYPE_t[:, :] block_to_dataset_index_view = block_to_dataset_index
+    cdef DTYPE_t[:, :] slice_indices_view = slice_indices
+    cdef Py_ssize_t x_max = slice_indices.shape[0]
+
+    for i in range(x_max):
+        s = slice_indices_view[i][0]  # flat start of the block
+        e = slice_indices_view[i][1]  # flat end of the block (seek target below is e - 1)
+        ds.seek(s)
+        start_ds_idx = ds.current_index
+        start_offset = ds.current_offset
+        if e <= s:
+            end_ds_idx = start_ds_idx  # empty block: start and end entry coincide
+        else:
+            ds.seek(e - 1)  # position of the block's last element
+            end_ds_idx = ds.current_index
+        block_to_dataset_index_view[i][0] = start_ds_idx  # starting index in dataset
+        block_to_dataset_index_view[i][1] = start_offset  # starting offset within starting index
+        block_to_dataset_index_view[i][2] = end_ds_idx    # ending index in dataset
+    return block_to_dataset_index
+
+
+cdef class DatasetSearcher(object):
+    """Helper for mapping "flat" indices to indices and offsets in an
+    underlying dataset."""
+    cdef DTYPE_t current_i  # "flat" position the cursor currently points at
+    cdef DTYPE_t current_offset  # offset inside the entry at current_index
+    cdef DTYPE_t current_index  # index into sizes of the entry holding current_i
+    cdef DTYPE_t[:] sizes  # per-entry sizes supplied at construction
+
+    def __init__(self, DTYPE_t[:] sizes):
+        self.sizes = sizes
+        self.reset()  # start with the cursor at flat position 0
+
+    cdef reset(self):
+        self.current_offset = 0     # offset within current index in underlying dataset
+        self.current_i = 0          # "flat" index
+        self.current_index = 0      # index in underlying dataset
+
+    @cython.boundscheck(False)
+    @cython.wraparound(False)
+    @cython.nonecheck(False)
+    cdef int step(self, DTYPE_t i):  # moves toward flat index i; returns 1 if another call is needed, else 0
+        cdef DTYPE_t to_consume
+        cdef DTYPE_t remaining
+        if i < self.current_i:
+            self.reset()  # the cursor only moves forward, so a backward target restarts from 0
+        if i > self.current_i:
+            to_consume = i - self.current_i
+            remaining = self.sizes[self.current_index] - self.current_offset  # elements left in the current entry
+            if remaining > to_consume:
+                self.current_offset += to_consume  # target lies inside the current entry
+                self.current_i += to_consume
+            else:
+                assert remaining > 0
+                self.current_i += remaining  # consume the rest of this entry and move to the next one
+                self.current_index += 1
+                self.current_offset = 0
+                return 1
+        return 0
+
+    @cython.boundscheck(False)
+    @cython.wraparound(False)
+    @cython.nonecheck(False)
+    cdef seek(self, DTYPE_t i):  # repeats step(i) until it reports completion
+        cdef int not_done = 1
+        while not_done == 1:
+            not_done = self.step(i)
+        assert self.current_i == i
diff --git a/fairseq/data/transform_eos_dataset.py b/fairseq/data/transform_eos_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..4ce5ad811bbcd01c1d46cea6acd5206e730f4f76
--- /dev/null
+++ b/fairseq/data/transform_eos_dataset.py
@@ -0,0 +1,121 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+
+from . import FairseqDataset
+
+
+class TransformEosDataset(FairseqDataset):
+    """A :class:`~fairseq.data.FairseqDataset` wrapper that appends or strips a trailing EOS.
+    The transformation is applied in :func:`collater`; indexing returns the wrapped item unchanged.
+
+    Args:
+        dataset (~fairseq.data.FairseqDataset): dataset to wrap
+        eos (int): index of the end-of-sentence symbol
+        append_eos_to_src (bool, optional): append EOS to the end of src
+        remove_eos_from_src (bool, optional): remove EOS from the end of src
+        append_eos_to_tgt (bool, optional): append EOS to the end of tgt
+        remove_eos_from_tgt (bool, optional): remove EOS from the end of tgt
+        has_target (bool, optional): if False, tgt EOS checks are skipped and :func:`size` is not adjusted
+    """
+
+    def __init__(
+        self,
+        dataset,
+        eos,
+        append_eos_to_src=False,
+        remove_eos_from_src=False,
+        append_eos_to_tgt=False,
+        remove_eos_from_tgt=False,
+        has_target=True,
+    ):
+        if not isinstance(dataset, FairseqDataset):
+            raise ValueError('dataset must be an instance of FairseqDataset')
+        if append_eos_to_src and remove_eos_from_src:
+            raise ValueError('cannot combine append_eos_to_src and remove_eos_from_src')
+        if append_eos_to_tgt and remove_eos_from_tgt:
+            raise ValueError('cannot combine append_eos_to_tgt and remove_eos_from_tgt')
+
+        self.dataset = dataset
+        self.eos = torch.LongTensor([eos])  # kept as a 1-element tensor so it can be concatenated directly
+        self.append_eos_to_src = append_eos_to_src
+        self.remove_eos_from_src = remove_eos_from_src
+        self.append_eos_to_tgt = append_eos_to_tgt
+        self.remove_eos_from_tgt = remove_eos_from_tgt
+        self.has_target = has_target
+
+        # precompute how we should adjust the reported sizes
+        self._src_delta = 0
+        self._src_delta += 1 if append_eos_to_src else 0
+        self._src_delta -= 1 if remove_eos_from_src else 0
+        self._tgt_delta = 0
+        self._tgt_delta += 1 if append_eos_to_tgt else 0
+        self._tgt_delta -= 1 if remove_eos_from_tgt else 0
+
+        self._checked_src = False  # the EOS sanity check runs once per instance, not per sample
+        self._checked_tgt = False
+
+    def _check_src(self, src, expect_eos):
+        if not self._checked_src:
+            assert (src[-1] == self.eos[0]) == expect_eos  # last token must (not) be EOS, as expected
+            self._checked_src = True
+
+    def _check_tgt(self, tgt, expect_eos):
+        if self.has_target and not self._checked_tgt:
+            assert (tgt[-1] == self.eos[0]) == expect_eos  # same one-time check for the target side
+            self._checked_tgt = True
+
+    def __getitem__(self, index):
+        return self.dataset[index]  # untransformed; EOS handling happens in collater
+
+    def __len__(self):
+        return len(self.dataset)
+
+    def collater(self, samples):
+
+        def transform(item):  # edits item['source'] / item['target'] in place and returns item
+            if self.append_eos_to_src:
+                self.eos = self.eos.to(device=item['source'].device)  # keep eos on the sample's device for cat
+                self._check_src(item['source'], expect_eos=False)
+                item['source'] = torch.cat([item['source'], self.eos])
+            if self.remove_eos_from_src:
+                self.eos = self.eos.to(device=item['source'].device)
+                self._check_src(item['source'], expect_eos=True)
+                item['source'] = item['source'][:-1]
+            if self.append_eos_to_tgt:
+                self.eos = self.eos.to(device=item['target'].device)
+                self._check_tgt(item['target'], expect_eos=False)
+                item['target'] = torch.cat([item['target'], self.eos])
+            if self.remove_eos_from_tgt:
+                self.eos = self.eos.to(device=item['target'].device)
+                self._check_tgt(item['target'], expect_eos=True)
+                item['target'] = item['target'][:-1]
+            return item
+
+        samples = list(map(transform, samples))
+        return self.dataset.collater(samples)  # the wrapped dataset does the actual batching
+
+    def num_tokens(self, index):
+        return self.dataset.num_tokens(index)  # forwarded as-is; the size deltas are not applied here
+
+    def size(self, index):
+        if self.has_target:
+            src_len, tgt_len = self.dataset.size(index)
+            return (src_len + self._src_delta, tgt_len + self._tgt_delta)
+        else:
+            return self.dataset.size(index)  # no delta is applied when there is no target
+
+    def ordered_indices(self):
+        # NOTE: we assume that the ordering does not change based on the
+        # addition or removal of eos
+        return self.dataset.ordered_indices()
+
+    @property
+    def supports_prefetch(self):
+        return getattr(self.dataset, 'supports_prefetch', False)  # False when the wrapped dataset lacks the attribute
+
+    def prefetch(self, indices):
+        return self.dataset.prefetch(indices)
diff --git a/fairseq/data/transform_eos_lang_pair_dataset.py b/fairseq/data/transform_eos_lang_pair_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..55137ca55cc442d760a21c93c215bd74d3cea868
--- /dev/null
+++ b/fairseq/data/transform_eos_lang_pair_dataset.py
@@ -0,0 +1,89 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from . import FairseqDataset
+import torch
+from typing import Optional
+
+
+class TransformEosLangPairDataset(FairseqDataset):
+    """A :class:`~fairseq.data.FairseqDataset` wrapper that replaces the source EOS
+    and/or target BOS symbol on collated samples of a language pair dataset.
+
+    The transformation is applied in :func:`collater`; batches without 'net_input' are returned as-is.
+
+    Args:
+        dataset (~fairseq.data.FairseqDataset): dataset that collates sample into
+            LanguagePairDataset schema
+        src_eos (int): original source end-of-sentence symbol index to be replaced
+        new_src_eos (int, optional): new end-of-sentence symbol index to replace source eos symbol
+        tgt_bos (int, optional): original target beginning-of-sentence symbol index to be replaced
+        new_tgt_bos (int, optional): new beginning-of-sentence symbol index to replace at the
+            beginning of 'prev_output_tokens'
+    """
+
+    def __init__(
+        self,
+        dataset: FairseqDataset,
+        src_eos: int,
+        new_src_eos: Optional[int] = None,
+        tgt_bos: Optional[int] = None,
+        new_tgt_bos: Optional[int] = None,
+    ):
+        self.dataset = dataset
+        self.src_eos = src_eos
+        self.new_src_eos = new_src_eos
+        self.tgt_bos = tgt_bos
+        self.new_tgt_bos = new_tgt_bos
+
+    def __getitem__(self, index):
+        return self.dataset[index]
+
+    def __len__(self):
+        return len(self.dataset)
+
+    def collater(self, samples, **extra_args):
+        samples = self.dataset.collater(samples, **extra_args)
+        if 'net_input' not in samples:  # presumably an empty batch: nothing to rewrite
+            return samples
+        net_input = samples['net_input']
+        if self.new_src_eos is not None:
+            src_tokens = net_input['src_tokens']  # same tensor object, so edits below land in samples
+            if self.dataset.left_pad_source:
+                assert (src_tokens[:, -1] != self.src_eos).sum() == 0
+                src_tokens[:, -1] = self.new_src_eos
+            else:
+                eos_idx = net_input['src_lengths'] - 1  # per-row position of the last real token
+                assert (src_tokens[torch.arange(eos_idx.size(0)), eos_idx] != self.src_eos).sum() == 0
+                src_tokens.scatter_(1, eos_idx.unsqueeze(1), self.new_src_eos)
+
+        if self.new_tgt_bos is not None and 'prev_output_tokens' in net_input:
+            if self.dataset.left_pad_target:
+                # TODO: support different padding direction on target side
+                raise NotImplementedError(
+                    'TransformEosLangPairDataset does not implement --left-pad-target True option'
+                )
+            prev_output_tokens = net_input['prev_output_tokens']
+            assert (prev_output_tokens[:, 0] != self.tgt_bos).sum() == 0
+            prev_output_tokens[:, 0] = self.new_tgt_bos
+
+        return samples
+
+    def num_tokens(self, index):
+        return self.dataset.num_tokens(index)
+
+    def size(self, index):
+        return self.dataset.size(index)
+
+    def ordered_indices(self):
+        return self.dataset.ordered_indices()
+
+    @property
+    def supports_prefetch(self):
+        return getattr(self.dataset, 'supports_prefetch', False)
+
+    def prefetch(self, indices):
+        return self.dataset.prefetch(indices)
diff --git a/fairseq/distributed_utils.py b/fairseq/distributed_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ee89adce9e4bee192253fb247e879a064036bf5
--- /dev/null
+++ b/fairseq/distributed_utils.py
@@ -0,0 +1,322 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import os
+import pickle
+import random
+import socket
+import struct
+import subprocess
+import warnings
+from collections import OrderedDict
+from typing import Any, Dict, Mapping
+
+import torch
+import torch.distributed as dist
+
+from fairseq import utils
+
+
+logger = logging.getLogger(__name__)
+
+
+def is_master(args):
+    return args.distributed_rank == 0  # rank 0 is treated as the master worker
+
+
+def infer_init_method(args, force_distributed=False):  # fills in args.distributed_* in place; returns None
+    if args.distributed_init_method is not None or getattr(args, 'tpu', False):
+        return  # explicitly configured, or a TPU run: nothing to infer
+
+    # support torch.distributed.launch
+    if all(key in os.environ for key in [
+        'MASTER_ADDR', 'MASTER_PORT', 'WORLD_SIZE', 'RANK'
+    ]):
+        args.distributed_init_method = 'env://'
+        args.distributed_world_size = int(os.environ['WORLD_SIZE'])
+        args.distributed_rank = int(os.environ['RANK'])
+        # processes are created by torch.distributed.launch
+        args.distributed_no_spawn = True
+
+    # we can determine the init method automatically for Slurm
+    elif args.distributed_port > 0:
+        node_list = os.environ.get('SLURM_STEP_NODELIST')
+        if node_list is None:
+            node_list = os.environ.get('SLURM_JOB_NODELIST')  # fall back to the job-level node list
+        if node_list is not None:
+            try:
+                hostnames = subprocess.check_output(['scontrol', 'show', 'hostnames', node_list])
+                args.distributed_init_method = 'tcp://{host}:{port}'.format(
+                    host=hostnames.split()[0].decode('utf-8'),  # first listed host serves as the rendezvous address
+                    port=args.distributed_port,
+                )
+                nnodes = int(os.environ.get('SLURM_NNODES'))
+                ntasks_per_node = os.environ.get('SLURM_NTASKS_PER_NODE')
+                if ntasks_per_node is not None:
+                    ntasks_per_node = int(ntasks_per_node)
+                else:
+                    ntasks = int(os.environ.get('SLURM_NTASKS'))
+                    nnodes = int(os.environ.get('SLURM_NNODES'))  # same value as read above
+                    assert ntasks % nnodes == 0
+                    ntasks_per_node = int(ntasks / nnodes)
+                if ntasks_per_node == 1:
+                    # one task per node: that task's rank is the first rank on its node
+                    assert args.distributed_world_size % nnodes == 0
+                    gpus_per_node = args.distributed_world_size // nnodes
+                    node_id = int(os.environ.get('SLURM_NODEID'))
+                    args.distributed_rank = node_id * gpus_per_node
+                else:
+                    assert ntasks_per_node == args.distributed_world_size // nnodes
+                    args.distributed_no_spawn = True  # Slurm already started one process per rank
+                    args.distributed_rank = int(os.environ.get('SLURM_PROCID'))
+                    args.device_id = int(os.environ.get('SLURM_LOCALID'))
+            except subprocess.CalledProcessError as e:  # scontrol failed
+                raise e
+            except FileNotFoundError:  # Slurm is not installed
+                pass
+
+    elif args.distributed_world_size > 1 or force_distributed:
+        # fallback for single node with multiple GPUs
+        assert args.distributed_world_size <= torch.cuda.device_count()
+        port = random.randint(10000, 20000)  # randomly chosen local port; not checked for availability here
+        args.distributed_init_method = 'tcp://localhost:{port}'.format(port=port)
+
+
+def distributed_init(args):  # initializes the process group (or XLA) and returns this worker's rank
+    if not getattr(args, 'tpu', False):
+        if torch.distributed.is_initialized():
+            warnings.warn('Distributed is already initialized, cannot initialize twice!')
+        else:
+            logger.info('distributed init (rank {}): {}'.format(
+                args.distributed_rank, args.distributed_init_method,
+            ))
+            dist.init_process_group(
+                backend=args.distributed_backend,
+                init_method=args.distributed_init_method,
+                world_size=args.distributed_world_size,
+                rank=args.distributed_rank,
+            )
+            logger.info('initialized host {} as rank {}'.format(
+                socket.gethostname(), args.distributed_rank,
+            ))
+
+            # perform a dummy all-reduce to initialize the NCCL communicator
+            if torch.cuda.is_available():
+                dist.all_reduce(torch.zeros(1).cuda())
+
+        args.distributed_rank = torch.distributed.get_rank()  # re-read the rank from the initialized group
+    else:
+        import torch_xla.core.xla_model as xm  # imported lazily: only needed for TPU runs
+        assert xm.xrt_world_size() == args.distributed_world_size
+        args.device_id = xm.get_local_ordinal()
+        args.distributed_rank = xm.get_ordinal()
+        xm.rendezvous('distributed_init')  # wait for all workers
+        xm.mark_step()
+
+    if is_master(args):
+        logging.getLogger().setLevel(logging.INFO)
+    else:
+        logging.getLogger().setLevel(logging.WARNING)  # non-master workers only log warnings and above
+
+    if args.model_parallel_size > 1:
+        try:
+            from fairseq.model_parallel.megatron.mpu import (
+                get_model_parallel_rank,
+                initialize_model_parallel,
+                model_parallel_cuda_manual_seed,
+            )
+        except ImportError:
+            raise ImportError(
+                '\n\nPlease install the megatron submodule:'
+                '\n\n  git submodule update --init '
+                'fairseq/model_parallel/megatron'
+            )
+        initialize_model_parallel(args.model_parallel_size)
+        model_parallel_cuda_manual_seed(args.seed)
+        model_part_number = get_model_parallel_rank()
+        args.checkpoint_suffix += '-model_part-{0}'.format(model_part_number)  # tags the suffix with this worker's model part
+    return args.distributed_rank
+
+
+def distributed_main(i, main, args, kwargs):  # per-process entry point; i is used as the device id
+    args.device_id = i
+    if torch.cuda.is_available() and not args.cpu and not getattr(args, "tpu", False):
+        torch.cuda.set_device(args.device_id)
+    if args.distributed_rank is None:  # torch.multiprocessing.spawn
+        args.distributed_rank = kwargs.pop('start_rank', 0) + i  # 'start_rank' is removed so main() never receives it
+
+    args.distributed_rank = distributed_init(args)
+
+    after_distributed_init_fn = kwargs.pop('after_distributed_init_fn', None)  # optional hook, also removed from kwargs
+    if after_distributed_init_fn:
+        args = after_distributed_init_fn(args)  # the hook's return value replaces args
+
+    main(args, **kwargs)
+
+
+def call_main(args, main, **kwargs):  # runs main(args, **kwargs) distributed, on TPU, or in-process
+    if args.distributed_init_method is None:
+        infer_init_method(args)  # may set args.distributed_init_method and related fields
+
+    if args.distributed_init_method is not None:
+        # distributed training
+        if not args.distributed_no_spawn:
+            start_rank = args.distributed_rank
+            args.distributed_rank = None  # assign automatically
+            kwargs['start_rank'] = start_rank  # consumed by distributed_main to compute each child's rank
+            torch.multiprocessing.spawn(
+                fn=distributed_main,
+                args=(main, args, kwargs),
+                nprocs=min(
+                    torch.cuda.device_count(),
+                    args.distributed_world_size,
+                ),
+            )
+        else:
+            distributed_main(args.device_id, main, args, kwargs)  # processes already exist: run in this one
+    elif getattr(args, "tpu", False):
+        import torch_xla.distributed.xla_multiprocessing as xmp  # imported lazily: only needed for TPU runs
+        torch.multiprocessing.set_sharing_strategy("file_system")
+        xmp.spawn(
+            fn=distributed_main,
+            args=(main, args, kwargs),
+            nprocs=8,  # use all 8 TPU cores
+        )
+    else:
+        # single GPU main
+        main(args, **kwargs)
+
+
+def get_rank():
+    return dist.get_rank()  # thin wrapper over torch.distributed.get_rank()
+
+
+def get_world_size():
+    return dist.get_world_size()  # thin wrapper over torch.distributed.get_world_size()
+
+
+def get_default_group():
+    return dist.group.WORLD  # the group used by all_reduce when no group is given
+
+
+def all_reduce(tensor, group=None):
+    """All-reduce *tensor* via torch.distributed, or via XLA when *group* is a ('tpu', groups) tuple."""
+    if not (isinstance(group, tuple) and group[0] == 'tpu'):
+        target_group = get_default_group() if group is None else group
+        return dist.all_reduce(tensor, group=target_group)
+    # Imported lazily so that non-TPU runs never need torch_xla.
+    import torch_xla.core.xla_model as xm
+    return xm.all_reduce('sum', [tensor], groups=group[1])
+
+
+def all_gather_list(data, group=None, max_size=16384):
+    """Gathers arbitrary data from all nodes into a list.
+
+    Similar to :func:`~torch.distributed.all_gather` but for arbitrary Python
+    data. Note that *data* must be picklable.
+
+    Args:
+        data (Any): data from the local worker to be gathered on other workers
+        group (optional): group of the collective
+        max_size (int, optional): maximum size in bytes of one worker's pickled
+            data plus its 4-byte length header; larger payloads raise ValueError
+    """
+    rank = get_rank()
+    world_size = get_world_size()
+
+    buffer_size = max_size * world_size  # one max_size-byte slot per worker
+    if not hasattr(all_gather_list, '_buffer') or \
+            all_gather_list._buffer.numel() < buffer_size:
+        all_gather_list._buffer = torch.cuda.ByteTensor(buffer_size)  # buffers are cached on the function object
+        all_gather_list._cpu_buffer = torch.ByteTensor(max_size).pin_memory()
+    buffer = all_gather_list._buffer
+    buffer.zero_()  # other workers' slots must be zero so the reduction leaves their bytes intact
+    cpu_buffer = all_gather_list._cpu_buffer
+
+    data = utils.move_to_cpu(data)
+    enc = pickle.dumps(data)
+    enc_size = len(enc)
+    header_size = 4  # size of header that contains the length of the encoded data
+    size = header_size + enc_size
+    if size > max_size:
+        raise ValueError('encoded data size ({}) exceeds max_size ({})'.format(size, max_size))
+
+    header = struct.pack(">I", enc_size)  # big-endian unsigned 32-bit payload length
+    cpu_buffer[:size] = torch.ByteTensor(list(header + enc))
+    start = rank * max_size  # this worker writes only into its own slot
+    buffer[start:start + size].copy_(cpu_buffer[:size])
+
+    all_reduce(buffer, group=group)  # reducing the mostly-zero slots assembles every worker's payload
+
+    buffer = buffer.cpu()
+    try:
+        result = []
+        for i in range(world_size):
+            out_buffer = buffer[i * max_size:(i + 1) * max_size]
+            enc_size, = struct.unpack(">I", bytes(out_buffer[:header_size].tolist()))
+            if enc_size > 0:  # a zero length means that slot holds no payload; it is skipped
+                result.append(pickle.loads(bytes(out_buffer[header_size:header_size + enc_size].tolist())))  # NOTE(review): unpickles peer data
+        return result
+    except pickle.UnpicklingError:
+        raise Exception(
+            'Unable to unpickle data from other workers. all_gather_list requires all '
+            'workers to enter the function together, so this error usually indicates '
+            'that the workers have fallen out of sync somehow. Workers can fall out of '
+            'sync if one of them runs out of memory, or if there are other conditions '
+            'in your training script that can cause one worker to finish an epoch '
+            'while other workers are still iterating over their portions of the data. '
+            'Try rerunning with --ddp-backend=no_c10d and see if that helps.'
+        )
+
+
+def all_reduce_dict(
+    data: Mapping[str, Any],
+    device,
+    group=None,
+) -> Dict[str, Any]:
+    """
+    AllReduce a dictionary of values across workers. We separately
+    reduce items that are already on the device and items on CPU for
+    better performance.
+
+    Args:
+        data (Mapping[str, Any]): dictionary of data to all-reduce, but
+            cannot be a nested dictionary
+        device (torch.device): device for the reduction
+        group (optional): group of the collective
+
+    Returns:
+        OrderedDict of reduced double tensors, keyed and ordered like *data*
+    """
+    data_keys = list(data.keys())
+    cpu_data = OrderedDict()
+    device_data = OrderedDict()
+    for k in data_keys:
+        t = data[k]
+        if not torch.is_tensor(t):
+            cpu_data[k] = torch.tensor(t, dtype=torch.double)  # plain Python values become double tensors
+        elif t.device.type != device.type:
+            cpu_data[k] = t.to(dtype=torch.double)  # tensor living on a device type other than the target
+        else:
+            device_data[k] = t.to(dtype=torch.double)
+
+    def _all_reduce_dict(data: OrderedDict):
+        if len(data) == 0:
+            return data  # nothing to reduce: skip the collective entirely
+        buf = torch.stack(list(data.values())).to(device=device)  # one stacked buffer, one collective
+        all_reduce(buf, group=group)
+        return {k: buf[i] for i, k in enumerate(data)}
+
+    cpu_data = _all_reduce_dict(cpu_data)
+    device_data = _all_reduce_dict(device_data)
+
+    def get_from_stack(key):
+        if key in cpu_data:
+            return cpu_data[key]
+        elif key in device_data:
+            return device_data[key]
+        raise KeyError(key)
+
+    return OrderedDict([(key, get_from_stack(key)) for key in data_keys])
diff --git a/fairseq/file_io.py b/fairseq/file_io.py
new file mode 100644
index 0000000000000000000000000000000000000000..b57373f8b51ab23394296346c3e2a6a97a891f78
--- /dev/null
+++ b/fairseq/file_io.py
@@ -0,0 +1,106 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import os
+import shutil
+from typing import List, Optional
+
+
+try:
+    from fvcore.common.file_io import PathManager as FVCorePathManager
+
+except ImportError:
+    FVCorePathManager = None
+
+
+class PathManager:
+    """
+    Wrapper for insulating OSS I/O (using Python builtin operations) from
+    fvcore's PathManager abstraction (for transparently handling various
+    internal backends). Methods delegate to fvcore's PathManager when it
+    was importable and otherwise use builtin/os/shutil calls.
+    """
+
+    @staticmethod
+    def open(
+        path: str,
+        mode: str = "r",
+        buffering: int = -1,
+        encoding: Optional[str] = None,
+        errors: Optional[str] = None,
+        newline: Optional[str] = None,
+    ):
+        """Open ``path`` and return the file object (same args as ``open``)."""
+        opts = dict(mode=mode, buffering=buffering, encoding=encoding,
+                    errors=errors, newline=newline)
+        if FVCorePathManager:
+            return FVCorePathManager.open(path=path, **opts)
+        return open(path, **opts)
+
+    @staticmethod
+    def copy(src_path: str, dst_path: str, overwrite: bool = False) -> bool:
+        """Copy a file; the builtin fallback returns True once copied."""
+        if FVCorePathManager:
+            return FVCorePathManager.copy(
+                src_path=src_path, dst_path=dst_path, overwrite=overwrite
+            )
+        # NOTE: the fallback does not consult ``overwrite`` and always
+        # replaces dst_path. shutil.copyfile returns the destination path,
+        # so return True explicitly to honour the declared ``bool``.
+        shutil.copyfile(src_path, dst_path)
+        return True
+
+    @staticmethod
+    def get_local_path(path: str, **kwargs) -> str:
+        """Return a local path for ``path`` (unchanged without fvcore)."""
+        if FVCorePathManager:
+            return FVCorePathManager.get_local_path(path, **kwargs)
+        return path
+
+    @staticmethod
+    def exists(path: str) -> bool:
+        if FVCorePathManager:
+            return FVCorePathManager.exists(path)
+        return os.path.exists(path)
+
+    @staticmethod
+    def isfile(path: str) -> bool:
+        if FVCorePathManager:
+            return FVCorePathManager.isfile(path)
+        return os.path.isfile(path)
+
+    @staticmethod
+    def ls(path: str) -> List[str]:
+        if FVCorePathManager:
+            return FVCorePathManager.ls(path)
+        return os.listdir(path)
+
+    @staticmethod
+    def mkdirs(path: str) -> None:
+        """Create directory ``path``; the fallback tolerates an existing one."""
+        if FVCorePathManager:
+            return FVCorePathManager.mkdirs(path)
+        os.makedirs(path, exist_ok=True)
+
+    @staticmethod
+    def rm(path: str) -> None:
+        """Remove the file at ``path``."""
+        if FVCorePathManager:
+            return FVCorePathManager.rm(path)
+        os.remove(path)
+
+    @staticmethod
+    def chmod(path: str, mode: int) -> None:
+        # Never delegated to fvcore; paths containing "manifold" are skipped.
+        if "manifold" not in path:
+            os.chmod(path, mode)
+
+    @staticmethod
+    def register_handler(handler) -> None:
+        """Register a handler with fvcore; a no-op when fvcore is absent."""
+        if FVCorePathManager:
+            return FVCorePathManager.register_handler(handler=handler)
diff --git a/fairseq/file_utils.py b/fairseq/file_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..62278b367d659dc1c565428f903832d3a882848f
--- /dev/null
+++ b/fairseq/file_utils.py
@@ -0,0 +1,331 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+Utilities for working with the local dataset cache.
+This file is adapted from `AllenNLP <https://github.com/allenai/allennlp>`_
+and `huggingface <https://github.com/huggingface>`_.
+"""
+
+import fnmatch
+from functools import wraps, partial
+from hashlib import sha256
+from io import open
+import json
+import logging
+import os
+import shutil
+import tarfile
+import tempfile
+
+
+try:
+    from torch.hub import _get_torch_home
+    torch_cache_home = _get_torch_home()
+except ImportError:  # helper not importable: use TORCH_HOME, else <XDG_CACHE_HOME or ~/.cache>/torch
+    torch_cache_home = os.path.expanduser(
+        os.getenv('TORCH_HOME', os.path.join(
+            os.getenv('XDG_CACHE_HOME', '~/.cache'), 'torch')))
+default_cache_path = os.path.join(torch_cache_home, 'pytorch_fairseq')  # used when PYTORCH_FAIRSEQ_CACHE is unset
+
+try:
+    from urllib.parse import urlparse
+except ImportError:  # fall back to the top-level ``urlparse`` module name
+    from urlparse import urlparse
+
+try:
+    from pathlib import Path
+    PYTORCH_FAIRSEQ_CACHE = Path(
+        os.getenv('PYTORCH_FAIRSEQ_CACHE', default_cache_path))
+except (AttributeError, ImportError):  # NOTE(review): leaves Path undefined, yet functions below call isinstance(..., Path)
+    PYTORCH_FAIRSEQ_CACHE = os.getenv(
+        'PYTORCH_FAIRSEQ_CACHE', default_cache_path)
+
+CONFIG_NAME = "config.json"
+WEIGHTS_NAME = "pytorch_model.bin"
+
+logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
+
+
+def load_archive_file(archive_file):
+    """Resolve an archive path or URL to an extracted directory, or None."""
+    # redirect to the cache, if necessary
+    try:
+        resolved_archive_file = cached_path(archive_file, cache_dir=None)
+    except EnvironmentError:
+        logger.info(
+            "Archive name '{}' was not found in archive name list. "
+            "We assumed '{}' was a path or URL but couldn't find any file "
+            "associated to this path or URL.".format(archive_file, archive_file)
+        )
+        return None
+
+    if resolved_archive_file == archive_file:
+        logger.info("loading archive file {}".format(archive_file))
+    else:
+        logger.info("loading archive file {} from cache at {}".format(
+            archive_file, resolved_archive_file))
+
+    # Extract archive to temp dir and replace .tar.bz2 if necessary
+    if not os.path.isdir(resolved_archive_file):
+        tempdir = tempfile.mkdtemp()
+        try:
+            logger.info("extracting archive file {} to temp dir {}".format(
+                resolved_archive_file, tempdir))
+            ext = os.path.splitext(archive_file)[1][1:]
+            with tarfile.open(resolved_archive_file, 'r:' + ext) as archive:
+                top_dir = os.path.commonprefix(archive.getnames())
+                archive.extractall(tempdir)  # NOTE(review): trusts member paths
+            os.remove(resolved_archive_file)
+            shutil.move(os.path.join(tempdir, top_dir), resolved_archive_file)
+        finally:
+            # always drop the scratch dir; it may already be gone if the move took it
+            shutil.rmtree(tempdir, ignore_errors=True)
+
+    return resolved_archive_file
+
+
+def url_to_filename(url, etag=None):
+    """
+    Map `url` to a deterministic cache filename.
+
+    The name is the hex SHA-256 digest of the UTF-8 encoded URL. When a
+    truthy `etag` is given, the hex SHA-256 digest of the etag is appended
+    after a period, so different etags of one URL get different names.
+    """
+
+    def _hex_digest(text):
+        return sha256(text.encode('utf-8')).hexdigest()
+
+    name_parts = [_hex_digest(url)]
+    if etag:
+        name_parts.append(_hex_digest(etag))
+    return '.'.join(name_parts)
+
+
+def filename_to_url(filename, cache_dir=None):
+    """
+    Look up the url and etag (which may be ``None``) recorded for `filename`.
+
+    The metadata is read from the ``<filename>.json`` file stored next to
+    the cached file. Raise ``EnvironmentError`` if the cached file or its
+    metadata file does not exist.
+    """
+    if cache_dir is None:
+        cache_dir = PYTORCH_FAIRSEQ_CACHE
+    if isinstance(cache_dir, Path):
+        cache_dir = str(cache_dir)
+
+    cached_file = os.path.join(cache_dir, filename)
+    sidecar = cached_file + '.json'
+    # Check the payload first, then its metadata, so the error names
+    # the first missing file.
+    for required in (cached_file, sidecar):
+        if not os.path.exists(required):
+            raise EnvironmentError("file {} not found".format(required))
+
+    with open(sidecar, encoding="utf-8") as handle:
+        info = json.load(handle)
+    return info['url'], info['etag']
+
+
+def cached_path(url_or_filename, cache_dir=None):
+    """
+    Resolve `url_or_filename` to a path on the local filesystem.
+
+    http, https and s3 URLs are fetched through the cache (see
+    ``get_from_cache``) and the cached path is returned. An existing local
+    path is returned as is. A scheme-less path that does not exist raises
+    ``EnvironmentError``; anything else raises ``ValueError``.
+    """
+    if cache_dir is None:
+        cache_dir = PYTORCH_FAIRSEQ_CACHE
+    if isinstance(url_or_filename, Path):
+        url_or_filename = str(url_or_filename)
+    if isinstance(cache_dir, Path):
+        cache_dir = str(cache_dir)
+
+    scheme = urlparse(url_or_filename).scheme
+    if scheme in ('http', 'https', 's3'):
+        # Remote resource: serve it from the cache, downloading if needed.
+        return get_from_cache(url_or_filename, cache_dir)
+    if os.path.exists(url_or_filename):
+        return url_or_filename
+    if scheme != '':
+        # Neither a supported URL nor an existing local file.
+        raise ValueError(
+            "unable to parse {} as a URL or as a local path".format(url_or_filename)
+        )
+    raise EnvironmentError("file {} not found".format(url_or_filename))
+
+
+def split_s3_path(url):
+    """Split a full s3 path into the bucket name and path."""
+    parts = urlparse(url)
+    bucket_name, key = parts.netloc, parts.path
+    # Both a bucket and a non-empty object path are required.
+    if not (bucket_name and key):
+        raise ValueError("bad s3 path {}".format(url))
+    # Strip the single leading '/' from the path.
+    if key[:1] == "/":
+        key = key[1:]
+    return bucket_name, key
+
+
+def s3_request(func):
+    """
+    Wrapper function for s3 requests in order to create more helpful error
+    messages: ``ClientError`` 404 responses become ``EnvironmentError``.
+    """
+
+    @wraps(func)
+    def wrapper(url, *args, **kwargs):
+        from botocore.exceptions import ClientError
+        try:
+            return func(url, *args, **kwargs)
+        except ClientError as exc:
+            # Codes may be non-numeric (e.g. "NoSuchKey"); never int() them.
+            if str(exc.response["Error"]["Code"]) != "404":
+                raise
+            raise EnvironmentError("file {} not found".format(url)) from exc
+
+    return wrapper
+
+
+@s3_request
+def s3_etag(url):
+    """Return the ``e_tag`` attribute of the S3 object addressed by `url`."""
+    import boto3
+    s3 = boto3.resource("s3")
+    # split_s3_path raises ValueError for a malformed s3 URL
+    bucket, key = split_s3_path(url)
+    return s3.Object(bucket, key).e_tag
+
+
+@s3_request
+def s3_get(url, temp_file):
+    """Download the S3 object at `url` into the file object `temp_file`."""
+    import boto3
+    s3 = boto3.resource("s3")
+    bucket, key = split_s3_path(url)
+    s3.Bucket(bucket).download_fileobj(key, temp_file)
+
+
+def request_wrap_timeout(func, url):
+    """Call ``func(timeout=t)`` with growing timeouts; RuntimeError if all time out."""
+    import requests
+    for attempt, timeout in enumerate((10, 20, 40, 60, 60), start=1):
+        try:
+            return func(timeout=timeout)
+        except requests.exceptions.Timeout as e:
+            logger.warning("Request for %s timed-out after %d secs (attempt %d)",
+                           url, timeout, attempt, exc_info=e)
+    raise RuntimeError(f"Unable to fetch file {url}")
+
+
+def http_get(url, temp_file):
+    """Stream `url` into the file object `temp_file`, showing a byte progress bar."""
+    import requests
+    from tqdm import tqdm
+
+    response = request_wrap_timeout(partial(requests.get, url, stream=True), url)
+    length_header = response.headers.get('Content-Length')
+    expected_bytes = None if length_header is None else int(length_header)
+    bar = tqdm(unit="B", total=expected_bytes)
+    for block in filter(None, response.iter_content(chunk_size=1024)):  # skip empty keep-alive chunks
+        bar.update(len(block))
+        temp_file.write(block)
+    bar.close()
+
+
+def get_from_cache(url, cache_dir=None):
+    """
+    Given a URL, look for the corresponding dataset in the local cache.
+    If it's not there, download it. Then return the path to the cached file.
+    A ``<cache file>.json`` file recording the url and etag is written too.
+    """
+    if cache_dir is None:
+        cache_dir = PYTORCH_FAIRSEQ_CACHE
+    if isinstance(cache_dir, Path):
+        cache_dir = str(cache_dir)
+
+    # exist_ok: another process may create the directory at the same time
+    os.makedirs(cache_dir, exist_ok=True)
+
+    # Get eTag to add to filename, if it exists.
+    if url.startswith("s3://"):
+        etag = s3_etag(url)
+    else:
+        try:
+            import requests
+            response = request_wrap_timeout(partial(requests.head, url, allow_redirects=True), url)
+            if response.status_code != 200:
+                etag = None
+            else:
+                etag = response.headers.get("ETag")
+        except EnvironmentError:
+            etag = None
+
+    filename = url_to_filename(url, etag)
+
+    # get cache path to put the file
+    cache_path = os.path.join(cache_dir, filename)
+
+    # If we don't have a connection (etag is None) and can't identify the file
+    # try a previously cached copy (picked in directory-listing order)
+    if not os.path.exists(cache_path) and etag is None:
+        matching_files = fnmatch.filter(os.listdir(cache_dir), filename + '.*')
+        matching_files = list(filter(lambda s: not s.endswith('.json'), matching_files))
+        if matching_files:
+            cache_path = os.path.join(cache_dir, matching_files[-1])
+
+    if not os.path.exists(cache_path):
+        # Download to temporary file, then copy to cache dir once finished.
+        # Otherwise you get corrupt cache entries if the download gets interrupted.
+        with tempfile.NamedTemporaryFile() as temp_file:
+            logger.info("%s not found in cache, downloading to %s", url, temp_file.name)
+
+            # GET file object
+            if url.startswith("s3://"):
+                s3_get(url, temp_file)
+            else:
+                http_get(url, temp_file)
+
+            # we are copying the file before closing it, so flush to avoid truncation
+            temp_file.flush()
+            # shutil.copyfileobj() starts at the current position, so go to the start
+            temp_file.seek(0)
+
+            logger.info("copying %s to cache at %s", temp_file.name, cache_path)
+            with open(cache_path, 'wb') as cache_file:
+                shutil.copyfileobj(temp_file, cache_file)
+
+            logger.info("creating metadata file for %s", cache_path)
+            meta = {'url': url, 'etag': etag}
+            meta_path = cache_path + '.json'
+            with open(meta_path, 'w') as meta_file:
+                json.dump(meta, meta_file)
+
+            logger.info("removing temp file %s", temp_file.name)
+
+    return cache_path
+
+
+def read_set_from_file(filename):
+    '''
+    Load the distinct lines of a UTF-8 text file into a set.
+
+    The file is expected to hold one item per line; trailing whitespace
+    (including the newline) is stripped from every line before it is
+    added, so lines differing only in trailing whitespace collapse.
+    '''
+    with open(filename, 'r', encoding='utf-8') as handle:
+        return {raw.rstrip() for raw in handle}
+
+
+def get_file_extension(path, dot=True, lower=True):
+    """Return `path`'s extension, optionally without its dot and/or lowercased."""
+    suffix = os.path.splitext(path)[1][(0 if dot else 1):]
+    return suffix.lower() if lower else suffix
diff --git a/fairseq/hub_utils.py b/fairseq/hub_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..e040a8c3f348e6865317e45e4fb20a6aae9cc16f
--- /dev/null
+++ b/fairseq/hub_utils.py
@@ -0,0 +1,269 @@
+#!/usr/bin/env python3 -u
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import copy
+import logging
+import os
+from typing import List, Dict, Iterator, Tuple, Any
+
+import torch
+from torch import nn
+
+from fairseq import utils
+from fairseq.data import encoders
+
+
+logger = logging.getLogger(__name__)
+
+
+def from_pretrained(
+    model_name_or_path,
+    checkpoint_file='model.pt',
+    data_name_or_path='.',
+    archive_map=None,
+    **kwargs
+):
+    """
+    Load a model ensemble and its task from a path, URL or archive_map key.
+
+    An ``archive_map`` entry may be a dict: its 'path' gives the archive,
+    'checkpoint_file' replaces ``checkpoint_file`` and any other item is a
+    default for ``kwargs``. Several checkpoints may be joined with
+    ``os.pathsep``. Returns a dict with keys 'args', 'task' and 'models'.
+    """
+    from fairseq import checkpoint_utils, file_utils
+
+    if archive_map is not None:
+        if model_name_or_path in archive_map:
+            model_name_or_path = archive_map[model_name_or_path]
+        if data_name_or_path is not None and data_name_or_path in archive_map:
+            data_name_or_path = archive_map[data_name_or_path]
+
+        # allow archive_map to set default arg_overrides (e.g., tokenizer, bpe)
+        if isinstance(model_name_or_path, dict):
+            for name, default in model_name_or_path.items():
+                if name == 'checkpoint_file':
+                    checkpoint_file = default
+                elif name != 'path':
+                    # explicit overrides from the caller win over defaults
+                    kwargs.setdefault(name, default)
+            model_name_or_path = model_name_or_path['path']
+
+    model_path = file_utils.load_archive_file(model_name_or_path)
+
+    # convenience hack for loading data and BPE codes from model archive
+    if data_name_or_path.startswith('.'):
+        kwargs['data'] = os.path.abspath(os.path.join(model_path, data_name_or_path))
+    else:
+        kwargs['data'] = file_utils.load_archive_file(data_name_or_path)
+    known_files = (
+        ('code', 'bpe_codes'),
+        ('bpecodes', 'bpe_codes'),
+        ('sentencepiece.bpe.model', 'sentencepiece_model'),
+    )
+    for basename, arg_name in known_files:
+        candidate = os.path.join(model_path, basename)
+        if os.path.exists(candidate):
+            kwargs[arg_name] = candidate
+
+    if 'user_dir' in kwargs:
+        utils.import_user_module(argparse.Namespace(user_dir=kwargs['user_dir']))
+
+    models, args, task = checkpoint_utils.load_model_ensemble_and_task(
+        [os.path.join(model_path, cpt) for cpt in checkpoint_file.split(os.pathsep)],
+        arg_overrides=kwargs,
+    )
+    return {'args': args, 'task': task, 'models': models}
+
+
+class GeneratorHubInterface(nn.Module):
+    """
+    PyTorch Hub interface for generating sequences from a pre-trained
+    translation or language model.
+    """
+
+    def __init__(self, args, task, models):
+        super().__init__()
+        self.args = args
+        self.task = task
+        self.models = nn.ModuleList(models)
+        self.src_dict = task.source_dictionary
+        self.tgt_dict = task.target_dictionary
+
+        # optimize model for generation
+        for model in self.models:
+            model.prepare_for_inference_(args)
+
+        # Load alignment dictionary for unknown word replacement
+        # (None if no unknown word replacement, empty if no path to align dictionary)
+        self.align_dict = utils.load_align_dict(getattr(args, 'replace_unk', None))
+
+        self.tokenizer = encoders.build_tokenizer(args)
+        self.bpe = encoders.build_bpe(args)
+
+        self.max_positions = utils.resolve_max_positions(
+            self.task.max_positions(), *[model.max_positions() for model in models]
+        )
+
+        # this is useful for determining the device
+        self.register_buffer('_float_tensor', torch.tensor([0], dtype=torch.float))
+
+    @property
+    def device(self):
+        """Device of the registered ``_float_tensor`` buffer."""
+        return self._float_tensor.device
+
+    def translate(self, sentences: List[str], beam: int = 5, verbose: bool = False, **kwargs) -> List[str]:
+        """Same as :meth:`sample`, but with a default beam size of 5."""
+        return self.sample(sentences, beam, verbose, **kwargs)
+
+    def sample(self, sentences: List[str], beam: int = 1, verbose: bool = False, **kwargs) -> List[str]:
+        """Return the first hypothesis per sentence, decoded (a str for a str input)."""
+        if isinstance(sentences, str):
+            return self.sample([sentences], beam=beam, verbose=verbose, **kwargs)[0]
+        tokenized_sentences = [self.encode(sentence) for sentence in sentences]
+        batched_hypos = self.generate(tokenized_sentences, beam, verbose, **kwargs)
+        return [self.decode(hypos[0]['tokens']) for hypos in batched_hypos]
+
+    def score(self, sentences: List[str], **kwargs):
+        """Run generation with ``score_reference=True``; first hypothesis per sentence."""
+        if isinstance(sentences, str):
+            return self.score([sentences], **kwargs)[0]
+        # NOTE: this doesn't support translation tasks currently
+        tokenized_sentences = [self.encode(sentence) for sentence in sentences]
+        return [hypos[0] for hypos in self.generate(tokenized_sentences, score_reference=True, **kwargs)]
+
+    def generate(
+        self,
+        tokenized_sentences: List[torch.LongTensor],
+        beam: int = 5,
+        verbose: bool = False,
+        skip_invalid_size_inputs=False,
+        inference_step_args=None,
+        **kwargs
+    ) -> List[List[Dict[str, torch.Tensor]]]:
+        """
+        Generate hypotheses for already-binarized sentences.
+
+        A single 1-D tensor is treated as one sentence and its list of
+        hypotheses is returned directly. Extra ``kwargs`` are set on a copy
+        of ``self.args`` used to build the generator. Results are ordered by
+        sample id; with ``verbose`` they are also logged.
+        """
+        if torch.is_tensor(tokenized_sentences) and tokenized_sentences.dim() == 1:
+            # forward every option so a single sentence behaves like a batch of one
+            return self.generate(
+                tokenized_sentences.unsqueeze(0), beam=beam, verbose=verbose,
+                skip_invalid_size_inputs=skip_invalid_size_inputs,
+                inference_step_args=inference_step_args, **kwargs
+            )[0]
+
+        # build generator using current args as well as any kwargs
+        gen_args = copy.copy(self.args)
+        gen_args.beam = beam
+        for k, v in kwargs.items():
+            setattr(gen_args, k, v)
+        generator = self.task.build_generator(self.models, gen_args)
+
+        inference_step_args = inference_step_args or {}
+        results = []
+        for batch in self._build_batches(tokenized_sentences, skip_invalid_size_inputs):
+            batch = utils.apply_to_sample(lambda t: t.to(self.device), batch)
+            translations = self.task.inference_step(
+                generator, self.models, batch, **inference_step_args
+            )
+            results.extend(zip(batch["id"].tolist(), translations))
+
+        # sort output to match input order
+        outputs = [hypos for _, hypos in sorted(results, key=lambda x: x[0])]
+
+        if verbose:
+            def getarg(name, default):
+                return getattr(gen_args, name, getattr(self.args, name, default))
+
+            for source_tokens, target_hypotheses in zip(tokenized_sentences, outputs):
+                logger.info('S\t{}'.format(self.string(source_tokens)))
+                for hypo in target_hypotheses:
+                    hypo_str = self.decode(hypo['tokens'])
+                    logger.info('H\t{}\t{}'.format(hypo['score'], hypo_str))
+                    pos_scores = hypo['positional_scores'].tolist()
+                    logger.info('P\t{}'.format(' '.join('{:.4f}'.format(x) for x in pos_scores)))
+                    if hypo['alignment'] is not None and getarg('print_alignment', False):
+                        pairs = ['{}-{}'.format(src_idx, tgt_idx) for src_idx, tgt_idx in hypo['alignment']]
+                        logger.info('A\t{}'.format(' '.join(pairs)))
+        return outputs
+
+    def encode(self, sentence: str) -> torch.LongTensor:
+        """Tokenize, apply BPE and binarize a raw sentence."""
+        return self.binarize(self.apply_bpe(self.tokenize(sentence)))
+
+    def decode(self, tokens: torch.LongTensor) -> str:
+        """Convert tokens to a string, then remove BPE and detokenize."""
+        return self.detokenize(self.remove_bpe(self.string(tokens)))
+
+    def tokenize(self, sentence: str) -> str:
+        return sentence if self.tokenizer is None else self.tokenizer.encode(sentence)
+
+    def detokenize(self, sentence: str) -> str:
+        return sentence if self.tokenizer is None else self.tokenizer.decode(sentence)
+
+    def apply_bpe(self, sentence: str) -> str:
+        return sentence if self.bpe is None else self.bpe.encode(sentence)
+
+    def remove_bpe(self, sentence: str) -> str:
+        return sentence if self.bpe is None else self.bpe.decode(sentence)
+
+    def binarize(self, sentence: str) -> torch.LongTensor:
+        return self.src_dict.encode_line(sentence, add_if_not_exist=False).long()
+
+    def string(self, tokens: torch.LongTensor) -> str:
+        return self.tgt_dict.string(tokens)
+
+    def _build_batches(
+        self, tokens: List[List[int]], skip_invalid_size_inputs: bool
+    ) -> Iterator[Dict[str, Any]]:
+        """Batch ``tokens`` with the task's iterator (one unshuffled epoch)."""
+        lengths = torch.LongTensor([t.numel() for t in tokens])
+        return self.task.get_batch_iterator(
+            dataset=self.task.build_dataset_for_inference(tokens, lengths),
+            max_tokens=self.args.max_tokens,
+            max_sentences=self.args.max_sentences,
+            max_positions=self.max_positions,
+            ignore_invalid_inputs=skip_invalid_size_inputs,
+        ).next_epoch_itr(shuffle=False)
+
+
+class BPEHubInterface(object):
+    """PyTorch Hub interface for Byte-Pair Encoding (BPE)."""
+
+    def __init__(self, bpe, **kwargs):
+        super().__init__()
+        # the bpe name and any extra options travel in one namespace
+        self.bpe = encoders.build_bpe(argparse.Namespace(bpe=bpe, **kwargs))
+        assert self.bpe is not None
+
+    def encode(self, sentence: str) -> str:
+        return self.bpe.encode(sentence)
+
+    def decode(self, sentence: str) -> str:
+        return self.bpe.decode(sentence)
+
+
+class TokenizerHubInterface(object):
+    """PyTorch Hub interface for tokenization."""
+
+    def __init__(self, tokenizer, **kwargs):
+        super().__init__()
+        # the tokenizer name and any extra options travel in one namespace
+        self.tokenizer = encoders.build_tokenizer(argparse.Namespace(tokenizer=tokenizer, **kwargs))
+        assert self.tokenizer is not None
+
+    def encode(self, sentence: str) -> str:
+        return self.tokenizer.encode(sentence)
+
+    def decode(self, sentence: str) -> str:
+        return self.tokenizer.decode(sentence)
diff --git a/fairseq/incremental_decoding_utils.py b/fairseq/incremental_decoding_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..91128e8879673b57eab26b00680155a567e12907
--- /dev/null
+++ b/fairseq/incremental_decoding_utils.py
@@ -0,0 +1,50 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Dict, Optional
+import uuid
+
+from torch import Tensor
+
+
+class FairseqIncrementalState(object):
+    """Base class giving each instance a unique prefix for its incremental-state keys."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.init_incremental_state()
+
+    def init_incremental_state(self):
+        self._incremental_state_id = str(uuid.uuid4())
+
+    def _get_full_incremental_state_key(self, key: str) -> str:
+        return "{}.{}".format(self._incremental_state_id, key)
+
+    def get_incremental_state(
+        self, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]], key: str,
+    ) -> Optional[Dict[str, Optional[Tensor]]]:
+        """Return this instance's entry for `key`, or None if absent or no state dict."""
+        scoped_key = self._get_full_incremental_state_key(key)
+        if incremental_state is not None and scoped_key in incremental_state:
+            return incremental_state[scoped_key]
+        return None
+
+    def set_incremental_state(
+        self,
+        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]],
+        key: str,
+        value: Dict[str, Optional[Tensor]],
+    ) -> Optional[Dict[str, Dict[str, Optional[Tensor]]]]:
+        """Store `value` under this instance's key for `key`; no-op when state is None."""
+        if incremental_state is None:
+            return incremental_state
+        incremental_state[self._get_full_incremental_state_key(key)] = value
+        return incremental_state
+
+
+def with_incremental_state(cls):
+    """Class decorator: make FairseqIncrementalState the first base class of `cls`."""
+    cls.__bases__ = (FairseqIncrementalState, *(b for b in cls.__bases__ if b != FairseqIncrementalState))
+    return cls
diff --git a/fairseq/iterative_refinement_generator.py b/fairseq/iterative_refinement_generator.py
new file mode 100644
index 0000000000000000000000000000000000000000..c7a267d2589e3fa03404b778634b99dea7809426
--- /dev/null
+++ b/fairseq/iterative_refinement_generator.py
@@ -0,0 +1,315 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from collections import namedtuple
+
+import torch
+import numpy as np
+
+from fairseq import utils
+
+
+# Record type for the decoder state of one refinement iteration. The field
+# names match what IterativeRefinementGenerator.generate reads and rewrites
+# (via `_replace`) on the decoder output between iterations.
+DecoderOut = namedtuple('IterativeRefinementDecoderOut', [
+    'output_tokens',
+    'output_scores',
+    'attn',
+    'step',
+    'max_step',
+    'history'
+])
+
+
+class IterativeRefinementGenerator(object):
+    def __init__(
+        self,
+        tgt_dict,
+        models=None,
+        eos_penalty=0.0,
+        max_iter=10,
+        max_ratio=2,
+        beam_size=1,
+        decoding_format=None,
+        retain_dropout=False,
+        adaptive=True,
+        retain_history=False,
+        reranking=False,
+    ):
+        """
+        Set up a generator that translates by iterative refinement.
+
+        Args:
+            tgt_dict: target dictionary; supplies the bos/pad/unk/eos
+                indices and the vocabulary size
+            models: models stored for use by :func:`generate_batched_itr`
+            eos_penalty: if > 0.0, penalizes early stopping in decoding
+            max_iter: maximum number of refinement iterations
+            max_ratio: generate sequences of maximum length a*x, where x
+                is the source length and a is this ratio
+            beam_size: number of length candidates decoded per sentence
+            decoding_format: decoding mode in {'unigram', 'ensemble', 'vote', 'dp', 'bs'}
+            retain_dropout: keep dropout active during inference
+            adaptive: stop refining a sentence once its output repeats
+            retain_history: keep the tokens produced at every iteration
+            reranking: rescore the candidates with the last model
+        """
+        # special symbols and vocabulary size come from the dictionary
+        self.bos = tgt_dict.bos()
+        self.pad = tgt_dict.pad()
+        self.unk = tgt_dict.unk()
+        self.eos = tgt_dict.eos()
+        self.vocab_size = len(tgt_dict)
+
+        # models to decode with
+        self.models = models
+
+        # options forwarded to the decoder at every step
+        self.eos_penalty = eos_penalty
+        self.max_ratio = max_ratio
+        self.decoding_format = decoding_format
+
+        # options steering the refinement loop itself
+        self.max_iter = max_iter
+        self.beam_size = beam_size
+        self.adaptive = adaptive
+        self.retain_dropout = retain_dropout
+        self.retain_history = retain_history
+        self.reranking = reranking
+
+    def generate_batched_itr(
+        self,
+        data_itr,
+        maxlen_a=None,
+        maxlen_b=None,
+        cuda=False,
+        timer=None,
+        prefix_size=0,
+    ):
+        """Translate every batch of *data_itr* and yield one result per sentence.
+
+        Batches without a ``"net_input"`` entry are skipped. For each
+        remaining sentence a tuple ``(id, src, ref, hypos)`` is yielded,
+        with padding stripped from ``src`` and ``ref``.
+
+        Args:
+            maxlen_a/b: accepted but not read by this method.
+            cuda: accepted but not read by this method.
+            timer: optional StopwatchMeter; started before and stopped
+                (with the batch's ``ntokens``) after each generation.
+            prefix_size: if > 0, the first *prefix_size* target tokens are
+                passed to :func:`generate` as ``prefix_tokens``.
+        """
+        timed = timer is not None
+
+        for sample in data_itr:
+            if "net_input" not in sample:
+                continue
+
+            if timed:
+                timer.start()
+            with torch.no_grad():
+                if prefix_size > 0:
+                    prefix_tokens = sample["target"][:, :prefix_size]
+                else:
+                    prefix_tokens = None
+                hypos = self.generate(self.models, sample, prefix_tokens=prefix_tokens)
+            if timed:
+                timer.stop(sample["ntokens"])
+
+            for row, sample_id in enumerate(sample["id"]):
+                # drop padding from source and reference before handing them out
+                src = utils.strip_pad(sample["net_input"]["src_tokens"][row, :], self.pad)
+                ref = utils.strip_pad(sample["target"][row, :], self.pad)
+                yield sample_id, src, ref, hypos[row]
+
+
+    @torch.no_grad()
+    def generate(self, models, sample, prefix_tokens=None):
+        """Run iterative-refinement decoding on one batch.
+
+        Args:
+            models: list of models; ``models[0]`` drives decoding. When
+                ``self.reranking`` is set, the last entry is used as the
+                reranker and removed from the list.
+            sample: batch dict; ``sample["net_input"]["src_tokens"]`` and
+                ``sample["net_input"]["src_lengths"]`` are read.
+            prefix_tokens: accepted but not referenced anywhere in this
+                method.
+
+        Returns:
+            A list with one entry per source sentence. Each entry is a
+            one-element list holding the hypothesis dict built by
+            ``finalized_hypos`` (keys ``steps``, ``tokens``,
+            ``positional_scores``, ``score``, ``hypo_attn``, ``alignment``,
+            plus ``history`` when ``self.retain_history`` is set).
+        """
+
+        # TODO: iterative refinement generator does not support ensemble for now.
+        if not self.retain_dropout:
+            for model in models:
+                model.eval()
+
+        model, reranker = models[0], None
+        if self.reranking:
+            assert len(models) > 1, "Assuming the last checkpoint is the reranker"
+            assert self.beam_size > 1, "Reranking requires multiple translation for each example"
+
+            reranker = models[-1]
+            models = models[:-1]
+
+        if len(models) > 1 and hasattr(model, 'enable_ensemble'):
+            assert model.allow_ensemble, "{} does not support ensembling".format(model.__class__.__name__)
+            model.enable_ensemble(models)
+
+        # TODO: better encoder inputs?
+        src_tokens = sample["net_input"]["src_tokens"]
+        src_lengths = sample["net_input"]["src_lengths"]
+        bsz, src_len = src_tokens.size()
+
+        # initialize
+        encoder_out = model.forward_encoder([src_tokens, src_lengths])
+        prev_decoder_out = model.initialize_output_tokens(encoder_out, src_tokens)
+
+        if self.beam_size > 1:
+            assert model.allow_length_beam, \
+                "{} does not support decoding with length beam.".format(model.__class__.__name__)
+
+            # regenerate data based on length-beam
+            # presumably the index is [0, 0, ..., 1, 1, ...] so every sentence is
+            # repeated beam_size times -- confirm against utils.new_arange
+            length_beam_order = utils.new_arange(src_tokens, self.beam_size, bsz).t().reshape(-1)
+            encoder_out = model.encoder.reorder_encoder_out(encoder_out, length_beam_order)
+            prev_decoder_out = model.regenerate_length_beam(prev_decoder_out, self.beam_size)
+            bsz = bsz * self.beam_size
+
+        # sent_idxs maps each row that is still being decoded to its slot in
+        # `finalized`; it shrinks together with the batch below.
+        # NOTE(review): torch.arange is called without a device here while it is
+        # later indexed with `terminated`, which is derived from the decoder
+        # output -- confirm this works when the model is not on the default device.
+        sent_idxs = torch.arange(bsz)
+        prev_output_tokens = prev_decoder_out.output_tokens.clone()
+
+        if self.retain_history:
+            prev_decoder_out = prev_decoder_out._replace(history=[prev_output_tokens])
+
+        finalized = [[] for _ in range(bsz)]
+
+        def is_a_loop(x, y, s, a):
+            """Compare previous tokens *x* with new tokens *y*, row by row.
+
+            The shorter of *x* and *y* is right-padded with ``self.pad`` to
+            the other's length; when *y* is the shorter one, the scores *s*
+            and (if not None) the attention *a* are zero-padded to match.
+            Returns ``(rows_equal, y, s, a)`` where ``rows_equal`` is True for
+            rows whose tokens did not change.
+            """
+            b, l_x, l_y = x.size(0), x.size(1), y.size(1)
+            if l_x > l_y:
+                y = torch.cat([y, x.new_zeros(b, l_x - l_y).fill_(self.pad)], 1)
+                s = torch.cat([s, s.new_zeros(b, l_x - l_y)], 1)
+                if a is not None:
+                    a = torch.cat([a, a.new_zeros(b, l_x - l_y, a.size(2))], 1)
+            elif l_x < l_y:
+                x = torch.cat([x, y.new_zeros(b, l_y - l_x).fill_(self.pad)], 1)
+            return (x == y).all(1), y, s, a
+
+        def finalized_hypos(step, prev_out_token, prev_out_score, prev_out_attn):
+            """Build the hypothesis dict for one sentence, dropping pad positions.
+
+            ``score`` is the mean of the non-pad positional scores and
+            ``alignment`` the argmax of the attention over dim 1; either is
+            None when the corresponding input is None.
+            """
+            cutoff = prev_out_token.ne(self.pad)
+            tokens = prev_out_token[cutoff]
+            if prev_out_score is None:
+                scores, score = None, None
+            else:
+                scores = prev_out_score[cutoff]
+                score = scores.mean()
+
+            if prev_out_attn is None:
+                hypo_attn, alignment = None, None
+            else:
+                hypo_attn = prev_out_attn[cutoff]
+                alignment = hypo_attn.max(dim=1)[1]
+            return {
+                "steps": step,
+                "tokens": tokens,
+                "positional_scores": scores,
+                "score": score,
+                "hypo_attn": hypo_attn,
+                "alignment": alignment,
+            }
+
+        # at most max_iter + 1 decoder passes; the last one forces termination
+        for step in range(self.max_iter + 1):
+
+            decoder_options = {
+                "eos_penalty": self.eos_penalty,
+                "max_ratio": self.max_ratio,
+                "decoding_format": self.decoding_format,
+            }
+            prev_decoder_out = prev_decoder_out._replace(
+                step=step,
+                max_step=self.max_iter + 1,
+            )
+
+            decoder_out = model.forward_decoder(
+                prev_decoder_out, encoder_out, **decoder_options
+            )
+
+            if self.adaptive:
+                # terminate if there is a loop
+                terminated, out_tokens, out_scores, out_attn = is_a_loop(
+                    prev_output_tokens, decoder_out.output_tokens, decoder_out.output_scores, decoder_out.attn
+                )
+                decoder_out = decoder_out._replace(
+                    output_tokens=out_tokens,
+                    output_scores=out_scores,
+                    attn=out_attn,
+                )
+
+            else:
+                # no early stopping: nothing terminates before the last step
+                terminated = decoder_out.output_tokens.new_zeros(decoder_out.output_tokens.size(0)).bool()
+
+            if step == self.max_iter:  # reach last iteration, terminate
+                terminated.fill_(1)
+
+            # collect finalized sentences
+            finalized_idxs = sent_idxs[terminated]
+            finalized_tokens = decoder_out.output_tokens[terminated]
+            finalized_scores = decoder_out.output_scores[terminated]
+            finalized_attn = (
+                None if (decoder_out.attn is None or decoder_out.attn.size(0) == 0) else decoder_out.attn[terminated]
+            )
+
+            if self.retain_history:
+                finalized_history_tokens = [h[terminated] for h in decoder_out.history]
+
+            for i in range(finalized_idxs.size(0)):
+                finalized[finalized_idxs[i]] = [
+                    finalized_hypos(
+                        step,
+                        finalized_tokens[i],
+                        finalized_scores[i],
+                        None if finalized_attn is None else finalized_attn[i],
+                    )
+                ]
+
+                if self.retain_history:
+                    # history entries carry tokens only (no scores, no attention)
+                    finalized[finalized_idxs[i]][0]['history'] = []
+                    for j in range(len(finalized_history_tokens)):
+                        finalized[finalized_idxs[i]][0]['history'].append(
+                            finalized_hypos(
+                                step,
+                                finalized_history_tokens[j][i],
+                                None, None
+                            )
+                        )
+
+            # check if all terminated
+            if terminated.sum() == terminated.size(0):
+                break
+
+            # for next step
+            # keep only the rows that have not terminated, in decoder state,
+            # encoder output and the row -> sentence index map alike
+            not_terminated = ~terminated
+            prev_decoder_out = decoder_out._replace(
+                output_tokens=decoder_out.output_tokens[not_terminated],
+                output_scores=decoder_out.output_scores[not_terminated],
+                attn=decoder_out.attn[not_terminated]
+                if (decoder_out.attn is not None and decoder_out.attn.size(0) > 0)
+                else None,
+                history=[h[not_terminated] for h in decoder_out.history]
+                if decoder_out.history is not None
+                else None,
+            )
+            encoder_out = model.encoder.reorder_encoder_out(encoder_out, not_terminated.nonzero().squeeze())
+            sent_idxs = sent_idxs[not_terminated]
+            prev_output_tokens = prev_decoder_out.output_tokens.clone()
+
+        if self.beam_size > 1:
+            if reranker is not None:
+                finalized = self.rerank(
+                    reranker, finalized, [src_tokens, src_lengths], self.beam_size
+                )
+
+            # aggregate information from length beam
+            # candidates of sentence i occupy slots [beam_size * i, beam_size * (i + 1));
+            # keep the one with the highest 'score'
+            finalized = [
+                finalized[np.argmax(
+                    [finalized[self.beam_size * i + j][0]['score'] for j in range(self.beam_size)]
+                    ) + self.beam_size * i] for i in range(len(finalized) // self.beam_size)
+                ]
+
+        return finalized
+
+    def rerank(self, reranker, finalized, encoder_input, beam_size):
+        """Replace the ``score`` of every hypothesis with a reranker score.
+
+        Args:
+            reranker: model exposing ``encoder``, ``decoder`` and
+                ``get_normalized_probs``.
+            finalized: list of one-element lists of hypothesis dicts; each
+                dict's ``'tokens'`` is read and its ``'score'`` overwritten.
+            encoder_input: positional arguments for ``reranker.encoder``.
+            beam_size: number of candidates per source sentence.
+
+        Returns:
+            The same *finalized* list, modified in place.
+        """
+
+        def rebuild_batch(finalized):
+            """Stack all hypothesis tokens into one 2-D tensor, right-padded with ``self.pad``."""
+            finalized_tokens = [f[0]['tokens'] for f in finalized]
+            finalized_maxlen = max(f.size(0) for f in finalized_tokens)
+            final_output_tokens = finalized_tokens[0].new_zeros(len(finalized_tokens), finalized_maxlen).fill_(self.pad)
+            for i, f in enumerate(finalized_tokens):
+                final_output_tokens[i, :f.size(0)] = f
+            return final_output_tokens
+
+        final_output_tokens = rebuild_batch(finalized)
+        final_output_tokens[:, 0] = self.eos  # autoregressive model assumes starting with EOS
+
+        # encode the sources once, then repeat the encoder output per candidate
+        # (assumes encoder_out is laid out with the batch on dim 1 -- TODO confirm)
+        reranker_encoder_out = reranker.encoder(*encoder_input)
+        length_beam_order = utils.new_arange(
+            final_output_tokens, beam_size, reranker_encoder_out.encoder_out.size(1)).t().reshape(-1)
+        reranker_encoder_out = reranker.encoder.reorder_encoder_out(reranker_encoder_out, length_beam_order)
+        # teacher-forced pass: feed tokens[:-1], pick the value assigned to tokens[1:]
+        # (the `True` argument presumably requests log-probabilities -- confirm)
+        reranking_scores = reranker.get_normalized_probs(
+            reranker.decoder(final_output_tokens[:, :-1], reranker_encoder_out), True, None)
+        reranking_scores = reranking_scores.gather(2, final_output_tokens[:, 1:, None])
+        # zero out pad positions and average over the non-pad length
+        reranking_masks = final_output_tokens[:, 1:].ne(self.pad)
+        reranking_scores = reranking_scores[:, :, 0].masked_fill_(~reranking_masks, 0).sum(1)
+        reranking_scores = reranking_scores / reranking_masks.sum(1).type_as(reranking_scores)
+
+        for i in range(len(finalized)):
+            finalized[i][0]['score'] = reranking_scores[i]
+
+        return finalized
diff --git a/fairseq/legacy_distributed_data_parallel.py b/fairseq/legacy_distributed_data_parallel.py
new file mode 100644
index 0000000000000000000000000000000000000000..9832f2c97a43dda2cf01ce5f467cdc60aa439207
--- /dev/null
+++ b/fairseq/legacy_distributed_data_parallel.py
@@ -0,0 +1,170 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+A modified version of the legacy DistributedDataParallel module that uses c10d
+communication primitives. This version is simpler than the latest PyTorch
+version and is useful for debugging. Notably it does not overlap gradient
+communication with the backward pass, which makes it slower but more robust
+than the PyTorch version.
+
+This version also supports the *no_sync* context manager, which allows faster
+training with `--update-freq`.
+"""
+
+from collections import OrderedDict
+from contextlib import contextmanager
+import copy
+
+import torch
+from torch import nn
+from torch.autograd import Variable
+
+from . import distributed_utils
+
+
+class LegacyDistributedDataParallel(nn.Module):
+    """Implements distributed data parallelism at the module level.
+
+    A simplified version of :class:`torch.nn.parallel.DistributedDataParallel`.
+    This version uses a c10d process group for communication and does not
+    broadcast buffers.
+
+    Args:
+        module (~torch.nn.Module): module to be parallelized
+        world_size (int): number of parallel workers
+        process_group (optional): the c10d process group to be used for
+            distributed data all-reduction. If None, the default process group
+            will be used.
+        buffer_size (int, optional): number of elements to buffer before
+            performing all-reduce (default: 256M).
+    """
+
+    def __init__(self, module, world_size, process_group=None, buffer_size=2**28):
+        """Wrap *module*; see the class docstring for the arguments."""
+        super().__init__()
+
+        self.module = module
+        self.world_size = world_size
+        self.process_group = process_group
+
+        # The flat communication buffer never needs more elements than the
+        # model has parameters; it is allocated lazily on first all-reduce.
+        total_param_elements = sum(p.numel() for p in module.parameters())
+        self.buffer_size = min(buffer_size, total_param_elements)
+        self.buffer = None
+
+        # When True, all_reduce() becomes a no-op so gradients keep
+        # accumulating locally until a later synchronization.
+        self.accumulate_grads = False
+
+        # Group the parameters by the device they live on, preserving order.
+        params_by_device = OrderedDict()
+        for param in self.module.parameters():
+            params_by_device.setdefault(param.device, []).append(param)
+        self.per_device_params = list(params_by_device.values())
+
+
+    def __getstate__(self):
+        """Return a shallow copy of the instance ``__dict__`` for pickling."""
+        return copy.copy(self.__dict__)
+
+    def __setstate__(self, state):
+        """Restore the instance from *state* by delegating to the parent class."""
+        super().__setstate__(state)
+
+    @contextmanager
+    def no_sync(self):
+        """A context manager to disable gradient synchronization.
+
+        While the context is active ``accumulate_grads`` is True, which
+        makes :func:`all_reduce` return without communicating. The previous
+        value is restored on exit, including when the body raises, so a
+        failed step cannot leave synchronization switched off.
+        """
+        old_accumulate_grads = self.accumulate_grads
+        self.accumulate_grads = True
+        try:
+            yield
+        finally:
+            self.accumulate_grads = old_accumulate_grads
+
+    def forward(self, *inputs, **kwargs):
+        """Call the wrapped module with the given arguments and return its result."""
+        return self.module(*inputs, **kwargs)
+
+    def all_reduce(self):
+        """
+        This function must be called explicitly after backward to reduce
+        gradients. There is no automatic hook like c10d.
+
+        Does nothing while ``accumulate_grads`` is True. Otherwise the
+        gradients of each device's parameters are packed into a flat buffer
+        of at most ``buffer_size`` elements, divided by ``world_size``,
+        all-reduced and copied back; parameters larger than the buffer are
+        all-reduced on their own.
+        """
+
+        def all_reduce_params(params):
+            # Pack the grads of `params` into one buffer, all-reduce it, then
+            # scatter the result back into each parameter's .grad.
+            buffer = self.buffer
+            nonzero_buffer = False
+            if len(params) > 1:
+                offset = 0
+                for p in params:
+                    sz = p.numel()
+                    if p.grad is not None:
+                        buffer[offset:offset+sz].copy_(p.grad.data.view(-1))
+                        nonzero_buffer = True
+                    else:
+                        # no gradient for this parameter: contribute zeros
+                        buffer[offset:offset+sz].zero_()
+                    offset += sz
+            else:
+                # we only have a single grad to all-reduce
+                p = params[0]
+                if p.grad is not None:
+                    # reduce the gradient tensor in place, no packing needed
+                    buffer = p.grad.data
+                    nonzero_buffer = True
+                elif p.numel() <= self.buffer.numel():
+                    buffer = buffer[:p.numel()]
+                    buffer.zero_()
+                else:
+                    buffer = torch.zeros_like(p)
+
+            # pre-divide by world_size; skipped when every contribution is zero
+            if nonzero_buffer:
+                buffer.div_(self.world_size)
+
+            distributed_utils.all_reduce(buffer, self.process_group)
+
+            # copy all-reduced grads back into their original place
+            offset = 0
+            for p in params:
+                sz = p.numel()
+                if p.grad is not None:
+                    p.grad.data.copy_(buffer[offset:offset+sz].view_as(p))
+                else:
+                    p.grad = buffer[offset:offset+sz].view_as(p).clone()
+                offset += sz
+
+        def reduction_fn():
+            # This function only needs to be called once
+            if self.accumulate_grads:
+                return
+
+            # allocate the flat buffer lazily, as the same tensor type as the
+            # first parameter
+            if self.buffer is None:
+                self.buffer = next(self.module.parameters()).new(self.buffer_size)
+
+            for params in self.per_device_params:
+                # All-reduce the gradients in buckets
+                offset = 0
+                buffered_params = []
+                for param in params:
+                    if not param.requires_grad:
+                        continue
+                    if param.grad is None:
+                        param.grad = torch.zeros_like(param)
+                    if param.grad.requires_grad:
+                        raise RuntimeError("DistributedDataParallel only works "
+                                           "with gradients that don't require "
+                                           "grad")
+                    sz = param.numel()
+                    if sz > self.buffer.numel():
+                        # all-reduce big params directly
+                        all_reduce_params([param])
+                    else:
+                        if offset + sz > self.buffer.numel():
+                            # the current bucket is full: flush it first
+                            all_reduce_params(buffered_params)
+                            offset = 0
+                            buffered_params.clear()
+                        buffered_params.append(param)
+                        offset += sz
+
+                # flush whatever is left in the last, partially filled bucket
+                if len(buffered_params) > 0:
+                    all_reduce_params(buffered_params)
+
+        reduction_fn()
diff --git a/fairseq/libbleu.cpython-310-x86_64-linux-gnu.so b/fairseq/libbleu.cpython-310-x86_64-linux-gnu.so
new file mode 100644
index 0000000000000000000000000000000000000000..c4b4489d1ea9f43ec7fa2c50ec0ff337eb50ac4c
Binary files /dev/null and b/fairseq/libbleu.cpython-310-x86_64-linux-gnu.so differ
diff --git a/fairseq/logging/__init__.py b/fairseq/logging/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/fairseq/logging/__pycache__/__init__.cpython-310.pyc b/fairseq/logging/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..24e0482907e608a5f9b1dafd593fb91bc2b84567
Binary files /dev/null and b/fairseq/logging/__pycache__/__init__.cpython-310.pyc differ
diff --git a/fairseq/logging/__pycache__/meters.cpython-310.pyc b/fairseq/logging/__pycache__/meters.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ff6282099c6a88842b8eec01843fa948ac992729
Binary files /dev/null and b/fairseq/logging/__pycache__/meters.cpython-310.pyc differ
diff --git a/fairseq/logging/__pycache__/metrics.cpython-310.pyc b/fairseq/logging/__pycache__/metrics.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..930920f6fde1973d76d4d1173fa123fa100e008e
Binary files /dev/null and b/fairseq/logging/__pycache__/metrics.cpython-310.pyc differ
diff --git a/fairseq/logging/__pycache__/progress_bar.cpython-310.pyc b/fairseq/logging/__pycache__/progress_bar.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c4b3e21466ae4436f98af143e095b9dc4d58964a
Binary files /dev/null and b/fairseq/logging/__pycache__/progress_bar.cpython-310.pyc differ
diff --git a/fairseq/logging/meters.py b/fairseq/logging/meters.py
new file mode 100644
index 0000000000000000000000000000000000000000..78e6d4d224d8a490a02ecdff1487186b830ecb3b
--- /dev/null
+++ b/fairseq/logging/meters.py
@@ -0,0 +1,286 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import bisect
+from collections import OrderedDict
+import time
+from typing import Dict, Optional
+
+# torch is an optional dependency of this module: when it is missing,
+# `torch` is bound to None and `type_as` degrades to the identity.
+try:
+    import torch
+
+    def type_as(a, b):
+        """Return ``a.to(b)`` when both *a* and *b* are tensors, else *a* unchanged."""
+        if torch.is_tensor(a) and torch.is_tensor(b):
+            return a.to(b)
+        else:
+            return a
+except ImportError:
+    torch = None
+
+    def type_as(a, b):
+        """Fallback used without torch: return *a* unchanged."""
+        return a
+
+
+try:
+    import numpy as np
+except ImportError:
+    np = None
+
+
+class Meter(object):
+    """Base class for Meters.
+
+    Subclasses must implement :func:`reset` and :attr:`smoothed_value`;
+    the state-dict methods default to "no state".
+    """
+
+    def __init__(self):
+        """The base class keeps no state of its own."""
+
+    def state_dict(self):
+        """Return the meter's serializable state (empty for the base class)."""
+        return dict()
+
+    def load_state_dict(self, state_dict):
+        """Restore the meter from *state_dict* (a no-op for the base class)."""
+        return None
+
+    def reset(self):
+        """Clear all accumulated values."""
+        raise NotImplementedError
+
+    @property
+    def smoothed_value(self) -> float:
+        """Smoothed value used for logging."""
+        raise NotImplementedError
+
+
+def safe_round(number, ndigits):
+    """Round *number* to *ndigits* when possible, otherwise return it as is.
+
+    Objects implementing ``__round__`` are rounded directly. One-element
+    torch tensors and zero-dimensional numpy values are first unwrapped
+    with ``.item()``. Anything else is returned untouched.
+    """
+    if hasattr(number, '__round__'):
+        return round(number, ndigits)
+
+    if torch is not None and torch.is_tensor(number) and number.numel() == 1:
+        scalar = number.item()
+    elif np is not None and np.ndim(number) == 0 and hasattr(number, 'item'):
+        scalar = number.item()
+    else:
+        return number
+    return safe_round(scalar, ndigits)
+
+
+class AverageMeter(Meter):
+    """Tracks the most recent value and the weighted running average."""
+
+    def __init__(self, round: Optional[int] = None):
+        self.round = round
+        self.reset()
+
+    def reset(self):
+        """Forget every update seen so far."""
+        self.val = None  # most recent update
+        self.sum = 0  # sum from all updates
+        self.count = 0  # total n from all updates
+
+    def update(self, val, n=1):
+        """Record *val* with weight *n*; ``None`` values are ignored.
+
+        A non-positive *n* still updates the most recent value but does
+        not contribute to the average.
+        """
+        if val is None:
+            return
+        self.val = val
+        if n > 0:
+            self.sum = type_as(self.sum, val) + (val * n)
+            self.count = type_as(self.count, n) + n
+
+    def state_dict(self):
+        return {name: getattr(self, name) for name in ('val', 'sum', 'count', 'round')}
+
+    def load_state_dict(self, state_dict):
+        self.val = state_dict['val']
+        self.sum = state_dict['sum']
+        self.count = state_dict['count']
+        # 'round' may be absent from the state dict; default to None then
+        self.round = state_dict.get('round')
+
+    @property
+    def avg(self):
+        """Weighted mean of all updates, or the latest value if none counted."""
+        if self.count > 0:
+            return self.sum / self.count
+        return self.val
+
+    @property
+    def smoothed_value(self) -> float:
+        """The average, rounded to ``self.round`` digits when that is set."""
+        average = self.avg
+        if average is None or self.round is None:
+            return average
+        return safe_round(average, self.round)
+
+
+class TimeMeter(Meter):
+    """Measures how often an event occurs per second of elapsed time."""
+
+    def __init__(
+        self,
+        init: int = 0,
+        n: int = 0,
+        round: Optional[int] = None,
+    ):
+        self.round = round
+        self.reset(init, n)
+
+    def reset(self, init=0, n=0):
+        """Restart the clock, optionally seeding elapsed time and event count."""
+        self.init = init  # seconds already elapsed before `start`
+        self.start = time.perf_counter()
+        self.n = n  # number of events so far
+        self.i = 0  # number of update() calls since reset
+
+    def update(self, val=1):
+        """Count *val* more events."""
+        self.n = type_as(self.n, val) + val
+        self.i += 1
+
+    def state_dict(self):
+        # the elapsed time is stored as 'init' so that a reloaded meter
+        # continues from where this one left off
+        return {
+            'init': self.elapsed_time,
+            'n': self.n,
+            'round': self.round,
+        }
+
+    def load_state_dict(self, state_dict):
+        if 'start' not in state_dict:
+            self.reset(init=state_dict['init'], n=state_dict['n'])
+            self.round = state_dict.get('round')
+            return
+        # backwards compatibility for old state_dicts
+        self.reset(init=state_dict['init'])
+
+    @property
+    def avg(self):
+        """Events per second since the clock was (re)started."""
+        return self.n / self.elapsed_time
+
+    @property
+    def elapsed_time(self):
+        """Seeded time plus the seconds passed since the last reset."""
+        since_start = time.perf_counter() - self.start
+        return self.init + since_start
+
+    @property
+    def smoothed_value(self) -> float:
+        """The rate, rounded to ``self.round`` digits when that is set."""
+        rate = self.avg
+        if rate is None or self.round is None:
+            return rate
+        return safe_round(rate, self.round)
+
+
+class StopwatchMeter(Meter):
+    """Accumulates the total and average duration of an event, in seconds."""
+
+    def __init__(self, round: Optional[int] = None):
+        self.round = round
+        self.sum = 0
+        self.n = 0
+        self.start_time = None
+
+    def start(self):
+        """Remember the current time as the start of an interval."""
+        self.start_time = time.perf_counter()
+
+    def stop(self, n=1, prehook=None):
+        """Add the time since :func:`start` to the total and count *n* events.
+
+        Does nothing if the stopwatch was never started. *prehook*, when
+        given, is called before the elapsed time is measured.
+        """
+        if self.start_time is None:
+            return
+        if prehook is not None:
+            prehook()
+        elapsed = time.perf_counter() - self.start_time
+        self.sum = self.sum + elapsed
+        self.n = type_as(self.n, n) + n
+
+    def reset(self):
+        """Zero the totals and start a new interval immediately."""
+        self.sum = 0  # cumulative time during which stopwatch was active
+        self.n = 0  # total n across all start/stop
+        self.start()
+
+    def state_dict(self):
+        return {name: getattr(self, name) for name in ('sum', 'n', 'round')}
+
+    def load_state_dict(self, state_dict):
+        self.sum = state_dict['sum']
+        self.n = state_dict['n']
+        self.start_time = None
+        self.round = state_dict.get('round')
+
+    @property
+    def avg(self):
+        """Mean duration per event, or the raw total when no event was counted."""
+        if self.n > 0:
+            return self.sum / self.n
+        return self.sum
+
+    @property
+    def elapsed_time(self):
+        """Seconds since the last :func:`start`, or 0 if never started."""
+        if self.start_time is not None:
+            return time.perf_counter() - self.start_time
+        return 0.
+
+    @property
+    def smoothed_value(self) -> float:
+        """Average duration, or the running interval while nothing is recorded."""
+        if self.sum > 0:
+            val = self.avg
+        else:
+            val = self.elapsed_time
+        if val is None or self.round is None:
+            return val
+        return safe_round(val, self.round)
+
+
+class MetersDict(OrderedDict):
+    """A sorted dictionary of :class:`Meters`.
+
+    Meters are sorted according to a priority that is given when the
+    meter is first added to the dictionary.
+
+    Items are assigned as ``(priority, meter)`` pairs, either through
+    :func:`add_meter` or ``d[key] = (priority, meter)``; lookups return
+    the meter alone.
+    """
+
+    def __init__(self, *args, **kwargs):
+        # `priorities` has to exist before OrderedDict.__init__ runs: initial
+        # items are inserted through __setitem__, which reads it.
+        self.priorities = []
+        super().__init__(*args, **kwargs)
+
+    def __setitem__(self, key, value):
+        """Insert ``value = (priority, meter)`` under a not yet used *key*."""
+        assert key not in self, "MetersDict doesn't support reassignment"
+        priority, value = value
+        # the insertion index breaks ties so equal priorities keep their order
+        bisect.insort(self.priorities, (priority, len(self.priorities), key))
+        super().__setitem__(key, value)
+        for _, _, ordered_key in self.priorities:  # reorder dict to match priorities
+            self.move_to_end(ordered_key)
+
+    def add_meter(self, key, meter, priority):
+        """Register *meter* under *key* with the given sort *priority*."""
+        self.__setitem__(key, (priority, meter))
+
+    def state_dict(self):
+        """Return ``(priority, key, class name, meter state)`` for each meter."""
+        return [
+            (pri, key, self[key].__class__.__name__, self[key].state_dict())
+            for pri, _, key in self.priorities
+            # can't serialize DerivedMeter instances
+            if not isinstance(self[key], MetersDict._DerivedMeter)
+        ]
+
+    def load_state_dict(self, state_dict):
+        """Replace the contents with meters rebuilt from *state_dict*.
+
+        Meter classes are looked up by name among this module's globals
+        and instantiated without arguments.
+        """
+        self.clear()
+        self.priorities.clear()
+        for pri, key, meter_cls, meter_state in state_dict:
+            meter = globals()[meter_cls]()
+            meter.load_state_dict(meter_state)
+            self.add_meter(key, meter, pri)
+
+    def get_smoothed_value(self, key: str) -> float:
+        """Get a single smoothed value."""
+        meter = self[key]
+        if isinstance(meter, MetersDict._DerivedMeter):
+            # derived meters compute their value from the whole dictionary
+            return meter.fn(self)
+        return meter.smoothed_value
+
+    def get_smoothed_values(self) -> Dict[str, float]:
+        """Get all smoothed values, skipping keys that start with "_"."""
+        return OrderedDict([
+            (key, self.get_smoothed_value(key))
+            for key in self.keys()
+            if not key.startswith("_")
+        ])
+
+    def reset(self):
+        """Reset Meter instances."""
+        for meter in self.values():
+            if isinstance(meter, MetersDict._DerivedMeter):
+                continue
+            meter.reset()
+
+    class _DerivedMeter(Meter):
+        """A Meter whose values are derived from other Meters."""
+
+        def __init__(self, fn):
+            self.fn = fn
+
+        def reset(self):
+            pass
diff --git a/fairseq/logging/metrics.py b/fairseq/logging/metrics.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ca1d201e02c57846d0ec810dbd175fd5fd8023d
--- /dev/null
+++ b/fairseq/logging/metrics.py
@@ -0,0 +1,291 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+A standalone module for aggregating metrics.
+
+Metrics can be logged from anywhere using the `log_*` functions defined
+in this module. The logged values will be aggregated dynamically based
+on the aggregation context in which the logging occurs. See the
+:func:`aggregate` context manager for more details.
+"""
+
+from collections import defaultdict, OrderedDict
+import contextlib
+import time
+from typing import Callable, Dict, List, Optional
+import uuid
+
+from .meters import *
+
+
+# Aggregation contexts are considered "active" when inside the scope
+# created by the :func:`aggregate` context manager.
+_aggregators = OrderedDict()
+_active_aggregators = OrderedDict()
+_active_aggregators_cnt = defaultdict(lambda: 0)
+
+
+def reset() -> None:
+    """Drop all aggregators and re-create the always-active "default" one."""
+    for registry in (_aggregators, _active_aggregators, _active_aggregators_cnt):
+        registry.clear()
+
+    # The "default" aggregator observes all logged values, so activate it now.
+    default_agg = MetersDict()
+    _aggregators["default"] = default_agg
+    _active_aggregators["default"] = default_agg
+    _active_aggregators_cnt["default"] = 1
+
+
+reset()
+
+
+@contextlib.contextmanager
+def aggregate(name: Optional[str] = None, new_root: bool = False):
+    """Context manager to aggregate metrics under a given name.
+
+    Aggregations can be nested. If *new_root* is ``False``, then logged
+    metrics will be recorded along the entire stack of nested
+    aggregators, including a global "default" aggregator. If *new_root*
+    is ``True``, then this aggregator will be the root of a new
+    aggregation stack, thus bypassing any parent aggregators.
+
+    Note that aggregation contexts are uniquely identified by their
+    *name* (e.g., train, valid). Creating a context with an existing
+    name will reuse the corresponding :class:`MetersDict` instance.
+    If no name is given, then a temporary aggregator will be created.
+
+    Usage::
+
+        with metrics.aggregate("train"):
+            for step, batch in enumerate(epoch):
+                with metrics.aggregate("train_inner") as agg:
+                    metrics.log_scalar("loss", get_loss(batch))
+                    if step % log_interval == 0:
+                        print(agg.get_smoothed_value("loss"))
+                        agg.reset()
+        print(metrics.get_smoothed_values("train")["loss"])
+
+    Args:
+        name (str): name of the aggregation. Defaults to a
+            random/temporary name if not given explicitly.
+        new_root (bool): make this aggregation the root of a new
+            aggregation stack.
+    """
+    if name is None:
+        # generate a temporary name; temporary aggregators are not registered
+        name = str(uuid.uuid4())
+        assert name not in _aggregators
+        agg = MetersDict()
+    else:
+        assert name != "default"
+        agg = _aggregators.setdefault(name, MetersDict())
+
+    if new_root:
+        backup_aggregators = _active_aggregators.copy()
+        _active_aggregators.clear()
+        backup_aggregators_cnt = _active_aggregators_cnt.copy()
+        _active_aggregators_cnt.clear()
+
+    _active_aggregators[name] = agg
+    _active_aggregators_cnt[name] += 1
+    try:  # unwind in ``finally`` so an exception in the body cannot leak state
+        yield agg
+    finally:
+        _active_aggregators_cnt[name] -= 1
+        if _active_aggregators_cnt[name] == 0 and name in _active_aggregators:
+            del _active_aggregators[name]
+
+        if new_root:
+            _active_aggregators.clear()
+            _active_aggregators.update(backup_aggregators)
+            _active_aggregators_cnt.clear()
+            _active_aggregators_cnt.update(backup_aggregators_cnt)
+
+
+def get_active_aggregators() -> List[MetersDict]:
+    return list(_active_aggregators.values())
+
+
+def log_scalar(
+    key: str,
+    value: float,
+    weight: float = 1,
+    priority: int = 10,
+    round: Optional[int] = None,
+):
+    """Log a scalar value.
+
+    Args:
+        key (str): name of the field to log
+        value (float): value to log
+        weight (float): weight that this value contributes to the average.
+            A weight of 0 will always log the latest value.
+        priority (int): smaller values are logged earlier in the output
+        round (Optional[int]): number of digits to round to when displaying
+    """
+    for agg in get_active_aggregators():
+        if key not in agg:
+            agg.add_meter(key, AverageMeter(round=round), priority)
+        agg[key].update(value, weight)
+
+
+def log_derived(key: str, fn: Callable[[MetersDict], float], priority: int = 20):
+    """Log a scalar value derived from other meters.
+
+    Args:
+        key (str): name of the field to log
+        fn (Callable[[MetersDict], float]): function that takes a single
+            argument *meters* and returns the derived value
+        priority (int): smaller values are logged earlier in the output
+    """
+    for agg in get_active_aggregators():
+        if key not in agg:
+            agg.add_meter(key, MetersDict._DerivedMeter(fn), priority)
+
+
+def log_speed(
+    key: str,
+    value: float,
+    priority: int = 30,
+    round: Optional[int] = None,
+):
+    """Log the rate of some quantity per second.
+
+    Args:
+        key (str): name of the field to log
+        value (float): value to log
+        priority (int): smaller values are logged earlier in the output
+        round (Optional[int]): number of digits to round to when displaying
+    """
+    for agg in get_active_aggregators():
+        if key not in agg:
+            agg.add_meter(key, TimeMeter(round=round), priority)
+            agg[key].reset()  # reset meter on the first call
+        else:
+            agg[key].update(value)
+
+
+def log_start_time(key: str, priority: int = 40, round: Optional[int] = None):
+    """Log the duration of some event in seconds.
+
+    The duration will be computed once :func:`log_stop_time` is called.
+
+    Args:
+        key (str): name of the field to log
+        priority (int): smaller values are logged earlier in the output
+        round (Optional[int]): number of digits to round to when displaying
+    """
+    for agg in get_active_aggregators():
+        if key not in agg:
+            agg.add_meter(key, StopwatchMeter(round=round), priority)
+        agg[key].start()
+
+
+def log_stop_time(key: str, weight: float = 0., prehook=None):
+    """Log the duration of some event in seconds.
+
+    The duration will be computed since :func:`log_start_time` was called.
+    Set weight > 0 to report the average time instead of the sum.
+
+    Args:
+        key (str): name of the field to log
+        weight (float): weight that this time contributes to the average
+        prehook (function, no arguments): will be called before the timer
+            is stopped. For example, use prehook=torch.cuda.synchronize to
+            make sure all GPU operations are done before the timer is stopped.
+    """
+    for agg in get_active_aggregators():
+        if key in agg:
+            agg[key].stop(weight, prehook)
+
+
+def log_custom(
+    new_meter_fn: Callable[[], Meter],
+    key: str,
+    *args,
+    priority: int = 50,
+    **kwargs,
+):
+    """Log using a custom Meter.
+
+    Any extra *args* or *kwargs* will be passed through to the Meter's
+    *update* method.
+
+    Args:
+        new_meter_fn (Callable[[], Meter]): function that returns a new
+            Meter instance
+        key (str): name of the field to log
+        priority (int): smaller values are logged earlier in the output
+    """
+    for agg in get_active_aggregators():
+        if key not in agg:
+            agg.add_meter(key, new_meter_fn(), priority)
+        agg[key].update(*args, **kwargs)
+
+
+def reset_meter(name: str, key: str) -> None:
+    """Reset Meter instance aggregated under a given *name* and *key*."""
+    meter = get_meter(name, key)
+    if meter is not None:
+        meter.reset()
+
+
+def reset_meters(name: str) -> None:
+    """Reset Meter instances aggregated under a given *name*."""
+    meters = get_meters(name)
+    if meters is not None:
+        meters.reset()
+
+
+def get_meter(name: str, key: str) -> Optional[Meter]:
+    """Get a single Meter instance aggregated under *name* and *key*.
+
+    Returns:
+        Meter or None if no metrics have been logged under *name* and *key*.
+    """
+    if name not in _aggregators:
+        return None
+    return _aggregators[name].get(key, None)
+
+
+def get_meters(name: str) -> Optional[MetersDict]:
+    """Get Meter instances aggregated under a given *name*.
+
+    Returns:
+        MetersDict or None if no metrics have been logged under *name*.
+    """
+    return _aggregators.get(name, None)
+
+
+def get_smoothed_value(name: str, key: str) -> float:
+    """Get a single smoothed value.
+
+    Raises:
+        KeyError: if no metrics have been logged under *name* and *key*.
+    """
+    return _aggregators[name].get_smoothed_value(key)
+
+
+def get_smoothed_values(name: str) -> Dict[str, float]:
+    """Get smoothed values aggregated under a given *name*.
+
+    Raises:
+        KeyError: if no metrics have been logged under *name*.
+    """
+    return _aggregators[name].get_smoothed_values()
+
+
+def state_dict():
+    snapshot = OrderedDict()  # aggregator name -> serialized MetersDict state
+    for agg_name, meters_dict in _aggregators.items():
+        snapshot[agg_name] = meters_dict.state_dict()
+    return snapshot
+
+
+def load_state_dict(state_dict):
+    for name, agg_state in state_dict.items():
+        # Load in place: rebinding would detach active aggregators ("default").
+        _aggregators.setdefault(name, MetersDict()).load_state_dict(agg_state)
diff --git a/fairseq/logging/progress_bar.py b/fairseq/logging/progress_bar.py
new file mode 100644
index 0000000000000000000000000000000000000000..97e4162ea0a55f9b0d1caa95c082616a966c9d10
--- /dev/null
+++ b/fairseq/logging/progress_bar.py
@@ -0,0 +1,359 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+Wrapper around various loggers and progress bars (e.g., tqdm).
+"""
+
+import atexit
+import json
+import logging
+import os
+import sys
+from collections import OrderedDict
+from contextlib import contextmanager
+from numbers import Number
+from typing import Optional
+
+import torch
+
+from .meters import AverageMeter, StopwatchMeter, TimeMeter
+
+
+logger = logging.getLogger(__name__)
+
+
+def progress_bar(
+    iterator,
+    log_format: Optional[str] = None,
+    log_interval: int = 100,
+    epoch: Optional[int] = None,
+    prefix: Optional[str] = None,
+    tensorboard_logdir: Optional[str] = None,
+    default_log_format: str = 'tqdm',
+):
+    if log_format is None:
+        log_format = default_log_format
+    if log_format == 'tqdm' and not sys.stderr.isatty():
+        log_format = 'simple'
+
+    if log_format == 'json':
+        bar = JsonProgressBar(iterator, epoch, prefix, log_interval)
+    elif log_format == 'none':
+        bar = NoopProgressBar(iterator, epoch, prefix)
+    elif log_format == 'simple':
+        bar = SimpleProgressBar(iterator, epoch, prefix, log_interval)
+    elif log_format == 'tqdm':
+        bar = TqdmProgressBar(iterator, epoch, prefix)
+    else:
+        raise ValueError('Unknown log format: {}'.format(log_format))
+
+    if tensorboard_logdir:
+        try:
+            # [FB only] custom wrapper for TensorBoard
+            import palaas  # noqa
+            from .fb_tbmf_wrapper import FbTbmfWrapper
+            bar = FbTbmfWrapper(bar, log_interval)
+        except ImportError:
+            bar = TensorboardProgressBarWrapper(bar, tensorboard_logdir)
+
+    return bar
+
+
+def build_progress_bar(
+    args,
+    iterator,
+    epoch: Optional[int] = None,
+    prefix: Optional[str] = None,
+    default: str = 'tqdm',
+    no_progress_bar: str = 'none',
+):
+    """Legacy wrapper that takes an argparse.Namespace."""
+    if getattr(args, 'no_progress_bar', False):
+        default = no_progress_bar
+    if getattr(args, 'distributed_rank', 0) == 0:
+        tensorboard_logdir = getattr(args, 'tensorboard_logdir', None)
+    else:
+        tensorboard_logdir = None
+    return progress_bar(
+        iterator,
+        log_format=args.log_format,
+        log_interval=args.log_interval,
+        epoch=epoch,
+        prefix=prefix,
+        tensorboard_logdir=tensorboard_logdir,
+        default_log_format=default,
+    )
+
+
+def format_stat(stat):
+    if isinstance(stat, Number):
+        stat = '{:g}'.format(stat)
+    elif isinstance(stat, AverageMeter):
+        stat = '{:.3f}'.format(stat.avg)
+    elif isinstance(stat, TimeMeter):
+        stat = '{:g}'.format(round(stat.avg))
+    elif isinstance(stat, StopwatchMeter):
+        stat = '{:g}'.format(round(stat.sum))
+    elif torch.is_tensor(stat):
+        stat = stat.tolist()
+    return stat
+
+
+class BaseProgressBar(object):
+    """Abstract base for progress bars; subclasses implement iteration and logging."""
+    def __init__(self, iterable, epoch=None, prefix=None):
+        self.iterable = iterable
+        self.n = getattr(iterable, 'n', 0)  # start index; 0 if iterable has no ``n``
+        self.epoch = epoch
+        self.prefix = ''
+        if epoch is not None:
+            self.prefix += 'epoch {:03d}'.format(epoch)
+        if prefix is not None:  # add " | " only when an epoch label precedes it
+            self.prefix += '{}{}'.format(' | ' if self.prefix else '', prefix)
+
+    def __len__(self):
+        return len(self.iterable)
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, *exc):
+        return False
+
+    def __iter__(self):
+        raise NotImplementedError
+
+    def log(self, stats, tag=None, step=None):
+        """Log intermediate stats according to log_interval."""
+        raise NotImplementedError
+
+    def print(self, stats, tag=None, step=None):
+        """Print end-of-epoch stats."""
+        raise NotImplementedError
+
+    def _str_commas(self, stats):
+        """Render already-stringified *stats* as ``key=value, key=value``."""
+        return ', '.join(key + '=' + stats[key].strip() for key in stats.keys())
+
+    def _str_pipes(self, stats):
+        """Render already-stringified *stats* as ``key value | key value``."""
+        return ' | '.join(key + ' ' + stats[key].strip() for key in stats.keys())
+
+    def _format_stats(self, stats):
+        """Return an OrderedDict copy of *stats* with every value stringified."""
+        postfix = OrderedDict(stats)
+        for key in postfix.keys():
+            postfix[key] = str(format_stat(postfix[key]))
+        return postfix
+
+
+@contextmanager
+def rename_logger(logger, new_name):
+    old_name, logger.name = logger.name, (logger.name if new_name is None else new_name)
+    try:  # restore the original name even if the with-body raises
+        yield logger
+    finally:
+        logger.name = old_name
+
+
+class JsonProgressBar(BaseProgressBar):
+    """Log output in JSON format."""
+
+    def __init__(self, iterable, epoch=None, prefix=None, log_interval=1000):
+        super().__init__(iterable, epoch, prefix)
+        self.log_interval = log_interval
+        self.i = None
+        self.size = None
+
+    def __iter__(self):
+        self.size = len(self.iterable)
+        for i, obj in enumerate(self.iterable, start=self.n):
+            self.i = i
+            yield obj
+
+    def log(self, stats, tag=None, step=None):
+        """Log intermediate stats according to log_interval."""
+        step = step or self.i or 0
+        if (
+            step > 0
+            and self.log_interval is not None
+            and step % self.log_interval == 0
+        ):
+            update = (
+                self.epoch - 1 + (self.i + 1) / float(self.size)
+                if self.epoch is not None
+                else None
+            )
+            stats = self._format_stats(stats, epoch=self.epoch, update=update)
+            with rename_logger(logger, tag):
+                logger.info(json.dumps(stats))
+
+    def print(self, stats, tag=None, step=None):
+        """Print end-of-epoch stats."""
+        self.stats = stats
+        if tag is not None:
+            self.stats = OrderedDict([(tag + '_' + k, v) for k, v in self.stats.items()])
+        stats = self._format_stats(self.stats, epoch=self.epoch)
+        with rename_logger(logger, tag):
+            logger.info(json.dumps(stats))
+
+    def _format_stats(self, stats, epoch=None, update=None):
+        postfix = OrderedDict()
+        if epoch is not None:
+            postfix['epoch'] = epoch
+        if update is not None:
+            postfix['update'] = round(update, 3)
+        # Preprocess stats according to datatype
+        for key in stats.keys():
+            postfix[key] = format_stat(stats[key])
+        return postfix
+
+
+class NoopProgressBar(BaseProgressBar):
+    """No logging."""
+
+    def __init__(self, iterable, epoch=None, prefix=None):
+        super().__init__(iterable, epoch, prefix)
+
+    def __iter__(self):
+        for obj in self.iterable:
+            yield obj
+
+    def log(self, stats, tag=None, step=None):
+        """Log intermediate stats according to log_interval."""
+        pass
+
+    def print(self, stats, tag=None, step=None):
+        """Print end-of-epoch stats."""
+        pass
+
+
+class SimpleProgressBar(BaseProgressBar):
+    """A minimal logger for non-TTY environments."""
+
+    def __init__(self, iterable, epoch=None, prefix=None, log_interval=1000):
+        super().__init__(iterable, epoch, prefix)
+        self.log_interval = log_interval
+        self.i = None
+        self.size = None
+
+    def __iter__(self):
+        self.size = len(self.iterable)
+        for i, obj in enumerate(self.iterable, start=self.n):
+            self.i = i
+            yield obj
+
+    def log(self, stats, tag=None, step=None):
+        """Log intermediate stats according to log_interval."""
+        step = step or self.i or 0
+        if (
+            step > 0
+            and self.log_interval is not None
+            and step % self.log_interval == 0
+        ):
+            stats = self._format_stats(stats)
+            postfix = self._str_commas(stats)
+            with rename_logger(logger, tag):
+                logger.info(
+                    '{}:  {:5d} / {:d} {}'
+                    .format(self.prefix, self.i + 1, self.size, postfix)
+                )
+
+    def print(self, stats, tag=None, step=None):
+        """Print end-of-epoch stats."""
+        postfix = self._str_pipes(self._format_stats(stats))
+        with rename_logger(logger, tag):
+            logger.info('{} | {}'.format(self.prefix, postfix))
+
+
+class TqdmProgressBar(BaseProgressBar):
+    """Log to tqdm."""
+
+    def __init__(self, iterable, epoch=None, prefix=None):
+        super().__init__(iterable, epoch, prefix)
+        from tqdm import tqdm
+        self.tqdm = tqdm(
+            iterable,
+            self.prefix,
+            leave=False,
+            disable=(logger.getEffectiveLevel() > logging.INFO),
+        )
+
+    def __iter__(self):
+        return iter(self.tqdm)
+
+    def log(self, stats, tag=None, step=None):
+        """Log intermediate stats according to log_interval."""
+        self.tqdm.set_postfix(self._format_stats(stats), refresh=False)
+
+    def print(self, stats, tag=None, step=None):
+        """Print end-of-epoch stats."""
+        postfix = self._str_pipes(self._format_stats(stats))
+        with rename_logger(logger, tag):
+            logger.info('{} | {}'.format(self.prefix, postfix))
+
+
+try:
+    _tensorboard_writers = {}
+    from tensorboardX import SummaryWriter
+except ImportError:
+    SummaryWriter = None
+
+
+def _close_writers():
+    for w in _tensorboard_writers.values():
+        w.close()
+
+
+atexit.register(_close_writers)
+
+
+class TensorboardProgressBarWrapper(BaseProgressBar):
+    """Log to tensorboard."""
+
+    def __init__(self, wrapped_bar, tensorboard_logdir):
+        self.wrapped_bar = wrapped_bar
+        self.tensorboard_logdir = tensorboard_logdir
+
+        if SummaryWriter is None:
+            logger.warning(
+                "tensorboard not found, please install with: pip install tensorboardX"
+            )
+
+    def _writer(self, key):
+        if SummaryWriter is None:
+            return None
+        _writers = _tensorboard_writers
+        if key not in _writers:
+            _writers[key] = SummaryWriter(os.path.join(self.tensorboard_logdir, key))
+            _writers[key].add_text('sys.argv', " ".join(sys.argv))
+        return _writers[key]
+
+    def __iter__(self):
+        return iter(self.wrapped_bar)
+
+    def log(self, stats, tag=None, step=None):
+        """Log intermediate stats to tensorboard."""
+        self._log_to_tensorboard(stats, tag, step)
+        self.wrapped_bar.log(stats, tag=tag, step=step)
+
+    def print(self, stats, tag=None, step=None):
+        """Print end-of-epoch stats."""
+        self._log_to_tensorboard(stats, tag, step)
+        self.wrapped_bar.print(stats, tag=tag, step=step)
+
+    def _log_to_tensorboard(self, stats, tag=None, step=None):
+        writer = self._writer(tag or '')
+        if writer is None:
+            return
+        if step is None:
+            step = stats['num_updates']
+        for key in stats.keys() - {'num_updates'}:
+            if isinstance(stats[key], AverageMeter):
+                writer.add_scalar(key, stats[key].val, step)
+            elif isinstance(stats[key], Number):
+                writer.add_scalar(key, stats[key], step)
+        writer.flush()
diff --git a/fairseq/model_parallel/__init__.py b/fairseq/model_parallel/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..cc563db40b9441c7d1471041572cf029a8eb3919
--- /dev/null
+++ b/fairseq/model_parallel/__init__.py
@@ -0,0 +1,6 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from . import criterions, modules, models  # noqa
diff --git a/fairseq/model_parallel/__pycache__/__init__.cpython-310.pyc b/fairseq/model_parallel/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7ff2937fde134d7b71c37b679aa1981ec462e939
Binary files /dev/null and b/fairseq/model_parallel/__pycache__/__init__.cpython-310.pyc differ
diff --git a/fairseq/model_parallel/criterions/__init__.py b/fairseq/model_parallel/criterions/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b74de559824a03eb9511c83f9f61d183a0506597
--- /dev/null
+++ b/fairseq/model_parallel/criterions/__init__.py
@@ -0,0 +1,14 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import importlib
+import os
+
+
+# automatically import any Python files in the criterions/ directory
+for file in os.listdir(os.path.dirname(__file__)):
+    if file.endswith('.py') and not file.startswith('_'):
+        module = file[:-len('.py')]  # strip only the trailing extension
+        importlib.import_module('fairseq.model_parallel.criterions.' + module)
diff --git a/fairseq/model_parallel/criterions/__pycache__/__init__.cpython-310.pyc b/fairseq/model_parallel/criterions/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fba83c5152c4e2899c12ce44088162d88c9a767c
Binary files /dev/null and b/fairseq/model_parallel/criterions/__pycache__/__init__.cpython-310.pyc differ
diff --git a/fairseq/model_parallel/criterions/__pycache__/vocab_parallel_cross_entropy.cpython-310.pyc b/fairseq/model_parallel/criterions/__pycache__/vocab_parallel_cross_entropy.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ab62c05af3d464eaa427208c234c2db6bd8bbe72
Binary files /dev/null and b/fairseq/model_parallel/criterions/__pycache__/vocab_parallel_cross_entropy.cpython-310.pyc differ
diff --git a/fairseq/model_parallel/criterions/vocab_parallel_cross_entropy.py b/fairseq/model_parallel/criterions/vocab_parallel_cross_entropy.py
new file mode 100644
index 0000000000000000000000000000000000000000..eab8f9af4e1c81f4c87ccd67695a8031fc1db1de
--- /dev/null
+++ b/fairseq/model_parallel/criterions/vocab_parallel_cross_entropy.py
@@ -0,0 +1,74 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+
+from fairseq import metrics, utils
+from fairseq.criterions import FairseqCriterion, register_criterion
+
+try:
+    from fairseq.model_parallel.megatron.mpu.cross_entropy import vocab_parallel_cross_entropy
+    has_megatron_submodule = True
+except (ImportError, ModuleNotFoundError):
+    has_megatron_submodule = False
+
+
+@register_criterion('vocab_parallel_cross_entropy')
+class VocabParallelCrossEntropyCriterion(FairseqCriterion):
+
+    def __init__(self, task, sentence_avg):
+        super().__init__(task)
+        self.sentence_avg = sentence_avg
+        if not has_megatron_submodule:
+            raise ImportError(
+                '\n\nPlease install the megatron submodule:'
+                '\n\n  git submodule update --init '
+                'fairseq/model_parallel/megatron'
+            )
+
+    def forward(self, model, sample, reduce=True):
+        """Compute the loss for the given sample.
+
+        Returns a tuple with three elements:
+        1) the loss
+        2) the sample size, which is used as the denominator for the gradient
+        3) logging outputs to display while training
+        """
+        net_output = model(**sample['net_input'])
+        target = sample['target']
+
+        loss = vocab_parallel_cross_entropy(net_output[0].float(), target)
+        loss = (loss * (target != self.padding_idx)).sum()
+        sample_size = sample['target'].size(0) if self.sentence_avg else sample['ntokens']
+        logging_output = {
+            'loss': utils.item(loss.data) if reduce else loss.data,
+            'ntokens': sample['ntokens'],
+            'nsentences': sample['target'].size(0),
+            'sample_size': sample_size,
+        }
+        return loss, sample_size, logging_output
+
+    @staticmethod
+    def reduce_metrics(logging_outputs) -> None:
+        """Aggregate logging outputs from data parallel training."""
+        loss_sum = sum(log.get('loss', 0) for log in logging_outputs)
+        ntokens = sum(log.get('ntokens', 0) for log in logging_outputs)
+        sample_size = sum(log.get('sample_size', 0) for log in logging_outputs)
+
+        metrics.log_scalar('loss', loss_sum / sample_size / math.log(2), sample_size, round=3)
+        if sample_size != ntokens:
+            metrics.log_scalar('nll_loss', loss_sum / ntokens / math.log(2), ntokens, round=3)
+            metrics.log_derived('ppl', lambda meters: utils.get_perplexity(meters['nll_loss'].avg))
+        else:
+            metrics.log_derived('ppl', lambda meters: utils.get_perplexity(meters['loss'].avg))
+
+    @staticmethod
+    def logging_outputs_can_be_summed() -> bool:
+        """
+        Whether the logging outputs returned by `forward` can be summed
+        across workers prior to calling `reduce_metrics`. Setting this
+        to True will improve distributed training speed.
+        """
+        return True
diff --git a/fairseq/model_parallel/megatron_trainer.py b/fairseq/model_parallel/megatron_trainer.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1142a993ca05e42371f5092d8119b996ca49ef5
--- /dev/null
+++ b/fairseq/model_parallel/megatron_trainer.py
@@ -0,0 +1,63 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+Train a network across multiple GPUs.
+"""
+
+from fairseq import distributed_utils
+from fairseq.trainer import Trainer
+
+try:
+    from fairseq.model_parallel.megatron.mpu import (
+        get_data_parallel_group,
+        get_data_parallel_rank,
+        get_data_parallel_world_size,
+        get_model_parallel_group,
+        get_model_parallel_src_rank,
+    )
+    has_megatron_submodule = True
+except (ImportError, ModuleNotFoundError):
+    has_megatron_submodule = False
+
+
+class MegatronTrainer(Trainer):
+    """Main class for model parallel with data parallel training.
+    """
+    def __init__(self, args, task, model, criterion):
+        if not has_megatron_submodule:
+            raise ImportError(
+                '\n\nPlease install the megatron submodule:'
+                '\n\n  git submodule update --init '
+                'fairseq/model_parallel/megatron'
+            )
+        super().__init__(args, task, model, criterion)
+
+    @property
+    def data_parallel_world_size(self):
+        return get_data_parallel_world_size()
+
+    @property
+    def data_parallel_process_group(self):
+        return get_data_parallel_group()
+
+    @property
+    def data_parallel_rank(self):
+        return get_data_parallel_rank()
+
+    @property
+    def is_data_parallel_master(self):
+        return get_model_parallel_src_rank() == 0
+
+    def clip_grad_norm(self, clip_norm):
+        def _aggregate_model_parallel_grad_norm(total_norm):
+            total_norm = total_norm ** 2
+            distributed_utils.all_reduce(total_norm, group=get_model_parallel_group())
+            total_norm = total_norm ** 0.5
+            return total_norm
+        return self.optimizer.clip_grad_norm(
+            clip_norm,
+            aggregate_norm_fn=_aggregate_model_parallel_grad_norm,
+        )
diff --git a/fairseq/model_parallel/models/__init__.py b/fairseq/model_parallel/models/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a3207981adcb59289d59d18961b961c819e52013
--- /dev/null
+++ b/fairseq/model_parallel/models/__init__.py
@@ -0,0 +1,16 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import importlib
+import os
+
+
+# automatically import any Python files in the models/ directory
+models_dir = os.path.dirname(__file__)
+for file in os.listdir(models_dir):
+    path = os.path.join(models_dir, file)
+    if not file.startswith('_') and not file.startswith('.') and (file.endswith('.py') or os.path.isdir(path)):
+        model_name = file[:-len('.py')] if file.endswith('.py') else file  # drop trailing extension only
+        module = importlib.import_module('fairseq.model_parallel.models.' + model_name)
diff --git a/fairseq/model_parallel/models/__pycache__/__init__.cpython-310.pyc b/fairseq/model_parallel/models/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..eb7c6d40fc5ba40ad6cb13499661dd747be55cdf
Binary files /dev/null and b/fairseq/model_parallel/models/__pycache__/__init__.cpython-310.pyc differ
diff --git a/fairseq/model_parallel/models/__pycache__/transformer.cpython-310.pyc b/fairseq/model_parallel/models/__pycache__/transformer.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..669af6407599406e90831a11fbd31f3a075587be
Binary files /dev/null and b/fairseq/model_parallel/models/__pycache__/transformer.cpython-310.pyc differ
diff --git a/fairseq/model_parallel/models/__pycache__/transformer_lm.cpython-310.pyc b/fairseq/model_parallel/models/__pycache__/transformer_lm.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9cad6d405663be6a943a5972d801d927773d8537
Binary files /dev/null and b/fairseq/model_parallel/models/__pycache__/transformer_lm.cpython-310.pyc differ
diff --git a/fairseq/model_parallel/models/roberta/__init__.py b/fairseq/model_parallel/models/roberta/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..117827c3e9c176477f33e3a6fd7fe19a922411a2
--- /dev/null
+++ b/fairseq/model_parallel/models/roberta/__init__.py
@@ -0,0 +1,6 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .model import *  # noqa
diff --git a/fairseq/model_parallel/models/roberta/__pycache__/__init__.cpython-310.pyc b/fairseq/model_parallel/models/roberta/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bc734ba288d5004fe10b83a3ac7ed9f513764c17
Binary files /dev/null and b/fairseq/model_parallel/models/roberta/__pycache__/__init__.cpython-310.pyc differ
diff --git a/fairseq/model_parallel/models/roberta/__pycache__/model.cpython-310.pyc b/fairseq/model_parallel/models/roberta/__pycache__/model.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a8093469ba2b23258a8c0fc45b6f4d7b14324e7a
Binary files /dev/null and b/fairseq/model_parallel/models/roberta/__pycache__/model.cpython-310.pyc differ
diff --git a/fairseq/model_parallel/models/roberta/model.py b/fairseq/model_parallel/models/roberta/model.py
new file mode 100644
index 0000000000000000000000000000000000000000..e0ae4a2c8ff2a33eda5f804a0fc7802799dd697a
--- /dev/null
+++ b/fairseq/model_parallel/models/roberta/model.py
@@ -0,0 +1,268 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+RoBERTa: A Robustly Optimized BERT Pretraining Approach.
+"""
+
+import logging
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from fairseq import utils
+from fairseq.models import (
+    FairseqEncoder,
+    register_model,
+    register_model_architecture,
+)
+from fairseq.models.roberta import (
+    RobertaModel,
+    RobertaEncoder,
+    RobertaLMHead,
+    RobertaClassificationHead,
+)
+from fairseq.modules import (
+    LayerNorm,
+    TransformerSentenceEncoder,
+)
+from fairseq.model_parallel.modules import (
+    ModelParallelTransformerSentenceEncoder,
+)
+from fairseq.modules.transformer_sentence_encoder import init_bert_params
+try:
+    from fairseq.model_parallel.megatron.mpu import (
+        copy_to_model_parallel_region,
+        gather_from_model_parallel_region,
+        ColumnParallelLinear,
+        RowParallelLinear,
+    )
+    has_megatron_submodule = True
+except (ImportError, ModuleNotFoundError):
+    has_megatron_submodule = False
+
+logger = logging.getLogger(__name__)
+
+
+@register_model('model_parallel_roberta')
+class ModelParallelRobertaModel(RobertaModel):
+    """RoBERTa model assembled from model parallel encoder and head modules."""
+
+    def __init__(self, args, encoder):
+        super().__init__(args, encoder)
+
+        # heads are added later through register_classification_head()
+        self.classification_heads = nn.ModuleDict()
+
+    @staticmethod
+    def add_args(parser):
+        """Add model-specific arguments to the parser (delegates to RobertaModel)."""
+        super(ModelParallelRobertaModel, ModelParallelRobertaModel).add_args(parser)
+
+    @classmethod
+    def build_model(cls, args, task):
+        """Build a new model instance.
+
+        Raises:
+            ImportError: if the megatron submodule could not be imported.
+        """
+        # The parallel layers used by the heads come from megatron; report a
+        # missing submodule up front with an actionable message instead of
+        # failing later on an undefined name.
+        if not has_megatron_submodule:
+            raise ImportError(
+                '\n\nPlease install the megatron submodule:'
+                '\n\n  git submodule update --init '
+                'fairseq/model_parallel/megatron'
+            )
+
+        # make sure all arguments are present
+        base_architecture(args)
+
+        # fall back to the training sample length when no explicit limit is set
+        if not hasattr(args, 'max_positions'):
+            args.max_positions = args.tokens_per_sample
+
+        encoder = ModelParallelRobertaEncoder(args, task.source_dictionary)
+        return cls(args, encoder)
+
+    def forward(self, src_tokens, features_only=False, return_all_hiddens=False, classification_head_name=None, **kwargs):
+        """Run the encoder and, optionally, one registered classification head.
+
+        Args:
+            src_tokens: input tokens, passed through to the encoder.
+            features_only (bool, optional): forwarded to the encoder; asks it
+                to return features instead of LM output.
+            return_all_hiddens (bool, optional): forwarded to the encoder.
+            classification_head_name (str, optional): name of a registered
+                head to apply. When given, ``features_only`` is forced to True.
+
+        Returns:
+            tuple: the encoder output (or the head output when a head was
+            requested) and the second value returned by the encoder.
+        """
+        if classification_head_name is not None:
+            # a classification head consumes features, not LM logits
+            features_only = True
+
+        x, extra = self.encoder(src_tokens, features_only, return_all_hiddens, **kwargs)
+
+        if classification_head_name is not None:
+            x = self.classification_heads[classification_head_name](x)
+        return x, extra
+
+    def register_classification_head(self, name, num_classes=None, inner_dim=None, **kwargs):
+        """Register a classification head.
+
+        An existing head with the same name is always replaced; a warning is
+        logged when the replacement changes ``num_classes`` or ``inner_dim``.
+        """
+        if name in self.classification_heads:
+            prev_num_classes = self.classification_heads[name].out_proj.out_features
+            prev_inner_dim = self.classification_heads[name].dense.out_features
+            if num_classes != prev_num_classes or inner_dim != prev_inner_dim:
+                logger.warning(
+                    're-registering head "{}" with num_classes {} (prev: {}) '
+                    'and inner_dim {} (prev: {})'.format(
+                        name, num_classes, prev_num_classes, inner_dim, prev_inner_dim
+                    )
+                )
+        self.classification_heads[name] = ModelParallelRobertaClassificationHead(
+            self.args.encoder_embed_dim,
+            # inner_dim defaults to the encoder embedding size
+            inner_dim or self.args.encoder_embed_dim,
+            num_classes,
+            self.args.pooler_activation_fn,
+            self.args.pooler_dropout,
+        )
+
+
+class ModelParallelRobertaLMHead(nn.Module):
+    """Head for masked language modeling.
+
+    Args:
+        embed_dim: size of the incoming features.
+        output_dim: size of the output bias (the vocabulary size).
+        activation_fn: name of the activation, resolved by
+            ``utils.get_activation_fn``.
+        weight: projection matrix used by ``F.linear``; when omitted, the
+            weight of a fresh bias-free ``nn.Linear(embed_dim, output_dim)``
+            is used.
+    """
+
+    def __init__(self, embed_dim, output_dim, activation_fn, weight=None):
+        super().__init__()
+        self.dense = ColumnParallelLinear(embed_dim, embed_dim, gather_output=True)
+        self.activation_fn = utils.get_activation_fn(activation_fn)
+        self.layer_norm = LayerNorm(embed_dim)
+
+        if weight is None:
+            weight = nn.Linear(embed_dim, output_dim, bias=False).weight
+        self.weight = weight
+        self.bias = nn.Parameter(torch.zeros(output_dim))
+
+    def forward(self, features, masked_tokens=None, **kwargs):
+        """Project features to vocabulary logits.
+
+        Args:
+            features: encoder features.
+            masked_tokens (optional): index/mask applied to the leading
+                dimensions of ``features``; only the selected positions are
+                projected.
+
+        Returns:
+            The gathered projection of the selected features plus ``self.bias``.
+        """
+        # Only project the masked tokens while training,
+        # saves both memory and computation
+        if masked_tokens is not None:
+            features = features[masked_tokens, :]
+
+        x = self.dense(features)
+        x = self.activation_fn(x)
+        x = self.layer_norm(x)
+
+        # The copy op has to wrap the tensor that actually feeds the parallel
+        # projection. Applying it to `features` (which is no longer used) left
+        # it without any effect on the projection.
+        x = copy_to_model_parallel_region(x)
+        # project back to size of vocabulary with bias
+        x = F.linear(x, self.weight)
+        x = gather_from_model_parallel_region(x).contiguous()
+        x = x + self.bias
+        return x
+
+
+class ModelParallelRobertaClassificationHead(nn.Module):
+    """Sentence-level classification head working on the first position's features."""
+
+    def __init__(self, input_dim, inner_dim, num_classes, activation_fn, pooler_dropout):
+        super().__init__()
+        # input_dim -> inner_dim, column parallel with the output gathered
+        self.dense = ColumnParallelLinear(input_dim, inner_dim, gather_output=True)
+        self.activation_fn = utils.get_activation_fn(activation_fn)
+        self.dropout = nn.Dropout(p=pooler_dropout)
+        # inner_dim -> num_classes, plain (non-parallel) linear layer
+        self.out_proj = nn.Linear(inner_dim, num_classes)
+
+    def forward(self, features, **kwargs):
+        """Return the class scores computed from position 0 of every sequence."""
+        # position 0 holds the <s> token (equiv. to [CLS])
+        sentence_repr = features[:, 0, :]
+        hidden = self.activation_fn(self.dense(self.dropout(sentence_repr)))
+        return self.out_proj(self.dropout(hidden))
+
+
+class ModelParallelRobertaEncoder(FairseqEncoder):
+    """RoBERTa encoder for model parallel training.
+
+    Wraps a :class:`ModelParallelTransformerSentenceEncoder` and puts a
+    :class:`ModelParallelRobertaLMHead` on top of it. The head's projection
+    weight is the sentence encoder's token embedding weight.
+    """
+
+    def __init__(self, args, dictionary):
+        super().__init__(dictionary)
+        self.args = args
+
+        # Users trim layers through --encoder-layers-to-keep. Translate that
+        # into the layer count and move the setting over to
+        # decoder_layers_to_keep, clearing the encoder-side option.
+        layers_to_keep = args.encoder_layers_to_keep
+        if layers_to_keep:
+            args.encoder_layers = len(layers_to_keep.split(","))
+            args.decoder_layers_to_keep = layers_to_keep
+            args.encoder_layers_to_keep = None
+
+        sentence_encoder_kwargs = dict(
+            padding_idx=dictionary.pad(),
+            vocab_size=len(dictionary),
+            num_encoder_layers=args.encoder_layers,
+            embedding_dim=args.encoder_embed_dim,
+            ffn_embedding_dim=args.encoder_ffn_embed_dim,
+            num_attention_heads=args.encoder_attention_heads,
+            dropout=args.dropout,
+            attention_dropout=args.attention_dropout,
+            activation_dropout=args.activation_dropout,
+            layerdrop=args.encoder_layerdrop,
+            max_seq_len=args.max_positions,
+            num_segments=0,
+            encoder_normalize_before=False,
+            apply_bert_init=False,
+            activation_fn=args.activation_fn,
+        )
+        self.sentence_encoder = ModelParallelTransformerSentenceEncoder(**sentence_encoder_kwargs)
+        self.lm_head = ModelParallelRobertaLMHead(
+            embed_dim=args.encoder_embed_dim,
+            output_dim=len(dictionary),
+            activation_fn=args.activation_fn,
+            # output projection reuses the token embedding matrix
+            weight=self.sentence_encoder.embed_tokens.weight,
+        )
+
+    def forward(self, src_tokens, features_only=False, return_all_hiddens=False, masked_tokens=None, **unused):
+        """
+        Args:
+            src_tokens (LongTensor): input tokens of shape `(batch, src_len)`
+            features_only (bool, optional): skip LM head and just return
+                features. If True, the output will be of shape
+                `(batch, src_len, embed_dim)`.
+            return_all_hiddens (bool, optional): also return all of the
+                intermediate hidden states (default: False).
+            masked_tokens (optional): forwarded to the LM head, which then
+                projects only the selected positions.
+
+        Returns:
+            tuple:
+                - the LM output (or the features when `features_only` is set)
+                - a dictionary of additional data, where 'inner_states'
+                  is the list of hidden states returned by the sentence
+                  encoder, or None unless `return_all_hiddens` is set.
+        """
+        features, extra = self.extract_features(src_tokens, return_all_hiddens=return_all_hiddens)
+        if features_only:
+            return features, extra
+        return self.output_layer(features, masked_tokens=masked_tokens), extra
+
+    def extract_features(self, src_tokens, return_all_hiddens=False, **unused):
+        """Run the sentence encoder and return its last state, batch first."""
+        inner_states, _ = self.sentence_encoder(src_tokens, last_state_only=not return_all_hiddens)
+        last_state = inner_states[-1]
+        extra = {'inner_states': inner_states if return_all_hiddens else None}
+        # T x B x C -> B x T x C
+        return last_state.transpose(0, 1), extra
+
+    def output_layer(self, features, masked_tokens=None, **unused):
+        """Apply the LM head to the given features."""
+        return self.lm_head(features, masked_tokens)
+
+    def max_positions(self):
+        """Maximum output length supported by the encoder."""
+        return self.args.max_positions
+
+
+@register_model_architecture('model_parallel_roberta', 'model_parallel_roberta')
+def base_architecture(args):
+    """Fill in the base model_parallel_roberta defaults for options missing on *args*."""
+    defaults = (
+        ('encoder_layers', 12),
+        ('encoder_embed_dim', 768),
+        ('encoder_ffn_embed_dim', 3072),
+        ('encoder_attention_heads', 12),
+        ('activation_fn', 'gelu'),
+        ('pooler_activation_fn', 'tanh'),
+        ('dropout', 0.1),
+        ('attention_dropout', 0.1),
+        ('activation_dropout', 0.0),
+        ('pooler_dropout', 0.0),
+        ('encoder_layers_to_keep', None),
+        ('encoder_layerdrop', 0.0),
+    )
+    for option, fallback in defaults:
+        # a value that is already present on args always wins
+        setattr(args, option, getattr(args, option, fallback))
+
+
+@register_model_architecture('model_parallel_roberta', 'model_parallel_roberta_base')
+def roberta_base_architecture(args):
+    """Architecture 'model_parallel_roberta_base': identical to base_architecture."""
+    base_architecture(args)
+
+
+@register_model_architecture('model_parallel_roberta', 'model_parallel_roberta_large')
+def roberta_large_architecture(args):
+    """Set the large-model sizes missing on *args*, then apply the base defaults."""
+    large_defaults = (
+        ('encoder_layers', 24),
+        ('encoder_embed_dim', 1024),
+        ('encoder_ffn_embed_dim', 4096),
+        ('encoder_attention_heads', 16),
+    )
+    for option, fallback in large_defaults:
+        setattr(args, option, getattr(args, option, fallback))
+    # everything not specific to the large variant comes from the base architecture
+    base_architecture(args)
diff --git a/fairseq/model_parallel/models/transformer.py b/fairseq/model_parallel/models/transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..f5756ad8982631213d5b54d889e866d3585bac89
--- /dev/null
+++ b/fairseq/model_parallel/models/transformer.py
@@ -0,0 +1,112 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+
+import torch.nn as nn
+import torch.nn.functional as F
+
+from fairseq.models import (
+    register_model,
+)
+
+from fairseq.models.transformer import (
+    TransformerDecoder,
+    TransformerEncoder,
+    TransformerModel,
+)
+
+from fairseq.model_parallel.modules import (
+    ModelParallelTransformerDecoderLayer,
+    ModelParallelTransformerEncoderLayer,
+)
+
+try:
+    from fairseq.model_parallel.megatron.mpu import (
+        copy_to_model_parallel_region,
+        gather_from_model_parallel_region,
+        VocabParallelEmbedding,
+    )
+    has_megatron_submodule = True
+except (ImportError, ModuleNotFoundError):
+    has_megatron_submodule = False
+
+
+logger = logging.getLogger(__name__)
+
+
+@register_model('model_parallel_transformer')
+class ModelParallelTransformerModel(TransformerModel):
+    """
+    Model parallel Transformer model.
+    """
+    @classmethod
+    def build_embedding(cls, args, dictionary, embed_dim, path=None):
+        """Build the vocab-parallel token embedding for *dictionary*.
+
+        Args:
+            args: model arguments (not read here).
+            dictionary: vocabulary; supplies the number of embeddings and the
+                padding index.
+            embed_dim: embedding dimension.
+            path: pretrained embedding file; not supported.
+
+        Raises:
+            ImportError: if the megatron submodule could not be imported.
+            NotImplementedError: if *path* is given.
+        """
+        if not has_megatron_submodule:
+            raise ImportError(
+                '\n\nPlease install the megatron submodule:'
+                '\n\n  git submodule update --init '
+                'fairseq/model_parallel/megatron'
+            )
+        # Reject the unsupported option before allocating and initialising
+        # the embedding, instead of doing all that work and then raising.
+        if path:
+            raise NotImplementedError("Loading of embedding from path is not supported for model parallel")
+        num_embeddings = len(dictionary)
+        padding_idx = dictionary.pad()
+
+        def _vocab_init(tensor, **kwargs):
+            nn.init.normal_(tensor, mean=0, std=num_embeddings ** -0.5)
+            # NOTE(review): row 1 is zeroed, not row padding_idx -- presumably
+            # relies on the pad symbol having index 1; confirm.
+            nn.init.constant_(tensor[1], 0)
+        emb = VocabParallelEmbedding(num_embeddings, embed_dim, padding_idx, init_method=_vocab_init)
+        return emb
+
+    @classmethod
+    def build_encoder(cls, args, src_dict, embed_tokens):
+        """Return the model parallel encoder."""
+        return ModelParallelTransformerEncoder(args, src_dict, embed_tokens)
+
+    @classmethod
+    def build_decoder(cls, args, tgt_dict, embed_tokens):
+        """Return the model parallel decoder.
+
+        Encoder attention is disabled when ``args.no_cross_attention`` is set.
+        """
+        return ModelParallelTransformerDecoder(
+            args,
+            tgt_dict,
+            embed_tokens,
+            no_encoder_attn=getattr(args, 'no_cross_attention', False),
+        )
+
+
+class ModelParallelTransformerEncoder(TransformerEncoder):
+    """
+    Model parallel Transformer encoder consisting of *args.encoder_layers* layers. Each layer
+    is a :class:`ModelParallelTransformerEncoderLayer`.
+    """
+
+    def build_encoder_layer(self, args):
+        """Return one encoder layer; overrides the base class to use the model parallel layer."""
+        return ModelParallelTransformerEncoderLayer(args)
+
+
+class ModelParallelTransformerDecoder(TransformerDecoder):
+    """
+    Model Parallel Transformer decoder consisting of *args.decoder_layers* layers. Each layer
+    is a :class:`ModelParallelTransformerDecoderLayer`.
+    """
+
+    def build_decoder_layer(self, args, no_encoder_attn=False):
+        """Return one decoder layer, built from the model parallel layer class."""
+        return ModelParallelTransformerDecoderLayer(args, no_encoder_attn)
+
+    def output_layer(self, features, **kwargs):
+        """Project features to the vocabulary size.
+
+        The result is gathered across the model parallel region unless the
+        criterion is 'vocab_parallel_cross_entropy'.
+
+        Raises:
+            NotImplementedError: if input and output embeddings are not shared.
+        """
+        if not self.share_input_output_embed:
+            raise NotImplementedError(
+                'Model parallel training currently requires --share-decoder-input-output-embed'
+            )
+
+        features = copy_to_model_parallel_region(features)
+
+        # project back to size of vocabulary
+        x = self.output_projection(features)
+
+        # Default to None so that args without a `criterion` attribute take
+        # the gathering path instead of raising AttributeError.
+        if getattr(self.args, 'criterion', None) != 'vocab_parallel_cross_entropy':
+            x = gather_from_model_parallel_region(x).contiguous()
+        return x
diff --git a/fairseq/model_parallel/models/transformer_lm.py b/fairseq/model_parallel/models/transformer_lm.py
new file mode 100644
index 0000000000000000000000000000000000000000..81bc93bc0a975309bfe8b97ac9be1f119ce28117
--- /dev/null
+++ b/fairseq/model_parallel/models/transformer_lm.py
@@ -0,0 +1,88 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch.nn as nn
+
+from fairseq.models import register_model, register_model_architecture
+from fairseq.models.transformer_lm import (
+    base_lm_architecture,
+    TransformerLanguageModel,
+)
+from fairseq.model_parallel.models.transformer import ModelParallelTransformerDecoder
+try:
+    from fairseq.model_parallel.megatron.mpu import VocabParallelEmbedding
+    has_megatron_submodule = True
+except (ImportError, ModuleNotFoundError):
+    has_megatron_submodule = False
+
+
+# Fallback for args.max_target_positions when args has no tokens_per_sample either.
+DEFAULT_MAX_TARGET_POSITIONS = 1024
+
+
+@register_model('model_parallel_transformer_lm')
+class ModelParallelTransformerLanguageModel(TransformerLanguageModel):
+    """Transformer language model built on the model parallel decoder."""
+
+    @classmethod
+    def build_model(cls, args, task):
+        """Build a new model instance."""
+        if not has_megatron_submodule:
+            raise ImportError(
+                '\n\nPlease install the megatron submodule:'
+                '\n\n  git submodule update --init '
+                'fairseq/model_parallel/megatron'
+            )
+
+        # make sure all arguments are present in older models
+        base_lm_architecture(args)
+
+        # an explicit list of layers to keep determines the layer count
+        layers_to_keep = args.decoder_layers_to_keep
+        if layers_to_keep:
+            args.decoder_layers = len(layers_to_keep.split(","))
+
+        if getattr(args, 'max_target_positions', None) is None:
+            args.max_target_positions = getattr(args, 'tokens_per_sample', DEFAULT_MAX_TARGET_POSITIONS)
+
+        # only plain token embeddings are available in model parallel mode
+        if args.character_embeddings:
+            raise NotImplementedError("Character embeddings is not supported for model parallel")
+        if args.adaptive_input:
+            raise NotImplementedError("Adaptive input is not supported for model parallel")
+
+        embed_tokens = cls.build_embedding(args, task.source_dictionary, args.decoder_input_dim)
+        return cls(
+            ModelParallelTransformerDecoder(
+                args, task.target_dictionary, embed_tokens, no_encoder_attn=True,
+            )
+        )
+
+    @classmethod
+    def build_embedding(cls, args, dictionary, embed_dim, path=None):
+        """Return a VocabParallelEmbedding sized for *dictionary*; *path* is ignored."""
+        def _init_weights(tensor, **kwargs):
+            # normal init scaled by the embedding size, then row 1 set to zero
+            nn.init.normal_(tensor, mean=0, std=embed_dim ** -0.5)
+            nn.init.constant_(tensor[1], 0)
+        return VocabParallelEmbedding(len(dictionary), embed_dim, dictionary.pad(), init_method=_init_weights)
+
+
+@register_model_architecture('model_parallel_transformer_lm', 'transformer_lm_megatron')
+def transformer_lm_megatron(args):
+    """Set the transformer_lm_megatron defaults missing on *args*, then the base LM ones."""
+    megatron_defaults = (
+        ('decoder_embed_dim', 3072),
+        ('decoder_ffn_embed_dim', 3072 * 4),
+        ('decoder_layers', 72),
+        ('decoder_attention_heads', 32),
+        ('dropout', 0.1),
+        ('attention_dropout', 0.1),
+        ('activation_fn', 'gelu'),
+    )
+    for option, fallback in megatron_defaults:
+        # options already set on args are left untouched
+        setattr(args, option, getattr(args, option, fallback))
+    base_lm_architecture(args)
+
+
+@register_model_architecture('model_parallel_transformer_lm', 'transformer_lm_megatron_11b')
+def transformer_lm_megatron_11b(args):
+    """Set the transformer_lm_megatron_11b defaults missing on *args*, then the base LM ones."""
+    megatron_11b_defaults = (
+        ('decoder_embed_dim', 3072),
+        ('decoder_ffn_embed_dim', 3072 * 6),
+        ('decoder_layers', 72),
+        ('decoder_attention_heads', 32),
+        ('dropout', 0.1),
+        ('attention_dropout', 0.1),
+        ('activation_fn', 'gelu'),
+    )
+    for option, fallback in megatron_11b_defaults:
+        # options already set on args are left untouched
+        setattr(args, option, getattr(args, option, fallback))
+    base_lm_architecture(args)
diff --git a/fairseq/model_parallel/modules/__init__.py b/fairseq/model_parallel/modules/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5c9431f92be6bc609bd999e77abf70197960cd48
--- /dev/null
+++ b/fairseq/model_parallel/modules/__init__.py
@@ -0,0 +1,17 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .multihead_attention import ModelParallelMultiheadAttention
+from .transformer_layer import ModelParallelTransformerEncoderLayer, ModelParallelTransformerDecoderLayer
+from .transformer_sentence_encoder_layer import ModelParallelTransformerSentenceEncoderLayer
+from .transformer_sentence_encoder import ModelParallelTransformerSentenceEncoder
+
+# Public names of this package: every module class imported above.
+__all__ = [
+    'ModelParallelMultiheadAttention',
+    'ModelParallelTransformerEncoderLayer',
+    'ModelParallelTransformerDecoderLayer',
+    'ModelParallelTransformerSentenceEncoder',
+    'ModelParallelTransformerSentenceEncoderLayer',
+]
diff --git a/fairseq/model_parallel/modules/__pycache__/__init__.cpython-310.pyc b/fairseq/model_parallel/modules/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e88d51da6986a3e9dd01e24f3a0151434b5c77e3
Binary files /dev/null and b/fairseq/model_parallel/modules/__pycache__/__init__.cpython-310.pyc differ
diff --git a/fairseq/model_parallel/modules/__pycache__/multihead_attention.cpython-310.pyc b/fairseq/model_parallel/modules/__pycache__/multihead_attention.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..676b7075d92ca623d4155bbcb0ac288eb797dd28
Binary files /dev/null and b/fairseq/model_parallel/modules/__pycache__/multihead_attention.cpython-310.pyc differ
diff --git a/fairseq/model_parallel/modules/__pycache__/transformer_layer.cpython-310.pyc b/fairseq/model_parallel/modules/__pycache__/transformer_layer.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..231680fa552a46cd62de920363105d0821d8c63b
Binary files /dev/null and b/fairseq/model_parallel/modules/__pycache__/transformer_layer.cpython-310.pyc differ
diff --git a/fairseq/model_parallel/modules/__pycache__/transformer_sentence_encoder.cpython-310.pyc b/fairseq/model_parallel/modules/__pycache__/transformer_sentence_encoder.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f02fa78442c3ad10f20bec72618d34bdd645a259
Binary files /dev/null and b/fairseq/model_parallel/modules/__pycache__/transformer_sentence_encoder.cpython-310.pyc differ
diff --git a/fairseq/model_parallel/modules/__pycache__/transformer_sentence_encoder_layer.cpython-310.pyc b/fairseq/model_parallel/modules/__pycache__/transformer_sentence_encoder_layer.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4a8361919bf68d0d299d2de95c6994bb2396c491
Binary files /dev/null and b/fairseq/model_parallel/modules/__pycache__/transformer_sentence_encoder_layer.cpython-310.pyc differ
diff --git a/fairseq/model_parallel/modules/multihead_attention.py b/fairseq/model_parallel/modules/multihead_attention.py
new file mode 100644
index 0000000000000000000000000000000000000000..e92a3f6a71c7225b293a941020749046d2c628d5
--- /dev/null
+++ b/fairseq/model_parallel/modules/multihead_attention.py
@@ -0,0 +1,312 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Dict, Optional, Tuple
+
+import torch
+import torch.nn.functional as F
+from fairseq import utils
+from torch import Tensor, nn
+from fairseq.incremental_decoding_utils import with_incremental_state
+from fairseq.modules.fairseq_dropout import FairseqDropout
+
+try:
+    from fairseq.model_parallel.megatron.mpu import (
+        get_cuda_rng_tracker,
+        get_model_parallel_world_size,
+        ColumnParallelLinear,
+        RowParallelLinear,
+    )
+    has_megatron_submodule = True
+except (ImportError, ModuleNotFoundError):
+    has_megatron_submodule = False
+
+
+@with_incremental_state
+class ModelParallelMultiheadAttention(nn.Module):
+    """Model parallel Multi-headed attention.
+    This performs the Multi-headed attention over multiple gpus.
+
+    See "Megatron-LM: https://arxiv.org/pdf/1909.08053.pdf" for more details.
+    """
+
+    def __init__(
+        self,
+        embed_dim,
+        num_heads,
+        kdim=None,
+        vdim=None,
+        dropout=0.0,
+        bias=True,
+        self_attention=False,
+        encoder_decoder_attention=False,
+    ):
+        """Create the projections for this rank's share of the attention heads.
+
+        Args:
+            embed_dim: model dimension; must be divisible by ``num_heads``.
+            num_heads: total number of attention heads; must be divisible by
+                the model parallel world size.
+            kdim: input size of the key projection (default: ``embed_dim``).
+            vdim: input size of the value projection (default: ``embed_dim``).
+            dropout: dropout probability handed to ``FairseqDropout``.
+            bias: whether the four projections carry a bias.
+            self_attention: keys and values are projected from the query.
+            encoder_decoder_attention: keys and values are both projected
+                from ``key``.
+
+        Raises:
+            ImportError: if the megatron submodule could not be imported.
+            AssertionError: if one of the divisibility or size checks fails.
+        """
+        super().__init__()
+        if not has_megatron_submodule:
+            raise ImportError(
+                '\n\nPlease install the megatron submodule:'
+                '\n\n  git submodule update --init '
+                'fairseq/model_parallel/megatron'
+            )
+        self.embed_dim = embed_dim
+        self.kdim = kdim if kdim is not None else embed_dim
+        self.vdim = vdim if vdim is not None else embed_dim
+        self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim
+
+        self.model_parallel_size = get_model_parallel_world_size()
+
+        # number of heads computed by each model parallel rank
+        self.num_heads_partition = num_heads // self.model_parallel_size
+        assert (
+            self.num_heads_partition * self.model_parallel_size == num_heads
+        ), "Number of heads must be divisible by model parallel size"
+
+        self.dropout_module = FairseqDropout(
+            dropout, module_name=self.__class__.__name__
+        )
+        self.head_dim = embed_dim // num_heads
+        assert (
+            self.head_dim * num_heads == self.embed_dim
+        ), "embed_dim must be divisible by num_heads"
+        # queries are multiplied by 1 / sqrt(head_dim) in forward()
+        self.scaling = self.head_dim ** -0.5
+
+        self.self_attention = self_attention
+        self.encoder_decoder_attention = encoder_decoder_attention
+
+        assert not self.self_attention or self.qkv_same_dim, (
+            "Self-attention requires query, key and value to be of the same size"
+        )
+
+        # q/k/v keep their output partitioned (gather_output=False); the
+        # output projection takes that partitioned input (input_is_parallel=True)
+        self.k_proj = ColumnParallelLinear(self.kdim, embed_dim, bias=bias, gather_output=False)
+        self.v_proj = ColumnParallelLinear(self.vdim, embed_dim, bias=bias, gather_output=False)
+        self.q_proj = ColumnParallelLinear(embed_dim, embed_dim, bias=bias, gather_output=False)
+        self.out_proj = RowParallelLinear(embed_dim, embed_dim, bias=bias, input_is_parallel=True)
+
+    def forward(
+        self,
+        query,
+        key: Optional[Tensor],
+        value: Optional[Tensor],
+        key_padding_mask: Optional[Tensor] = None,
+        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
+        static_kv: bool = False,
+        attn_mask: Optional[Tensor] = None,
+        **unused_kwargs,
+    ) -> Tuple[Tensor, Optional[Tensor]]:
+        """Input shape: Time x Batch x Channel
+
+        Computes attention for this rank's partition of the heads and
+        combines the result through the output projection.
+
+        Args:
+            query: tensor of shape `(tgt_len, batch, embed_dim)`.
+            key: source of the keys; ignored for self-attention. For
+                encoder-decoder attention the values are projected from
+                `key` as well.
+            value: source of the values when neither self-attention nor
+                encoder-decoder attention is configured.
+            key_padding_mask (ByteTensor, optional): mask to exclude
+                keys that are pads, of shape `(batch, src_len)`, where
+                padding elements are indicated by 1s.
+            incremental_state (optional): when given, previously computed
+                keys, values and padding mask are read from it and the
+                updated ones are stored back.
+            static_kv (bool, optional): reuse the cached keys/values as they
+                are instead of appending to them.
+            attn_mask (ByteTensor, optional): typically used to
+                implement causal attention, where the mask prevents the
+                attention from looking forward in time (default: None).
+
+        Returns:
+            tuple: the attention output of shape `(tgt_len, batch, embed_dim)`
+            and None in place of the attention weights.
+        """
+        tgt_len, bsz, embed_dim = query.size()
+        assert embed_dim == self.embed_dim
+        assert list(query.size()) == [tgt_len, bsz, embed_dim]
+
+        if incremental_state is not None:
+            saved_state = self._get_input_buffer(incremental_state)
+            if saved_state is not None and "prev_key" in saved_state:
+                # previous time steps are cached - no need to recompute
+                # key and value if they are static
+                if static_kv:
+                    assert self.encoder_decoder_attention and not self.self_attention
+                    key = value = None
+        else:
+            saved_state = None
+
+        # project the inputs; which tensor feeds k/v depends on the attention type
+        if self.self_attention:
+            q = self.q_proj(query)
+            k = self.k_proj(query)
+            v = self.v_proj(query)
+        elif self.encoder_decoder_attention:
+            # encoder-decoder attention
+            q = self.q_proj(query)
+            if key is None:
+                # static keys/values will be taken from the cache below
+                assert value is None
+                k = v = None
+            else:
+                k = self.k_proj(key)
+                v = self.v_proj(key)
+
+        else:
+            assert key is not None and value is not None
+            q = self.q_proj(query)
+            k = self.k_proj(key)
+            v = self.v_proj(value)
+        q *= self.scaling
+
+
+        # reshape to (bsz * num_heads_partition, seq_len, head_dim) for bmm
+        q = (
+            q.contiguous()
+            .view(tgt_len, bsz * self.num_heads_partition, self.head_dim)
+            .transpose(0, 1)
+        )
+        if k is not None:
+            k = (
+                k.contiguous()
+                .view(-1, bsz * self.num_heads_partition, self.head_dim)
+                .transpose(0, 1)
+            )
+        if v is not None:
+            v = (
+                v.contiguous()
+                .view(-1, bsz * self.num_heads_partition, self.head_dim)
+                .transpose(0, 1)
+            )
+
+        if saved_state is not None:
+            # saved states are stored with shape (bsz, num_heads_partition, seq_len, head_dim)
+            if "prev_key" in saved_state:
+                _prev_key = saved_state["prev_key"]
+                assert _prev_key is not None
+                prev_key = _prev_key.view(bsz * self.num_heads_partition, -1, self.head_dim)
+                if static_kv:
+                    k = prev_key
+                else:
+                    # append the new time steps along the sequence dimension
+                    assert k is not None
+                    k = torch.cat([prev_key, k], dim=1)
+            if "prev_value" in saved_state:
+                _prev_value = saved_state["prev_value"]
+                assert _prev_value is not None
+                prev_value = _prev_value.view(bsz * self.num_heads_partition, -1, self.head_dim)
+                if static_kv:
+                    v = prev_value
+                else:
+                    assert v is not None
+                    v = torch.cat([prev_value, v], dim=1)
+            prev_key_padding_mask: Optional[Tensor] = None
+            if "prev_key_padding_mask" in saved_state:
+                prev_key_padding_mask = saved_state["prev_key_padding_mask"]
+            assert k is not None and v is not None
+            key_padding_mask = ModelParallelMultiheadAttention._append_prev_key_padding_mask(
+                key_padding_mask=key_padding_mask,
+                prev_key_padding_mask=prev_key_padding_mask,
+                batch_size=bsz,
+                src_len=k.size(1),
+                static_kv=static_kv,
+            )
+
+            # write the extended keys/values/mask back for the next step
+            saved_state["prev_key"] = k.view(bsz, self.num_heads_partition, -1, self.head_dim)
+            saved_state["prev_value"] = v.view(bsz, self.num_heads_partition, -1, self.head_dim)
+            saved_state["prev_key_padding_mask"] = key_padding_mask
+            # In this branch incremental_state is never None
+            assert incremental_state is not None
+            incremental_state = self._set_input_buffer(incremental_state, saved_state)
+        assert k is not None
+        src_len = k.size(1)
+
+        # This is part of a workaround to get around fork/join parallelism
+        # not supporting Optional types.
+        if key_padding_mask is not None and key_padding_mask.dim() == 0:
+            key_padding_mask = None
+
+        if key_padding_mask is not None:
+            assert key_padding_mask.size(0) == bsz
+            assert key_padding_mask.size(1) == src_len
+
+        # raw attention scores; q was already scaled above
+        attn_weights = torch.bmm(q, k.transpose(1, 2))
+
+        assert list(attn_weights.size()) == [bsz * self.num_heads_partition, tgt_len, src_len]
+
+        if attn_mask is not None:
+            # add a leading dimension so the mask broadcasts over batch * heads
+            attn_mask = attn_mask.unsqueeze(0)
+            attn_weights += attn_mask
+
+        if key_padding_mask is not None:
+            # don't attend to padding symbols
+            attn_weights = attn_weights.view(bsz, self.num_heads_partition, tgt_len, src_len)
+            attn_weights = attn_weights.masked_fill(
+                key_padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool), float("-inf")
+            )
+            attn_weights = attn_weights.view(bsz * self.num_heads_partition, tgt_len, src_len)
+
+        attn_weights_float = utils.softmax(
+            attn_weights, dim=-1
+        )
+        # cast the softmax result back to the dtype of the scores
+        attn_weights = attn_weights_float.type_as(attn_weights)
+
+        # NOTE(review): dropout runs under the forked megatron CUDA RNG tracker,
+        # presumably so each model parallel rank draws its own mask -- confirm.
+        with get_cuda_rng_tracker().fork():
+            attn_probs = self.dropout_module(attn_weights)
+
+        assert v is not None
+        attn = torch.bmm(attn_probs, v)
+        assert list(attn.size()) == [bsz * self.num_heads_partition, tgt_len, self.head_dim]
+        # this rank only holds embed_dim / model_parallel_size channels
+        embed_dim_partition = embed_dim // self.model_parallel_size
+        attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim_partition)
+        attn = self.out_proj(attn)
+        # return attn_weights None to keep the return type same as single gpu multihead attention
+        # This will be deprecated.
+        attn_weights: Optional[Tensor] = None
+
+        return attn, attn_weights
+
+    @staticmethod
+    def _append_prev_key_padding_mask(
+        key_padding_mask: Optional[Tensor],
+        prev_key_padding_mask: Optional[Tensor],
+        batch_size: int,
+        src_len: int,
+        static_kv: bool,
+    ) -> Optional[Tensor]:
+        """Combine the cached and the current key padding masks.
+
+        Saved key padding masks have shape ``(bsz, seq_len)``. With
+        ``static_kv`` and a cached mask, the cached mask is returned as is;
+        otherwise the masks are cast to float and joined along dim 1, and
+        ``None`` is returned when neither mask is given.
+        """
+        if prev_key_padding_mask is not None and static_kv:
+            return prev_key_padding_mask
+        if prev_key_padding_mask is not None and key_padding_mask is not None:
+            return torch.cat(
+                [prev_key_padding_mask.float(), key_padding_mask.float()], dim=1
+            )
+        # During incremental decoding, as the padding token enters and leaves
+        # the frame, prev or current may be None. The missing side is filled
+        # with zeros created on the existing mask's own device; a bare
+        # ``.cuda()`` would target the current default GPU instead.
+        if prev_key_padding_mask is not None:
+            mask = prev_key_padding_mask.float()
+            width = src_len - mask.size(1)
+            filler = torch.zeros((batch_size, width), dtype=mask.dtype, device=mask.device)
+            return torch.cat([mask, filler], dim=1)
+        if key_padding_mask is not None:
+            mask = key_padding_mask.float()
+            width = src_len - mask.size(1)
+            filler = torch.zeros((batch_size, width), dtype=mask.dtype, device=mask.device)
+            return torch.cat([filler, mask], dim=1)
+        return None
+
+    def reorder_incremental_state(
+        self, incremental_state: Dict[str, Dict[str, Optional[Tensor]]], new_order
+    ):
+        """Apply ``new_order`` along dim 0 to every tensor in the input buffer."""
+        cached = self._get_input_buffer(incremental_state)
+        if cached is None:
+            return incremental_state
+        for name, tensor in cached.items():
+            if tensor is not None:
+                cached[name] = tensor.index_select(0, new_order)
+        return self._set_input_buffer(incremental_state, cached)
+
+    def _get_input_buffer(
+        self, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]]
+    ) -> Dict[str, Optional[Tensor]]:
+        """Return the stored "attn_state" dict, or a new empty dict if absent."""
+        state = self.get_incremental_state(incremental_state, "attn_state")
+        if state is None:
+            fresh: Dict[str, Optional[Tensor]] = {}
+            return fresh
+        return state
+
+    def _set_input_buffer(
+        self, incremental_state: Dict[str, Dict[str, Optional[Tensor]]],
+        buffer: Dict[str, Optional[Tensor]],
+    ):
+        """Store ``buffer`` under "attn_state" via ``set_incremental_state``."""
+        return self.set_incremental_state(incremental_state, "attn_state", buffer)
diff --git a/fairseq/model_parallel/modules/transformer_layer.py b/fairseq/model_parallel/modules/transformer_layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..30b23d518c4c2f0815be7229d96c0689d2867e81
--- /dev/null
+++ b/fairseq/model_parallel/modules/transformer_layer.py
@@ -0,0 +1,79 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from fairseq.modules import (
+    TransformerEncoderLayer,
+    TransformerDecoderLayer,
+)
+
+from fairseq.model_parallel.modules import ModelParallelMultiheadAttention
+
+try:
+    from fairseq.model_parallel.megatron.mpu import (
+        ColumnParallelLinear,
+        RowParallelLinear,
+    )
+    has_megatron_submodule = True
+except (ImportError, ModuleNotFoundError):
+    has_megatron_submodule = False
+
+
+class ModelParallelTransformerEncoderLayer(TransformerEncoderLayer):
+    """Encoder layer block over multiple gpus.
+
+    See "Megatron-LM: https://arxiv.org/pdf/1909.08053.pdf" for more details.
+    """
+
+    def build_fc1(self, input_dim, output_dim, q_noise, qn_block_size):
+        # a positive q_noise is not supported by this layer
+        if q_noise > 0:
+            raise NotImplementedError
+        return ColumnParallelLinear(input_dim, output_dim, gather_output=False)
+
+    def build_fc2(self, input_dim, output_dim, q_noise, qn_block_size):
+        # a positive q_noise is not supported by this layer
+        if q_noise > 0:
+            raise NotImplementedError
+        return RowParallelLinear(input_dim, output_dim, input_is_parallel=True)
+
+    def build_self_attention(self, embed_dim, args, **unused_kwargs):
+        num_heads = args.encoder_attention_heads
+        return ModelParallelMultiheadAttention(
+            embed_dim, num_heads, dropout=args.attention_dropout, self_attention=True
+        )
+
+
+class ModelParallelTransformerDecoderLayer(TransformerDecoderLayer):
+    """Decoder layer block.
+
+    See "Megatron-LM: https://arxiv.org/pdf/1909.08053.pdf" for more details.
+    """
+    def build_fc1(self, input_dim, output_dim, q_noise, qn_block_size):
+        # a positive q_noise is not supported by this layer
+        if q_noise > 0:
+            raise NotImplementedError
+        return ColumnParallelLinear(input_dim, output_dim, gather_output=False)
+
+    def build_fc2(self, input_dim, output_dim, q_noise, qn_block_size):
+        # a positive q_noise is not supported by this layer
+        if q_noise > 0:
+            raise NotImplementedError
+        return RowParallelLinear(input_dim, output_dim, input_is_parallel=True)
+
+    def build_self_attention(self, embed_dim, args, **unused_kwargs):
+        cross_self_attention = getattr(args, "cross_self_attention", False)
+        return ModelParallelMultiheadAttention(
+            embed_dim=embed_dim, num_heads=args.decoder_attention_heads,
+            dropout=args.attention_dropout,
+            self_attention=not cross_self_attention,
+        )
+
+    def build_encoder_attention(self, embed_dim, args, **unused_kwargs):
+        encoder_dim = getattr(args, "encoder_embed_dim", None)
+        return ModelParallelMultiheadAttention(
+            embed_dim=embed_dim, num_heads=args.decoder_attention_heads,
+            kdim=encoder_dim, vdim=encoder_dim,
+            dropout=args.attention_dropout, encoder_decoder_attention=True,
+        )
diff --git a/fairseq/model_parallel/modules/transformer_sentence_encoder.py b/fairseq/model_parallel/modules/transformer_sentence_encoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..101eca7bd4d143195ec89ffb1cfdd7b05060fcf1
--- /dev/null
+++ b/fairseq/model_parallel/modules/transformer_sentence_encoder.py
@@ -0,0 +1,64 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Optional, Tuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from fairseq.modules import (
+    LayerNorm,
+    MultiheadAttention,
+    PositionalEmbedding,
+    TransformerSentenceEncoder,
+)
+
+from fairseq.model_parallel.modules import (
+    ModelParallelTransformerSentenceEncoderLayer,
+)
+
+try:
+    from fairseq.model_parallel.megatron.mpu import (
+        copy_to_model_parallel_region,
+        gather_from_model_parallel_region,
+        VocabParallelEmbedding,
+    )
+    has_megatron_submodule = True
+except (ImportError, ModuleNotFoundError):
+    has_megatron_submodule = False
+
+import random
+
+
+class ModelParallelTransformerSentenceEncoder(TransformerSentenceEncoder):
+    """
+    Implementation for a Model Parallel Bi-directional Transformer based
+    Sentence Encoder used in BERT/XLM style pre-trained models.
+    """
+
+    def build_embedding(self, vocab_size, embedding_dim, padding_idx):
+        """Return a ``VocabParallelEmbedding`` built from the three arguments."""
+        return VocabParallelEmbedding(vocab_size, embedding_dim, padding_idx)
+
+    def build_transformer_sentence_encoder_layer(
+        self,
+        embedding_dim,
+        ffn_embedding_dim,
+        num_attention_heads,
+        dropout,
+        attention_dropout,
+        activation_dropout,
+        activation_fn,
+        export,
+        **unused,
+    ):
+        """Build one model parallel layer; ``**unused`` is not passed on."""
+        layer_kwargs = dict(
+            embedding_dim=embedding_dim, ffn_embedding_dim=ffn_embedding_dim,
+            num_attention_heads=num_attention_heads, export=export,
+            dropout=dropout, attention_dropout=attention_dropout,
+            activation_dropout=activation_dropout, activation_fn=activation_fn,
+        )
+        return ModelParallelTransformerSentenceEncoderLayer(**layer_kwargs)
diff --git a/fairseq/model_parallel/modules/transformer_sentence_encoder_layer.py b/fairseq/model_parallel/modules/transformer_sentence_encoder_layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e1ea2b7d757c041d6f4d0ba00b20478e3d72edc
--- /dev/null
+++ b/fairseq/model_parallel/modules/transformer_sentence_encoder_layer.py
@@ -0,0 +1,79 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import torch.nn.functional as F
+
+from fairseq import utils
+from fairseq.modules import (
+    TransformerSentenceEncoderLayer
+)
+from fairseq.model_parallel.modules import ModelParallelMultiheadAttention
+try:
+    from fairseq.model_parallel.megatron.mpu import (
+        ColumnParallelLinear,
+        RowParallelLinear,
+    )
+    has_megatron_submodule = True
+except (ImportError, ModuleNotFoundError):
+    has_megatron_submodule = False
+
+
+class ModelParallelTransformerSentenceEncoderLayer(TransformerSentenceEncoderLayer):
+    """
+    Implements a Model Parallel Transformer Encoder Layer used in
+    BERT/XLM style pre-trained models.
+    """
+    def build_fc1(self, input_dim, output_dim, **unused):
+        """Return a ``ColumnParallelLinear`` created with ``gather_output=False``."""
+        return ColumnParallelLinear(input_dim, output_dim, gather_output=False)
+
+    def build_fc2(self, input_dim, output_dim, **unused):
+        """Return a ``RowParallelLinear`` created with ``input_is_parallel=True``."""
+        return RowParallelLinear(input_dim, output_dim, input_is_parallel=True)
+
+    def build_self_attention(self, embed_dim, num_attention_heads, dropout, **kwargs):
+        """Build model parallel self-attention; extra ``kwargs`` are ignored."""
+        return ModelParallelMultiheadAttention(
+            embed_dim,
+            num_attention_heads,
+            dropout=dropout,
+            self_attention=True,
+        )
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        self_attn_mask: torch.Tensor = None,
+        self_attn_padding_mask: torch.Tensor = None,
+    ):
+        """
+        Run the self-attention and feed-forward sub-blocks. Each one applies
+        its layer norm first, then the module, then dropout, and finally adds
+        the sub-block's input back as a residual.
+
+        The normalized ``x`` is query, key and value; ``self_attn_mask`` is
+        passed as ``attn_mask`` and ``self_attn_padding_mask`` as
+        ``key_padding_mask``. Returns ``(output, None)``.
+        """
+        # self-attention sub-block (called with need_weights=False)
+        attn_in = self.self_attn_layer_norm(x)
+        attn_out, _ = self.self_attn(
+            query=attn_in,
+            key=attn_in,
+            value=attn_in,
+            key_padding_mask=self_attn_padding_mask,
+            need_weights=False,
+            attn_mask=self_attn_mask,
+        )
+        attn_out = F.dropout(attn_out, p=self.dropout, training=self.training)
+        x = x + attn_out
+
+        # feed-forward sub-block
+        hidden = self.activation_fn(self.fc1(self.final_layer_norm(x)))
+        hidden = F.dropout(hidden, p=self.activation_dropout, training=self.training)
+        ffn_out = self.fc2(hidden)
+        ffn_out = F.dropout(ffn_out, p=self.dropout, training=self.training)
+        return x + ffn_out, None
diff --git a/fairseq/models/__init__.py b/fairseq/models/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..f7d8eaafadf8f39a3e79501d86809aed6f5b414b
--- /dev/null
+++ b/fairseq/models/__init__.py
@@ -0,0 +1,141 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import importlib
+import os
+
+from .fairseq_decoder import FairseqDecoder
+from .fairseq_encoder import FairseqEncoder
+from .fairseq_incremental_decoder import FairseqIncrementalDecoder
+from .fairseq_model import (
+    BaseFairseqModel,
+    FairseqEncoderModel,
+    FairseqEncoderDecoderModel,
+    FairseqLanguageModel,
+    FairseqModel,
+    FairseqMultiModel,
+)
+
+from .composite_encoder import CompositeEncoder
+from .distributed_fairseq_model import DistributedFairseqModel
+
+
+MODEL_REGISTRY = {}
+ARCH_MODEL_REGISTRY = {}
+ARCH_MODEL_INV_REGISTRY = {}
+ARCH_CONFIG_REGISTRY = {}
+
+
+__all__ = [
+    'BaseFairseqModel',
+    'CompositeEncoder',
+    'DistributedFairseqModel',
+    'FairseqDecoder',
+    'FairseqEncoder',
+    'FairseqEncoderDecoderModel',
+    'FairseqEncoderModel',
+    'FairseqIncrementalDecoder',
+    'FairseqLanguageModel',
+    'FairseqModel',
+    'FairseqMultiModel',
+]
+
+
+def build_model(args, task):  # delegates to the class stored under ``args.arch``
+    return ARCH_MODEL_REGISTRY[args.arch].build_model(args, task)
+
+
+def register_model(name):
+    """
+    Return a class decorator that stores a model class in ``MODEL_REGISTRY``.
+
+    For example::
+
+        @register_model('lstm')
+        class LSTM(FairseqEncoderDecoderModel):
+            (...)
+
+    .. note:: All models must implement the :class:`BaseFairseqModel` interface.
+        Typically you will extend :class:`FairseqEncoderDecoderModel` for
+        sequence-to-sequence tasks or :class:`FairseqLanguageModel` for
+        language modeling tasks.
+
+    Args:
+        name (str): the name of the model
+    """
+    def register_model_cls(cls):
+        if name in MODEL_REGISTRY:
+            error = 'Cannot register duplicate model ({})'.format(name)
+        elif not issubclass(cls, BaseFairseqModel):
+            error = 'Model ({}: {}) must extend BaseFairseqModel'.format(name, cls.__name__)
+        else:
+            MODEL_REGISTRY[name] = cls
+            return cls
+        raise ValueError(error)
+
+    return register_model_cls
+
+
+def register_model_architecture(model_name, arch_name):
+    """
+    Return a decorator that registers a function as architecture
+    *arch_name* of the already registered model *model_name*. After
+    registration, it can be selected with the ``--arch`` argument.
+
+    For example::
+
+        @register_model_architecture('lstm', 'lstm_luong_wmt_en_de')
+        def lstm_luong_wmt_en_de(args):
+            args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 1000)
+            (...)
+
+    The decorated function should take a single argument *args*, which is a
+    :class:`argparse.Namespace` of arguments parsed from the command-line. The
+    decorated function should modify these arguments in-place to match the
+    desired architecture.
+
+    Args:
+        model_name (str): the name of the Model (Model must already be
+            registered)
+        arch_name (str): the name of the model architecture (``--arch``)
+    """
+    def register_model_arch_fn(fn):
+        if model_name not in MODEL_REGISTRY:
+            error = 'Cannot register model architecture for unknown model type ({})'.format(model_name)
+        elif arch_name in ARCH_MODEL_REGISTRY:
+            error = 'Cannot register duplicate model architecture ({})'.format(arch_name)
+        elif not callable(fn):
+            error = 'Model architecture must be callable ({})'.format(arch_name)
+        else:
+            ARCH_MODEL_REGISTRY[arch_name] = MODEL_REGISTRY[model_name]
+            ARCH_MODEL_INV_REGISTRY.setdefault(model_name, []).append(arch_name)
+            ARCH_CONFIG_REGISTRY[arch_name] = fn
+            return fn
+        raise ValueError(error)
+
+    return register_model_arch_fn
+
+
+# import every .py file and directory in models/ not starting with '_' or '.'
+models_dir = os.path.dirname(__file__)
+for file in os.listdir(models_dir):
+    path = os.path.join(models_dir, file)
+    if file.startswith(('_', '.')):
+        continue
+    if not (file.endswith('.py') or os.path.isdir(path)):
+        continue
+    model_name = file[:file.find('.py')] if file.endswith('.py') else file
+    module = importlib.import_module('fairseq.models.' + model_name)
+
+    # extra `model_parser` for sphinx
+    if model_name not in MODEL_REGISTRY:
+        continue
+    parser = argparse.ArgumentParser(add_help=False)
+    group_archs = parser.add_argument_group('Named architectures')
+    group_archs.add_argument('--arch', choices=ARCH_MODEL_INV_REGISTRY[model_name])
+    group_args = parser.add_argument_group('Additional command-line arguments')
+    MODEL_REGISTRY[model_name].add_args(group_args)
+    globals()[model_name + '_parser'] = parser
diff --git a/fairseq/models/__pycache__/__init__.cpython-310.pyc b/fairseq/models/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..16b3681e3bc9575399d60db0ff55d1b3ed73f2f0
Binary files /dev/null and b/fairseq/models/__pycache__/__init__.cpython-310.pyc differ
diff --git a/fairseq/models/__pycache__/composite_encoder.cpython-310.pyc b/fairseq/models/__pycache__/composite_encoder.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e7ebad2d7c651120d3acd3709c52f76aab4224ea
Binary files /dev/null and b/fairseq/models/__pycache__/composite_encoder.cpython-310.pyc differ
diff --git a/fairseq/models/__pycache__/distributed_fairseq_model.cpython-310.pyc b/fairseq/models/__pycache__/distributed_fairseq_model.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2a969f70aed607de317193a7feaf770b1e37e49e
Binary files /dev/null and b/fairseq/models/__pycache__/distributed_fairseq_model.cpython-310.pyc differ
diff --git a/fairseq/models/__pycache__/fairseq_decoder.cpython-310.pyc b/fairseq/models/__pycache__/fairseq_decoder.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..33a7b844878ade67071ede8a4fcf4dabaa8f05bb
Binary files /dev/null and b/fairseq/models/__pycache__/fairseq_decoder.cpython-310.pyc differ
diff --git a/fairseq/models/__pycache__/fairseq_encoder.cpython-310.pyc b/fairseq/models/__pycache__/fairseq_encoder.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4e48ed38968989defa68a0ba427b2b720292683b
Binary files /dev/null and b/fairseq/models/__pycache__/fairseq_encoder.cpython-310.pyc differ
diff --git a/fairseq/models/__pycache__/fairseq_incremental_decoder.cpython-310.pyc b/fairseq/models/__pycache__/fairseq_incremental_decoder.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5dc4f2486441b4931ee09178ecc40d0e577aa5fc
Binary files /dev/null and b/fairseq/models/__pycache__/fairseq_incremental_decoder.cpython-310.pyc differ
diff --git a/fairseq/models/__pycache__/fairseq_model.cpython-310.pyc b/fairseq/models/__pycache__/fairseq_model.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f606b4714e3b9bccc5e8d32ebe3a855d91299909
Binary files /dev/null and b/fairseq/models/__pycache__/fairseq_model.cpython-310.pyc differ
diff --git a/fairseq/models/__pycache__/fconv.cpython-310.pyc b/fairseq/models/__pycache__/fconv.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8e841ca9b9e89aa26c5730e66ef1ef5cedfe374b
Binary files /dev/null and b/fairseq/models/__pycache__/fconv.cpython-310.pyc differ
diff --git a/fairseq/models/__pycache__/fconv_lm.cpython-310.pyc b/fairseq/models/__pycache__/fconv_lm.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..47c5c5c5f58cac62a664707fca4d33be6d66d17a
Binary files /dev/null and b/fairseq/models/__pycache__/fconv_lm.cpython-310.pyc differ
diff --git a/fairseq/models/__pycache__/fconv_self_att.cpython-310.pyc b/fairseq/models/__pycache__/fconv_self_att.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2c3611a19caf045c014abeb4e54de171735acc8c
Binary files /dev/null and b/fairseq/models/__pycache__/fconv_self_att.cpython-310.pyc differ
diff --git a/fairseq/models/__pycache__/lightconv.cpython-310.pyc b/fairseq/models/__pycache__/lightconv.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cf0c35473484632803abee533df83ea60cdcdbb3
Binary files /dev/null and b/fairseq/models/__pycache__/lightconv.cpython-310.pyc differ
diff --git a/fairseq/models/__pycache__/lightconv_lm.cpython-310.pyc b/fairseq/models/__pycache__/lightconv_lm.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6f2ded58ffaa8906236eb70958e3d5dec6c5a8e0
Binary files /dev/null and b/fairseq/models/__pycache__/lightconv_lm.cpython-310.pyc differ
diff --git a/fairseq/models/__pycache__/lstm.cpython-310.pyc b/fairseq/models/__pycache__/lstm.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0f009f56398a0a982f1ac509e946026b27175bf2
Binary files /dev/null and b/fairseq/models/__pycache__/lstm.cpython-310.pyc differ
diff --git a/fairseq/models/__pycache__/lstm_lm.cpython-310.pyc b/fairseq/models/__pycache__/lstm_lm.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..196c61b8fd0a6197e7ee76e8fde861f96a38c1da
Binary files /dev/null and b/fairseq/models/__pycache__/lstm_lm.cpython-310.pyc differ
diff --git a/fairseq/models/__pycache__/masked_lm.cpython-310.pyc b/fairseq/models/__pycache__/masked_lm.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..83b9a5635a1e4109c6ffac4e638021a1664ab928
Binary files /dev/null and b/fairseq/models/__pycache__/masked_lm.cpython-310.pyc differ
diff --git a/fairseq/models/__pycache__/model_utils.cpython-310.pyc b/fairseq/models/__pycache__/model_utils.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f261c19ee2d6b4917db2e70bcbed59adeac20ab6
Binary files /dev/null and b/fairseq/models/__pycache__/model_utils.cpython-310.pyc differ
diff --git a/fairseq/models/__pycache__/multilingual_transformer.cpython-310.pyc b/fairseq/models/__pycache__/multilingual_transformer.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e669ce5b860dde61f385f4ac12bdd3a735f6530f
Binary files /dev/null and b/fairseq/models/__pycache__/multilingual_transformer.cpython-310.pyc differ
diff --git a/fairseq/models/__pycache__/transformer.cpython-310.pyc b/fairseq/models/__pycache__/transformer.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..93fd92e0916756734d53679ae15c4eb9aa8640df
Binary files /dev/null and b/fairseq/models/__pycache__/transformer.cpython-310.pyc differ
diff --git a/fairseq/models/__pycache__/transformer_align.cpython-310.pyc b/fairseq/models/__pycache__/transformer_align.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..83eb5091aa5da58c49500b72fc5813c3c9381138
Binary files /dev/null and b/fairseq/models/__pycache__/transformer_align.cpython-310.pyc differ
diff --git a/fairseq/models/__pycache__/transformer_from_pretrained_xlm.cpython-310.pyc b/fairseq/models/__pycache__/transformer_from_pretrained_xlm.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a84183f493fc8a791c8aa4513617d4aafe479814
Binary files /dev/null and b/fairseq/models/__pycache__/transformer_from_pretrained_xlm.cpython-310.pyc differ
diff --git a/fairseq/models/__pycache__/transformer_lm.cpython-310.pyc b/fairseq/models/__pycache__/transformer_lm.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3a5b1280469b6846ccde2757b3911af2974e07e9
Binary files /dev/null and b/fairseq/models/__pycache__/transformer_lm.cpython-310.pyc differ
diff --git a/fairseq/models/bart/__init__.py b/fairseq/models/bart/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a701923f7e5a2a8aa9b75e5580ddea22907f53ee
--- /dev/null
+++ b/fairseq/models/bart/__init__.py
@@ -0,0 +1,7 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .hub_interface import *  # noqa
+from .model import *  # noqa
diff --git a/fairseq/models/bart/__pycache__/__init__.cpython-310.pyc b/fairseq/models/bart/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e0a70e92a3d3067df8f0f49b373909b7fc621bbf
Binary files /dev/null and b/fairseq/models/bart/__pycache__/__init__.cpython-310.pyc differ
diff --git a/fairseq/models/bart/__pycache__/hub_interface.cpython-310.pyc b/fairseq/models/bart/__pycache__/hub_interface.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b9824277d744697b3583a8104b6885f5d4563216
Binary files /dev/null and b/fairseq/models/bart/__pycache__/hub_interface.cpython-310.pyc differ
diff --git a/fairseq/models/bart/__pycache__/model.cpython-310.pyc b/fairseq/models/bart/__pycache__/model.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e7e08a4d573a65722d86723ae4c05bbb7b4a55d0
Binary files /dev/null and b/fairseq/models/bart/__pycache__/model.cpython-310.pyc differ
diff --git a/fairseq/models/bart/hub_interface.py b/fairseq/models/bart/hub_interface.py
new file mode 100644
index 0000000000000000000000000000000000000000..48c59cb91df3a693a68db1bd1ab7004f23b8dcfa
--- /dev/null
+++ b/fairseq/models/bart/hub_interface.py
@@ -0,0 +1,186 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import copy
+import logging
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from typing import List
+
+from fairseq import utils
+from fairseq.data import encoders
+
+
+logger = logging.getLogger(__name__)
+
+
+class BARTHubInterface(nn.Module):
+    """A simple PyTorch Hub interface to BART.
+
+    Usage: https://github.com/pytorch/fairseq/tree/master/examples/bart
+    """
+
+    def __init__(self, args, task, model):
+        """Keep ``args``/``task``/``model`` and build the BPE from ``args``."""
+        super().__init__()
+        self.args = args
+        self.task = task
+        self.model = model
+
+        self.bpe = encoders.build_bpe(args)
+
+        self.max_positions = min(utils.resolve_max_positions(
+            self.task.max_positions(), self.model.max_positions(),
+        ))
+
+        # this is useful for determining the device
+        self.register_buffer('_float_tensor', torch.tensor([0], dtype=torch.float))
+
+    @property
+    def device(self):
+        return self._float_tensor.device
+
+    def encode(self, sentence: str, *addl_sentences, no_separator=True) -> torch.LongTensor:
+        """
+        BPE-encode a sentence (or multiple sentences).
+
+        Every sequence begins with a beginning-of-sentence (`<s>`) symbol.
+        Every sentence ends with an end-of-sentence (`</s>`).
+        The first sentence is cut to at most `max_positions - 2` BPE tokens.
+
+        Example (single sentence): `<s> a b c </s>`
+        Example (sentence pair): `<s> d e f </s> 1 2 3 </s>`
+
+        The BPE encoding follows GPT-2. One subtle detail is that the GPT-2 BPE
+        requires leading spaces. For example::
+
+            >>> bart.encode('Hello world').tolist()
+            [0, 31414, 232, 2]
+            >>> bart.encode(' world').tolist()
+            [0, 232, 2]
+            >>> bart.encode('world').tolist()
+            [0, 8331, 2]
+        """
+        max_bpe_tokens = self.max_positions - 2  # leave room for <s> and </s>
+        bpe_tokens = self.bpe.encode(sentence).split(' ')
+        bpe_sentence = '<s> ' + ' '.join(bpe_tokens[:max_bpe_tokens]) + ' </s>'
+        for s in addl_sentences:
+            bpe_sentence += (' </s>' if not no_separator else '')
+            bpe_sentence += ' ' + self.bpe.encode(s) + ' </s>'
+        tokens = self.task.source_dictionary.encode_line(bpe_sentence, append_eos=False)
+        return tokens.long()
+
+    def decode(self, tokens: torch.LongTensor):
+        """Turn a 1-D token tensor into text (a list if split at `</s> </s>`)."""
+        assert tokens.dim() == 1
+        tokens = tokens.cpu().numpy()
+        if tokens[0] == self.task.source_dictionary.bos():
+            tokens = tokens[1:]  # remove <s>
+        eos_mask = (tokens == self.task.source_dictionary.eos())
+        # two consecutive </s> symbols mark a split point
+        doc_mask = eos_mask[1:] & eos_mask[:-1]
+        sentences = np.split(tokens, doc_mask.nonzero()[0] + 1)
+        sentences = [self.bpe.decode(self.task.source_dictionary.string(s)) for s in sentences]
+        return sentences[0] if len(sentences) == 1 else sentences
+
+    def _build_sample(self, src_tokens: List[torch.LongTensor]):
+        """Collate ``src_tokens`` into one sample moved to this module's device."""
+        dataset = self.task.build_dataset_for_inference(
+            src_tokens,
+            [x.numel() for x in src_tokens],
+        )
+        sample = dataset.collater(dataset)
+        sample = utils.apply_to_sample(
+            lambda tensor: tensor.to(self.device),
+            sample
+        )
+        return sample
+
+    def sample(self, sentences: List[str], beam: int = 1, verbose: bool = False, **kwargs) -> List[str]:
+        """Encode each sentence, generate, and decode each top hypothesis."""
+        hypos = self.generate([self.encode(s) for s in sentences], beam, verbose, **kwargs)
+        return [self.decode(x['tokens']) for x in hypos]
+
+    def generate(self, tokens: List[torch.LongTensor], beam: int = 5, verbose: bool = False, **kwargs) -> list:
+        """Generate with ``beam``; return the top hypothesis per input, sorted by id."""
+        sample = self._build_sample(tokens)
+
+        # build generator using current args as well as any kwargs
+        gen_args = copy.copy(self.args)
+        gen_args.beam = beam
+        for k, v in kwargs.items():
+            setattr(gen_args, k, v)
+        generator = self.task.build_generator([self.model], gen_args)
+        bos = self.task.source_dictionary.bos()
+        translations = self.task.inference_step(
+            generator,
+            [self.model],
+            sample,
+            prefix_tokens=sample['net_input']['src_tokens'].new_zeros((len(tokens), 1)).fill_(bos),
+        )
+
+        if verbose:
+            # this class defines no ``string`` helper, so use the dictionary's
+            for src_tokens in tokens:
+                logger.info('S\t{}'.format(self.task.source_dictionary.string(src_tokens)))
+
+        # Process top predictions; sort on the id only so hypos are never compared
+        hypos = [x[0] for x in translations]
+        hypos = [v for _, v in sorted(zip(sample['id'].tolist(), hypos), key=lambda pair: pair[0])]
+        return hypos
+
+    def extract_features(self, tokens: torch.LongTensor, return_all_hiddens: bool = False) -> torch.Tensor:
+        """Return the model's features, or all inner states if ``return_all_hiddens``."""
+        if tokens.dim() == 1:
+            tokens = tokens.unsqueeze(0)
+        if tokens.size(-1) > min(self.model.max_positions()):
+            raise ValueError('tokens exceeds maximum length: {} > {}'.format(
+                tokens.size(-1), self.model.max_positions()
+            ))
+        # Tensor.to() is not in-place, so its result has to be kept
+        tokens = tokens.to(device=self.device)
+        prev_output_tokens = tokens.clone()
+
+        # first column: the token at index (number of non-pad tokens - 1) of
+        # each row; remaining columns: the row shifted right by one position
+        last_idx = tokens.ne(self.task.source_dictionary.pad()).sum(dim=1) - 1
+        prev_output_tokens[:, 0] = tokens.gather(1, last_idx.unsqueeze(-1)).squeeze(-1)
+        prev_output_tokens[:, 1:] = tokens[:, :-1]
+
+        features, extra = self.model(
+            src_tokens=tokens,
+            src_lengths=None,
+            prev_output_tokens=prev_output_tokens,
+            features_only=True,
+            return_all_hiddens=return_all_hiddens,
+        )
+        if not return_all_hiddens:
+            return features  # just the last layer's features
+        # convert from T x B x C -> B x T x C
+        return [inner_state.transpose(0, 1) for inner_state in extra['inner_states']]
+
+    def register_classification_head(
+        self, name: str, num_classes: int = None, embedding_size: int = None, **kwargs
+    ):
+        """Forward all arguments to ``self.model.register_classification_head``."""
+        head_kwargs = dict(num_classes=num_classes, embedding_size=embedding_size, **kwargs)
+        self.model.register_classification_head(name, **head_kwargs)
+
+    def predict(self, head: str, tokens: torch.LongTensor, return_logits: bool = False):
+        """Apply classification head ``head``; log-softmax unless ``return_logits``."""
+        if tokens.dim() == 1:
+            tokens = tokens.unsqueeze(0)
+        tokens = tokens.to(device=self.device)  # keep the mask on the features' device
+        features = self.extract_features(tokens)
+        # per row, keep the features at the last `</s>` position
+        sentence_representation = features[
+            tokens.eq(self.task.source_dictionary.eos()), :
+        ].view(features.size(0), -1, features.size(-1))[:, -1, :]
+        logits = self.model.classification_heads[head](sentence_representation)
+        return logits if return_logits else F.log_softmax(logits, dim=-1)
diff --git a/fairseq/models/bart/model.py b/fairseq/models/bart/model.py
new file mode 100644
index 0000000000000000000000000000000000000000..62c495cb640dfd262ef2726144e38a9658b3409c
--- /dev/null
+++ b/fairseq/models/bart/model.py
@@ -0,0 +1,323 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+BART: Denoising Sequence-to-Sequence Pre-training for
+Natural Language Generation, Translation, and Comprehension
+"""
+
+import logging
+
+import torch
+import torch.nn as nn
+
+from fairseq import utils
+from fairseq.models import (
+    register_model,
+    register_model_architecture,
+)
+from fairseq.models.transformer import TransformerModel
+from fairseq.modules.transformer_sentence_encoder import init_bert_params
+
+from .hub_interface import BARTHubInterface
+
+
+logger = logging.getLogger(__name__)
+
+
+@register_model('bart')
+class BARTModel(TransformerModel):
+
+    @classmethod
+    def hub_models(cls):
+        return {
+            'bart.base': 'http://dl.fbaipublicfiles.com/fairseq/models/bart.base.tar.gz',
+            'bart.large': 'http://dl.fbaipublicfiles.com/fairseq/models/bart.large.tar.gz',
+            'bart.large.mnli': 'http://dl.fbaipublicfiles.com/fairseq/models/bart.large.mnli.tar.gz',
+            'bart.large.cnn': 'http://dl.fbaipublicfiles.com/fairseq/models/bart.large.cnn.tar.gz',
+            'bart.large.xsum': 'http://dl.fbaipublicfiles.com/fairseq/models/bart.large.xsum.tar.gz',
+        }
+
+    def __init__(self, args, encoder, decoder):
+        super().__init__(args, encoder, decoder)
+
+        # We follow BERT's random weight initialization
+        self.apply(init_bert_params)
+
+        self.classification_heads = nn.ModuleDict()
+
+    @staticmethod
+    def add_args(parser):
+        super(BARTModel, BARTModel).add_args(parser)
+        parser.add_argument(
+            '--pooler-dropout', type=float, metavar='D',
+            help='dropout probability in the masked_lm pooler layers'
+        )
+        parser.add_argument(
+            '--pooler-activation-fn',
+            choices=utils.get_available_activation_fns(),
+            help='activation function to use for pooler layer'
+        )
+
+    @property
+    def supported_targets(self):
+        return {'self'}
+
+    def forward(
+        self, src_tokens, src_lengths, prev_output_tokens,
+        features_only=False, classification_head_name=None, **kwargs
+    ):
+        if classification_head_name is not None:
+            features_only = True
+
+        encoder_out = self.encoder(
+            src_tokens,
+            src_lengths=src_lengths,
+            **kwargs,
+        )
+        x, extra = self.decoder(
+            prev_output_tokens,
+            encoder_out=encoder_out,
+            features_only=features_only,
+            **kwargs,
+        )
+
+        if classification_head_name is not None:
+            sentence_representation = x[
+                src_tokens.eq(self.encoder.dictionary.eos()), :
+            ].view(x.size(0), -1, x.size(-1))[:, -1, :]
+            x = self.classification_heads[classification_head_name](
+                sentence_representation
+            )
+        return x, extra
+
+    @classmethod
+    def from_pretrained(
+        cls,
+        model_name_or_path,
+        checkpoint_file='model.pt',
+        data_name_or_path='.',
+        bpe='gpt2',
+        **kwargs,
+    ):
+        from fairseq import hub_utils
+        x = hub_utils.from_pretrained(
+            model_name_or_path,
+            checkpoint_file,
+            data_name_or_path,
+            archive_map=cls.hub_models(),
+            bpe=bpe,
+            load_checkpoint_heads=True,
+            **kwargs,
+        )
+        return BARTHubInterface(x['args'], x['task'], x['models'][0])
+
+    def register_classification_head(self, name, num_classes=None, inner_dim=None, **kwargs):
+        """Register a classification head."""
+        logger.info("Registering classification head: {0}".format(name))
+        if name in self.classification_heads:
+            prev_num_classes = self.classification_heads[name].out_proj.out_features
+            prev_inner_dim = self.classification_heads[name].dense.out_features
+            if num_classes != prev_num_classes or inner_dim != prev_inner_dim:
+                logger.warning(
+                    're-registering head "{}" with num_classes {} (prev: {}) '
+                    'and inner_dim {} (prev: {})'.format(
+                        name, num_classes, prev_num_classes, inner_dim, prev_inner_dim
+                    )
+                )
+        self.classification_heads[name] = BARTClassificationHead(
+            self.args.encoder_embed_dim,
+            inner_dim or self.args.encoder_embed_dim,
+            num_classes,
+            self.args.pooler_activation_fn,
+            self.args.pooler_dropout,
+        )
+
+    def upgrade_state_dict_named(self, state_dict, name):
+        super().upgrade_state_dict_named(state_dict, name)
+
+        prefix = name + '.' if name != '' else ''
+        current_head_names = [] if not hasattr(self, 'classification_heads') else \
+            self.classification_heads.keys()
+
+        # Handle new classification heads present in the state dict.
+        keys_to_delete = []
+        for k in state_dict.keys():
+            if not k.startswith(prefix + 'classification_heads.'):
+                continue
+
+            head_name = k[len(prefix + 'classification_heads.'):].split('.')[0]
+            num_classes = state_dict[prefix + 'classification_heads.' + head_name + '.out_proj.weight'].size(0)
+            inner_dim = state_dict[prefix + 'classification_heads.' + head_name + '.dense.weight'].size(0)
+
+            if getattr(self.args, 'load_checkpoint_heads', False):
+                if head_name not in current_head_names:
+                    self.register_classification_head(head_name, num_classes, inner_dim)
+            else:
+                if head_name not in current_head_names:
+                    logger.warning(
+                        'deleting classification head ({}) from checkpoint '
+                        'not present in current model: {}'.format(head_name, k)
+                    )
+                    keys_to_delete.append(k)
+                elif (
+                    num_classes != self.classification_heads[head_name].out_proj.out_features
+                    or inner_dim != self.classification_heads[head_name].dense.out_features
+                ):
+                    logger.warning(
+                        'deleting classification head ({}) from checkpoint '
+                        'with different dimensions than current model: {}'.format(head_name, k)
+                    )
+                    keys_to_delete.append(k)
+        for k in keys_to_delete:
+            del state_dict[k]
+
+        def truncate_emb(key):
+            if key in state_dict:
+                state_dict[key] = state_dict[key][:-1, :]
+
+        # When finetuning on translation task, remove last row of
+        # embedding matrix that corresponds to mask_idx token.
+        loaded_dict_size = state_dict['encoder.embed_tokens.weight'].size(0)
+        if loaded_dict_size == len(self.encoder.dictionary) + 1 and '<mask>' not in self.encoder.dictionary:
+            truncate_emb('encoder.embed_tokens.weight')
+            truncate_emb('decoder.embed_tokens.weight')
+            truncate_emb('encoder.output_projection.weight')
+            truncate_emb('decoder.output_projection.weight')
+
+        # When continued pretraining on new set of languages for mbart,
+        # add extra lang embeddings at the end of embed_tokens.
+        # Note: newly added languages are assumed to have been added at the end.
+        if self.args.task == 'multilingual_denoising' and loaded_dict_size < len(self.encoder.dictionary):
+            logger.info(
+                "Adding extra language embeddings not found in pretrained model for "\
+                "continued pretraining of MBART on new set of languages."
+            )
+            loaded_mask_token_embedding = state_dict['encoder.embed_tokens.weight'][-1, :]
+
+            num_langids_to_add = len(self.encoder.dictionary) - loaded_dict_size
+            embed_dim = state_dict['encoder.embed_tokens.weight'].size(1)
+
+            new_lang_embed_to_add = torch.zeros(num_langids_to_add, embed_dim)
+            nn.init.normal_(
+                new_lang_embed_to_add,
+                mean=0,
+                std=embed_dim ** -0.5
+            )
+            new_lang_embed_to_add = new_lang_embed_to_add.to(
+                dtype=state_dict['encoder.embed_tokens.weight'].dtype,
+            )
+
+            state_dict['encoder.embed_tokens.weight'] = torch.cat([
+                state_dict['encoder.embed_tokens.weight'][:loaded_dict_size-1, :],
+                new_lang_embed_to_add,
+                loaded_mask_token_embedding.unsqueeze(0)]
+            )
+            state_dict['decoder.embed_tokens.weight'] = torch.cat([
+                state_dict['decoder.embed_tokens.weight'][:loaded_dict_size-1, :],
+                new_lang_embed_to_add,
+                loaded_mask_token_embedding.unsqueeze(0)]
+            )
+
+        # Copy any newly-added classification heads into the state dict
+        # with their current weights.
+        if hasattr(self, 'classification_heads'):
+            cur_state = self.classification_heads.state_dict()
+            for k, v in cur_state.items():
+                if prefix + 'classification_heads.' + k not in state_dict:
+                    logger.info('Overwriting', prefix + 'classification_heads.' + k)
+                    state_dict[prefix + 'classification_heads.' + k] = v
+
+
+class BARTClassificationHead(nn.Module):
+    """Head for sentence-level classification tasks."""
+
+    def __init__(
+        self,
+        input_dim,
+        inner_dim,
+        num_classes,
+        activation_fn,
+        pooler_dropout,
+    ):
+        super().__init__()
+        self.dense = nn.Linear(input_dim, inner_dim)
+        self.activation_fn = utils.get_activation_fn(activation_fn)
+        self.dropout = nn.Dropout(p=pooler_dropout)
+        self.out_proj = nn.Linear(inner_dim, num_classes)
+
+    def forward(self, features, **kwargs):
+        x = features
+        x = self.dropout(x)
+        x = self.dense(x)
+        x = self.activation_fn(x)
+        x = self.dropout(x)
+        x = self.out_proj(x)
+        return x
+
+
+@register_model_architecture('bart', 'bart_large')
+def bart_large_architecture(args):
+    args.encoder_embed_path = getattr(args, 'encoder_embed_path', None)
+    args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 1024)
+    args.encoder_ffn_embed_dim = getattr(args, 'encoder_ffn_embed_dim', 4*1024)
+    args.encoder_layers = getattr(args, 'encoder_layers', 12)
+    args.encoder_attention_heads = getattr(args, 'encoder_attention_heads', 16)
+    args.encoder_normalize_before = getattr(args, 'encoder_normalize_before', False)
+    args.encoder_learned_pos = getattr(args, 'encoder_learned_pos', True)
+    args.decoder_embed_path = getattr(args, 'decoder_embed_path', None)
+    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', args.encoder_embed_dim)
+    args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', args.encoder_ffn_embed_dim)
+    args.decoder_layers = getattr(args, 'decoder_layers', 12)
+    args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 16)
+    args.decoder_normalize_before = getattr(args, 'decoder_normalize_before', False)
+    args.decoder_learned_pos = getattr(args, 'decoder_learned_pos', True)
+    args.attention_dropout = getattr(args, 'attention_dropout', 0.)
+    args.relu_dropout = getattr(args, 'relu_dropout', 0.)
+    args.dropout = getattr(args, 'dropout', 0.1)
+    args.max_target_positions = getattr(args, 'max_target_positions', 1024)
+    args.max_source_positions = getattr(args, 'max_source_positions', 1024)
+    args.adaptive_softmax_cutoff = getattr(args, 'adaptive_softmax_cutoff', None)
+    args.adaptive_softmax_dropout = getattr(args, 'adaptive_softmax_dropout', 0)
+    args.share_decoder_input_output_embed = getattr(args, 'share_decoder_input_output_embed', True)
+    args.share_all_embeddings = getattr(args, 'share_all_embeddings', True)
+
+    args.decoder_output_dim = getattr(args, 'decoder_output_dim', args.decoder_embed_dim)
+    args.decoder_input_dim = getattr(args, 'decoder_input_dim', args.decoder_embed_dim)
+
+    args.no_scale_embedding = getattr(args, 'no_scale_embedding', True)
+    args.layernorm_embedding = getattr(args, 'layernorm_embedding', True)
+
+    args.activation_fn = getattr(args, 'activation_fn', 'gelu')
+    args.pooler_activation_fn = getattr(args, 'pooler_activation_fn', 'tanh')
+    args.pooler_dropout = getattr(args, 'pooler_dropout', 0.0)
+
+
+@register_model_architecture('bart', 'bart_base')
+def bart_base_architecture(args):
+    args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 768)
+    args.encoder_ffn_embed_dim = getattr(args, 'encoder_ffn_embed_dim', 4*768)
+    args.encoder_layers = getattr(args, 'encoder_layers', 6)
+    args.encoder_attention_heads = getattr(args, 'encoder_attention_heads', 12)
+    args.decoder_layers = getattr(args, 'decoder_layers', 6)
+    args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 12)
+    bart_large_architecture(args)
+
+
+@register_model_architecture('bart', 'mbart_large')
+def mbart_large_architecture(args):
+    args.no_scale_embedding = getattr(args, 'no_scale_embedding', False)  # bart_large would default to True
+    bart_large_architecture(args)
+
+
+@register_model_architecture('bart', 'mbart_base')
+def mbart_base_architecture(args):
+    args.no_scale_embedding = getattr(args, 'no_scale_embedding', False)  # set first so the bart_large default (True) is skipped
+    bart_base_architecture(args)
+
+
+@register_model_architecture('bart', 'mbart_base_wmt20')
+def mbart_base_wmt20_architecture(args):
+    args.layernorm_embedding = getattr(args, 'layernorm_embedding', False)  # bart_large would default to True
+    mbart_base_architecture(args)
diff --git a/fairseq/models/composite_encoder.py b/fairseq/models/composite_encoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..afef248cdcc5e657350ce37c8ba434bc01d70558
--- /dev/null
+++ b/fairseq/models/composite_encoder.py
@@ -0,0 +1,55 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from fairseq.models import FairseqEncoder
+
+
+class CompositeEncoder(FairseqEncoder):
+    """
+    A wrapper around a dictionary of :class:`FairseqEncoder` objects.
+
+    We run forward on each encoder and return a dictionary of outputs. The first
+    encoder's dictionary is used for initialization.
+
+    Args:
+        encoders (dict): a dictionary of :class:`FairseqEncoder` objects.
+    """
+
+    def __init__(self, encoders):
+        super().__init__(next(iter(encoders.values())).dictionary)
+        self.encoders = encoders
+        for key in self.encoders:
+            self.add_module(key, self.encoders[key])
+
+    def forward(self, src_tokens, src_lengths):
+        """
+        Args:
+            src_tokens (LongTensor): tokens in the source language of shape
+                `(batch, src_len)`
+            src_lengths (LongTensor): lengths of each source sentence of shape
+                `(batch)`
+
+        Returns:
+            dict:
+                the outputs from each Encoder
+        """
+        encoder_out = {}
+        for key in self.encoders:
+            encoder_out[key] = self.encoders[key](src_tokens, src_lengths)
+        return encoder_out
+
+    def reorder_encoder_out(self, encoder_out, new_order):
+        """Reorder encoder output according to new_order."""
+        for key in self.encoders:
+            encoder_out[key] = self.encoders[key].reorder_encoder_out(encoder_out[key], new_order)
+        return encoder_out
+
+    def max_positions(self):
+        return min(self.encoders[key].max_positions() for key in self.encoders)
+
+    def upgrade_state_dict(self, state_dict):
+        for key in self.encoders:
+            self.encoders[key].upgrade_state_dict(state_dict)
+        return state_dict
diff --git a/fairseq/models/distributed_fairseq_model.py b/fairseq/models/distributed_fairseq_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..dd74bf1f1344f0fbafa6b13bbe8f43b343561100
--- /dev/null
+++ b/fairseq/models/distributed_fairseq_model.py
@@ -0,0 +1,105 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import inspect
+
+import torch.nn as nn
+
+from fairseq.legacy_distributed_data_parallel import LegacyDistributedDataParallel
+from fairseq.models import BaseFairseqModel
+
+
+_GOSSIP_DISABLED = False
+try:
+    import gossip
+except ImportError:
+    _GOSSIP_DISABLED = True
+
+
+def DistributedFairseqModel(args, model, process_group=None):
+    """
+    Wrap a *model* to support distributed data parallel training.
+
+    This is similar to the built-in DistributedDataParallel, but allows
+    additional configuration of the DistributedDataParallel class to
+    use, and also provides easier access to the wrapped model by
+    forwarding requests for missing attributes to the wrapped model.
+
+    Args:
+        args (argparse.Namespace): fairseq args
+        model (BaseFairseqModel): model to wrap
+    """
+    # determine which DDP class to extend
+    assert isinstance(model, nn.Module)
+    if args.distributed_wrapper == 'DDP' and args.ddp_backend == 'c10d':
+        ddp_class = nn.parallel.DistributedDataParallel
+        init_kwargs = dict(
+            module=model,
+            device_ids=[args.device_id],
+            output_device=args.device_id,
+            broadcast_buffers=args.broadcast_buffers,
+            bucket_cap_mb=args.bucket_cap_mb,
+            process_group=process_group,
+        )
+        # Maintain backward compatibility
+        if 'check_reduction' in inspect.getargspec(ddp_class)[0]:
+            init_kwargs['check_reduction'] = True
+        if 'find_unused_parameters' in inspect.getargspec(ddp_class)[0]:
+            init_kwargs['find_unused_parameters'] = args.find_unused_parameters
+    elif args.distributed_wrapper == 'DDP' and args.ddp_backend == 'no_c10d':
+        ddp_class = LegacyDistributedDataParallel
+        init_kwargs = dict(
+            module=model,
+            world_size=args.distributed_world_size,
+            buffer_size=2**28,
+            process_group=process_group,
+        )
+    elif args.distributed_wrapper == 'SlowMo':
+        if _GOSSIP_DISABLED:
+            raise ImportError(
+                'Cannot find gossip library. Please install from: '
+                'github.com/facebookresearch/stochastic_gradient_push'
+            )
+        ddp_class = gossip.GossipDataParallel
+
+        # The values of slowmo_momentum below were obtained by tuning on the
+        # En-De 16 dataset by training the transformer_wmt_en_de_large model
+        if args.slowmo_momentum is None:
+            if args.distributed_world_size <= 16:
+                args.slowmo_momentum = 0.0
+            elif args.distributed_world_size <= 32:
+                args.slowmo_momentum = 0.2
+            elif args.distributed_world_size <= 64:
+                args.slowmo_momentum = 0.5
+            else:
+                args.slowmo_momentum = 0.6
+
+        init_kwargs = dict(
+            module=model,
+            device_ids=[args.device_id],
+            output_device=args.device_id,
+            broadcast_buffers=args.broadcast_buffers,
+            nprocs_per_node=args.nprocs_per_node,
+            slowmo_momentum=args.slowmo_momentum,
+            localsgd=(args.slowmo_algorithm == 'LocalSGD'),
+            localsgd_frequency=args.localsgd_frequency
+        )
+    else:
+        raise ValueError('Unknown --ddp-backend: ' + args.ddp_backend)
+
+    class _DistributedFairseqModel(ddp_class):
+        """Extend DistributedDataParallel to check for missing
+        attributes in the wrapped module."""
+
+        def __init__(self, *args, **kwargs):
+            super().__init__(*args, **kwargs)
+
+        def __getattr__(self, name):
+            wrapped_module = super().__getattr__('module')
+            if hasattr(wrapped_module, name):
+                return getattr(wrapped_module, name)
+            return super().__getattr__(name)
+
+    return _DistributedFairseqModel(**init_kwargs)
diff --git a/fairseq/models/fairseq_decoder.py b/fairseq/models/fairseq_decoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..fb6c52dc7ffd95c63e0b43512db398cbb8b91582
--- /dev/null
+++ b/fairseq/models/fairseq_decoder.py
@@ -0,0 +1,90 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Dict, List, Optional, Tuple
+
+import torch.nn as nn
+from fairseq import utils
+from torch import Tensor
+
+
+class FairseqDecoder(nn.Module):
+    """Base class for decoders."""
+
+    def __init__(self, dictionary):
+        super().__init__()
+        self.dictionary = dictionary
+        self.onnx_trace = False  # switched on by prepare_for_onnx_export_()
+
+    def forward(self, prev_output_tokens, encoder_out=None, **kwargs):
+        """
+        Args:
+            prev_output_tokens (LongTensor): shifted output tokens of shape
+                `(batch, tgt_len)`, for teacher forcing
+            encoder_out (dict, optional): output from the encoder, used for
+                encoder-side attention
+
+        Returns:
+            tuple:
+                - the decoder's output of shape `(batch, tgt_len, vocab)`
+                - a dictionary with any model-specific outputs
+        """
+        x, extra = self.extract_features(
+            prev_output_tokens, encoder_out=encoder_out, **kwargs
+        )
+        x = self.output_layer(x)
+        return x, extra
+
+    def extract_features(self, prev_output_tokens, encoder_out=None, **kwargs):
+        """
+        Returns:
+            tuple:
+                - the decoder's features of shape `(batch, tgt_len, embed_dim)`
+                - a dictionary with any model-specific outputs
+        """
+        raise NotImplementedError
+
+    def output_layer(self, features, **kwargs):
+        """
+        Project features to the default output size, e.g., vocabulary size.
+
+        Args:
+            features (Tensor): features returned by *extract_features*.
+        """
+        raise NotImplementedError
+
+    def get_normalized_probs(
+        self,
+        net_output: Tuple[Tensor, Optional[Dict[str, List[Optional[Tensor]]]]],
+        log_probs: bool,
+        sample: Optional[Dict[str, Tensor]] = None,
+    ):
+        """Get normalized probabilities (or log probs) from a net's output."""
+
+        if hasattr(self, "adaptive_softmax") and self.adaptive_softmax is not None:
+            if sample is not None:
+                assert "target" in sample
+                target = sample["target"]
+            else:
+                target = None
+            out = self.adaptive_softmax.get_log_prob(net_output[0], target=target)
+            return out.exp_() if not log_probs else out  # exp_() converts the log-probs in place
+
+        logits = net_output[0]
+        if log_probs:
+            return utils.log_softmax(logits, dim=-1, onnx_trace=self.onnx_trace)
+        else:
+            return utils.softmax(logits, dim=-1, onnx_trace=self.onnx_trace)
+
+    def max_positions(self):
+        """Maximum input length supported by the decoder."""
+        return 1e6  # an arbitrary large number
+
+    def upgrade_state_dict(self, state_dict):
+        """Upgrade a (possibly old) state dict for new versions of fairseq."""
+        return state_dict
+
+    def prepare_for_onnx_export_(self):
+        self.onnx_trace = True  # read by get_normalized_probs()
diff --git a/fairseq/models/fairseq_encoder.py b/fairseq/models/fairseq_encoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ddc0fba01ffd6ce0fa7b2bdb2761ea01392bbcf
--- /dev/null
+++ b/fairseq/models/fairseq_encoder.py
@@ -0,0 +1,91 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import torch.nn as nn
+from typing import Dict, List, NamedTuple, Optional
+from torch import Tensor
+
+EncoderOut = NamedTuple(
+    "EncoderOut",
+    [
+        ("encoder_out", Tensor),  # T x B x C
+        ("encoder_padding_mask", Optional[Tensor]),  # B x T
+        ("encoder_embedding", Optional[Tensor]),  # B x T x C
+        ("encoder_states", Optional[List[Tensor]]),  # List[T x B x C]
+        ("src_tokens", Optional[Tensor]),  # B x T
+        ("src_lengths", Optional[Tensor]),  # B x 1
+    ],
+)
+
+
+class FairseqEncoder(nn.Module):
+    """Base class for encoders."""
+
+    def __init__(self, dictionary):
+        super().__init__()
+        self.dictionary = dictionary
+
+    def forward(self, src_tokens, src_lengths=None, **kwargs):
+        """
+        Args:
+            src_tokens (LongTensor): tokens in the source language of shape
+                `(batch, src_len)`
+            src_lengths (LongTensor): lengths of each source sentence of shape
+                `(batch)`
+        """
+        raise NotImplementedError
+
+    def forward_torchscript(self, net_input: Dict[str, Tensor]):
+        """A TorchScript-compatible version of forward.
+
+        Encoders which use additional arguments may want to override
+        this method for TorchScript compatibility.
+        """
+        if torch.jit.is_scripting():
+            return self.forward(
+                src_tokens=net_input["src_tokens"],
+                src_lengths=net_input["src_lengths"],
+            )
+        else:
+            return self.forward_non_torchscript(net_input)
+
+    @torch.jit.unused
+    def forward_non_torchscript(self, net_input: Dict[str, Tensor]):
+        encoder_input = {
+            k: v
+            for k, v in net_input.items()
+            if k != "prev_output_tokens"  # every other entry becomes a forward() kwarg
+        }
+        return self.forward(**encoder_input)
+
+    def reorder_encoder_out(self, encoder_out, new_order):
+        """
+        Reorder encoder output according to `new_order`.
+
+        Args:
+            encoder_out: output from the ``forward()`` method
+            new_order (LongTensor): desired order
+
+        Returns:
+            `encoder_out` rearranged according to `new_order`
+        """
+        raise NotImplementedError
+
+    def max_positions(self):
+        """Maximum input length supported by the encoder."""
+        return 1e6  # an arbitrary large number
+
+    def upgrade_state_dict(self, state_dict):
+        """Upgrade a (possibly old) state dict for new versions of fairseq."""
+        return state_dict
+
+    def set_num_updates(self, num_updates):
+        """State from trainer to pass along to model at every update."""
+
+        def _apply(m):
+            if hasattr(m, 'set_num_updates') and m != self:  # skip self: apply() visits it too
+                m.set_num_updates(num_updates)
+        self.apply(_apply)
diff --git a/fairseq/models/fairseq_incremental_decoder.py b/fairseq/models/fairseq_incremental_decoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..68e583fea8c368f01a493fab41dc275cb18c76e2
--- /dev/null
+++ b/fairseq/models/fairseq_incremental_decoder.py
@@ -0,0 +1,112 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+from typing import Dict, Optional
+
+from torch import Tensor
+
+from fairseq.models import FairseqDecoder
+from fairseq.incremental_decoding_utils import with_incremental_state
+
+
+logger = logging.getLogger(__name__)
+
+
+@with_incremental_state
+class FairseqIncrementalDecoder(FairseqDecoder):
+    """Base class for incremental decoders.
+
+    Incremental decoding is a special mode at inference time where the Model
+    only receives a single timestep of input corresponding to the previous
+    output token (for teacher forcing) and must produce the next output
+    *incrementally*. Thus the model must cache any long-term state that is
+    needed about the sequence, e.g., hidden states, convolutional states, etc.
+
+    Compared to the standard :class:`FairseqDecoder` interface, the incremental
+    decoder interface allows :func:`forward` functions to take an extra keyword
+    argument (*incremental_state*) that can be used to cache state across
+    time-steps.
+
+    The :class:`FairseqIncrementalDecoder` interface also defines the
+    :func:`reorder_incremental_state` method, which is used during beam search
+    to select and reorder the incremental state based on the selection of beams.
+
+    To learn more about how incremental decoding works, refer to `this blog
+    <http://www.telesens.co/2019/04/21/understanding-incremental-decoding-in-fairseq/>`_.
+    """
+
+    def __init__(self, dictionary):
+        super().__init__(dictionary)
+
+    def forward(self, prev_output_tokens, encoder_out=None, incremental_state=None, **kwargs):
+        """
+        Args:
+            prev_output_tokens (LongTensor): shifted output tokens of shape
+                `(batch, tgt_len)`, for teacher forcing
+            encoder_out (dict, optional): output from the encoder, used for
+                encoder-side attention
+            incremental_state (dict, optional): dictionary used for storing
+                state during :ref:`Incremental decoding`
+
+        Returns:
+            tuple:
+                - the decoder's output of shape `(batch, tgt_len, vocab)`
+                - a dictionary with any model-specific outputs
+        """
+        raise NotImplementedError
+
+    def extract_features(self, prev_output_tokens, encoder_out=None, incremental_state=None, **kwargs):
+        """
+        Returns:
+            tuple:
+                - the decoder's features of shape `(batch, tgt_len, embed_dim)`
+                - a dictionary with any model-specific outputs
+        """
+        raise NotImplementedError
+
+    def reorder_incremental_state(
+        self,
+        incremental_state: Dict[str, Dict[str, Optional[Tensor]]],
+        new_order: Tensor,
+    ):
+        """Reorder incremental state.
+
+        This will be called when the order of the input has changed from the
+        previous time step. A typical use case is beam search, where the input
+        order changes between time steps based on the selection of beams.
+        """
+        pass  # default: nothing to reorder
+
+    def reorder_incremental_state_scripting(
+        self,
+        incremental_state: Dict[str, Dict[str, Optional[Tensor]]],
+        new_order: Tensor,
+    ):
+        """Main entry point for reordering the incremental state.
+
+        Due to limitations in TorchScript, we call this function in
+        :class:`fairseq.sequence_generator.SequenceGenerator` instead of
+        calling :func:`reorder_incremental_state` directly.
+        """
+        for module in self.modules():  # modules() includes self
+            if hasattr(module, 'reorder_incremental_state'):
+                result = module.reorder_incremental_state(incremental_state, new_order)
+                if result is not None:  # a module may hand back a replacement dict
+                    incremental_state = result
+
+    def set_beam_size(self, beam_size):
+        """Sets the beam size in the decoder and all children."""
+        if getattr(self, '_beam_size', -1) != beam_size:  # propagate only on change
+            seen = set()
+
+            def apply_set_beam_size(module):
+                if module != self and hasattr(module, 'set_beam_size') \
+                        and module not in seen:
+                    seen.add(module)
+                    module.set_beam_size(beam_size)
+
+            self.apply(apply_set_beam_size)
+            self._beam_size = beam_size
diff --git a/fairseq/models/fairseq_model.py b/fairseq/models/fairseq_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..5cf6cba1189d94da32c4ea262e1eabaf9c82e34d
--- /dev/null
+++ b/fairseq/models/fairseq_model.py
@@ -0,0 +1,540 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+Base classes for various fairseq models.
+"""
+
+import logging
+from typing import Dict, List, Optional, Tuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from fairseq import utils
+from fairseq.checkpoint_utils import prune_state_dict
+from fairseq.data import Dictionary
+from fairseq.models import FairseqDecoder, FairseqEncoder
+from torch import Tensor
+
+
+logger = logging.getLogger(__name__)
+
+
+class BaseFairseqModel(nn.Module):
+    """Base class for fairseq models."""
+
+    def __init__(self):
+        super().__init__()
+        self._is_generation_fast = False
+
+    @staticmethod
+    def add_args(parser):
+        """Add model-specific arguments to the parser."""
+        pass
+
+    @classmethod
+    def build_model(cls, args, task):
+        """Build a new model instance."""
+        raise NotImplementedError("Model must implement the build_model method")
+
+    def get_targets(self, sample, net_output):
+        """Get targets from either the sample or the net's output."""
+        return sample["target"]
+
+    def get_normalized_probs(
+        self,
+        net_output: Tuple[Tensor, Optional[Dict[str, List[Optional[Tensor]]]]],
+        log_probs: bool,
+        sample: Optional[Dict[str, Tensor]] = None,
+    ):
+        """Get normalized probabilities (or log probs) from a net's output."""
+        return self.get_normalized_probs_scriptable(net_output, log_probs, sample)
+
+    # TorchScript doesn't support super() method so that the scriptable Subclass
+    # can't access the base class model in Torchscript.
+    # Current workaround is to add a helper function with different name and
+    # call the helper function from scriptable Subclass.
+    def get_normalized_probs_scriptable(
+        self,
+        net_output: Tuple[Tensor, Optional[Dict[str, List[Optional[Tensor]]]]],
+        log_probs: bool,
+        sample: Optional[Dict[str, Tensor]] = None,
+    ):
+        """Scriptable helper function for get_normalized_probs in ~BaseFairseqModel"""
+        if hasattr(self, "decoder"):
+            return self.decoder.get_normalized_probs(net_output, log_probs, sample)
+        elif torch.is_tensor(net_output):
+            logits = net_output.float()
+            if log_probs:
+                return F.log_softmax(logits, dim=-1)
+            else:
+                return F.softmax(logits, dim=-1)
+        raise NotImplementedError
+
+    def extract_features(self, *args, **kwargs):
+        """Similar to *forward* but only return features."""
+        return self(*args, **kwargs)
+
+    def max_positions(self):
+        """Maximum length supported by the model."""
+        return None
+
+    def load_state_dict(self, state_dict, strict=True, args=None):
+        """Copies parameters and buffers from *state_dict* into this module and
+        its descendants.
+
+        Overrides the method in :class:`nn.Module`. Compared with that method
+        this additionally "upgrades" *state_dicts* from old checkpoints.
+        """
+        self.upgrade_state_dict(state_dict)
+        new_state_dict = prune_state_dict(state_dict, args)
+        return super().load_state_dict(new_state_dict, strict)
+
+    def upgrade_state_dict(self, state_dict):
+        """Upgrade old state dicts to work with newer code."""
+        self.upgrade_state_dict_named(state_dict, "")
+
+    def upgrade_state_dict_named(self, state_dict, name):
+        """Bring a checkpoint written by older code up to date, in place.
+
+        Args:
+            state_dict (dict): state dictionary to upgrade; modified in place
+            name (str): state-dict key prefix of the current module
+        """
+        assert state_dict is not None
+
+        def _visit(module, prefix):
+            # keys are dotted paths, so a non-empty prefix needs a separator
+            dotted = prefix + "." if len(prefix) > 0 else prefix
+            for child_name, child in module.named_children():
+                child_key = dotted + child_name
+                # prefer the prefix-aware hook; fall back to the legacy one
+                if hasattr(child, "upgrade_state_dict_named"):
+                    child.upgrade_state_dict_named(state_dict, child_key)
+                elif hasattr(child, "upgrade_state_dict"):
+                    child.upgrade_state_dict(state_dict)
+                _visit(child, child_key)
+
+        _visit(self, name)
+
+    def set_num_updates(self, num_updates):
+        """Forward the trainer's update counter to every submodule that accepts it."""
+        def _forward_count(submodule):
+            wants_count = hasattr(submodule, 'set_num_updates')
+            if wants_count and submodule != self:  # skip self: it would recurse forever
+                submodule.set_num_updates(num_updates)
+        self.apply(_forward_count)
+
+    def prepare_for_inference_(self, args):
+        """Translate generation *args* into ``make_generation_fast_`` keyword options."""
+        if getattr(args, 'no_beamable_mm', False):
+            beam_size = None
+        else:
+            beam_size = getattr(args, 'beam', 5)
+        options = {
+            'beamable_mm_beam_size': beam_size,
+            'need_attn': getattr(args, 'print_alignment', False),
+        }
+        if hasattr(args, 'retain_dropout'):  # only forwarded when the flag exists
+            options['retain_dropout'] = args.retain_dropout
+            options['retain_dropout_modules'] = getattr(args, 'retain_dropout_modules', None)
+        self.make_generation_fast_(**options)
+
+    def make_generation_fast_(self, **kwargs):
+        """
+        Legacy one-shot inference optimisation (prefer prepare_for_inference_).
+        Strips weight norm, notifies submodules with **kwargs, then locks eval mode.
+        """
+        if self._is_generation_fast:
+            return  # only apply once
+        self._is_generation_fast = True
+
+        # remove weight norm from all modules in the network
+        def apply_remove_weight_norm(module):
+            try:
+                nn.utils.remove_weight_norm(module)
+            except ValueError:  # this module didn't have weight norm
+                return
+
+        self.apply(apply_remove_weight_norm)
+
+        def apply_make_generation_fast_(module, prefix):
+            if len(prefix) > 0:
+                prefix += "."
+            base_func = BaseFairseqModel.make_generation_fast_
+            for n, m in module.named_modules():
+                if (
+                    m != self
+                    and hasattr(m, "make_generation_fast_")
+                    # don't call this implementation again, e.g., if
+                    # children modules also inherit from BaseFairseqModel
+                    and m.make_generation_fast_.__func__ is not base_func
+                ):
+                    name = prefix + n
+                    m.make_generation_fast_(name=name, **kwargs)
+
+        apply_make_generation_fast_(self, "")
+
+        def train(mode=True):
+            if mode:
+                raise RuntimeError("cannot train after make_generation_fast")
+            return self  # nn.Module.train()/eval() must return the module for chaining
+
+        # this model should no longer be used for training
+        self.eval()
+        self.train = train
+
+    def prepare_for_onnx_export_(self, **kwargs):
+        """Call ``prepare_for_onnx_export_(**kwargs)`` on each submodule defining it, once."""
+        handled = set()
+
+        def _prepare_once(submodule):
+            # skip self (this method would re-enter) and modules without the hook
+            if submodule == self or not hasattr(submodule, "prepare_for_onnx_export_"):
+                return
+            if submodule in handled:
+                return
+            handled.add(submodule)
+            submodule.prepare_for_onnx_export_(**kwargs)
+
+        self.apply(_prepare_once)
+
+    def prepare_for_tpu_(self, **kwargs):
+        """Call ``prepare_for_tpu_(**kwargs)`` on each submodule defining it, once each."""
+        done = set()
+
+        def _prepare_once(submodule):
+            has_hook = submodule != self and hasattr(submodule, "prepare_for_tpu_")
+            if not has_hook or submodule in done:
+                # self is excluded because it would re-enter this method;
+                # a module shared by several parents is only prepared once
+                return
+            done.add(submodule)
+            submodule.prepare_for_tpu_(**kwargs)
+
+        self.apply(_prepare_once)
+
+    @classmethod
+    def from_pretrained(
+        cls,
+        model_name_or_path,
+        checkpoint_file="model.pt",
+        data_name_or_path=".",
+        **kwargs,
+    ):
+        """
+        Load a :class:`~fairseq.models.FairseqModel` from a pre-trained model
+        file. Downloads and caches the pre-trained model file if needed.
+
+        The base implementation returns a
+        :class:`~fairseq.hub_utils.GeneratorHubInterface`, which can be used to
+        generate translations or sample from language models. The underlying
+        :class:`~fairseq.models.FairseqModel` can be accessed via the
+        *generator.models* attribute.
+
+        Other models may override this to implement custom hub interfaces.
+
+        Args:
+            model_name_or_path (str): either the name of a pre-trained model to
+                load or a path/URL to a pre-trained model state dict
+            checkpoint_file (str, optional): colon-separated list of checkpoint
+                files in the model archive to ensemble (default: 'model.pt')
+            data_name_or_path (str, optional): point args.data to the archive
+                at the given path/URL. Can start with '.' or './' to reuse the
+                model archive path.
+        """
+        from fairseq import hub_utils
+
+        x = hub_utils.from_pretrained(
+            model_name_or_path,
+            checkpoint_file,
+            data_name_or_path,
+            archive_map=cls.hub_models(),
+            **kwargs,
+        )
+        logger.info(x["args"])
+        return hub_utils.GeneratorHubInterface(x["args"], x["task"], x["models"])
+
+    @classmethod
+    def hub_models(cls):
+        return {}
+
+
+class FairseqEncoderDecoderModel(BaseFairseqModel):
+    """Base class for encoder-decoder models.
+
+    Args:
+        encoder (FairseqEncoder): the encoder
+        decoder (FairseqDecoder): the decoder
+    """
+
+    def __init__(self, encoder, decoder):
+        super().__init__()
+
+        self.encoder = encoder
+        self.decoder = decoder
+        assert isinstance(self.encoder, FairseqEncoder)
+        assert isinstance(self.decoder, FairseqDecoder)
+
+    def forward(self, src_tokens, src_lengths, prev_output_tokens, **kwargs):
+        """
+        Run the forward pass for an encoder-decoder model.
+
+        First feed a batch of source tokens through the encoder. Then, feed the
+        encoder output and previous decoder outputs (i.e., teacher forcing) to
+        the decoder to produce the next outputs::
+
+            encoder_out = self.encoder(src_tokens, src_lengths)
+            return self.decoder(prev_output_tokens, encoder_out)
+
+        Args:
+            src_tokens (LongTensor): tokens in the source language of shape
+                `(batch, src_len)`
+            src_lengths (LongTensor): source sentence lengths of shape `(batch)`
+            prev_output_tokens (LongTensor): previous decoder outputs of shape
+                `(batch, tgt_len)`, for teacher forcing
+
+        Returns:
+            tuple:
+                - the decoder's output of shape `(batch, tgt_len, vocab)`
+                - a dictionary with any model-specific outputs
+        """
+        encoder_out = self.encoder(src_tokens, src_lengths=src_lengths, **kwargs)
+        decoder_out = self.decoder(
+            prev_output_tokens, encoder_out=encoder_out, **kwargs
+        )
+        return decoder_out
+
+    def forward_decoder(self, prev_output_tokens, **kwargs):
+        return self.decoder(prev_output_tokens, **kwargs)
+
+    def extract_features(self, src_tokens, src_lengths, prev_output_tokens, **kwargs):
+        """
+        Similar to *forward* but only return features.
+
+        Returns:
+            tuple:
+                - the decoder's features of shape `(batch, tgt_len, embed_dim)`
+                - a dictionary with any model-specific outputs
+        """
+        encoder_out = self.encoder(src_tokens, src_lengths=src_lengths, **kwargs)
+        features = self.decoder.extract_features(
+            prev_output_tokens, encoder_out=encoder_out, **kwargs
+        )
+        return features
+
+    def output_layer(self, features, **kwargs):
+        """Project features to the default output size (typically vocabulary size)."""
+        return self.decoder.output_layer(features, **kwargs)
+
+    def max_positions(self):
+        """Maximum length supported by the model."""
+        return (self.encoder.max_positions(), self.decoder.max_positions())
+
+    def max_decoder_positions(self):
+        """Maximum length supported by the decoder."""
+        return self.decoder.max_positions()
+
+
+class FairseqModel(FairseqEncoderDecoderModel):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        utils.deprecation_warning(
+            "FairseqModel is deprecated, please use FairseqEncoderDecoderModel "
+            "or BaseFairseqModel instead",
+            stacklevel=4,
+        )
+
+
+class FairseqMultiModel(BaseFairseqModel):
+    """Base class for combining multiple encoder-decoder models."""
+
+    def __init__(self, encoders, decoders):
+        super().__init__()
+        assert encoders.keys() == decoders.keys()
+        self.keys = list(encoders.keys())
+        for key in self.keys:
+            assert isinstance(encoders[key], FairseqEncoder)
+            assert isinstance(decoders[key], FairseqDecoder)
+
+        self.models = nn.ModuleDict(
+            {
+                key: FairseqEncoderDecoderModel(encoders[key], decoders[key])
+                for key in self.keys
+            }
+        )
+
+    @staticmethod
+    def build_shared_embeddings(
+        dicts: Dict[str, Dictionary],
+        langs: List[str],
+        embed_dim: int,
+        build_embedding: callable,
+        pretrained_embed_path: Optional[str] = None,
+    ):
+        """
+        Build one embedding shared by *langs*, after verifying their dicts match.
+
+        Args:
+            dicts: maps each lang_id to its Dictionary
+            langs: the languages that will share the embedding
+            embed_dim: embedding dimension
+            build_embedding: called as ``build_embedding(dict, embed_dim, path)``
+            pretrained_embed_path: optional path to pretrained embeddings
+        """
+        reference = dicts[langs[0]]
+        for lang in langs:
+            if dicts[lang] != reference:
+                raise ValueError(
+                    "--share-*-embeddings requires a joined dictionary: "
+                    "--share-encoder-embeddings requires a joined source "
+                    "dictionary, --share-decoder-embeddings requires a joined "
+                    "target dictionary, and --share-all-embeddings requires a "
+                    "joint source + target dictionary."
+                )
+        return build_embedding(reference, embed_dim, pretrained_embed_path)
+
+    def forward(self, src_tokens, src_lengths, prev_output_tokens, **kwargs):
+        raise NotImplementedError
+
+    def max_positions(self):
+        """Maximum length supported by the model."""
+        return {
+            key: (
+                self.models[key].encoder.max_positions(),
+                self.models[key].decoder.max_positions(),
+            )
+            for key in self.keys
+        }
+
+    def max_decoder_positions(self):
+        """Maximum length supported by the decoder."""
+        return min(model.decoder.max_positions() for model in self.models.values())
+
+    @property
+    def encoder(self):
+        return self.models[self.keys[0]].encoder
+
+    @property
+    def decoder(self):
+        return self.models[self.keys[0]].decoder
+
+    def forward_decoder(self, prev_output_tokens, **kwargs):
+        return self.decoder(prev_output_tokens, **kwargs)
+
+    def load_state_dict(self, state_dict, strict=True, args=None):
+        """Copies parameters and buffers from *state_dict* into this module and
+        its descendants.
+
+        Overrides the method in :class:`nn.Module`. Compared with that method
+        this additionally "upgrades" *state_dicts* from old checkpoints.
+        """
+        self.upgrade_state_dict(state_dict)
+        new_state_dict = prune_state_dict(state_dict, args)
+        return super().load_state_dict(new_state_dict, strict)
+
+
+class FairseqLanguageModel(BaseFairseqModel):
+    """Base class for decoder-only models.
+
+    Args:
+        decoder (FairseqDecoder): the decoder
+    """
+
+    def __init__(self, decoder):
+        super().__init__()
+        self.decoder = decoder
+        assert isinstance(self.decoder, FairseqDecoder)
+
+    def forward(self, src_tokens, **kwargs):
+        """
+        Run the forward pass for a decoder-only model.
+
+        Feeds a batch of tokens through the decoder to predict the next tokens.
+
+        Args:
+            src_tokens (LongTensor): tokens on which to condition the decoder,
+                of shape `(batch, tgt_len)`
+            **kwargs: forwarded unchanged to the decoder (e.g. `src_lengths`, if given)
+
+        Returns:
+            tuple:
+                - the decoder's output of shape `(batch, seq_len, vocab)`
+                - a dictionary with any model-specific outputs
+        """
+        return self.decoder(src_tokens, **kwargs)
+
+    def forward_decoder(self, prev_output_tokens, **kwargs):
+        return self.decoder(prev_output_tokens, **kwargs)
+
+    def extract_features(self, src_tokens, **kwargs):
+        """
+        Similar to *forward* but only return features.
+
+        Returns:
+            tuple:
+                - the decoder's features of shape `(batch, seq_len, embed_dim)`
+                - a dictionary with any model-specific outputs
+        """
+        return self.decoder.extract_features(src_tokens, **kwargs)
+
+    def output_layer(self, features, **kwargs):
+        """Project features to the default output size (typically vocabulary size)."""
+        return self.decoder.output_layer(features, **kwargs)
+
+    def max_positions(self):
+        """Maximum length supported by the model."""
+        return self.decoder.max_positions()
+
+    def max_decoder_positions(self):
+        """Maximum length supported by the decoder."""
+        return self.decoder.max_positions()
+
+    @property
+    def supported_targets(self):
+        return {"future"}
+
+
+class FairseqEncoderModel(BaseFairseqModel):
+    """Base class for encoder-only models.
+
+    Args:
+        encoder (FairseqEncoder): the encoder
+    """
+
+    def __init__(self, encoder):
+        super().__init__()
+        self.encoder = encoder
+        assert isinstance(self.encoder, FairseqEncoder)
+
+    def forward(self, src_tokens, src_lengths, **kwargs):
+        """
+        Run the forward pass for an encoder-only model.
+
+        Feeds a batch of tokens through the encoder to generate features.
+
+        Args:
+            src_tokens (LongTensor): input tokens of shape `(batch, src_len)`
+            src_lengths (LongTensor): source sentence lengths of shape `(batch)`
+
+        Returns:
+            the encoder's output, typically of shape `(batch, src_len, features)`
+        """
+        return self.encoder(src_tokens, src_lengths, **kwargs)  # extra kwargs go to the encoder as-is
+
+    def get_normalized_probs(self, net_output, log_probs, sample=None):
+        """Normalise ``net_output["encoder_out"]`` over its last dim (log-space if asked)."""
+        scores = net_output["encoder_out"]
+        if not torch.is_tensor(scores):
+            # only a plain tensor of logits is understood here
+            raise NotImplementedError
+
+        # the softmax is computed on a float32 copy of the logits
+        normalise = F.log_softmax if log_probs else F.softmax
+        return normalise(scores.float(), dim=-1)
+
+    def max_positions(self):
+        """Maximum length supported by the model."""
+        return self.encoder.max_positions()
diff --git a/fairseq/models/fconv.py b/fairseq/models/fconv.py
new file mode 100644
index 0000000000000000000000000000000000000000..c60a2f4e5f5d4b9596146299a8baec1b031ef7f5
--- /dev/null
+++ b/fairseq/models/fconv.py
@@ -0,0 +1,672 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from fairseq import utils
+from fairseq.models import (
+    FairseqEncoder,
+    FairseqIncrementalDecoder,
+    FairseqEncoderDecoderModel,
+    register_model,
+    register_model_architecture,
+)
+from fairseq.modules import (
+    AdaptiveSoftmax, BeamableMM, FairseqDropout, GradMultiply, LearnedPositionalEmbedding,
+    LinearizedConvolution,
+)
+
+
+@register_model('fconv')
+class FConvModel(FairseqEncoderDecoderModel):
+    """
+    A fully convolutional model, i.e. a convolutional encoder and a
+    convolutional decoder, as described in `"Convolutional Sequence to Sequence
+    Learning" (Gehring et al., 2017) <https://arxiv.org/abs/1705.03122>`_.
+
+    Args:
+        encoder (FConvEncoder): the encoder
+        decoder (FConvDecoder): the decoder
+
+    The Convolutional model provides the following named architectures and
+    command-line arguments:
+
+    .. argparse::
+        :ref: fairseq.models.fconv_parser
+        :prog:
+    """
+
+    @classmethod
+    def hub_models(cls):
+
+        def moses_subword(path):
+            return {
+                'path': path,
+                'tokenizer': 'moses',
+                'bpe': 'subword_nmt',
+            }
+
+        return {
+            'conv.wmt14.en-fr': moses_subword('https://dl.fbaipublicfiles.com/fairseq/models/wmt14.v2.en-fr.fconv-py.tar.bz2'),
+            'conv.wmt14.en-de': moses_subword('https://dl.fbaipublicfiles.com/fairseq/models/wmt14.en-de.fconv-py.tar.bz2'),
+            'conv.wmt17.en-de': moses_subword('https://dl.fbaipublicfiles.com/fairseq/models/wmt17.v2.en-de.fconv-py.tar.bz2'),
+        }
+
+    def __init__(self, encoder, decoder):
+        super().__init__(encoder, decoder)
+        self.encoder.num_attention_layers = sum(layer is not None for layer in decoder.attention)
+
+    @staticmethod
+    def add_args(parser):
+        """Add model-specific arguments to the parser."""
+        # fmt: off
+        parser.add_argument('--dropout', type=float, metavar='D',
+                            help='dropout probability')
+        parser.add_argument('--encoder-embed-dim', type=int, metavar='N',
+                            help='encoder embedding dimension')
+        parser.add_argument('--encoder-embed-path', type=str, metavar='STR',
+                            help='path to pre-trained encoder embedding')
+        parser.add_argument('--encoder-layers', type=str, metavar='EXPR',
+                            help='encoder layers [(dim, kernel_size), ...]')
+        parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
+                            help='decoder embedding dimension')
+        parser.add_argument('--decoder-embed-path', type=str, metavar='STR',
+                            help='path to pre-trained decoder embedding')
+        parser.add_argument('--decoder-layers', type=str, metavar='EXPR',
+                            help='decoder layers [(dim, kernel_size), ...]')
+        parser.add_argument('--decoder-out-embed-dim', type=int, metavar='N',
+                            help='decoder output embedding dimension')
+        parser.add_argument('--decoder-attention', type=str, metavar='EXPR',
+                            help='decoder attention [True, ...]')
+        parser.add_argument('--share-input-output-embed', action='store_true',
+                            help='share input and output embeddings (requires'
+                                 ' --decoder-out-embed-dim and --decoder-embed-dim'
+                                 ' to be equal)')
+        # fmt: on
+
+    @classmethod
+    def build_model(cls, args, task):
+        """Build an FConvModel (encoder + decoder) from parsed *args* and the task's dictionaries."""
+        # make sure that all args are properly defaulted (in case there are any new ones)
+        base_architecture(args)
+
+        encoder_embed_dict = None
+        if args.encoder_embed_path:
+            encoder_embed_dict = utils.parse_embedding(args.encoder_embed_path)
+            utils.print_embed_overlap(encoder_embed_dict, task.source_dictionary)
+
+        decoder_embed_dict = None
+        if args.decoder_embed_path:
+            decoder_embed_dict = utils.parse_embedding(args.decoder_embed_path)
+            utils.print_embed_overlap(decoder_embed_dict, task.target_dictionary)
+
+        encoder = FConvEncoder(
+            dictionary=task.source_dictionary,
+            embed_dim=args.encoder_embed_dim,
+            embed_dict=encoder_embed_dict,
+            convolutions=eval(args.encoder_layers),  # NOTE(review): eval() of a CLI string; trusted input only
+            dropout=args.dropout,
+            max_positions=args.max_source_positions,
+        )
+        decoder = FConvDecoder(
+            dictionary=task.target_dictionary,
+            embed_dim=args.decoder_embed_dim,
+            embed_dict=decoder_embed_dict,
+            convolutions=eval(args.decoder_layers),  # NOTE(review): eval() of a CLI string; trusted input only
+            out_embed_dim=args.decoder_out_embed_dim,
+            attention=eval(args.decoder_attention),  # NOTE(review): eval() of a CLI string; trusted input only
+            dropout=args.dropout,
+            max_positions=args.max_target_positions,
+            share_embed=args.share_input_output_embed,
+        )
+        return FConvModel(encoder, decoder)  # always FConvModel, even when called on a subclass
+
+
+class FConvEncoder(FairseqEncoder):
+    """
+    Convolutional encoder consisting of `len(convolutions)` layers.
+
+    Args:
+        dictionary (~fairseq.data.Dictionary): encoding dictionary
+        embed_dim (int, optional): embedding dimension
+        embed_dict (str, optional): filename from which to load pre-trained
+            embeddings
+        max_positions (int, optional): maximum supported input sequence length
+        convolutions (list, optional): the convolutional layer structure. Each
+            list item `i` corresponds to convolutional layer `i`. Layers are
+            given as ``(out_channels, kernel_width, [residual])``. Residual
+            connections are added between layers when ``residual=1`` (which is
+            the default behavior).
+        dropout (float, optional): dropout to be applied before each conv layer
+    """
+
+    def __init__(
+        self, dictionary, embed_dim=512, embed_dict=None, max_positions=1024,
+        convolutions=((512, 3),) * 20, dropout=0.1,
+    ):
+        super().__init__(dictionary)
+        self.dropout_module = FairseqDropout(
+            dropout, module_name=self.__class__.__name__
+        )
+        self.num_attention_layers = None
+
+        num_embeddings = len(dictionary)
+        self.padding_idx = dictionary.pad()
+        self.embed_tokens = Embedding(num_embeddings, embed_dim, self.padding_idx)
+        if embed_dict:
+            self.embed_tokens = utils.load_embedding(embed_dict, self.dictionary, self.embed_tokens)
+
+        self.embed_positions = PositionalEmbedding(
+            max_positions,
+            embed_dim,
+            self.padding_idx,
+        )
+
+        convolutions = extend_conv_spec(convolutions)
+        in_channels = convolutions[0][0]
+        self.fc1 = Linear(embed_dim, in_channels, dropout=dropout)
+        self.projections = nn.ModuleList()
+        self.convolutions = nn.ModuleList()
+        self.residuals = []
+
+        layer_in_channels = [in_channels]
+        for _, (out_channels, kernel_size, residual) in enumerate(convolutions):
+            if residual == 0:
+                residual_dim = out_channels
+            else:
+                residual_dim = layer_in_channels[-residual]
+            self.projections.append(Linear(residual_dim, out_channels)
+                                    if residual_dim != out_channels else None)
+            if kernel_size % 2 == 1:
+                padding = kernel_size // 2
+            else:
+                padding = 0
+            self.convolutions.append(
+                ConvTBC(in_channels, out_channels * 2, kernel_size,
+                        dropout=dropout, padding=padding)
+            )
+            self.residuals.append(residual)
+            in_channels = out_channels
+            layer_in_channels.append(out_channels)
+        self.fc2 = Linear(in_channels, embed_dim)
+
+    def forward(self, src_tokens, src_lengths):
+        """
+        Args:
+            src_tokens (LongTensor): tokens in the source language of shape
+                `(batch, src_len)`
+            src_lengths (LongTensor): lengths of each source sentence of shape
+                `(batch)`; not read by this method
+
+        Returns:
+            dict:
+                - **encoder_out** (tuple): a tuple with two elements, where the
+                  first element is the last encoder layer's output and the
+                  second element is the same quantity summed with the input
+                  embedding (used for attention). The shape of both tensors is
+                  `(batch, src_len, embed_dim)`.
+                - **encoder_padding_mask** (Tensor or None): padding positions
+                  of shape `(batch, src_len)`, or None when nothing is padded
+        """
+        # embed tokens and positions
+        x = self.embed_tokens(src_tokens) + self.embed_positions(src_tokens)
+        x = self.dropout_module(x)
+        input_embedding = x
+
+        # project to size of convolution
+        x = self.fc1(x)
+
+        # used to mask padding in input
+        encoder_padding_mask = src_tokens.eq(self.padding_idx).t()  # -> T x B
+        if not encoder_padding_mask.any():
+            encoder_padding_mask = None
+
+        # B x T x C -> T x B x C
+        x = x.transpose(0, 1)
+
+        residuals = [x]
+        # temporal convolutions
+        for proj, conv, res_layer in zip(self.projections, self.convolutions, self.residuals):
+            if res_layer > 0:
+                residual = residuals[-res_layer]
+                residual = residual if proj is None else proj(residual)
+            else:
+                residual = None
+
+            if encoder_padding_mask is not None:
+                x = x.masked_fill(encoder_padding_mask.unsqueeze(-1), 0)
+
+            x = self.dropout_module(x)
+            kernel_width = conv.kernel_size[0]
+            if kernel_width % 2 == 0:
+                # odd kernels are padded inside the conv; even ones are padded here
+                padding_l, padding_r = (kernel_width - 1) // 2, kernel_width // 2
+                x = F.pad(x, (0, 0, 0, 0, padding_l, padding_r))
+            x = conv(x)
+            x = F.glu(x, dim=2)
+
+            if residual is not None:
+                x = (x + residual) * math.sqrt(0.5)
+            residuals.append(x)
+
+        # T x B x C -> B x T x C
+        x = x.transpose(1, 0)
+
+        # project back to size of embedding
+        x = self.fc2(x)
+
+        if encoder_padding_mask is not None:
+            encoder_padding_mask = encoder_padding_mask.t()  # -> B x T
+            x = x.masked_fill(encoder_padding_mask.unsqueeze(-1), 0)
+
+        # scale gradients (this only affects backward, not forward); skipped when the
+        # attention-layer count is unset (None) or 0, both of which used to raise here
+        if self.num_attention_layers:
+            x = GradMultiply.apply(x, 1.0 / (2.0 * self.num_attention_layers))
+
+        # add output to input embedding for attention
+        y = (x + input_embedding) * math.sqrt(0.5)
+
+        return {
+            'encoder_out': (x, y),
+            'encoder_padding_mask': encoder_padding_mask,  # B x T
+        }
+
+    def reorder_encoder_out(self, encoder_out, new_order):
+        """Permute dim 0 of the tensors in *encoder_out* by *new_order*, updating the dict in place."""
+        outputs = encoder_out['encoder_out']
+        if outputs is not None:
+            reordered = tuple(outputs[i].index_select(0, new_order) for i in (0, 1))
+            encoder_out['encoder_out'] = reordered
+        padding_mask = encoder_out['encoder_padding_mask']
+        if padding_mask is not None:
+            encoder_out['encoder_padding_mask'] = padding_mask.index_select(0, new_order)
+        return encoder_out
+
+    def max_positions(self):
+        """Maximum input length supported by the encoder."""
+        return self.embed_positions.max_positions
+
+
+class AttentionLayer(nn.Module):
+    def __init__(self, conv_channels, embed_dim, bmm=None):
+        super().__init__()
+        # projects from output of convolution to embedding dimension
+        self.in_projection = Linear(conv_channels, embed_dim)
+        # projects from embedding dimension to convolution size
+        self.out_projection = Linear(embed_dim, conv_channels)
+
+        self.bmm = bmm if bmm is not None else torch.bmm
+
+    def forward(self, x, target_embedding, encoder_out, encoder_padding_mask):
+        residual = x
+
+        # attention
+        x = (self.in_projection(x) + target_embedding) * math.sqrt(0.5)
+        x = self.bmm(x, encoder_out[0])
+
+        # don't attend over padding
+        if encoder_padding_mask is not None:
+            x = x.float().masked_fill(
+                encoder_padding_mask.unsqueeze(1),
+                float('-inf')
+            ).type_as(x)  # FP16 support: cast to float and back
+
+        # softmax over last dim
+        sz = x.size()
+        x = F.softmax(x.view(sz[0] * sz[1], sz[2]), dim=1)
+        x = x.view(sz)
+        attn_scores = x
+
+        x = self.bmm(x, encoder_out[1])
+
+        # scale attention output (respecting potentially different lengths)
+        s = encoder_out[1].size(1)
+        if encoder_padding_mask is None:
+            x = x * (s * math.sqrt(1.0 / s))
+        else:
+            s = s - encoder_padding_mask.type_as(x).sum(dim=1, keepdim=True)  # exclude padding
+            s = s.unsqueeze(-1)
+            x = x * (s * s.rsqrt())
+
+        # project back
+        x = (self.out_projection(x) + residual) * math.sqrt(0.5)
+        return x, attn_scores
+
+    def make_generation_fast_(self, beamable_mm_beam_size=None, **kwargs):
+        """Replace torch.bmm with BeamableMM."""
+        if beamable_mm_beam_size is not None:
+            del self.bmm
+            self.add_module('bmm', BeamableMM(beamable_mm_beam_size))
+
+
+class FConvDecoder(FairseqIncrementalDecoder):
+    """Convolutional decoder
+
+    Stacks gated (GLU) convolutions, each optionally followed by an
+    ``AttentionLayer`` over the encoder output.  The final states are either
+    projected through ``fc2``/``fc3`` or, when an adaptive softmax is
+    configured, returned unprojected.
+    """
+
+    def __init__(
+        self, dictionary, embed_dim=512, embed_dict=None, out_embed_dim=256,
+        max_positions=1024, convolutions=((512, 3),) * 20, attention=True,
+        dropout=0.1, share_embed=False, positional_embeddings=True,
+        adaptive_softmax_cutoff=None, adaptive_softmax_dropout=0.,
+    ):
+        """Build the decoder layers.
+
+        Args:
+            dictionary: target dictionary; its length and ``pad()`` index size
+                the token embedding.
+            embed_dim: token/position embedding dimension.
+            embed_dict: if truthy, passed to ``utils.load_embedding`` to
+                replace the freshly initialised token embedding.
+            out_embed_dim: output dimension of ``fc2`` (unused with adaptive
+                softmax).
+            max_positions: size passed to the positional embedding.
+            convolutions: per-layer specs ``(out_channels, kernel_size)`` or
+                ``(out_channels, kernel_size, residual)``.
+            attention: a bool applied to every layer, or a list with one
+                bool per layer.
+            dropout: dropout probability; also used to scale weight init.
+            share_embed: tie ``fc3.weight`` to the token embedding weight.
+            positional_embeddings: whether to create ``embed_positions``.
+            adaptive_softmax_cutoff: if not None, use ``AdaptiveSoftmax``
+                instead of ``fc2``/``fc3``.
+            adaptive_softmax_dropout: dropout for the adaptive softmax.
+
+        Raises:
+            ValueError: if ``attention`` is not a bool or a list whose length
+                equals the number of layers.
+        """
+        super().__init__(dictionary)
+        # version marker stored with the weights; read back in upgrade_state_dict
+        self.register_buffer('version', torch.Tensor([2]))
+        self.dropout_module = FairseqDropout(
+            dropout, module_name=self.__class__.__name__
+        )
+        # when False, forward() skips accumulating attention scores
+        self.need_attn = True
+
+        convolutions = extend_conv_spec(convolutions)
+        in_channels = convolutions[0][0]
+        if isinstance(attention, bool):
+            # expand True into [True, True, ...] and do the same with False
+            attention = [attention] * len(convolutions)
+        if not isinstance(attention, list) or len(attention) != len(convolutions):
+            raise ValueError('Attention is expected to be a list of booleans of '
+                             'length equal to the number of layers.')
+
+        num_embeddings = len(dictionary)
+        padding_idx = dictionary.pad()
+        self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
+        if embed_dict:
+            self.embed_tokens = utils.load_embedding(embed_dict, self.dictionary, self.embed_tokens)
+
+        self.embed_positions = PositionalEmbedding(
+            max_positions,
+            embed_dim,
+            padding_idx,
+        ) if positional_embeddings else None
+
+        # projects embeddings to the channel width of the first convolution
+        self.fc1 = Linear(embed_dim, in_channels, dropout=dropout)
+        self.projections = nn.ModuleList()
+        self.convolutions = nn.ModuleList()
+        self.attention = nn.ModuleList()
+        # plain list (not a module): residual distance per layer, 0 = no residual
+        self.residuals = []
+
+        # channel widths of the input and of every layer output so far,
+        # so a residual can look `residual` entries back
+        layer_in_channels = [in_channels]
+        for i, (out_channels, kernel_size, residual) in enumerate(convolutions):
+            if residual == 0:
+                # no residual connection, hence no projection needed
+                residual_dim = out_channels
+            else:
+                residual_dim = layer_in_channels[-residual]
+            # project the residual only when its width differs from this layer's output
+            self.projections.append(Linear(residual_dim, out_channels)
+                                    if residual_dim != out_channels else None)
+            # out_channels * 2 because forward() halves the channels with GLU
+            # NOTE(review): padding=kernel_size - 1 presumably keeps the
+            # convolution causal -- confirm in LinearizedConvolution
+            self.convolutions.append(
+                LinearizedConv1d(in_channels, out_channels * 2, kernel_size,
+                                 padding=(kernel_size - 1), dropout=dropout)
+            )
+            self.attention.append(AttentionLayer(out_channels, embed_dim)
+                                  if attention[i] else None)
+            self.residuals.append(residual)
+            in_channels = out_channels
+            layer_in_channels.append(out_channels)
+
+        self.adaptive_softmax = None
+        self.fc2 = self.fc3 = None
+
+        if adaptive_softmax_cutoff is not None:
+            assert not share_embed
+            self.adaptive_softmax = AdaptiveSoftmax(num_embeddings, in_channels, adaptive_softmax_cutoff,
+                                                    dropout=adaptive_softmax_dropout)
+        else:
+            self.fc2 = Linear(in_channels, out_embed_dim)
+            if share_embed:
+                assert out_embed_dim == embed_dim, \
+                    "Shared embed weights implies same dimensions " \
+                    " out_embed_dim={} vs embed_dim={}".format(out_embed_dim, embed_dim)
+                # plain nn.Linear (no weight norm) so the weight can be tied directly
+                self.fc3 = nn.Linear(out_embed_dim, num_embeddings)
+                self.fc3.weight = self.embed_tokens.weight
+            else:
+                self.fc3 = Linear(out_embed_dim, num_embeddings, dropout=dropout)
+
+    def forward(self, prev_output_tokens, encoder_out=None, incremental_state=None, **unused):
+        """Run the decoder and return ``(x, avg_attn_scores)``.
+
+        ``encoder_out`` is the dict produced by the encoder (keys
+        ``'encoder_out'`` and ``'encoder_padding_mask'``).  When
+        ``incremental_state`` is given, only the last target token is
+        processed.  ``avg_attn_scores`` stays ``None`` unless the module is
+        in eval mode, ``need_attn`` is set and at least one layer has
+        attention.
+        """
+        # NOTE(review): encoder_a, encoder_b and encoder_padding_mask are only
+        # bound when encoder_out is given; layers with attention need it.
+        if encoder_out is not None:
+            encoder_padding_mask = encoder_out['encoder_padding_mask']
+            encoder_out = encoder_out['encoder_out']
+
+            # split and transpose encoder outputs
+            encoder_a, encoder_b = self._split_encoder_out(encoder_out, incremental_state)
+
+        if self.embed_positions is not None:
+            pos_embed = self.embed_positions(prev_output_tokens, incremental_state)
+        else:
+            pos_embed = 0
+
+        # NOTE(review): _embed_tokens applies the same last-token slice again,
+        # so this slice is redundant but harmless.
+        if incremental_state is not None:
+            prev_output_tokens = prev_output_tokens[:, -1:]
+        x = self._embed_tokens(prev_output_tokens, incremental_state)
+
+        # embed tokens and combine with positional embeddings
+        x += pos_embed
+        x = self.dropout_module(x)
+        target_embedding = x
+
+        # project to size of convolution
+        x = self.fc1(x)
+
+        # B x T x C -> T x B x C
+        x = self._transpose_if_training(x, incremental_state)
+
+        # temporal convolutions
+        avg_attn_scores = None
+        # NOTE(review): len() counts every layer slot, including the None
+        # entries of layers without attention.
+        num_attn_layers = len(self.attention)
+        # history of layer inputs/outputs used for residual look-back
+        residuals = [x]
+        for proj, conv, attention, res_layer in zip(self.projections, self.convolutions, self.attention,
+                                                    self.residuals):
+            if res_layer > 0:
+                residual = residuals[-res_layer]
+                residual = residual if proj is None else proj(residual)
+            else:
+                residual = None
+
+            x = self.dropout_module(x)
+            x = conv(x, incremental_state)
+            # gated linear unit halves the channel dimension
+            x = F.glu(x, dim=2)
+
+            # attention
+            if attention is not None:
+                # the attention layer works on the transposed layout; undone below
+                x = self._transpose_if_training(x, incremental_state)
+
+                x, attn_scores = attention(x, target_embedding, (encoder_a, encoder_b), encoder_padding_mask)
+
+                if not self.training and self.need_attn:
+                    attn_scores = attn_scores / num_attn_layers
+                    if avg_attn_scores is None:
+                        avg_attn_scores = attn_scores
+                    else:
+                        avg_attn_scores.add_(attn_scores)
+
+                x = self._transpose_if_training(x, incremental_state)
+
+            # residual
+            if residual is not None:
+                x = (x + residual) * math.sqrt(0.5)
+            residuals.append(x)
+
+        # T x B x C -> B x T x C
+        x = self._transpose_if_training(x, incremental_state)
+
+        # project back to size of vocabulary if not using adaptive softmax
+        if self.fc2 is not None and self.fc3 is not None:
+            x = self.fc2(x)
+            x = self.dropout_module(x)
+            x = self.fc3(x)
+
+        return x, avg_attn_scores
+
+    def reorder_incremental_state(self, incremental_state, new_order):
+        """Reorder the cached encoder output along dimension 0 with ``new_order``."""
+        super().reorder_incremental_state(incremental_state, new_order)
+        encoder_out = utils.get_incremental_state(self, incremental_state, 'encoder_out')
+        if encoder_out is not None:
+            encoder_out = tuple(eo.index_select(0, new_order) for eo in encoder_out)
+            utils.set_incremental_state(self, incremental_state, 'encoder_out', encoder_out)
+
+    def max_positions(self):
+        """Maximum output length supported by the decoder."""
+        # unbounded when no positional embedding was created
+        return self.embed_positions.max_positions if self.embed_positions is not None else float('inf')
+
+    def upgrade_state_dict(self, state_dict):
+        """Reconfigure weight norm when loading a pre-version-2 state dict.
+
+        A missing ``'decoder.version'`` key is treated as version 1.
+        Returns the same ``state_dict`` object.
+        """
+        if utils.item(state_dict.get('decoder.version', torch.Tensor([1]))[0]) < 2:
+            # old models use incorrect weight norm dimension
+            for i, conv in enumerate(self.convolutions):
+                # reconfigure weight norm
+                nn.utils.remove_weight_norm(conv)
+                self.convolutions[i] = nn.utils.weight_norm(conv, dim=0)
+            # NOTE(review): the version is written back as 1 (the module was
+            # switched to the old layout) -- confirm this is intended.
+            state_dict['decoder.version'] = torch.Tensor([1])
+        return state_dict
+
+    def make_generation_fast_(self, need_attn=False, **kwargs):
+        """Set whether forward() should accumulate attention scores."""
+        self.need_attn = need_attn
+
+    def _embed_tokens(self, tokens, incremental_state):
+        """Embed ``tokens``; only the last one when decoding incrementally."""
+        if incremental_state is not None:
+            # keep only the last token for incremental forward pass
+            tokens = tokens[:, -1:]
+        return self.embed_tokens(tokens)
+
+    def _split_encoder_out(self, encoder_out, incremental_state):
+        """Split and transpose encoder outputs.
+
+        This is cached when doing incremental inference.
+        """
+        cached_result = utils.get_incremental_state(self, incremental_state, 'encoder_out')
+        if cached_result is not None:
+            return cached_result
+
+        # transpose only once to speed up attention layers
+        encoder_a, encoder_b = encoder_out
+        encoder_a = encoder_a.transpose(1, 2).contiguous()
+        result = (encoder_a, encoder_b)
+
+        if incremental_state is not None:
+            utils.set_incremental_state(self, incremental_state, 'encoder_out', result)
+        return result
+
+    def _transpose_if_training(self, x, incremental_state):
+        """Swap dims 0 and 1 of ``x`` unless decoding incrementally."""
+        if incremental_state is None:
+            x = x.transpose(0, 1)
+        return x
+
+
+def extend_conv_spec(convolutions):
+    """
+    Extends convolutional spec that is a list of tuples of 2 or 3 parameters
+    (kernel size, dim size and optionally how many layers behind to look for residual)
+    to default the residual propagation param if it is not specified
+    """
+    extended = []
+    for spec in convolutions:
+        if len(spec) == 3:
+            extended.append(spec)
+        elif len(spec) == 2:
+            extended.append(spec + (1,))
+        else:
+            raise Exception('invalid number of parameters in convolution spec ' + str(spec) + '. expected 2 or 3')
+    return tuple(extended)
+
+
+def Embedding(num_embeddings, embedding_dim, padding_idx):
+    m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
+    nn.init.normal_(m.weight, 0, 0.1)
+    nn.init.constant_(m.weight[padding_idx], 0)
+    return m
+
+
+def PositionalEmbedding(num_embeddings, embedding_dim, padding_idx):
+    m = LearnedPositionalEmbedding(num_embeddings, embedding_dim, padding_idx)
+    nn.init.normal_(m.weight, 0, 0.1)
+    nn.init.constant_(m.weight[padding_idx], 0)
+    return m
+
+
+def Linear(in_features, out_features, dropout=0.):
+    """Weight-normalized Linear layer (input: N x T x C)"""
+    m = nn.Linear(in_features, out_features)
+    nn.init.normal_(m.weight, mean=0, std=math.sqrt((1 - dropout) / in_features))
+    nn.init.constant_(m.bias, 0)
+    return nn.utils.weight_norm(m)
+
+
+def LinearizedConv1d(in_channels, out_channels, kernel_size, dropout=0., **kwargs):
+    """Weight-normalized Conv1d layer optimized for decoding"""
+    m = LinearizedConvolution(in_channels, out_channels, kernel_size, **kwargs)
+    std = math.sqrt((4 * (1.0 - dropout)) / (m.kernel_size[0] * in_channels))
+    nn.init.normal_(m.weight, mean=0, std=std)
+    nn.init.constant_(m.bias, 0)
+    return nn.utils.weight_norm(m, dim=2)
+
+
+def ConvTBC(in_channels, out_channels, kernel_size, dropout=0., **kwargs):
+    """Weight-normalized Conv1d layer"""
+    from fairseq.modules import ConvTBC
+    m = ConvTBC(in_channels, out_channels, kernel_size, **kwargs)
+    std = math.sqrt((4 * (1.0 - dropout)) / (m.kernel_size[0] * in_channels))
+    nn.init.normal_(m.weight, mean=0, std=std)
+    nn.init.constant_(m.bias, 0)
+    return nn.utils.weight_norm(m, dim=2)
+
+
+@register_model_architecture('fconv', 'fconv')
+def base_architecture(args):
+    args.dropout = getattr(args, 'dropout', 0.1)
+    args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 512)
+    args.encoder_embed_path = getattr(args, 'encoder_embed_path', None)
+    args.encoder_layers = getattr(args, 'encoder_layers', '[(512, 3)] * 20')
+    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 512)
+    args.decoder_embed_path = getattr(args, 'decoder_embed_path', None)
+    args.decoder_layers = getattr(args, 'decoder_layers', '[(512, 3)] * 20')
+    args.decoder_out_embed_dim = getattr(args, 'decoder_out_embed_dim', 256)
+    args.decoder_attention = getattr(args, 'decoder_attention', 'True')
+    args.share_input_output_embed = getattr(args, 'share_input_output_embed', False)
+
+
+@register_model_architecture('fconv', 'fconv_iwslt_de_en')
+def fconv_iwslt_de_en(args):
+    args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 256)
+    args.encoder_layers = getattr(args, 'encoder_layers', '[(256, 3)] * 4')
+    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 256)
+    args.decoder_layers = getattr(args, 'decoder_layers', '[(256, 3)] * 3')
+    args.decoder_out_embed_dim = getattr(args, 'decoder_out_embed_dim', 256)
+    base_architecture(args)
+
+
+@register_model_architecture('fconv', 'fconv_wmt_en_ro')
+def fconv_wmt_en_ro(args):
+    args.decoder_out_embed_dim = getattr(args, 'decoder_out_embed_dim', 512)
+    base_architecture(args)
+
+
+@register_model_architecture('fconv', 'fconv_wmt_en_de')
+def fconv_wmt_en_de(args):
+    convs = '[(512, 3)] * 9'  # first 9 layers have 512 units
+    convs += ' + [(1024, 3)] * 4'  # next 4 layers have 1024 units
+    convs += ' + [(2048, 1)] * 2'  # final 2 layers use 1x1 convolutions
+
+    args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 768)
+    args.encoder_layers = getattr(args, 'encoder_layers', convs)
+    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 768)
+    args.decoder_layers = getattr(args, 'decoder_layers', convs)
+    args.decoder_out_embed_dim = getattr(args, 'decoder_out_embed_dim', 512)
+    base_architecture(args)
+
+
+@register_model_architecture('fconv', 'fconv_wmt_en_fr')
+def fconv_wmt_en_fr(args):
+    convs = '[(512, 3)] * 6'  # first 6 layers have 512 units
+    convs += ' + [(768, 3)] * 4'  # next 4 layers have 768 units
+    convs += ' + [(1024, 3)] * 3'  # next 3 layers have 1024 units
+    convs += ' + [(2048, 1)] * 1'  # next 1 layer uses 1x1 convolutions
+    convs += ' + [(4096, 1)] * 1'  # final 1 layer uses 1x1 convolutions
+
+    args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 768)
+    args.encoder_layers = getattr(args, 'encoder_layers', convs)
+    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 768)
+    args.decoder_layers = getattr(args, 'decoder_layers', convs)
+    args.decoder_out_embed_dim = getattr(args, 'decoder_out_embed_dim', 512)
+    base_architecture(args)
diff --git a/fairseq/models/fconv_lm.py b/fairseq/models/fconv_lm.py
new file mode 100644
index 0000000000000000000000000000000000000000..f2320b170059be09a815f20aa99fcecd56bece22
--- /dev/null
+++ b/fairseq/models/fconv_lm.py
@@ -0,0 +1,104 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from fairseq import options
+from fairseq.models import (
+    FairseqLanguageModel,
+    register_model,
+    register_model_architecture,
+)
+from fairseq.models.fconv import FConvDecoder
+
+
+@register_model('fconv_lm')
+class FConvLanguageModel(FairseqLanguageModel):
+    def __init__(self, decoder):
+        super().__init__(decoder)
+
+    @staticmethod
+    def add_args(parser):
+        """Add model-specific arguments to the parser."""
+        parser.add_argument('--dropout', type=float, metavar='D',
+                            help='dropout probability')
+        parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
+                            help='decoder embedding dimension')
+        parser.add_argument('--decoder-layers', type=str, metavar='EXPR',
+                            help='decoder layers [(dim, kernel_size), ...]')
+        parser.add_argument('--decoder-out-embed-dim', type=int, metavar='N',
+                            help='decoder output embedding dimension')
+        parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
+                            help='comma separated list of adaptive softmax cutoff points. '
+                                 'Must be used with adaptive_loss criterion')
+        parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',
+                            help='sets adaptive softmax dropout for the tail projections')
+        parser.add_argument('--decoder-attention', type=str, metavar='EXPR',
+                            help='decoder attention [True, ...]')
+
+    @classmethod
+    def build_model(cls, args, task):
+        """Build a new model instance."""
+        # make sure all arguments are present in older models
+        base_lm_architecture(args)
+
+        if hasattr(args, 'max_target_positions') and not hasattr(args, 'tokens_per_sample'):
+            args.tokens_per_sample = args.max_target_positions
+
+        decoder = FConvDecoder(
+            dictionary=task.target_dictionary,
+            embed_dim=args.decoder_embed_dim,
+            convolutions=eval(args.decoder_layers),
+            out_embed_dim=args.decoder_embed_dim,
+            attention=eval(args.decoder_attention),
+            dropout=args.dropout,
+            max_positions=args.tokens_per_sample,
+            share_embed=False,
+            positional_embeddings=False,
+            adaptive_softmax_cutoff=(
+                options.eval_str_list(args.adaptive_softmax_cutoff, type=int)
+                if args.criterion == 'adaptive_loss' else None
+            ),
+            adaptive_softmax_dropout=args.adaptive_softmax_dropout,
+        )
+        return FConvLanguageModel(decoder)
+
+
+@register_model_architecture('fconv_lm', 'fconv_lm')
+def base_lm_architecture(args):
+    args.dropout = getattr(args, 'dropout', 0.1)
+    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 128)
+    args.decoder_layers = getattr(args, 'decoder_layers', '[(1268, 4)] * 13')
+    args.decoder_attention = getattr(args, 'decoder_attention', 'False')
+    args.adaptive_softmax_cutoff = getattr(args, 'adaptive_softmax_cutoff', None)
+    args.adaptive_softmax_dropout = getattr(args, 'adaptive_softmax_dropout', 0)
+
+
+@register_model_architecture('fconv_lm', 'fconv_lm_dauphin_wikitext103')
+def fconv_lm_dauphin_wikitext103(args):
+    layers = '[(850, 6)] * 3'
+    layers += ' + [(850, 1)] * 1'
+    layers += ' + [(850, 5)] * 4'
+    layers += ' + [(850, 1)] * 1'
+    layers += ' + [(850, 4)] * 3'
+    layers += ' + [(1024, 4)] * 1'
+    layers += ' + [(2048, 4)] * 1'
+    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 280)
+    args.decoder_layers = getattr(args, 'decoder_layers', layers)
+    args.decoder_attention = getattr(args, 'decoder_attention', 'False')
+    args.adaptive_softmax_cutoff = getattr(args, 'adaptive_softmax_cutoff', '10000,20000,200000')
+    base_lm_architecture(args)
+
+
+@register_model_architecture('fconv_lm', 'fconv_lm_dauphin_gbw')
+def fconv_lm_dauphin_gbw(args):
+    layers = '[(512, 5)]'
+    layers += ' + [(128, 1, 0), (128, 5, 0), (512, 1, 3)] * 3'
+    layers += ' + [(512, 1, 0), (512, 5, 0), (1024, 1, 3)] * 3'
+    layers += ' + [(1024, 1, 0), (1024, 5, 0), (2048, 1, 3)] * 6'
+    layers += ' + [(1024, 1, 0), (1024, 5, 0), (4096, 1, 3)]'
+    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 128)
+    args.decoder_layers = getattr(args, 'decoder_layers', layers)
+    args.decoder_attention = getattr(args, 'decoder_attention', 'False')
+    args.adaptive_softmax_cutoff = getattr(args, 'adaptive_softmax_cutoff', '10000,50000,200000')
+    base_lm_architecture(args)
diff --git a/fairseq/models/fconv_self_att.py b/fairseq/models/fconv_self_att.py
new file mode 100644
index 0000000000000000000000000000000000000000..c3582da96f5ad92d3eec2a1f1a70a95463e55f3b
--- /dev/null
+++ b/fairseq/models/fconv_self_att.py
@@ -0,0 +1,589 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import math
+import os
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from fairseq import checkpoint_utils
+from fairseq.models import (
+    CompositeEncoder,
+    FairseqDecoder,
+    FairseqEncoder,
+    FairseqEncoderDecoderModel,
+    register_model,
+    register_model_architecture,
+)
+from fairseq.modules import (
+    FairseqDropout,
+    DownsampledMultiHeadAttention,
+    GradMultiply,
+    LayerNorm,
+    LearnedPositionalEmbedding,
+    LinearizedConvolution,
+)
+from fairseq.incremental_decoding_utils import with_incremental_state
+
+logger = logging.getLogger(__name__)
+
+
+@register_model('fconv_self_att')
+class FConvModelSelfAtt(FairseqEncoderDecoderModel):
+    """Convolutional encoder-decoder model with optional self-attention.
+
+    When a pretrained encoder is supplied, both encoders are wrapped in a
+    ``CompositeEncoder`` (the "fusion" setup).
+    """
+
+    @classmethod
+    def hub_models(cls):
+        """Return the mapping of hub model names to their download info."""
+        return {
+            'conv.stories.pretrained': {
+                'path': 'https://dl.fbaipublicfiles.com/fairseq/models/stories_checkpoint.tar.gz',
+                'checkpoint_file': 'pretrained_checkpoint.pt',
+                'tokenizer': 'nltk',
+            },
+            'conv.stories': {
+                'path': 'https://dl.fbaipublicfiles.com/fairseq/models/stories_checkpoint.tar.gz',
+                'checkpoint_file': 'fusion_checkpoint.pt',
+                'tokenizer': 'nltk',
+                'pretrained': 'True',
+                'pretrained_checkpoint': './pretrained_checkpoint.pt',
+            },
+            # Test set containing dictionaries
+            'data.stories': 'https://dl.fbaipublicfiles.com/fairseq/data/stories_test.tar.bz2',
+        }
+
+    def __init__(self, encoder, decoder, pretrained_encoder=None):
+        """Wrap ``encoder`` (and ``pretrained_encoder``, if given) for ``decoder``."""
+        super().__init__(encoder, decoder)
+        # number of decoder layers that actually have attention; the encoder
+        # uses it to scale its gradients
+        self.encoder.num_attention_layers = sum(layer is not None for layer in decoder.attention)
+        self.pretrained_encoder = pretrained_encoder
+        if self.pretrained_encoder is None:
+            encoders = {'encoder': encoder}
+        else:
+            encoders = {'encoder': encoder, 'pretrained': self.pretrained_encoder}
+        # for fusion model, CompositeEncoder contains both pretrained and training encoders
+        # these are forwarded and then combined in the decoder
+        self.encoder = CompositeEncoder(encoders)
+
+    @staticmethod
+    def add_args(parser):
+        """Add model-specific arguments to the parser."""
+        # fmt: off
+        parser.add_argument('--dropout', type=float, metavar='D',
+                            help='dropout probability')
+        parser.add_argument('--encoder-embed-dim', type=int, metavar='N',
+                            help='encoder embedding dimension')
+        parser.add_argument('--encoder-layers', type=str, metavar='EXPR',
+                            help='encoder layers [(dim, kernel_size), ...]')
+        parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
+                            help='decoder embedding dimension')
+        parser.add_argument('--decoder-layers', type=str, metavar='EXPR',
+                            help='decoder layers [(dim, kernel_size), ...]')
+        parser.add_argument('--decoder-out-embed-dim', type=int, metavar='N',
+                            help='decoder output embedding dimension')
+        parser.add_argument('--decoder-attention', type=str, metavar='EXPR',
+                            help='decoder attention [True, ...]')
+        parser.add_argument('--self-attention', type=str, metavar='EXPR',
+                            help='decoder self-attention layers, ex: [True] + [False]*5')
+        parser.add_argument('--multihead-attention-nheads', type=int,
+                            help='Number of heads to use in attention')
+        parser.add_argument('--multihead-self-attention-nheads', type=int,
+                            help='Number of heads to use in self-attention')
+        parser.add_argument('--encoder-attention', type=str, metavar='EXPR',
+                            help='encoder attention [True, ...]')
+        parser.add_argument('--encoder-attention-nheads', type=int,
+                            help='Number of heads to use in encoder attention')
+        parser.add_argument('--project-input', type=str, metavar='EXPR',
+                            help='Use projections in self-attention [True, ...]')
+        parser.add_argument('--gated-attention', type=str, metavar='EXPR',
+                            help='Use GLU layers in self-attention projections [True, ...]')
+        parser.add_argument('--downsample', type=str, metavar='EXPR',
+                            help='Use downsampling in self-attention [True, ...]')
+        parser.add_argument('--pretrained-checkpoint', metavar='DIR',
+                            help='path to load checkpoint from pretrained model')
+        parser.add_argument('--pretrained', type=str, metavar='EXPR',
+                            help='use pretrained model when training [True, ...]')
+        # fmt: on
+
+    @classmethod
+    def build_model(cls, args, task):
+        """Build a new model instance."""
+        trained_encoder, trained_decoder = None, None
+        # NOTE(review): this and the other EXPR arguments below are Python
+        # expressions evaluated with eval(); they must come from a trusted source.
+        pretrained = eval(args.pretrained)
+        if pretrained:
+            logger.info('loading pretrained model')
+            # if the path does not exist as given, try it relative to args.data
+            if not os.path.exists(args.pretrained_checkpoint):
+                new_pretrained_checkpoint = os.path.join(args.data, args.pretrained_checkpoint)
+                if os.path.exists(new_pretrained_checkpoint):
+                    args.pretrained_checkpoint = new_pretrained_checkpoint
+            # [0][0]: first model of the first element returned by the loader
+            trained_model = checkpoint_utils.load_model_ensemble(
+                filenames=[args.pretrained_checkpoint],
+                task=task,
+            )[0][0]
+            # assumes the model's children are ordered (encoder, decoder) -- TODO confirm
+            trained_decoder = list(trained_model.children())[1]
+            trained_encoder = list(trained_model.children())[0]
+
+            # freeze pretrained model
+            for param in trained_decoder.parameters():
+                param.requires_grad = False
+            for param in trained_encoder.parameters():
+                param.requires_grad = False
+
+        encoder = FConvEncoder(
+            task.source_dictionary,
+            embed_dim=args.encoder_embed_dim,
+            convolutions=eval(args.encoder_layers),
+            dropout=args.dropout,
+            max_positions=args.max_source_positions,
+            attention=eval(args.encoder_attention),
+            attention_nheads=args.encoder_attention_nheads,
+        )
+
+        decoder = FConvDecoder(
+            task.target_dictionary,
+            embed_dim=args.decoder_embed_dim,
+            convolutions=eval(args.decoder_layers),
+            out_embed_dim=args.decoder_out_embed_dim,
+            attention=eval(args.decoder_attention),
+            dropout=args.dropout,
+            max_positions=args.max_target_positions,
+            selfattention=eval(args.self_attention),
+            attention_nheads=args.multihead_attention_nheads,
+            selfattention_nheads=args.multihead_self_attention_nheads,
+            project_input=eval(args.project_input),
+            gated_attention=eval(args.gated_attention),
+            downsample=eval(args.downsample),
+            pretrained=pretrained,
+            trained_decoder=trained_decoder,
+        )
+        model = FConvModelSelfAtt(encoder, decoder, trained_encoder)
+
+        return model
+
+    @property
+    def pretrained(self):
+        """True when a pretrained encoder was supplied at construction."""
+        return self.pretrained_encoder is not None
+
+
+class FConvEncoder(FairseqEncoder):
+    """Convolutional encoder"""
+    def __init__(
+        self, dictionary, embed_dim=512, max_positions=1024,
+        convolutions=((512, 3),) * 20, dropout=0.1, attention=False,
+        attention_nheads=1,
+    ):
+        super().__init__(dictionary)
+        self.dropout_module = FairseqDropout(
+            dropout, module_name=self.__class__.__name__
+        )
+        self.num_attention_layers = None
+
+        num_embeddings = len(dictionary)
+        self.padding_idx = dictionary.pad()
+        self.embed_tokens = Embedding(num_embeddings, embed_dim, self.padding_idx)
+        self.embed_positions = PositionalEmbedding(
+            max_positions,
+            embed_dim,
+            self.padding_idx,
+        )
+
+        def expand_bool_array(val):
+            if isinstance(val, bool):
+                # expand True into [True, True, ...] and do the same with False
+                return [val] * len(convolutions)
+            return val
+
+        attention = expand_bool_array(attention)
+
+        in_channels = convolutions[0][0]
+        self.fc1 = Linear(embed_dim, in_channels, dropout=dropout)
+        self.projections = nn.ModuleList()
+        self.convolutions = nn.ModuleList()
+        self.attention = nn.ModuleList()
+        self.attproj = nn.ModuleList()
+        for i, (out_channels, kernel_size) in enumerate(convolutions):
+            self.projections.append(
+                Linear(in_channels, out_channels) if in_channels != out_channels else None
+            )
+            self.convolutions.append(
+                ConvTBC(in_channels, out_channels * 2, kernel_size, dropout=dropout)
+            )
+
+            self.attention.append(
+                SelfAttention(out_channels, embed_dim, attention_nheads) if attention[i] else None
+            )
+            in_channels = out_channels
+
+        self.fc2 = Linear(in_channels, embed_dim)
+
+    def forward(self, src_tokens, src_lengths):
+        """Encode src_tokens; returns {'encoder_out': (x, y), 'encoder_padding_mask': mask or None}. src_lengths is not read."""
+        x = self.embed_tokens(src_tokens) + self.embed_positions(src_tokens)  # embed tokens and positions
+        x = self.dropout_module(x)
+        input_embedding = x.transpose(0, 1)  # kept for the final residual that produces y
+
+        # project to size of convolution
+        x = self.fc1(x)
+
+        encoder_padding_mask = src_tokens.eq(self.padding_idx).t()  # -> T x B
+        if not encoder_padding_mask.any():
+            encoder_padding_mask = None  # no padding in this batch: all masking below is skipped
+
+        # B x T x C -> T x B x C
+        x = x.transpose(0, 1)
+
+        # temporal convolutions
+        for proj, conv, attention in zip(self.projections, self.convolutions, self.attention):
+            residual = x if proj is None else proj(x)  # proj is None when in/out channel counts match
+
+            if encoder_padding_mask is not None:
+                x = x.masked_fill(encoder_padding_mask.unsqueeze(-1), 0)  # zero padded positions before the conv
+
+            x = self.dropout_module(x)
+            padding_l = (conv.kernel_size[0] - 1) // 2
+            padding_r = conv.kernel_size[0] // 2  # padding_l + padding_r == kernel_size - 1
+            x = F.pad(x, (0, 0, 0, 0, padding_l, padding_r))  # pads dim 0 only (time, per the T x B x C comment above)
+            x = conv(x)
+            x = F.glu(x, dim=2)  # gated linear unit halves dim 2 (the conv was built with out_channels * 2)
+            if attention is not None:
+                x = attention(x)
+            x = (x + residual) * math.sqrt(0.5)
+
+        # T x B x C -> B x T x C
+        x = x.transpose(1, 0)
+
+        # project back to size of embedding
+        x = self.fc2(x)
+
+        if encoder_padding_mask is not None:
+            encoder_padding_mask = encoder_padding_mask.t()  # -> B x T
+            x = x.masked_fill(encoder_padding_mask.unsqueeze(-1), 0)
+
+        # scale gradients (this only affects backward, not forward)
+        x = GradMultiply.apply(x, 1.0 / (2.0 * self.num_attention_layers))
+
+        # add output to input embedding for attention
+        y = (x + input_embedding.transpose(0, 1)) * math.sqrt(0.5)  # double transpose restores B x T x C
+
+        return {
+            'encoder_out': (x, y),
+            'encoder_padding_mask': encoder_padding_mask,  # B x T
+        }
+
+    def reorder_encoder_out(self, encoder_out, new_order):
+        """Re-index dim 0 of every encoder tensor with new_order; mutates encoder_out in place and returns it."""
+        encoder_out['encoder_out'] = tuple(
+            eo.index_select(0, new_order) for eo in encoder_out['encoder_out']
+        )
+        if encoder_out['encoder_padding_mask'] is not None:
+            encoder_out['encoder_padding_mask'] = \
+                encoder_out['encoder_padding_mask'].index_select(0, new_order)
+
+        if 'pretrained' in encoder_out:  # NOTE(review): only its 'encoder_out' is reordered, not any nested mask
+            encoder_out['pretrained']['encoder_out'] = tuple(
+                eo.index_select(0, new_order)
+                for eo in encoder_out['pretrained']['encoder_out']
+            )
+
+        return encoder_out
+
+    def max_positions(self):
+        """Maximum input length supported by the encoder (read from the positional embedding)."""
+        return self.embed_positions.max_positions
+
+
+@with_incremental_state
+class FConvDecoder(FairseqDecoder):
+    """Convolutional decoder with per-layer encoder attention, optional self-attention and optional fusion with a pretrained decoder"""
+    def __init__(
+        self, dictionary, embed_dim=512, out_embed_dim=256, max_positions=1024,
+        convolutions=((512, 3),) * 8, attention=True, dropout=0.1,
+        selfattention=False, attention_nheads=1, selfattention_nheads=1,
+        project_input=False, gated_attention=False, downsample=False,
+        pretrained=False, trained_decoder=None,
+    ):
+        super().__init__(dictionary)
+        self.register_buffer('version', torch.Tensor([2]))
+        self.pretrained = pretrained
+        self.pretrained_decoder = trained_decoder  # needed when pretrained is true: its fc2 gets a forward hook below
+        self.dropout_module = FairseqDropout(
+            dropout, module_name=self.__class__.__name__
+        )
+        self.need_attn = True  # changed by make_generation_fast_()
+        in_channels = convolutions[0][0]
+
+        def expand_bool_array(val):
+            if isinstance(val, bool):
+                # expand True into [True, True, ...] and do the same with False
+                return [val] * len(convolutions)
+            return val
+
+        attention = expand_bool_array(attention)
+        selfattention = expand_bool_array(selfattention)
+
+        # NOTE(review): only `attention` is validated; `selfattention` is indexed per layer below without a check
+        if not isinstance(attention, list) or len(attention) != len(convolutions):
+            raise ValueError('Attention is expected to be a list of booleans of '
+                             'length equal to the number of layers.')
+
+        num_embeddings = len(dictionary)
+        padding_idx = dictionary.pad()
+        self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
+
+        self.embed_positions = PositionalEmbedding(
+            max_positions,
+            embed_dim,
+            padding_idx,
+        )
+
+        self.fc1 = Linear(embed_dim, in_channels, dropout=dropout)
+        # the five lists below are filled in lockstep, one entry (module or None) per conv layer
+        self.projections = nn.ModuleList()
+        self.convolutions = nn.ModuleList()
+        self.attention = nn.ModuleList()
+        self.selfattention = nn.ModuleList()
+        self.attproj = nn.ModuleList()
+        for i, (out_channels, kernel_size) in enumerate(convolutions):
+            self.projections.append(
+                Linear(in_channels, out_channels) if in_channels != out_channels else None
+            )
+            self.convolutions.append(
+                LinearizedConv1d(
+                    in_channels, out_channels * 2, kernel_size,
+                    padding=(kernel_size - 1), dropout=dropout,
+                )
+            )
+
+            self.attention.append(
+                DownsampledMultiHeadAttention(
+                    out_channels, embed_dim, attention_nheads,
+                    project_input=project_input, gated=False, downsample=False,
+                ) if attention[i] else None
+            )
+
+            self.attproj.append(
+                Linear(out_channels, embed_dim, dropout=dropout) if attention[i] else None
+            )
+            self.selfattention.append(
+                SelfAttention(
+                    out_channels, embed_dim, selfattention_nheads,
+                    project_input=project_input, gated=gated_attention,
+                    downsample=downsample,
+                ) if selfattention[i] else None
+            )
+            in_channels = out_channels
+
+        self.fc2 = Linear(in_channels, out_embed_dim)
+        self.fc3 = Linear(out_embed_dim, num_embeddings, dropout=dropout)
+
+        # model fusion
+        if self.pretrained:
+            # independent gates are learned from the concatenated input
+            self.gate1 = nn.Sequential(Linear(out_embed_dim*2, out_embed_dim), nn.Sigmoid())
+            self.gate2 = nn.Sequential(Linear(out_embed_dim*2, out_embed_dim), nn.Sigmoid())
+            # pretrained and trained models are joined
+            self.joining = nn.Sequential(
+                Linear(out_embed_dim*2, out_embed_dim*2),
+                LayerNorm(out_embed_dim*2),
+                nn.GLU(),
+                Linear(out_embed_dim, out_embed_dim*2),
+                LayerNorm(out_embed_dim*2),
+                nn.GLU(),
+                Linear(out_embed_dim, out_embed_dim),
+                LayerNorm(out_embed_dim)
+            )
+            # pretrained model contains an output layer that is nhid -> vocab size
+            # but the models are combined in their hidden state
+            # the hook stores the output of the pretrained model forward
+            self.pretrained_outputs = {}
+
+            def save_output():
+                def hook(a, b, output):
+                    self.pretrained_outputs["out"] = output  # overwritten on every forward of the pretrained fc2
+                return hook
+
+            self.pretrained_decoder.fc2.register_forward_hook(save_output())
+
+    def forward(self, prev_output_tokens, encoder_out):
+        """Decode prev_output_tokens against encoder_out['encoder']; returns (output, avg_attn_scores)."""
+        trained_encoder_out = encoder_out['pretrained'] if self.pretrained else None
+        encoder_out = encoder_out['encoder']['encoder_out']
+        encoder_a, encoder_b = self._split_encoder_out(encoder_out)
+
+        # embed positions
+        positions = self.embed_positions(prev_output_tokens)
+
+        # embed tokens and positions
+        x = self.embed_tokens(prev_output_tokens) + positions
+        x = self.dropout_module(x)
+        target_embedding = x.transpose(0, 1)  # added to the attention query in every layer
+
+        # project to size of convolution
+        x = self.fc1(x)
+
+        # B x T x C -> T x B x C
+        x = x.transpose(0, 1)
+
+        # temporal convolutions
+        avg_attn_scores = None
+        for proj, conv, attention, selfattention, attproj in zip(
+            self.projections, self.convolutions, self.attention, self.selfattention, self.attproj
+        ):
+            residual = x if proj is None else proj(x)
+
+            x = self.dropout_module(x)
+            x = conv(x)
+            x = F.glu(x, dim=2)
+
+            # attention
+            if attention is not None:
+                r = x
+                x, attn_scores = attention(attproj(x) + target_embedding, encoder_a, encoder_b)
+                x = x + r
+                if not self.training and self.need_attn:
+                    if avg_attn_scores is None:
+                        avg_attn_scores = attn_scores
+                    else:
+                        avg_attn_scores.add_(attn_scores)  # in-place running sum; not divided by the layer count here
+
+            if selfattention is not None:
+                x = selfattention(x)
+
+            x = (x + residual) * math.sqrt(0.5)
+
+        # T x B x C -> B x T x C
+        x = x.transpose(0, 1)
+
+        # project to out_embed_dim, then (without fusion) on to vocabulary size
+        x = self.fc2(x)
+        x = self.dropout_module(x)
+        if not self.pretrained:
+            x = self.fc3(x)
+
+        # fusion gating
+        if self.pretrained:
+            # the call's return value is unused: the fc2 forward hook fills pretrained_outputs["out"]
+            trained_x, _ = self.pretrained_decoder.forward(prev_output_tokens, trained_encoder_out)
+            y = torch.cat([x, self.pretrained_outputs["out"]], dim=-1)
+            gate1 = self.gate1(y)
+            gate2 = self.gate2(y)
+            gated_x1 = gate1 * x
+            gated_x2 = gate2 * self.pretrained_outputs["out"]
+            fusion = torch.cat([gated_x1, gated_x2], dim=-1)
+            fusion = self.joining(fusion)
+            fusion_output = self.fc3(fusion)
+            return fusion_output, avg_attn_scores
+        else:
+            return x, avg_attn_scores
+
+    def max_positions(self):
+        """Maximum output length supported by the decoder."""
+        return self.embed_positions.max_positions
+
+    def make_generation_fast_(self, need_attn=False, **kwargs):
+        self.need_attn = need_attn  # gates attention-score accumulation in forward() (eval mode only)
+
+    def _split_encoder_out(self, encoder_out):
+        """Split and transpose encoder outputs."""
+        # transpose only once to speed up attention layers
+        encoder_a, encoder_b = encoder_out
+        encoder_a = encoder_a.transpose(0, 1).contiguous()
+        encoder_b = encoder_b.transpose(0, 1).contiguous()
+        result = (encoder_a, encoder_b)
+        return result
+
+
+class SelfAttention(nn.Module):
+    """q/k/v projections -> DownsampledMultiHeadAttention (mask_future_timesteps=True) -> residual add -> LayerNorm"""
+    def __init__(self, out_channels, embed_dim, num_heads, project_input=False, gated=False, downsample=False):
+        super().__init__()
+        self.attention = DownsampledMultiHeadAttention(
+            out_channels, embed_dim, num_heads, dropout=0, bias=True,
+            project_input=project_input, gated=gated, downsample=downsample,
+        )
+        self.in_proj_q = Linear(out_channels, embed_dim)
+        self.in_proj_k = Linear(out_channels, embed_dim)
+        self.in_proj_v = Linear(out_channels, embed_dim)
+        self.ln = LayerNorm(out_channels)
+
+    def forward(self, x):
+        residual = x
+        query = self.in_proj_q(x)
+        key = self.in_proj_k(x)
+        value = self.in_proj_v(x)
+        x, _ = self.attention(query, key, value, mask_future_timesteps=True, use_scalar_bias=True)  # second output dropped
+        return self.ln(x + residual)
+
+
+def Embedding(num_embeddings, embedding_dim, padding_idx):  # factory: nn.Embedding with weights re-initialised below
+    m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
+    m.weight.data.normal_(0, 0.1)  # NOTE(review): draws every row incl. padding_idx; it is not re-zeroed here -- confirm
+    return m
+
+
+def PositionalEmbedding(num_embeddings, embedding_dim, padding_idx):  # factory around LearnedPositionalEmbedding
+    m = LearnedPositionalEmbedding(num_embeddings, embedding_dim, padding_idx)
+    m.weight.data.normal_(0, 0.1)  # mean 0, std 0.1 over the whole weight table
+    return m
+
+
+def Linear(in_features, out_features, dropout=0.):
+    """Build an nn.Linear with zero bias and normal weights of std sqrt((1 - dropout) / in_features); no weight_norm is applied."""
+    layer = nn.Linear(in_features, out_features)
+    nn.init.normal_(layer.weight, mean=0, std=math.sqrt((1 - dropout) / in_features))
+    nn.init.zeros_(layer.bias)
+    return layer
+
+
+def LinearizedConv1d(in_channels, out_channels, kernel_size, dropout=0., **kwargs):
+    """Build a LinearizedConvolution with scaled-normal weights and zero bias (no weight_norm wrapper is applied here)"""
+    m = LinearizedConvolution(in_channels, out_channels, kernel_size, **kwargs)
+    std = math.sqrt((4 * (1.0 - dropout)) / (m.kernel_size[0] * in_channels))  # shrinks with kernel width * in_channels
+    m.weight.data.normal_(mean=0, std=std)
+    m.bias.data.zero_()
+    return m
+
+
+def ConvTBC(in_channels, out_channels, kernel_size, dropout=0., **kwargs):
+    """Build a fairseq.modules.ConvTBC with scaled-normal weights and zero bias (no weight_norm wrapper is applied here)"""
+    from fairseq.modules import ConvTBC  # local import: inside this factory the name ConvTBC now means the module class
+    m = ConvTBC(in_channels, out_channels, kernel_size, **kwargs)
+    std = math.sqrt((4 * (1.0 - dropout)) / (m.kernel_size[0] * in_channels))  # same formula as LinearizedConv1d
+    m.weight.data.normal_(mean=0, std=std)
+    m.bias.data.zero_()
+    return m
+
+
+@register_model_architecture('fconv_self_att', 'fconv_self_att')
+def base_architecture(args):  # sets each option that is absent from args; attributes already present are kept
+    args.dropout = getattr(args, 'dropout', 0.1)
+    args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 512)
+    args.encoder_layers = getattr(args, 'encoder_layers', '[(512, 3)] * 3')  # layer specs default to strings
+    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 512)
+    args.decoder_layers = getattr(args, 'decoder_layers', '[(512, 3)] * 8')
+    args.decoder_out_embed_dim = getattr(args, 'decoder_out_embed_dim', 256)
+    args.decoder_attention = getattr(args, 'decoder_attention', 'True')  # boolean-like options default to 'True'/'False' strings
+    args.self_attention = getattr(args, 'self_attention', 'False')
+    args.encoder_attention = getattr(args, 'encoder_attention', 'False')
+    args.multihead_attention_nheads = getattr(args, 'multihead_attention_nheads', 1)
+    args.multihead_self_attention_nheads = getattr(args, 'multihead_self_attention_nheads', 1)
+    args.encoder_attention_nheads = getattr(args, 'encoder_attention_nheads', 1)
+    args.project_input = getattr(args, 'project_input', 'False')
+    args.gated_attention = getattr(args, 'gated_attention', 'False')
+    args.downsample = getattr(args, 'downsample', 'False')
+    args.pretrained_checkpoint = getattr(args, 'pretrained_checkpoint', '')
+    args.pretrained = getattr(args, 'pretrained', 'False')
+
+
+@register_model_architecture('fconv_self_att', 'fconv_self_att_wp')
+def fconv_self_att_wp(args):  # preset: applies its own defaults first, then base_architecture() fills the rest
+    args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 256)
+    args.encoder_layers = getattr(args, 'encoder_layers', '[(128, 3)] * 2 + [(512,3)] * 1')
+    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 256)
+    args.decoder_layers = getattr(args, 'decoder_layers', '[(512, 4)] * 4 + [(768, 4)] * 2 + [(1024, 4)] * 1')
+    args.decoder_out_embed_dim = getattr(args, 'decoder_out_embed_dim', 256)
+    args.self_attention = getattr(args, 'self_attention', 'True')
+    args.multihead_self_attention_nheads = getattr(args, 'multihead_self_attention_nheads', 4)
+    args.project_input = getattr(args, 'project_input', 'True')
+    args.gated_attention = getattr(args, 'gated_attention', 'True')
+    args.downsample = getattr(args, 'downsample', 'True')
+    base_architecture(args)  # runs last, so the values set above win over the base defaults
diff --git a/fairseq/models/huggingface/__init__.py b/fairseq/models/huggingface/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..633315f54d9b7548d381d3271c19aecd5ef1d042
--- /dev/null
+++ b/fairseq/models/huggingface/__init__.py
@@ -0,0 +1,20 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import importlib
+import os
+
+
+# automatically import every public module or sub-directory of models/huggingface/;
+# importing runs each module's top-level code (e.g. its @register_model decorators)
+models_dir = os.path.dirname(__file__)
+for file in os.listdir(models_dir):
+    path = os.path.join(models_dir, file)
+    if file.startswith(('_', '.')):  # private / hidden entries are never imported
+        continue
+    if file.endswith('.py') or os.path.isdir(path):
+        # strip only the trailing '.py' (str.find would cut at the first '.py' anywhere in the name)
+        model_name = file[:-len('.py')] if file.endswith('.py') else file
+        module = importlib.import_module('fairseq.models.huggingface.' + model_name)
diff --git a/fairseq/models/huggingface/__pycache__/__init__.cpython-310.pyc b/fairseq/models/huggingface/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0f8508d69896ee63318b8dea27634e6aafdbe172
Binary files /dev/null and b/fairseq/models/huggingface/__pycache__/__init__.cpython-310.pyc differ
diff --git a/fairseq/models/huggingface/__pycache__/hf_gpt2.cpython-310.pyc b/fairseq/models/huggingface/__pycache__/hf_gpt2.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..124e87882360b5d9ade0a95a2a6c3d3e5aa201e2
Binary files /dev/null and b/fairseq/models/huggingface/__pycache__/hf_gpt2.cpython-310.pyc differ
diff --git a/fairseq/models/huggingface/hf_gpt2.py b/fairseq/models/huggingface/hf_gpt2.py
new file mode 100644
index 0000000000000000000000000000000000000000..6a03406ef6fccedd86ef3325dec47de5218456f3
--- /dev/null
+++ b/fairseq/models/huggingface/hf_gpt2.py
@@ -0,0 +1,199 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import os
+import sys
+from typing import Dict, List, Optional
+
+import torch
+from fairseq.models import (
+    FairseqIncrementalDecoder,
+    FairseqLanguageModel,
+    register_model,
+    register_model_architecture,
+)
+
+try:
+    # Prepend the transformers submodule to the path, so that
+    # it's prioritized over other installations. This allows
+    # making local changes in the submodule.
+    sys.path.insert(
+        0, os.path.join(os.path.dirname(__file__), 'transformers', 'src')
+    )  # the entry stays on sys.path even when the import below fails
+    from transformers import AutoModel, GPT2Config, GPT2LMHeadModel  # NOTE(review): AutoModel is not used in this file
+    has_hf = True
+except ImportError:
+    has_hf = False  # reported later as an ImportError when the model is constructed
+
+
+logger = logging.getLogger(__name__)
+
+
+DEFAULT_MAX_TARGET_POSITIONS = 1024
+
+
+@register_model('hf_gpt2')
+class HuggingFaceGPT2LanguageModel(FairseqLanguageModel):
+    """fairseq language model (registered as 'hf_gpt2') whose decoder is a HuggingFaceGPT2Decoder."""
+    def __init__(self, decoder):
+        super().__init__(decoder)
+        if not has_hf:  # has_hf records whether the module-level `transformers` import succeeded
+            raise ImportError(
+                '\n\nPlease install huggingface/transformers with:'
+                '\n\n  pip install transformers'
+                '\n\nOr to make local edits, install the submodule:'
+                '\n\n  git submodule update --init '
+                'fairseq/models/huggingface/transformers'
+            )
+
+    @staticmethod
+    def add_args(parser):
+        """Add model-specific arguments to the parser."""
+        # fmt: off
+        parser.add_argument('--embed-dim', type=int, metavar='N',
+                            help='embedding dimension')
+        parser.add_argument('--num-attention-heads', type=int, metavar='N',
+                            help='num attention heads')
+        parser.add_argument('--num-layers', type=int, metavar='N',
+                            help='num layers')
+        parser.add_argument('--dropout', type=float, metavar='D',
+                            help='dropout probability for all fully connected layers '
+                                 'in the embeddings, encoder, and pooler')
+        parser.add_argument('--attention-dropout', type=float, metavar='D',
+                            help='dropout probability for attention weights')
+        # fmt: on
+
+    @classmethod
+    def build_model(cls, args, task):
+        """Build a new model instance."""
+        default_architecture(args)  # fill in any option still missing on args before the decoder reads them
+        return cls(HuggingFaceGPT2Decoder(args, task))
+
+
+class HuggingFaceGPT2Decoder(FairseqIncrementalDecoder):
+    """Adapter exposing a HuggingFace GPT2LMHeadModel through the FairseqIncrementalDecoder interface."""
+
+    def __init__(self, args, task):
+        """Build a newly constructed (not pretrained) GPT-2 from ``args``, sized to ``task.target_dictionary``.
+
+        Raises:
+            ImportError: if ``transformers`` could not be imported when this module was loaded.
+        """
+        super().__init__(task.target_dictionary)
+
+        # Reuse the module-level import result instead of prepending the submodule
+        # path to sys.path again on every construction (that only added duplicates).
+        if not has_hf:
+            raise ImportError(
+                '\n\nPlease install huggingface/transformers with:'
+                '\n\n  pip install transformers'
+                '\n\nOr to make local edits, install the submodule:'
+                '\n\n  git submodule update --init '
+                'fairseq/models/huggingface/transformers'
+            )
+
+        config = GPT2Config(
+            vocab_size=len(task.target_dictionary),
+            n_positions=args.max_target_positions + 1,  # +1: position id 0 is used for padding (see wpe[0] below)
+            n_ctx=args.max_target_positions,
+            n_embd=args.embed_dim,
+            n_layer=args.num_layers,
+            n_head=args.num_attention_heads,
+            resid_pdrop=args.dropout,
+            embd_pdrop=args.dropout,
+            attn_pdrop=args.attention_dropout,
+            layer_norm_epsilon=1e-6,
+        )
+        self.model = GPT2LMHeadModel(config)
+
+        # set zero embedding for padding symbol
+        self.pad_idx = task.target_dictionary.pad()
+        self.model.transformer.wte.weight.data[self.pad_idx].zero_()
+        self.model.transformer.wpe.weight.data[0].zero_()
+
+    def forward(
+        self,
+        prev_output_tokens,
+        src_lengths=None,
+        incremental_state: Optional[Dict[str, List[torch.Tensor]]] = None,
+        encoder_out=None,
+    ):
+        """Return a 1-tuple holding the LM logits; src_lengths and encoder_out are accepted but not read."""
+        features = self.extract_features(prev_output_tokens, incremental_state)
+        return (self.model.lm_head(features), )
+
+    def extract_features(
+        self,
+        prev_output_tokens,
+        incremental_state: Optional[Dict[str, List[torch.Tensor]]] = None,
+    ):
+        """Run the GPT-2 transformer on prev_output_tokens and return its last hidden states (outputs[0])."""
+        # NOTE(review): an empty dict is falsy, so a caller that starts with ``{}``
+        # never reaches either incremental branch -- confirm that this is intended.
+        if incremental_state:
+            # the state dict has to be passed along with the key
+            past = self.get_incremental_state(incremental_state, "past")
+        else:
+            past = None
+
+        # don't attend to padding symbols
+        attention_mask = prev_output_tokens.ne(self.pad_idx).int()
+        # set position ids to exclude padding symbols (1-based; padding gets 0)
+        position_ids = attention_mask * (
+            torch.arange(1, 1 + prev_output_tokens.size(1))
+            .to(prev_output_tokens)
+            .repeat(prev_output_tokens.size(0), 1)
+        )
+
+        outputs = self.model.transformer(
+            input_ids=prev_output_tokens,
+            past=past,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+        )
+        if incremental_state:
+            self.set_incremental_state(incremental_state, "past", outputs[1])
+        return outputs[0]
+
+    def max_positions(self):
+        return self.model.config.n_positions - 1  # n_positions was configured as max_target_positions + 1
+
+
+@register_model_architecture('hf_gpt2', 'hf_gpt2')
+def default_architecture(args):
+    """Set each hf_gpt2 option that is absent from ``args`` (in place); attributes already present are kept."""
+    if getattr(args, 'max_target_positions', None) is None:
+        args.max_target_positions = getattr(args, 'tokens_per_sample', DEFAULT_MAX_TARGET_POSITIONS)
+    fallbacks = (
+        ('embed_dim', 768), ('num_attention_heads', 12), ('num_layers', 12),
+        ('dropout', 0.1), ('attention_dropout', 0.1),
+    )
+    for opt, value in fallbacks:
+        setattr(args, opt, getattr(args, opt, value))
+
+
+@register_model_architecture('hf_gpt2', 'hf_gpt2_medium')
+def hf_gpt2_medium(args):
+    """Medium preset: 1024-dim embeddings, 16 heads, 24 layers for options absent from args; then the base defaults."""
+    for opt, preset in (('embed_dim', 1024), ('num_attention_heads', 16), ('num_layers', 24)):
+        setattr(args, opt, getattr(args, opt, preset))
+    default_architecture(args)
+
+
+@register_model_architecture('hf_gpt2', 'hf_gpt2_large')
+def hf_gpt2_large(args):
+    """Large preset: 1280-dim embeddings, 20 heads, 36 layers for options absent from args; then the base defaults."""
+    for opt, preset in (('embed_dim', 1280), ('num_attention_heads', 20), ('num_layers', 36)):
+        setattr(args, opt, getattr(args, opt, preset))
+    default_architecture(args)
+
+
+@register_model_architecture('hf_gpt2', 'hf_gpt2_xl')
+def hf_gpt2_xl(args):
+    """XL preset: 1600-dim embeddings, 25 heads, 48 layers for options absent from args; then the base defaults."""
+    for opt, preset in (('embed_dim', 1600), ('num_attention_heads', 25), ('num_layers', 48)):
+        setattr(args, opt, getattr(args, opt, preset))
+    default_architecture(args)
diff --git a/fairseq/models/lightconv.py b/fairseq/models/lightconv.py
new file mode 100644
index 0000000000000000000000000000000000000000..05939e1c758d6ce93b0bb4c1cd83bbc97ac98e8b
--- /dev/null
+++ b/fairseq/models/lightconv.py
@@ -0,0 +1,786 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from fairseq import options, utils
+from fairseq.models import (
+    FairseqEncoder,
+    FairseqIncrementalDecoder,
+    FairseqEncoderDecoderModel,
+    register_model,
+    register_model_architecture,
+)
+from fairseq.modules import (
+    AdaptiveSoftmax,
+    DynamicConv,
+    FairseqDropout,
+    LayerNorm,
+    PositionalEmbedding,
+    LightweightConv,
+    MultiheadAttention,
+)
+
+
+@register_model('lightconv')
+class LightConvModel(FairseqEncoderDecoderModel):
+    """
+    LightConv and DynamicConv model from `"Pay Less Attention with Lightweight and Dynamic Convolutions" (Wu, et al, 2019)
+    <https://openreview.net/pdf?id=SkVhlh09tX>`_.
+    To use LightConv please set ``--encoder-conv-type lightweight --decoder-conv-type lightweight``
+    To use DynamicConv please set ``--encoder-conv-type dynamic --decoder-conv-type dynamic``
+
+    Args:
+        encoder (LightConvEncoder): the encoder
+        decoder (LightConvDecoder): the decoder
+
+    The LightConv model provides the following named architectures and
+    command-line arguments:
+
+    .. argparse::
+        :ref: fairseq.models.lightconv_parser
+        :prog:
+    """
+
+    @classmethod
+    def hub_models(cls):
+        """Return a mapping from pretrained model name to its hub spec.
+
+        Every entry is a dict with the archive ``path`` plus the
+        ``tokenizer`` (``moses``) and ``bpe`` (``subword_nmt``) settings.
+        """
+        # fmt: off
+
+        def moses_subword(path):
+            return {
+                'path': path,
+                'tokenizer': 'moses',
+                'bpe': 'subword_nmt',
+            }
+
+        return {
+            'lightconv.no_glu.iwslt14.de-en': moses_subword('https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/iwslt14.de-en.lightconv.tar.gz'),
+            'dynamicconv.no_glu.iwslt14.de-en': moses_subword('https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/iwslt14.de-en.dynamicconv.tar.gz'),
+            'lightconv.no_glu.wmt16.en-de': moses_subword('https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt16.en-de.joined-dict.lightconv.tar.gz'),
+            'dynamicconv.no_glu.wmt16.en-de': moses_subword('https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt16.en-de.joined-dict.dynamicconv.tar.gz'),
+            'lightconv.glu.wmt16.en-de': moses_subword('https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt16.en-de.joined-dict.lightconv-glu.tar.gz'),
+            'dynamicconv.glu.wmt16.en-de': moses_subword('https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt16.en-de.joined-dict.dynamicconv-glu.tar.gz'),
+            # NOTE(review): the two wmt17.en-de entries below point at the
+            # same wmt16.en-de archives as the entries above -- confirm
+            # that this aliasing is intentional.
+            'lightconv.glu.wmt17.en-de': moses_subword('https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt16.en-de.joined-dict.lightconv-glu.tar.gz'),
+            'dynamicconv.glu.wmt17.en-de': moses_subword('https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt16.en-de.joined-dict.dynamicconv-glu.tar.gz'),
+            'lightconv.glu.wmt14.en-fr': moses_subword('https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt14.en-fr.joined-dict.lightconv-glu.tar.gz'),
+            'dynamicconv.glu.wmt14.en-fr': moses_subword('https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt14.en-fr.joined-dict.dynamicconv-glu.tar.gz'),
+            'lightconv.glu.wmt17.zh-en': moses_subword('https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt17.zh-en.lightconv-glu.tar.gz'),
+            'dynamicconv.glu.wmt17.zh-en': moses_subword('https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt17.zh-en.dynamicconv-glu.tar.gz'),
+        }
+        # fmt: on
+
+    def __init__(self, encoder, decoder):
+        super().__init__(encoder, decoder)
+
+    @staticmethod
+    def add_args(parser):
+        """Add model-specific arguments to the parser."""
+        parser.add_argument('--dropout', type=float, metavar='D',
+                            help='dropout probability')
+        parser.add_argument('--attention-dropout', type=float, metavar='D',
+                            help='dropout probability for attention weights')
+        parser.add_argument('--relu-dropout', type=float, metavar='D',
+                            help='dropout probability after ReLU in FFN')
+        parser.add_argument('--input-dropout', type=float, metavar='D',
+                            help='dropout probability of the inputs')
+        parser.add_argument('--encoder-embed-path', type=str, metavar='STR',
+                            help='path to pre-trained encoder embedding')
+        parser.add_argument('--encoder-embed-dim', type=int, metavar='N',
+                            help='encoder embedding dimension')
+        # the conv dim sizes the projection feeding the convolution, so it
+        # is described as such rather than as an embedding dimension
+        parser.add_argument('--encoder-conv-dim', type=int, metavar='N',
+                            help='encoder convolution dimension')
+        parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='N',
+                            help='encoder embedding dimension for FFN')
+        parser.add_argument('--encoder-layers', type=int, metavar='N',
+                            help='num encoder layers')
+        parser.add_argument('--encoder-attention-heads', type=int, metavar='N',
+                            help='num encoder attention heads or LightConv/DynamicConv heads')
+        parser.add_argument('--encoder-normalize-before', action='store_true',
+                            help='apply layernorm before each encoder block')
+        parser.add_argument('--encoder-learned-pos', action='store_true',
+                            help='use learned positional embeddings in the encoder')
+        parser.add_argument('--decoder-embed-path', type=str, metavar='STR',
+                            help='path to pre-trained decoder embedding')
+        parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
+                            help='decoder embedding dimension')
+        parser.add_argument('--decoder-conv-dim', type=int, metavar='N',
+                            help='decoder convolution dimension')
+        parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
+                            help='decoder embedding dimension for FFN')
+        parser.add_argument('--decoder-layers', type=int, metavar='N',
+                            help='num decoder layers')
+        parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
+                            help='num decoder attention heads or LightConv/DynamicConv heads')
+        parser.add_argument('--decoder-learned-pos', action='store_true',
+                            help='use learned positional embeddings in the decoder')
+        parser.add_argument('--decoder-normalize-before', action='store_true',
+                            help='apply layernorm before each decoder block')
+        parser.add_argument('--share-decoder-input-output-embed', action='store_true',
+                            help='share decoder input and output embeddings')
+        parser.add_argument('--share-all-embeddings', action='store_true',
+                            help='share encoder, decoder and output embeddings'
+                                 ' (requires shared dictionary and embed dim)')
+        parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
+                            help='comma separated list of adaptive softmax cutoff points. '
+                                 'Must be used with adaptive_loss criterion')
+        parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',
+                            help='sets adaptive softmax dropout for the tail projections')
+
+        # LightConv and DynamicConv arguments
+        parser.add_argument('--encoder-kernel-size-list', type=lambda x: options.eval_str_list(x, int),
+                            help='list of kernel size (default: "[3,7,15,31,31,31,31]")')
+        parser.add_argument('--decoder-kernel-size-list', type=lambda x: options.eval_str_list(x, int),
+                            help='list of kernel size (default: "[3,7,15,31,31,31]")')
+        parser.add_argument('--encoder-glu', type=options.eval_bool,
+                            help='glu after in proj')
+        parser.add_argument('--decoder-glu', type=options.eval_bool,
+                            help='glu after in proj')
+        parser.add_argument('--encoder-conv-type', default='dynamic', type=str,
+                            choices=['dynamic', 'lightweight'],
+                            help='type of convolution')
+        parser.add_argument('--decoder-conv-type', default='dynamic', type=str,
+                            choices=['dynamic', 'lightweight'],
+                            help='type of convolution')
+        parser.add_argument('--weight-softmax', default=True, type=options.eval_bool)
+        parser.add_argument('--weight-dropout', type=float, metavar='D',
+                            help='dropout probability for conv weights')
+
+    @classmethod
+    def build_model(cls, args, task):
+        """Build a new model instance.
+
+        Args:
+            args (argparse.Namespace): parsed command-line arguments; missing
+                attributes are filled in via ``base_architecture``
+            task: object exposing ``source_dictionary`` and
+                ``target_dictionary``
+
+        Returns:
+            LightConvModel: the assembled encoder-decoder model
+
+        Raises:
+            RuntimeError: if ``--share-all-embeddings`` is set but the
+                dictionaries differ, the embedding dims differ, or a
+                distinct ``--decoder-embed-path`` is given
+        """
+
+        # make sure all arguments are present in older models
+        base_architecture(args)
+
+        if not hasattr(args, 'max_source_positions'):
+            args.max_source_positions = 1024
+        if not hasattr(args, 'max_target_positions'):
+            args.max_target_positions = 1024
+
+        src_dict, tgt_dict = task.source_dictionary, task.target_dictionary
+
+        def build_embedding(dictionary, embed_dim, path=None):
+            num_embeddings = len(dictionary)
+            padding_idx = dictionary.pad()
+            emb = Embedding(num_embeddings, embed_dim, padding_idx)
+            # if provided, load from preloaded dictionaries
+            if path:
+                embed_dict = utils.parse_embedding(path)
+                utils.load_embedding(embed_dict, dictionary, emb)
+            return emb
+
+        if args.share_all_embeddings:
+            if src_dict != tgt_dict:
+                raise RuntimeError('--share-all-embeddings requires a joined dictionary')
+            if args.encoder_embed_dim != args.decoder_embed_dim:
+                raise RuntimeError(
+                    '--share-all-embeddings requires --encoder-embed-dim to match --decoder-embed-dim')
+            if args.decoder_embed_path and (
+                    args.decoder_embed_path != args.encoder_embed_path):
+                raise RuntimeError('--share-all-embeddings not compatible with --decoder-embed-path')
+            encoder_embed_tokens = build_embedding(
+                src_dict, args.encoder_embed_dim, args.encoder_embed_path
+            )
+            # one embedding object serves encoder input, decoder input and
+            # (via the flag below) the decoder output projection
+            decoder_embed_tokens = encoder_embed_tokens
+            args.share_decoder_input_output_embed = True
+        else:
+            encoder_embed_tokens = build_embedding(
+                src_dict, args.encoder_embed_dim, args.encoder_embed_path
+            )
+            decoder_embed_tokens = build_embedding(
+                tgt_dict, args.decoder_embed_dim, args.decoder_embed_path
+            )
+
+        encoder = LightConvEncoder(args, src_dict, encoder_embed_tokens)
+        decoder = LightConvDecoder(args, tgt_dict, decoder_embed_tokens)
+        return LightConvModel(encoder, decoder)
+
+
+class LightConvEncoder(FairseqEncoder):
+    """
+    Stack of *args.encoder_layers* :class:`LightConvEncoderLayer` blocks
+    applied to scaled token embeddings plus optional positional embeddings.
+
+    Args:
+        args (argparse.Namespace): parsed command-line arguments
+        dictionary (~fairseq.data.Dictionary): encoding dictionary
+        embed_tokens (torch.nn.Embedding): input embedding
+    """
+
+    def __init__(self, args, dictionary, embed_tokens):
+        super().__init__(dictionary)
+        self.dropout_module = FairseqDropout(args.dropout, module_name=self.__class__.__name__)
+
+        embed_dim = embed_tokens.embedding_dim
+        self.padding_idx = embed_tokens.padding_idx
+        self.max_source_positions = args.max_source_positions
+
+        self.embed_tokens = embed_tokens
+        self.embed_scale = math.sqrt(embed_dim)
+
+        # positional embeddings can be switched off entirely
+        if args.no_token_positional_embeddings:
+            self.embed_positions = None
+        else:
+            self.embed_positions = PositionalEmbedding(
+                args.max_source_positions, embed_dim, self.padding_idx,
+                learned=args.encoder_learned_pos,
+            )
+
+        # one layer per entry, each with its own kernel size
+        self.layers = nn.ModuleList([])
+        encoder_blocks = [
+            LightConvEncoderLayer(args, kernel_size=args.encoder_kernel_size_list[layer_idx])
+            for layer_idx in range(args.encoder_layers)
+        ]
+        self.layers.extend(encoder_blocks)
+
+        self.register_buffer('version', torch.Tensor([2]))
+        self.normalize = args.encoder_normalize_before
+        if self.normalize:
+            self.layer_norm = LayerNorm(embed_dim)
+
+    def forward(self, src_tokens, **unused):
+        """
+        Encode a batch of source tokens.
+
+        Args:
+            src_tokens (LongTensor): tokens in the source language of shape
+                `(batch, src_len)`
+
+        Returns:
+            dict:
+                - **encoder_out** (Tensor): the last encoder layer's output of
+                  shape `(src_len, batch, embed_dim)`
+                - **encoder_padding_mask** (Tensor): the positions of
+                  padding elements of shape `(batch, src_len)`, or ``None``
+                  when the batch contains no padding
+        """
+        # scaled token embeddings, plus positions when enabled
+        hidden = self.embed_scale * self.embed_tokens(src_tokens)
+        if self.embed_positions is not None:
+            hidden += self.embed_positions(src_tokens)
+        hidden = self.dropout_module(hidden)
+
+        # B x T x C -> T x B x C
+        hidden = hidden.transpose(0, 1)
+
+        # the mask is dropped altogether when nothing is padded
+        pad_mask = src_tokens.eq(self.padding_idx)
+        encoder_padding_mask = pad_mask if pad_mask.any() else None
+
+        for block in self.layers:
+            hidden = block(hidden, encoder_padding_mask)
+
+        if self.normalize:
+            hidden = self.layer_norm(hidden)
+
+        return {
+            'encoder_out': hidden,  # T x B x C
+            'encoder_padding_mask': encoder_padding_mask,  # B x T
+        }
+
+    def reorder_encoder_out(self, encoder_out, new_order):
+        """
+        Reorder encoder output according to *new_order*.
+
+        The dict is updated in place and also returned.
+
+        Args:
+            encoder_out: output from the ``forward()`` method
+            new_order (LongTensor): desired order
+
+        Returns:
+            *encoder_out* rearranged according to *new_order*
+        """
+        # (key, dimension that indexes the batch) for each stored tensor
+        batch_dims = (('encoder_out', 1), ('encoder_padding_mask', 0))
+        for key, batch_dim in batch_dims:
+            if encoder_out[key] is not None:
+                encoder_out[key] = encoder_out[key].index_select(batch_dim, new_order)
+        return encoder_out
+
+    def max_positions(self):
+        """Maximum input length supported by the encoder."""
+        limit = self.max_source_positions
+        if self.embed_positions is not None:
+            limit = min(limit, self.embed_positions.max_positions)
+        return limit
+
+
+class LightConvDecoder(FairseqIncrementalDecoder):
+    """
+    LightConv decoder consisting of *args.decoder_layers* layers. Each layer
+    is a :class:`LightConvDecoderLayer`.
+
+    Args:
+        args (argparse.Namespace): parsed command-line arguments
+        dictionary (~fairseq.data.Dictionary): decoding dictionary
+        embed_tokens (torch.nn.Embedding): output embedding
+        no_encoder_attn (bool, optional): forwarded to every
+            :class:`LightConvDecoderLayer`. Default: ``False``
+        final_norm (bool, optional): when ``False`` the final layer norm is
+            skipped even if ``args.decoder_normalize_before`` is set.
+            Default: ``True``
+    """
+
+    def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False, final_norm=True):
+        super().__init__(dictionary)
+        self.dropout_module = FairseqDropout(args.dropout, module_name=self.__class__.__name__)
+        self.share_input_output_embed = args.share_decoder_input_output_embed
+
+        # three widths are involved: the embedding table, the layer stack,
+        # and the features handed to the output projection
+        input_embed_dim = embed_tokens.embedding_dim
+        embed_dim = args.decoder_embed_dim
+        output_embed_dim = args.decoder_output_dim
+
+        padding_idx = embed_tokens.padding_idx
+        self.max_target_positions = args.max_target_positions
+
+        self.embed_tokens = embed_tokens
+        self.embed_scale = math.sqrt(embed_dim)  # todo: try with input_embed_dim
+
+        # only project when the embedding width differs from the layer width
+        self.project_in_dim = Linear(input_embed_dim, embed_dim, bias=False) if embed_dim != input_embed_dim else None
+
+        self.embed_positions = PositionalEmbedding(
+            args.max_target_positions, embed_dim, padding_idx,
+            learned=args.decoder_learned_pos,
+        ) if not args.no_token_positional_embeddings else None
+
+        # one layer per entry, each with its own kernel size
+        self.layers = nn.ModuleList([])
+        self.layers.extend([
+            LightConvDecoderLayer(args, no_encoder_attn, kernel_size=args.decoder_kernel_size_list[i])
+            for i in range(args.decoder_layers)
+        ])
+
+        self.adaptive_softmax = None
+
+        # args.tie_adaptive_weights is only read when the two widths differ
+        # (short-circuit evaluation of the condition below)
+        self.project_out_dim = Linear(embed_dim, output_embed_dim, bias=False) \
+            if embed_dim != output_embed_dim and not args.tie_adaptive_weights else None
+
+        # output head: adaptive softmax, a dedicated output matrix, or (when
+        # input/output embeddings are shared) the embedding table itself
+        if args.adaptive_softmax_cutoff is not None:
+            self.adaptive_softmax = AdaptiveSoftmax(
+                len(dictionary),
+                output_embed_dim,
+                options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
+                dropout=args.adaptive_softmax_dropout,
+                adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None,
+                factor=args.adaptive_softmax_factor,
+                tie_proj=args.tie_adaptive_proj,
+            )
+        elif not self.share_input_output_embed:
+            self.embed_out = nn.Parameter(torch.Tensor(len(dictionary), output_embed_dim))
+            nn.init.normal_(self.embed_out, mean=0, std=output_embed_dim ** -0.5)
+        self.register_buffer('version', torch.Tensor([2]))
+        self.normalize = args.decoder_normalize_before and final_norm
+        if self.normalize:
+            self.layer_norm = LayerNorm(embed_dim)
+
+    def forward(self, prev_output_tokens, encoder_out=None, incremental_state=None, **kwargs):
+        """
+        Args:
+            prev_output_tokens (LongTensor): previous decoder outputs of shape
+                `(batch, tgt_len)`, for teacher forcing
+            encoder_out (dict, optional): output from the encoder; its
+                ``'encoder_out'`` and ``'encoder_padding_mask'`` entries are
+                passed to every layer for encoder-side attention
+            incremental_state (dict): dictionary used for storing state during
+                :ref:`Incremental decoding`
+
+        Returns:
+            tuple:
+                - the decoder output in `(batch, tgt_len, ...)` layout; it is
+                  projected to the vocabulary only when no adaptive softmax
+                  is configured, otherwise the features are returned as-is
+                - a dict with ``'attn'`` (attention returned by the last
+                  layer, possibly ``None``) and ``'inner_states'`` (the
+                  layer input followed by every layer's output)
+        """
+        # embed positions
+        # (computed on the full prefix, before it is cut to the last step)
+        positions = self.embed_positions(
+            prev_output_tokens,
+            incremental_state=incremental_state,
+        ) if self.embed_positions is not None else None
+
+        # incremental decoding only feeds the newest time step
+        if incremental_state is not None:
+            prev_output_tokens = prev_output_tokens[:, -1:]
+            if positions is not None:
+                positions = positions[:, -1:]
+
+        # embed tokens and positions
+        x = self.embed_scale * self.embed_tokens(prev_output_tokens)
+
+        if self.project_in_dim is not None:
+            x = self.project_in_dim(x)
+
+        if positions is not None:
+            x += positions
+        x = self.dropout_module(x)
+
+        # B x T x C -> T x B x C
+        x = x.transpose(0, 1)
+        attn = None
+
+        inner_states = [x]
+
+        # decoder layers
+        for layer in self.layers:
+            x, attn = layer(
+                x,
+                encoder_out['encoder_out'] if encoder_out is not None else None,
+                encoder_out['encoder_padding_mask'] if encoder_out is not None else None,
+                incremental_state,
+            )
+            inner_states.append(x)
+
+        if self.normalize:
+            x = self.layer_norm(x)
+
+        # T x B x C -> B x T x C
+        x = x.transpose(0, 1)
+
+        if self.project_out_dim is not None:
+            x = self.project_out_dim(x)
+
+        if self.adaptive_softmax is None:
+            # project back to size of vocabulary
+            if self.share_input_output_embed:
+                x = F.linear(x, self.embed_tokens.weight)
+            else:
+                x = F.linear(x, self.embed_out)
+
+        return x, {'attn': attn, 'inner_states': inner_states}
+
+    def max_positions(self):
+        """Maximum output length supported by the decoder."""
+        if self.embed_positions is None:
+            return self.max_target_positions
+        return min(self.max_target_positions, self.embed_positions.max_positions)
+
+    def buffered_future_mask(self, tensor):
+        """Return a cached `(dim, dim)` mask, where ``dim = tensor.size(0)``.
+
+        The mask is the strictly upper-triangular part (``torch.triu(..., 1)``)
+        of a matrix filled by ``utils.fill_with_neg_inf`` (presumably ``-inf``;
+        confirm against that helper), with zeros on and below the diagonal.
+        The cache is rebuilt when it is missing or on another device, and
+        enlarged in place when it is too small.
+        """
+        dim = tensor.size(0)
+        if not hasattr(self, '_future_mask') or self._future_mask is None or self._future_mask.device != tensor.device:
+            self._future_mask = torch.triu(utils.fill_with_neg_inf(tensor.new(dim, dim)), 1)
+        # grow the cached mask when a longer sequence shows up
+        if self._future_mask.size(0) < dim:
+            self._future_mask = torch.triu(utils.fill_with_neg_inf(self._future_mask.resize_(dim, dim)), 1)
+        return self._future_mask[:dim, :dim]
+
+
+class LightConvEncoderLayer(nn.Module):
+    """Encoder layer block: a convolution sub-block followed by an FFN.
+
+    Both sub-blocks are residual and use their own layer norm, applied
+    before or after the sub-block depending on
+    ``args.encoder_normalize_before``.
+
+    Args:
+        args (argparse.Namespace): parsed command-line arguments
+        kernel_size: kernel size of the convolution
+    """
+
+    def __init__(self, args, kernel_size=0):
+        super().__init__()
+        self.embed_dim = args.encoder_embed_dim
+        self.conv_dim = args.encoder_conv_dim
+
+        # odd kernels pad symmetrically; even kernels get a (left, right) pair
+        if kernel_size % 2 == 1:
+            padding_l = kernel_size // 2
+        else:
+            padding_l = ((kernel_size - 1) // 2, kernel_size // 2)
+
+        # with GLU the input projection is twice as wide, since nn.GLU
+        # halves the last dimension again
+        if args.encoder_glu:
+            in_proj_width, activation = 2*self.conv_dim, nn.GLU
+        else:
+            in_proj_width, activation = self.conv_dim, None
+        self.linear1 = Linear(self.embed_dim, in_proj_width)
+        self.act = activation() if activation is not None else None
+
+        if args.encoder_conv_type == 'lightweight':
+            conv_factory = LightweightConv
+        elif args.encoder_conv_type == 'dynamic':
+            conv_factory = DynamicConv
+        else:
+            raise NotImplementedError
+        self.conv = conv_factory(
+            self.conv_dim, kernel_size, padding_l=padding_l,
+            weight_softmax=args.weight_softmax,
+            num_heads=args.encoder_attention_heads,
+            weight_dropout=args.weight_dropout,
+        )
+        self.linear2 = Linear(self.conv_dim, self.embed_dim)
+
+        owner = self.__class__.__name__
+        self.dropout_module = FairseqDropout(args.dropout, module_name=owner)
+        self.relu_dropout_module = FairseqDropout(args.relu_dropout, module_name=owner)
+        self.input_dropout_module = FairseqDropout(args.input_dropout, module_name=owner)
+        self.normalize_before = args.encoder_normalize_before
+        self.fc1 = Linear(self.embed_dim, args.encoder_ffn_embed_dim)
+        self.fc2 = Linear(args.encoder_ffn_embed_dim, self.embed_dim)
+        self.layer_norms = nn.ModuleList([LayerNorm(self.embed_dim) for _ in range(2)])
+
+    def forward(self, x, encoder_padding_mask):
+        """
+        Args:
+            x (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)`
+            encoder_padding_mask (Tensor): mask of shape `(batch, src_len)`
+                that is true at padding positions, or ``None``
+
+        Returns:
+            encoded output with the same layout as the input *x* (no
+            transpose is applied and both sub-blocks are residual)
+        """
+        # --- convolution sub-block ---
+        shortcut = x
+        h = self.maybe_layer_norm(0, x, before=True)
+        h = self.input_dropout_module(h)
+        h = self.linear1(h)
+        if self.act is not None:
+            h = self.act(h)
+        if encoder_padding_mask is not None:
+            # B x T -> T x B x 1, so padded positions are zeroed before the conv
+            time_major_mask = encoder_padding_mask.transpose(0, 1).unsqueeze(2)
+            h = h.masked_fill(time_major_mask, 0)
+        h = self.conv(h)
+        h = self.linear2(h)
+        h = self.dropout_module(h)
+        h = shortcut + h
+        h = self.maybe_layer_norm(0, h, after=True)
+
+        # --- feed-forward sub-block ---
+        shortcut = h
+        h = self.maybe_layer_norm(1, h, before=True)
+        h = F.relu(self.fc1(h))
+        h = self.relu_dropout_module(h)
+        h = self.fc2(h)
+        h = self.dropout_module(h)
+        h = shortcut + h
+        h = self.maybe_layer_norm(1, h, after=True)
+        return h
+
+    def maybe_layer_norm(self, i, x, before=False, after=False):
+        """Apply ``layer_norms[i]`` only at the configured position.
+
+        Exactly one of *before* / *after* must be set. The norm runs at the
+        "before" call site when ``normalize_before`` is true and at the
+        "after" call site otherwise; at the other site *x* is returned as-is.
+        """
+        assert before ^ after
+        if not (after ^ self.normalize_before):
+            return x
+        return self.layer_norms[i](x)
+
+    def extra_repr(self):
+        template = 'dropout={}, relu_dropout={}, input_dropout={}, normalize_before={}'
+        return template.format(
+            self.dropout_module.p,
+            self.relu_dropout_module.p,
+            self.input_dropout_module.p,
+            self.normalize_before,
+        )
+
+
+class LightConvDecoderLayer(nn.Module):
+    """Decoder layer block.
+
+    Consists of a convolution sub-block, an optional encoder-attention
+    sub-block and a feed-forward sub-block, each residual and each with its
+    own layer norm.
+
+    Args:
+        args (argparse.Namespace): parsed command-line arguments
+        no_encoder_attn (bool, optional): when ``True`` the layer is built
+            without the encoder-attention sub-block.
+            Default: ``False``
+        kernel_size: kernel size of the convolution
+    """
+
+    def __init__(self, args, no_encoder_attn=False, kernel_size=0):
+        super().__init__()
+        self.embed_dim = args.decoder_embed_dim
+        self.conv_dim = args.decoder_conv_dim
+        # with GLU the input projection is twice as wide, since nn.GLU halves
+        # the last dimension again
+        if args.decoder_glu:
+            self.linear1 = Linear(self.embed_dim, 2*self.conv_dim)
+            self.act = nn.GLU()
+        else:
+            self.linear1 = Linear(self.embed_dim, self.conv_dim)
+            self.act = None
+        # NOTE(review): padding_l=kernel_size-1 puts all padding on the left,
+        # presumably so the convolution never sees future positions --
+        # confirm against LightweightConv / DynamicConv
+        if args.decoder_conv_type == 'lightweight':
+            self.conv = LightweightConv(self.conv_dim, kernel_size, padding_l=kernel_size-1,
+                                        weight_softmax=args.weight_softmax,
+                                        num_heads=args.decoder_attention_heads,
+                                        weight_dropout=args.weight_dropout)
+        elif args.decoder_conv_type == 'dynamic':
+            self.conv = DynamicConv(self.conv_dim, kernel_size, padding_l=kernel_size-1,
+                                    weight_softmax=args.weight_softmax,
+                                    num_heads=args.decoder_attention_heads,
+                                    weight_dropout=args.weight_dropout)
+        else:
+            raise NotImplementedError
+        self.linear2 = Linear(self.conv_dim, self.embed_dim)
+
+        self.dropout_module = FairseqDropout(args.dropout, module_name=self.__class__.__name__)
+        self.relu_dropout_module = FairseqDropout(args.relu_dropout, module_name=self.__class__.__name__)
+        self.input_dropout_module = FairseqDropout(args.input_dropout, module_name=self.__class__.__name__)
+        self.normalize_before = args.decoder_normalize_before
+
+        self.conv_layer_norm = LayerNorm(self.embed_dim)
+
+        if no_encoder_attn:
+            self.encoder_attn = None
+            self.encoder_attn_layer_norm = None
+        else:
+            self.encoder_attn = MultiheadAttention(
+                self.embed_dim, args.decoder_attention_heads,
+                dropout=args.attention_dropout, encoder_decoder_attention=True,
+            )
+            self.encoder_attn_layer_norm = LayerNorm(self.embed_dim)
+
+        self.fc1 = Linear(self.embed_dim, args.decoder_ffn_embed_dim)
+        self.fc2 = Linear(args.decoder_ffn_embed_dim, self.embed_dim)
+
+        self.final_layer_norm = LayerNorm(self.embed_dim)
+        # may be switched off through make_generation_fast_()
+        self.need_attn = True
+
+    def forward(self, x, encoder_out, encoder_padding_mask, incremental_state,
+                prev_conv_state=None, prev_attn_state=None, conv_mask=None,
+                conv_padding_mask=None):
+        """
+        Args:
+            x (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)`
+            encoder_out (Tensor): encoder output, used as key and value of
+                the encoder attention
+            encoder_padding_mask (ByteTensor): binary ByteTensor of shape
+                `(batch, src_len)` where padding elements are indicated by ``1``.
+            incremental_state (dict): state shared with the convolution and
+                the encoder attention during incremental decoding
+            prev_conv_state (optional): if given, written into the
+                convolution's input buffer before it runs
+            prev_attn_state (optional): ``(prev_key, prev_value)`` pair; if
+                given, written into the encoder attention's input buffer
+            conv_mask, conv_padding_mask: accepted but not used by this
+                method
+
+        Returns:
+            tuple:
+                - the layer output, in the same layout as the input *x*
+                  (every sub-block is residual)
+                - the attention weights returned by the encoder attention,
+                  or ``None`` when the layer has no encoder attention
+        """
+        # --- convolution sub-block ---
+        residual = x
+        x = self.maybe_layer_norm(self.conv_layer_norm, x, before=True)
+        if prev_conv_state is not None:
+            # the buffer lives in incremental_state, so create one if needed
+            if incremental_state is None:
+                incremental_state = {}
+            self.conv._set_input_buffer(incremental_state, prev_conv_state)
+        x = self.input_dropout_module(x)
+        x = self.linear1(x)
+        if self.act is not None:
+            x = self.act(x)
+        x = self.conv(x, incremental_state=incremental_state)
+        x = self.linear2(x)
+        x = self.dropout_module(x)
+        x = residual + x
+        x = self.maybe_layer_norm(self.conv_layer_norm, x, after=True)
+
+        # --- encoder-attention sub-block (optional) ---
+        attn = None
+        if self.encoder_attn is not None:
+            residual = x
+            x = self.maybe_layer_norm(self.encoder_attn_layer_norm, x, before=True)
+            if prev_attn_state is not None:
+                if incremental_state is None:
+                    incremental_state = {}
+                prev_key, prev_value = prev_attn_state
+                saved_state = {"prev_key": prev_key, "prev_value": prev_value}
+                self.encoder_attn._set_input_buffer(incremental_state, saved_state)
+            x, attn = self.encoder_attn(
+                query=x,
+                key=encoder_out,
+                value=encoder_out,
+                key_padding_mask=encoder_padding_mask,
+                incremental_state=incremental_state,
+                static_kv=True,
+                # weights are only requested in eval mode and when need_attn is set
+                need_weights=(not self.training and self.need_attn),
+            )
+            x = self.dropout_module(x)
+            x = residual + x
+            x = self.maybe_layer_norm(self.encoder_attn_layer_norm, x, after=True)
+
+        # --- feed-forward sub-block ---
+        residual = x
+        x = self.maybe_layer_norm(self.final_layer_norm, x, before=True)
+        x = F.relu(self.fc1(x))
+        x = self.relu_dropout_module(x)
+        x = self.fc2(x)
+        x = self.dropout_module(x)
+        x = residual + x
+        x = self.maybe_layer_norm(self.final_layer_norm, x, after=True)
+        return x, attn
+
+    def maybe_layer_norm(self, layer_norm, x, before=False, after=False):
+        """Apply *layer_norm* only at the configured position.
+
+        Exactly one of *before* / *after* must be set. The norm runs at the
+        "before" call site when ``normalize_before`` is true and at the
+        "after" call site otherwise; at the other site *x* is returned as-is.
+        """
+        assert before ^ after
+        if after ^ self.normalize_before:
+            return layer_norm(x)
+        else:
+            return x
+
+    def make_generation_fast_(self, need_attn=False, **kwargs):
+        """Record whether attention weights are needed (default: no)."""
+        self.need_attn = need_attn
+
+    def extra_repr(self):
+        return 'dropout={}, relu_dropout={}, input_dropout={}, normalize_before={}'.format(
+            self.dropout_module.p, self.relu_dropout_module.p, self.input_dropout_module.p, self.normalize_before)
+
+
+def Embedding(num_embeddings, embedding_dim, padding_idx):
+    """Build an ``nn.Embedding`` with scaled-normal initial weights.
+
+    Weights are drawn from a normal distribution with mean 0 and standard
+    deviation ``embedding_dim ** -0.5``; the row at *padding_idx* is then
+    set to zero.
+
+    Args:
+        num_embeddings (int): number of rows in the embedding table
+        embedding_dim (int): width of each embedding vector
+        padding_idx (int or None): index of the padding row; ``None`` means
+            there is no padding row and nothing is zeroed
+
+    Returns:
+        torch.nn.Embedding: the initialised module
+    """
+    m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
+    nn.init.normal_(m.weight, mean=0, std=embedding_dim ** -0.5)
+    # indexing with None yields a view of the WHOLE weight matrix, so an
+    # unguarded reset would wipe every embedding when no padding row exists
+    if padding_idx is not None:
+        nn.init.constant_(m.weight[padding_idx], 0)
+    return m
+
+
+def Linear(in_features, out_features, bias=True):
+    """Build an ``nn.Linear`` with Xavier-uniform weights and a zero bias.
+
+    Args:
+        in_features (int): size of each input sample
+        out_features (int): size of each output sample
+        bias (bool, optional): whether the layer has a bias term
+
+    Returns:
+        torch.nn.Linear: the initialised module
+    """
+    layer = nn.Linear(in_features, out_features, bias)
+    nn.init.xavier_uniform_(layer.weight)
+    if not bias:
+        # no bias parameter exists, so there is nothing left to initialise
+        return layer
+    nn.init.constant_(layer.bias, 0.)
+    return layer
+
+
+@register_model_architecture('lightconv', 'lightconv')
+def base_architecture(args):
+    """Fill in, in place, every base LightConv hyper-parameter missing from ``args``.
+
+    Attributes already present on ``args`` keep their value. A kernel-size
+    list holding a single entry is repeated once per layer, after which each
+    list must contain exactly one kernel size per layer.
+    """
+    def fill(name, fallback):
+        # Same effect as ``args.<name> = getattr(args, '<name>', fallback)``.
+        setattr(args, name, getattr(args, name, fallback))
+
+    fill('encoder_embed_path', None)
+    fill('encoder_embed_dim', 512)
+    fill('encoder_ffn_embed_dim', 2048)
+    fill('encoder_layers', 7)
+    fill('encoder_attention_heads', 8)
+    fill('encoder_normalize_before', False)
+    fill('encoder_learned_pos', False)
+    fill('decoder_embed_path', None)
+    # decoder sizes fall back to the (already resolved) encoder sizes
+    fill('decoder_embed_dim', args.encoder_embed_dim)
+    fill('decoder_ffn_embed_dim', args.encoder_ffn_embed_dim)
+    fill('decoder_layers', 6)
+    fill('decoder_attention_heads', 8)
+    fill('decoder_normalize_before', False)
+    fill('decoder_learned_pos', False)
+    fill('attention_dropout', 0.)
+    fill('relu_dropout', 0.)
+    fill('dropout', 0.1)
+    fill('adaptive_softmax_cutoff', None)
+    fill('adaptive_softmax_dropout', 0)
+    fill('share_decoder_input_output_embed', False)
+    fill('share_all_embeddings', False)
+    fill('no_token_positional_embeddings', False)
+
+    fill('decoder_output_dim', args.decoder_embed_dim)
+    fill('decoder_input_dim', args.decoder_embed_dim)
+
+    fill('encoder_conv_dim', args.encoder_embed_dim)
+    fill('decoder_conv_dim', args.decoder_embed_dim)
+
+    fill('encoder_kernel_size_list', [3, 7, 15, 31, 31, 31, 31])
+    fill('decoder_kernel_size_list', [3, 7, 15, 31, 31, 31])
+    # a one-element list is shorthand for "this kernel size in every layer"
+    enc_kernels = args.encoder_kernel_size_list
+    dec_kernels = args.decoder_kernel_size_list
+    if len(enc_kernels) == 1:
+        args.encoder_kernel_size_list = enc_kernels * args.encoder_layers
+    if len(dec_kernels) == 1:
+        args.decoder_kernel_size_list = dec_kernels * args.decoder_layers
+    assert len(args.encoder_kernel_size_list) == args.encoder_layers, "encoder_kernel_size_list doesn't match encoder_layers"
+    assert len(args.decoder_kernel_size_list) == args.decoder_layers, "decoder_kernel_size_list doesn't match decoder_layers"
+    fill('encoder_glu', True)
+    fill('decoder_glu', True)
+    fill('input_dropout', 0.1)
+    # conv-weight dropout follows the attention dropout unless given explicitly
+    fill('weight_dropout', args.attention_dropout)
+
+
+@register_model_architecture('lightconv', 'lightconv_iwslt_de_en')
+def lightconv_iwslt_de_en(args):
+    """Apply the ``lightconv_iwslt_de_en`` preset, then the base defaults.
+
+    Only attributes missing from ``args`` are set; everything the preset does
+    not mention is filled in by ``base_architecture``.
+    """
+    preset = (
+        ('encoder_embed_dim', 512),
+        ('encoder_ffn_embed_dim', 1024),
+        ('encoder_attention_heads', 4),
+        ('encoder_layers', 7),
+        ('decoder_embed_dim', 512),
+        ('decoder_ffn_embed_dim', 1024),
+        ('decoder_attention_heads', 4),
+        ('decoder_layers', 6),
+        ('attention_dropout', 0.1),
+        ('weight_dropout', 0.1),
+        ('encoder_glu', False),
+        ('decoder_glu', False),
+        ('input_dropout', 0.0),
+    )
+    for name, fallback in preset:
+        setattr(args, name, getattr(args, name, fallback))
+    base_architecture(args)
+
+
+@register_model_architecture('lightconv', 'lightconv_wmt_en_de')
+def lightconv_wmt_en_de(args):
+    """Architecture preset with no overrides: it applies ``base_architecture`` as is."""
+    base_architecture(args)
+
+
+@register_model_architecture('lightconv', 'lightconv_wmt_en_de_big')
+def lightconv_wmt_en_de_big(args):
+    """Apply the ``lightconv_wmt_en_de_big`` preset, then the base defaults.
+
+    Only attributes missing from ``args`` are set here; the remainder comes
+    from ``base_architecture``.
+    """
+    preset = (
+        ('attention_dropout', 0.1),
+        ('encoder_embed_dim', 1024),
+        ('encoder_ffn_embed_dim', 4096),
+        ('encoder_attention_heads', 16),
+        ('encoder_normalize_before', False),
+        ('decoder_embed_dim', 1024),
+        ('decoder_ffn_embed_dim', 4096),
+        ('decoder_attention_heads', 16),
+        ('dropout', 0.3),
+    )
+    for name, fallback in preset:
+        setattr(args, name, getattr(args, name, fallback))
+    base_architecture(args)
+
+
+@register_model_architecture('lightconv', 'lightconv_wmt_en_fr_big')
+def lightconv_wmt_en_fr_big(args):
+    """Default ``dropout`` to 0.1 if unset, then apply ``lightconv_wmt_en_de_big``."""
+    if not hasattr(args, 'dropout'):
+        args.dropout = 0.1
+    lightconv_wmt_en_de_big(args)
+
+
+@register_model_architecture('lightconv', 'lightconv_wmt_zh_en_big')
+def lightconv_wmt_zh_en_big(args):
+    """Default the three dropout rates to 0.2 if unset, then apply ``lightconv_wmt_en_de_big``."""
+    for name in ('dropout', 'attention_dropout', 'weight_dropout'):
+        if not hasattr(args, name):
+            setattr(args, name, 0.2)
+    lightconv_wmt_en_de_big(args)
diff --git a/fairseq/models/lightconv_lm.py b/fairseq/models/lightconv_lm.py
new file mode 100644
index 0000000000000000000000000000000000000000..a268ddd8597cdce75613b59befb5095ff6c6eda1
--- /dev/null
+++ b/fairseq/models/lightconv_lm.py
@@ -0,0 +1,176 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from fairseq import options
+from fairseq.models import (
+    FairseqLanguageModel,
+    register_model,
+    register_model_architecture,
+)
+from fairseq.models.lightconv import (
+    Embedding,
+    LightConvDecoder,
+)
+from fairseq.modules import (
+    AdaptiveInput,
+    CharacterTokenEmbedder,
+)
+
+
+@register_model('lightconv_lm')
+class LightConvLanguageModel(FairseqLanguageModel):
+    """Language model whose decoder is a ``LightConvDecoder``.
+
+    ``build_model`` creates the token embedder (character CNN, adaptive input
+    or a plain embedding table), builds the decoder with
+    ``no_encoder_attn=True`` and ``final_norm=False``, and wraps it in this class.
+    """
+
+    def __init__(self, decoder):
+        super().__init__(decoder)
+
+    @staticmethod
+    def add_args(parser):
+        """Add model-specific arguments to the parser."""
+        parser.add_argument('--dropout', default=0.1, type=float, metavar='D',
+                            help='dropout probability')
+        parser.add_argument('--attention-dropout', default=0., type=float, metavar='D',
+                            help='dropout probability for attention weights')
+        parser.add_argument('--relu-dropout', default=0., type=float, metavar='D',
+                            help='dropout probability after ReLU in FFN')
+        parser.add_argument('--input-dropout', type=float, metavar='D',
+                            help='dropout probability of the inputs')
+        parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
+                            help='decoder embedding dimension')
+        parser.add_argument('--decoder-output-dim', type=int, metavar='N',
+                            help='decoder output dimension')
+        parser.add_argument('--decoder-input-dim', type=int, metavar='N',
+                            help='decoder input dimension')
+        parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
+                            help='decoder embedding dimension for FFN')
+        parser.add_argument('--decoder-layers', type=int, metavar='N',
+                            help='num decoder layers')
+        parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
+                            help='num decoder attention heads or LightConv/DynamicConv heads')
+        parser.add_argument('--decoder-normalize-before', default=False, action='store_true',
+                            help='apply layernorm before each decoder block')
+        parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
+                            help='comma separated list of adaptive softmax cutoff points. '
+                                 'Must be used with adaptive_loss criterion')
+        parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',
+                            help='sets adaptive softmax dropout for the tail projections')
+        # help text previously read 'adaptive input factor' (copy-paste from --adaptive-input-factor)
+        parser.add_argument('--adaptive-softmax-factor', type=float, metavar='N',
+                            help='adaptive softmax factor')
+        parser.add_argument('--no-token-positional-embeddings', default=False, action='store_true',
+                            help='if set, disables positional embeddings (outside self attention)')
+        parser.add_argument('--share-decoder-input-output-embed', default=False, action='store_true',
+                            help='share decoder input and output embeddings')
+        parser.add_argument('--character-embeddings', default=False, action='store_true',
+                            help='if set, uses character embedding convolutions to produce token embeddings')
+        # help text previously duplicated the one of --character-embedding-dim
+        parser.add_argument('--character-filters', type=str, metavar='LIST',
+                            default='[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]',
+                            help='character embedding convolution filters, '
+                                 'given as a Python list literal of tuples')
+        parser.add_argument('--character-embedding-dim', type=int, metavar='N', default=4,
+                            help='size of character embeddings')
+        parser.add_argument('--char-embedder-highway-layers', type=int, metavar='N', default=2,
+                            help='number of highway layers for character token embedder')
+        parser.add_argument('--adaptive-input', default=False, action='store_true',
+                            help='if set, uses adaptive input')
+        parser.add_argument('--adaptive-input-factor', type=float, metavar='N',
+                            help='adaptive input factor')
+        parser.add_argument('--adaptive-input-cutoff', metavar='EXPR',
+                            help='comma separated list of adaptive input cutoff points.')
+        parser.add_argument('--tie-adaptive-weights', action='store_true',
+                            help='if set, ties the weights of adaptive softmax and adaptive input')
+        parser.add_argument('--tie-adaptive-proj', action='store_true',
+                            help='if set, ties the projection weights of adaptive softmax and adaptive input')
+        parser.add_argument('--decoder-learned-pos', action='store_true',
+                            help='use learned positional embeddings in the decoder')
+
+        # LightConv and DynamicConv arguments
+        parser.add_argument('--decoder-kernel-size-list', type=lambda x: options.eval_str_list(x, int),
+                            help='list of kernel size (default: "[3,7,15,31,31,31]")')
+        parser.add_argument('--decoder-glu', type=options.eval_bool,
+                            help='glu after in proj')
+        parser.add_argument('--decoder-conv-type', default='dynamic', type=str,
+                            choices=['dynamic', 'lightweight'],
+                            help='type of convolution')
+        parser.add_argument('--weight-softmax', default=True, type=options.eval_bool)
+        parser.add_argument('--weight-dropout', type=float, metavar='D',
+                            help='dropout probability for conv weights')
+
+    @classmethod
+    def build_model(cls, args, task):
+        """Build a new model instance.
+
+        Args:
+            args: parsed arguments; missing hyper-parameters are filled in by
+                ``base_lm_architecture`` and missing position limits default
+                to ``args.tokens_per_sample``.
+            task: provides ``dictionary`` (input side) and
+                ``output_dictionary`` (decoder output side).
+
+        Returns:
+            An instance of ``cls`` wrapping the freshly built decoder.
+        """
+
+        # make sure all arguments are present in older models
+        base_lm_architecture(args)
+
+        if getattr(args, 'max_source_positions', None) is None:
+            args.max_source_positions = args.tokens_per_sample
+        if getattr(args, 'max_target_positions', None) is None:
+            args.max_target_positions = args.tokens_per_sample
+
+        if args.character_embeddings:
+            # NOTE(security): ``eval`` executes whatever expression is passed via
+            # --character-filters; only use this option with trusted input.
+            embed_tokens = CharacterTokenEmbedder(task.dictionary, eval(args.character_filters),
+                                                  args.character_embedding_dim,
+                                                  args.decoder_embed_dim,
+                                                  args.char_embedder_highway_layers,
+                                                  )
+        elif args.adaptive_input:
+            embed_tokens = AdaptiveInput(len(task.dictionary), task.dictionary.pad(), args.decoder_input_dim,
+                                         args.adaptive_input_factor, args.decoder_embed_dim,
+                                         options.eval_str_list(args.adaptive_input_cutoff, type=int))
+        else:
+            embed_tokens = Embedding(len(task.dictionary), args.decoder_input_dim, task.dictionary.pad())
+
+        if args.tie_adaptive_weights:
+            # tying requires adaptive input and softmax to be configured identically
+            assert args.adaptive_input
+            assert args.adaptive_input_factor == args.adaptive_softmax_factor
+            assert args.adaptive_softmax_cutoff == args.adaptive_input_cutoff, '{} != {}'.format(
+                args.adaptive_softmax_cutoff, args.adaptive_input_cutoff)
+            assert args.decoder_input_dim == args.decoder_output_dim
+
+        decoder = LightConvDecoder(args, task.output_dictionary, embed_tokens, no_encoder_attn=True, final_norm=False)
+        # instantiate through ``cls`` so that subclasses get an instance of their own type
+        return cls(decoder)
+
+
+@register_model_architecture('lightconv_lm', 'lightconv_lm')
+def base_lm_architecture(args):
+    """Fill in, in place, every base LightConv LM hyper-parameter missing from ``args``.
+
+    Attributes already present keep their value, with one exception:
+    ``decoder_normalize_before`` is always forced to ``True``. A kernel-size
+    list holding a single entry is repeated once per decoder layer, after
+    which the list must contain exactly one kernel size per layer.
+    """
+    # Dropout rates normally come from the argument parser; default them here as
+    # well (same values as in ``add_args``) so that ``args`` objects lacking
+    # them do not fail on the ``args.attention_dropout`` read further down.
+    args.dropout = getattr(args, 'dropout', 0.1)
+    args.attention_dropout = getattr(args, 'attention_dropout', 0.)
+    args.relu_dropout = getattr(args, 'relu_dropout', 0.)
+
+    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 512)
+    args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 2048)
+    args.decoder_layers = getattr(args, 'decoder_layers', 6)
+    args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 8)
+    args.adaptive_softmax_cutoff = getattr(args, 'adaptive_softmax_cutoff', None)
+    args.adaptive_softmax_dropout = getattr(args, 'adaptive_softmax_dropout', 0)
+    args.adaptive_softmax_factor = getattr(args, 'adaptive_softmax_factor', 4)
+    args.decoder_learned_pos = getattr(args, 'decoder_learned_pos', False)
+
+    args.character_embeddings = getattr(args, 'character_embeddings', False)
+
+    args.decoder_output_dim = getattr(args, 'decoder_output_dim', args.decoder_embed_dim)
+    args.decoder_input_dim = getattr(args, 'decoder_input_dim', args.decoder_embed_dim)
+    args.decoder_conv_dim = getattr(args, 'decoder_conv_dim', args.decoder_embed_dim)
+
+    # The model training is not stable without this
+    args.decoder_normalize_before = True
+
+    args.adaptive_input = getattr(args, 'adaptive_input', False)
+    args.adaptive_input_factor = getattr(args, 'adaptive_input_factor', 4)
+    args.adaptive_input_cutoff = getattr(args, 'adaptive_input_cutoff', None)
+
+    args.tie_adaptive_weights = getattr(args, 'tie_adaptive_weights', False)
+    args.tie_adaptive_proj = getattr(args, 'tie_adaptive_proj', False)
+
+    args.decoder_kernel_size_list = getattr(args, 'decoder_kernel_size_list', [3, 7, 15, 31, 31, 31])
+    # a one-element list is shorthand for "this kernel size in every layer"
+    if len(args.decoder_kernel_size_list) == 1:
+        args.decoder_kernel_size_list = args.decoder_kernel_size_list * args.decoder_layers
+    assert len(args.decoder_kernel_size_list) == args.decoder_layers, "decoder_kernel_size_list doesn't match decoder_layers"
+    args.decoder_glu = getattr(args, 'decoder_glu', True)
+    args.input_dropout = getattr(args, 'input_dropout', 0.1)
+    # conv-weight dropout follows the attention dropout unless given explicitly
+    args.weight_dropout = getattr(args, 'weight_dropout', args.attention_dropout)
+
+
+@register_model_architecture('lightconv_lm', 'lightconv_lm_gbw')
+def lightconv_lm_gbw(args):
+    """Apply the ``lightconv_lm_gbw`` preset, then the base LM defaults.
+
+    Only attributes missing from ``args`` are set here; the remainder comes
+    from ``base_lm_architecture``.
+    """
+    preset = (
+        ('decoder_embed_dim', 512),
+        ('dropout', 0.1),
+        ('attention_dropout', 0.1),
+        ('decoder_ffn_embed_dim', 4096),
+        ('decoder_attention_heads', 16),
+    )
+    for name, fallback in preset:
+        setattr(args, name, getattr(args, name, fallback))
+    base_lm_architecture(args)
diff --git a/fairseq/models/lstm.py b/fairseq/models/lstm.py
new file mode 100644
index 0000000000000000000000000000000000000000..850428a32d7d82b6f8b2f62d995dae871b2b9556
--- /dev/null
+++ b/fairseq/models/lstm.py
@@ -0,0 +1,681 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from fairseq import options, utils
+from fairseq.models import (
+    FairseqEncoder,
+    FairseqIncrementalDecoder,
+    FairseqEncoderDecoderModel,
+    register_model,
+    register_model_architecture,
+)
+from fairseq.modules import AdaptiveSoftmax, FairseqDropout
+from torch import Tensor
+from typing import Dict, List, Optional, Tuple
+
+
+# Fallback position limits: used by LSTMModel.build_model when ``args`` has no
+# ``max_source_positions`` / ``max_target_positions``, and as constructor
+# defaults of the encoder and decoder.
+# NOTE(review): both values are floats (1e5), not ints.
+DEFAULT_MAX_SOURCE_POSITIONS = 1e5
+DEFAULT_MAX_TARGET_POSITIONS = 1e5
+
+
+@register_model('lstm')
+class LSTMModel(FairseqEncoderDecoderModel):
+    """Encoder-decoder model pairing an ``LSTMEncoder`` with an ``LSTMDecoder``."""
+
+    def __init__(self, encoder, decoder):
+        super().__init__(encoder, decoder)
+
+    @staticmethod
+    def add_args(parser):
+        """Add model-specific arguments to the parser."""
+        # fmt: off
+        parser.add_argument('--dropout', type=float, metavar='D',
+                            help='dropout probability')
+        parser.add_argument('--encoder-embed-dim', type=int, metavar='N',
+                            help='encoder embedding dimension')
+        parser.add_argument('--encoder-embed-path', type=str, metavar='STR',
+                            help='path to pre-trained encoder embedding')
+        parser.add_argument('--encoder-freeze-embed', action='store_true',
+                            help='freeze encoder embeddings')
+        parser.add_argument('--encoder-hidden-size', type=int, metavar='N',
+                            help='encoder hidden size')
+        parser.add_argument('--encoder-layers', type=int, metavar='N',
+                            help='number of encoder layers')
+        parser.add_argument('--encoder-bidirectional', action='store_true',
+                            help='make all layers of encoder bidirectional')
+        parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
+                            help='decoder embedding dimension')
+        parser.add_argument('--decoder-embed-path', type=str, metavar='STR',
+                            help='path to pre-trained decoder embedding')
+        parser.add_argument('--decoder-freeze-embed', action='store_true',
+                            help='freeze decoder embeddings')
+        parser.add_argument('--decoder-hidden-size', type=int, metavar='N',
+                            help='decoder hidden size')
+        parser.add_argument('--decoder-layers', type=int, metavar='N',
+                            help='number of decoder layers')
+        parser.add_argument('--decoder-out-embed-dim', type=int, metavar='N',
+                            help='decoder output embedding dimension')
+        parser.add_argument('--decoder-attention', type=str, metavar='BOOL',
+                            help='decoder attention')
+        parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
+                            help='comma separated list of adaptive softmax cutoff points. '
+                                 'Must be used with adaptive_loss criterion')
+        parser.add_argument('--share-decoder-input-output-embed', default=False,
+                            action='store_true',
+                            help='share decoder input and output embeddings')
+        parser.add_argument('--share-all-embeddings', default=False, action='store_true',
+                            help='share encoder, decoder and output embeddings'
+                                 ' (requires shared dictionary and embed dim)')
+
+        # Granular dropout settings (if not specified these default to --dropout)
+        parser.add_argument('--encoder-dropout-in', type=float, metavar='D',
+                            help='dropout probability for encoder input embedding')
+        parser.add_argument('--encoder-dropout-out', type=float, metavar='D',
+                            help='dropout probability for encoder output')
+        parser.add_argument('--decoder-dropout-in', type=float, metavar='D',
+                            help='dropout probability for decoder input embedding')
+        parser.add_argument('--decoder-dropout-out', type=float, metavar='D',
+                            help='dropout probability for decoder output')
+        # fmt: on
+
+    @classmethod
+    def build_model(cls, args, task):
+        """Build a new model instance.
+
+        Args:
+            args: parsed arguments; missing ones are filled in by
+                ``base_architecture``.
+            task: provides ``source_dictionary`` and ``target_dictionary``.
+
+        Returns:
+            An instance of ``cls`` wrapping the new encoder and decoder.
+
+        Raises:
+            ValueError: if the encoder and decoder layer counts differ, or if
+                the embedding-sharing options are inconsistent with the
+                dictionaries, embedding paths or embedding dimensions.
+        """
+        # make sure that all args are properly defaulted (in case there are any new ones)
+        base_architecture(args)
+
+        if args.encoder_layers != args.decoder_layers:
+            raise ValueError('--encoder-layers must match --decoder-layers')
+
+        max_source_positions = getattr(args, 'max_source_positions', DEFAULT_MAX_SOURCE_POSITIONS)
+        max_target_positions = getattr(args, 'max_target_positions', DEFAULT_MAX_TARGET_POSITIONS)
+
+        def load_pretrained_embedding_from_file(embed_path, dictionary, embed_dim):
+            # build a fresh table, then overwrite it with the vectors found in the file
+            num_embeddings = len(dictionary)
+            padding_idx = dictionary.pad()
+            embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
+            embed_dict = utils.parse_embedding(embed_path)
+            utils.print_embed_overlap(embed_dict, dictionary)
+            return utils.load_embedding(embed_dict, dictionary, embed_tokens)
+
+        if args.encoder_embed_path:
+            pretrained_encoder_embed = load_pretrained_embedding_from_file(
+                args.encoder_embed_path, task.source_dictionary, args.encoder_embed_dim)
+        else:
+            num_embeddings = len(task.source_dictionary)
+            pretrained_encoder_embed = Embedding(
+                num_embeddings, args.encoder_embed_dim, task.source_dictionary.pad()
+            )
+
+        if args.share_all_embeddings:
+            # double check all parameters combinations are valid
+            if task.source_dictionary != task.target_dictionary:
+                raise ValueError('--share-all-embeddings requires a joint dictionary')
+            if args.decoder_embed_path and (
+                    args.decoder_embed_path != args.encoder_embed_path):
+                raise ValueError(
+                    '--share-all-embeddings not compatible with --decoder-embed-path'
+                )
+            if args.encoder_embed_dim != args.decoder_embed_dim:
+                raise ValueError(
+                    '--share-all-embeddings requires --encoder-embed-dim to '
+                    'match --decoder-embed-dim'
+                )
+            pretrained_decoder_embed = pretrained_encoder_embed
+            args.share_decoder_input_output_embed = True
+        else:
+            # separate decoder input embeddings
+            pretrained_decoder_embed = None
+            if args.decoder_embed_path:
+                pretrained_decoder_embed = load_pretrained_embedding_from_file(
+                    args.decoder_embed_path,
+                    task.target_dictionary,
+                    args.decoder_embed_dim
+                )
+        # one last double check of parameter combinations
+        if args.share_decoder_input_output_embed and (
+                args.decoder_embed_dim != args.decoder_out_embed_dim):
+            raise ValueError(
+                '--share-decoder-input-output-embed requires '
+                '--decoder-embed-dim to match --decoder-out-embed-dim'
+            )
+
+        if args.encoder_freeze_embed:
+            pretrained_encoder_embed.weight.requires_grad = False
+        if args.decoder_freeze_embed:
+            if pretrained_decoder_embed is None:
+                # No pretrained or shared table exists yet. Build the same table
+                # LSTMDecoder would otherwise create itself, so that it can be
+                # frozen here instead of failing on ``None.weight``.
+                pretrained_decoder_embed = Embedding(
+                    len(task.target_dictionary),
+                    args.decoder_embed_dim,
+                    task.target_dictionary.pad(),
+                )
+            pretrained_decoder_embed.weight.requires_grad = False
+
+        encoder = LSTMEncoder(
+            dictionary=task.source_dictionary,
+            embed_dim=args.encoder_embed_dim,
+            hidden_size=args.encoder_hidden_size,
+            num_layers=args.encoder_layers,
+            dropout_in=args.encoder_dropout_in,
+            dropout_out=args.encoder_dropout_out,
+            bidirectional=args.encoder_bidirectional,
+            pretrained_embed=pretrained_encoder_embed,
+            max_source_positions=max_source_positions,
+        )
+        decoder = LSTMDecoder(
+            dictionary=task.target_dictionary,
+            embed_dim=args.decoder_embed_dim,
+            hidden_size=args.decoder_hidden_size,
+            out_embed_dim=args.decoder_out_embed_dim,
+            num_layers=args.decoder_layers,
+            dropout_in=args.decoder_dropout_in,
+            dropout_out=args.decoder_dropout_out,
+            attention=options.eval_bool(args.decoder_attention),
+            encoder_output_units=encoder.output_units,
+            pretrained_embed=pretrained_decoder_embed,
+            share_input_output_embed=args.share_decoder_input_output_embed,
+            # the cutoff list is only honoured together with the adaptive_loss criterion
+            adaptive_softmax_cutoff=(
+                options.eval_str_list(args.adaptive_softmax_cutoff, type=int)
+                if args.criterion == 'adaptive_loss' else None
+            ),
+            max_target_positions=max_target_positions,
+            residuals=False,
+        )
+        return cls(encoder, decoder)
+
+    def forward(
+        self,
+        src_tokens,
+        src_lengths,
+        prev_output_tokens,
+        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
+    ):
+        """Encode the source batch, then run the decoder on the encoder output.
+
+        Returns whatever the decoder returns.
+        """
+        encoder_out = self.encoder(src_tokens, src_lengths=src_lengths)
+        decoder_out = self.decoder(
+            prev_output_tokens, encoder_out=encoder_out, incremental_state=incremental_state
+        )
+        return decoder_out
+
+
+class LSTMEncoder(FairseqEncoder):
+    """LSTM encoder.
+
+    Embeds the source tokens, runs them through a (possibly bidirectional,
+    multi-layer) LSTM as a packed sequence and returns the per-position
+    outputs, the final hidden/cell states and the padding mask.
+    """
+    def __init__(
+        self, dictionary, embed_dim=512, hidden_size=512, num_layers=1,
+        dropout_in=0.1, dropout_out=0.1, bidirectional=False,
+        left_pad=True, pretrained_embed=None, padding_idx=None,
+        max_source_positions=DEFAULT_MAX_SOURCE_POSITIONS,
+    ):
+        super().__init__(dictionary)
+        self.num_layers = num_layers
+        self.dropout_in_module = FairseqDropout(dropout_in, module_name=self.__class__.__name__)
+        self.dropout_out_module = FairseqDropout(dropout_out, module_name=self.__class__.__name__)
+        self.bidirectional = bidirectional
+        self.hidden_size = hidden_size
+        self.max_source_positions = max_source_positions
+
+        num_embeddings = len(dictionary)
+        # an explicit padding_idx wins; otherwise use the dictionary's pad symbol
+        self.padding_idx = padding_idx if padding_idx is not None else dictionary.pad()
+        if pretrained_embed is None:
+            self.embed_tokens = Embedding(num_embeddings, embed_dim, self.padding_idx)
+        else:
+            self.embed_tokens = pretrained_embed
+
+        self.lstm = LSTM(
+            input_size=embed_dim,
+            hidden_size=hidden_size,
+            num_layers=num_layers,
+            # dropout is only passed to the LSTM when there is more than one layer
+            dropout=self.dropout_out_module.p if num_layers > 1 else 0.,
+            bidirectional=bidirectional,
+        )
+        self.left_pad = left_pad
+
+        # width of the per-position output: doubled when both directions are concatenated
+        self.output_units = hidden_size
+        if bidirectional:
+            self.output_units *= 2
+
+    def forward(
+        self,
+        src_tokens: Tensor,
+        src_lengths: Tensor,
+        enforce_sorted: bool = True,
+    ):
+        """
+        Args:
+            src_tokens (LongTensor): tokens in the source language of
+                shape `(batch, src_len)`
+            src_lengths (LongTensor): lengths of each source sentence of
+                shape `(batch)`
+            enforce_sorted (bool, optional): if True, `src_tokens` is
+                expected to contain sequences sorted by length in a
+                decreasing order. If False, this condition is not
+                required. Default: True.
+
+        Returns:
+            tuple: `(x, final_hiddens, final_cells, encoder_padding_mask)`
+            with the shapes noted next to each element at the return
+            statement below.
+        """
+        if self.left_pad:
+            # nn.utils.rnn.pack_padded_sequence requires right-padding;
+            # convert left-padding to right-padding
+            src_tokens = utils.convert_padding_direction(
+                src_tokens,
+                torch.zeros_like(src_tokens).fill_(self.padding_idx),
+                left_to_right=True,
+            )
+
+        bsz, seqlen = src_tokens.size()
+
+        # embed tokens
+        x = self.embed_tokens(src_tokens)
+        x = self.dropout_in_module(x)
+
+        # B x T x C -> T x B x C
+        x = x.transpose(0, 1)
+
+        # pack embedded source tokens into a PackedSequence
+        packed_x = nn.utils.rnn.pack_padded_sequence(
+            x, src_lengths.data, enforce_sorted=enforce_sorted
+        )
+
+        # apply LSTM
+        # zero initial states: one (bsz, hidden) slice per layer and direction
+        if self.bidirectional:
+            state_size = 2 * self.num_layers, bsz, self.hidden_size
+        else:
+            state_size = self.num_layers, bsz, self.hidden_size
+        h0 = x.new_zeros(*state_size)
+        c0 = x.new_zeros(*state_size)
+        packed_outs, (final_hiddens, final_cells) = self.lstm(packed_x, (h0, c0))
+
+        # unpack outputs and apply dropout
+        # NOTE(review): padded output positions are filled with the padding
+        # *index* converted to float, not with zeros - confirm this is intended.
+        x, _ = nn.utils.rnn.pad_packed_sequence(packed_outs, padding_value=self.padding_idx*1.0)
+        x = self.dropout_out_module(x)
+        assert list(x.size()) == [seqlen, bsz, self.output_units]
+
+        if self.bidirectional:
+            # merge the two directions of every layer into one state per layer
+            final_hiddens = self.combine_bidir(final_hiddens, bsz)
+            final_cells = self.combine_bidir(final_cells, bsz)
+
+        # True where the (right-padded) source holds the padding symbol
+        encoder_padding_mask = src_tokens.eq(self.padding_idx).t()
+
+        return tuple((
+            x,  # seq_len x batch x hidden
+            final_hiddens,  # num_layers x batch x num_directions*hidden
+            final_cells,  # num_layers x batch x num_directions*hidden
+            encoder_padding_mask,  # seq_len x batch
+        ))
+
+    def combine_bidir(self, outs, bsz: int):
+        """Reshape `(num_layers * 2, bsz, hidden)` states to `(num_layers, bsz, 2 * hidden)`.
+
+        The two direction slices of each layer end up concatenated along the
+        last dimension.
+        """
+        out = outs.view(self.num_layers, 2, bsz, -1).transpose(1, 2).contiguous()
+        return out.view(self.num_layers, bsz, -1)
+
+    def reorder_encoder_out(self, encoder_out, new_order):
+        """Select entries of all four encoder outputs along dim 1 according to `new_order`.
+
+        Dim 1 is the batch dimension for every element of the tuple returned
+        by `forward`.
+        """
+        return tuple((
+            encoder_out[0].index_select(1, new_order),
+            encoder_out[1].index_select(1, new_order),
+            encoder_out[2].index_select(1, new_order),
+            encoder_out[3].index_select(1, new_order),
+        ))
+
+    def max_positions(self):
+        """Maximum input length supported by the encoder."""
+        return self.max_source_positions
+
+
+class AttentionLayer(nn.Module):
+    """Dot-product attention of one decoder state over the encoder outputs.
+
+    The decoder state is projected to the encoder width, scored against every
+    source position, and the resulting context vector is concatenated with
+    the original input and passed through a tanh-activated output projection.
+    """
+
+    def __init__(self, input_embed_dim, source_embed_dim, output_embed_dim, bias=False):
+        super().__init__()
+
+        concat_dim = input_embed_dim + source_embed_dim
+        self.input_proj = Linear(input_embed_dim, source_embed_dim, bias=bias)
+        self.output_proj = Linear(concat_dim, output_embed_dim, bias=bias)
+
+    def forward(self, input, source_hids, encoder_padding_mask):
+        """Return ``(attended output, attention weights)``.
+
+        Args:
+            input: ``bsz x input_embed_dim``
+            source_hids: ``srclen x bsz x source_embed_dim``
+            encoder_padding_mask: ``srclen x bsz`` mask of padded source
+                positions, or ``None`` to attend over every position.
+        """
+        # query: bsz x source_embed_dim
+        query = self.input_proj(input)
+
+        # one dot-product score per source position: srclen x bsz
+        scores = torch.sum(source_hids * query.unsqueeze(0), dim=2)
+
+        if encoder_padding_mask is not None:
+            # padded positions get -inf; FP16 support: cast to float and back
+            masked = scores.float().masked_fill_(encoder_padding_mask, float('-inf'))
+            scores = masked.type_as(scores)
+
+        weights = F.softmax(scores, dim=0)  # srclen x bsz
+
+        # attention-weighted sum of the source states: bsz x source_embed_dim
+        context = torch.sum(weights.unsqueeze(2) * source_hids, dim=0)
+
+        merged = torch.cat((context, input), dim=1)
+        return torch.tanh(self.output_proj(merged)), weights
+
+
+class LSTMDecoder(FairseqIncrementalDecoder):
+    """LSTM decoder.
+
+    A stack of ``LSTMCell`` layers that is unrolled one target position at a
+    time, optionally with attention over the encoder outputs and input
+    feeding (arxiv.org/abs/1508.04025). With ``encoder_output_units=0`` input
+    feeding is disabled, which is the configuration used without an encoder.
+
+    Args:
+        dictionary: target dictionary; ``len(dictionary)`` is the vocabulary
+            size and ``dictionary.pad()`` the padding index.
+        embed_dim: dimension of the token embeddings.
+        hidden_size: hidden size of every LSTM layer.
+        out_embed_dim: dimension fed to the output projection; when it
+            differs from *hidden_size* an extra linear layer is created.
+        num_layers: number of stacked LSTM cells.
+        dropout_in: dropout applied to the embedded input tokens.
+        dropout_out: dropout applied to the layer outputs.
+        attention: build an :class:`AttentionLayer` over the encoder outputs.
+        encoder_output_units: feature size of the encoder outputs; ``0``
+            disables input feeding.
+        pretrained_embed: embedding module used instead of a freshly
+            initialised one.
+        share_input_output_embed: reuse the embedding matrix as the output
+            projection.
+        adaptive_softmax_cutoff: cutoffs for an adaptive softmax; ``None``
+            selects a regular output projection.
+        max_target_positions: value reported by :meth:`max_positions`.
+        residuals: add each layer's previous hidden state to its output
+            before it is passed on to the next layer.
+    """
+    def __init__(
+        self, dictionary, embed_dim=512, hidden_size=512, out_embed_dim=512,
+        num_layers=1, dropout_in=0.1, dropout_out=0.1, attention=True,
+        encoder_output_units=512, pretrained_embed=None,
+        share_input_output_embed=False, adaptive_softmax_cutoff=None,
+        max_target_positions=DEFAULT_MAX_TARGET_POSITIONS,
+        residuals=False,
+    ):
+        super().__init__(dictionary)
+        self.dropout_in_module = FairseqDropout(dropout_in, module_name=self.__class__.__name__)
+        self.dropout_out_module = FairseqDropout(dropout_out, module_name=self.__class__.__name__)
+        self.hidden_size = hidden_size
+        self.share_input_output_embed = share_input_output_embed
+        self.need_attn = True
+        self.max_target_positions = max_target_positions
+        self.residuals = residuals
+        self.num_layers = num_layers
+
+        self.adaptive_softmax = None
+        num_embeddings = len(dictionary)
+        padding_idx = dictionary.pad()
+        if pretrained_embed is None:
+            self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
+        else:
+            self.embed_tokens = pretrained_embed
+
+        # project the encoder's final states when their size differs from ours
+        self.encoder_output_units = encoder_output_units
+        if encoder_output_units != hidden_size and encoder_output_units != 0:
+            self.encoder_hidden_proj = Linear(encoder_output_units, hidden_size)
+            self.encoder_cell_proj = Linear(encoder_output_units, hidden_size)
+        else:
+            self.encoder_hidden_proj = self.encoder_cell_proj = None
+
+        # disable input feeding if there is no encoder
+        # input feeding is described in arxiv.org/abs/1508.04025
+        input_feed_size = 0 if encoder_output_units == 0 else hidden_size
+        self.layers = nn.ModuleList([
+            LSTMCell(
+                input_size=input_feed_size + embed_dim if layer == 0 else hidden_size,
+                hidden_size=hidden_size,
+            )
+            for layer in range(num_layers)
+        ])
+
+        if attention:
+            # TODO make bias configurable
+            self.attention = AttentionLayer(hidden_size, encoder_output_units, hidden_size, bias=False)
+        else:
+            self.attention = None
+
+        # ``additional_fc`` only exists when the sizes differ; users check
+        # for it with ``hasattr``
+        if hidden_size != out_embed_dim:
+            self.additional_fc = Linear(hidden_size, out_embed_dim)
+
+        if adaptive_softmax_cutoff is not None:
+            # setting adaptive_softmax dropout to dropout_out for now but can be redefined
+            self.adaptive_softmax = AdaptiveSoftmax(
+                num_embeddings, hidden_size, adaptive_softmax_cutoff, dropout=dropout_out,
+            )
+        elif not self.share_input_output_embed:
+            self.fc_out = Linear(out_embed_dim, num_embeddings, dropout=dropout_out)
+
+    def forward(
+        self,
+        prev_output_tokens,
+        encoder_out: Optional[Tuple[Tensor, Tensor, Tensor, Tensor]] = None,
+        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
+        src_lengths: Optional[Tensor] = None,
+    ):
+        """Run the decoder and project its features to the vocabulary.
+
+        *src_lengths* is accepted but not used here.
+
+        Returns:
+            tuple of the projected features and the attention scores as
+            produced by :meth:`extract_features`.
+        """
+        x, attn_scores = self.extract_features(
+            prev_output_tokens, encoder_out, incremental_state
+        )
+        return self.output_layer(x), attn_scores
+
+    def extract_features(
+        self,
+        prev_output_tokens,
+        encoder_out: Optional[Tuple[Tensor, Tensor, Tensor, Tensor]] = None,
+        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
+    ):
+        """
+        Similar to *forward* but only return features.
+
+        Returns:
+            tuple ``(x, attn_scores)``; ``x`` is batch-first
+            (bsz x tgtlen x features) and ``attn_scores`` is
+            bsz x tgtlen x srclen, or ``None`` while training, when
+            ``need_attn`` is off or when there is no attention layer.
+        """
+        # get outputs from encoder
+        if encoder_out is not None:
+            encoder_outs = encoder_out[0]
+            encoder_hiddens = encoder_out[1]
+            encoder_cells = encoder_out[2]
+            encoder_padding_mask = encoder_out[3]
+        else:
+            # empty placeholders, so that srclen below evaluates to 0
+            encoder_outs = torch.empty(0)
+            encoder_hiddens = torch.empty(0)
+            encoder_cells = torch.empty(0)
+            encoder_padding_mask = torch.empty(0)
+        srclen = encoder_outs.size(0)
+
+        # during incremental decoding only the newest token is processed
+        if incremental_state is not None and len(incremental_state) > 0:
+            prev_output_tokens = prev_output_tokens[:, -1:]
+
+        bsz, seqlen = prev_output_tokens.size()
+
+        # embed tokens
+        x = self.embed_tokens(prev_output_tokens)
+        x = self.dropout_in_module(x)
+
+        # B x T x C -> T x B x C
+        x = x.transpose(0, 1)
+
+        # initialize previous states (or get from cache during incremental generation)
+        if incremental_state is not None and len(incremental_state) > 0:
+            prev_hiddens, prev_cells, input_feed = self.get_cached_state(incremental_state)
+        elif encoder_out is not None:
+            # setup recurrent cells
+            prev_hiddens = [encoder_hiddens[i] for i in range(self.num_layers)]
+            prev_cells = [encoder_cells[i] for i in range(self.num_layers)]
+            if self.encoder_hidden_proj is not None:
+                prev_hiddens = [self.encoder_hidden_proj(y) for y in prev_hiddens]
+                prev_cells = [self.encoder_cell_proj(y) for y in prev_cells]
+            input_feed = x.new_zeros(bsz, self.hidden_size)
+        else:
+            # setup zero cells, since there is no encoder
+            zero_state = x.new_zeros(bsz, self.hidden_size)
+            prev_hiddens = [zero_state for i in range(self.num_layers)]
+            prev_cells = [zero_state for i in range(self.num_layers)]
+            input_feed = None
+
+        assert srclen > 0 or self.attention is None, \
+            "attention is not supported if there are no encoder outputs"
+        attn_scores = x.new_zeros(srclen, seqlen, bsz) if self.attention is not None else None
+        outs = []
+        for j in range(seqlen):
+            # input feeding: concatenate context vector from previous time step
+            if input_feed is not None:
+                input = torch.cat((x[j, :, :], input_feed), dim=1)
+            else:
+                input = x[j]
+
+            for i, rnn in enumerate(self.layers):
+                # recurrent cell
+                hidden, cell = rnn(input, (prev_hiddens[i], prev_cells[i]))
+
+                # hidden state becomes the input to the next layer
+                input = self.dropout_out_module(hidden)
+                if self.residuals:
+                    # prev_hiddens[i] still holds the previous time step here
+                    input = input + prev_hiddens[i]
+
+                # save state for next time step
+                prev_hiddens[i] = hidden
+                prev_cells[i] = cell
+
+            # apply attention using the last layer's hidden state
+            if self.attention is not None:
+                assert attn_scores is not None
+                out, attn_scores[:, j, :] = self.attention(hidden, encoder_outs, encoder_padding_mask)
+            else:
+                out = hidden
+            out = self.dropout_out_module(out)
+
+            # input feeding
+            if input_feed is not None:
+                input_feed = out
+
+            # save final output
+            outs.append(out)
+
+        # Stack all the necessary tensors together and store
+        prev_hiddens_tensor = torch.stack(prev_hiddens)
+        prev_cells_tensor = torch.stack(prev_cells)
+        cache_state = torch.jit.annotate(
+            Dict[str, Optional[Tensor]],
+            {
+                "prev_hiddens": prev_hiddens_tensor,
+                "prev_cells": prev_cells_tensor,
+                "input_feed": input_feed,
+            }
+        )
+        self.set_incremental_state(incremental_state, 'cached_state', cache_state)
+
+        # collect outputs across time steps
+        x = torch.cat(outs, dim=0).view(seqlen, bsz, self.hidden_size)
+
+        # T x B x C -> B x T x C
+        x = x.transpose(1, 0)
+
+        if hasattr(self, 'additional_fc') and self.adaptive_softmax is None:
+            x = self.additional_fc(x)
+            x = self.dropout_out_module(x)
+        # srclen x tgtlen x bsz -> bsz x tgtlen x srclen
+        if not self.training and self.need_attn and self.attention is not None:
+            assert attn_scores is not None
+            attn_scores = attn_scores.transpose(0, 2)
+        else:
+            attn_scores = None
+        return x, attn_scores
+
+    def output_layer(self, x):
+        """Project features to the vocabulary size.
+
+        With an adaptive softmax the features are returned unchanged.
+        """
+        if self.adaptive_softmax is None:
+            if self.share_input_output_embed:
+                x = F.linear(x, self.embed_tokens.weight)
+            else:
+                x = self.fc_out(x)
+        return x
+
+    def get_cached_state(
+        self,
+        incremental_state: Dict[str, Dict[str, Optional[Tensor]]],
+    ) -> Tuple[List[Tensor], List[Tensor], Optional[Tensor]]:
+        """Unpack the cached recurrent state.
+
+        Returns:
+            per-layer lists of hidden and cell states (split from the stacked
+            tensors in the cache) and the cached input-feed tensor.
+        """
+        cached_state = self.get_incremental_state(incremental_state, 'cached_state')
+        assert cached_state is not None
+        prev_hiddens_ = cached_state["prev_hiddens"]
+        assert prev_hiddens_ is not None
+        prev_cells_ = cached_state["prev_cells"]
+        assert prev_cells_ is not None
+        prev_hiddens = [prev_hiddens_[i] for i in range(self.num_layers)]
+        prev_cells = [prev_cells_[j] for j in range(self.num_layers)]
+        input_feed = cached_state["input_feed"]  # can be None for decoder-only language models
+        return prev_hiddens, prev_cells, input_feed
+
+    def reorder_incremental_state(
+        self,
+        incremental_state: Dict[str, Dict[str, Optional[Tensor]]],
+        new_order: Tensor,
+    ):
+        """Reorder the cached state along the batch dimension by *new_order*.
+
+        Nothing is done when *incremental_state* is ``None`` or empty.
+        """
+        if incremental_state is None or len(incremental_state) == 0:
+            return
+        prev_hiddens, prev_cells, input_feed = self.get_cached_state(incremental_state)
+        # the per-layer tensors are bsz x hidden_size, so the batch is dim 0
+        prev_hiddens = [p.index_select(0, new_order) for p in prev_hiddens]
+        prev_cells = [p.index_select(0, new_order) for p in prev_cells]
+        if input_feed is not None:
+            input_feed = input_feed.index_select(0, new_order)
+        cached_state_new = torch.jit.annotate(
+            Dict[str, Optional[Tensor]],
+            {
+                "prev_hiddens": torch.stack(prev_hiddens),
+                "prev_cells": torch.stack(prev_cells),
+                "input_feed": input_feed,
+            }
+        )
+        self.set_incremental_state(incremental_state, 'cached_state', cached_state_new)
+
+    def max_positions(self):
+        """Maximum output length supported by the decoder."""
+        return self.max_target_positions
+
+    def make_generation_fast_(self, need_attn=False, **kwargs):
+        """Record whether attention scores should be returned at inference."""
+        self.need_attn = need_attn
+
+
+def Embedding(num_embeddings, embedding_dim, padding_idx):
+    """Build an ``nn.Embedding`` with uniform(-0.1, 0.1) weights.
+
+    The row at *padding_idx* is set to zero after the uniform initialisation.
+    """
+    embedding = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
+    weight = embedding.weight
+    nn.init.uniform_(weight, -0.1, 0.1)
+    nn.init.constant_(weight[padding_idx], 0)
+    return embedding
+
+
+def LSTM(input_size, hidden_size, **kwargs):
+    """Build an ``nn.LSTM`` whose weights and biases are uniform(-0.1, 0.1).
+
+    Extra keyword arguments are forwarded to ``nn.LSTM``.
+    """
+    lstm = nn.LSTM(input_size, hidden_size, **kwargs)
+    for param_name, param in lstm.named_parameters():
+        # only touch parameters whose name mentions 'weight' or 'bias'
+        if not ('weight' in param_name or 'bias' in param_name):
+            continue
+        param.data.uniform_(-0.1, 0.1)
+    return lstm
+
+
+def LSTMCell(input_size, hidden_size, **kwargs):
+    """Build an ``nn.LSTMCell`` whose weights and biases are uniform(-0.1, 0.1).
+
+    Extra keyword arguments are forwarded to ``nn.LSTMCell``.
+    """
+    cell = nn.LSTMCell(input_size, hidden_size, **kwargs)
+    for param_name, param in cell.named_parameters():
+        # only touch parameters whose name mentions 'weight' or 'bias'
+        if not ('weight' in param_name or 'bias' in param_name):
+            continue
+        param.data.uniform_(-0.1, 0.1)
+    return cell
+
+
+def Linear(in_features, out_features, bias=True, dropout=0.):
+    """Linear layer (input: N x T x C) with uniform(-0.1, 0.1) parameters.
+
+    *dropout* is accepted for call-site compatibility but is not used here.
+    """
+    layer = nn.Linear(in_features, out_features, bias=bias)
+    layer.weight.data.uniform_(-0.1, 0.1)
+    if not bias:
+        return layer
+    layer.bias.data.uniform_(-0.1, 0.1)
+    return layer
+
+
+@register_model_architecture('lstm', 'lstm')
+def base_architecture(args):
+    """Fill in every LSTM hyper-parameter that is missing on *args*.
+
+    Attributes that are already present keep their value. Defaults that
+    depend on another attribute are resolved after that attribute is set.
+    """
+    def fill(name, default):
+        # keep an existing value, otherwise fall back to *default*
+        setattr(args, name, getattr(args, name, default))
+
+    fill('dropout', 0.1)
+
+    # encoder
+    fill('encoder_embed_dim', 512)
+    fill('encoder_embed_path', None)
+    fill('encoder_freeze_embed', False)
+    fill('encoder_hidden_size', args.encoder_embed_dim)
+    fill('encoder_layers', 1)
+    fill('encoder_bidirectional', False)
+    fill('encoder_dropout_in', args.dropout)
+    fill('encoder_dropout_out', args.dropout)
+
+    # decoder
+    fill('decoder_embed_dim', 512)
+    fill('decoder_embed_path', None)
+    fill('decoder_freeze_embed', False)
+    fill('decoder_hidden_size', args.decoder_embed_dim)
+    fill('decoder_layers', 1)
+    fill('decoder_out_embed_dim', 512)
+    fill('decoder_attention', '1')
+    fill('decoder_dropout_in', args.dropout)
+    fill('decoder_dropout_out', args.dropout)
+
+    # embedding sharing and output layer
+    fill('share_decoder_input_output_embed', False)
+    fill('share_all_embeddings', False)
+    fill('adaptive_softmax_cutoff', '10000,50000,200000')
+
+
+@register_model_architecture('lstm', 'lstm_wiseman_iwslt_de_en')
+def lstm_wiseman_iwslt_de_en(args):
+    """Apply the ``lstm_wiseman_iwslt_de_en`` defaults, then the base ones.
+
+    Only attributes missing on *args* are filled in.
+    """
+    def fill(name, default):
+        # keep an existing value, otherwise fall back to *default*
+        setattr(args, name, getattr(args, name, default))
+
+    fill('dropout', 0.1)
+    fill('encoder_embed_dim', 256)
+    fill('encoder_dropout_in', 0)
+    fill('encoder_dropout_out', 0)
+    fill('decoder_embed_dim', 256)
+    fill('decoder_out_embed_dim', 256)
+    fill('decoder_dropout_in', 0)
+    fill('decoder_dropout_out', args.dropout)
+
+    # everything not set above comes from the base architecture
+    base_architecture(args)
+
+
+@register_model_architecture('lstm', 'lstm_luong_wmt_en_de')
+def lstm_luong_wmt_en_de(args):
+    """Apply the ``lstm_luong_wmt_en_de`` defaults, then the base ones.
+
+    Only attributes missing on *args* are filled in.
+    """
+    preset = (
+        ('encoder_embed_dim', 1000),
+        ('encoder_layers', 4),
+        ('encoder_dropout_out', 0),
+        ('decoder_embed_dim', 1000),
+        ('decoder_layers', 4),
+        ('decoder_out_embed_dim', 1000),
+        ('decoder_dropout_out', 0),
+    )
+    for name, default in preset:
+        setattr(args, name, getattr(args, name, default))
+
+    # everything not set above comes from the base architecture
+    base_architecture(args)
diff --git a/fairseq/models/lstm_lm.py b/fairseq/models/lstm_lm.py
new file mode 100644
index 0000000000000000000000000000000000000000..9f6758a4bc876ff7c2da3fff0c305ab7ee13a1e9
--- /dev/null
+++ b/fairseq/models/lstm_lm.py
@@ -0,0 +1,130 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from fairseq import options, utils
+from fairseq.models import (
+    FairseqLanguageModel, register_model, register_model_architecture
+)
+from fairseq.models.lstm import (
+    LSTMDecoder, Embedding
+)
+
+DEFAULT_MAX_TARGET_POSITIONS = 1e5
+
+@register_model('lstm_lm')
+class LSTMLanguageModel(FairseqLanguageModel):
+    """Decoder-only LSTM language model built on :class:`LSTMDecoder`."""
+
+    def __init__(self, decoder):
+        super().__init__(decoder)
+
+    @staticmethod
+    def add_args(parser):
+        """Add model-specific arguments to the parser."""
+        # fmt: off
+        parser.add_argument('--dropout', type=float, metavar='D',
+                            help='dropout probability')
+        parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
+                            help='decoder embedding dimension')
+        parser.add_argument('--decoder-embed-path', type=str, metavar='STR',
+                            help='path to pre-trained decoder embedding')
+        parser.add_argument('--decoder-hidden-size', type=int, metavar='N',
+                            help='decoder hidden size')
+        parser.add_argument('--decoder-layers', type=int, metavar='N',
+                            help='number of decoder layers')
+        parser.add_argument('--decoder-out-embed-dim', type=int, metavar='N',
+                            help='decoder output embedding dimension')
+        parser.add_argument('--decoder-attention', type=str, metavar='BOOL',
+                            help='decoder attention')
+        parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
+                            help='comma separated list of adaptive softmax cutoff points. '
+                                 'Must be used with adaptive_loss criterion')
+        parser.add_argument('--residuals', default=False,
+                            action='store_true',
+                            help='applying residuals between LSTM layers')
+
+        # Granular dropout settings (if not specified these default to --dropout)
+        parser.add_argument('--decoder-dropout-in', type=float, metavar='D',
+                            help='dropout probability for decoder input embedding')
+        parser.add_argument('--decoder-dropout-out', type=float, metavar='D',
+                            help='dropout probability for decoder output')
+        parser.add_argument('--share-decoder-input-output-embed', default=False,
+                            action='store_true',
+                            help='share decoder input and output embeddings')
+
+    @classmethod
+    def build_model(cls, args, task):
+        """Build a new model instance.
+
+        Raises:
+            ValueError: if ``--share-decoder-input-output-embed`` is set but
+                the task's source and target dictionaries differ, or the
+                decoder embedding and output embedding dimensions differ.
+        """
+
+        # make sure all arguments are present in older models
+        base_architecture(args)
+
+        # an explicit max_target_positions wins; otherwise fall back to
+        # tokens_per_sample and finally to the module default
+        if getattr(args, 'max_target_positions', None) is not None:
+            max_target_positions = args.max_target_positions
+        else:
+            max_target_positions = getattr(args, 'tokens_per_sample', DEFAULT_MAX_TARGET_POSITIONS)
+
+        def load_pretrained_embedding_from_file(embed_path, dictionary, embed_dim):
+            # build a fresh embedding table, then load the vectors parsed
+            # from *embed_path* into it
+            num_embeddings = len(dictionary)
+            padding_idx = dictionary.pad()
+            embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
+            embed_dict = utils.parse_embedding(embed_path)
+            utils.print_embed_overlap(embed_dict, dictionary)
+            return utils.load_embedding(embed_dict, dictionary, embed_tokens)
+
+        pretrained_decoder_embed = None
+        if args.decoder_embed_path:
+            pretrained_decoder_embed = load_pretrained_embedding_from_file(
+                args.decoder_embed_path,
+                task.target_dictionary,
+                args.decoder_embed_dim
+            )
+
+        if args.share_decoder_input_output_embed:
+            # double check all parameters combinations are valid
+            # (the messages name the flag exactly as registered in add_args)
+            if task.source_dictionary != task.target_dictionary:
+                raise ValueError('--share-decoder-input-output-embed requires a joint dictionary')
+
+            if args.decoder_embed_dim != args.decoder_out_embed_dim:
+                raise ValueError(
+                    '--share-decoder-input-output-embed requires '
+                    '--decoder-embed-dim to match --decoder-out-embed-dim'
+                )
+
+        decoder = LSTMDecoder(
+            dictionary=task.dictionary,
+            embed_dim=args.decoder_embed_dim,
+            hidden_size=args.decoder_hidden_size,
+            out_embed_dim=args.decoder_out_embed_dim,
+            num_layers=args.decoder_layers,
+            dropout_in=args.decoder_dropout_in,
+            dropout_out=args.decoder_dropout_out,
+            attention=False,  # decoder-only language model doesn't support attention
+            encoder_output_units=0,
+            pretrained_embed=pretrained_decoder_embed,
+            share_input_output_embed=args.share_decoder_input_output_embed,
+            # the cutoff string is only parsed for the adaptive_loss criterion
+            adaptive_softmax_cutoff=(
+                options.eval_str_list(args.adaptive_softmax_cutoff, type=int)
+                if args.criterion == 'adaptive_loss' else None
+            ),
+            max_target_positions=max_target_positions,
+            residuals=args.residuals
+        )
+
+        return cls(decoder)
+
+
+@register_model_architecture('lstm_lm', 'lstm_lm')
+def base_architecture(args):
+    """Fill in every LSTM language-model hyper-parameter missing on *args*.
+
+    Attributes that are already present keep their value. Defaults that
+    depend on another attribute are resolved after that attribute is set.
+    """
+    def fill(name, default):
+        # keep an existing value, otherwise fall back to *default*
+        setattr(args, name, getattr(args, name, default))
+
+    fill('dropout', 0.1)
+    fill('decoder_embed_dim', 512)
+    fill('decoder_embed_path', None)
+    fill('decoder_hidden_size', args.decoder_embed_dim)
+    fill('decoder_layers', 1)
+    fill('decoder_out_embed_dim', 512)
+    fill('decoder_attention', '0')
+    fill('decoder_dropout_in', args.dropout)
+    fill('decoder_dropout_out', args.dropout)
+    fill('share_decoder_input_output_embed', False)
+    fill('adaptive_softmax_cutoff', '10000,50000,200000')
+    fill('residuals', False)
diff --git a/fairseq/models/masked_lm.py b/fairseq/models/masked_lm.py
new file mode 100644
index 0000000000000000000000000000000000000000..35a6323ef214786c3391eaa9647cd7a52c4296e4
--- /dev/null
+++ b/fairseq/models/masked_lm.py
@@ -0,0 +1,352 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from fairseq import utils
+from fairseq.models import (
+    FairseqEncoderModel,
+    FairseqEncoder,
+    register_model,
+    register_model_architecture,
+)
+from fairseq.modules import (
+    LayerNorm,
+    SinusoidalPositionalEmbedding,
+    TransformerSentenceEncoder,
+)
+from fairseq.modules.transformer_sentence_encoder import init_bert_params
+
+
+logger = logging.getLogger(__name__)
+
+
+@register_model('masked_lm')
+class MaskedLMModel(FairseqEncoderModel):
+    """
+    Class for training a Masked Language Model. It also supports an
+    additional sentence level prediction if the sent-loss argument is set.
+    """
+    def __init__(self, args, encoder):
+        super().__init__(encoder)
+        self.args = args
+
+        # if specified then apply bert initialization on the model. We need
+        # to explicitly call this to make sure that the output embeddings
+        # and projection layers are also correctly initialized
+        if getattr(args, 'apply_bert_init', False):
+            self.apply(init_bert_params)
+
+    @staticmethod
+    def add_args(parser):
+        """Add model-specific arguments to the parser."""
+        # Arguments related to dropout
+        parser.add_argument('--dropout', type=float, metavar='D',
+                            help='dropout probability')
+        parser.add_argument('--attention-dropout', type=float,
+                            metavar='D', help='dropout probability for'
+                            ' attention weights')
+        parser.add_argument('--act-dropout', type=float,
+                            metavar='D', help='dropout probability after'
+                            ' activation in FFN')
+
+        # Arguments related to hidden states and self-attention
+        parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='N',
+                            help='encoder embedding dimension for FFN')
+        parser.add_argument('--encoder-layers', type=int, metavar='N',
+                            help='num encoder layers')
+        parser.add_argument('--encoder-attention-heads', type=int, metavar='N',
+                            help='num encoder attention heads')
+
+        # Arguments related to input and output embeddings
+        parser.add_argument('--encoder-embed-dim', type=int, metavar='N',
+                            help='encoder embedding dimension')
+        parser.add_argument('--share-encoder-input-output-embed',
+                            action='store_true', help='share encoder input'
+                            ' and output embeddings')
+        parser.add_argument('--encoder-learned-pos', action='store_true',
+                            help='use learned positional embeddings in the encoder')
+        parser.add_argument('--no-token-positional-embeddings',
+                            action='store_true',
+                            help='if set, disables positional embeddings'
+                            ' (outside self attention)')
+        parser.add_argument('--num-segment', type=int, metavar='N',
+                            help='num segment in the input')
+        parser.add_argument('--max-positions', type=int,
+                            help='number of positional embeddings to learn')
+
+        # Arguments related to sentence level prediction
+        parser.add_argument('--sentence-class-num', type=int, metavar='N',
+                            help='number of classes for sentence task')
+        parser.add_argument('--sent-loss', action='store_true', help='if set,'
+                            ' calculate sentence level predictions')
+
+        # Arguments related to parameter initialization
+        parser.add_argument('--apply-bert-init', action='store_true',
+                            help='use custom param initialization for BERT')
+
+        # misc params
+        parser.add_argument('--activation-fn',
+                            choices=utils.get_available_activation_fns(),
+                            help='activation function to use')
+        parser.add_argument('--pooler-activation-fn',
+                            choices=utils.get_available_activation_fns(),
+                            help='Which activation function to use for pooler layer.')
+        parser.add_argument('--encoder-normalize-before', action='store_true',
+                            help='apply layernorm before each encoder block')
+
+    def forward(self, src_tokens, segment_labels=None, **kwargs):
+        """Delegate to the encoder, forwarding any extra keyword arguments."""
+        return self.encoder(src_tokens, segment_labels=segment_labels, **kwargs)
+
+    def max_positions(self):
+        """Return the encoder's ``max_positions`` attribute.
+
+        The attribute is read, not called: the encoder assigns
+        ``args.max_positions`` to it in its constructor.
+        """
+        return self.encoder.max_positions
+
+    @classmethod
+    def build_model(cls, args, task):
+        """Build a new model instance."""
+        # make sure all arguments are present in older models
+        base_architecture(args)
+
+        # fall back to the sample length when *args* has no max_positions
+        # attribute at all
+        if not hasattr(args, 'max_positions'):
+            args.max_positions = args.tokens_per_sample
+
+        logger.info(args)
+
+        encoder = MaskedLMEncoder(args, task.dictionary)
+        return cls(args, encoder)
+
+
+class MaskedLMEncoder(FairseqEncoder):
+    """
+    Encoder for Masked Language Modelling.
+    """
+
+    def __init__(self, args, dictionary):
+        super().__init__(dictionary)
+
+        self.padding_idx = dictionary.pad()
+        self.vocab_size = dictionary.__len__()
+        self.max_positions = args.max_positions
+
+        self.sentence_encoder = TransformerSentenceEncoder(
+            padding_idx=self.padding_idx,
+            vocab_size=self.vocab_size,
+            num_encoder_layers=args.encoder_layers,
+            embedding_dim=args.encoder_embed_dim,
+            ffn_embedding_dim=args.encoder_ffn_embed_dim,
+            num_attention_heads=args.encoder_attention_heads,
+            dropout=args.dropout,
+            attention_dropout=args.attention_dropout,
+            activation_dropout=args.act_dropout,
+            max_seq_len=self.max_positions,
+            num_segments=args.num_segment,
+            use_position_embeddings=not args.no_token_positional_embeddings,
+            encoder_normalize_before=args.encoder_normalize_before,
+            apply_bert_init=args.apply_bert_init,
+            activation_fn=args.activation_fn,
+            learned_pos_embedding=args.encoder_learned_pos,
+        )
+
+        self.share_input_output_embed = args.share_encoder_input_output_embed
+        self.embed_out = None
+        self.sentence_projection_layer = None
+        self.sentence_out_dim = args.sentence_class_num
+        self.lm_output_learned_bias = None
+
+        # Remove head is set to true during fine-tuning
+        self.load_softmax = not getattr(args, 'remove_head', False)
+
+        self.masked_lm_pooler = nn.Linear(
+            args.encoder_embed_dim, args.encoder_embed_dim
+        )
+        self.pooler_activation = utils.get_activation_fn(args.pooler_activation_fn)
+
+        self.lm_head_transform_weight = nn.Linear(args.encoder_embed_dim, args.encoder_embed_dim)
+        self.activation_fn = utils.get_activation_fn(args.activation_fn)
+        self.layer_norm = LayerNorm(args.encoder_embed_dim)
+
+        self.lm_output_learned_bias = None
+        if self.load_softmax:
+            self.lm_output_learned_bias = nn.Parameter(torch.zeros(self.vocab_size))
+
+            if not self.share_input_output_embed:
+                self.embed_out = nn.Linear(
+                    args.encoder_embed_dim,
+                    self.vocab_size,
+                    bias=False
+                )
+
+            if args.sent_loss:
+                self.sentence_projection_layer = nn.Linear(
+                    args.encoder_embed_dim,
+                    self.sentence_out_dim,
+                    bias=False
+                )
+
+    def forward(self, src_tokens, segment_labels=None, masked_tokens=None, **unused):
+        """
+        Forward pass for Masked LM encoder. This first computes the token
+        embedding using the token embedding matrix, position embeddings (if
+        specified) and segment embeddings (if specified).
+
+        Here we assume that the sentence representation corresponds to the
+        output of the classification_token (see bert_task or cross_lingual_lm
+        task for more details).
+        Args:
+            - src_tokens: B x T matrix representing sentences
+            - segment_labels: B x T matrix representing segment label for tokens
+        Returns:
+            - a tuple of the following:
+                - logits for predictions in format B x T x C to be used in
+                  softmax afterwards
+                - a dictionary of additional data, where 'pooled_output' contains
+                  the representation for classification_token and 'inner_states'
+                  is a list of internal model states used to compute the
+                  predictions (similar in ELMO). 'sentence_logits'
+                  is the prediction logit for NSP task and is only computed if
+                  this is specified in the input arguments.
+        """
+
+        inner_states, sentence_rep = self.sentence_encoder(
+            src_tokens,
+            segment_labels=segment_labels,
+        )
+
+        x = inner_states[-1].transpose(0, 1)
+        # project masked tokens only
+        if masked_tokens is not None:
+            x = x[masked_tokens, :]
+        x = self.layer_norm(self.activation_fn(self.lm_head_transform_weight(x)))
+
+        pooled_output = self.pooler_activation(self.masked_lm_pooler(sentence_rep))
+
+        # project back to size of vocabulary
+        if self.share_input_output_embed \
+                and hasattr(self.sentence_encoder.embed_tokens, 'weight'):
+            x = F.linear(x, self.sentence_encoder.embed_tokens.weight)
+        elif self.embed_out is not None:
+            x = self.embed_out(x)
+        if self.lm_output_learned_bias is not None:
+            x = x + self.lm_output_learned_bias
+        sentence_logits = None
+        if self.sentence_projection_layer:
+            sentence_logits = self.sentence_projection_layer(pooled_output)
+
+        return x, {
+            'inner_states': inner_states,
+            'pooled_output': pooled_output,
+            'sentence_logits': sentence_logits
+        }
+
+    def max_positions(self):
+        """Maximum output length supported by the encoder."""
+        return self.max_positions  # NOTE(review): attribute has the same name as this method; presumably __init__ sets it on the instance - confirm
+
+    def upgrade_state_dict_named(self, state_dict, name):
+        """Add/drop entries of ``state_dict`` (keys prefixed by ``name``) and return it."""
+        positions = self.sentence_encoder.embed_positions
+        if isinstance(positions, SinusoidalPositionalEmbedding):
+            buffer_key = name + '.sentence_encoder.embed_positions._float_tensor'
+            state_dict[buffer_key] = torch.FloatTensor(1)
+        if self.load_softmax:
+            return state_dict
+        output_layer_keys = (
+            "embed_out.weight",
+            "sentence_projection_layer.weight",
+            "lm_output_learned_bias",
+        )
+        for k in list(state_dict.keys()):  # snapshot: entries are deleted while looping
+            if any(fragment in k for fragment in output_layer_keys):
+                del state_dict[k]
+        return state_dict
+
+
+@register_model_architecture('masked_lm', 'masked_lm')
+def base_architecture(args):
+    """Give every masked_lm option a default unless ``args`` already defines it."""
+    fallbacks = (
+        ('dropout', 0.1),
+        ('attention_dropout', 0.1),
+        ('act_dropout', 0.0),
+        ('encoder_ffn_embed_dim', 4096),
+        ('encoder_layers', 6),
+        ('encoder_attention_heads', 8),
+        ('encoder_embed_dim', 1024),
+        ('share_encoder_input_output_embed', False),
+        ('encoder_learned_pos', False),
+        ('no_token_positional_embeddings', False),
+        ('num_segment', 2),
+        ('sentence_class_num', 2),
+        ('sent_loss', False),
+        ('apply_bert_init', False),
+        ('activation_fn', 'relu'),
+        ('pooler_activation_fn', 'tanh'),
+        ('encoder_normalize_before', False),
+    )
+    for option, value in fallbacks:
+        setattr(args, option, getattr(args, option, value))
+
+
+@register_model_architecture('masked_lm', 'bert_base')
+def bert_base_architecture(args):
+    """Apply the ``bert_base`` defaults, then the shared ``masked_lm`` ones.
+
+    Values already present on ``args`` are kept."""
+    fallbacks = (
+        ('encoder_embed_dim', 768),
+        ('share_encoder_input_output_embed', True),
+        ('no_token_positional_embeddings', False),
+        ('encoder_learned_pos', True),
+        ('num_segment', 2),
+        ('encoder_layers', 12),
+        ('encoder_attention_heads', 12),
+        ('encoder_ffn_embed_dim', 3072),
+        ('sentence_class_num', 2),
+        ('sent_loss', True),
+        ('apply_bert_init', True),
+        ('activation_fn', 'gelu'),
+        ('pooler_activation_fn', 'tanh'),
+        ('encoder_normalize_before', True),
+    )
+    for option, value in fallbacks:
+        setattr(args, option, getattr(args, option, value))
+    base_architecture(args)
+
+
+@register_model_architecture('masked_lm', 'bert_large')
+def bert_large_architecture(args):  # larger sizes; every other option comes from bert_base
+    sizes = dict(encoder_embed_dim=1024, encoder_layers=24,
+                 encoder_attention_heads=16, encoder_ffn_embed_dim=4096)
+    for option, value in sizes.items():
+        setattr(args, option, getattr(args, option, value))
+    bert_base_architecture(args)
+
+
+@register_model_architecture('masked_lm', 'xlm_base')
+def xlm_architecture(args):
+    """Apply the ``xlm_base`` defaults, then the shared ``masked_lm`` ones."""
+    fallbacks = (
+        ('encoder_embed_dim', 1024),
+        ('share_encoder_input_output_embed', True),
+        ('no_token_positional_embeddings', False),
+        ('encoder_learned_pos', True),
+        ('num_segment', 1),
+        ('encoder_layers', 6),
+        ('encoder_attention_heads', 8),
+        ('encoder_ffn_embed_dim', 4096),
+        ('sent_loss', False),
+        ('activation_fn', 'gelu'),
+        ('encoder_normalize_before', False),
+        ('pooler_activation_fn', 'tanh'),
+        ('apply_bert_init', True),
+    )
+    # getattr keeps any value that is already set on args
+    for option, value in fallbacks:
+        setattr(args, option, getattr(args, option, value))
+    base_architecture(args)
diff --git a/fairseq/models/model_utils.py b/fairseq/models/model_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..46ec62f77243308f82d862a52fd1a9b2615f9e90
--- /dev/null
+++ b/fairseq/models/model_utils.py
@@ -0,0 +1,90 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import List, Optional
+
+import torch
+from torch import Tensor
+
+
+@torch.jit.script
+def script_skip_tensor_list(x: List[Tensor], mask):  # apply mask to every tensor in x, with a fallback for empty results
+    res = [xi[mask] if xi.size(0) == mask.size(0) else xi[:, mask] for xi in x]  # index dim 0 when its size matches mask, else dim 1
+    outputs = []
+    for i, t in enumerate(res):
+        if t.numel() != 0:
+            outputs.append(t)
+        else:
+            outputs.append(x[i])  # selection came out empty: keep the original tensor
+    return outputs
+
+
+@torch.jit.script
+def script_skip_tensor(x: Tensor, mask):  # apply mask to x, with a fallback for empty results
+    # "None" case: a tensor whose first dimension is empty is returned unchanged
+    if x.size(0) == 0:
+        return x
+    res = x[mask] if x.size(0) == mask.size(0) else x[:, mask]  # index dim 0 when its size matches mask, else dim 1
+    if res.numel() == 0:
+        return x  # selection came out empty: hand back the original tensor
+    else:
+        return res
+
+
+@torch.jit.script
+def expand_2d_or_3d_tensor(x, trg_dim: int, padding_idx: int):
+    """
+    Grow a 2D/3D tensor along dim=1 to size trg_dim, filling the added slots with padding_idx.
+    """
+    if x is None:
+        return None
+
+    assert x.dim() == 2 or x.dim() == 3
+    assert trg_dim >= x.size(1), (trg_dim, x.size())
+    if trg_dim == x.size(1):
+        return x
+    # shape of the block to append: (dim 0, missing columns[, dim 2])
+    dims = [x.size(0), trg_dim - x.size(1)]
+    if x.dim() == 3:
+        dims.append(x.size(2))
+    x = torch.cat([x, torch.zeros(dims).to(x).fill_(padding_idx)], 1)  # .to(x) matches x's dtype and device
+
+    return x
+
+
+@torch.jit.script
+def coalesce(x: Optional[Tensor], y: Tensor) -> Tensor:  # x when it is not None, otherwise the fallback y
+    return x if x is not None else y
+
+
+@torch.jit.script
+def fill_tensors(x: Optional[Tensor], mask, y: Optional[Tensor], padding_idx: int) -> Optional[Tensor]:
+    """
+    Write the rows of y into x at the positions selected by mask (dim=0); x may be modified in place.
+    """
+    if x is None or x.size()[0] == 0 or y is None:
+        return x
+    assert x.dim() == y.dim() and mask.size(0) == x.size(0)
+    assert x.dim() == 2 or (x.dim() == 3 and x.size(2) == y.size(2))
+    # shortcuts: nothing selected -> x as is; every row selected -> y replaces x outright
+    n_selected = mask.sum()
+    if n_selected == 0:
+        return x
+    assert n_selected == y.size(0)
+    if n_selected == x.size(0):
+        return y
+    # reconcile dim=1 sizes: pad x when y is longer, pad the written rows when y is shorter
+    if x.size(1) < y.size(1):
+        x = expand_2d_or_3d_tensor(x, y.size(1), padding_idx)
+        x[mask] = y
+    elif x.size(1) > y.size(1):
+        x[mask] = torch.tensor(padding_idx).type_as(x)  # reset the selected rows to padding first
+        if x.dim() == 2:
+            x[mask, :y.size(1)] = y
+        else:
+            x[mask, :y.size(1), :] = y
+    else:
+        x[mask] = y
+    return x
diff --git a/fairseq/models/multilingual_transformer.py b/fairseq/models/multilingual_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..2f6a837805bd2071b0dbe6db644bdea9a8ee2994
--- /dev/null
+++ b/fairseq/models/multilingual_transformer.py
@@ -0,0 +1,196 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from collections import OrderedDict
+
+from fairseq import utils
+from fairseq.models import (
+    FairseqMultiModel,
+    register_model,
+    register_model_architecture,
+)
+from fairseq.models.transformer import (
+    base_architecture,
+    Embedding,
+    TransformerModel,
+    TransformerEncoder,
+    TransformerDecoder,
+)
+
+
+@register_model('multilingual_transformer')
+class MultilingualTransformerModel(FairseqMultiModel):
+    """Train Transformer models for multiple language pairs simultaneously.
+
+    Requires `--task multilingual_translation`.
+
+    We inherit all arguments from TransformerModel and assume that all language
+    pairs use a single Transformer architecture. In addition, we provide several
+    options that are specific to the multilingual setting.
+
+    Args:
+        --share-encoder-embeddings: share encoder embeddings across all source languages
+        --share-decoder-embeddings: share decoder embeddings across all target languages
+        --share-encoders: share all encoder params (incl. embeddings) across all source languages
+        --share-decoders: share all decoder params (incl. embeddings) across all target languages
+    """
+
+    def __init__(self, encoders, decoders):  # both are mappings keyed by language pair (see build_model)
+        super().__init__(encoders, decoders)
+
+    @staticmethod
+    def add_args(parser):
+        """Add model-specific arguments to the parser."""
+        TransformerModel.add_args(parser)
+        parser.add_argument('--share-encoder-embeddings', action='store_true',
+                            help='share encoder embeddings across languages')
+        parser.add_argument('--share-decoder-embeddings', action='store_true',
+                            help='share decoder embeddings across languages')
+        parser.add_argument('--share-encoders', action='store_true',
+                            help='share encoders across languages')
+        parser.add_argument('--share-decoders', action='store_true',
+                            help='share decoders across languages')
+
+    @classmethod
+    def build_model(cls, args, task):
+        """Build a new model instance."""
+        from fairseq.tasks.multilingual_translation import MultilingualTranslationTask
+        assert isinstance(task, MultilingualTranslationTask)
+
+        # make sure all arguments are present in older models
+        base_multilingual_architecture(args)
+
+        if not hasattr(args, 'max_source_positions'):
+            args.max_source_positions = 1024
+        if not hasattr(args, 'max_target_positions'):
+            args.max_target_positions = 1024
+
+        src_langs = [lang_pair.split('-')[0] for lang_pair in task.model_lang_pairs]  # 'src-tgt' -> 'src'
+        tgt_langs = [lang_pair.split('-')[1] for lang_pair in task.model_lang_pairs]  # 'src-tgt' -> 'tgt'
+        # sharing a whole encoder/decoder implies sharing its embeddings
+        if args.share_encoders:
+            args.share_encoder_embeddings = True
+        if args.share_decoders:
+            args.share_decoder_embeddings = True
+
+        def build_embedding(dictionary, embed_dim, path=None):
+            num_embeddings = len(dictionary)
+            padding_idx = dictionary.pad()
+            emb = Embedding(num_embeddings, embed_dim, padding_idx)
+            # if provided, load from preloaded dictionaries
+            if path:
+                embed_dict = utils.parse_embedding(path)
+                utils.load_embedding(embed_dict, dictionary, emb)
+            return emb
+
+        # build shared embeddings (if applicable)
+        shared_encoder_embed_tokens, shared_decoder_embed_tokens = None, None
+        if args.share_all_embeddings:
+            if args.encoder_embed_dim != args.decoder_embed_dim:
+                raise ValueError(
+                    '--share-all-embeddings requires --encoder-embed-dim to match --decoder-embed-dim')
+            if args.decoder_embed_path and (
+                    args.decoder_embed_path != args.encoder_embed_path):
+                raise ValueError('--share-all-embeddings not compatible with --decoder-embed-path')
+            shared_encoder_embed_tokens = FairseqMultiModel.build_shared_embeddings(
+                dicts=task.dicts,
+                langs=task.langs,
+                embed_dim=args.encoder_embed_dim,
+                build_embedding=build_embedding,
+                pretrained_embed_path=args.encoder_embed_path,
+            )
+            shared_decoder_embed_tokens = shared_encoder_embed_tokens  # one embedding table serves both sides
+            args.share_decoder_input_output_embed = True
+        else:
+            if args.share_encoder_embeddings:
+                shared_encoder_embed_tokens = (
+                    FairseqMultiModel.build_shared_embeddings(
+                        dicts=task.dicts,
+                        langs=src_langs,
+                        embed_dim=args.encoder_embed_dim,
+                        build_embedding=build_embedding,
+                        pretrained_embed_path=args.encoder_embed_path,
+                    )
+                )
+            if args.share_decoder_embeddings:
+                shared_decoder_embed_tokens = (
+                    FairseqMultiModel.build_shared_embeddings(
+                        dicts=task.dicts,
+                        langs=tgt_langs,
+                        embed_dim=args.decoder_embed_dim,
+                        build_embedding=build_embedding,
+                        pretrained_embed_path=args.decoder_embed_path,
+                    )
+                )
+
+        # encoders/decoders for each language
+        lang_encoders, lang_decoders = {}, {}  # per-language caches, filled on first request
+
+        def get_encoder(lang):
+            if lang not in lang_encoders:
+                if shared_encoder_embed_tokens is not None:
+                    encoder_embed_tokens = shared_encoder_embed_tokens
+                else:
+                    encoder_embed_tokens = build_embedding(
+                        task.dicts[lang], args.encoder_embed_dim, args.encoder_embed_path
+                    )
+                lang_encoders[lang] = TransformerEncoder(args, task.dicts[lang], encoder_embed_tokens)
+            return lang_encoders[lang]
+
+        def get_decoder(lang):
+            if lang not in lang_decoders:
+                if shared_decoder_embed_tokens is not None:
+                    decoder_embed_tokens = shared_decoder_embed_tokens
+                else:
+                    decoder_embed_tokens = build_embedding(
+                        task.dicts[lang], args.decoder_embed_dim, args.decoder_embed_path
+                    )
+                lang_decoders[lang] = TransformerDecoder(args, task.dicts[lang], decoder_embed_tokens)
+            return lang_decoders[lang]
+
+        # shared encoders/decoders (if applicable)
+        shared_encoder, shared_decoder = None, None
+        if args.share_encoders:
+            shared_encoder = get_encoder(src_langs[0])
+        if args.share_decoders:
+            shared_decoder = get_decoder(tgt_langs[0])
+        # one entry per language pair; a language that repeats reuses its cached module
+        encoders, decoders = OrderedDict(), OrderedDict()
+        for lang_pair, src, tgt in zip(task.model_lang_pairs, src_langs, tgt_langs):
+            encoders[lang_pair] = shared_encoder if shared_encoder is not None else get_encoder(src)
+            decoders[lang_pair] = shared_decoder if shared_decoder is not None else get_decoder(tgt)
+
+        return MultilingualTransformerModel(encoders, decoders)
+
+    def load_state_dict(self, state_dict, strict=True, args=None):
+        state_dict_subset = state_dict.copy()  # shallow copy: the caller's dict is left intact
+        for k, _ in state_dict.items():
+            assert k.startswith('models.')
+            lang_pair = k.split('.')[1]  # keys look like 'models.<lang_pair>.<...>'
+            if lang_pair not in self.models:
+                del state_dict_subset[k]  # entry for a pair this model does not hold
+        super().load_state_dict(state_dict_subset, strict=strict, args=args)
+
+
+@register_model_architecture('multilingual_transformer', 'multilingual_transformer')
+def base_multilingual_architecture(args):
+    """Transformer defaults plus the four sharing switches, each defaulting to False."""
+    base_architecture(args)
+    for flag in ('share_encoder_embeddings', 'share_decoder_embeddings',
+                 'share_encoders', 'share_decoders'):
+        setattr(args, flag, getattr(args, flag, False))
+
+
+@register_model_architecture('multilingual_transformer', 'multilingual_transformer_iwslt_de_en')
+def multilingual_transformer_iwslt_de_en(args):
+    """Preset with 512-dim embeddings, 1024-dim FFN, 4 heads and 6 layers on both sides."""
+    sizes = (
+        ('embed_dim', 512), ('ffn_embed_dim', 1024), ('attention_heads', 4), ('layers', 6),
+    )
+    for side in ('encoder', 'decoder'):
+        for suffix, value in sizes:
+            option = side + '_' + suffix
+            setattr(args, option, getattr(args, option, value))
+    base_multilingual_architecture(args)
diff --git a/fairseq/models/nat/__init__.py b/fairseq/models/nat/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b6ca06acb931f2fe53ec0dea0e0201c2289c7409
--- /dev/null
+++ b/fairseq/models/nat/__init__.py
@@ -0,0 +1,7 @@
+from .fairseq_nat_model import *
+from .nonautoregressive_transformer import *
+from .nat_crf_transformer import *
+from .iterative_nonautoregressive_transformer import *
+from .cmlm_transformer import *
+from .levenshtein_transformer import *
+from .insertion_transformer import *
diff --git a/fairseq/models/nat/__pycache__/__init__.cpython-310.pyc b/fairseq/models/nat/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e81ccc2c6214ba7f1114628fa9d98c5986527ccd
Binary files /dev/null and b/fairseq/models/nat/__pycache__/__init__.cpython-310.pyc differ
diff --git a/fairseq/models/nat/__pycache__/cmlm_transformer.cpython-310.pyc b/fairseq/models/nat/__pycache__/cmlm_transformer.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..185fb60f672ab3c69853d32c46e990b7180930e8
Binary files /dev/null and b/fairseq/models/nat/__pycache__/cmlm_transformer.cpython-310.pyc differ
diff --git a/fairseq/models/nat/__pycache__/fairseq_nat_model.cpython-310.pyc b/fairseq/models/nat/__pycache__/fairseq_nat_model.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..eb5a166ae29430172bb09bde2e27dd01bbe0b79f
Binary files /dev/null and b/fairseq/models/nat/__pycache__/fairseq_nat_model.cpython-310.pyc differ
diff --git a/fairseq/models/nat/__pycache__/insertion_transformer.cpython-310.pyc b/fairseq/models/nat/__pycache__/insertion_transformer.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6489f0c539eebda4fa1df0b7f0472c19d0bf003f
Binary files /dev/null and b/fairseq/models/nat/__pycache__/insertion_transformer.cpython-310.pyc differ
diff --git a/fairseq/models/nat/__pycache__/iterative_nonautoregressive_transformer.cpython-310.pyc b/fairseq/models/nat/__pycache__/iterative_nonautoregressive_transformer.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..14245007117d49ce3e251968a1b9573c3ef65e33
Binary files /dev/null and b/fairseq/models/nat/__pycache__/iterative_nonautoregressive_transformer.cpython-310.pyc differ
diff --git a/fairseq/models/nat/__pycache__/levenshtein_transformer.cpython-310.pyc b/fairseq/models/nat/__pycache__/levenshtein_transformer.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..feebfd93995d834effa3b09f8cfac48ef7220e68
Binary files /dev/null and b/fairseq/models/nat/__pycache__/levenshtein_transformer.cpython-310.pyc differ
diff --git a/fairseq/models/nat/__pycache__/levenshtein_utils.cpython-310.pyc b/fairseq/models/nat/__pycache__/levenshtein_utils.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c59b896296a3db6aa648366de7168d99d7ca6a07
Binary files /dev/null and b/fairseq/models/nat/__pycache__/levenshtein_utils.cpython-310.pyc differ
diff --git a/fairseq/models/nat/__pycache__/nat_crf_transformer.cpython-310.pyc b/fairseq/models/nat/__pycache__/nat_crf_transformer.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0453b8c53cfa8483362f0210361a7ec08dc43fa1
Binary files /dev/null and b/fairseq/models/nat/__pycache__/nat_crf_transformer.cpython-310.pyc differ
diff --git a/fairseq/models/nat/__pycache__/nonautoregressive_transformer.cpython-310.pyc b/fairseq/models/nat/__pycache__/nonautoregressive_transformer.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9b0ae0ae1e6dcbfdb5ea2a7b69f19e89124da553
Binary files /dev/null and b/fairseq/models/nat/__pycache__/nonautoregressive_transformer.cpython-310.pyc differ
diff --git a/fairseq/models/nat/cmlm_transformer.py b/fairseq/models/nat/cmlm_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..86c770569d43d9946db9c74cfa4468c432df5482
--- /dev/null
+++ b/fairseq/models/nat/cmlm_transformer.py
@@ -0,0 +1,154 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+This file implements:
+Ghazvininejad, Marjan, et al.
+"Constant-time machine translation with conditional masked language models."
+arXiv preprint arXiv:1904.09324 (2019).
+"""
+
+from fairseq.models import register_model, register_model_architecture
+from fairseq.models.nat import NATransformerModel
+from fairseq.utils import new_arange
+
+
+def _skeptical_unmasking(output_scores, output_masks, p):
+    """Flag, per row, the lowest-scoring positions; their number is (row mask count - 2) * p, truncated."""
+    _, rank_to_position = output_scores.sort(-1)
+    n_counted = output_masks.sum(1, keepdim=True).type_as(output_scores)
+    n_flagged = ((n_counted - 2) * p).long()
+    lowest_ranks = new_arange(output_masks) < n_flagged
+    return lowest_ranks.scatter(1, rank_to_position, lowest_ranks)
+
+
+@register_model("cmlm_transformer")
+class CMLMNATransformerModel(NATransformerModel):  # conditional masked LM; the paper is cited in the module docstring
+    @staticmethod
+    def add_args(parser):
+        NATransformerModel.add_args(parser)
+
+    def forward(
+        self, src_tokens, src_lengths, prev_output_tokens, tgt_tokens, **kwargs
+    ):
+        assert not self.decoder.src_embedding_copy, "do not support embedding copy."
+
+        # encoding
+        encoder_out = self.encoder(src_tokens, src_lengths=src_lengths, **kwargs)
+        # length prediction
+        length_out = self.decoder.forward_length(normalize=False, encoder_out=encoder_out)
+        length_tgt = self.decoder.forward_length_prediction(length_out, encoder_out, tgt_tokens)
+
+        # decoding
+        word_ins_out = self.decoder(
+            normalize=False,
+            prev_output_tokens=prev_output_tokens,
+            encoder_out=encoder_out)
+        word_ins_mask = prev_output_tokens.eq(self.unk)  # unk is the placeholder for slots to be predicted
+
+        return {
+            "word_ins": {
+                "out": word_ins_out, "tgt": tgt_tokens,
+                "mask": word_ins_mask, "ls": self.args.label_smoothing,
+                "nll_loss": True
+            },
+            "length": {
+                "out": length_out, "tgt": length_tgt,
+                "factor": self.decoder.length_loss_factor
+            }
+        }
+
+    def forward_decoder(self, decoder_out, encoder_out, decoding_format=None, **kwargs):
+        """Run one refinement step: fill every unk slot, then (except on the last step) reset the lowest-scoring ones to unk."""
+        step = decoder_out.step
+        max_step = decoder_out.max_step
+
+        output_tokens = decoder_out.output_tokens
+        output_scores = decoder_out.output_scores
+        history = decoder_out.history
+
+        # execute the decoder
+        output_masks = output_tokens.eq(self.unk)
+        _scores, _tokens = self.decoder(
+            normalize=True,
+            prev_output_tokens=output_tokens,
+            encoder_out=encoder_out,
+        ).max(-1)  # best token and its score at every position
+        output_tokens.masked_scatter_(output_masks, _tokens[output_masks])  # in place: only unk slots are overwritten
+        output_scores.masked_scatter_(output_masks, _scores[output_masks])
+
+        if history is not None:
+            history.append(output_tokens.clone())
+
+        # skeptical decoding (depend on the maximum decoding steps.)
+        if (step + 1) < max_step:
+            skeptical_mask = _skeptical_unmasking(
+                output_scores, output_tokens.ne(self.pad), 1 - (step + 1) / max_step  # ratio shrinks as step grows
+            )
+
+            output_tokens.masked_fill_(skeptical_mask, self.unk)
+            output_scores.masked_fill_(skeptical_mask, 0.0)
+
+            if history is not None:
+                history.append(output_tokens.clone())
+
+        return decoder_out._replace(
+            output_tokens=output_tokens,
+            output_scores=output_scores,
+            attn=None,
+            history=history
+        )
+
+
+@register_model_architecture("cmlm_transformer", "cmlm_transformer")
+def cmlm_base_architecture(args):  # fill in a default for every option missing from args
+    args.encoder_embed_path = getattr(args, "encoder_embed_path", None)
+    args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512)
+    args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 2048)
+    args.encoder_layers = getattr(args, "encoder_layers", 6)
+    args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 8)
+    args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False)
+    args.encoder_learned_pos = getattr(args, "encoder_learned_pos", False)
+    args.decoder_embed_path = getattr(args, "decoder_embed_path", None)
+    args.decoder_embed_dim = getattr(args, "decoder_embed_dim", args.encoder_embed_dim)  # follows the encoder width set above
+    args.decoder_ffn_embed_dim = getattr(
+        args, "decoder_ffn_embed_dim", args.encoder_ffn_embed_dim
+    )  # follows the encoder FFN width set above
+    args.decoder_layers = getattr(args, "decoder_layers", 6)
+    args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 8)
+    args.decoder_normalize_before = getattr(args, "decoder_normalize_before", False)
+    args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False)
+    args.attention_dropout = getattr(args, "attention_dropout", 0.0)
+    args.activation_dropout = getattr(args, "activation_dropout", 0.0)
+    args.activation_fn = getattr(args, "activation_fn", "relu")
+    args.dropout = getattr(args, "dropout", 0.1)
+    args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None)
+    args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0)
+    args.share_decoder_input_output_embed = getattr(
+        args, "share_decoder_input_output_embed", False
+    )
+    args.share_all_embeddings = getattr(args, "share_all_embeddings", True)
+    args.no_token_positional_embeddings = getattr(
+        args, "no_token_positional_embeddings", False
+    )
+    args.adaptive_input = getattr(args, "adaptive_input", False)
+    args.apply_bert_init = getattr(args, "apply_bert_init", False)
+    # decoder input/output widths follow the decoder embedding width unless set explicitly
+    args.decoder_output_dim = getattr(
+        args, "decoder_output_dim", args.decoder_embed_dim
+    )
+    args.decoder_input_dim = getattr(args, "decoder_input_dim", args.decoder_embed_dim)
+
+    # --- special arguments ---
+    args.sg_length_pred = getattr(args, "sg_length_pred", False)
+    args.pred_length_offset = getattr(args, "pred_length_offset", False)
+    args.length_loss_factor = getattr(args, "length_loss_factor", 0.1)
+    args.ngram_predictor = getattr(args, "ngram_predictor", 1)
+    args.src_embedding_copy = getattr(args, "src_embedding_copy", False)
+
+
+@register_model_architecture("cmlm_transformer", "cmlm_transformer_wmt_en_de")
+def cmlm_wmt_en_de(args):  # same defaults as cmlm_base_architecture, registered under a second name
+    cmlm_base_architecture(args)
diff --git a/fairseq/models/nat/fairseq_nat_model.py b/fairseq/models/nat/fairseq_nat_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..d37a234ba9c8658eba8b393c25af98807ea1ec10
--- /dev/null
+++ b/fairseq/models/nat/fairseq_nat_model.py
@@ -0,0 +1,145 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+import torch
+
+from fairseq.models.transformer import TransformerModel, TransformerEncoder, TransformerDecoder
+from fairseq.modules.transformer_sentence_encoder import init_bert_params
+
+
+def ensemble_encoder(func):  # decorator: run func on every ensemble member and stack the resulting fields
+    def wrapper(self, *args, **kwargs):
+        if self.ensemble_models is None or len(self.ensemble_models) == 1:
+            return func(self, *args, **kwargs)  # no ensemble: plain call
+        encoder_outs = [func(model, *args, **kwargs) for model in self.ensemble_models]
+        _encoder_out = encoder_outs[0]
+        # stack one field across members along a new last dim; a None field stays None
+        def stack(key):
+            outs = [getattr(e, key) for e in encoder_outs]
+            return torch.stack(outs, -1) if outs[0] is not None else None
+
+        return _encoder_out._replace(
+            encoder_out=stack('encoder_out'),
+            encoder_embedding=stack('encoder_embedding'),
+            encoder_states=stack('encoder_states')
+        )
+    return wrapper
+
+
+def ensemble_decoder(func):  # decorator: run func on every ensemble member and combine the outputs
+    def wrapper(self, normalize=False, encoder_out=None, *args, **kwargs):
+        if self.ensemble_models is None or len(self.ensemble_models) == 1:
+            return func(self, normalize=normalize, encoder_out=encoder_out, *args, **kwargs)
+        # member i receives slice i along the last dim of the stacked encoder output
+        action_outs = [
+            func(model, normalize=normalize, encoder_out=encoder_out._replace(
+                encoder_out=encoder_out.encoder_out[:, :, :, i]
+            ), *args, **kwargs)
+            for i, model in enumerate(self.ensemble_models)
+        ]
+
+        if not isinstance(action_outs[0], tuple):  # single return value: wrap it so both cases are lists
+            action_outs = [[a] for a in action_outs]
+        else:
+            action_outs = [list(a) for a in action_outs]
+        # first output: log-mean-exp over members when normalize is set; other outputs: stacked (None stays None)
+        ensembled_outs = []
+        for i in range(len(action_outs[0])):
+            if i == 0 and normalize:
+                ensembled_outs += [
+                    torch.logsumexp(
+                        torch.stack([a[i] for a in action_outs], -1),
+                        dim=-1) - math.log(len(self.ensemble_models))
+                ]
+            elif action_outs[0][i] is not None:
+                ensembled_outs += [
+                    torch.stack([a[i] for a in action_outs], -1)
+                ]
+            else:
+                ensembled_outs += [None]
+
+        if len(ensembled_outs) == 1:
+            return ensembled_outs[0]  # unwrap again for single-output functions
+        return tuple(ensembled_outs)
+    return wrapper
+
+
+class FairseqNATModel(TransformerModel):
+    """
+    Abstract base for non-autoregressive models; subclasses implement forward, forward_decoder and initialize_output_tokens.
+    """
+    def __init__(self, args, encoder, decoder):
+        super().__init__(args, encoder, decoder)
+        self.tgt_dict = decoder.dictionary
+        self.bos = decoder.dictionary.bos()
+        self.eos = decoder.dictionary.eos()
+        self.pad = decoder.dictionary.pad()
+        self.unk = decoder.dictionary.unk()
+        # NOTE: enable_ensemble() configures the encoder/decoder, not this attribute
+        self.ensemble_models = None
+
+    @property
+    def allow_length_beam(self):
+        return False
+
+    @property
+    def allow_ensemble(self):
+        return True
+
+    def enable_ensemble(self, models):  # hand each member's encoder/decoder to this model's encoder/decoder
+        self.encoder.ensemble_models = [m.encoder for m in models]
+        self.decoder.ensemble_models = [m.decoder for m in models]
+
+    @staticmethod
+    def add_args(parser):
+        TransformerModel.add_args(parser)
+        parser.add_argument(
+            "--apply-bert-init",
+            action="store_true",
+            help="use custom param initialization for BERT",
+        )
+
+    @classmethod
+    def build_decoder(cls, args, tgt_dict, embed_tokens):
+        decoder = FairseqNATDecoder(args, tgt_dict, embed_tokens)
+        if getattr(args, "apply_bert_init", False):
+            decoder.apply(init_bert_params)
+        return decoder
+
+    @classmethod
+    def build_encoder(cls, args, src_dict, embed_tokens):
+        encoder = FairseqNATEncoder(args, src_dict, embed_tokens)
+        if getattr(args, "apply_bert_init", False):
+            encoder.apply(init_bert_params)
+        return encoder
+
+    def forward_encoder(self, encoder_inputs):
+        return self.encoder(*encoder_inputs)  # encoder_inputs is unpacked as positional arguments
+
+    def forward_decoder(self, *args, **kwargs):
+        raise NotImplementedError  # abstract: raise instead of returning the exception class
+
+    def initialize_output_tokens(self, *args, **kwargs):
+        raise NotImplementedError  # abstract: raise instead of returning the exception class
+
+    def forward(self, *args, **kwargs):
+        raise NotImplementedError  # abstract: raise instead of returning the exception class
+
+
+class FairseqNATEncoder(TransformerEncoder):  # TransformerEncoder whose forward() can fan out over an ensemble
+    def __init__(self, args, dictionary, embed_tokens):
+        super().__init__(args, dictionary, embed_tokens)
+        self.ensemble_models = None  # None means no ensembling
+
+    @ensemble_encoder
+    def forward(self, *args, **kwargs):
+        return super().forward(*args, **kwargs)
+
+
+class FairseqNATDecoder(TransformerDecoder):  # TransformerDecoder carrying the ensemble_models slot read by ensemble_decoder
+    def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False):
+        super().__init__(args, dictionary, embed_tokens, no_encoder_attn)
+        self.ensemble_models = None  # None means no ensembling
diff --git a/fairseq/models/nat/insertion_transformer.py b/fairseq/models/nat/insertion_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..a5f3c1abc5a7106aac2430e05bfaa8652162913e
--- /dev/null
+++ b/fairseq/models/nat/insertion_transformer.py
@@ -0,0 +1,280 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+
+from fairseq.models import register_model, register_model_architecture
+from fairseq.models.nat import (
+    LevenshteinTransformerDecoder,
+    LevenshteinTransformerModel,
+    FairseqNATModel,
+    ensemble_decoder
+)
+from fairseq.models.transformer import Linear
+from fairseq.utils import new_arange
+from fairseq.modules.transformer_sentence_encoder import init_bert_params
+
+
+class NegativeDistanceScore(object):  # soft weights over a span: softmax of -|centre - i| / tau
+    def __init__(self):
+        # scores[tau] is a (49, 50) table; row L - 1 holds the weights for a span of L candidates
+        # pre-compute some values
+        self.scores = {}
+
+        self.scores[0.5] = self.compute_score_full(50, 0.5)
+        self.scores[1.0] = self.compute_score_full(50, 1.0)
+        self.scores[2.0] = self.compute_score_full(50, 2.0)
+
+    def __call__(self, i, L, tau):  # weight of candidate i (0-based) among L candidates
+        if (tau is None) or (tau > 1000):
+            return 1 / L  # no temperature, or a very large one: uniform weights
+
+        if tau in self.scores:
+            if L < self.scores[tau].shape[0]:
+                return self.scores[tau][L - 1, i]
+        return self.compute_score(L, tau)[i]  # tau not cached or span too long: compute directly
+
+    def compute_score(self, L, tau):  # weights for one span length, as a length-L array summing to 1
+        s = np.array([-abs((L - 1) / 2 - i) / tau for i in range(L)])  # centre (L - 1) / 2, same as the cached table
+        s = np.exp(s - s.max())
+        return s / s.sum()
+
+    def compute_score_full(self, L, tau):  # table for span lengths 1..L-1; row r is valid for columns 0..r
+        s = -abs(np.arange(0, L - 1)[:, None] / 2 - np.arange(L)[None, :]) / tau
+        s = np.tril(s, 0) + np.triu(s - float("inf"), 1)  # -inf above the diagonal so those columns get weight 0
+        s = np.exp(s - s.max(1, keepdims=True))
+        return s / s.sum(1, keepdims=True)
+
+
+neg_scorer = NegativeDistanceScore()  # module-level instance used by _get_ins_targets; tables are built once at import
+
+
+def _get_ins_targets(in_tokens, out_tokens, padding_idx, unk_idx, vocab_size, tau=None):  # returns a float (B, T-1, V) tensor of soft insertion targets; unk_idx is not used in this body
+    try:
+        from fairseq import libnat  # imported lazily, only when targets are actually needed
+    except ImportError as e:
+        import sys
+        sys.stderr.write('ERROR: missing libnat. run `pip install --editable .`\n')
+        raise e
+
+    B = in_tokens.size(0)
+    T = in_tokens.size(1)
+    V = vocab_size
+
+    with torch.cuda.device_of(in_tokens):
+        in_tokens_list = [  # per-sentence Python lists with padding removed
+            [t for t in s if t != padding_idx] for i, s in enumerate(in_tokens.tolist())
+        ]
+        out_tokens_list = [
+            [t for t in s if t != padding_idx]
+            for i, s in enumerate(out_tokens.tolist())
+        ]
+
+    full_labels = libnat.suggested_ed2_path(
+        in_tokens_list, out_tokens_list, padding_idx
+    )
+    insert_labels = [a[:-1] for a in full_labels]  # drop the last entry of every returned path
+
+    # numericalize: word w at slot j of sentence i lives at flat index w + (j + i * (T - 1)) * V
+    insert_label_tensors = in_tokens.new_zeros(B * (T - 1) * V).float()
+    insert_index, insert_labels = zip(  # NOTE(review): if the list below is empty this two-name unpacking raises ValueError - confirm callers never hit it
+        *[
+            (w + (j + i * (T - 1)) * V, neg_scorer(k, len(label), tau))
+            for i, labels in enumerate(insert_labels)
+            for j, label in enumerate(labels[1:-1])
+            for k, w in enumerate(label)
+        ]
+    )  # HACK 1:-1
+    insert_index, insert_labels = [
+        torch.tensor(list(a), device=in_tokens.device)
+        for a in [insert_index, insert_labels]
+    ]
+    insert_label_tensors.scatter_(0, insert_index.long(), insert_labels)
+    insert_label_tensors = insert_label_tensors.view(B, T - 1, V)
+
+    return insert_label_tensors
+
+
+def _apply_ins_words(in_tokens, in_scores, word_ins_pred, word_ins_scores, padding_idx):
+    # Interleave the predicted words with the existing tokens; returns (out_tokens, out_scores).
+    padding_masks = in_tokens[:, 1:].eq(padding_idx)
+    word_ins_scores.masked_fill_(padding_masks, 0.0)  # in place: the caller's tensor is modified
+    word_ins_pred.masked_fill_(padding_masks, padding_idx)  # in place: the caller's tensor is modified
+
+    in_coords = new_arange(in_tokens).type_as(in_scores)
+
+    # shift all padding predictions to infinite
+    out_coords = (in_coords[:, 1:] - 0.5).masked_fill(  # slot k sits at k - 0.5, i.e. between tokens k-1 and k
+        word_ins_pred.eq(padding_idx), float("inf")
+    )
+    out_coords = torch.cat([in_coords, out_coords], 1).sort(-1)[1]  # sort indices give the merged order
+    out_tokens = torch.cat([in_tokens, word_ins_pred], 1).gather(1, out_coords)
+    out_scores = torch.cat([in_scores, word_ins_scores], 1).gather(1, out_coords)
+    return out_tokens, out_scores
+
+
+@register_model("insertion_transformer")
+class InsertionTransformerModel(LevenshteinTransformerModel):  # insertion-only model: a single word_ins head
+    def __init__(self, args, encoder, decoder):
+        super().__init__(args, encoder, decoder)
+
+    @staticmethod
+    def add_args(parser):  # calls FairseqNATModel.add_args directly, then adds --label-tau
+        FairseqNATModel.add_args(parser)
+        parser.add_argument("--label-tau", default=None, type=float)
+
+    @classmethod
+    def build_decoder(cls, args, tgt_dict, embed_tokens):  # returns an InsertionTransformerDecoder
+        decoder = InsertionTransformerDecoder(args, tgt_dict, embed_tokens)
+        if getattr(args, "apply_bert_init", False):
+            decoder.apply(init_bert_params)
+        return decoder
+
+    def forward(
+        self, src_tokens, src_lengths, prev_output_tokens, tgt_tokens, **kwargs
+    ):
+        # Training pass: returns {"word_ins": {...}} with logits, soft targets, mask and smoothing value.
+        assert tgt_tokens is not None, "forward function only supports training."
+
+        # encoding
+        encoder_out = self.encoder(src_tokens, src_lengths=src_lengths, **kwargs)
+
+        # unnormalised word-insertion outputs from the decoder
+        word_ins_out = self.decoder.forward_word_ins(
+            normalize=False,
+            prev_output_tokens=prev_output_tokens,
+            encoder_out=encoder_out
+        )
+        # generate training labels for insertion
+        word_ins_tgt = _get_ins_targets(
+            prev_output_tokens,
+            tgt_tokens,
+            self.pad,
+            self.unk,
+            len(self.tgt_dict),
+            tau=self.decoder.label_tau,
+        ).type_as(word_ins_out)
+        word_ins_masks = prev_output_tokens[:, 1:].ne(self.pad)  # mask computed from columns 1..T-1 only
+
+        return {
+            "word_ins": {
+                "out": word_ins_out, "tgt": word_ins_tgt,
+                "mask": word_ins_masks, "ls": self.args.label_smoothing,
+                "nll_loss": True
+            }
+        }
+
+    def forward_decoder(
+        self, decoder_out, encoder_out, eos_penalty=0.0, max_ratio=None, **kwargs
+    ):
+        # One decoding step; max_ratio is accepted but not used in this body.
+        output_tokens = decoder_out.output_tokens
+        output_scores = decoder_out.output_scores
+        history = decoder_out.history
+
+        # TODO: decoding for InsertionTransformer
+        word_ins_score = self.decoder.forward_word_ins(
+            normalize=True,
+            prev_output_tokens=output_tokens,
+            encoder_out=encoder_out
+        )
+
+        if eos_penalty > 0.0:
+            word_ins_score[:, :, self.pad] -= eos_penalty  # the penalty is applied to the pad entry
+        word_ins_score, word_ins_pred = word_ins_score.max(-1)  # greedy choice per slot
+        output_tokens, output_scores = _apply_ins_words(
+            output_tokens, output_scores, word_ins_pred, word_ins_score, self.pad
+        )
+
+        # delete some unnecessary paddings
+        cut_off = output_tokens.ne(self.pad).sum(1).max()
+        output_tokens = output_tokens[:, :cut_off]
+        output_scores = output_scores[:, :cut_off]
+
+        if history is not None:
+            history.append(output_tokens.clone())
+
+        return decoder_out._replace(
+            output_tokens=output_tokens,
+            output_scores=output_scores,
+            attn=None,
+            history=history
+        )
+
+
+class InsertionTransformerDecoder(LevenshteinTransformerDecoder):  # decoder exposing only the word-insertion head
+    def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False):
+        # skip LevenshteinTransformerDecoder.__init__ and continue with the next class in the MRO
+        super(LevenshteinTransformerDecoder, self).__init__(
+            args, dictionary, embed_tokens, no_encoder_attn=no_encoder_attn
+        )
+
+        self.dictionary = dictionary
+        self.bos = dictionary.bos()
+        self.unk = dictionary.unk()
+        self.eos = dictionary.eos()
+        self.pool_out = Linear(self.output_embed_dim * 2, self.output_embed_dim)  # maps a concatenated feature pair back to one embedding width
+
+        self.label_tau = getattr(args, "label_tau", None)  # read by the model when building training targets
+
+    @ensemble_decoder
+    def forward_word_ins(self, normalize, encoder_out, prev_output_tokens):  # log-probs if normalize, else raw output_layer values
+        features = self.extract_features(prev_output_tokens, encoder_out=encoder_out)[0]
+        features = self.pool_out(
+            torch.cat([features[:, :-1, :], features[:, 1:, :]], 2)  # pair each position with its right neighbour
+        )
+        decoder_out = self.output_layer(features)
+        return F.log_softmax(decoder_out, -1) if normalize else decoder_out
+
+    def forward_mask_ins(self, *args, **kwargs):  # not supported by this decoder
+        raise NotImplementedError
+
+    def forward_word_del(self, *args, **kwargs):  # not supported by this decoder
+        raise NotImplementedError
+
+
+@register_model_architecture("insertion_transformer", "insertion_transformer")
+def insertion_base_architecture(args):  # set a default for every hyper-parameter missing from args; existing values are kept
+    args.encoder_embed_path = getattr(args, "encoder_embed_path", None)
+    args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512)
+    args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 2048)
+    args.encoder_layers = getattr(args, "encoder_layers", 6)
+    args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 8)
+    args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False)
+    args.encoder_learned_pos = getattr(args, "encoder_learned_pos", False)
+    args.decoder_embed_path = getattr(args, "decoder_embed_path", None)
+    args.decoder_embed_dim = getattr(args, "decoder_embed_dim", args.encoder_embed_dim)  # decoder width defaults to the encoder's
+    args.decoder_ffn_embed_dim = getattr(
+        args, "decoder_ffn_embed_dim", args.encoder_ffn_embed_dim
+    )
+    args.decoder_layers = getattr(args, "decoder_layers", 6)
+    args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 8)
+    args.decoder_normalize_before = getattr(args, "decoder_normalize_before", False)
+    args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False)
+    args.attention_dropout = getattr(args, "attention_dropout", 0.0)
+    args.activation_dropout = getattr(args, "activation_dropout", 0.0)
+    args.activation_fn = getattr(args, "activation_fn", "relu")
+    args.dropout = getattr(args, "dropout", 0.1)
+    args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None)
+    args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0)
+    args.share_decoder_input_output_embed = getattr(
+        args, "share_decoder_input_output_embed", False
+    )
+    args.share_all_embeddings = getattr(args, "share_all_embeddings", False)
+    args.no_token_positional_embeddings = getattr(
+        args, "no_token_positional_embeddings", False
+    )
+    args.adaptive_input = getattr(args, "adaptive_input", False)
+    args.apply_bert_init = getattr(args, "apply_bert_init", False)
+
+    args.decoder_output_dim = getattr(
+        args, "decoder_output_dim", args.decoder_embed_dim
+    )
+    args.decoder_input_dim = getattr(args, "decoder_input_dim", args.decoder_embed_dim)
+
+    # special for insertion transformer
+    args.label_tau = getattr(args, "label_tau", None)
diff --git a/fairseq/models/nat/iterative_nonautoregressive_transformer.py b/fairseq/models/nat/iterative_nonautoregressive_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc340c387df8bb13eacf8ae527b855a8f3501d53
--- /dev/null
+++ b/fairseq/models/nat/iterative_nonautoregressive_transformer.py
@@ -0,0 +1,205 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+
+from fairseq.models import register_model, register_model_architecture
+from fairseq.models.nat import NATransformerModel
+
+
+def _sequential_poisoning(s, V, beta=0.33, bos=2, eos=3, pad=1):
+    # s: batch of token ids, corrupted IN PLACE and returned; V: vocabulary size (random ids are drawn from [4, V))
+    # beta: total corruption probability per token, split evenly between replace / repeat / swap
+    rand_words = torch.randint(low=4, high=V, size=s.size(), device=s.device)
+    choices = torch.rand(size=s.size(), device=s.device)
+    choices.masked_fill_((s == pad) | (s == bos) | (s == eos), 1)  # 1 >= beta, so special tokens fall in `safe` for beta <= 1
+
+    replace = choices < beta / 3
+    repeat = (choices >= beta / 3) & (choices < beta * 2 / 3)
+    swap = (choices >= beta * 2 / 3) & (choices < beta)
+    safe = choices >= beta
+
+    for i in range(s.size(1) - 1):  # left to right: each step may rewrite positions i and i + 1
+        rand_word = rand_words[:, i]
+        next_word = s[:, i + 1]
+        self_word = s[:, i]
+
+        replace_i = replace[:, i]
+        swap_i = swap[:, i] & (next_word != eos)  # never move EOS: compare with the eos argument, not a literal id
+        repeat_i = repeat[:, i] & (next_word != eos)  # never overwrite EOS with a repeated token
+        safe_i = safe[:, i] | ((next_word == eos) & (~replace_i))  # a blocked swap/repeat leaves the token untouched
+
+        s[:, i] = (
+            self_word * (safe_i | repeat_i).long()
+            + next_word * swap_i.long()
+            + rand_word * replace_i.long()
+        )
+        s[:, i + 1] = (
+            next_word * (safe_i | replace_i).long()
+            + self_word * (swap_i | repeat_i).long()
+        )
+    return s
+
+
+def gumbel_noise(input, TINY=1e-8):  # noise shaped like `input`: -log(-log(U + TINY) + TINY) with U drawn by uniform_()
+    return input.new_zeros(*input.size()).uniform_().add_(
+        TINY).log_().neg_().add_(TINY).log_().neg_()  # TINY keeps both log() arguments above zero
+
+
+@register_model("iterative_nonautoregressive_transformer")
+class IterNATransformerModel(NATransformerModel):  # NAT model trained over several refinement passes
+    @staticmethod
+    def add_args(parser):  # NATransformerModel options plus the three refinement options below
+        NATransformerModel.add_args(parser)
+        parser.add_argument("--train-step", type=int,
+                            help="number of refinement iterations during training")
+        parser.add_argument("--dae-ratio", type=float,
+                            help="the probability of switching to the denoising auto-encoder loss")
+        parser.add_argument("--stochastic-approx", action="store_true",
+                            help="sampling from the decoder as the inputs for next iteration")
+
+    @classmethod
+    def build_model(cls, args, task):  # build via the parent, then copy the refinement settings onto the model
+        model = super().build_model(args, task)
+        model.train_step = getattr(args, "train_step", 4)
+        model.dae_ratio = getattr(args, "dae_ratio", 0.5)
+        model.stochastic_approx = getattr(args, "stochastic_approx", False)
+        return model
+
+    def forward(
+        self, src_tokens, src_lengths, prev_output_tokens, tgt_tokens, **kwargs
+    ):
+        # Training pass: returns a dict with a "word_ins" entry (all passes stacked on dim 0) and a "length" entry.
+        B, T = prev_output_tokens.size()
+
+        # encoding
+        encoder_out = self.encoder(src_tokens, src_lengths=src_lengths, **kwargs)
+
+        # length prediction
+        length_out = self.decoder.forward_length(normalize=False, encoder_out=encoder_out)
+        length_tgt = self.decoder.forward_length_prediction(length_out, encoder_out, tgt_tokens)
+
+        # decoding
+        word_ins_outs, word_ins_tgts, word_ins_masks = [], [], []
+        for t in range(self.train_step):  # every pass is trained against the same tgt_tokens
+            word_ins_out = self.decoder(
+                normalize=False, 
+                prev_output_tokens=prev_output_tokens,
+                encoder_out=encoder_out,
+                step=t,
+            )
+            word_ins_tgt = tgt_tokens
+            word_ins_mask = word_ins_tgt.ne(self.pad)
+
+            word_ins_outs.append(word_ins_out)
+            word_ins_tgts.append(word_ins_tgt)
+            word_ins_masks.append(word_ins_mask)
+
+            if t < (self.train_step - 1):  # the last pass needs no input for a following pass
+                # prediction for next iteration
+                if self.stochastic_approx:
+                    word_ins_prediction = (
+                        word_ins_out + gumbel_noise(word_ins_out)
+                    ).max(-1)[1]  # argmax after adding noise
+                else:
+                    word_ins_prediction = word_ins_out.max(-1)[1]  # plain argmax
+
+                prev_output_tokens = prev_output_tokens.masked_scatter(  # out of place: returns a new tensor
+                    word_ins_mask, word_ins_prediction[word_ins_mask]
+                )
+
+                if self.dae_ratio > 0:
+                    # we do not perform denoising for the first iteration
+                    corrputed = (  # per-sentence mask: True with probability dae_ratio
+                        torch.rand(size=(B,), device=prev_output_tokens.device)
+                        < self.dae_ratio
+                    )
+                    corrputed_tokens = _sequential_poisoning(
+                        tgt_tokens[corrputed],  # selected target rows are what gets corrupted
+                        len(self.tgt_dict),
+                        0.33,
+                        self.bos,
+                        self.eos,
+                        self.pad,
+                    )
+                    prev_output_tokens[corrputed] = corrputed_tokens  # selected sentences get the corrupted target as input
+
+        # concat everything
+        word_ins_out = torch.cat(word_ins_outs, 0)
+        word_ins_tgt = torch.cat(word_ins_tgts, 0)
+        word_ins_mask = torch.cat(word_ins_masks, 0)
+
+        return {
+            "word_ins": {
+                "out": word_ins_out, "tgt": word_ins_tgt,
+                "mask": word_ins_mask, "ls": self.args.label_smoothing,
+                "nll_loss": True
+            },
+            "length": {
+                "out": length_out, "tgt": length_tgt,
+                "factor": self.decoder.length_loss_factor
+            }
+        }
+
+
+@register_model_architecture(
+    "iterative_nonautoregressive_transformer", "iterative_nonautoregressive_transformer"
+)
+def inat_base_architecture(args):  # set a default for every hyper-parameter missing from args; existing values are kept
+    args.encoder_embed_path = getattr(args, "encoder_embed_path", None)
+    args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512)
+    args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 2048)
+    args.encoder_layers = getattr(args, "encoder_layers", 6)
+    args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 8)
+    args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False)
+    args.encoder_learned_pos = getattr(args, "encoder_learned_pos", False)
+    args.decoder_embed_path = getattr(args, "decoder_embed_path", None)
+    args.decoder_embed_dim = getattr(args, "decoder_embed_dim", args.encoder_embed_dim)  # decoder width defaults to the encoder's
+    args.decoder_ffn_embed_dim = getattr(
+        args, "decoder_ffn_embed_dim", args.encoder_ffn_embed_dim
+    )
+    args.decoder_layers = getattr(args, "decoder_layers", 6)
+    args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 8)
+    args.decoder_normalize_before = getattr(args, "decoder_normalize_before", False)
+    args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False)
+    args.attention_dropout = getattr(args, "attention_dropout", 0.0)
+    args.activation_dropout = getattr(args, "activation_dropout", 0.0)
+    args.activation_fn = getattr(args, "activation_fn", "relu")
+    args.dropout = getattr(args, "dropout", 0.1)
+    args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None)
+    args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0)
+    args.share_decoder_input_output_embed = getattr(
+        args, "share_decoder_input_output_embed", False
+    )
+    args.share_all_embeddings = getattr(args, "share_all_embeddings", False)
+    args.no_token_positional_embeddings = getattr(
+        args, "no_token_positional_embeddings", False
+    )
+    args.adaptive_input = getattr(args, "adaptive_input", False)
+    args.apply_bert_init = getattr(args, "apply_bert_init", False)
+
+    args.decoder_output_dim = getattr(
+        args, "decoder_output_dim", args.decoder_embed_dim
+    )
+    args.decoder_input_dim = getattr(args, "decoder_input_dim", args.decoder_embed_dim)
+
+    # --- special arguments ---
+    args.sg_length_pred = getattr(args, "sg_length_pred", False)
+    args.pred_length_offset = getattr(args, "pred_length_offset", False)
+    args.length_loss_factor = getattr(args, "length_loss_factor", 0.1)
+    args.ngram_predictor = getattr(args, "ngram_predictor", 1)
+    args.src_embedding_copy = getattr(args, "src_embedding_copy", False)
+
+    args.train_step = getattr(args, "train_step", 4)  # same defaults as IterNATransformerModel.build_model
+    args.dae_ratio = getattr(args, "dae_ratio", 0.5)
+    args.stochastic_approx = getattr(args, "stochastic_approx", False)
+
+
+@register_model_architecture(
+    "iterative_nonautoregressive_transformer",
+    "iterative_nonautoregressive_transformer_wmt_en_de",
+)
+def iter_nat_wmt_en_de(args):  # second registered name; applies exactly the base defaults
+    inat_base_architecture(args)
diff --git a/fairseq/models/nat/levenshtein_transformer.py b/fairseq/models/nat/levenshtein_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..e1748145c3b37de434e82b49d41504867f487271
--- /dev/null
+++ b/fairseq/models/nat/levenshtein_transformer.py
@@ -0,0 +1,478 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from fairseq.iterative_refinement_generator import DecoderOut
+from fairseq.models import register_model, register_model_architecture
+from fairseq.models.transformer import (
+    Embedding,
+    TransformerDecoderLayer
+)
+
+from fairseq.models.nat import (
+    FairseqNATModel,
+    FairseqNATDecoder,
+    ensemble_decoder
+)
+
+from fairseq.modules.transformer_sentence_encoder import init_bert_params
+
+
+from .levenshtein_utils import (
+    _skip, _skip_encoder_out, _fill,
+    _get_ins_targets, _get_del_targets,
+    _apply_ins_masks, _apply_ins_words, _apply_del_words
+)
+
+
+@register_model("levenshtein_transformer")
+class LevenshteinTransformerModel(FairseqNATModel):  # three heads: word deletion, placeholder insertion, word insertion
+
+    @property
+    def allow_length_beam(self):
+        return False
+
+    @staticmethod
+    def add_args(parser):  # FairseqNATModel options plus the Levenshtein-specific ones below
+        FairseqNATModel.add_args(parser)
+        parser.add_argument(
+            "--early-exit",
+            default="6,6,6",
+            type=str,
+            help="number of decoder layers before word_del, mask_ins, word_ins",
+        )
+        parser.add_argument(
+            "--no-share-discriminator",
+            action="store_true",
+            help="separate parameters for discriminator",
+        )
+        parser.add_argument(
+            "--no-share-maskpredictor",
+            action="store_true",
+            help="separate parameters for mask-predictor",
+        )
+        parser.add_argument(
+            "--share-discriminator-maskpredictor",
+            action="store_true",
+            help="share the parameters for both mask-predictor and discriminator",
+        )
+        parser.add_argument(
+            "--sampling-for-deletion",
+            action='store_true',
+            help='instead of argmax, use sampling to predict the tokens'
+        )
+
+    @classmethod
+    def build_decoder(cls, args, tgt_dict, embed_tokens):  # returns a LevenshteinTransformerDecoder
+        decoder = LevenshteinTransformerDecoder(args, tgt_dict, embed_tokens)
+        if getattr(args, "apply_bert_init", False):
+            decoder.apply(init_bert_params)
+        return decoder
+
+    def forward(
+        self, src_tokens, src_lengths, prev_output_tokens, tgt_tokens, **kwargs
+    ):
+        # Training pass: returns one loss-spec dict each for "mask_ins", "word_ins" and "word_del".
+        assert tgt_tokens is not None, "forward function only supports training."
+
+        # encoding
+        encoder_out = self.encoder(src_tokens, src_lengths=src_lengths, **kwargs)
+
+        # generate training labels for insertion
+        masked_tgt_masks, masked_tgt_tokens, mask_ins_targets = _get_ins_targets(
+            prev_output_tokens, tgt_tokens, self.pad, self.unk
+        )
+        mask_ins_targets = mask_ins_targets.clamp(min=0, max=255)  # for safe prediction
+        mask_ins_masks = prev_output_tokens[:, 1:].ne(self.pad)
+
+        mask_ins_out, _ = self.decoder.forward_mask_ins(
+            normalize=False,
+            prev_output_tokens=prev_output_tokens,
+            encoder_out=encoder_out
+        )
+        word_ins_out, _ = self.decoder.forward_word_ins(
+            normalize=False,
+            prev_output_tokens=masked_tgt_tokens,
+            encoder_out=encoder_out
+        )
+
+        # make online prediction
+        if self.decoder.sampling_for_deletion:
+            word_predictions = torch.multinomial(
+                F.softmax(word_ins_out, -1).view(-1, word_ins_out.size(-1)), 1).view(
+                    word_ins_out.size(0), -1)
+        else:
+            word_predictions = F.log_softmax(word_ins_out, dim=-1).max(2)[1]
+
+        word_predictions.masked_scatter_(  # positions outside masked_tgt_masks take the reference token
+            ~masked_tgt_masks, tgt_tokens[~masked_tgt_masks]
+        )
+
+        # generate training labels for deletion
+        word_del_targets = _get_del_targets(word_predictions, tgt_tokens, self.pad)
+        word_del_out, _ = self.decoder.forward_word_del(
+            normalize=False,
+            prev_output_tokens=word_predictions,
+            encoder_out=encoder_out)
+        word_del_masks = word_predictions.ne(self.pad)
+
+        return {
+            "mask_ins": {
+                "out": mask_ins_out, "tgt": mask_ins_targets,
+                "mask": mask_ins_masks, "ls": 0.01,
+            },
+            "word_ins": {
+                "out": word_ins_out, "tgt": tgt_tokens,
+                "mask": masked_tgt_masks, "ls": self.args.label_smoothing,
+                "nll_loss": True
+            },
+            "word_del": {
+                "out": word_del_out, "tgt": word_del_targets,
+                "mask": word_del_masks
+            }
+        }
+
+    def forward_decoder(
+        self, decoder_out, encoder_out, eos_penalty=0.0, max_ratio=None, **kwargs
+    ):
+        # One refinement step (delete words, insert placeholders, fill placeholders); returns the updated decoder_out.
+        output_tokens = decoder_out.output_tokens
+        output_scores = decoder_out.output_scores
+        attn = decoder_out.attn
+        history = decoder_out.history
+
+        bsz = output_tokens.size(0)
+        if max_ratio is None:
+            max_lens = output_tokens.new_full((bsz,), 255)  # one cap per sentence, shape (bsz,) like the branch below
+        else:
+            if encoder_out.encoder_padding_mask is None:
+                max_src_len = encoder_out.encoder_out.size(0)
+                src_lens = encoder_out.encoder_out.new(bsz).fill_(max_src_len)
+            else:
+                src_lens = (~encoder_out.encoder_padding_mask).sum(1)
+            max_lens = (src_lens * max_ratio).clamp(min=10).long()
+
+        # delete words
+        # do not delete tokens if it is <s> </s>
+        can_del_word = output_tokens.ne(self.pad).sum(1) > 2
+        if can_del_word.sum() != 0:  # we cannot delete, skip
+            word_del_score, word_del_attn = self.decoder.forward_word_del(
+                normalize=True,
+                prev_output_tokens=_skip(output_tokens, can_del_word),
+                encoder_out=_skip_encoder_out(self.encoder, encoder_out, can_del_word)
+            )
+            word_del_pred = word_del_score.max(-1)[1].bool()
+
+            _tokens, _scores, _attn = _apply_del_words(
+                output_tokens[can_del_word],
+                output_scores[can_del_word],
+                word_del_attn,
+                word_del_pred,
+                self.pad,
+                self.bos,
+                self.eos,
+            )
+            output_tokens = _fill(output_tokens, can_del_word, _tokens, self.pad)
+            output_scores = _fill(output_scores, can_del_word, _scores, 0)
+            attn = _fill(attn, can_del_word, _attn, 0.)
+
+            if history is not None:
+                history.append(output_tokens.clone())
+
+        # insert placeholders
+        can_ins_mask = output_tokens.ne(self.pad).sum(1) < max_lens  # per-sentence comparison of two (bsz,) tensors
+        if can_ins_mask.sum() != 0:
+            mask_ins_score, _ = self.decoder.forward_mask_ins(
+                normalize=True,
+                prev_output_tokens=_skip(output_tokens, can_ins_mask),
+                encoder_out=_skip_encoder_out(self.encoder, encoder_out, can_ins_mask)
+            )
+            if eos_penalty > 0.0:
+                mask_ins_score[:, :, 0] = mask_ins_score[:, :, 0] - eos_penalty  # penalise class 0 of the placeholder-count head
+            mask_ins_pred = mask_ins_score.max(-1)[1]
+            mask_ins_pred = torch.min(
+                mask_ins_pred, max_lens[can_ins_mask, None].expand_as(mask_ins_pred)
+            )
+
+            _tokens, _scores = _apply_ins_masks(
+                output_tokens[can_ins_mask],
+                output_scores[can_ins_mask],
+                mask_ins_pred,
+                self.pad,
+                self.unk,
+                self.eos,
+            )
+            output_tokens = _fill(output_tokens, can_ins_mask, _tokens, self.pad)
+            output_scores = _fill(output_scores, can_ins_mask, _scores, 0)
+
+            if history is not None:
+                history.append(output_tokens.clone())
+
+        # insert words
+        can_ins_word = output_tokens.eq(self.unk).sum(1) > 0  # unk marks the placeholders to fill
+        if can_ins_word.sum() != 0:
+            word_ins_score, word_ins_attn = self.decoder.forward_word_ins(
+                normalize=True,
+                prev_output_tokens=_skip(output_tokens, can_ins_word),
+                encoder_out=_skip_encoder_out(self.encoder, encoder_out, can_ins_word)
+            )
+            word_ins_score, word_ins_pred = word_ins_score.max(-1)
+            _tokens, _scores = _apply_ins_words(
+                output_tokens[can_ins_word],
+                output_scores[can_ins_word],
+                word_ins_pred,
+                word_ins_score,
+                self.unk,
+            )
+
+            output_tokens = _fill(output_tokens, can_ins_word, _tokens, self.pad)
+            output_scores = _fill(output_scores, can_ins_word, _scores, 0)
+            attn = _fill(attn, can_ins_word, word_ins_attn, 0.)
+
+            if history is not None:
+                history.append(output_tokens.clone())
+
+        # delete some unnecessary paddings
+        cut_off = output_tokens.ne(self.pad).sum(1).max()
+        output_tokens = output_tokens[:, :cut_off]
+        output_scores = output_scores[:, :cut_off]
+        attn = None if attn is None else attn[:, :cut_off, :]
+
+        return decoder_out._replace(
+            output_tokens=output_tokens,
+            output_scores=output_scores,
+            attn=attn,
+            history=history
+        )
+
+    def initialize_output_tokens(self, encoder_out, src_tokens):  # start every sentence as the two tokens [bos, eos]
+        initial_output_tokens = src_tokens.new_zeros(src_tokens.size(0), 2)
+        initial_output_tokens[:, 0] = self.bos
+        initial_output_tokens[:, 1] = self.eos
+
+        initial_output_scores = initial_output_tokens.new_zeros(
+            *initial_output_tokens.size()
+        ).type_as(encoder_out.encoder_out)
+
+        return DecoderOut(
+            output_tokens=initial_output_tokens,
+            output_scores=initial_output_scores,
+            attn=None,
+            step=0,
+            max_step=0,
+            history=None
+        )
+
+
+class LevenshteinTransformerDecoder(FairseqNATDecoder):
+    def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False):  # set up the extra embeddings and optional per-head layer stacks
+        super().__init__(
+            args, dictionary, embed_tokens, no_encoder_attn=no_encoder_attn
+        )
+        self.dictionary = dictionary
+        self.bos = dictionary.bos()
+        self.unk = dictionary.unk()
+        self.eos = dictionary.eos()
+        self.sampling_for_deletion = getattr(args, "sampling_for_deletion", False)
+        self.embed_mask_ins = Embedding(256, self.output_embed_dim * 2, None)  # 256 rows; the model clamps mask_ins targets to 0..255
+        self.embed_word_del = Embedding(2, self.output_embed_dim, None)  # 2 rows; presumably keep / delete - TODO confirm
+
+        # del_word, ins_mask, ins_word
+        self.early_exit = [int(i) for i in args.early_exit.split(',')]
+        assert len(self.early_exit) == 3
+
+        # copy layers for mask-predict/deletion
+        self.layers_msk = None  # None means no dedicated mask-predictor stack was built
+        if getattr(args, "no_share_maskpredictor", False):
+            self.layers_msk = nn.ModuleList([
+                                TransformerDecoderLayer(args, no_encoder_attn)
+                                for _ in range(self.early_exit[1])
+                            ])
+        self.layers_del = None  # None means no dedicated deletion stack was built
+        if getattr(args, "no_share_discriminator", False):
+            self.layers_del = nn.ModuleList([
+                                TransformerDecoderLayer(args, no_encoder_attn)
+                                for _ in range(self.early_exit[0])
+                            ])
+
+        if getattr(args, "share_discriminator_maskpredictor", False):
+            assert getattr(args, "no_share_discriminator", False), "must set saperate discriminator"
+            self.layers_msk = self.layers_del  # both heads use the same ModuleList object (overrides any stack built above)
+
+    def extract_features(
+        self, prev_output_tokens, encoder_out=None, early_exit=None, layers=None, **unused
+    ):
+        """
+        Similar to *forward* but only return features.
+        Inputs:
+            prev_output_tokens: Tensor(B, T)
+            encoder_out: object with `.encoder_out` and `.encoder_padding_mask` attributes, or None
+
+        Returns:
+            tuple:
+                - the decoder's features of shape `(batch, tgt_len, embed_dim)`
+                - a dictionary with any model-specific outputs
+            the LevenshteinTransformer decoder has full-attention to all generated tokens
+        """
+        # embed positions
+        positions = (
+            self.embed_positions(prev_output_tokens)
+            if self.embed_positions is not None
+            else None
+        )
+
+        # embed tokens and positions
+        x = self.embed_scale * self.embed_tokens(prev_output_tokens)
+        if self.project_in_dim is not None:
+            x = self.project_in_dim(x)
+
+        if positions is not None:
+            x += positions
+        x = self.dropout_module(x)
+
+        # B x T x C -> T x B x C
+        x = x.transpose(0, 1)
+        attn = None
+        inner_states = [x]
+
+        # decoder layers
+        decoder_padding_mask = prev_output_tokens.eq(self.padding_idx)
+        layers = self.layers if layers is None else layers  # default to the decoder's own layer stack
+        early_exit = len(layers) if early_exit is None else early_exit  # default: run every layer
+        for _, layer in enumerate(layers[: early_exit]):
+            x, attn, _ = layer(
+                x,
+                encoder_out.encoder_out if encoder_out is not None else None,
+                encoder_out.encoder_padding_mask if encoder_out is not None else None,
+                self_attn_mask=None,  # no self-attention mask is passed
+                self_attn_padding_mask=decoder_padding_mask,
+            )
+            inner_states.append(x)
+
+        if self.layer_norm:
+            x = self.layer_norm(x)
+
+        # T x B x C -> B x T x C
+        x = x.transpose(0, 1)
+
+        if self.project_out_dim is not None:
+            x = self.project_out_dim(x)
+
+        return x, {"attn": attn, "inner_states": inner_states}  # attn comes from the last layer that ran
+
+    @ensemble_decoder
+    def forward_mask_ins(self, normalize, encoder_out, prev_output_tokens, **unused):
+        features, extra = self.extract_features(
+            prev_output_tokens, encoder_out=encoder_out, early_exit=self.early_exit[1], layers=self.layers_msk, **unused
+        )
+        features_cat = torch.cat([features[:, :-1, :], features[:, 1:, :]], 2)
+        decoder_out = F.linear(features_cat, self.embed_mask_ins.weight)
+        if normalize:
+            return F.log_softmax(decoder_out, -1), extra['attn']
+        return decoder_out, extra['attn']
+
+    @ensemble_decoder
+    def forward_word_ins(self, normalize, encoder_out, prev_output_tokens, **unused):
+        features, extra = self.extract_features(
+            prev_output_tokens, encoder_out=encoder_out, early_exit=self.early_exit[2], layers=self.layers, **unused
+        )
+        decoder_out = self.output_layer(features)
+        if normalize:
+            return F.log_softmax(decoder_out, -1), extra['attn']
+        return decoder_out, extra['attn']
+
+    @ensemble_decoder
+    def forward_word_del(self, normalize, encoder_out, prev_output_tokens, **unused):
+        features, extra = self.extract_features(
+            prev_output_tokens, encoder_out=encoder_out, early_exit=self.early_exit[0], layers=self.layers_del, **unused
+        )
+        decoder_out = F.linear(features, self.embed_word_del.weight)
+        if normalize:
+            return F.log_softmax(decoder_out, -1), extra['attn']
+        return decoder_out, extra['attn']
+
+
+@register_model_architecture("levenshtein_transformer", "levenshtein_transformer")
+def levenshtein_base_architecture(args):
+    """Give every unset hyper-parameter of the base Levenshtein Transformer its default.
+
+    Attributes already present on ``args`` are left alone. Some decoder defaults
+    are read from encoder/decoder values resolved earlier, so the order matters.
+    """
+
+    def _default(name, value):
+        setattr(args, name, getattr(args, name, value))
+
+    _default("encoder_embed_path", None)
+    _default("encoder_embed_dim", 512)
+    _default("encoder_ffn_embed_dim", 2048)
+    _default("encoder_layers", 6)
+    _default("encoder_attention_heads", 8)
+    _default("encoder_normalize_before", False)
+    _default("encoder_learned_pos", False)
+    _default("decoder_embed_path", None)
+    _default("decoder_embed_dim", args.encoder_embed_dim)
+    _default("decoder_ffn_embed_dim", args.encoder_ffn_embed_dim)
+    _default("decoder_layers", 6)
+    _default("decoder_attention_heads", 8)
+    _default("decoder_normalize_before", False)
+    _default("decoder_learned_pos", False)
+    _default("attention_dropout", 0.0)
+    _default("activation_dropout", 0.0)
+    _default("activation_fn", "relu")
+    _default("dropout", 0.1)
+    _default("adaptive_softmax_cutoff", None)
+    _default("adaptive_softmax_dropout", 0)
+    _default("share_decoder_input_output_embed", False)
+    _default("share_all_embeddings", False)
+    _default("no_token_positional_embeddings", False)
+    _default("adaptive_input", False)
+    _default("apply_bert_init", False)
+    _default("decoder_output_dim", args.decoder_embed_dim)
+    _default("sampling_for_deletion", False)
+    _default("decoder_input_dim", args.decoder_embed_dim)
+    _default("early_exit", "6,6,6")
+    _default("no_share_discriminator", False)
+    _default("no_share_maskpredictor", False)
+    _default("share_discriminator_maskpredictor", False)
+    _default("no_share_last_layer", False)
+
+
+@register_model_architecture("levenshtein_transformer", "levenshtein_transformer_wmt_en_de")
+def levenshtein_transformer_wmt_en_de(args):
+    """Register the base configuration under the WMT En-De architecture name."""
+    # no overrides of its own: every default comes from the base architecture
+    levenshtein_base_architecture(args)
+
+
+# similar parameters used in the "Attention Is All You Need" paper (Vaswani et al., 2017)
+@register_model_architecture("levenshtein_transformer", "levenshtein_transformer_vaswani_wmt_en_de_big")
+def levenshtein_transformer_vaswani_wmt_en_de_big(args):
+    """Apply the "big" Transformer sizes to unset fields, then the base defaults."""
+    big_defaults = (
+        ("encoder_embed_dim", 1024), ("encoder_ffn_embed_dim", 4096),
+        ("encoder_attention_heads", 16), ("encoder_normalize_before", False),
+        ("decoder_embed_dim", 1024), ("decoder_ffn_embed_dim", 4096),
+        ("decoder_attention_heads", 16), ("dropout", 0.3),
+    )
+    for name, value in big_defaults:
+        # only fill attributes the caller has not set
+        setattr(args, name, getattr(args, name, value))
+    levenshtein_base_architecture(args)
+
+
+# default parameters used in tensor2tensor implementation
+@register_model_architecture("levenshtein_transformer", "levenshtein_transformer_wmt_en_de_big")
+def levenshtein_transformer_wmt_en_de_big_t2t(args):
+    """Default to normalize-before layers and 0.1 attention/activation dropout, then the big defaults."""
+    t2t_defaults = {"encoder_normalize_before": True, "decoder_normalize_before": True,
+                    "attention_dropout": 0.1, "activation_dropout": 0.1}
+    for name, value in t2t_defaults.items():
+        # only fill attributes the caller has not set
+        setattr(args, name, getattr(args, name, value))
+    levenshtein_transformer_vaswani_wmt_en_de_big(args)
diff --git a/fairseq/models/nat/levenshtein_utils.py b/fairseq/models/nat/levenshtein_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..e29b1fa27cb92f2ce3dd2235d3d6ece910c3ffe5
--- /dev/null
+++ b/fairseq/models/nat/levenshtein_utils.py
@@ -0,0 +1,284 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from fairseq.utils import new_arange
+
+
+# -------------- Helper Functions --------------------------------------------------- #
+
+def load_libnat():
+    """Return `(module, is_cuda)`: `libnat_cuda` if it imports, otherwise the CPU `libnat`."""
+    try:
+        from fairseq import libnat_cuda
+    except ImportError as cuda_error:
+        print(str(cuda_error) + '... fall back to CPU version')
+        try:
+            from fairseq import libnat
+        except ImportError as cpu_error:
+            import sys
+            sys.stderr.write("ERROR: missing libnat_cuda. run `python setup.py build_ext --inplace`\n")
+            raise cpu_error
+        else:
+            return libnat, False
+    else:
+        return libnat_cuda, True
+
+
+def _get_ins_targets(in_tokens, out_tokens, padding_idx, unk_idx):  # -> (masked_tgt_masks, masked_tgt_tokens, mask_ins_targets)
+    libnat, use_cuda = load_libnat()  # the helper module is resolved on every call
+
+    def _get_ins_targets_cuda(in_tokens, out_tokens, padding_idx, unk_idx):
+        in_masks = in_tokens.ne(padding_idx)
+        out_masks = out_tokens.ne(padding_idx)
+        mask_ins_targets, masked_tgt_masks = libnat.generate_insertion_labels(
+            out_tokens.int(), libnat.levenshtein_distance(
+                in_tokens.int(), out_tokens.int(),
+                in_masks.sum(1).int(), out_masks.sum(1).int()  # unpadded lengths of both sides
+            )
+        )
+        masked_tgt_masks = masked_tgt_masks.bool() & out_masks  # padding is never marked as a placeholder
+        mask_ins_targets = mask_ins_targets.type_as(
+            in_tokens)[:, 1:in_masks.size(1)].masked_fill_(~in_masks[:, 1:], 0)  # drop column 0, zero the padded slots
+        masked_tgt_tokens = out_tokens.masked_fill(masked_tgt_masks, unk_idx)  # masked target positions become unk
+        return masked_tgt_masks, masked_tgt_tokens, mask_ins_targets
+
+    def _get_ins_targets_cpu(in_tokens, out_tokens, padding_idx, unk_idx):
+        in_seq_len, out_seq_len = in_tokens.size(1), out_tokens.size(1)
+
+        in_tokens_list = [  # per-sentence token lists with padding removed
+            [t for t in s if t != padding_idx] for i, s in enumerate(in_tokens.tolist())
+        ]
+        out_tokens_list = [
+            [t for t in s if t != padding_idx]
+            for i, s in enumerate(out_tokens.tolist())
+        ]
+
+        full_labels = libnat.suggested_ed2_path(
+            in_tokens_list, out_tokens_list, padding_idx
+        )
+        mask_inputs = [  # NOTE(review): presumably each `c` is the run of tokens to insert at one slot - confirm against libnat
+            [len(c) if c[0] != padding_idx else 0 for c in a[:-1]] for a in full_labels
+        ]
+
+        # generate labels
+        masked_tgt_masks = []
+        for mask_input in mask_inputs:
+            mask_label = []
+            for beam_size in mask_input[1:-1]:  # HACK 1:-1
+                mask_label += [0] + [1 for _ in range(beam_size)]  # one 0, then a 1 per inserted token
+            masked_tgt_masks.append(
+                mask_label + [0 for _ in range(out_seq_len - len(mask_label))]  # right-pad to the target width
+            )
+        mask_ins_targets = [
+            mask_input[1:-1] + [0 for _ in range(in_seq_len - 1 - len(mask_input[1:-1]))]  # right-pad to in_seq_len - 1
+            for mask_input in mask_inputs
+        ]
+
+        # transform to tensor
+        masked_tgt_masks = torch.tensor(
+            masked_tgt_masks, device=out_tokens.device
+        ).bool()
+        mask_ins_targets = torch.tensor(mask_ins_targets, device=in_tokens.device)
+        masked_tgt_tokens = out_tokens.masked_fill(masked_tgt_masks, unk_idx)  # masked target positions become unk
+        return masked_tgt_masks, masked_tgt_tokens, mask_ins_targets
+
+    if use_cuda:
+        return _get_ins_targets_cuda(in_tokens, out_tokens, padding_idx, unk_idx)
+    return _get_ins_targets_cpu(in_tokens, out_tokens, padding_idx, unk_idx)
+
+
+def _get_del_targets(in_tokens, out_tokens, padding_idx):  # -> word_del_targets tensor
+    libnat, use_cuda = load_libnat()  # the helper module is resolved on every call
+
+    def _get_del_targets_cuda(in_tokens, out_tokens, padding_idx):
+        in_masks = in_tokens.ne(padding_idx)
+        out_masks = out_tokens.ne(padding_idx)
+
+        word_del_targets = libnat.generate_deletion_labels(
+            in_tokens.int(),
+            libnat.levenshtein_distance(
+                in_tokens.int(), out_tokens.int(),
+                in_masks.sum(1).int(), out_masks.sum(1).int()  # unpadded lengths of both sides
+            )
+        )
+        word_del_targets = word_del_targets.type_as(in_tokens).masked_fill_(~in_masks, 0)  # labels on input padding are zeroed
+        return word_del_targets
+
+    def _get_del_targets_cpu(in_tokens, out_tokens, padding_idx):
+        out_seq_len = out_tokens.size(1)  # NOTE(review): labels are padded to the out_tokens width, while the CUDA path is in_tokens-shaped - confirm
+        with torch.cuda.device_of(in_tokens):
+            in_tokens_list = [  # per-sentence token lists with padding removed
+                [t for t in s if t != padding_idx] for i, s in enumerate(in_tokens.tolist())
+            ]
+            out_tokens_list = [
+                [t for t in s if t != padding_idx]
+                for i, s in enumerate(out_tokens.tolist())
+            ]
+
+        full_labels = libnat.suggested_ed2_path(
+            in_tokens_list, out_tokens_list, padding_idx
+        )
+        word_del_targets = [b[-1] for b in full_labels]  # the last entry of each path is used as the deletion labels
+        word_del_targets = [
+            labels + [0 for _ in range(out_seq_len - len(labels))]  # right-pad with 0
+            for labels in word_del_targets
+        ]
+
+        # transform to tensor
+        word_del_targets = torch.tensor(word_del_targets, device=out_tokens.device)
+        return word_del_targets
+
+    if use_cuda:
+        return _get_del_targets_cuda(in_tokens, out_tokens, padding_idx)
+    return _get_del_targets_cpu(in_tokens, out_tokens, padding_idx)
+
+
+def _apply_ins_masks(
+    in_tokens, in_scores, mask_ins_pred, padding_idx, unk_idx, eos_idx
+):
+    """Insert `mask_ins_pred[b, i]` `unk_idx` placeholders between tokens i and i+1; the tensor arguments are modified in place."""
+    in_masks = in_tokens.ne(padding_idx)
+    in_lengths = in_masks.sum(1)
+
+    # HACK: hacky way to shift all the paddings to eos first.
+    in_tokens.masked_fill_(~in_masks, eos_idx)
+    mask_ins_pred.masked_fill_(~in_masks[:, 1:], 0)  # nothing is inserted in front of padding
+
+    out_lengths = in_lengths + mask_ins_pred.sum(1)  # old length plus the number of new placeholders
+    out_max_len = out_lengths.max()
+    out_masks = (
+        new_arange(out_lengths, out_max_len)[None, :]
+        < out_lengths[:, None]
+    )
+
+    reordering = (mask_ins_pred + in_masks[:, 1:].long()).cumsum(1)  # destination column of every input token after the first
+    out_tokens = (
+        in_tokens.new_zeros(in_tokens.size(0), out_max_len)
+        .fill_(padding_idx)
+        .masked_fill_(out_masks, unk_idx)  # start as unk inside the new length, padding outside
+    )
+    out_tokens[:, 0] = in_tokens[:, 0]
+    out_tokens.scatter_(1, reordering, in_tokens[:, 1:])  # slots not written here stay unk
+
+    out_scores = None
+    if in_scores is not None:
+        in_scores.masked_fill_(~in_masks, 0)
+        out_scores = in_scores.new_zeros(*out_tokens.size())  # new placeholder slots get score 0
+        out_scores[:, 0] = in_scores[:, 0]
+        out_scores.scatter_(1, reordering, in_scores[:, 1:])
+
+    return out_tokens, out_scores
+
+
+def _apply_ins_words(in_tokens, in_scores, word_ins_pred, word_ins_scores, unk_idx):
+    """
+    Fill every placeholder (`unk_idx`) in `in_tokens` with the prediction at that position.
+    Scores are updated at the same positions; a None `in_scores` yields None scores.
+    """
+    placeholder = in_tokens == unk_idx
+    filled_tokens = in_tokens.masked_scatter(placeholder, word_ins_pred[placeholder])
+    if in_scores is None:
+        return filled_tokens, None
+
+    filled_scores = in_scores.masked_scatter(
+        placeholder, word_ins_scores[placeholder]
+    )
+    return filled_tokens, filled_scores
+
+
+def _apply_del_words(
+    in_tokens, in_scores, in_attn, word_del_pred, padding_idx, bos_idx, eos_idx
+):
+    # drop the tokens flagged in word_del_pred and move the kept ones to the front of each row; word_del_pred is modified in place
+    in_masks = in_tokens.ne(padding_idx)
+    bos_eos_masks = in_tokens.eq(bos_idx) | in_tokens.eq(eos_idx)
+
+    max_len = in_tokens.size(1)
+    word_del_pred.masked_fill_(~in_masks, 1)  # padding counts as deleted
+    word_del_pred.masked_fill_(bos_eos_masks, 0)  # bos/eos are never deleted
+
+    reordering = (
+        new_arange(in_tokens)
+        .masked_fill_(word_del_pred, max_len)  # deleted positions get the largest key, so they sort last
+        .sort(1)[1]
+    )
+
+    out_tokens = in_tokens.masked_fill(word_del_pred, padding_idx).gather(1, reordering)  # deleted slots become padding
+
+    out_scores = None
+    if in_scores is not None:
+        out_scores = in_scores.masked_fill(word_del_pred, 0).gather(1, reordering)  # deleted slots get score 0
+
+    out_attn = None
+    if in_attn is not None:
+        _mask = word_del_pred[:, :, None].expand_as(in_attn)  # broadcast the per-token decision over the last dim
+        _reordering = reordering[:, :, None].expand_as(in_attn)
+        out_attn = in_attn.masked_fill(_mask, 0.).gather(1, _reordering)
+
+    return out_tokens, out_scores, out_attn
+
+
+def _skip(x, mask):
+    """
+    Keep only the entries of `x` selected by boolean `mask`; ints and None pass through.
+    Tensors are indexed on dim 0, else on dim 1 if that size matches; lists/dicts recurse.
+    """
+    if x is None or isinstance(x, int):
+        return x
+
+    if isinstance(x, torch.Tensor):
+        batch = mask.size(0)
+        if x.size(0) == batch:
+            return x[mask]
+        if x.size(1) == batch:
+            return x[:, mask]
+        # no matching dimension: fall through to the final error
+
+    if isinstance(x, list):
+        return [_skip(item, mask) for item in x]
+
+    if isinstance(x, dict):
+        return {key: _skip(value, mask) for key, value in x.items()}
+
+    raise NotImplementedError
+
+
+def _skip_encoder_out(encoder, encoder_out, mask):
+    """Reorder `encoder_out` to the rows where `mask` is True; it is returned unchanged if none are."""
+    if not mask.any():
+        return encoder_out
+    return encoder.reorder_encoder_out(encoder_out, mask.nonzero().squeeze(-1))  # squeeze(-1) keeps the index 1-D for a single hit
+
+
+def _fill(x, mask, y, padding_idx):
+    """
+    Write the rows of `y` into the rows of `x` selected by `mask` (dim=0).
+    When the widths differ, the shorter side is padded with `padding_idx` along dim 1.
+    """
+    if x is None:
+        return y
+    # shape contract: same rank, batch-sized mask, 2-D or feature-matched 3-D tensors
+    assert x.dim() == y.dim() and mask.size(0) == x.size(0)
+    assert x.dim() == 2 or (x.dim() == 3 and x.size(2) == y.size(2))
+    n_selected = mask.sum()
+    assert n_selected == y.size(0)
+    if n_selected == x.size(0):
+        # every row is replaced, so `y` already is the result
+        return y
+
+    x_width, y_width = x.size(1), y.size(1)
+    if x_width == y_width:
+        x[mask] = y
+    elif x_width > y_width:
+        # wipe the selected rows, then copy `y` into their leading columns
+        x[mask] = padding_idx
+        x[mask, :y_width] = y
+    else:
+        # grow `x` with padding columns until it is as wide as `y`
+        extra = [x.size(0), y_width - x_width] + list(x.shape[2:])
+        x = torch.cat([x, x.new_zeros(*extra).fill_(padding_idx)], 1)
+        x[mask] = y
+    return x
diff --git a/fairseq/models/nat/nat_crf_transformer.py b/fairseq/models/nat/nat_crf_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..8dd3a08f72abf81800ff9f5c2c46422e2d865f2d
--- /dev/null
+++ b/fairseq/models/nat/nat_crf_transformer.py
@@ -0,0 +1,107 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from fairseq.models.nat import NATransformerModel, base_architecture
+from fairseq.models import register_model, register_model_architecture
+from fairseq.modules import DynamicCRF
+
+
+@register_model("nacrf_transformer")
+class NACRFTransformerModel(NATransformerModel):  # NAT model with a DynamicCRF layer applied to the decoder outputs
+    def __init__(self, args, encoder, decoder):
+        super().__init__(args, encoder, decoder)
+        self.crf_layer = DynamicCRF(
+            num_embedding=len(self.tgt_dict),
+            low_rank=args.crf_lowrank_approx,
+            beam_size=args.crf_beam_approx
+        )
+
+    @property
+    def allow_ensemble(self):  # always False for this model
+        return False
+
+    @staticmethod
+    def add_args(parser):  # parent NAT options plus the three CRF-specific options below
+        NATransformerModel.add_args(parser)
+        parser.add_argument("--crf-lowrank-approx", type=int,
+                            help="the dimension of low-rank approximation of transition")
+        parser.add_argument("--crf-beam-approx", type=int,
+                            help="the beam size for apporixmating the normalizing factor")
+        parser.add_argument("--word-ins-loss-factor", type=float,
+                            help="weights on NAT loss used to co-training with CRF loss.")
+
+    def forward(
+        self, src_tokens, src_lengths, prev_output_tokens, tgt_tokens, **kwargs
+    ):  # returns a dict with "word_ins", "word_crf" and "length" entries
+        # encoding
+        encoder_out = self.encoder(src_tokens, src_lengths=src_lengths, **kwargs)
+
+        # length prediction
+        length_out = self.decoder.forward_length(normalize=False, encoder_out=encoder_out)
+        length_tgt = self.decoder.forward_length_prediction(length_out, encoder_out, tgt_tokens)
+
+        # decoding
+        word_ins_out = self.decoder(
+            normalize=False,
+            prev_output_tokens=prev_output_tokens,
+            encoder_out=encoder_out)
+        word_ins_tgt, word_ins_mask = tgt_tokens, tgt_tokens.ne(self.pad)  # mask is True on non-pad target tokens
+
+        # compute the log-likelihood of CRF
+        crf_nll = -self.crf_layer(word_ins_out, word_ins_tgt, word_ins_mask)
+        crf_nll = (crf_nll / word_ins_mask.type_as(crf_nll).sum(-1)).mean()  # divide by the non-pad count per row, then batch mean
+
+        return {
+            "word_ins": {
+                "out": word_ins_out, "tgt": word_ins_tgt,
+                "mask": word_ins_mask, "ls": self.args.label_smoothing,
+                "nll_loss": True, "factor": self.args.word_ins_loss_factor
+            },
+            "word_crf": {
+                "loss": crf_nll
+            },
+            "length": {
+                "out": length_out, "tgt": length_tgt,
+                "factor": self.decoder.length_loss_factor
+            }
+        }
+
+    def forward_decoder(self, decoder_out, encoder_out, decoding_format=None, **kwargs):  # one decoding pass through decoder + CRF
+        output_tokens = decoder_out.output_tokens
+        output_scores = decoder_out.output_scores
+        history = decoder_out.history
+
+        # execute the decoder and get emission scores
+        output_masks = output_tokens.ne(self.pad)
+        word_ins_out = self.decoder(
+            normalize=False,
+            prev_output_tokens=output_tokens,
+            encoder_out=encoder_out
+        )
+
+        # run viterbi decoding through CRF
+        _scores, _tokens = self.crf_layer.forward_decoder(word_ins_out, output_masks)
+        output_tokens.masked_scatter_(output_masks, _tokens[output_masks])  # in place; only non-pad positions are overwritten
+        output_scores.masked_scatter_(output_masks, _scores[output_masks])
+        if history is not None:
+            history.append(output_tokens.clone())  # snapshot, since output_tokens is mutated in place
+
+        return decoder_out._replace(
+            output_tokens=output_tokens,
+            output_scores=output_scores,
+            attn=None,
+            history=history
+        )
+
+
+@register_model_architecture("nacrf_transformer", "nacrf_transformer")
+def nacrf_base_architecture(args):
+    """Fill unset CRF options and normalize-before flags, then apply the NAT base defaults."""
+    crf_defaults = {"crf_lowrank_approx": 32, "crf_beam_approx": 64, "word_ins_loss_factor": 0.5,
+                    "encoder_normalize_before": True, "decoder_normalize_before": True}
+    for name, value in crf_defaults.items():
+        setattr(args, name, getattr(args, name, value))
+    base_architecture(args)
diff --git a/fairseq/models/nat/nonautoregressive_ensembles.py b/fairseq/models/nat/nonautoregressive_ensembles.py
new file mode 100644
index 0000000000000000000000000000000000000000..2ed4d956e027c9776dcd00bf94cd14af9ece1c39
--- /dev/null
+++ b/fairseq/models/nat/nonautoregressive_ensembles.py
@@ -0,0 +1,231 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+
+import torch
+import torch.nn.functional as F
+
+from fairseq.models.nat import (
+    _fill,
+    _skip,
+    _skip_encoder_out,
+    _apply_ins_masks,
+    _apply_ins_words,
+    _apply_del_words,
+)
+
+
+class _EnsembleModelEncoder(object):
+    """Encoder stand-in that forwards `reorder_encoder_out` to every model's own encoder."""
+    def __init__(self, models):
+        self.models = models
+
+    def reorder_encoder_out(self, encoder_outs, new_order):
+        return [
+            member.encoder.reorder_encoder_out(member_out, new_order)
+            for member, member_out in zip(self.models, encoder_outs)
+        ]
+
+
+class BasicEnsembleModel(torch.nn.Module):
+    """Base wrapper holding several models plus the special symbols of the first model's dictionary."""
+
+    def __init__(self, models):
+        super().__init__()
+        self.models = torch.nn.ModuleList(models)
+        dictionary = self.models[0].decoder.dictionary
+        self.bos, self.eos = dictionary.bos(), dictionary.eos()
+        self.pad, self.unk = dictionary.pad(), dictionary.unk()
+        self.encoder = _EnsembleModelEncoder(self.models)
+
+    def has_encoder(self):  # decided by the first model only
+        return hasattr(self.models[0], 'encoder')
+
+    def max_decoder_positions(self):  # the tightest limit among the models
+        return min(member.max_decoder_positions() for member in self.models)
+
+    @torch.no_grad()
+    def forward_encoder(self, encoder_input):
+        """Encode with every model; returns one output per model, or None without an encoder."""
+        if self.has_encoder():
+            return [member.forward_encoder(encoder_input) for member in self.models]
+        return None
+
+    @torch.no_grad()
+    def forward_decoder(self, *inputs):
+        raise NotImplementedError
+
+    def initialize_output_tokens(self, *inputs):
+        raise NotImplementedError
+
+
+class EnsembleLevT(BasicEnsembleModel):
+    """Ensemble of Levenshtein Transformers that averages the model scores at every edit step."""
+
+    def __init__(self, models):
+        super().__init__(models)
+
+    @torch.no_grad()
+    def forward_decoder(self, decoder_out, encoder_outs, eos_penalty=0.0, max_ratio=None, **kwargs):
+        """Run one LevT refinement pass: word deletion, placeholder insertion, word insertion.
+
+        The model scores are averaged inside every step, because each step depends on the
+        tokens produced by the previous one. Returns `decoder_out` with updated tokens/scores/attn."""
+        output_tokens = decoder_out.output_tokens
+        output_scores = decoder_out.output_scores
+        attn = decoder_out.attn
+
+        bsz = output_tokens.size(0)
+        if max_ratio is None:
+            max_lens = output_tokens.new_full((bsz,), 255)  # one cap per sentence (a bare new() is an empty tensor)
+        else:
+            if encoder_outs[0].encoder_padding_mask is None:
+                src_lens = encoder_outs[0].encoder_out.new(bsz).fill_(encoder_outs[0].encoder_out.size(0))  # assumes time-major T x B x C
+            else:
+                src_lens = (~encoder_outs[0].encoder_padding_mask).sum(1)
+            max_lens = (src_lens * max_ratio).clamp(min=10).long()
+
+        # delete words
+        # do not delete tokens if it is <s> </s>
+        can_del_word = output_tokens.ne(self.pad).sum(1) > 2
+        if can_del_word.sum() != 0:  # skip the step when no row can delete
+            output_tokens, output_scores, attn = self.forward_word_del(
+                encoder_outs,
+                output_tokens,
+                output_scores,
+                attn,
+                can_del_word,
+            )
+
+        # insert placeholders
+        can_ins_mask = output_tokens.ne(self.pad).sum(1) < max_lens
+        if can_ins_mask.sum() != 0:
+            output_tokens, output_scores = self.forward_mask_ins(
+                encoder_outs,
+                output_tokens,
+                output_scores,
+                can_ins_mask,
+                eos_penalty,
+                max_lens,
+            )
+
+        # insert words
+        can_ins_word = output_tokens.eq(self.unk).sum(1) > 0
+        if can_ins_word.sum() != 0:
+            output_tokens, output_scores, attn = self.forward_word_ins(
+                encoder_outs,
+                output_tokens,
+                output_scores,
+                attn,
+                can_ins_word,
+            )
+
+        # delete some unnecessary paddings
+        cut_off = output_tokens.ne(self.pad).sum(1).max()
+        output_tokens = output_tokens[:, :cut_off]
+        output_scores = output_scores[:, :cut_off]
+        attn = None if attn is None else attn[:, :cut_off, :]
+        return decoder_out._replace(
+            output_tokens=output_tokens,
+            output_scores=output_scores,
+            attn=attn,
+            history=None
+        )
+
+    def forward_word_del(self, encoder_outs, output_tokens, output_scores, attn, can_del_word):
+        """Delete tokens in the rows selected by `can_del_word`, using ensemble-averaged deletion scores."""
+        word_del_score_avg = []
+        word_del_attn_avg = []
+        for model, encoder_out in zip(self.models, encoder_outs):
+            word_del_out, word_del_attn = model.decoder.forward_word_del(
+                normalize=False,  # keywords: the decoder signature is (normalize, encoder_out, prev_output_tokens)
+                prev_output_tokens=_skip(output_tokens, can_del_word),
+                encoder_out=_skip_encoder_out(model.encoder, encoder_out, can_del_word),
+            )
+            word_del_score = F.log_softmax(word_del_out, 2)
+            word_del_score_avg.append(word_del_score)
+            word_del_attn_avg.append(word_del_attn)
+        word_del_score_avg = torch.logsumexp(torch.stack(word_del_score_avg, dim=0), dim=0) - math.log(len(self.models))
+        word_del_pred = word_del_score_avg.max(-1)[1].bool()
+        # reduce over the model dimension; stacking and dividing alone would keep one slice per model
+        word_del_attn_avg = None if word_del_attn_avg[0] is None else torch.stack(word_del_attn_avg, dim=0).mean(dim=0)
+
+        _tokens, _scores, _attn = _apply_del_words(
+            output_tokens[can_del_word],
+            output_scores[can_del_word],
+            word_del_attn_avg,
+            word_del_pred,
+            self.pad,
+            self.bos,
+            self.eos,
+        )
+        output_tokens = _fill(output_tokens, can_del_word, _tokens, self.pad)
+        output_scores = _fill(output_scores, can_del_word, _scores, 0)
+        attn = _fill(attn, can_del_word, _attn, 0.)
+        return output_tokens, output_scores, attn
+
+    def forward_mask_ins(self, encoder_outs, output_tokens, output_scores, can_ins_mask, eos_penalty, max_lens):
+        """Insert placeholders in the rows selected by `can_ins_mask`, using ensemble-averaged scores."""
+        mask_ins_score_avg = []
+        for model, encoder_out in zip(self.models, encoder_outs):
+            mask_ins_out, _ = model.decoder.forward_mask_ins(
+                normalize=False,  # keywords: the decoder signature is (normalize, encoder_out, prev_output_tokens)
+                prev_output_tokens=_skip(output_tokens, can_ins_mask),
+                encoder_out=_skip_encoder_out(model.encoder, encoder_out, can_ins_mask),
+            )
+            mask_ins_score = F.log_softmax(mask_ins_out, 2)
+            if eos_penalty > 0.0:
+                mask_ins_score[:, :, 0] -= eos_penalty  # penalise class 0 (insert nothing)
+            mask_ins_score_avg.append(mask_ins_score)
+        mask_ins_score_avg = torch.logsumexp(torch.stack(mask_ins_score_avg, dim=0), dim=0) - math.log(len(self.models))
+        mask_ins_pred = mask_ins_score_avg.max(-1)[1]
+        mask_ins_pred = torch.min(mask_ins_pred, max_lens[can_ins_mask, None].expand_as(mask_ins_pred))  # cap per-slot insertions
+        _tokens, _scores = _apply_ins_masks(
+            output_tokens[can_ins_mask],
+            output_scores[can_ins_mask],
+            mask_ins_pred,
+            self.pad,
+            self.unk,
+            self.eos,
+        )
+        output_tokens = _fill(output_tokens, can_ins_mask, _tokens, self.pad)
+        output_scores = _fill(output_scores, can_ins_mask, _scores, 0)
+        return output_tokens, output_scores
+
+    def forward_word_ins(self, encoder_outs, output_tokens, output_scores, attn, can_ins_word):
+        """Replace placeholders in the rows selected by `can_ins_word` with ensemble-averaged word predictions."""
+        word_ins_score_avg = []
+        word_ins_attn_avg = []
+        for model, encoder_out in zip(self.models, encoder_outs):
+            word_ins_out, word_ins_attn = model.decoder.forward_word_ins(
+                normalize=False,  # keywords: the decoder signature is (normalize, encoder_out, prev_output_tokens)
+                prev_output_tokens=_skip(output_tokens, can_ins_word),
+                encoder_out=_skip_encoder_out(model.encoder, encoder_out, can_ins_word),
+            )
+            word_ins_score = F.log_softmax(word_ins_out, 2)
+            word_ins_score_avg.append(word_ins_score)
+            word_ins_attn_avg.append(word_ins_attn)
+        word_ins_score_avg = torch.logsumexp(torch.stack(word_ins_score_avg, dim=0), dim=0) - math.log(len(self.models))
+        # reduce over the model dimension; stacking and dividing alone would keep one slice per model
+        word_ins_attn_avg = None if word_ins_attn_avg[0] is None else torch.stack(word_ins_attn_avg, dim=0).mean(dim=0)
+        word_ins_score_max, word_ins_pred = word_ins_score_avg.max(-1)
+
+        _tokens, _scores = _apply_ins_words(
+            output_tokens[can_ins_word],
+            output_scores[can_ins_word],
+            word_ins_pred,
+            word_ins_score_max,
+            self.unk,
+        )
+
+        output_tokens = _fill(output_tokens, can_ins_word, _tokens, self.pad)
+        output_scores = _fill(output_scores, can_ins_word, _scores, 0)
+        attn = _fill(attn, can_ins_word, word_ins_attn_avg, 0.)  # the averaged attention, not the last model's
+        return output_tokens, output_scores, attn
+
+    def initialize_output_tokens(self, encoder_outs, src_tokens):
+        # LevT doesn't do length prediction, so the first model's initialisation is used.
+        return self.models[0].initialize_output_tokens(encoder_outs[0], src_tokens)
diff --git a/fairseq/models/nat/nonautoregressive_transformer.py b/fairseq/models/nat/nonautoregressive_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..050755c30801120440d28a917f42877d3726dcf6
--- /dev/null
+++ b/fairseq/models/nat/nonautoregressive_transformer.py
@@ -0,0 +1,424 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import torch.nn.functional as F
+
+from fairseq import utils
+from fairseq.iterative_refinement_generator import DecoderOut
+from fairseq.models import register_model, register_model_architecture
+from fairseq.models.transformer import Embedding
+
+from fairseq.models.nat import (
+    FairseqNATModel,
+    FairseqNATDecoder,
+    ensemble_decoder
+)
+from fairseq.modules.transformer_sentence_encoder import init_bert_params
+
+
+def _mean_pooling(enc_feats, src_masks):
+    """Average encoder features over the time axis, skipping padded steps when a mask is given."""
+    # enc_feats: T x B x C
+    # src_masks: B x T or None
+    if src_masks is None:
+        return enc_feats.mean(0)
+    # 1.0 on real tokens, 0.0 on padding, laid out T x B to line up with enc_feats
+    keep = (~src_masks).transpose(0, 1).type_as(enc_feats)
+    lengths = keep.sum(0)[None, :, None]
+    scaled = enc_feats / lengths
+    return (scaled * keep[:, :, None]).sum(0)
+
+
+def _argmax(x, dim):  # 0/1 mask (same dtype as x) marking every position that attains the max along `dim`
+    return x.eq(x.max(dim, keepdim=True).values).type_as(x)
+
+
+def _uniform_assignment(src_lens, trg_lens):
+    """Map every target position to a source position, spreading the targets evenly over the source."""
+    max_trg_len = trg_lens.max()
+    # step-size; the clamp keeps a length-1 target from dividing by zero (inf/nan would become a bogus index)
+    steps = (src_lens.float() - 1) / (trg_lens.float() - 1).clamp(min=1)
+    index_t = utils.new_arange(trg_lens, max_trg_len).float()  # max_trg_len
+    index_t = steps[:, None] * index_t[None, :]  # batch_size X max_trg_len
+    return torch.round(index_t).long().detach()
+
+
+@register_model("nonautoregressive_transformer")
+class NATransformerModel(FairseqNATModel):
+    """Non-autoregressive transformer whose decoder also predicts the target length."""
+
+    @property
+    def allow_length_beam(self):
+        return True
+
+    @staticmethod
+    def add_args(parser):
+        """Add the NAT-specific options on top of the ``FairseqNATModel`` ones."""
+        FairseqNATModel.add_args(parser)
+
+        # length prediction
+        parser.add_argument("--src-embedding-copy", action="store_true",
+                            help="copy encoder word embeddings as the initial input of the decoder")
+        parser.add_argument("--pred-length-offset", action="store_true",
+                            help="predicting the length difference between the target and source sentences")
+        parser.add_argument("--sg-length-pred", action="store_true",
+                            help="stop the gradients back-propagated from the length predictor")
+        parser.add_argument("--length-loss-factor", type=float,
+                            help="weights on the length prediction loss")
+
+    @classmethod
+    def build_decoder(cls, args, tgt_dict, embed_tokens):
+        """Build the decoder; apply ``init_bert_params`` when ``args.apply_bert_init`` is set."""
+        decoder = NATransformerDecoder(args, tgt_dict, embed_tokens)
+        if getattr(args, "apply_bert_init", False):
+            decoder.apply(init_bert_params)
+        return decoder
+
+    def forward(
+        self, src_tokens, src_lengths, prev_output_tokens, tgt_tokens, **kwargs
+    ):
+        """Training pass: return the ``word_ins`` and ``length`` outputs together with their targets."""
+        # encoding
+        encoder_out = self.encoder(src_tokens, src_lengths=src_lengths, **kwargs)
+
+        # length prediction
+        length_out = self.decoder.forward_length(normalize=False, encoder_out=encoder_out)
+        length_tgt = self.decoder.forward_length_prediction(length_out, encoder_out, tgt_tokens)
+
+        # decoding
+        word_ins_out = self.decoder(
+            normalize=False,
+            prev_output_tokens=prev_output_tokens,
+            encoder_out=encoder_out)
+
+        return {
+            "word_ins": {
+                "out": word_ins_out, "tgt": tgt_tokens,
+                "mask": tgt_tokens.ne(self.pad), "ls": self.args.label_smoothing,
+                "nll_loss": True
+            },
+            "length": {
+                "out": length_out, "tgt": length_tgt,
+                "factor": self.decoder.length_loss_factor
+            }
+        }
+
+    def forward_decoder(self, decoder_out, encoder_out, decoding_format=None, **kwargs):
+        """One decoding step: overwrite every non-pad position with its best token and log-probability."""
+        step = decoder_out.step
+        output_tokens = decoder_out.output_tokens
+        output_scores = decoder_out.output_scores
+        history = decoder_out.history
+
+        # execute the decoder
+        output_masks = output_tokens.ne(self.pad)
+        _scores, _tokens = self.decoder(
+            normalize=True,
+            prev_output_tokens=output_tokens,
+            encoder_out=encoder_out,
+            step=step,
+        ).max(-1)
+
+        output_tokens.masked_scatter_(output_masks, _tokens[output_masks])
+        output_scores.masked_scatter_(output_masks, _scores[output_masks])
+        if history is not None:
+            history.append(output_tokens.clone())
+
+        return decoder_out._replace(
+            output_tokens=output_tokens,
+            output_scores=output_scores,
+            attn=None,
+            history=history
+        )
+
+    def _blank_canvas(self, like, length_tgt):
+        """Return ``<bos> <unk> ... <eos> <pad> ...`` rows, one per entry of ``length_tgt``."""
+        max_length = length_tgt.max()
+        idx_length = utils.new_arange(like, max_length)
+
+        canvas = like.new_zeros(length_tgt.size(0), max_length).fill_(self.pad)
+        canvas.masked_fill_(idx_length[None, :] < length_tgt[:, None], self.unk)
+        canvas[:, 0] = self.bos
+        canvas.scatter_(1, length_tgt[:, None] - 1, self.eos)
+        return canvas
+
+    def initialize_output_tokens(self, encoder_out, src_tokens):
+        """Build the first decoder input from the predicted target lengths."""
+        # length prediction
+        length_tgt = self.decoder.forward_length_prediction(
+            self.decoder.forward_length(normalize=True, encoder_out=encoder_out),
+            encoder_out=encoder_out
+        )
+        # every row needs room for at least <bos> and <eos>
+        length_tgt.clamp_(min=2)
+
+        initial_output_tokens = self._blank_canvas(src_tokens, length_tgt)
+        initial_output_scores = initial_output_tokens.new_zeros(
+            *initial_output_tokens.size()
+        ).type_as(encoder_out.encoder_out)
+
+        return DecoderOut(
+            output_tokens=initial_output_tokens,
+            output_scores=initial_output_scores,
+            attn=None,
+            step=0,
+            max_step=0,
+            history=None
+        )
+
+    def regenerate_length_beam(self, decoder_out, beam_size):
+        """Replace each hypothesis by ``beam_size`` blank canvases of neighbouring lengths."""
+        output_tokens = decoder_out.output_tokens
+        length_tgt = output_tokens.ne(self.pad).sum(1)
+        # candidate lengths: current length - beam_size // 2 + {0, ..., beam_size - 1}
+        length_tgt = length_tgt[:, None] + utils.new_arange(length_tgt, 1, beam_size) - beam_size // 2
+        length_tgt = length_tgt.view(-1).clamp_(min=2)
+
+        initial_output_tokens = self._blank_canvas(output_tokens, length_tgt)
+        initial_output_scores = initial_output_tokens.new_zeros(
+            *initial_output_tokens.size()
+        ).type_as(decoder_out.output_scores)
+
+        return decoder_out._replace(
+            output_tokens=initial_output_tokens,
+            output_scores=initial_output_scores
+        )
+
+
+class NATransformerDecoder(FairseqNATDecoder):  # NAT decoder that also carries the length-prediction head
+    def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False):
+        super().__init__(
+            args, dictionary, embed_tokens, no_encoder_attn=no_encoder_attn
+        )
+        self.dictionary = dictionary
+        self.bos = dictionary.bos()
+        self.unk = dictionary.unk()
+        self.eos = dictionary.eos()
+
+        self.encoder_embed_dim = args.encoder_embed_dim
+        self.sg_length_pred = getattr(args, "sg_length_pred", False)
+        self.pred_length_offset = getattr(args, "pred_length_offset", False)
+        self.length_loss_factor = getattr(args, "length_loss_factor", 0.1)
+        self.src_embedding_copy = getattr(args, "src_embedding_copy", False)
+        self.embed_length = Embedding(256, self.encoder_embed_dim, None)  # 256 length classes; weight is reused as the classifier in forward_length
+
+    @ensemble_decoder
+    def forward(self, normalize, encoder_out, prev_output_tokens, step=0, **unused):
+        features, _ = self.extract_features(
+            prev_output_tokens,
+            encoder_out=encoder_out,
+            embedding_copy=(step == 0) & self.src_embedding_copy,  # source embeddings are copied on step 0 only
+        )
+        decoder_out = self.output_layer(features)
+        return F.log_softmax(decoder_out, -1) if normalize else decoder_out  # log-probs if normalize, else raw scores
+
+    @ensemble_decoder
+    def forward_length(self, normalize, encoder_out):
+        enc_feats = encoder_out.encoder_out  # T x B x C
+        src_masks = encoder_out.encoder_padding_mask  # B x T or None
+        enc_feats = _mean_pooling(enc_feats, src_masks)
+        if self.sg_length_pred:
+            enc_feats = enc_feats.detach()  # stop gradients from the length loss reaching the encoder
+        length_out = F.linear(enc_feats, self.embed_length.weight)  # one score per length class
+        return F.log_softmax(length_out, -1) if normalize else length_out
+
+    def extract_features(
+        self,
+        prev_output_tokens,
+        encoder_out=None,
+        early_exit=None,
+        embedding_copy=False,
+        **unused
+    ):
+        """
+        Similar to *forward* but only return features.
+
+        Inputs:
+            prev_output_tokens: Tensor(B, T)
+            encoder_out: encoder output exposing `encoder_out`, `encoder_padding_mask` and `encoder_embedding`
+
+        Returns:
+            tuple:
+                - the decoder's features of shape `(batch, tgt_len, embed_dim)`
+                - a dictionary with the last executed layer's `attn` and the list of `inner_states`
+            Layers are called with `self_attn_mask=None` (no causal mask); only padding is masked.
+        """
+        # embedding
+        if embedding_copy:
+            src_embd = encoder_out.encoder_embedding
+            src_mask = encoder_out.encoder_padding_mask
+            src_mask = (
+                ~src_mask
+                if src_mask is not None
+                else prev_output_tokens.new_ones(*src_embd.size()[:2]).bool()
+            )  # True on real source tokens from here on
+
+            x, decoder_padding_mask = self.forward_embedding(
+                prev_output_tokens,
+                self.forward_copying_source(
+                    src_embd, src_mask, prev_output_tokens.ne(self.padding_idx)
+                ),
+            )
+
+        else:
+
+            x, decoder_padding_mask = self.forward_embedding(prev_output_tokens)
+
+        # B x T x C -> T x B x C
+        x = x.transpose(0, 1)
+        attn = None
+        inner_states = [x]
+
+        # decoder layers
+        for i, layer in enumerate(self.layers):
+
+            # early exit from the decoder.
+            if (early_exit is not None) and (i >= early_exit):
+                break
+
+            x, attn, _ = layer(
+                x,
+                encoder_out.encoder_out if encoder_out is not None else None,
+                encoder_out.encoder_padding_mask if encoder_out is not None else None,
+                self_attn_mask=None,
+                self_attn_padding_mask=decoder_padding_mask,
+            )
+            inner_states.append(x)
+
+        if self.layer_norm:
+            x = self.layer_norm(x)
+
+        # T x B x C -> B x T x C
+        x = x.transpose(0, 1)
+
+        if self.project_out_dim is not None:
+            x = self.project_out_dim(x)
+
+        return x, {"attn": attn, "inner_states": inner_states}
+
+    def forward_embedding(self, prev_output_tokens, states=None):
+        # embed positions
+        positions = (
+            self.embed_positions(prev_output_tokens)
+            if self.embed_positions is not None
+            else None
+        )
+
+        # embed tokens and positions
+        if states is None:
+            x = self.embed_scale * self.embed_tokens(prev_output_tokens)
+            if self.project_in_dim is not None:
+                x = self.project_in_dim(x)
+        else:
+            x = states  # caller-supplied embeddings replace the token embeddings
+
+        if positions is not None:
+            x += positions  # in place: this also modifies `states` when it was passed in
+        x = self.dropout_module(x)
+        decoder_padding_mask = prev_output_tokens.eq(self.padding_idx)
+        return x, decoder_padding_mask
+
+    def forward_copying_source(self, src_embeds, src_masks, tgt_masks):
+        length_sources = src_masks.sum(1)
+        length_targets = tgt_masks.sum(1)
+        mapped_inputs = _uniform_assignment(length_sources, length_targets).masked_fill(
+            ~tgt_masks, 0
+        )  # padded target positions are pointed at source index 0
+        copied_embedding = torch.gather(
+            src_embeds,
+            1,
+            mapped_inputs.unsqueeze(-1).expand(
+                *mapped_inputs.size(), src_embeds.size(-1)
+            ),
+        )  # picks, for every target position, the embedding of its assigned source position
+        return copied_embedding
+
+    def forward_length_prediction(self, length_out, encoder_out, tgt_tokens=None):
+        enc_feats = encoder_out.encoder_out  # T x B x C
+        src_masks = encoder_out.encoder_padding_mask  # B x T or None
+        if self.pred_length_offset:
+            if src_masks is None:
+                src_lengs = enc_feats.new_ones(enc_feats.size(1)).fill_(
+                    enc_feats.size(0)
+                )  # no mask: every source has the full length T
+            else:
+                src_lengs = (~src_masks).transpose(0, 1).type_as(enc_feats).sum(0)
+            src_lengs = src_lengs.long()
+
+        if tgt_tokens is not None:
+            # obtain the length target
+            tgt_lengs = tgt_tokens.ne(self.padding_idx).sum(1).long()
+            if self.pred_length_offset:
+                length_tgt = tgt_lengs - src_lengs + 128  # class 128 means "same length as the source"
+            else:
+                length_tgt = tgt_lengs
+            length_tgt = length_tgt.clamp(min=0, max=255)  # keep inside the 256 classes of embed_length
+
+        else:
+            # predict the length target (greedy for now)
+            # TODO: implementing length-beam
+            pred_lengs = length_out.max(-1)[1]  # index of the best-scoring length class
+            if self.pred_length_offset:
+                length_tgt = pred_lengs - 128 + src_lengs  # undo the +128 shift applied to the training target
+            else:
+                length_tgt = pred_lengs
+
+        return length_tgt
+
+
+@register_model_architecture(
+    "nonautoregressive_transformer", "nonautoregressive_transformer"
+)
+def base_architecture(args):
+    """Give every hyper-parameter that is missing from *args* its base NAT default."""
+
+    def default(name, value):
+        setattr(args, name, getattr(args, name, value))
+
+    # encoder
+    default("encoder_embed_path", None)
+    default("encoder_embed_dim", 512)
+    default("encoder_ffn_embed_dim", 2048)
+    default("encoder_layers", 6)
+    default("encoder_attention_heads", 8)
+    default("encoder_normalize_before", False)
+    default("encoder_learned_pos", False)
+    # decoder (sizes fall back to the encoder's)
+    default("decoder_embed_path", None)
+    default("decoder_embed_dim", args.encoder_embed_dim)
+    default("decoder_ffn_embed_dim", args.encoder_ffn_embed_dim)
+    default("decoder_layers", 6)
+    default("decoder_attention_heads", 8)
+    default("decoder_normalize_before", False)
+    default("decoder_learned_pos", False)
+    # dropout, activation and embedding options
+    default("attention_dropout", 0.0)
+    default("activation_dropout", 0.0)
+    default("activation_fn", "relu")
+    default("dropout", 0.1)
+    default("adaptive_softmax_cutoff", None)
+    default("adaptive_softmax_dropout", 0)
+    default("share_decoder_input_output_embed", False)
+    default("share_all_embeddings", False)
+    default("no_token_positional_embeddings", False)
+    default("adaptive_input", False)
+    default("apply_bert_init", False)
+
+    default("decoder_output_dim", args.decoder_embed_dim)
+    default("decoder_input_dim", args.decoder_embed_dim)
+
+    # --- special arguments ---
+    default("sg_length_pred", False)
+    default("pred_length_offset", False)
+    default("length_loss_factor", 0.1)
+    default("src_embedding_copy", False)
+
+
+@register_model_architecture(
+    "nonautoregressive_transformer", "nonautoregressive_transformer_wmt_en_de"
+)
+def nonautoregressive_transformer_wmt_en_de(args):  # named preset that adds no overrides of its own
+    base_architecture(args)
diff --git a/fairseq/models/roberta/__init__.py b/fairseq/models/roberta/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..56579e591566e014d99ed5a283ee7135257f054c
--- /dev/null
+++ b/fairseq/models/roberta/__init__.py
@@ -0,0 +1,9 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .hub_interface import *  # noqa
+from .model import *  # noqa
+from .model_camembert import *  # noqa
+from .model_xlmr import *  # noqa
diff --git a/fairseq/models/roberta/__pycache__/__init__.cpython-310.pyc b/fairseq/models/roberta/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3f0ccdcd0c6862f127dde8b32c8883783b9c076c
Binary files /dev/null and b/fairseq/models/roberta/__pycache__/__init__.cpython-310.pyc differ
diff --git a/fairseq/models/roberta/__pycache__/hub_interface.cpython-310.pyc b/fairseq/models/roberta/__pycache__/hub_interface.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..01162df29f1fbaa1b64ef1468c9cd91cced9fcde
Binary files /dev/null and b/fairseq/models/roberta/__pycache__/hub_interface.cpython-310.pyc differ
diff --git a/fairseq/models/roberta/__pycache__/model.cpython-310.pyc b/fairseq/models/roberta/__pycache__/model.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a0554d1b4ab3ef0cddb697cdc9b4bb8de83cce92
Binary files /dev/null and b/fairseq/models/roberta/__pycache__/model.cpython-310.pyc differ
diff --git a/fairseq/models/roberta/__pycache__/model_camembert.cpython-310.pyc b/fairseq/models/roberta/__pycache__/model_camembert.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a4faeb2e409e3f360079b868c094f9bbcb02b503
Binary files /dev/null and b/fairseq/models/roberta/__pycache__/model_camembert.cpython-310.pyc differ
diff --git a/fairseq/models/roberta/__pycache__/model_xlmr.cpython-310.pyc b/fairseq/models/roberta/__pycache__/model_xlmr.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cbf3e7f900c264705adcbf5489d2d6aafa739c6c
Binary files /dev/null and b/fairseq/models/roberta/__pycache__/model_xlmr.cpython-310.pyc differ
diff --git a/fairseq/models/roberta/alignment_utils.py b/fairseq/models/roberta/alignment_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..45d2e37194c0f66e2b063884d7f3291ae48ece0f
--- /dev/null
+++ b/fairseq/models/roberta/alignment_utils.py
@@ -0,0 +1,115 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from collections import Counter
+from typing import List
+
+import torch
+
+
+def align_bpe_to_words(roberta, bpe_tokens: torch.LongTensor, other_tokens: List[str]):
+    """
+    Helper to align GPT-2 BPE to other tokenization formats (e.g., spaCy).
+
+    Args:
+        roberta (RobertaHubInterface): RoBERTa instance
+        bpe_tokens (torch.LongTensor): GPT-2 BPE tokens of shape `(T_bpe)`
+        other_tokens (List[str]): other tokens of shape `(T_words)`
+
+    Returns:
+        List[List[int]]: for each of *other_tokens*, the indices into *bpe_tokens* that it overlaps.
+    """
+    assert bpe_tokens.dim() == 1
+    assert bpe_tokens[0] == 0  # the sequence must start with index 0, which is stripped below as <s>
+
+    def clean(text):
+        return text.strip()
+
+    # remove whitespaces to simplify alignment
+    bpe_tokens = [roberta.task.source_dictionary.string([x]) for x in bpe_tokens]
+    bpe_tokens = [clean(roberta.bpe.decode(x) if x not in {'<s>', ''} else x) for x in bpe_tokens]
+    other_tokens = [clean(str(o)) for o in other_tokens]
+
+    # strip leading <s>
+    bpe_tokens = bpe_tokens[1:]
+    assert ''.join(bpe_tokens) == ''.join(other_tokens)
+
+    # create alignment from every word to a list of BPE tokens
+    alignment = []
+    # start=1 keeps j an index into the original tensor (0 was <s>); empty pieces are skipped
+    bpe_toks = filter(lambda item: item[1] != '', enumerate(bpe_tokens, start=1))
+    j, bpe_tok = next(bpe_toks)
+    for other_tok in other_tokens:
+        bpe_indices = []
+        while True:
+            if other_tok.startswith(bpe_tok):
+                bpe_indices.append(j)
+                other_tok = other_tok[len(bpe_tok):]
+                try:
+                    j, bpe_tok = next(bpe_toks)
+                except StopIteration:
+                    j, bpe_tok = None, None
+            elif bpe_tok.startswith(other_tok):
+                # bpe_tok extends past this word: consume the word, keep the rest for the next one
+                bpe_indices.append(j)
+                bpe_tok = bpe_tok[len(other_tok):]
+                other_tok = ''
+            else:
+                raise Exception('Cannot align "{}" and "{}"'.format(other_tok, bpe_tok))
+            if other_tok == '':
+                break
+        assert len(bpe_indices) > 0
+        alignment.append(bpe_indices)
+    assert len(alignment) == len(other_tokens)
+
+    return alignment
+
+
+def align_features_to_words(roberta, features, alignment):
+    """
+    Pool BPE-level features into word-level features using *alignment*.
+
+    Args:
+        roberta (RobertaHubInterface): RoBERTa instance (not used here)
+        features (torch.Tensor): BPE-level features of shape `(T_bpe x C)`
+        alignment: per-word lists of BPE indices, as returned by
+            func:`align_bpe_to_words`.
+    """
+    assert features.dim() == 2
+
+    # a BPE token shared by k words hands 1/k of its feature to each of them
+    shares = Counter(j for bpe_indices in alignment for j in bpe_indices)
+    assert shares[0] == 0  # <s> shouldn't be aligned
+    scale = features.new([shares.get(j, 1) for j in range(len(features))])
+    weighted = features / scale.unsqueeze(-1)
+
+    rows = [weighted[0]]
+    last_aligned = -1
+    for bpe_indices in alignment:
+        rows.append(weighted[bpe_indices].sum(dim=0))
+        last_aligned = max(last_aligned, *bpe_indices)
+    rows.extend(weighted[j] for j in range(last_aligned + 1, len(features)))
+    merged = torch.stack(rows)
+    assert torch.all(torch.abs(merged.sum(dim=0) - features.sum(dim=0)) < 1e-4)
+    return merged
+
+
+def spacy_nlp():  # builds one spaCy English pipeline on first call and caches it on the function object
+    if getattr(spacy_nlp, '_nlp', None) is None:
+        try:
+            from spacy.lang.en import English  # imported lazily, so spaCy is only needed when this is called
+            spacy_nlp._nlp = English()
+        except ImportError:
+            raise ImportError('Please install spacy with: pip install spacy')
+    return spacy_nlp._nlp
+
+
+def spacy_tokenizer():
+    """Return the cached tokenizer of the shared spaCy pipeline from ``spacy_nlp()``."""
+    if getattr(spacy_tokenizer, '_tokenizer', None) is None:
+        # `Defaults.create_tokenizer` no longer exists in spaCy 3; in both spaCy 2 and 3 a
+        # pipeline exposes its tokenizer as `nlp.tokenizer`. A missing spaCy install still
+        # surfaces as the ImportError (same message) raised by spacy_nlp().
+        spacy_tokenizer._tokenizer = spacy_nlp().tokenizer
+    return spacy_tokenizer._tokenizer
diff --git a/fairseq/models/roberta/hub_interface.py b/fairseq/models/roberta/hub_interface.py
new file mode 100644
index 0000000000000000000000000000000000000000..baf0bf28b9ea4a3b15e042e9f4f86f1ffd499f7c
--- /dev/null
+++ b/fairseq/models/roberta/hub_interface.py
@@ -0,0 +1,204 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from fairseq import utils
+from fairseq.data import encoders
+
+
+class RobertaHubInterface(nn.Module):
+    """A simple PyTorch Hub interface to RoBERTa.
+
+    Usage: https://github.com/pytorch/fairseq/tree/master/examples/roberta
+    """
+
+    def __init__(self, args, task, model):
+        super().__init__()
+        self.args = args
+        self.task = task
+        self.model = model
+
+        self.bpe = encoders.build_bpe(args)
+
+        # this is useful for determining the device
+        self.register_buffer('_float_tensor', torch.tensor([0], dtype=torch.float))
+
+    @property
+    def device(self):
+        return self._float_tensor.device
+
+    def encode(self, sentence: str, *addl_sentences, no_separator=False) -> torch.LongTensor:
+        """
+        BPE-encode a sentence (or multiple sentences).
+
+        Every sequence begins with a beginning-of-sentence (`<s>`) symbol.
+        Every sentence ends with an end-of-sentence (`</s>`) and we use an
+        extra end-of-sentence (`</s>`) as a separator.
+
+        Example (single sentence): `<s> a b c </s>`
+        Example (sentence pair): `<s> d e f </s> </s> 1 2 3 </s>`
+
+        The BPE encoding follows GPT-2. One subtle detail is that the GPT-2 BPE
+        requires leading spaces. For example::
+
+            >>> roberta.encode('Hello world').tolist()
+            [0, 31414, 232, 2]
+            >>> roberta.encode(' world').tolist()
+            [0, 232, 2]
+            >>> roberta.encode('world').tolist()
+            [0, 8331, 2]
+        """
+        bpe_sentence = '<s> ' + self.bpe.encode(sentence) + ' </s>'
+        for s in addl_sentences:
+            bpe_sentence += (' </s>' if not no_separator else '')
+            bpe_sentence += ' ' + self.bpe.encode(s) + ' </s>'
+        tokens = self.task.source_dictionary.encode_line(bpe_sentence, append_eos=False, add_if_not_exist=False)
+        return tokens.long()
+
+    def decode(self, tokens: torch.LongTensor):
+        """Decode a 1-D token tensor to text; a list of strings is returned when it splits into several parts."""
+        assert tokens.dim() == 1
+        tokens = tokens.cpu().numpy()  # .numpy() alone raises for a tensor that lives on a CUDA device
+        if tokens[0] == self.task.source_dictionary.bos():
+            tokens = tokens[1:]  # remove <s>
+        eos_mask = (tokens == self.task.source_dictionary.eos())
+        doc_mask = eos_mask[1:] & eos_mask[:-1]  # two consecutive </s> mark a boundary
+        sentences = np.split(tokens, doc_mask.nonzero()[0] + 1)
+        sentences = [self.bpe.decode(self.task.source_dictionary.string(s)) for s in sentences]
+        if len(sentences) == 1:
+            return sentences[0]
+        return sentences
+
+    def extract_features(self, tokens: torch.LongTensor, return_all_hiddens: bool = False) -> torch.Tensor:
+        """Return the last layer's features, or a list with every inner state when *return_all_hiddens*."""
+        if tokens.dim() == 1:
+            tokens = tokens.unsqueeze(0)
+        if tokens.size(-1) > self.model.max_positions():
+            raise ValueError('tokens exceeds maximum length: {} > {}'.format(
+                tokens.size(-1), self.model.max_positions()
+            ))
+        features, extra = self.model(
+            tokens.to(device=self.device),
+            features_only=True,
+            return_all_hiddens=return_all_hiddens,
+        )
+        if return_all_hiddens:
+            # convert from T x B x C -> B x T x C
+            inner_states = extra['inner_states']
+            return [inner_state.transpose(0, 1) for inner_state in inner_states]
+        else:
+            return features  # just the last layer's features
+
+    def register_classification_head(
+        self, name: str, num_classes: int = None, embedding_size: int = None, **kwargs
+    ):
+        """Forward the request to ``self.model.register_classification_head``."""
+        self.model.register_classification_head(
+            name, num_classes=num_classes, embedding_size=embedding_size, **kwargs
+        )
+
+    def predict(self, head: str, tokens: torch.LongTensor, return_logits: bool = False):
+        """Apply classification head *head*; returns log-probabilities, or raw logits if *return_logits*."""
+        features = self.extract_features(tokens.to(device=self.device))
+        logits = self.model.classification_heads[head](features)
+        if return_logits:
+            return logits
+        return F.log_softmax(logits, dim=-1)
+
+    def extract_features_aligned_to_words(self, sentence: str, return_all_hiddens: bool = False) -> torch.Tensor:
+        """Extract RoBERTa features aligned to spaCy's word-level tokenizer; returns a spaCy ``Doc``."""
+        from fairseq.models.roberta import alignment_utils
+        from spacy.tokens import Doc
+
+        nlp = alignment_utils.spacy_nlp()
+        tokenizer = alignment_utils.spacy_tokenizer()
+
+        # tokenize both with GPT-2 BPE and spaCy
+        bpe_toks = self.encode(sentence)
+        spacy_toks = tokenizer(sentence)
+        spacy_toks_ws = [t.text_with_ws for t in spacy_toks]  # reuse the tokens instead of tokenizing twice
+        alignment = alignment_utils.align_bpe_to_words(self, bpe_toks, spacy_toks_ws)
+
+        # extract features and align them
+        features = self.extract_features(bpe_toks, return_all_hiddens=return_all_hiddens)
+        # NOTE(review): with return_all_hiddens=True extract_features returns a list, which has no .squeeze
+        features = features.squeeze(0)
+        aligned_feats = alignment_utils.align_features_to_words(self, features, alignment)
+
+        # wrap in spaCy Doc
+        doc = Doc(
+            nlp.vocab,
+            words=['<s>'] + [x.text for x in spacy_toks] + ['</s>'],
+            spaces=[True] + [x.endswith(' ') for x in spacy_toks_ws[:-1]] + [True, False],
+        )
+        assert len(doc) == aligned_feats.size(0)
+        doc.user_token_hooks['vector'] = lambda token: aligned_feats[token.i]
+        return doc
+
+    def fill_mask(self, masked_input: str, topk: int = 5):
+        """Return *topk* ``(filled sentence, probability, token)`` tuples for the single ``<mask>``."""
+        masked_token = '<mask>'
+        assert masked_token in masked_input and masked_input.count(masked_token) == 1, \
+            "Please add one {0} token for the input, eg: 'He is a {0} guy'".format(masked_token)
+
+        text_spans = masked_input.split(masked_token)
+        text_spans_bpe = (' {0} '.format(masked_token)).join(
+            [self.bpe.encode(text_span.rstrip()) for text_span in text_spans]
+        ).strip()
+        tokens = self.task.source_dictionary.encode_line(
+            '<s> ' + text_spans_bpe + ' </s>',
+            append_eos=False,
+            add_if_not_exist=False,
+        )
+
+        masked_index = (tokens == self.task.mask_idx).nonzero()
+        if tokens.dim() == 1:
+            tokens = tokens.unsqueeze(0)
+
+        with utils.eval(self.model):
+            features, extra = self.model(
+                tokens.long().to(device=self.device),
+                features_only=False,
+                return_all_hiddens=False,
+            )
+        logits = features[0, masked_index, :].squeeze()
+        prob = logits.softmax(dim=0)
+        values, index = prob.topk(k=topk, dim=0)
+        topk_predicted_token_bpe = self.task.source_dictionary.string(index)
+
+        # replace the mask together with the space in front of it, when it has one
+        spaced_mask = ' {0}'.format(masked_token)
+        mask_pattern = spaced_mask if spaced_mask in masked_input else masked_token
+        topk_filled_outputs = []
+        for rank, predicted_token_bpe in enumerate(topk_predicted_token_bpe.split(' ')):
+            predicted_token = self.bpe.decode(predicted_token_bpe)
+            # Quick hack to fix https://github.com/pytorch/fairseq/issues/1306
+            if predicted_token_bpe.startswith('\u2581'):
+                predicted_token = ' ' + predicted_token
+            topk_filled_outputs.append((
+                masked_input.replace(mask_pattern, predicted_token),
+                values[rank].item(),
+                predicted_token,
+            ))
+        return topk_filled_outputs
+
+    def disambiguate_pronoun(self, sentence: str) -> bool:
+        """
+        Usage::
+
+            >>> disambiguate_pronoun('The _trophy_ would not fit in the brown suitcase because [it] was too big.')
+            True
+
+            >>> disambiguate_pronoun('The trophy would not fit in the brown suitcase because [it] was too big.')
+            'The trophy'
+        """
+        assert hasattr(self.task, 'disambiguate_pronoun'), \
+            'roberta.disambiguate_pronoun() requires a model trained with the WSC task.'
+        with utils.eval(self.model):
+            return self.task.disambiguate_pronoun(self.model, sentence, use_cuda=self.device.type == 'cuda')
diff --git a/fairseq/models/roberta/model.py b/fairseq/models/roberta/model.py
new file mode 100644
index 0000000000000000000000000000000000000000..2303fbe26e1882ab38784f4064abb5af8bf099a5
--- /dev/null
+++ b/fairseq/models/roberta/model.py
@@ -0,0 +1,396 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+RoBERTa: A Robustly Optimized BERT Pretraining Approach.
+"""
+
+import logging
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from fairseq import utils
+from fairseq.models import (
+    FairseqEncoder,
+    FairseqEncoderModel,
+    register_model,
+    register_model_architecture,
+)
+from fairseq.modules import (
+    LayerNorm,
+    TransformerSentenceEncoder,
+)
+from fairseq.modules.transformer_sentence_encoder import init_bert_params
+from fairseq.modules.quant_noise import quant_noise as apply_quant_noise_
+
+from .hub_interface import RobertaHubInterface
+
+
+logger = logging.getLogger(__name__)
+
+
+@register_model('roberta')
+class RobertaModel(FairseqEncoderModel):
+
+    @classmethod
+    def hub_models(cls):  # hub name -> archive URL (used as archive_map); HTTPS so archives cannot be altered in transit
+        return {
+            'roberta.base': 'https://dl.fbaipublicfiles.com/fairseq/models/roberta.base.tar.gz',
+            'roberta.large': 'https://dl.fbaipublicfiles.com/fairseq/models/roberta.large.tar.gz',
+            'roberta.large.mnli': 'https://dl.fbaipublicfiles.com/fairseq/models/roberta.large.mnli.tar.gz',
+            'roberta.large.wsc': 'https://dl.fbaipublicfiles.com/fairseq/models/roberta.large.wsc.tar.gz',
+        }
+
+    def __init__(self, args, encoder):
+        super().__init__(encoder)
+        self.args = args
+
+        # We follow BERT's random weight initialization
+        self.apply(init_bert_params)
+
+        self.classification_heads = nn.ModuleDict()
+
+    @staticmethod
+    def add_args(parser):
+        """Add model-specific arguments to the parser."""
+        parser.add_argument('--encoder-layers', type=int, metavar='L',
+                            help='num encoder layers')
+        parser.add_argument('--encoder-embed-dim', type=int, metavar='H',
+                            help='encoder embedding dimension')
+        parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='F',
+                            help='encoder embedding dimension for FFN')
+        parser.add_argument('--encoder-attention-heads', type=int, metavar='A',
+                            help='num encoder attention heads')
+        parser.add_argument('--activation-fn',
+                            choices=utils.get_available_activation_fns(),
+                            help='activation function to use')
+        parser.add_argument('--pooler-activation-fn',
+                            choices=utils.get_available_activation_fns(),
+                            help='activation function to use for pooler layer')
+        parser.add_argument('--encoder-normalize-before', action='store_true',
+                            help='apply layernorm before each encoder block')
+        parser.add_argument('--dropout', type=float, metavar='D',
+                            help='dropout probability')
+        parser.add_argument('--attention-dropout', type=float, metavar='D',
+                            help='dropout probability for attention weights')
+        parser.add_argument('--activation-dropout', type=float, metavar='D',
+                            help='dropout probability after activation in FFN')
+        parser.add_argument('--pooler-dropout', type=float, metavar='D',
+                            help='dropout probability in the masked_lm pooler layers')
+        parser.add_argument('--max-positions', type=int,
+                            help='number of positional embeddings to learn')
+        parser.add_argument('--load-checkpoint-heads', action='store_true',
+                            help='(re-)register and load heads when loading checkpoints')
+        # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
+        parser.add_argument('--encoder-layerdrop', type=float, metavar='D', default=0,
+                            help='LayerDrop probability for encoder')
+        parser.add_argument('--encoder-layers-to-keep', default=None,
+                            help='which layers to *keep* when pruning as a comma-separated list')
+        # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
+        parser.add_argument('--quant-noise-pq', type=float, metavar='D', default=0,
+                            help='iterative PQ quantization noise at training time')
+        parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8,
+                            help='block size of quantization noise at training time')
+        parser.add_argument('--quant-noise-scalar', type=float, metavar='D', default=0,
+                            help='scalar quantization noise and scalar quantization at training time')
+        parser.add_argument('--untie-weights-roberta', action='store_true',
+                            help='Untie weights between embeddings and classifiers in RoBERTa')
+
+    @classmethod
+    def build_model(cls, args, task):
+        """Build a new model instance."""
+
+        # make sure all arguments are present
+        base_architecture(args)
+
+        if not hasattr(args, 'max_positions'):
+            args.max_positions = args.tokens_per_sample
+
+        encoder = RobertaEncoder(args, task.source_dictionary)
+        return cls(args, encoder)
+
+    def forward(self, src_tokens, features_only=False, return_all_hiddens=False, classification_head_name=None, **kwargs):
+        if classification_head_name is not None:
+            features_only = True
+
+        x, extra = self.encoder(src_tokens, features_only, return_all_hiddens, **kwargs)
+
+        if classification_head_name is not None:
+            x = self.classification_heads[classification_head_name](x)
+        return x, extra
+
+    def get_normalized_probs(self, net_output, log_probs, sample=None):
+        """Get normalized probabilities (or log probs) from a net's output."""
+        logits = net_output[0].float()
+        if log_probs:
+            return F.log_softmax(logits, dim=-1)
+        else:
+            return F.softmax(logits, dim=-1)
+
+    def register_classification_head(self, name, num_classes=None, inner_dim=None, **kwargs):
+        """Register a classification head."""
+        if name in self.classification_heads:
+            prev_num_classes = self.classification_heads[name].out_proj.out_features
+            prev_inner_dim = self.classification_heads[name].dense.out_features
+            if num_classes != prev_num_classes or inner_dim != prev_inner_dim:
+                logger.warning(
+                    're-registering head "{}" with num_classes {} (prev: {}) '
+                    'and inner_dim {} (prev: {})'.format(
+                        name, num_classes, prev_num_classes, inner_dim, prev_inner_dim
+                    )
+                )
+        self.classification_heads[name] = RobertaClassificationHead(
+            self.args.encoder_embed_dim,
+            inner_dim or self.args.encoder_embed_dim,
+            num_classes,
+            self.args.pooler_activation_fn,
+            self.args.pooler_dropout,
+            self.args.quant_noise_pq,
+            self.args.quant_noise_pq_block_size,
+        )
+
+    @property
+    def supported_targets(self):
+        return {'self'}
+
+    @classmethod
+    def from_pretrained(cls, model_name_or_path, checkpoint_file='model.pt', data_name_or_path='.', bpe='gpt2', **kwargs):
+        from fairseq import hub_utils
+        x = hub_utils.from_pretrained(
+            model_name_or_path,
+            checkpoint_file,
+            data_name_or_path,
+            archive_map=cls.hub_models(),
+            bpe=bpe,
+            load_checkpoint_heads=True,
+            **kwargs,
+        )
+        return RobertaHubInterface(x['args'], x['task'], x['models'][0])
+
+    def upgrade_state_dict_named(self, state_dict, name):
+        prefix = name + '.' if name != '' else ''
+
+        # rename decoder -> encoder before upgrading children modules
+        for k in list(state_dict.keys()):
+            if k.startswith(prefix + 'decoder'):
+                new_k = prefix + 'encoder' + k[len(prefix + 'decoder'):]
+                state_dict[new_k] = state_dict[k]
+                del state_dict[k]
+
+        # upgrade children modules
+        super().upgrade_state_dict_named(state_dict, name)
+
+        # Handle new classification heads present in the state dict.
+        current_head_names = (
+            [] if not hasattr(self, 'classification_heads')
+            else self.classification_heads.keys()
+        )
+        keys_to_delete = []
+        for k in state_dict.keys():
+            if not k.startswith(prefix + 'classification_heads.'):
+                continue
+
+            head_name = k[len(prefix + 'classification_heads.'):].split('.')[0]
+            num_classes = state_dict[prefix + 'classification_heads.' + head_name + '.out_proj.weight'].size(0)
+            inner_dim = state_dict[prefix + 'classification_heads.' + head_name + '.dense.weight'].size(0)
+
+            if getattr(self.args, 'load_checkpoint_heads', False):
+                if head_name not in current_head_names:
+                    self.register_classification_head(head_name, num_classes, inner_dim)
+            else:
+                if head_name not in current_head_names:
+                    logger.warning(
+                        'deleting classification head ({}) from checkpoint '
+                        'not present in current model: {}'.format(head_name, k)
+                    )
+                    keys_to_delete.append(k)
+                elif (
+                    num_classes != self.classification_heads[head_name].out_proj.out_features
+                    or inner_dim != self.classification_heads[head_name].dense.out_features
+                ):
+                    logger.warning(
+                        'deleting classification head ({}) from checkpoint '
+                        'with different dimensions than current model: {}'.format(head_name, k)
+                    )
+                    keys_to_delete.append(k)
+        for k in keys_to_delete:
+            del state_dict[k]
+
+        # Copy any newly-added classification heads into the state dict
+        # with their current weights.
+        if hasattr(self, 'classification_heads'):
+            cur_state = self.classification_heads.state_dict()
+            for k, v in cur_state.items():
+                if prefix + 'classification_heads.' + k not in state_dict:
+                    logger.info('Overwriting ' + prefix + 'classification_heads.' + k)
+                    state_dict[prefix + 'classification_heads.' + k] = v
+
+
+class RobertaLMHead(nn.Module):
+    """Head for masked language modeling."""
+
+    def __init__(self, embed_dim, output_dim, activation_fn, weight=None):
+        super().__init__()
+        self.dense = nn.Linear(embed_dim, embed_dim)
+        self.activation_fn = utils.get_activation_fn(activation_fn)
+        self.layer_norm = LayerNorm(embed_dim)
+
+        if weight is None:
+            weight = nn.Linear(embed_dim, output_dim, bias=False).weight
+        self.weight = weight
+        self.bias = nn.Parameter(torch.zeros(output_dim))
+
+    def forward(self, features, masked_tokens=None, **kwargs):
+        # Only project the masked tokens while training,
+        # saves both memory and computation
+        if masked_tokens is not None:
+            features = features[masked_tokens, :]
+
+        x = self.dense(features)
+        x = self.activation_fn(x)
+        x = self.layer_norm(x)
+        # project back to size of vocabulary with bias
+        x = F.linear(x, self.weight) + self.bias
+        return x
+
+
+class RobertaClassificationHead(nn.Module):
+    """Head for sentence-level classification tasks."""
+
+    def __init__(self, input_dim, inner_dim, num_classes, activation_fn, pooler_dropout, q_noise=0, qn_block_size=8):
+        super().__init__()
+        self.dense = nn.Linear(input_dim, inner_dim)
+        self.activation_fn = utils.get_activation_fn(activation_fn)
+        self.dropout = nn.Dropout(p=pooler_dropout)
+        self.out_proj = apply_quant_noise_(
+            nn.Linear(inner_dim, num_classes), q_noise, qn_block_size
+        )
+
+    def forward(self, features, **kwargs):
+        x = features[:, 0, :]  # take <s> token (equiv. to [CLS])
+        x = self.dropout(x)
+        x = self.dense(x)
+        x = self.activation_fn(x)
+        x = self.dropout(x)
+        x = self.out_proj(x)
+        return x
+
+
+class RobertaEncoder(FairseqEncoder):
+    """RoBERTa encoder."""
+
+    def __init__(self, args, dictionary):
+        super().__init__(dictionary)
+        self.args = args
+        # A comma-separated encoder_layers_to_keep list overrides the configured layer count.
+        if args.encoder_layers_to_keep:
+            args.encoder_layers = len(args.encoder_layers_to_keep.split(","))
+
+        self.sentence_encoder = TransformerSentenceEncoder(
+            padding_idx=dictionary.pad(),
+            vocab_size=len(dictionary),
+            num_encoder_layers=args.encoder_layers,
+            embedding_dim=args.encoder_embed_dim,
+            ffn_embedding_dim=args.encoder_ffn_embed_dim,
+            num_attention_heads=args.encoder_attention_heads,
+            dropout=args.dropout,
+            attention_dropout=args.attention_dropout,
+            activation_dropout=args.activation_dropout,
+            layerdrop=args.encoder_layerdrop,
+            max_seq_len=args.max_positions,
+            num_segments=0,
+            encoder_normalize_before=True,  # NOTE(review): literal True; args.encoder_normalize_before is not read here
+            apply_bert_init=True,
+            activation_fn=args.activation_fn,
+            q_noise=args.quant_noise_pq,
+            qn_block_size=args.quant_noise_pq_block_size,
+        )
+        args.untie_weights_roberta = getattr(args, 'untie_weights_roberta', False)
+        # Share the token-embedding matrix with the LM head unless untying is requested.
+        self.lm_head = RobertaLMHead(
+            embed_dim=args.encoder_embed_dim,
+            output_dim=len(dictionary),
+            activation_fn=args.activation_fn,
+            weight=self.sentence_encoder.embed_tokens.weight if not args.untie_weights_roberta else None,
+        )
+
+    def forward(self, src_tokens, features_only=False, return_all_hiddens=False, masked_tokens=None, **unused):
+        """
+        Args:
+            src_tokens (LongTensor): input tokens of shape `(batch, src_len)`
+            features_only (bool, optional): skip LM head and just return
+                features. If True, the output will be of shape
+                `(batch, src_len, embed_dim)`.
+            return_all_hiddens (bool, optional): also return all of the
+                intermediate hidden states (default: False).
+            masked_tokens (optional): passed to the LM head, which then
+                projects only the positions it selects (default: None).
+
+        Returns:
+            tuple:
+                - the LM output of shape `(batch, src_len, vocab)` (features if features_only)
+                - {'inner_states': list of `(src_len, batch, embed_dim)` hidden states, or None}
+        """
+        x, extra = self.extract_features(src_tokens, return_all_hiddens=return_all_hiddens)
+        if not features_only:
+            x = self.output_layer(x, masked_tokens=masked_tokens)
+        return x, extra
+
+    def extract_features(self, src_tokens, return_all_hiddens=False, **unused):
+        inner_states, _ = self.sentence_encoder(
+            src_tokens,
+            last_state_only=not return_all_hiddens,
+        )
+        features = inner_states[-1].transpose(0, 1)  # T x B x C -> B x T x C
+        return features, {'inner_states': inner_states if return_all_hiddens else None}
+
+    def output_layer(self, features, masked_tokens=None, **unused):
+        return self.lm_head(features, masked_tokens)
+
+    def max_positions(self):
+        """Maximum output length supported by the encoder."""
+        return self.args.max_positions
+
+
+@register_model_architecture('roberta', 'roberta')
+def base_architecture(args):
+    args.encoder_layers = getattr(args, 'encoder_layers', 12)
+    args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 768)
+    args.encoder_ffn_embed_dim = getattr(args, 'encoder_ffn_embed_dim', 3072)
+    args.encoder_attention_heads = getattr(args, 'encoder_attention_heads', 12)
+
+    args.activation_fn = getattr(args, 'activation_fn', 'gelu')
+    args.pooler_activation_fn = getattr(args, 'pooler_activation_fn', 'tanh')
+
+    args.dropout = getattr(args, 'dropout', 0.1)
+    args.attention_dropout = getattr(args, 'attention_dropout', 0.1)
+    args.activation_dropout = getattr(args, 'activation_dropout', 0.0)
+    args.pooler_dropout = getattr(args, 'pooler_dropout', 0.0)
+    args.encoder_layers_to_keep = getattr(args, 'encoder_layers_to_keep', None)
+    args.encoder_layerdrop = getattr(args, 'encoder_layerdrop', 0.0)
+
+
+@register_model_architecture('roberta', 'roberta_base')
+def roberta_base_architecture(args):
+    base_architecture(args)
+
+
+@register_model_architecture('roberta', 'roberta_large')
+def roberta_large_architecture(args):
+    args.encoder_layers = getattr(args, 'encoder_layers', 24)
+    args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 1024)
+    args.encoder_ffn_embed_dim = getattr(args, 'encoder_ffn_embed_dim', 4096)
+    args.encoder_attention_heads = getattr(args, 'encoder_attention_heads', 16)
+    base_architecture(args)
+
+
+@register_model_architecture('roberta', 'xlm')
+def xlm_architecture(args):
+    args.encoder_layers = getattr(args, 'encoder_layers', 16)
+    args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 1280)
+    args.encoder_ffn_embed_dim = getattr(args, 'encoder_ffn_embed_dim', 1280*4)
+    args.encoder_attention_heads = getattr(args, 'encoder_attention_heads', 16)
+    base_architecture(args)
diff --git a/fairseq/models/roberta/model_camembert.py b/fairseq/models/roberta/model_camembert.py
new file mode 100644
index 0000000000000000000000000000000000000000..eb57d81d8df4f73d74dbdab5d877947cc39c03c1
--- /dev/null
+++ b/fairseq/models/roberta/model_camembert.py
@@ -0,0 +1,54 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+CamemBERT: a Tasty French Language Model
+"""
+
+from fairseq.models import register_model
+
+from .hub_interface import RobertaHubInterface
+from .model import RobertaModel
+
+
+@register_model('camembert')
+class CamembertModel(RobertaModel):
+    """RoBERTa model registered as 'camembert' with the CamemBERT hub archives."""
+
+    @classmethod
+    def hub_models(cls):
+        """Return the mapping from hub model name to archive URL."""
+        # Fetched over HTTPS so the downloaded checkpoints cannot be altered in transit.
+        return {
+            'camembert': 'https://dl.fbaipublicfiles.com/fairseq/models/camembert-base.tar.gz',
+            'camembert.v0': 'https://dl.fbaipublicfiles.com/fairseq/models/camembert-base.tar.gz',
+            'camembert-base': 'https://dl.fbaipublicfiles.com/fairseq/models/camembert-base.tar.gz',
+            'camembert-large': 'https://dl.fbaipublicfiles.com/fairseq/models/camembert-large.tar.gz',
+            'camembert-base-ccnet': 'https://dl.fbaipublicfiles.com/fairseq/models/camembert-base-ccnet.tar.gz',
+            'camembert-base-ccnet-4gb': 'https://dl.fbaipublicfiles.com/fairseq/models/camembert-base-ccnet-4gb.tar.gz',
+            'camembert-base-wikipedia-4gb': 'https://dl.fbaipublicfiles.com/fairseq/models/camembert-base-wikipedia-4gb.tar.gz',
+            'camembert-base-oscar-4gb': 'https://dl.fbaipublicfiles.com/fairseq/models/camembert-base-oscar-4gb.tar.gz',
+        }
+
+    @classmethod
+    def from_pretrained(cls, model_name_or_path, checkpoint_file='model.pt', data_name_or_path='.', bpe='sentencepiece', **kwargs):
+        """
+        Load a pretrained checkpoint and wrap it in a :class:`RobertaHubInterface`.
+
+        All arguments are forwarded to ``hub_utils.from_pretrained`` together
+        with this class's ``hub_models()`` as the archive map, and
+        ``load_checkpoint_heads=True`` is always passed. Only the first of
+        the loaded models is wrapped.
+        """
+        from fairseq import hub_utils
+        x = hub_utils.from_pretrained(
+            model_name_or_path,
+            checkpoint_file,
+            data_name_or_path,
+            archive_map=cls.hub_models(),
+            bpe=bpe,
+            load_checkpoint_heads=True,
+            **kwargs,
+        )
+        return RobertaHubInterface(x['args'], x['task'], x['models'][0])
diff --git a/fairseq/models/roberta/model_xlmr.py b/fairseq/models/roberta/model_xlmr.py
new file mode 100644
index 0000000000000000000000000000000000000000..fa71a27d12ad237012a65019e3c60669b8837055
--- /dev/null
+++ b/fairseq/models/roberta/model_xlmr.py
@@ -0,0 +1,48 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+Unsupervised Cross-lingual Representation Learning at Scale
+"""
+
+from fairseq.models import register_model
+
+from .hub_interface import RobertaHubInterface
+from .model import RobertaModel
+
+
+@register_model('xlmr')
+class XLMRModel(RobertaModel):
+    """RoBERTa model registered as 'xlmr' with the XLM-R hub archives."""
+
+    @classmethod
+    def hub_models(cls):
+        """Return the mapping from hub model name to archive URL."""
+        # Fetched over HTTPS so the downloaded checkpoints cannot be altered in transit.
+        return {
+            'xlmr.base': 'https://dl.fbaipublicfiles.com/fairseq/models/xlmr.base.tar.gz',
+            'xlmr.large': 'https://dl.fbaipublicfiles.com/fairseq/models/xlmr.large.tar.gz',
+        }
+
+    @classmethod
+    def from_pretrained(cls, model_name_or_path, checkpoint_file='model.pt', data_name_or_path='.', bpe='sentencepiece', **kwargs):
+        """
+        Load a pretrained checkpoint and wrap it in a :class:`RobertaHubInterface`.
+
+        All arguments are forwarded to ``hub_utils.from_pretrained`` together
+        with this class's ``hub_models()`` as the archive map, and
+        ``load_checkpoint_heads=True`` is always passed. Only the first of
+        the loaded models is wrapped.
+        """
+        from fairseq import hub_utils
+        x = hub_utils.from_pretrained(
+            model_name_or_path,
+            checkpoint_file,
+            data_name_or_path,
+            archive_map=cls.hub_models(),
+            bpe=bpe,
+            load_checkpoint_heads=True,
+            **kwargs,
+        )
+        return RobertaHubInterface(x['args'], x['task'], x['models'][0])
diff --git a/fairseq/models/transformer.py b/fairseq/models/transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..6fd5c2bd05de761ce8c1c8bdcd37dcc4c14c62ec
--- /dev/null
+++ b/fairseq/models/transformer.py
@@ -0,0 +1,998 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+from typing import Any, Dict, List, Optional, Tuple
+
+import torch
+import torch.nn as nn
+from fairseq import options, utils
+from fairseq.models import (
+    FairseqEncoder,
+    FairseqEncoderDecoderModel,
+    FairseqIncrementalDecoder,
+    register_model,
+    register_model_architecture,
+)
+from fairseq.models.fairseq_encoder import EncoderOut
+from fairseq.modules import (
+    AdaptiveSoftmax,
+    FairseqDropout,
+    LayerDropModuleList,
+    LayerNorm,
+    PositionalEmbedding,
+    SinusoidalPositionalEmbedding,
+    TransformerDecoderLayer,
+    TransformerEncoderLayer,
+)
+from fairseq.modules.quant_noise import quant_noise as apply_quant_noise_
+from torch import Tensor
+
+
+DEFAULT_MAX_SOURCE_POSITIONS = 1024
+DEFAULT_MAX_TARGET_POSITIONS = 1024
+
+
+@register_model("transformer")
+class TransformerModel(FairseqEncoderDecoderModel):
+    """
+    Transformer model from `"Attention Is All You Need" (Vaswani, et al, 2017)
+    <https://arxiv.org/abs/1706.03762>`_.
+
+    Args:
+        encoder (TransformerEncoder): the encoder
+        decoder (TransformerDecoder): the decoder
+
+    The Transformer model provides the following named architectures and
+    command-line arguments:
+
+    .. argparse::
+        :ref: fairseq.models.transformer_parser
+        :prog:
+    """
+
+    @classmethod
+    def hub_models(cls):
+        # fmt: off
+
+        def moses_subword(path):
+            return {
+                'path': path,
+                'tokenizer': 'moses',
+                'bpe': 'subword_nmt',
+            }
+
+        def moses_fastbpe(path):
+            return {
+                'path': path,
+                'tokenizer': 'moses',
+                'bpe': 'fastbpe',
+            }
+
+        return {
+            'transformer.wmt14.en-fr': moses_subword('https://dl.fbaipublicfiles.com/fairseq/models/wmt14.en-fr.joined-dict.transformer.tar.bz2'),
+            'transformer.wmt16.en-de': 'https://dl.fbaipublicfiles.com/fairseq/models/wmt16.en-de.joined-dict.transformer.tar.bz2',
+            'transformer.wmt18.en-de': moses_subword('https://dl.fbaipublicfiles.com/fairseq/models/wmt18.en-de.ensemble.tar.gz'),
+            'transformer.wmt19.en-de': moses_fastbpe('https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-de.joined-dict.ensemble.tar.gz'),
+            'transformer.wmt19.en-ru': moses_fastbpe('https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-ru.ensemble.tar.gz'),
+            'transformer.wmt19.de-en': moses_fastbpe('https://dl.fbaipublicfiles.com/fairseq/models/wmt19.de-en.joined-dict.ensemble.tar.gz'),
+            'transformer.wmt19.ru-en': moses_fastbpe('https://dl.fbaipublicfiles.com/fairseq/models/wmt19.ru-en.ensemble.tar.gz'),
+            'transformer.wmt19.en-de.single_model': moses_fastbpe('https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-de.joined-dict.single_model.tar.gz'),
+            'transformer.wmt19.en-ru.single_model': moses_fastbpe('https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-ru.single_model.tar.gz'),
+            'transformer.wmt19.de-en.single_model': moses_fastbpe('https://dl.fbaipublicfiles.com/fairseq/models/wmt19.de-en.joined-dict.single_model.tar.gz'),
+            'transformer.wmt19.ru-en.single_model': moses_fastbpe('https://dl.fbaipublicfiles.com/fairseq/models/wmt19.ru-en.single_model.tar.gz'),
+        }
+        # fmt: on
+
+    def __init__(self, args, encoder, decoder):
+        super().__init__(encoder, decoder)
+        self.args = args
+        self.supports_align_args = True
+
+    @staticmethod
+    def add_args(parser):
+        """Add model-specific arguments to the parser."""
+        # fmt: off
+        parser.add_argument('--activation-fn',
+                            choices=utils.get_available_activation_fns(),
+                            help='activation function to use')
+        parser.add_argument('--dropout', type=float, metavar='D',
+                            help='dropout probability')
+        parser.add_argument('--attention-dropout', type=float, metavar='D',
+                            help='dropout probability for attention weights')
+        parser.add_argument('--activation-dropout', '--relu-dropout', type=float, metavar='D',
+                            help='dropout probability after activation in FFN.')
+        parser.add_argument('--encoder-embed-path', type=str, metavar='STR',
+                            help='path to pre-trained encoder embedding')
+        parser.add_argument('--encoder-embed-dim', type=int, metavar='N',
+                            help='encoder embedding dimension')
+        parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='N',
+                            help='encoder embedding dimension for FFN')
+        parser.add_argument('--encoder-layers', type=int, metavar='N',
+                            help='num encoder layers')
+        parser.add_argument('--encoder-attention-heads', type=int, metavar='N',
+                            help='num encoder attention heads')
+        parser.add_argument('--encoder-normalize-before', action='store_true',
+                            help='apply layernorm before each encoder block')
+        parser.add_argument('--encoder-learned-pos', action='store_true',
+                            help='use learned positional embeddings in the encoder')
+        parser.add_argument('--decoder-embed-path', type=str, metavar='STR',
+                            help='path to pre-trained decoder embedding')
+        parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
+                            help='decoder embedding dimension')
+        parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
+                            help='decoder embedding dimension for FFN')
+        parser.add_argument('--decoder-layers', type=int, metavar='N',
+                            help='num decoder layers')
+        parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
+                            help='num decoder attention heads')
+        parser.add_argument('--decoder-learned-pos', action='store_true',
+                            help='use learned positional embeddings in the decoder')
+        parser.add_argument('--decoder-normalize-before', action='store_true',
+                            help='apply layernorm before each decoder block')
+        parser.add_argument('--decoder-output-dim', type=int, metavar='N',
+                            help='decoder output dimension (extra linear layer '
+                                 'if different from decoder embed dim)')
+        parser.add_argument('--share-decoder-input-output-embed', action='store_true',
+                            help='share decoder input and output embeddings')
+        parser.add_argument('--share-all-embeddings', action='store_true',
+                            help='share encoder, decoder and output embeddings'
+                                 ' (requires shared dictionary and embed dim)')
+        parser.add_argument('--no-token-positional-embeddings', default=False, action='store_true',
+                            help='if set, disables positional embeddings (outside self attention)')
+        parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
+                            help='comma separated list of adaptive softmax cutoff points. '
+                                 'Must be used with adaptive_loss criterion')
+        parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',
+                            help='sets adaptive softmax dropout for the tail projections')
+        parser.add_argument('--layernorm-embedding', action='store_true',
+                            help='add layernorm to embedding')
+        parser.add_argument('--no-scale-embedding', action='store_true',
+                            help='if True, dont scale embeddings')
+        # args for "Cross+Self-Attention for Transformer Models" (Peitz et al., 2019)
+        parser.add_argument('--no-cross-attention', default=False, action='store_true',
+                            help='do not perform cross-attention')
+        parser.add_argument('--cross-self-attention', default=False, action='store_true',
+                            help='perform cross+self-attention')
+        # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
+        parser.add_argument('--encoder-layerdrop', type=float, metavar='D', default=0,
+                            help='LayerDrop probability for encoder')
+        parser.add_argument('--decoder-layerdrop', type=float, metavar='D', default=0,
+                            help='LayerDrop probability for decoder')
+        parser.add_argument('--encoder-layers-to-keep', default=None,
+                            help='which layers to *keep* when pruning as a comma-separated list')
+        parser.add_argument('--decoder-layers-to-keep', default=None,
+                            help='which layers to *keep* when pruning as a comma-separated list')
+        # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
+        parser.add_argument('--quant-noise-pq', type=float, metavar='D', default=0,
+                            help='iterative PQ quantization noise at training time')
+        parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8,
+                            help='block size of quantization noise at training time')
+        parser.add_argument('--quant-noise-scalar', type=float, metavar='D', default=0,
+                            help='scalar quantization noise and scalar quantization at training time')
+        # fmt: on
+
+    @classmethod
+    def build_model(cls, args, task):
+        """Assemble the encoder, the decoder and the model that wraps them."""
+
+        # back-fill defaults for options an older set of args may not carry
+        base_architecture(args)
+
+        for side in ("encoder", "decoder"):
+            kept = getattr(args, side + "_layers_to_keep")
+            if kept:  # a keep-list fixes the layer count to the number of listed layers
+                setattr(args, side + "_layers", len(kept.split(",")))
+
+        for option, fallback in (("max_source_positions", DEFAULT_MAX_SOURCE_POSITIONS),
+                                 ("max_target_positions", DEFAULT_MAX_TARGET_POSITIONS)):
+            if getattr(args, option, None) is None:
+                setattr(args, option, fallback)
+
+        src_dict = task.source_dictionary
+        tgt_dict = task.target_dictionary
+
+        sharing = args.share_all_embeddings
+        if sharing:  # one table for both sides: vocabularies, widths and paths must agree
+            if src_dict != tgt_dict:
+                raise ValueError("--share-all-embeddings requires a joined dictionary")
+            if args.encoder_embed_dim != args.decoder_embed_dim:
+                raise ValueError(
+                    "--share-all-embeddings requires --encoder-embed-dim to match --decoder-embed-dim"
+                )
+            if args.decoder_embed_path and (
+                args.decoder_embed_path != args.encoder_embed_path
+            ):
+                raise ValueError(
+                    "--share-all-embeddings not compatible with --decoder-embed-path"
+                )
+        encoder_embed_tokens = cls.build_embedding(
+            args, src_dict, args.encoder_embed_dim, args.encoder_embed_path
+        )
+        if sharing:
+            decoder_embed_tokens = encoder_embed_tokens
+            args.share_decoder_input_output_embed = True
+        else:
+            decoder_embed_tokens = cls.build_embedding(
+                args, tgt_dict, args.decoder_embed_dim, args.decoder_embed_path
+            )
+
+        encoder = cls.build_encoder(args, src_dict, encoder_embed_tokens)
+        decoder = cls.build_decoder(args, tgt_dict, decoder_embed_tokens)
+        return cls(args, encoder, decoder)
+
+    @classmethod
+    def build_embedding(cls, args, dictionary, embed_dim, path=None):
+        """Create the token embedding for *dictionary*, optionally seeded from *path*."""
+        table = Embedding(len(dictionary), embed_dim, dictionary.pad())
+        if not path:
+            return table
+
+        # a path was given: parse it and hand the result to utils.load_embedding
+        pretrained = utils.parse_embedding(path)
+        utils.load_embedding(pretrained, dictionary, table)
+        return table
+
+    @classmethod
+    def build_encoder(cls, args, src_dict, embed_tokens):
+        return TransformerEncoder(args, src_dict, embed_tokens)  # reached via cls.build_encoder in build_model, so a subclass may override it
+
+    @classmethod
+    def build_decoder(cls, args, tgt_dict, embed_tokens):
+        """Create the decoder, passing ``args.no_cross_attention`` on as ``no_encoder_attn``."""
+        # getattr keeps args objects that lack the option working (treated as False)
+        skip_cross_attn = getattr(args, "no_cross_attention", False)
+        return TransformerDecoder(
+            args, tgt_dict, embed_tokens, no_encoder_attn=skip_cross_attn
+        )
+
+    # TorchScript doesn't support optional arguments with variable length (**kwargs).
+    # Current workaround is to add union of all arguments in child classes.
+    def forward(
+        self,
+        src_tokens,
+        src_lengths,
+        prev_output_tokens,
+        return_all_hiddens: bool = True,  # forwarded to both the encoder and the decoder
+        features_only: bool = False,
+        alignment_layer: Optional[int] = None,
+        alignment_heads: Optional[int] = None,
+    ):
+        """
+        Encode *src_tokens*, then run the decoder on *prev_output_tokens* and return its output.
+
+        Copied from the base class, but without ``**kwargs``, which TorchScript does not
+        support; the remaining keyword options are handed to ``self.decoder`` unchanged.
+        """
+        encoder_out = self.encoder(
+            src_tokens, src_lengths=src_lengths, return_all_hiddens=return_all_hiddens
+        )
+        decoder_out = self.decoder(
+            prev_output_tokens,
+            encoder_out=encoder_out,
+            features_only=features_only,
+            alignment_layer=alignment_layer,
+            alignment_heads=alignment_heads,
+            src_lengths=src_lengths,
+            return_all_hiddens=return_all_hiddens,
+        )
+        return decoder_out  # returned as-is, whatever structure the decoder produced
+
+    # get_normalized_probs is defined on the base Fairseq model, which is not
+    # scriptable, so it is redefined here as an exported method that simply calls
+    # the base class's scriptable helper.
+    @torch.jit.export
+    def get_normalized_probs(
+        self,
+        net_output: Tuple[Tensor, Optional[Dict[str, List[Optional[Tensor]]]]],
+        log_probs: bool,
+        sample: Optional[Dict[str, Tensor]] = None,
+    ):
+        """Get normalized probabilities (or log probs) from a net's output."""
+        return self.get_normalized_probs_scriptable(net_output, log_probs, sample)  # all three arguments pass through unchanged
+
+
+class TransformerEncoder(FairseqEncoder):
+    """
+    Transformer encoder consisting of *args.encoder_layers* layers. Each layer
+    is a :class:`TransformerEncoderLayer`.
+
+    Args:
+        args (argparse.Namespace): parsed command-line arguments
+        dictionary (~fairseq.data.Dictionary): encoding dictionary
+        embed_tokens (torch.nn.Embedding): input embedding
+    """
+
+    def __init__(self, args, dictionary, embed_tokens):
+        super().__init__(dictionary)
+        self.register_buffer("version", torch.Tensor([3]))  # upgrade_state_dict_named reads the saved "version" entry back
+
+        self.dropout_module = FairseqDropout(args.dropout, module_name=self.__class__.__name__)
+        self.encoder_layerdrop = args.encoder_layerdrop
+
+        embed_dim = embed_tokens.embedding_dim  # width is taken from the embedding, not from args
+        self.padding_idx = embed_tokens.padding_idx
+        self.max_source_positions = args.max_source_positions
+
+        self.embed_tokens = embed_tokens
+
+        self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(embed_dim)  # sqrt(dim) scaling unless disabled
+
+        self.embed_positions = (
+            PositionalEmbedding(
+                args.max_source_positions,
+                embed_dim,
+                self.padding_idx,
+                learned=args.encoder_learned_pos,
+            )
+            if not args.no_token_positional_embeddings
+            else None
+        )
+
+        if getattr(args, "layernorm_embedding", False):
+            self.layernorm_embedding = LayerNorm(embed_dim)
+        else:
+            self.layernorm_embedding = None
+
+        if not args.adaptive_input and args.quant_noise_pq > 0:  # only without adaptive input and with PQ noise requested
+            self.quant_noise = apply_quant_noise_(
+                nn.Linear(embed_dim, embed_dim, bias=False),
+                args.quant_noise_pq,
+                args.quant_noise_pq_block_size,
+            )
+        else:
+            self.quant_noise = None
+
+        if self.encoder_layerdrop > 0.0:  # the LayerDrop container is used only for a positive probability
+            self.layers = LayerDropModuleList(p=self.encoder_layerdrop)
+        else:
+            self.layers = nn.ModuleList([])
+        self.layers.extend(
+            [self.build_encoder_layer(args) for i in range(args.encoder_layers)]
+        )
+        self.num_layers = len(self.layers)
+
+        if args.encoder_normalize_before:  # optional norm that forward() applies after the last layer
+            self.layer_norm = LayerNorm(embed_dim)
+        else:
+            self.layer_norm = None
+
+    def build_encoder_layer(self, args):
+        return TransformerEncoderLayer(args)  # called once per layer from __init__
+
+    def forward_embedding(self, src_tokens):
+        """Embed *src_tokens*; return ``(x, embed)`` where *embed* is the scaled token embedding alone."""
+        x = embed = self.embed_scale * self.embed_tokens(src_tokens)
+        if self.embed_positions is not None:
+            x = embed + self.embed_positions(src_tokens)  # new tensor, so *embed* stays position-free
+        if self.layernorm_embedding is not None:
+            x = self.layernorm_embedding(x)
+        x = self.dropout_module(x)
+        if self.quant_noise is not None:
+            x = self.quant_noise(x)
+        return x, embed  # x carries positions/layernorm/dropout/quant-noise when those are configured
+
+    def forward(self, src_tokens, src_lengths, return_all_hiddens: bool = False):
+        """
+        Args:
+            src_tokens (LongTensor): tokens in the source language of shape
+                `(batch, src_len)`
+            src_lengths (torch.LongTensor): lengths of each source sentence of
+                shape `(batch)`; not read in this method body
+            return_all_hiddens (bool, optional): also return all of the
+                intermediate hidden states (default: False).
+
+        Returns:
+            namedtuple:
+                - **encoder_out** (Tensor): the last encoder layer's output of
+                  shape `(src_len, batch, embed_dim)`
+                - **encoder_padding_mask** (ByteTensor): the positions of
+                  padding elements of shape `(batch, src_len)`
+                - **encoder_embedding** (Tensor): the (scaled) embedding lookup
+                  of shape `(batch, src_len, embed_dim)`
+                - **encoder_states** (List[Tensor]): all intermediate
+                  hidden states of shape `(src_len, batch, embed_dim)`.
+                  Only populated if *return_all_hiddens* is True (None otherwise).
+        """
+        x, encoder_embedding = self.forward_embedding(src_tokens)
+
+        # B x T x C -> T x B x C
+        x = x.transpose(0, 1)
+
+        # compute padding mask
+        encoder_padding_mask = src_tokens.eq(self.padding_idx)
+
+        encoder_states = [] if return_all_hiddens else None
+
+        # encoder layers
+        for layer in self.layers:
+            x = layer(x, encoder_padding_mask)
+            if return_all_hiddens:
+                assert encoder_states is not None  # presumably narrows the Optional for TorchScript -- TODO confirm
+                encoder_states.append(x)  # collected before the final layer_norm below is applied
+
+        if self.layer_norm is not None:
+            x = self.layer_norm(x)
+
+        return EncoderOut(
+            encoder_out=x,  # T x B x C
+            encoder_padding_mask=encoder_padding_mask,  # B x T
+            encoder_embedding=encoder_embedding,  # B x T x C
+            encoder_states=encoder_states,  # List[T x B x C]
+            src_tokens=None,  # not propagated by this encoder
+            src_lengths=None,  # not propagated by this encoder
+        )
+
+    @torch.jit.export
+    def reorder_encoder_out(self, encoder_out: EncoderOut, new_order):
+        """
+        Reorder encoder output according to *new_order*.
+
+        Args:
+            encoder_out: output from the ``forward()`` method
+            new_order (LongTensor): desired order
+
+        Returns:
+            *encoder_out* rearranged according to *new_order*
+        """
+        """
+        Since encoder_padding_mask and encoder_embedding are both of type
+        Optional[Tensor] in EncoderOut, they need to be copied as local
+        variables for Torchscript Optional refinement
+        """
+        encoder_padding_mask: Optional[Tensor] = encoder_out.encoder_padding_mask
+        encoder_embedding: Optional[Tensor] = encoder_out.encoder_embedding
+
+        new_encoder_out = (
+            encoder_out.encoder_out
+            if encoder_out.encoder_out is None
+            else encoder_out.encoder_out.index_select(1, new_order)  # dim 1 is the batch axis of T x B x C
+        )
+        new_encoder_padding_mask = (
+            encoder_padding_mask
+            if encoder_padding_mask is None
+            else encoder_padding_mask.index_select(0, new_order)  # dim 0 is the batch axis of B x T
+        )
+        new_encoder_embedding = (
+            encoder_embedding
+            if encoder_embedding is None
+            else encoder_embedding.index_select(0, new_order)
+        )
+        src_tokens = encoder_out.src_tokens
+        if src_tokens is not None:
+            src_tokens = src_tokens.index_select(0, new_order)
+
+        src_lengths = encoder_out.src_lengths
+        if src_lengths is not None:
+            src_lengths = src_lengths.index_select(0, new_order)
+
+        encoder_states = encoder_out.encoder_states
+        if encoder_states is not None:
+            for idx, state in enumerate(encoder_states):
+                encoder_states[idx] = state.index_select(1, new_order)  # NOTE: overwrites the entries of the caller's list in place
+
+        return EncoderOut(
+            encoder_out=new_encoder_out,  # T x B x C
+            encoder_padding_mask=new_encoder_padding_mask,  # B x T
+            encoder_embedding=new_encoder_embedding,  # B x T x C
+            encoder_states=encoder_states,  # List[T x B x C]
+            src_tokens=src_tokens,  # B x T
+            src_lengths=src_lengths,  # B x 1
+        )
+
+    def max_positions(self):
+        """Longest source length usable: the configured cap, tightened by the position table if any."""
+        if self.embed_positions is not None:
+            return min(self.max_source_positions, self.embed_positions.max_positions)
+        return self.max_source_positions
+
+    def upgrade_state_dict_named(self, state_dict, name):
+        """Rewrite the *state_dict* entries saved under *name* into the current layout and return it."""
+        sinusoidal = isinstance(self.embed_positions, SinusoidalPositionalEmbedding)
+        if sinusoidal:
+            weights_key = "{}.embed_positions.weights".format(name)
+            if weights_key in state_dict:
+                print("deleting {0}".format(weights_key))
+                del state_dict[weights_key]
+            float_key = "{}.embed_positions._float_tensor".format(name)
+            state_dict[float_key] = torch.FloatTensor(1)
+        # index the layers (rather than iterate the container) so every layer is visited
+        for i in range(self.num_layers):
+            layer_prefix = "{}.layers.{}".format(name, i)
+            self.layers[i].upgrade_state_dict_named(state_dict, layer_prefix)
+
+        version_key = "{}.version".format(name)
+        saved_version = utils.item(state_dict.get(version_key, torch.Tensor([1]))[0])
+        if saved_version < 2:
+            # earlier checkpoints did not normalize after the stack of layers
+            self.layer_norm = None
+            self.normalize = False
+            state_dict[version_key] = torch.Tensor([1])
+        return state_dict
+
+
+class TransformerDecoder(FairseqIncrementalDecoder):
+    """
+    Transformer decoder consisting of *args.decoder_layers* layers. Each layer
+    is a :class:`TransformerDecoderLayer`.
+
+    Args:
+        args (argparse.Namespace): parsed command-line arguments
+        dictionary (~fairseq.data.Dictionary): decoding dictionary
+        embed_tokens (torch.nn.Embedding): output embedding
+        no_encoder_attn (bool, optional): whether to attend to encoder outputs
+            (default: False).
+    """
+
+    def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False):
+        self.args = args
+        super().__init__(dictionary)
+        self.register_buffer("version", torch.Tensor([3]))  # upgrade_state_dict_named reads the saved "version" entry back
+        self._future_mask = torch.empty(0)  # empty placeholder; buffered_future_mask rebuilds it on demand
+
+        self.dropout_module = FairseqDropout(args.dropout, module_name=self.__class__.__name__)
+        self.decoder_layerdrop = args.decoder_layerdrop
+        self.share_input_output_embed = args.share_decoder_input_output_embed
+
+        input_embed_dim = embed_tokens.embedding_dim
+        embed_dim = args.decoder_embed_dim
+        self.embed_dim = embed_dim
+        self.output_embed_dim = args.decoder_output_dim
+
+        self.padding_idx = embed_tokens.padding_idx
+        self.max_target_positions = args.max_target_positions
+
+        self.embed_tokens = embed_tokens
+
+        self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(embed_dim)  # sqrt(dim) scaling unless disabled
+
+        if not args.adaptive_input and args.quant_noise_pq > 0:  # only without adaptive input and with PQ noise requested
+            self.quant_noise = apply_quant_noise_(
+                nn.Linear(embed_dim, embed_dim, bias=False),
+                args.quant_noise_pq,
+                args.quant_noise_pq_block_size,
+            )
+        else:
+            self.quant_noise = None
+
+        self.project_in_dim = (
+            Linear(input_embed_dim, embed_dim, bias=False)
+            if embed_dim != input_embed_dim  # only when the embedding width differs from the decoder width
+            else None
+        )
+
+        self.embed_positions = (
+            PositionalEmbedding(
+                args.max_target_positions,
+                embed_dim,
+                self.padding_idx,
+                learned=args.decoder_learned_pos,
+            )
+            if not args.no_token_positional_embeddings
+            else None
+        )
+
+        if getattr(args, "layernorm_embedding", False):
+            self.layernorm_embedding = LayerNorm(embed_dim)
+        else:
+            self.layernorm_embedding = None
+
+        self.cross_self_attention = getattr(args, "cross_self_attention", False)
+
+        if self.decoder_layerdrop > 0.0:  # the LayerDrop container is used only for a positive probability
+            self.layers = LayerDropModuleList(p=self.decoder_layerdrop)
+        else:
+            self.layers = nn.ModuleList([])
+        self.layers.extend(
+            [
+                self.build_decoder_layer(args, no_encoder_attn)
+                for _ in range(args.decoder_layers)
+            ]
+        )
+        self.num_layers = len(self.layers)
+
+        if args.decoder_normalize_before and not getattr(
+            args, "no_decoder_final_norm", False
+        ):
+            self.layer_norm = LayerNorm(embed_dim)
+        else:
+            self.layer_norm = None
+
+        self.project_out_dim = (
+            Linear(embed_dim, self.output_embed_dim, bias=False)
+            if embed_dim != self.output_embed_dim and not args.tie_adaptive_weights
+            else None
+        )
+
+        self.adaptive_softmax = None
+        self.output_projection = None
+        if args.adaptive_softmax_cutoff is not None:  # adaptive softmax: output_projection stays None
+            self.adaptive_softmax = AdaptiveSoftmax(
+                len(dictionary),
+                self.output_embed_dim,
+                options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
+                dropout=args.adaptive_softmax_dropout,
+                adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None,
+                factor=args.adaptive_softmax_factor,
+                tie_proj=args.tie_adaptive_proj,
+            )
+        elif self.share_input_output_embed:
+            self.output_projection = nn.Linear(
+                self.embed_tokens.weight.shape[1],
+                self.embed_tokens.weight.shape[0],
+                bias=False,
+            )
+            self.output_projection.weight = self.embed_tokens.weight  # tie: the projection reuses the embedding parameter
+        else:
+            self.output_projection = nn.Linear(
+                self.output_embed_dim, len(dictionary), bias=False
+            )
+            nn.init.normal_(
+                self.output_projection.weight, mean=0, std=self.output_embed_dim ** -0.5
+            )
+
+    def build_decoder_layer(self, args, no_encoder_attn=False):
+        return TransformerDecoderLayer(args, no_encoder_attn)  # called once per layer from __init__
+
+    def forward(
+        self,
+        prev_output_tokens,
+        encoder_out: Optional[EncoderOut] = None,
+        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
+        features_only: bool = False,
+        alignment_layer: Optional[int] = None,  # forwarded to extract_features
+        alignment_heads: Optional[int] = None,  # forwarded to extract_features
+        src_lengths: Optional[Any] = None,  # accepted but not read in this method
+        return_all_hiddens: bool = False,  # accepted but not read in this method
+    ):
+        """
+        Args:
+            prev_output_tokens (LongTensor): previous decoder outputs of shape
+                `(batch, tgt_len)`, for teacher forcing
+            encoder_out (optional): output from the encoder, used for
+                encoder-side attention
+            incremental_state (dict): dictionary used for storing state during
+                :ref:`Incremental decoding`
+            features_only (bool, optional): only return features without
+                applying output layer (default: False).
+
+        Returns:
+            tuple:
+                - the decoder's output of shape `(batch, tgt_len, vocab)`
+                - a dictionary with any model-specific outputs
+        """
+        x, extra = self.extract_features(
+            prev_output_tokens,
+            encoder_out=encoder_out,
+            incremental_state=incremental_state,
+            alignment_layer=alignment_layer,
+            alignment_heads=alignment_heads,
+        )
+        if not features_only:
+            x = self.output_layer(x)
+        return x, extra
+
+    def extract_features(
+        self,
+        prev_output_tokens,
+        encoder_out: Optional[EncoderOut] = None,
+        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
+        full_context_alignment: bool = False,
+        alignment_layer: Optional[int] = None,
+        alignment_heads: Optional[int] = None,
+    ):
+        return self.extract_features_scriptable(  # pure delegation: same arguments, passed positionally
+            prev_output_tokens,
+            encoder_out,
+            incremental_state,
+            full_context_alignment,
+            alignment_layer,
+            alignment_heads,
+        )
+
+    """
+    A scriptable subclass of this class has an extract_features method and calls
+    super().extract_features, but super() is not supported in torchscript. A copy of
+    this function is made to be used in the subclass instead.
+    """
+
+    def extract_features_scriptable(
+        self,
+        prev_output_tokens,
+        encoder_out: Optional[EncoderOut] = None,
+        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
+        full_context_alignment: bool = False,
+        alignment_layer: Optional[int] = None,
+        alignment_heads: Optional[int] = None,
+    ):
+        """
+        Similar to *forward* but only return features.
+
+        Includes several features from "Jointly Learning to Align and
+        Translate with Transformer Models" (Garg et al., EMNLP 2019).
+
+        Args:
+            full_context_alignment (bool, optional): don't apply
+                auto-regressive mask to self-attention (default: False).
+            alignment_layer (int, optional): return mean alignment over
+                heads at this layer (default: last layer).
+            alignment_heads (int, optional): only average alignment over
+                this many heads (default: all heads).
+
+        Returns:
+            tuple:
+                - the decoder's features of shape `(batch, tgt_len, embed_dim)`
+                - a dictionary with any model-specific outputs
+        """
+        if alignment_layer is None:
+            alignment_layer = self.num_layers - 1  # default to the index of the last layer
+
+        # embed positions
+        positions = (
+            self.embed_positions(
+                prev_output_tokens, incremental_state=incremental_state
+            )
+            if self.embed_positions is not None
+            else None
+        )
+
+        if incremental_state is not None:  # incremental decoding: keep only the newest token (and its position)
+            prev_output_tokens = prev_output_tokens[:, -1:]
+            if positions is not None:
+                positions = positions[:, -1:]
+
+        # embed tokens and positions
+        x = self.embed_scale * self.embed_tokens(prev_output_tokens)
+
+        if self.quant_noise is not None:
+            x = self.quant_noise(x)
+
+        if self.project_in_dim is not None:
+            x = self.project_in_dim(x)
+
+        if positions is not None:
+            x += positions  # in-place add onto the embedding tensor computed above
+
+        if self.layernorm_embedding is not None:
+            x = self.layernorm_embedding(x)
+
+        x = self.dropout_module(x)
+
+        # B x T x C -> T x B x C
+        x = x.transpose(0, 1)
+
+        self_attn_padding_mask: Optional[Tensor] = None
+        if self.cross_self_attention or prev_output_tokens.eq(self.padding_idx).any():  # mask stays None when neither holds
+            self_attn_padding_mask = prev_output_tokens.eq(self.padding_idx)
+
+        # decoder layers
+        attn: Optional[Tensor] = None
+        inner_states: List[Optional[Tensor]] = [x]  # starts with the embedded input, then one entry per layer
+        for idx, layer in enumerate(self.layers):
+            if incremental_state is None and not full_context_alignment:  # future mask only for full-sequence, causal runs
+                self_attn_mask = self.buffered_future_mask(x)
+            else:
+                self_attn_mask = None
+
+            x, layer_attn, _ = layer(
+                x,
+                encoder_out.encoder_out if encoder_out is not None else None,
+                encoder_out.encoder_padding_mask if encoder_out is not None else None,
+                incremental_state,
+                self_attn_mask=self_attn_mask,
+                self_attn_padding_mask=self_attn_padding_mask,
+                need_attn=bool((idx == alignment_layer)),  # attention is requested from the alignment layer only
+                need_head_weights=bool((idx == alignment_layer)),
+            )
+            inner_states.append(x)
+            if layer_attn is not None and idx == alignment_layer:
+                attn = layer_attn.float().to(x)
+
+        if attn is not None:
+            if alignment_heads is not None:
+                attn = attn[:alignment_heads]  # assumes heads are the leading dimension -- TODO confirm against the layer
+
+            # average probabilities over heads
+            attn = attn.mean(dim=0)
+
+        if self.layer_norm is not None:
+            x = self.layer_norm(x)
+
+        # T x B x C -> B x T x C
+        x = x.transpose(0, 1)
+
+        if self.project_out_dim is not None:
+            x = self.project_out_dim(x)
+
+        return x, {"attn": [attn], "inner_states": inner_states}  # "attn" is a one-element list and may hold None
+
+    def output_layer(self, features):
+        """Map decoder features to vocabulary scores, unless adaptive softmax is configured."""
+        if self.adaptive_softmax is not None:
+            # nothing is projected here; the features go back unchanged
+            return features
+        # project back to size of vocabulary
+        return self.output_projection(features)
+
+    def max_positions(self):
+        """Longest target length usable: the configured cap, tightened by the position table if any."""
+        if self.embed_positions is not None:
+            return min(self.max_target_positions, self.embed_positions.max_positions)
+        return self.max_target_positions
+
+    def buffered_future_mask(self, tensor):
+        dim = tensor.size(0)  # side length of the square mask returned below
+        # self._future_mask.device != tensor.device is not working in TorchScript. This is a workaround.
+        if (
+            self._future_mask.size(0) == 0  # nothing cached yet
+            or (not self._future_mask.device == tensor.device)  # cached on another device
+            or self._future_mask.size(0) < dim  # cached mask is too small
+        ):
+            self._future_mask = torch.triu(
+                utils.fill_with_neg_inf(torch.zeros([dim, dim])), 1  # diagonal=1: only entries above the main diagonal are kept
+            )
+        self._future_mask = self._future_mask.to(tensor)  # the converted mask replaces the cached one
+        return self._future_mask[:dim, :dim]  # a larger cached mask is cropped to dim x dim
+
+    def upgrade_state_dict_named(self, state_dict, name):
+        """Rename or drop the legacy entries of *state_dict* saved under *name*, then return it."""
+        if isinstance(self.embed_positions, SinusoidalPositionalEmbedding):
+            # the stored position table is dropped and only a placeholder tensor is kept
+            state_dict.pop("{}.embed_positions.weights".format(name), None)
+            placeholder_key = "{}.embed_positions._float_tensor".format(name)
+            state_dict[placeholder_key] = torch.FloatTensor(1)
+
+        # make sure the output projection is present under its own key
+        projection_key = f"{name}.output_projection.weight"
+        if projection_key not in state_dict:
+            tied = self.share_input_output_embed
+            # tied weights come from the input embedding, untied ones from the old "embed_out" entry
+            embed_out_key = f"{name}.embed_tokens.weight" if tied else f"{name}.embed_out"
+            if embed_out_key in state_dict:
+                state_dict[projection_key] = state_dict[embed_out_key]
+                if not tied:
+                    del state_dict[embed_out_key]
+
+        # old per-layer "layer_norms.<n>" entries and the names that replace them
+        layer_norm_map = {
+            "0": "self_attn_layer_norm",
+            "1": "encoder_attn_layer_norm",
+            "2": "final_layer_norm",
+        }
+        suffixes = ("weight", "bias")
+
+        for i in range(self.num_layers):
+            for old, new in layer_norm_map.items():
+                for m in suffixes:
+                    k = "{}.layers.{}.layer_norms.{}.{}".format(name, i, old, m)
+                    if k not in state_dict:
+                        continue
+                    renamed = "{}.layers.{}.{}.{}".format(name, i, new, m)
+                    state_dict[renamed] = state_dict.pop(k)
+
+        # a missing version entry is treated as version 1
+        version_key = "{}.version".format(name)
+        checkpoint_version = utils.item(state_dict.get(version_key, torch.Tensor([1]))[0])
+        if checkpoint_version <= 2:
+            # earlier checkpoints did not normalize after the stack of layers
+            self.layer_norm = None
+            self.normalize = False
+            state_dict[version_key] = torch.Tensor([1])
+
+        return state_dict
+
+
+def Embedding(num_embeddings, embedding_dim, padding_idx):
+    table = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
+    nn.init.normal_(table.weight, mean=0, std=embedding_dim ** -0.5)  # normal draw with std = dim ** -0.5
+    nn.init.constant_(table.weight[padding_idx], 0)  # the padding row is zeroed after the random draw
+    return table
+
+
+def Linear(in_features, out_features, bias=True):
+    layer = nn.Linear(in_features, out_features, bias)
+    nn.init.xavier_uniform_(layer.weight)  # weights: Xavier-uniform
+    if bias:  # the bias is only reset when one was requested
+        nn.init.constant_(layer.bias, 0.0)
+    return layer
+
+
+@register_model_architecture("transformer", "transformer")
+def base_architecture(args):
+    """Fill in, in place, every transformer option missing from *args* with its base default."""
+    args.encoder_embed_path = getattr(args, "encoder_embed_path", None)
+    args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512)
+    args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 2048)
+    args.encoder_layers = getattr(args, "encoder_layers", 6)
+    args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 8)
+    args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False)
+    args.encoder_learned_pos = getattr(args, "encoder_learned_pos", False)
+    args.decoder_embed_path = getattr(args, "decoder_embed_path", None)
+    args.decoder_embed_dim = getattr(args, "decoder_embed_dim", args.encoder_embed_dim)
+    args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", args.encoder_ffn_embed_dim)
+    args.decoder_layers = getattr(args, "decoder_layers", 6)
+    args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 8)
+    args.decoder_normalize_before = getattr(args, "decoder_normalize_before", False)
+    args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False)
+    args.attention_dropout = getattr(args, "attention_dropout", 0.0)
+    args.activation_dropout = getattr(args, "activation_dropout", 0.0)
+    args.activation_fn = getattr(args, "activation_fn", "relu")
+    args.dropout = getattr(args, "dropout", 0.1)
+    args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None)
+    args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0)
+    args.share_decoder_input_output_embed = getattr(args, "share_decoder_input_output_embed", False)
+    args.share_all_embeddings = getattr(args, "share_all_embeddings", False)
+    args.no_token_positional_embeddings = getattr(args, "no_token_positional_embeddings", False)
+    args.adaptive_input = getattr(args, "adaptive_input", False)
+    args.no_cross_attention = getattr(args, "no_cross_attention", False)
+    args.cross_self_attention = getattr(args, "cross_self_attention", False)
+
+    args.decoder_output_dim = getattr(args, "decoder_output_dim", args.decoder_embed_dim)
+    args.decoder_input_dim = getattr(args, "decoder_input_dim", args.decoder_embed_dim)
+    args.no_scale_embedding = getattr(args, "no_scale_embedding", False)
+    args.layernorm_embedding = getattr(args, "layernorm_embedding", False)
+    args.tie_adaptive_weights = getattr(args, "tie_adaptive_weights", False)
+    # newer options, most of them read without a fallback while the model is built; defaults mirror add_args
+    args.encoder_layers_to_keep = getattr(args, "encoder_layers_to_keep", None)
+    args.decoder_layers_to_keep = getattr(args, "decoder_layers_to_keep", None)
+    args.encoder_layerdrop = getattr(args, "encoder_layerdrop", 0)
+    args.decoder_layerdrop = getattr(args, "decoder_layerdrop", 0)
+    args.quant_noise_pq = getattr(args, "quant_noise_pq", 0)
+    args.quant_noise_pq_block_size = getattr(args, "quant_noise_pq_block_size", 8)
+    args.quant_noise_scalar = getattr(args, "quant_noise_scalar", 0)
+
+
+@register_model_architecture("transformer", "transformer_iwslt_de_en")
+def transformer_iwslt_de_en(args):
+    """IWSLT De-En preset: 512-d embeddings, 1024-d FFN, 4 heads and 6 layers on each side."""
+    preset = {"embed_dim": 512, "ffn_embed_dim": 1024, "attention_heads": 4, "layers": 6}
+    # encoder options are visited first, then the decoder ones
+    for side in ("encoder", "decoder"):
+        for suffix, value in preset.items():
+            option = "{}_{}".format(side, suffix)
+            # only fill in what the caller has not already chosen
+            setattr(args, option, getattr(args, option, value))
+    base_architecture(args)
+
+
+@register_model_architecture("transformer", "transformer_wmt_en_de")
+def transformer_wmt_en_de(args):
+    base_architecture(args)  # no overrides of its own: only the base defaults are applied
+
+
+# parameters used in the "Attention Is All You Need" paper (Vaswani et al., 2017)
+@register_model_architecture("transformer", "transformer_vaswani_wmt_en_de_big")
+def transformer_vaswani_wmt_en_de_big(args):
+    """Big preset: 1024-d embeddings, 4096-d FFN, 16 heads, dropout 0.3, encoder_normalize_before off."""
+    for option, value in (
+        ("encoder_embed_dim", 1024), ("encoder_ffn_embed_dim", 4096),
+        ("encoder_attention_heads", 16), ("encoder_normalize_before", False),
+        ("decoder_embed_dim", 1024), ("decoder_ffn_embed_dim", 4096),
+        ("decoder_attention_heads", 16), ("dropout", 0.3),
+    ):
+        setattr(args, option, getattr(args, option, value))  # caller-supplied values win
+    base_architecture(args)
+
+
+@register_model_architecture("transformer", "transformer_vaswani_wmt_en_fr_big")
+def transformer_vaswani_wmt_en_fr_big(args):
+    args.dropout = getattr(args, "dropout", 0.1)  # set first, so the en-de big preset's 0.3 fallback is not used
+    transformer_vaswani_wmt_en_de_big(args)  # everything else comes from the en-de big preset
+
+
+@register_model_architecture("transformer", "transformer_wmt_en_de_big")
+def transformer_wmt_en_de_big(args):
+    args.attention_dropout = getattr(args, "attention_dropout", 0.1)  # set first, so the base fallback of 0.0 is not used
+    transformer_vaswani_wmt_en_de_big(args)  # everything else comes from the Vaswani big preset
+
+
+# default parameters used in tensor2tensor implementation
+@register_model_architecture("transformer", "transformer_wmt_en_de_big_t2t")
+def transformer_wmt_en_de_big_t2t(args):
+    """Big preset with normalize-before on both sides and 0.1 attention/activation dropout."""
+    for option, value in (("encoder_normalize_before", True), ("decoder_normalize_before", True),
+                          ("attention_dropout", 0.1), ("activation_dropout", 0.1)):
+        setattr(args, option, getattr(args, option, value))  # caller-supplied values win
+    transformer_vaswani_wmt_en_de_big(args)
diff --git a/fairseq/models/transformer_align.py b/fairseq/models/transformer_align.py
new file mode 100644
index 0000000000000000000000000000000000000000..4195ff398264936d579574ae646865bd01dac809
--- /dev/null
+++ b/fairseq/models/transformer_align.py
@@ -0,0 +1,93 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from fairseq.models import register_model, register_model_architecture
+from fairseq.models.transformer import (
+    base_architecture,
+    transformer_wmt_en_de_big,
+    TransformerModel,
+)
+
+
+@register_model("transformer_align")
+class TransformerAlignModel(TransformerModel):
+    """
+    See "Jointly Learning to Align and Translate with Transformer
+    Models" (Garg et al., EMNLP 2019).
+    """
+
+    def __init__(self, encoder, decoder, args):
+        super().__init__(args, encoder, decoder)
+        self.alignment_heads = args.alignment_heads
+        self.alignment_layer = args.alignment_layer
+        self.full_context_alignment = args.full_context_alignment
+
+    @staticmethod
+    def add_args(parser):
+        # fmt: off
+        super(TransformerAlignModel, TransformerAlignModel).add_args(parser)
+        parser.add_argument('--alignment-heads', type=int, metavar='D',
+                            help='Number of cross attention heads per layer to supervised with alignments')
+        parser.add_argument('--alignment-layer', type=int, metavar='D',
+                            help='Layer number which has to be supervised. 0 corresponding to the bottommost layer.')
+        parser.add_argument('--full-context-alignment', type=bool, metavar='D',
+                            help='Whether or not alignment is supervised conditioned on the full target context.')
+        # fmt: on
+
+    @classmethod
+    def build_model(cls, args, task):
+        # set any default arguments
+        transformer_align(args)
+
+        transformer_model = TransformerModel.build_model(args, task)
+        return TransformerAlignModel(
+            transformer_model.encoder, transformer_model.decoder, args
+        )
+
+    def forward(self, src_tokens, src_lengths, prev_output_tokens):
+        encoder_out = self.encoder(src_tokens, src_lengths)
+        return self.forward_decoder(prev_output_tokens, encoder_out)
+
+    def forward_decoder(
+        self,
+        prev_output_tokens,
+        encoder_out=None,
+        incremental_state=None,
+        features_only=False,
+        **extra_args,
+    ):
+        attn_args = {
+            "alignment_layer": self.alignment_layer,
+            "alignment_heads": self.alignment_heads,
+        }
+        decoder_out = self.decoder(prev_output_tokens, encoder_out, **attn_args)
+
+        if self.full_context_alignment:
+            attn_args["full_context_alignment"] = self.full_context_alignment
+            _, alignment_out = self.decoder(
+                prev_output_tokens,
+                encoder_out,
+                features_only=True,
+                **attn_args,
+                **extra_args,
+            )
+            decoder_out[1]["attn"] = alignment_out["attn"]
+
+        return decoder_out
+
+
+@register_model_architecture("transformer_align", "transformer_align")
+def transformer_align(args):  # alignment defaults layered on the base transformer architecture
+    args.alignment_heads = getattr(args, "alignment_heads", 1)
+    args.alignment_layer = getattr(args, "alignment_layer", 4)  # per the option help, 0 is the bottommost layer
+    args.full_context_alignment = getattr(args, "full_context_alignment", False)
+    base_architecture(args)
+
+
+@register_model_architecture("transformer_align", "transformer_wmt_en_de_big_align")
+def transformer_wmt_en_de_big_align(args):  # same head/layer defaults, on top of transformer_wmt_en_de_big
+    args.alignment_heads = getattr(args, "alignment_heads", 1)
+    args.alignment_layer = getattr(args, "alignment_layer", 4)
+    transformer_wmt_en_de_big(args)  # NOTE: full_context_alignment is not defaulted by this preset
diff --git a/fairseq/models/transformer_from_pretrained_xlm.py b/fairseq/models/transformer_from_pretrained_xlm.py
new file mode 100644
index 0000000000000000000000000000000000000000..bd03c8450fa35221e7cb6c10ccad7479bff517ce
--- /dev/null
+++ b/fairseq/models/transformer_from_pretrained_xlm.py
@@ -0,0 +1,155 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import os
+from typing import Any, Dict
+
+from fairseq import checkpoint_utils
+from fairseq.data.legacy.masked_lm_dictionary import MaskedLMDictionary
+from fairseq.models import register_model, register_model_architecture
+from fairseq.models.transformer import (
+    TransformerDecoder,
+    TransformerEncoder,
+    TransformerModel,
+    base_architecture as transformer_base_architecture,
+)
+
+
+@register_model("transformer_from_pretrained_xlm")
+class TransformerFromPretrainedXLMModel(TransformerModel):
+
+    @staticmethod
+    def add_args(parser):
+        """Add model-specific arguments to the parser."""
+        TransformerModel.add_args(parser)
+        parser.add_argument(
+            "--pretrained-xlm-checkpoint",
+            type=str,
+            metavar="STR",
+            help="XLM model to use for initializing transformer encoder and/or decoder",
+        )
+        parser.add_argument(
+            "--init-encoder-only",
+            action="store_true",
+            help="if set, don't load the XLM weights and embeddings into decoder",
+        )
+        parser.add_argument(
+            "--init-decoder-only",
+            action="store_true",
+            help="if set, don't load the XLM weights and embeddings into encoder",
+        )
+
+    @classmethod
+    def build_model(self, args, task, cls_dictionary=MaskedLMDictionary):
+        assert hasattr(args, "pretrained_xlm_checkpoint"), (
+            "You must specify a path for --pretrained-xlm-checkpoint to use "
+            "--arch transformer_from_pretrained_xlm"
+        )
+        assert isinstance(task.source_dictionary, cls_dictionary) and isinstance(
+            task.target_dictionary, cls_dictionary
+        ), (
+            "You should use a MaskedLMDictionary when using --arch "
+            "transformer_from_pretrained_xlm because the pretrained XLM model "
+            "was trained using data binarized with MaskedLMDictionary. "
+            "For translation, you may want to use --task "
+            "translation_from_pretrained_xlm"
+        )
+        assert not (
+            getattr(args, "init_encoder_only", False)
+            and getattr(args, "init_decoder_only", False)
+        ), "Only one of --init-encoder-only and --init-decoder-only can be set."
+        return super().build_model(args, task)
+
+    @classmethod
+    def build_encoder(cls, args, src_dict, embed_tokens):
+        return TransformerEncoderFromPretrainedXLM(args, src_dict, embed_tokens)
+
+    @classmethod
+    def build_decoder(cls, args, tgt_dict, embed_tokens):
+        return TransformerDecoderFromPretrainedXLM(args, tgt_dict, embed_tokens)
+
+
+def upgrade_state_dict_with_xlm_weights(
+    state_dict: Dict[str, Any], pretrained_xlm_checkpoint: str
+) -> Dict[str, Any]:
+    """
+    Load XLM weights into a Transformer encoder or decoder model.
+
+    Args:
+        state_dict: state dict for either TransformerEncoder or
+            TransformerDecoder
+        pretrained_xlm_checkpoint: checkpoint to load XLM weights from
+
+    Raises:
+        AssertionError: If architecture (num layers, attention heads, etc.)
+            does not match between the current Transformer encoder or
+            decoder and the pretrained_xlm_checkpoint
+    """
+    if not os.path.exists(pretrained_xlm_checkpoint):
+        raise IOError("Model file not found: {}".format(pretrained_xlm_checkpoint))
+
+    state = checkpoint_utils.load_checkpoint_to_cpu(pretrained_xlm_checkpoint)
+    xlm_state_dict = state["model"]
+    for key in xlm_state_dict.keys():
+
+        for search_key in ["embed_tokens", "embed_positions", "layers"]:
+            if search_key in key:
+                subkey = key[key.find(search_key):]
+                assert subkey in state_dict, (
+                    "{} Transformer encoder / decoder "
+                    "state_dict does not contain {}. Cannot "
+                    "load {} from pretrained XLM checkpoint "
+                    "{} into Transformer.".format(
+                        str(state_dict.keys()),
+                        subkey, key, pretrained_xlm_checkpoint)
+                    )
+
+                state_dict[subkey] = xlm_state_dict[key]
+    return state_dict
+
+
+class TransformerEncoderFromPretrainedXLM(TransformerEncoder):
+
+    def __init__(self, args, dictionary, embed_tokens):
+        super().__init__(args, dictionary, embed_tokens)
+        if getattr(args, 'init_decoder_only', False):
+            # Don't load XLM weights for encoder if --init-decoder-only
+            return
+
+        assert hasattr(args, "pretrained_xlm_checkpoint"), (
+            "--pretrained-xlm-checkpoint must be specified to load Transformer "
+            "encoder from pretrained XLM"
+        )
+        xlm_loaded_state_dict = upgrade_state_dict_with_xlm_weights(
+            state_dict=self.state_dict(),
+            pretrained_xlm_checkpoint=args.pretrained_xlm_checkpoint,
+        )
+        self.load_state_dict(xlm_loaded_state_dict, strict=True)
+
+
+class TransformerDecoderFromPretrainedXLM(TransformerDecoder):
+
+    def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False):
+        super().__init__(args, dictionary, embed_tokens, no_encoder_attn)
+        if getattr(args, 'init_encoder_only', False):
+            # Don't load XLM weights for decoder if --init-encoder-only
+            return
+        assert hasattr(args, "pretrained_xlm_checkpoint"), (
+            "--pretrained-xlm-checkpoint must be specified to load Transformer "
+            "decoder from pretrained XLM"
+        )
+
+        xlm_loaded_state_dict = upgrade_state_dict_with_xlm_weights(
+            state_dict=self.state_dict(),
+            pretrained_xlm_checkpoint=args.pretrained_xlm_checkpoint,
+        )
+        self.load_state_dict(xlm_loaded_state_dict, strict=True)
+
+
+@register_model_architecture(
+    "transformer_from_pretrained_xlm", "transformer_from_pretrained_xlm"
+)
+def base_architecture(args):  # adds no defaults of its own; reuses the plain transformer ones
+    transformer_base_architecture(args)
diff --git a/fairseq/models/transformer_lm.py b/fairseq/models/transformer_lm.py
new file mode 100644
index 0000000000000000000000000000000000000000..b59363900e3ad85fb9e149f575f562efe4740bc8
--- /dev/null
+++ b/fairseq/models/transformer_lm.py
@@ -0,0 +1,304 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from fairseq import options, utils
+from fairseq.models import (
+    FairseqLanguageModel,
+    register_model,
+    register_model_architecture,
+)
+from fairseq.models.transformer import (
+    Embedding,
+    TransformerDecoder,
+)
+from fairseq.modules import (
+    AdaptiveInput,
+    CharacterTokenEmbedder,
+)
+
+DEFAULT_MAX_TARGET_POSITIONS = 1024
+
+
+@register_model('transformer_lm')
+class TransformerLanguageModel(FairseqLanguageModel):
+
+    @classmethod
+    def hub_models(cls):
+
+        def moses_fastbpe(path):
+            return {
+                'path': path,
+                'tokenizer': 'moses',
+                'bpe': 'fastbpe',
+            }
+
+        return {
+            'transformer_lm.gbw.adaptive_huge': 'https://dl.fbaipublicfiles.com/fairseq/models/lm/adaptive_lm_gbw_huge.tar.bz2',
+            'transformer_lm.wiki103.adaptive': 'https://dl.fbaipublicfiles.com/fairseq/models/lm/adaptive_lm_wiki103.v2.tar.bz2',
+            'transformer_lm.wmt19.en': moses_fastbpe('https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt19.en.tar.bz2'),
+            'transformer_lm.wmt19.de': moses_fastbpe('https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt19.de.tar.bz2'),
+            'transformer_lm.wmt19.ru': moses_fastbpe('https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt19.ru.tar.bz2'),
+        }
+
+    def __init__(self, decoder):
+        super().__init__(decoder)
+
+    @staticmethod
+    def add_args(parser):
+        """Add model-specific arguments to the parser."""
+        # fmt: off
+        parser.add_argument('--activation-fn',
+                            choices=utils.get_available_activation_fns(),
+                            help='activation function to use')
+        parser.add_argument('--dropout', type=float, metavar='D',
+                            help='dropout probability')
+        parser.add_argument('--attention-dropout', type=float, metavar='D',
+                            help='dropout probability for attention weights')
+        parser.add_argument('--activation-dropout', '--relu-dropout', type=float, metavar='D',
+                            help='dropout probability after activation in FFN.')
+        parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
+                            help='decoder embedding dimension')
+        parser.add_argument('--decoder-output-dim', type=int, metavar='N',
+                            help='decoder output dimension')
+        parser.add_argument('--decoder-input-dim', type=int, metavar='N',
+                            help='decoder input dimension')
+        parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
+                            help='decoder embedding dimension for FFN')
+        parser.add_argument('--decoder-layers', type=int, metavar='N',
+                            help='num decoder layers')
+        parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
+                            help='num decoder attention heads')
+        parser.add_argument('--decoder-normalize-before', action='store_true',
+                            help='apply layernorm before each decoder block')
+        parser.add_argument('--no-decoder-final-norm', action='store_true',
+                            help='don\'t add an extra layernorm after the last decoder block')
+        parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
+                            help='comma separated list of adaptive softmax cutoff points. '
+                                 'Must be used with adaptive_loss criterion')
+        parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',
+                            help='sets adaptive softmax dropout for the tail projections')
+        parser.add_argument('--adaptive-softmax-factor', type=float, metavar='N',
+                            help='adaptive input factor')
+        parser.add_argument('--no-token-positional-embeddings', action='store_true',
+                            help='if set, disables positional embeddings (outside self attention)')
+        parser.add_argument('--share-decoder-input-output-embed', action='store_true',
+                            help='share decoder input and output embeddings')
+        parser.add_argument('--character-embeddings', action='store_true',
+                            help='if set, uses character embedding convolutions to produce token embeddings')
+        parser.add_argument('--character-filters', type=str, metavar='LIST',
+                            default='[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]',
+                            help='size of character embeddings')
+        parser.add_argument('--character-embedding-dim', default=4, type=int, metavar='N',
+                            help='size of character embeddings')
+        parser.add_argument('--char-embedder-highway-layers', default=2, type=int, metavar='N',
+                            help='number of highway layers for character token embeddder')
+        parser.add_argument('--adaptive-input', action='store_true',
+                            help='if set, uses adaptive input')
+        parser.add_argument('--adaptive-input-factor', type=float, metavar='N',
+                            help='adaptive input factor')
+        parser.add_argument('--adaptive-input-cutoff', metavar='EXPR',
+                            help='comma separated list of adaptive input cutoff points.')
+        parser.add_argument('--tie-adaptive-weights', action='store_true',
+                            help='if set, ties the weights of adaptive softmax and adaptive input')
+        parser.add_argument('--tie-adaptive-proj', action='store_true',
+                            help='if set, ties the projection weights of adaptive softmax and adaptive input')
+        parser.add_argument('--decoder-learned-pos', action='store_true',
+                            help='use learned positional embeddings in the decoder')
+        parser.add_argument('--layernorm-embedding', action='store_true',
+                            help='add layernorm to embedding')
+        parser.add_argument('--no-scale-embedding', action='store_true',
+                            help='if True, dont scale embeddings')
+        # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
+        parser.add_argument('--decoder-layerdrop', type=float, metavar='D', default=0,
+                            help='LayerDrop probability for decoder')
+        parser.add_argument('--decoder-layers-to-keep', default=None,
+                            help='which layers to *keep* when pruning as a comma-separated list')
+        # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
+        parser.add_argument('--quant-noise-pq', type=float, metavar='D', default=0,
+                            help='iterative PQ quantization noise at training time')
+        parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8,
+                            help='block size of quantization noise at training time')
+        parser.add_argument('--quant-noise-scalar', type=float, metavar='D', default=0,
+                            help='scalar quantization noise and scalar quantization at training time')
+        # fmt: on
+
+    @classmethod
+    def build_model(cls, args, task):
+        """Build a new model instance."""
+
+        # make sure all arguments are present in older models
+        base_lm_architecture(args)
+
+        if args.decoder_layers_to_keep:
+            args.decoder_layers = len(args.decoder_layers_to_keep.split(","))
+
+        if getattr(args, 'max_target_positions', None) is None:
+            args.max_target_positions = getattr(args, 'tokens_per_sample', DEFAULT_MAX_TARGET_POSITIONS)
+
+        if args.character_embeddings:
+            embed_tokens = CharacterTokenEmbedder(
+                task.source_dictionary, eval(args.character_filters),
+                args.character_embedding_dim, args.decoder_embed_dim,
+                args.char_embedder_highway_layers,
+            )
+        elif args.adaptive_input:
+            embed_tokens = AdaptiveInput(
+                len(task.source_dictionary), task.source_dictionary.pad(), args.decoder_input_dim,
+                args.adaptive_input_factor, args.decoder_embed_dim,
+                options.eval_str_list(args.adaptive_input_cutoff, type=int),
+                args.quant_noise_pq, args.quant_noise_pq_block_size,
+            )
+        else:
+            embed_tokens = cls.build_embedding(args, task.source_dictionary, args.decoder_input_dim)
+
+        if args.tie_adaptive_weights:
+            assert args.adaptive_input
+            assert args.adaptive_input_factor == args.adaptive_softmax_factor
+            assert args.adaptive_softmax_cutoff == args.adaptive_input_cutoff, '{} != {}'.format(
+                args.adaptive_softmax_cutoff, args.adaptive_input_cutoff)
+            assert args.decoder_input_dim == args.decoder_output_dim
+
+        decoder = TransformerDecoder(
+            args, task.target_dictionary, embed_tokens, no_encoder_attn=True,
+        )
+        return cls(decoder)
+
+    @classmethod
+    def build_embedding(cls, args, dictionary, embed_dim, path=None):
+        embed_tokens = Embedding(len(dictionary), embed_dim, dictionary.pad())
+        return embed_tokens
+
+
+@register_model_architecture('transformer_lm', 'transformer_lm')
+def base_lm_architecture(args):
+    """Fill in transformer_lm defaults, after translating options from older checkpoints."""
+    if hasattr(args, 'no_tie_adaptive_proj'):
+        # previous models defined --no-tie-adaptive-proj, so use the existence of
+        # that option to determine if this is an "old" model checkpoint
+        args.no_decoder_final_norm = True  # old models always set this to True
+        if args.no_tie_adaptive_proj is False:
+            args.tie_adaptive_proj = True
+    if hasattr(args, 'decoder_final_norm'):
+        args.no_decoder_final_norm = not args.decoder_final_norm  # may replace the True set above
+
+    args.dropout = getattr(args, 'dropout', 0.1)
+    args.attention_dropout = getattr(args, 'attention_dropout', 0.0)
+
+    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 512)
+    args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 2048)
+    args.decoder_layers = getattr(args, 'decoder_layers', 6)
+    args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 8)
+    args.adaptive_softmax_cutoff = getattr(args, 'adaptive_softmax_cutoff', None)
+    args.adaptive_softmax_dropout = getattr(args, 'adaptive_softmax_dropout', 0)
+    args.adaptive_softmax_factor = getattr(args, 'adaptive_softmax_factor', 4)
+    args.decoder_learned_pos = getattr(args, 'decoder_learned_pos', False)
+    args.activation_fn = getattr(args, 'activation_fn', 'relu')
+
+    args.add_bos_token = getattr(args, 'add_bos_token', False)
+    args.no_token_positional_embeddings = getattr(args, 'no_token_positional_embeddings', False)
+    args.share_decoder_input_output_embed = getattr(args, 'share_decoder_input_output_embed', False)
+    args.character_embeddings = getattr(args, 'character_embeddings', False)
+
+    # both fall back to decoder_embed_dim, which therefore has to be resolved before this point
+    args.decoder_output_dim = getattr(args, 'decoder_output_dim', args.decoder_embed_dim)
+    args.decoder_input_dim = getattr(args, 'decoder_input_dim', args.decoder_embed_dim)
+
+    # Model training is not stable without this
+    args.decoder_normalize_before = True  # unconditional: any value already on args is overwritten
+    args.no_decoder_final_norm = getattr(args, 'no_decoder_final_norm', False)
+
+    args.adaptive_input = getattr(args, 'adaptive_input', False)
+    args.adaptive_input_factor = getattr(args, 'adaptive_input_factor', 4)
+    args.adaptive_input_cutoff = getattr(args, 'adaptive_input_cutoff', None)
+
+    args.tie_adaptive_weights = getattr(args, 'tie_adaptive_weights', False)
+    args.tie_adaptive_proj = getattr(args, 'tie_adaptive_proj', False)
+
+    args.no_scale_embedding = getattr(args, 'no_scale_embedding', False)
+    args.layernorm_embedding = getattr(args, 'layernorm_embedding', False)
+
+
+@register_model_architecture('transformer_lm', 'transformer_lm_big')
+def transformer_lm_big(args):  # larger preset; only options not already on args are defaulted
+    args.decoder_layers = getattr(args, 'decoder_layers', 12)
+    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 1024)
+    args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 4096)
+    args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 16)
+    base_lm_architecture(args)  # fills the remaining defaults
+
+
+@register_model_architecture('transformer_lm', 'transformer_lm_wiki103')
+@register_model_architecture('transformer_lm', 'transformer_lm_baevski_wiki103')
+def transformer_lm_baevski_wiki103(args):
+    args.decoder_layers = getattr(args, 'decoder_layers', 16)
+    args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 8)
+    args.dropout = getattr(args, 'dropout', 0.3)
+    args.adaptive_input = getattr(args, 'adaptive_input', True)
+    args.tie_adaptive_weights = getattr(args, 'tie_adaptive_weights', True)
+    args.adaptive_input_cutoff = getattr(args, 'adaptive_input_cutoff', '20000,60000')
+    args.adaptive_softmax_cutoff = getattr(args, 'adaptive_softmax_cutoff', '20000,60000')
+    args.adaptive_softmax_dropout = getattr(args, 'adaptive_softmax_dropout', 0.2)
+    args.attention_dropout = getattr(args, 'attention_dropout', 0.1)
+    args.activation_dropout = getattr(args, 'activation_dropout', 0.1)
+    args.no_decoder_final_norm = getattr(args, 'no_decoder_final_norm', True)
+    args.tie_adaptive_proj = getattr(args, 'tie_adaptive_proj', True)
+    transformer_lm_big(args)
+
+
+@register_model_architecture('transformer_lm', 'transformer_lm_gbw')
+@register_model_architecture('transformer_lm', 'transformer_lm_baevski_gbw')
+def transformer_lm_baevski_gbw(args):  # registered under both names above
+    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 512)  # set first, so transformer_lm_big's 1024 is not used
+    args.dropout = getattr(args, 'dropout', 0.1)
+    args.attention_dropout = getattr(args, 'attention_dropout', 0.1)
+    args.no_decoder_final_norm = getattr(args, 'no_decoder_final_norm', True)
+    transformer_lm_big(args)
+
+
+@register_model_architecture('transformer_lm', 'transformer_lm_gpt')
+def transformer_lm_gpt(args):
+    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 768)
+    args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 3072)
+    args.decoder_layers = getattr(args, 'decoder_layers', 12)
+    args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 12)
+    args.dropout = getattr(args, 'dropout', 0.1)
+    args.attention_dropout = getattr(args, 'attention_dropout', 0.1)
+    args.activation_fn = getattr(args, 'activation_fn', 'gelu')
+    base_lm_architecture(args)
+
+
+@register_model_architecture('transformer_lm', 'transformer_lm_gpt2_small')
+def transformer_lm_gpt2_small(args):
+    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 1024)
+    args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 4096)
+    args.decoder_layers = getattr(args, 'decoder_layers', 24)
+    args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 16)
+    args.dropout = getattr(args, 'dropout', 0.1)
+    args.attention_dropout = getattr(args, 'attention_dropout', 0.1)
+    args.activation_fn = getattr(args, 'activation_fn', 'gelu')
+    base_lm_architecture(args)
+
+
+@register_model_architecture('transformer_lm', 'transformer_lm_gpt2_medium')
+def transformer_lm_gpt2_medium(args):
+    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 1280)
+    args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 5120)
+    args.decoder_layers = getattr(args, 'decoder_layers', 36)
+    args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 20)
+    args.dropout = getattr(args, 'dropout', 0.1)
+    args.attention_dropout = getattr(args, 'attention_dropout', 0.1)
+    args.activation_fn = getattr(args, 'activation_fn', 'gelu')
+    base_lm_architecture(args)
+
+
+@register_model_architecture('transformer_lm', 'transformer_lm_gpt2_big')
+def transformer_lm_gpt2_big(args):
+    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 1600)
+    args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 6400)
+    args.decoder_layers = getattr(args, 'decoder_layers', 48)
+    args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 25)
+    args.dropout = getattr(args, 'dropout', 0.1)
+    args.attention_dropout = getattr(args, 'attention_dropout', 0.1)
+    args.activation_fn = getattr(args, 'activation_fn', 'gelu')
+    base_lm_architecture(args)
diff --git a/fairseq/models/wav2vec/__init__.py b/fairseq/models/wav2vec/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..06cec18183ca14cd534d14558e8b44e25f3e69d5
--- /dev/null
+++ b/fairseq/models/wav2vec/__init__.py
@@ -0,0 +1,8 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .wav2vec import *  # noqa
+from .wav2vec2 import *  # noqa
+from .wav2vec2_asr import *  # noqa
diff --git a/fairseq/models/wav2vec/__pycache__/__init__.cpython-310.pyc b/fairseq/models/wav2vec/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0c9f317e1c542342e6f8c25c06a88809f46814b2
Binary files /dev/null and b/fairseq/models/wav2vec/__pycache__/__init__.cpython-310.pyc differ
diff --git a/fairseq/models/wav2vec/__pycache__/wav2vec.cpython-310.pyc b/fairseq/models/wav2vec/__pycache__/wav2vec.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fd8055aeb10332ffaffbe5770eb4f99f2386967f
Binary files /dev/null and b/fairseq/models/wav2vec/__pycache__/wav2vec.cpython-310.pyc differ
diff --git a/fairseq/models/wav2vec/__pycache__/wav2vec2.cpython-310.pyc b/fairseq/models/wav2vec/__pycache__/wav2vec2.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3510ed8ea5f865bb482b102f8c3813b826f5212b
Binary files /dev/null and b/fairseq/models/wav2vec/__pycache__/wav2vec2.cpython-310.pyc differ
diff --git a/fairseq/models/wav2vec/__pycache__/wav2vec2_asr.cpython-310.pyc b/fairseq/models/wav2vec/__pycache__/wav2vec2_asr.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3b8fe2eebf75c01e20816d66bcc2c1a00ed5e876
Binary files /dev/null and b/fairseq/models/wav2vec/__pycache__/wav2vec2_asr.cpython-310.pyc differ
diff --git a/fairseq/models/wav2vec/wav2vec.py b/fairseq/models/wav2vec/wav2vec.py
new file mode 100644
index 0000000000000000000000000000000000000000..905df824f3ae0f708fc269cc486d03df78611341
--- /dev/null
+++ b/fairseq/models/wav2vec/wav2vec.py
@@ -0,0 +1,735 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import math
+import sys
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from fairseq.models import BaseFairseqModel, register_model, register_model_architecture
+from fairseq.modules import (
+    Fp32GroupNorm,
+    Fp32LayerNorm,
+    GumbelVectorQuantizer,
+    KmeansVectorQuantizer,
+    TransposeLast,
+)
+from fairseq.utils import buffered_arange
+
+logger = logging.getLogger(__name__)
+
+
+@register_model("wav2vec")
+class Wav2VecModel(BaseFairseqModel):
+    @staticmethod
+    def add_args(parser):
+        """Add the wav2vec model options to ``parser`` through its ``add_argument`` method; returns None."""
+        parser.add_argument(
+            "--prediction-steps",
+            type=int,
+            metavar="N",
+            help="number of steps ahead to predict",
+        )
+        parser.add_argument(
+            "--sample-distance",
+            type=int,
+            metavar="N",
+            help="sample distance from target. does not work properly with cross-sampling",
+        )
+        parser.add_argument(
+            "--cross-sample-negatives",
+            type=int,
+            metavar="N",
+            help="num of cross sampled negatives",
+        )
+        parser.add_argument(
+            "--num-negatives", type=int, metavar="N", help="number of negative examples"
+        )
+        parser.add_argument(
+            "--conv-feature-layers",
+            type=str,
+            metavar="EXPR",
+            help="convolutional feature extraction layers [(dim, kernel_size, stride), ...]",
+        )
+        parser.add_argument(
+            "--conv-aggregator-layers",
+            type=str,
+            metavar="EXPR",
+            help="convolutional aggregator layers [(dim, kernel_size, stride), ...]",
+        )
+        parser.add_argument(
+            "--dropout",
+            type=float,
+            metavar="D",
+            help="dropout to apply within the model",
+        )
+        parser.add_argument(
+            "--dropout-features",
+            type=float,
+            metavar="D",
+            help="dropout to apply to the features",
+        )
+        parser.add_argument(
+            "--dropout-agg",
+            type=float,
+            metavar="D",
+            help="dropout to apply after aggregation step",
+        )
+        parser.add_argument(
+            "--encoder", type=str, choices=["cnn"], help="type of encoder to use"
+        )
+        parser.add_argument(
+            "--aggregator",
+            type=str,
+            choices=["cnn", "gru"],
+            help="type of aggregator to use",
+        )
+        parser.add_argument(
+            "--gru-dim", type=int, metavar="N", help="GRU dimensionality"
+        )
+
+        parser.add_argument(
+            "--no-conv-bias",
+            action="store_true",
+            help="if set, does not learn bias for conv layers",
+        )
+        parser.add_argument(
+            "--agg-zero-pad",
+            action="store_true",
+            help="if set, zero pads in aggregator instead of repl pad",
+        )
+
+        parser.add_argument(
+            "--skip-connections-feat",
+            action="store_true",
+            help="if set, adds skip connections to the feature extractor",
+        )
+        parser.add_argument(
+            "--skip-connections-agg",
+            action="store_true",
+            help="if set, adds skip connections to the aggregator",
+        )
+        parser.add_argument(
+            "--residual-scale",
+            type=float,
+            metavar="D",
+            help="scales residual by sqrt(value)",
+        )
+
+        parser.add_argument(
+            "--log-compression",
+            action="store_true",
+            help="if set, adds a log compression to feature extractor",
+        )
+
+        parser.add_argument(
+            "--balanced-classes",
+            action="store_true",
+            help="if set, loss is scaled to balance for number of negatives",
+        )
+
+        parser.add_argument(
+            "--project-features",
+            choices=["none", "same", "new"],
+            help="if not none, features are projected using the (same or new) aggregator",
+        )
+
+        parser.add_argument(
+            "--non-affine-group-norm",
+            action="store_true",
+            help="if set, group norm is not affine",
+        )
+        # no type= on --offset: the value stays a string because it may be the literal "auto"
+        parser.add_argument(
+            "--offset",
+            help="if set, introduces an offset from target to predictions. "
+            'if set to "auto", it is computed automatically from the receptive field',
+        )
+
+        parser.add_argument(
+            "--activation",
+            type=str,
+            choices=["relu", "gelu"],
+            help="which activation function to use",
+        )
+
+        parser.add_argument(
+            "--vq-type",
+            type=str,
+            choices=["none", "gumbel", "kmeans"],
+            help="which type of quantizer to use",
+        )
+        parser.add_argument(
+            "--vq-vars",
+            type=int,
+            metavar="N",
+            help="if set, project to this many vector quantized variables per group",
+        )
+        parser.add_argument(
+            "--vq-groups",
+            type=int,
+            metavar="N",
+            help="number of groups of latent variables",
+        )
+        parser.add_argument(
+            "--vq-dim",
+            type=int,
+            metavar="N",
+            help="uses this dimensionality for quantized vectors",
+        )
+        parser.add_argument(
+            "--vq-depth",
+            type=int,
+            metavar="N",
+            help="number of layers for vq weight projection",
+        )
+        parser.add_argument(
+            "--combine-groups",
+            action="store_true",
+            help="if set, variables are shared among groups",
+        )
+        parser.add_argument(
+            "--vq-temp",
+            type=str,
+            metavar="TEMP",
+            help="temperature for latent variable sampling with gumbel softmax. should be a tuple of 3 values (start, end, decay)",
+        )
+        parser.add_argument(
+            "--vq-gamma",
+            type=float,
+            metavar="D",
+            help="gamma parameter for kmeans style vector quantization",
+        )
+
+    @classmethod
+    def build_model(cls, args, task):
+        """Build and return a new model from ``args``; ``task`` is accepted but not used here."""
+
+        # make sure all arguments are present in older models
+        base_wav2vec_architecture(args)  # mutates args in place, filling only missing attributes
+
+        model = Wav2VecModel(args)  # NOTE(review): names the class directly instead of using cls
+        logger.info(model)  # logs the module's repr
+        return model
+
+    def __init__(self, args):
+        super().__init__()
+        # args: option namespace; build_model() runs base_wav2vec_architecture(args) first to fill gaps
+        self.prediction_steps = args.prediction_steps
+        offset = args.offset  # either "auto" or an explicit value; resolved to an int further down
+
+        if args.activation == "relu":
+            activation = nn.ReLU()
+        elif args.activation == "gelu":
+            activation = nn.GELU()
+        else:
+            raise Exception("unknown activation " + args.activation)
+
+        if args.encoder == "cnn":
+            feature_enc_layers = eval(args.conv_feature_layers)  # SECURITY: eval() of an option string; trusted input only
+            self.feature_extractor = ConvFeatureExtractionModel(
+                conv_layers=feature_enc_layers,
+                dropout=0.0,  # fixed at zero here; args.dropout goes to the aggregator and predictor instead
+                log_compression=args.log_compression,
+                skip_connections=args.skip_connections_feat,
+                residual_scale=args.residual_scale,
+                non_affine_group_norm=args.non_affine_group_norm,
+                activation=activation,
+            )
+            embed = feature_enc_layers[-1][0]  # output channels of the last feature layer
+        else:
+            raise Exception("unknown encoder type " + args.encoder)
+
+        self.vector_quantizer = None  # stays None unless a quantizer type is selected below
+        if args.vq_type == "gumbel":
+            self.vector_quantizer = GumbelVectorQuantizer(
+                dim=embed,
+                num_vars=args.vq_vars,
+                temp=eval(args.vq_temp),  # SECURITY: eval() of an option string; trusted input only
+                groups=args.vq_groups,
+                combine_groups=args.combine_groups,
+                vq_dim=args.vq_dim if args.vq_dim > 0 else embed,  # non-positive vq_dim falls back to embed
+                time_first=False,
+                activation=activation,
+                weight_proj_depth=args.vq_depth,
+                weight_proj_factor=2,
+            )
+        elif args.vq_type == "kmeans":
+            self.vector_quantizer = KmeansVectorQuantizer(
+                dim=embed,
+                num_vars=args.vq_vars,
+                groups=args.vq_groups,
+                combine_groups=args.combine_groups,
+                vq_dim=args.vq_dim if args.vq_dim > 0 else embed,  # non-positive vq_dim falls back to embed
+                time_first=False,
+                gamma=args.vq_gamma,
+            )
+        else:
+            assert (
+                args.vq_type == "none" or args.vq_type is None
+            ), "Unknown quantizer type"  # NOTE: an assert, so this check disappears under python -O
+
+        if args.offset == "auto":
+            assert args.encoder == "cnn"  # feature_enc_layers is only defined on the cnn path
+            jin = 0  # running product of the strides seen so far
+            rin = 0  # running receptive-field size, in input samples
+            for _, k, stride in feature_enc_layers:
+                if rin == 0:
+                    rin = k  # first layer: receptive field is its kernel size
+                rin = rin + (k - 1) * jin
+                if jin == 0:
+                    jin = stride
+                else:
+                    jin *= stride
+            offset = math.ceil(rin / jin)  # receptive field divided by total stride, rounded up
+
+        offset = int(offset)  # --offset declares no type, so an explicit value is still a string here
+
+        def make_aggregator():  # returns (module, output_dim); called a second time for project_features == "new"
+            if args.aggregator == "cnn":
+                agg_layers = eval(args.conv_aggregator_layers)  # SECURITY: eval() of an option string; trusted input only
+                agg_dim = agg_layers[-1][0]  # output channels of the last aggregator layer
+                feature_aggregator = ConvAggegator(
+                    conv_layers=agg_layers,
+                    embed=embed,
+                    dropout=args.dropout,
+                    skip_connections=args.skip_connections_agg,
+                    residual_scale=args.residual_scale,
+                    non_affine_group_norm=args.non_affine_group_norm,
+                    conv_bias=not args.no_conv_bias,
+                    zero_pad=args.agg_zero_pad,
+                    activation=activation,
+                )
+            elif args.aggregator == "gru":
+                agg_dim = args.gru_dim
+                feature_aggregator = nn.Sequential(
+                    TransposeLast(),  # presumably swaps the last two dims for the GRU — TODO confirm
+                    nn.GRU(
+                        input_size=embed,
+                        hidden_size=agg_dim,
+                        num_layers=1,
+                        dropout=args.dropout,  # NOTE(review): torch documents GRU dropout as inactive for a single layer — confirm intent
+                    ),
+                    TransposeLast(deconstruct_idx=0),  # presumably picks element 0 of the GRU's output tuple — TODO confirm
+                )
+            else:
+                raise Exception("unknown aggregator type " + args.aggregator)
+
+            return feature_aggregator, agg_dim
+
+        self.feature_aggregator, agg_dim = make_aggregator()
+
+        self.wav2vec_predictions = Wav2VecPredictionsModel(
+            in_dim=agg_dim,
+            out_dim=embed,
+            prediction_steps=args.prediction_steps,
+            n_negatives=args.num_negatives,
+            cross_sample_negatives=args.cross_sample_negatives,
+            sample_distance=args.sample_distance,
+            dropout=args.dropout,
+            offset=offset,
+            balanced_classes=args.balanced_classes,
+            infonce=args.infonce,  # has no flag in add_args; base_wav2vec_architecture defaults it to False
+        )
+
+        self.dropout_feats = nn.Dropout(p=args.dropout_features)
+        self.dropout_agg = nn.Dropout(p=args.dropout_agg)
+
+        if args.project_features == "none":
+            self.project_features = None
+        elif args.project_features == "same":
+            self.project_features = self.feature_aggregator  # the very same module object, so weights are shared
+        elif args.project_features == "new":  # any other value leaves self.project_features unset
+            self.project_features, _ = make_aggregator()
+
+    def forward(self, source):  # returns a dict: "cpc_logits", "cpc_targets", plus any extra quantizer outputs
+        result = {}
+
+        features = self.feature_extractor(source)
+        if self.vector_quantizer:
+            q_res = self.vector_quantizer(features)
+            features = q_res["x"]  # the quantizer's "x" output replaces the raw features from here on
+            for k in q_res.keys():
+                if k != "x":  # every other quantizer output is handed back to the caller unchanged
+                    result[k] = q_res[k]
+
+        x = self.dropout_feats(features)  # dropout applies to the aggregator input only; features keeps its value
+        x = self.feature_aggregator(x)
+        x = self.dropout_agg(x)
+
+        if self.project_features is not None:
+            features = self.project_features(features)  # only the prediction targets are projected
+        x, targets = self.wav2vec_predictions(x, features)  # x: aggregated context, features: targets
+        result["cpc_logits"] = x
+        result["cpc_targets"] = targets  # a tensor, or a (labels, weights) tuple (see get_targets)
+
+        return result
+
+    def upgrade_state_dict_named(self, state_dict, name):
+        super().upgrade_state_dict_named(state_dict, name)  # pure pass-through; adds no wav2vec-specific handling
+
+    def max_positions(self):
+        """Maximum length supported by the model."""
+        return sys.maxsize  # always sys.maxsize, i.e. no practical limit is reported
+
+    def get_logits(self, net_output):
+        """Return the "cpc_logits" entry of ``net_output`` (the dict produced by forward)."""
+        return net_output["cpc_logits"]
+
+    def get_targets(self, sample, net_output):
+        """Return the labels from ``net_output["cpc_targets"]`` as a contiguous tensor; ``sample`` is unused."""
+        stored = net_output["cpc_targets"]
+        labels = stored[0] if isinstance(stored, tuple) else stored  # a tuple keeps the labels in slot 0
+        return labels.contiguous()
+
+    def get_target_weights(self, targets, net_output):
+        """Return the weights stored last in a "cpc_targets" tuple, else None; the ``targets`` argument is ignored."""
+        stored = net_output["cpc_targets"]
+        has_weights = isinstance(stored, tuple) and stored[-1] is not None
+        return stored[-1] if has_weights else None
+
+    def get_extra_losses(self, net_output):
+        """Return the auxiliary quantizer loss found in ``net_output``, or None when neither key is present."""
+        if "prob_perplexity" in net_output:
+            # checked first, so it wins if both keys are present
+            return net_output["num_vars"] - net_output["prob_perplexity"]
+        if "kmeans_loss" in net_output:
+            return net_output["kmeans_loss"]
+        return None
+
+
+def norm_block(is_layer_norm, dim, affine=True):
+    """Return the normalization module for a conv block: layer norm if ``is_layer_norm``, else a 1-group group norm."""
+    if not is_layer_norm:
+        # a single group over all ``dim`` channels
+        return Fp32GroupNorm(1, dim, affine=affine)
+
+    # the layer norm is sandwiched between two TransposeLast modules
+    return nn.Sequential(
+        TransposeLast(),
+        Fp32LayerNorm(dim, elementwise_affine=affine),
+        TransposeLast(),
+    )
+
+
+class ConvFeatureExtractionModel(nn.Module):  # stack of Conv1d blocks that turns a BxT input into BxCxT features
+    def __init__(
+        self,
+        conv_layers,  # iterable of (dim, kernel_size, stride) triples
+        dropout,
+        log_compression,
+        skip_connections,
+        residual_scale,
+        non_affine_group_norm,
+        activation,
+    ):
+        super().__init__()
+
+        def block(n_in, n_out, k, stride):  # one layer: conv -> dropout -> group norm -> activation
+            return nn.Sequential(
+                nn.Conv1d(n_in, n_out, k, stride=stride, bias=False),  # bias is always off in this extractor
+                nn.Dropout(p=dropout),
+                norm_block(
+                    is_layer_norm=False, dim=n_out, affine=not non_affine_group_norm
+                ),
+                activation,  # the same module instance is reused in every block
+            )
+
+        in_d = 1  # forward() unsqueezes the input to a single channel
+        self.conv_layers = nn.ModuleList()
+        for dim, k, stride in conv_layers:
+            self.conv_layers.append(block(in_d, dim, k, stride))
+            in_d = dim
+
+        self.log_compression = log_compression
+        self.skip_connections = skip_connections
+        self.residual_scale = math.sqrt(residual_scale)  # stored as the square root of the given value
+
+    def forward(self, x):
+        # BxT -> BxCxT
+        x = x.unsqueeze(1)
+
+        for conv in self.conv_layers:
+            residual = x
+            x = conv(x)
+            if self.skip_connections and x.size(1) == residual.size(1):  # skip only when channel counts match
+                tsz = x.size(2)
+                r_tsz = residual.size(2)
+                residual = residual[..., :: r_tsz // tsz][..., :tsz]  # subsample the input in time, then trim to tsz
+                x = (x + residual) * self.residual_scale
+
+        if self.log_compression:  # the three steps below compute log(|x| + 1)
+            x = x.abs()
+            x = x + 1
+            x = x.log()
+
+        return x
+
+
+class ZeroPad1d(nn.Module):  # pads the last dimension of its input on the left and right
+    def __init__(self, pad_left, pad_right):
+        super().__init__()
+        self.pad_left = pad_left
+        self.pad_right = pad_right
+
+    def forward(self, x):
+        return F.pad(x, (self.pad_left, self.pad_right))  # 2-tuple = last dim only; F.pad defaults to constant zeros
+
+
+class ConvAggegator(nn.Module):  # left-padded Conv1d stack with optional residual connections
+    def __init__(
+        self,
+        conv_layers,  # iterable of (dim, kernel_size, stride) triples
+        embed,  # channel count of the input to the first layer
+        dropout,
+        skip_connections,
+        residual_scale,
+        non_affine_group_norm,
+        conv_bias,
+        zero_pad,
+        activation,
+    ):
+        super().__init__()
+
+        def block(n_in, n_out, k, stride):  # one layer: pad -> conv -> dropout -> group norm -> activation
+            # padding dims only really make sense for stride = 1
+            ka = k // 2
+            kb = ka - 1 if k % 2 == 0 else ka  # ka + kb == k - 1 for every k
+
+            pad = (  # all k - 1 padding goes on the left, none on the right
+                ZeroPad1d(ka + kb, 0) if zero_pad else nn.ReplicationPad1d((ka + kb, 0))
+            )
+
+            return nn.Sequential(
+                pad,
+                nn.Conv1d(n_in, n_out, k, stride=stride, bias=conv_bias),
+                nn.Dropout(p=dropout),
+                norm_block(False, n_out, affine=not non_affine_group_norm),
+                activation,  # the same module instance is reused in every block
+            )
+
+        in_d = embed
+        self.conv_layers = nn.ModuleList()
+        self.residual_proj = nn.ModuleList()
+        for dim, k, stride in conv_layers:
+            if in_d != dim and skip_connections:
+                self.residual_proj.append(nn.Conv1d(in_d, dim, 1, bias=False))  # 1x1 conv to match channel counts
+            else:
+                self.residual_proj.append(None)  # placeholder keeps this list aligned with conv_layers
+
+            self.conv_layers.append(block(in_d, dim, k, stride))
+            in_d = dim
+        self.conv_layers = nn.Sequential(*self.conv_layers)  # rewrapped; forward() still iterates it layer by layer
+        self.skip_connections = skip_connections
+        self.residual_scale = math.sqrt(residual_scale)  # stored as the square root of the given value
+
+    def forward(self, x):
+        for rproj, conv in zip(self.residual_proj, self.conv_layers):
+            residual = x
+            x = conv(x)
+            if self.skip_connections:
+                if rproj is not None:
+                    residual = rproj(residual)  # project the input so it can be added to the conv output
+                x = (x + residual) * self.residual_scale
+        return x
+
+
+class Wav2VecPredictionsModel(nn.Module):  # scores step-ahead predictions against true frames and sampled negatives
+    def __init__(
+        self,
+        in_dim,  # channels of the context input x
+        out_dim,  # channels of the targets y
+        prediction_steps,
+        n_negatives,  # negatives drawn per frame from the same batch row
+        cross_sample_negatives,  # negatives drawn per frame from the whole flattened batch
+        sample_distance,  # None, or a cap on the index range used for same-row negatives
+        dropout,
+        offset,  # int gap between a context frame and the first frame it is scored against
+        balanced_classes,
+        infonce,
+    ):
+        super().__init__()
+
+        self.n_negatives = n_negatives
+        self.cross_sample_negatives = cross_sample_negatives
+        self.sample_distance = sample_distance
+        self.project_to_steps = nn.ConvTranspose2d(  # expands a trailing size-1 axis into prediction_steps outputs
+            in_dim, out_dim, (1, prediction_steps)
+        )
+        self.dropout = nn.Dropout(p=dropout)
+        self.offset = offset
+        self.balanced_classes = balanced_classes
+        self.infonce = infonce
+
+    def sample_negatives(self, y):  # y: BxCxT; returns negatives shaped NxBxCxT
+        bsz, fsz, tsz = y.shape
+
+        y = y.transpose(0, 1)  # BCT -> CBT
+        y = y.contiguous().view(fsz, -1)  # CBT => C(BxT)
+
+        cross_high = tsz * bsz
+        high = tsz if self.sample_distance is None else min(tsz, self.sample_distance)
+        assert high > 1  # randint below needs high - 1 >= 1
+
+        neg_idxs = torch.randint(low=0, high=high, size=(bsz, self.n_negatives * tsz))  # NOTE(review): always overwritten below, but it does consume RNG state
+
+        with torch.no_grad():
+            if self.n_negatives > 0:
+                tszs = (  # each time index repeated n_negatives times: 0,0,..,1,1,..
+                    buffered_arange(tsz)
+                    .unsqueeze(-1)
+                    .expand(-1, self.n_negatives)
+                    .flatten()
+                )
+
+                neg_idxs = torch.randint(
+                    low=0, high=high - 1, size=(bsz, self.n_negatives * tsz)
+                )
+                neg_idxs[neg_idxs >= tszs] += 1  # skip over the frame's own index so it is never its own negative
+
+            if self.cross_sample_negatives > 0:
+                tszs = (
+                    buffered_arange(tsz)
+                    .unsqueeze(-1)
+                    .expand(-1, self.cross_sample_negatives)
+                    .flatten()
+                )
+
+                cross_neg_idxs = torch.randint(
+                    low=0,
+                    high=cross_high - 1,
+                    size=(bsz, self.cross_sample_negatives * tsz),
+                )
+                cross_neg_idxs[cross_neg_idxs >= tszs] += 1  # NOTE(review): tszs holds per-row time indices, these are flat batch indices — confirm
+
+        if self.n_negatives > 0:
+            for i in range(1, bsz):
+                neg_idxs[i] += i * high  # NOTE(review): rows of the flat axis are tsz wide, so this lines up only when high == tsz — confirm
+        else:
+            neg_idxs = cross_neg_idxs  # NOTE(review): undefined when cross_sample_negatives is also 0
+
+        if self.cross_sample_negatives > 0 and self.n_negatives > 0:
+            neg_idxs = torch.cat([neg_idxs, cross_neg_idxs], dim=1)
+
+        negs = y[..., neg_idxs.view(-1)]
+        negs = negs.view(
+            fsz, bsz, self.n_negatives + self.cross_sample_negatives, tsz
+        ).permute(
+            2, 1, 0, 3
+        )  # to NxBxCxT
+
+        return negs
+
+    def forward(self, x, y):
+        # x: BxCxT context, y: BxCxT targets; returns (predictions, labels), labels may be a (labels, weights) tuple
+        x = x.unsqueeze(-1)
+        x = self.project_to_steps(x)  # BxCxTxS
+        x = self.dropout(x)
+
+        negatives = self.sample_negatives(y)
+        y = y.unsqueeze(0)
+        targets = torch.cat([y, negatives], dim=0)  # Copies x B x C x T
+
+        copies = targets.size(0)  # 1 positive + all negatives; the positive sits at index 0
+        bsz, dim, tsz, steps = x.shape
+        steps = min(steps, tsz - self.offset)  # cannot predict further ahead than the sequence allows
+
+        predictions = x.new(  # uninitialized buffer sized sum_i (tsz - offset - i) * bsz * copies; filled in the loop
+            bsz * copies * (tsz - self.offset + 1) * steps
+            - ((steps + 1) * steps // 2) * copies * bsz
+        )
+        if self.infonce:
+            labels = predictions.new_full(  # class index 0 everywhere: the positive is column 0 of each row
+                (predictions.shape[0] // copies,), 0, dtype=torch.long
+            )
+        else:
+            labels = torch.zeros_like(predictions)  # binary labels; positives are set to 1.0 in the loop
+        weights = (
+            torch.full_like(labels, 1 / self.n_negatives)  # NOTE(review): divides by zero if n_negatives == 0
+            if self.balanced_classes and not self.infonce
+            else None
+        )
+
+        start = end = 0
+        for i in range(steps):
+            offset = i + self.offset
+            end = start + (tsz - offset) * bsz * copies
+            if self.infonce:
+                predictions[start:end] = torch.einsum(  # explicit end index: ":-offset" would be empty for offset == 0
+                    "bct,nbct->tbn", x[..., : tsz - offset, i], targets[..., offset:]
+                ).flatten()
+            else:
+                pos_num = (end - start) // copies  # "nbt" order puts the n == 0 (positive) scores first
+                predictions[start:end] = torch.einsum(  # explicit end index: ":-offset" would be empty for offset == 0
+                    "bct,nbct->nbt", x[..., : tsz - offset, i], targets[..., offset:]
+                ).flatten()
+                labels[start : start + pos_num] = 1.0
+                if weights is not None:
+                    weights[start : start + pos_num] = 1.0  # positives keep full weight
+            start = end
+        assert end == predictions.numel(), "{} != {}".format(end, predictions.numel())
+
+        if self.infonce:
+            predictions = predictions.view(-1, copies)  # one row per (t, b) pair, one column per copy
+        else:
+            if weights is not None:
+                labels = (labels, weights)
+
+        return predictions, labels
+
+
+@register_model_architecture("wav2vec", "wav2vec")
+def base_wav2vec_architecture(args):  # sets each option only if args lacks the attribute; mutates args, returns None
+    conv_feature_layers = "[(512, 10, 5)]"  # assembled as an expression string; Wav2VecModel.__init__ eval()s it
+    conv_feature_layers += " + [(512, 8, 4)]"
+    conv_feature_layers += " + [(512, 4, 2)] * 3"
+    args.conv_feature_layers = getattr(args, "conv_feature_layers", conv_feature_layers)
+
+    args.conv_aggregator_layers = getattr(
+        args, "conv_aggregator_layers", "[(512, 3, 1)] * 9"
+    )
+
+    args.prediction_steps = getattr(args, "prediction_steps", 12)
+    args.num_negatives = getattr(args, "num_negatives", 1)
+    args.sample_distance = getattr(args, "sample_distance", None)  # None = sample_negatives uses the full length
+    args.cross_sample_negatives = getattr(args, "cross_sample_negatives", 0)
+
+    args.dropout = getattr(args, "dropout", 0.0)
+    args.dropout_features = getattr(args, "dropout_features", 0.0)
+    args.dropout_agg = getattr(args, "dropout_agg", 0.0)
+    args.encoder = getattr(args, "encoder", "cnn")
+    args.aggregator = getattr(args, "aggregator", "cnn")
+
+    args.skip_connections_feat = getattr(args, "skip_connections_feat", False)
+    args.skip_connections_agg = getattr(args, "skip_connections_agg", False)
+    args.residual_scale = getattr(args, "residual_scale", 0.5)  # the modules store sqrt of this value
+
+    args.gru_dim = getattr(args, "gru_dim", 512)
+
+    args.no_conv_bias = getattr(args, "no_conv_bias", False)
+    args.agg_zero_pad = getattr(args, "agg_zero_pad", False)
+
+    args.log_compression = getattr(args, "log_compression", False)
+
+    args.balanced_classes = getattr(args, "balanced_classes", False)
+    args.infonce = getattr(args, "infonce", False)  # read by Wav2VecModel.__init__ but has no flag in its add_args
+    args.project_features = getattr(args, "project_features", "none")
+
+    args.non_affine_group_norm = getattr(args, "non_affine_group_norm", False)
+
+    args.offset = getattr(args, "offset", "auto")  # "auto" = derived from the feature-layer kernels and strides
+
+    args.activation = getattr(args, "activation", "relu")
+
+    args.vq_type = getattr(args, "vq_type", "none")
+    args.vq_vars = getattr(args, "vq_vars", 320)
+    args.vq_groups = getattr(args, "vq_groups", 2)
+    args.vq_dim = getattr(args, "vq_dim", 0)  # 0 makes Wav2VecModel.__init__ fall back to the feature dim
+    args.vq_depth = getattr(args, "vq_depth", 1)
+    args.combine_groups = getattr(args, "combine_groups", False)
+    args.vq_temp = getattr(args, "vq_temp", "(2.0, 0.5, 0.999995)")  # string; eval()ed only for the gumbel quantizer
+    args.vq_gamma = getattr(args, "vq_gamma", 0.25)
diff --git a/fairseq/models/wav2vec/wav2vec2.py b/fairseq/models/wav2vec/wav2vec2.py
new file mode 100644
index 0000000000000000000000000000000000000000..be6d10c7a2df547c310dd3abbe5f65382ab382b0
--- /dev/null
+++ b/fairseq/models/wav2vec/wav2vec2.py
@@ -0,0 +1,1017 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import math
+import numpy as np
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from typing import List, Tuple
+
+from fairseq import utils
+from fairseq.data.data_utils import compute_mask_indices
+from fairseq.models import BaseFairseqModel, register_model, register_model_architecture
+from fairseq.modules import (
+    Fp32GroupNorm,
+    Fp32LayerNorm,
+    GradMultiply,
+    GumbelVectorQuantizer,
+    LayerNorm,
+    MultiheadAttention,
+    SamePad,
+    TransposeLast,
+)
+from fairseq.modules.transformer_sentence_encoder import init_bert_params
+from fairseq.utils import buffered_arange
+
+
+@register_model("wav2vec2")
+class Wav2Vec2Model(BaseFairseqModel):
+    @staticmethod
+    def add_args(parser):
+        """Add the wav2vec2 model options to ``parser`` through its ``add_argument`` method; returns None."""
+
+        parser.add_argument(
+            "--extractor-mode",
+            choices=["default", "layer_norm"],
+            help="mode for feature extractor. default has a single group norm with d groups in the first conv block, whereas layer_norm has layer norms in every block (meant to use with --normalize)",
+        )
+
+        parser.add_argument(
+            "--encoder-layers",
+            type=int,
+            metavar="L",
+            help="num encoder layers in the transformer",
+        )
+        parser.add_argument(
+            "--encoder-embed-dim",
+            type=int,
+            metavar="H",
+            help="encoder embedding dimension",
+        )
+        parser.add_argument(
+            "--encoder-ffn-embed-dim",
+            type=int,
+            metavar="F",
+            help="encoder embedding dimension for FFN",
+        )
+        parser.add_argument(
+            "--encoder-attention-heads",
+            type=int,
+            metavar="A",
+            help="num encoder attention heads",
+        )
+        parser.add_argument(
+            "--activation-fn",
+            choices=utils.get_available_activation_fns(),
+            help="activation function to use",
+        )
+
+        parser.add_argument(
+            "--dropout",
+            type=float,
+            metavar="D",
+            help="dropout probability for the transformer",
+        )
+
+        parser.add_argument(
+            "--attention-dropout",
+            type=float,
+            metavar="D",
+            help="dropout probability for attention weights",
+        )
+
+        parser.add_argument(
+            "--activation-dropout",
+            type=float,
+            metavar="D",
+            help="dropout probability after activation in FFN",
+        )
+
+        parser.add_argument(
+            "--final-dim",
+            type=int,
+            metavar="D",
+            help="project final representations and targets to this many dimensions",
+        )
+
+        parser.add_argument(
+            "--layer-norm-first",
+            action="store_true",
+            help="apply layernorm first in the transformer",
+        )
+
+        parser.add_argument(
+            "--encoder-layerdrop",
+            type=float,
+            help="probability of dropping a transformer layer",
+        )
+
+        parser.add_argument(
+            "--conv-feature-layers",
+            type=str,
+            metavar="EXPR",
+            help="convolutional feature extraction layers [(dim, kernel_size, stride), ...]",
+        )
+
+        parser.add_argument(
+            "--logit-temp", type=float, help="temperature to divide logits by"
+        )
+
+        parser.add_argument(
+            "--quantize-targets", action="store_true", help="use quantized targets"
+        )
+
+        parser.add_argument(
+            "--quantize-input", action="store_true", help="use quantized inputs"
+        )
+
+        parser.add_argument(
+            "--feature-grad-mult",
+            type=float,
+            help="multiply feature extractor var grads by this",
+        )
+
+        parser.add_argument(
+            "--latent-vars",
+            type=int,
+            metavar="N",
+            help="number of latent variables V in each group of the codebook",
+        )
+
+        parser.add_argument(
+            "--latent-groups",
+            type=int,
+            metavar="N",
+            help="number of groups G of latent variables in the codebook",
+        )
+
+        parser.add_argument(
+            "--latent-dim",
+            type=int,
+            metavar="N",
+            help="if set, uses this dimensionality for latent variables. otherwise uses final_dim / latent_groups",
+        )
+
+        parser.add_argument("--mask-length", type=int, help="mask length")
+
+        parser.add_argument(
+            "--mask-prob", type=float, help="probability of replacing a token with mask"
+        )
+
+        parser.add_argument(
+            "--mask-selection",
+            type=str,
+            choices=["static", "uniform", "normal", "poisson"],
+            help="how to choose masks",
+        )
+
+        parser.add_argument(
+            "--mask-other",
+            type=float,
+            help="secondary mask argument (used for more complex distributions), see help in compute_mask_indices",
+        )
+        # NOTE(review): a store_true flag named "no-...-overlap" with help "whether to allow" reads inverted — confirm wording
+        parser.add_argument(
+            "--no-mask-overlap",
+            action="store_true",
+            help="whether to allow masks to overlap",
+        )
+
+        parser.add_argument(
+            "--mask-min-space",
+            type=int,
+            help="min space between spans (if no overlap is enabled)",
+        )
+
+        parser.add_argument(
+            "--mask-channel-length",
+            type=int,
+            help="repeat the mask indices multiple times",
+        )
+
+        parser.add_argument(
+            "--mask-channel-prob",
+            type=float,
+            help="probability of replacing a token with mask",
+        )
+
+        parser.add_argument(
+            "--mask-channel-selection",
+            type=str,
+            choices=["static", "uniform", "normal", "poisson"],
+            help="how to choose masks",
+        )
+
+        parser.add_argument(
+            "--mask-channel-other",
+            type=float,
+            help="secondary mask argument (used for more complex distributions), see help in compute_mask_indices",
+        )
+
+        parser.add_argument(
+            "--no-mask-channel-overlap",
+            action="store_true",
+            help="whether to allow masks to overlap",
+        )
+
+        parser.add_argument(
+            "--mask-channel-min-space",
+            type=int,
+            help="min space between spans (if no overlap is enabled)",
+        )
+
+        parser.add_argument(
+            "--dropout-input",
+            type=float,
+            metavar="D",
+            help="dropout to apply to the input (after feat extr)",
+        )
+
+        parser.add_argument(
+            "--dropout-features",
+            type=float,
+            metavar="D",
+            help="dropout to apply to the features (after feat extr)",
+        )
+
+        parser.add_argument(
+            "--num-negatives", type=int, metavar="N", help="number of negative examples"
+        )
+
+        parser.add_argument(
+            "--negatives-from-everywhere",
+            action="store_true",
+            help="sample negatives from everywhere, not just masked states",
+        )
+
+        parser.add_argument(
+            "--cross-sample-negatives",
+            type=int,
+            metavar="N",
+            help="num of cross sampled negatives",
+        )
+
+        parser.add_argument(
+            "--codebook-negatives",
+            type=int,
+            metavar="N",
+            help="num of codebook sampled negatives",
+        )
+
+        parser.add_argument(
+            "--conv-pos",
+            type=int,
+            metavar="N",
+            help="number of filters for convolutional positional embeddings",
+        )
+
+        parser.add_argument(
+            "--conv-pos-groups",
+            type=int,
+            metavar="N",
+            help="number of groups for convolutional positional embedding",
+        )
+
+        parser.add_argument(
+            "--latent-temp",
+            type=str,
+            metavar="D",
+            help="temperature for latent variable sampling. can be tuple of 3 values (start, end, decay)",
+        )
+
+        parser.add_argument(
+            "--target-glu", action="store_true", help="adds projection + glu to targets"
+        )
+
+        parser.add_argument(
+            "--conv-bias", action="store_true", help="include bias in conv encoder"
+        )
+
+    def __init__(self, args):
+        super().__init__()
+        self.args = args
+        # NOTE(review): eval() executes this config string as Python - trusted configs only.
+        feature_enc_layers = eval(args.conv_feature_layers)
+        self.embed = feature_enc_layers[-1][0]  # first field of the last conv spec
+
+        self.feature_extractor = ConvFeatureExtractionModel(
+            conv_layers=feature_enc_layers,
+            dropout=0.0,
+            mode=args.extractor_mode,
+            conv_bias=args.conv_bias,
+        )
+        # Linear bridge to the encoder width; None if widths match or input is quantized.
+        self.post_extract_proj = (
+            nn.Linear(self.embed, args.encoder_embed_dim)
+            if self.embed != args.encoder_embed_dim and not args.quantize_input
+            else None
+        )
+
+        self.mask_prob = args.mask_prob
+        self.mask_selection = args.mask_selection
+        self.mask_other = args.mask_other
+        self.mask_length = args.mask_length
+        self.no_mask_overlap = args.no_mask_overlap
+        self.mask_min_space = args.mask_min_space
+
+        self.mask_channel_prob = args.mask_channel_prob
+        self.mask_channel_selection = args.mask_channel_selection
+        self.mask_channel_other = args.mask_channel_other
+        self.mask_channel_length = args.mask_channel_length
+        self.no_mask_channel_overlap = args.no_mask_channel_overlap
+        self.mask_channel_min_space = args.mask_channel_min_space
+
+        self.dropout_input = nn.Dropout(args.dropout_input)
+        self.dropout_features = nn.Dropout(args.dropout_features)
+
+        self.feature_grad_mult = args.feature_grad_mult
+        # Both quantizers stay None unless the matching quantize_* flag is set below.
+        self.quantizer = None
+        self.input_quantizer = None
+
+        self.n_negatives = args.num_negatives
+        self.cross_sample_negatives = args.cross_sample_negatives
+        self.codebook_negatives = args.codebook_negatives
+        self.negatives_from_everywhere = args.negatives_from_everywhere
+
+        self.logit_temp = args.logit_temp
+        # NOTE(review): self.quantizer is still None here, so same_quantizer leaves input_quantizer None.
+        if args.quantize_input:
+            vq_dim = args.latent_dim if args.latent_dim > 0 else args.encoder_embed_dim
+            self.input_quantizer = (
+                GumbelVectorQuantizer(
+                    dim=args.encoder_embed_dim,
+                    num_vars=args.latent_vars,
+                    temp=eval(args.latent_temp),
+                    groups=args.latent_groups,
+                    combine_groups=False,
+                    vq_dim=vq_dim,
+                    time_first=True,
+                )
+                if not args.same_quantizer
+                else self.quantizer
+            )
+            self.project_inp = nn.Linear(vq_dim, args.encoder_embed_dim)
+
+        final_dim = args.final_dim if args.final_dim > 0 else args.encoder_embed_dim
+
+        if args.quantize_targets:
+            vq_dim = args.latent_dim if args.latent_dim > 0 else final_dim
+            self.quantizer = GumbelVectorQuantizer(
+                dim=self.embed,
+                num_vars=args.latent_vars,
+                temp=eval(args.latent_temp),
+                groups=args.latent_groups,
+                combine_groups=False,
+                vq_dim=vq_dim,
+                time_first=True,
+            )
+            self.project_q = nn.Linear(vq_dim, final_dim)
+        else:
+            self.project_q = nn.Linear(self.embed, final_dim)
+
+        self.mask_emb = nn.Parameter(
+            torch.FloatTensor(args.encoder_embed_dim).uniform_()
+        )
+
+        self.encoder = TransformerEncoder(args)
+        self.layer_norm = LayerNorm(self.embed)
+
+        self.target_glu = None
+        if args.target_glu:
+            self.target_glu = nn.Sequential(
+                nn.Linear(final_dim, final_dim * 2), nn.GLU()
+            )
+
+        self.final_proj = nn.Linear(args.encoder_embed_dim, final_dim)
+
+    def upgrade_state_dict_named(self, state_dict, name):
+        """Upgrade a (possibly old) state dict for new versions of fairseq."""
+        super().upgrade_state_dict_named(state_dict, name)
+        return state_dict
+
+    @classmethod
+    def build_model(cls, args, task=None):
+        """Return a model built from ``args``; ``task`` is accepted but not used."""
+        # base_architecture() assigns a default to each option it knows about
+        # that ``args`` lacks; values already present are kept (getattr).
+        base_architecture(args)
+        model = cls(args)
+        return model
+
+    def apply_mask(self, x, padding_mask):
+        """Mask time spans and channel spans of ``x`` in place.
+
+        Returns ``(x, mask_indices)``; ``mask_indices`` is the time mask moved to
+        ``x.device``, or ``None`` when ``self.mask_prob`` is not positive.
+        """
+        bsz, n_frames, n_channels = x.shape
+        mask_indices = None
+        if self.mask_prob > 0:
+            time_mask = compute_mask_indices(
+                (bsz, n_frames),
+                padding_mask,
+                self.mask_prob,
+                self.mask_length,
+                self.mask_selection,
+                self.mask_other,
+                min_masks=2,
+                no_overlap=self.no_mask_overlap,
+                min_space=self.mask_min_space,
+            )
+            mask_indices = torch.from_numpy(time_mask).to(x.device)
+            x[mask_indices] = self.mask_emb
+
+        if self.mask_channel_prob > 0:
+            channel_mask = compute_mask_indices(
+                (bsz, n_channels),
+                None,
+                self.mask_channel_prob,
+                self.mask_channel_length,
+                self.mask_channel_selection,
+                self.mask_channel_other,
+                no_overlap=self.no_mask_channel_overlap,
+                min_space=self.mask_channel_min_space,
+            )
+            channel_mask = torch.from_numpy(channel_mask).to(x.device)
+            # repeat the (batch, channel) mask across every time step
+            x[channel_mask.unsqueeze(1).expand(-1, n_frames, -1)] = 0
+
+        return x, mask_indices
+
+    def sample_negatives(self, y, num):
+        """Draw distractor vectors for each of ``num`` positions from ``y``.
+
+        Args:
+            y: tensor unpacked as (bsz, tsz, fsz); candidates are the rows of
+                its flattened (bsz * tsz, fsz) view.
+            num: number of positions per sample that need negatives.
+
+        Returns:
+            ``(negs, neg_idxs)``: ``negs`` is permuted to N x B x T x C and
+            ``neg_idxs`` indexes the flattened view. If both negative counts
+            are zero, a single empty tensor is returned instead.
+        """
+        n_within, n_cross = self.n_negatives, self.cross_sample_negatives
+        if n_within == 0 and n_cross == 0:
+            # NOTE(review): one value here, while forward() unpacks two.
+            return y.new(0)
+
+        bsz, tsz, fsz = y.shape
+        flat = y.view(-1, fsz)  # BTC => (BxT)C
+
+        def own_positions(count):
+            # index of every target position, repeated once per negative
+            return buffered_arange(num).unsqueeze(-1).expand(-1, count).flatten()
+
+        with torch.no_grad():
+            assert tsz > 1, f"{bsz,tsz,fsz}"
+
+            if n_within > 0:
+                tszs = own_positions(n_within)
+                # draw from tsz - 1 slots, then step over the own position
+                neg_idxs = torch.randint(
+                    low=0, high=tsz - 1, size=(bsz, n_within * num)
+                )
+                neg_idxs[neg_idxs >= tszs] += 1
+
+            if n_cross > 0:
+                tszs = own_positions(n_cross)
+                # cross-sample draws range over the whole flattened batch
+                cross_neg_idxs = torch.randint(
+                    low=0, high=tsz * bsz - 1, size=(bsz, n_cross * num)
+                )
+                cross_neg_idxs[cross_neg_idxs >= tszs] += 1
+
+        if n_within > 0:
+            # shift row i into sample i's slice of the flattened tensor
+            for row in range(1, bsz):
+                neg_idxs[row] += row * tsz
+            if n_cross > 0:
+                neg_idxs = torch.cat([neg_idxs, cross_neg_idxs], dim=1)
+        else:
+            neg_idxs = cross_neg_idxs
+
+        gathered = flat[neg_idxs.view(-1)]
+        # (B, num, N, C) -> N x B x T x C
+        negs = gathered.view(bsz, num, n_within + n_cross, fsz).permute(2, 0, 1, 3)
+        return negs, neg_idxs
+
+    def compute_preds(self, x, y, negatives):
+        """Return temperature-scaled cosine logits of ``x`` against all targets.
+
+        Index 0 of the first dimension is the positive ``y``; the remaining
+        entries are ``negatives``. Negatives equal to ``y`` are set to -inf.
+        """
+        same_as_positive = (y == negatives).all(-1)
+        candidates = torch.cat([y.unsqueeze(0), negatives], dim=0)
+        sims = torch.cosine_similarity(x.float(), candidates.float(), dim=-1)
+        logits = sims.type_as(x)
+        logits /= self.logit_temp
+        if same_as_positive.any():
+            logits[1:][same_as_positive] = float("-inf")
+        return logits
+
+    def forward(self, source, padding_mask=None, mask=True, features_only=False):
+        """Encode ``source``; unless ``features_only``, score targets against negatives."""
+        if self.feature_grad_mult > 0:
+            features = self.feature_extractor(source)
+            if self.feature_grad_mult != 1.0:
+                features = GradMultiply.apply(features, self.feature_grad_mult)
+        else:
+            with torch.no_grad():
+                features = self.feature_extractor(source)
+
+        features_pen = features.float().pow(2).mean()
+
+        features = features.transpose(1, 2)
+        features = self.layer_norm(features)
+        unmasked_features = features.clone()
+
+        if padding_mask is not None:
+            extra = padding_mask.size(1) % features.size(1)
+            if extra > 0:
+                padding_mask = padding_mask[:, :-extra]
+            padding_mask = padding_mask.view(padding_mask.size(0), features.size(1), -1)
+            padding_mask = padding_mask.all(-1)
+
+        if self.post_extract_proj is not None:
+            features = self.post_extract_proj(features)
+
+        features = self.dropout_input(features)
+        unmasked_features = self.dropout_features(unmasked_features)
+
+        num_vars = None
+        code_ppl = None
+        prob_ppl = None
+        curr_temp = None
+
+        if self.input_quantizer is not None:
+            q = self.input_quantizer(features, produce_targets=False)
+            features = q["x"]
+            num_vars = q["num_vars"]
+            code_ppl = q["code_perplexity"]
+            prob_ppl = q["prob_perplexity"]
+            curr_temp = q["temp"]
+            features = self.project_inp(features)
+
+        if mask:
+            x, mask_indices = self.apply_mask(features, padding_mask)
+            if mask_indices is not None:
+                y = unmasked_features[mask_indices].view(unmasked_features.size(0), -1, unmasked_features.size(-1))
+            else:
+                y = unmasked_features
+        else:
+            x = features
+            y = unmasked_features
+            mask_indices = None
+
+        x = self.encoder(x, padding_mask=padding_mask)
+
+        if features_only:
+            return {"x": x, "padding_mask": padding_mask}
+
+        if self.quantizer is not None:
+            q = self.quantizer(y, produce_targets=False)
+            y = q["x"]
+            num_vars = q["num_vars"]
+            code_ppl = q["code_perplexity"]
+            prob_ppl = q["prob_perplexity"]
+            curr_temp = q["temp"]
+
+            y = self.project_q(y)
+
+            if self.negatives_from_everywhere:
+                # the quantizer returns a dict; the candidates are its "x" entry
+                neg_cands = self.quantizer(unmasked_features, produce_targets=False)["x"]
+                negs, _ = self.sample_negatives(neg_cands, y.size(1))
+                negs = self.project_q(negs)
+            else:
+                negs, _ = self.sample_negatives(y, y.size(1))
+
+            if self.codebook_negatives > 0:
+                cb_negs = self.quantizer.sample_from_codebook(
+                    y.size(0) * y.size(1), self.codebook_negatives
+                )
+                cb_negs = cb_negs.view(
+                    self.codebook_negatives, y.size(0), y.size(1), -1
+                )  # order doesnt matter
+                cb_negs = self.project_q(cb_negs)
+                negs = torch.cat([negs, cb_negs], dim=0)
+        else:
+            y = self.project_q(y)
+
+            if self.negatives_from_everywhere:
+                negs, _ = self.sample_negatives(unmasked_features, y.size(1))
+                negs = self.project_q(negs)
+            else:
+                negs, _ = self.sample_negatives(y, y.size(1))
+
+        x = x[mask_indices].view(x.size(0), -1, x.size(-1))
+
+        if self.target_glu is not None:
+            y = self.target_glu(y)
+            negs = self.target_glu(negs)
+
+        x = self.final_proj(x)
+        x = self.compute_preds(x, y, negs)
+
+        result = {"x": x, "padding_mask": padding_mask, "features_pen": features_pen}
+
+        if prob_ppl is not None:
+            result["prob_perplexity"] = prob_ppl
+            result["code_perplexity"] = code_ppl
+            result["num_vars"] = num_vars
+            result["temp"] = curr_temp
+
+        return result
+
+    def quantize(self, x):
+        """Return ``self.quantizer.forward_idx`` of the normalized conv features."""
+        assert self.quantizer is not None
+        feats = self.feature_extractor(x).transpose(1, 2)
+        feats = self.layer_norm(feats)
+        return self.quantizer.forward_idx(feats)
+
+    def extract_features(self, source, padding_mask, mask=False):
+        out = self.forward(source, padding_mask, mask=mask, features_only=True)
+        return out["x"], out["padding_mask"]  # encoder output and its padding mask
+
+    def get_logits(self, net_output):
+        """Flatten ``net_output["x"]`` to 2-D with its first axis moved last."""
+        # transpose(0, 2) swaps the first and third axes before flattening
+        swapped = net_output["x"].transpose(0, 2)
+        return swapped.reshape(-1, swapped.size(-1))
+
+    def get_targets(self, sample, net_output, expand_steps=True):
+        logits = net_output["x"]  # the positive sits at index 0 of dim 0 (see compute_preds)
+        return logits.new_zeros(logits.size(1) * logits.size(2), dtype=torch.long)
+
+    def get_extra_losses(self, net_output):
+        """Collect the auxiliary penalty terms present in ``net_output``.
+
+        Order: ``(num_vars - prob_perplexity) / num_vars`` (only when
+        ``prob_perplexity`` is present), then ``features_pen`` (when present).
+        """
+        penalties = []
+        if "prob_perplexity" in net_output:
+            n_vars = net_output["num_vars"]
+            penalties.append((n_vars - net_output["prob_perplexity"]) / n_vars)
+        if "features_pen" in net_output:
+            penalties.append(net_output["features_pen"])
+        return penalties
+
+    def remove_pretraining_modules(self):
+        """Set ``quantizer``, ``project_q``, ``target_glu`` and ``final_proj`` to None."""
+        pretraining_only = ("quantizer", "project_q", "target_glu", "final_proj")
+        for attr in pretraining_only:
+            setattr(self, attr, None)
+
+
+class ConvFeatureExtractionModel(nn.Module):
+    """Stack of 1-D convolution blocks applied to a B x T input."""
+
+    def __init__(
+        self,
+        conv_layers: List[Tuple[int, int, int]],
+        dropout: float = 0.0,
+        mode: str = "default",
+        conv_bias: bool = False,
+    ):
+        """``conv_layers`` holds one ``(dim, kernel, stride)`` triple per block."""
+        super().__init__()
+        assert mode in {"default", "layer_norm"}
+
+        def block(
+            n_in,
+            n_out,
+            k,
+            stride,
+            is_layer_norm=False,
+            is_group_norm=False,
+            conv_bias=False,
+        ):
+            def make_conv():
+                conv = nn.Conv1d(n_in, n_out, k, stride=stride, bias=conv_bias)
+                nn.init.kaiming_normal_(conv.weight)
+                return conv
+
+            both_norms = is_layer_norm and is_group_norm
+            assert not both_norms, "layer norm and group norm are exclusive"
+
+            # Norm widths come from n_out, not from the enclosing loop variable.
+            if is_layer_norm:
+                return nn.Sequential(
+                    make_conv(),
+                    nn.Dropout(p=dropout),
+                    nn.Sequential(
+                        TransposeLast(),
+                        Fp32LayerNorm(n_out, elementwise_affine=True),
+                        TransposeLast(),
+                    ),
+                    nn.GELU(),
+                )
+            if is_group_norm:
+                return nn.Sequential(
+                    make_conv(),
+                    nn.Dropout(p=dropout),
+                    Fp32GroupNorm(n_out, n_out, affine=True),
+                    nn.GELU(),
+                )
+            return nn.Sequential(make_conv(), nn.Dropout(p=dropout), nn.GELU())
+
+        in_d = 1
+        self.conv_layers = nn.ModuleList()
+        for i, cl in enumerate(conv_layers):
+            assert len(cl) == 3, "invalid conv definition: " + str(cl)
+            dim, k, stride = cl
+
+            self.conv_layers.append(
+                block(
+                    in_d,
+                    dim,
+                    k,
+                    stride,
+                    is_layer_norm=mode == "layer_norm",
+                    is_group_norm=mode == "default" and i == 0,
+                    conv_bias=conv_bias,
+                )
+            )
+            in_d = dim
+
+    def forward(self, x):
+        """Add a channel axis (BxT -> BxCxT) and run every conv block in order."""
+        x = x.unsqueeze(1)
+
+        for conv in self.conv_layers:
+            x = conv(x)
+
+        return x
+
+
+class TransformerEncoder(nn.Module):
+    """Convolutional position embedding followed by a stack of encoder layers."""
+
+    def __init__(self, args):
+        super().__init__()
+        self.args = args  # max_positions() reads this attribute
+
+        self.dropout = args.dropout
+        self.embedding_dim = args.encoder_embed_dim
+
+        self.pos_conv = nn.Conv1d(
+            self.embedding_dim,
+            self.embedding_dim,
+            kernel_size=args.conv_pos,
+            padding=args.conv_pos // 2,
+            groups=args.conv_pos_groups,
+        )
+        dropout = 0
+        std = math.sqrt((4 * (1.0 - dropout)) / (args.conv_pos * self.embedding_dim))
+        nn.init.normal_(self.pos_conv.weight, mean=0, std=std)
+        nn.init.constant_(self.pos_conv.bias, 0)
+
+        self.pos_conv = nn.utils.weight_norm(self.pos_conv, name="weight", dim=2)
+        self.pos_conv = nn.Sequential(self.pos_conv, SamePad(args.conv_pos), nn.GELU())
+
+        self.layers = nn.ModuleList(
+            [
+                TransformerSentenceEncoderLayer(
+                    embedding_dim=self.embedding_dim,
+                    ffn_embedding_dim=args.encoder_ffn_embed_dim,
+                    num_attention_heads=args.encoder_attention_heads,
+                    dropout=self.dropout,
+                    attention_dropout=args.attention_dropout,
+                    activation_dropout=args.activation_dropout,
+                    activation_fn=args.activation_fn,
+                    layer_norm_first=args.layer_norm_first,
+                )
+                for _ in range(args.encoder_layers)
+            ]
+        )
+
+        self.layer_norm_first = args.layer_norm_first
+        self.layer_norm = LayerNorm(self.embedding_dim)
+        self.layerdrop = args.encoder_layerdrop
+
+        self.apply(init_bert_params)
+
+    def forward(self, x, padding_mask=None):
+        """Encode ``x``; the LayerNorm is applied here when ``layer_norm_first``."""
+        x = self.extract_features(x, padding_mask)
+        if self.layer_norm_first:
+            x = self.layer_norm(x)
+
+        return x
+
+    def extract_features(self, x, padding_mask=None):
+        """Add the conv position signal to ``x`` (B x T x C) and run the layers."""
+        x_conv = self.pos_conv(x.transpose(1, 2)).transpose(1, 2)
+        x = x + x_conv  # out of place: the caller's tensor is left untouched
+
+        if not self.layer_norm_first:
+            x = self.layer_norm(x)
+
+        x = F.dropout(x, p=self.dropout, training=self.training)
+
+        # B x T x C -> T x B x C
+        x = x.transpose(0, 1)
+
+        for layer in self.layers:
+            dropout_probability = np.random.random()
+            if not self.training or (dropout_probability > self.layerdrop):
+                x, _ = layer(x, self_attn_padding_mask=padding_mask, need_weights=False)
+
+        # T x B x C -> B x T x C
+        x = x.transpose(0, 1)
+
+        return x
+
+    def max_positions(self):
+        """Maximum output length supported by the encoder."""
+        return self.args.max_positions
+
+    def upgrade_state_dict_named(self, state_dict, name):
+        """Upgrade a (possibly old) state dict for new versions of fairseq."""
+        return state_dict
+
+
+class TransformerSentenceEncoderLayer(nn.Module):
+    """
+    Implements a Transformer Encoder Layer used in BERT/XLM style pre-trained
+    models.
+    """
+
+    def __init__(
+        self,
+        embedding_dim: int = 768,
+        ffn_embedding_dim: int = 3072,
+        num_attention_heads: int = 8,
+        dropout: float = 0.1,
+        attention_dropout: float = 0.1,
+        activation_dropout: float = 0.1,
+        activation_fn: str = "relu",
+        layer_norm_first: bool = False,
+    ) -> None:
+
+        super().__init__()
+        # Initialize parameters
+        self.embedding_dim = embedding_dim
+        self.dropout = dropout
+        self.activation_dropout = activation_dropout
+
+        # Initialize blocks
+        self.activation_fn = utils.get_activation_fn(activation_fn)
+        self.self_attn = MultiheadAttention(
+            self.embedding_dim,
+            num_attention_heads,
+            dropout=attention_dropout,
+            self_attention=True,
+        )
+
+        self.dropout1 = nn.Dropout(dropout)
+        self.dropout2 = nn.Dropout(self.activation_dropout)
+        self.dropout3 = nn.Dropout(dropout)
+
+        self.layer_norm_first = layer_norm_first
+
+        # layer norm associated with the self attention layer
+        self.self_attn_layer_norm = LayerNorm(self.embedding_dim)
+        self.fc1 = nn.Linear(self.embedding_dim, ffn_embedding_dim)
+        self.fc2 = nn.Linear(ffn_embedding_dim, self.embedding_dim)
+
+        # layer norm associated with the position wise feed-forward NN
+        self.final_layer_norm = LayerNorm(self.embedding_dim)
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        self_attn_mask: torch.Tensor = None,
+        self_attn_padding_mask: torch.Tensor = None,
+        need_weights: bool = False,
+        att_args=None,
+    ):
+        """
+        LayerNorm is applied either before or after the self-attention/ffn
+        modules similar to the original Transformer implementation.
+        """
+        residual = x
+
+        if self.layer_norm_first:
+            x = self.self_attn_layer_norm(x)
+            x, attn = self.self_attn(
+                query=x,
+                key=x,
+                value=x,
+                key_padding_mask=self_attn_padding_mask,
+                need_weights=need_weights,
+                attn_mask=self_attn_mask,
+            )
+            x = self.dropout1(x)
+            x = residual + x
+
+            residual = x
+            x = self.final_layer_norm(x)
+            x = self.activation_fn(self.fc1(x))
+            x = self.dropout2(x)
+            x = self.fc2(x)
+            x = self.dropout3(x)
+            x = residual + x
+        else:
+            x, attn = self.self_attn(
+                query=x,
+                key=x,
+                value=x,
+                key_padding_mask=self_attn_padding_mask,
+                need_weights=need_weights,
+                attn_mask=self_attn_mask,
+            )
+            x = self.dropout1(x)
+            x = residual + x
+
+            x = self.self_attn_layer_norm(x)
+
+            residual = x
+            x = self.activation_fn(self.fc1(x))
+            x = self.dropout2(x)
+            x = self.fc2(x)
+            x = self.dropout3(x)
+            x = residual + x
+            x = self.final_layer_norm(x)
+
+        return x, attn
+
+
+@register_model_architecture("wav2vec2", "wav2vec2")
+def base_architecture(args):
+    """Set defaults for wav2vec 2.0 options that are missing from ``args``."""
+    args.extractor_mode = getattr(args, "extractor_mode", "default")
+
+    args.encoder_layers = getattr(args, "encoder_layers", 12)
+    args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 768)
+    args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 3072)
+    args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 12)
+
+    args.activation_fn = getattr(args, "activation_fn", "gelu")
+
+    args.dropout = getattr(args, "dropout", 0.1)
+    args.attention_dropout = getattr(args, "attention_dropout", 0.1)
+    args.activation_dropout = getattr(args, "activation_dropout", 0.0)
+
+    args.final_dim = getattr(args, "final_dim", 0)
+
+    args.layer_norm_first = getattr(args, "layer_norm_first", False)
+    args.encoder_layerdrop = getattr(args, "encoder_layerdrop", 0.0)
+
+    conv_feature_layers = "[(512, 10, 5)] + [(512, 8, 4)]"
+    conv_feature_layers += " + [(512, 4, 2)] * 3 + [(512, 1, 1)]"
+    args.conv_feature_layers = getattr(args, "conv_feature_layers", conv_feature_layers)
+
+    args.logit_temp = getattr(args, "logit_temp", 0.1)
+
+    args.quantize_targets = getattr(args, "quantize_targets", False)
+    args.quantize_input = getattr(args, "quantize_input", False)
+    # read by the model constructor whenever quantize_input is set
+    args.same_quantizer = getattr(args, "same_quantizer", False)
+
+    args.feature_grad_mult = getattr(args, "feature_grad_mult", 1.0)
+
+    args.latent_vars = getattr(args, "latent_vars", 320)
+    args.latent_groups = getattr(args, "latent_groups", 2)
+    args.latent_dim = getattr(args, "latent_dim", 0)
+
+    args.mask_length = getattr(args, "mask_length", 10)
+    args.mask_prob = getattr(args, "mask_prob", 0.65)
+    args.mask_selection = getattr(args, "mask_selection", "static")
+    args.mask_other = getattr(args, "mask_other", 0)
+    args.no_mask_overlap = getattr(args, "no_mask_overlap", False)
+    args.mask_min_space = getattr(args, "mask_min_space", 1)
+
+    args.mask_channel_length = getattr(args, "mask_channel_length", 10)
+    args.mask_channel_prob = getattr(args, "mask_channel_prob", 0)
+    args.mask_channel_selection = getattr(args, "mask_channel_selection", "static")
+    args.mask_channel_other = getattr(args, "mask_channel_other", 0)
+    args.no_mask_channel_overlap = getattr(args, "no_mask_channel_overlap", False)
+    args.mask_channel_min_space = getattr(args, "mask_channel_min_space", 1)
+
+    args.dropout_input = getattr(args, "dropout_input", 0)
+    args.dropout_features = getattr(args, "dropout_features", 0)
+
+    args.num_negatives = getattr(args, "num_negatives", 100)
+    args.negatives_from_everywhere = getattr(args, "negatives_from_everywhere", False)
+    args.cross_sample_negatives = getattr(args, "cross_sample_negatives", 0)
+    args.codebook_negatives = getattr(args, "codebook_negatives", 0)
+
+    args.conv_pos = getattr(args, "conv_pos", 128)
+    args.conv_pos_groups = getattr(args, "conv_pos_groups", 16)
+
+    args.latent_temp = getattr(args, "latent_temp", "(2,0.5,0.999995)")
+    args.target_glu = getattr(args, "target_glu", False)
+    args.conv_bias = getattr(args, "conv_bias", False)
diff --git a/fairseq/models/wav2vec/wav2vec2_asr.py b/fairseq/models/wav2vec/wav2vec2_asr.py
new file mode 100644
index 0000000000000000000000000000000000000000..e47e1f700920a91d9364f7b62cb5859ddd4d37ae
--- /dev/null
+++ b/fairseq/models/wav2vec/wav2vec2_asr.py
@@ -0,0 +1,673 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import contextlib
+import copy
+import math
+import numpy as np
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from fairseq import checkpoint_utils, tasks, utils
+
+from fairseq.models import (
+    FairseqEncoder,
+    FairseqIncrementalDecoder,
+    FairseqEncoderDecoderModel,
+    BaseFairseqModel,
+    register_model,
+    register_model_architecture,
+)
+from fairseq.modules import LayerNorm, PositionalEmbedding, TransformerDecoderLayer
+
+
+def add_common_args(parser):  # registers the fine-tuning options shared by wav2vec_ctc and wav2vec_seq2seq
+    parser.add_argument("--w2v-path", help="path to wav2vec 2.0 model")
+    parser.add_argument(
+        "--no-pretrained-weights",
+        action="store_true",
+        help="if true, does not load pretrained weights",
+    )
+    parser.add_argument(
+        "--dropout-input",
+        type=float,
+        metavar="D",
+        help="dropout to apply to the input (after feat extr)",
+    )
+    parser.add_argument(
+        "--final-dropout",
+        type=float,
+        metavar="D",
+        help="dropout after transformer and before final projection",
+    )
+    parser.add_argument(
+        "--apply-mask", action="store_true", help="apply masking during fine-tuning"
+    )
+    parser.add_argument(
+        "--dropout",
+        type=float,
+        metavar="D",
+        help="dropout probability inside wav2vec 2.0 model",
+    )
+    parser.add_argument(
+        "--attention-dropout",
+        type=float,
+        metavar="D",
+        help="dropout probability for attention weights inside wav2vec 2.0 model",
+    )
+    parser.add_argument(
+        "--activation-dropout",
+        "--relu-dropout",
+        type=float,
+        metavar="D",
+        help="dropout probability after activation in FFN inside wav2vec 2.0 model",
+    )
+
+    parser.add_argument(
+        "--mask-length", type=int, help="repeat the mask indices multiple times"
+    )
+
+    parser.add_argument(
+        "--mask-prob", type=float, help="probability of replacing a token with mask"
+    )
+
+    parser.add_argument(
+        "--mask-selection",
+        type=str,
+        choices=["static", "uniform", "normal", "poisson"],
+        help="how to choose masks",
+    )
+
+    parser.add_argument(
+        "--mask-other",
+        type=float,
+        help="stdev of the mask length in case of 'normal' selection strategy",
+    )
+
+    parser.add_argument(
+        "--no-mask-overlap",
+        action="store_true",
+        help="if set, masked spans are not allowed to overlap",
+    )
+
+    parser.add_argument(
+        "--mask-channel-length", type=int, help="length of the mask for features (channels)"
+    )
+
+    parser.add_argument(
+        "--mask-channel-prob",
+        type=float,
+        help="probability of masking a feature (channel)",
+    )
+
+    parser.add_argument(
+        "--mask-channel-selection",
+        type=str,
+        choices=["static", "uniform", "normal", "poisson"],
+        help="how to choose channel masks",
+    )
+
+    parser.add_argument(
+        "--mask-channel-other",
+        type=float,
+        help="stdev of the channel mask length in case of 'normal' selection strategy",
+    )
+
+    parser.add_argument(
+        "--no-mask-channel-overlap",
+        action="store_true",
+        help="if set, channel masks are not allowed to overlap",
+    )
+
+    parser.add_argument(
+        "--freeze-finetune-updates",
+        default=0,
+        type=int,
+        help="don't finetune wav2vec for this many updates",
+    )
+
+    parser.add_argument(
+        "--feature-grad-mult",
+        default=None,
+        type=float,
+        help="reset feature grad mult in wav2vec 2.0 to this",
+    )
+
+    parser.add_argument(
+        "--layerdrop",
+        default=0.0,
+        type=float,
+        help="probability of dropping a layer in wav2vec 2.0",
+    )
+
+
+@register_model("wav2vec_ctc")
+class Wav2VecCtc(BaseFairseqModel):  # thin model wrapper around a single Wav2VecEncoder
+    @staticmethod
+    def add_args(parser):
+        """Add model-specific arguments to the parser."""
+        add_common_args(parser)
+
+    def __init__(self, w2v_encoder, args):
+        super().__init__()
+        self.w2v_encoder = w2v_encoder
+        self.args = args
+
+    def upgrade_state_dict_named(self, state_dict, name):
+        super().upgrade_state_dict_named(state_dict, name)
+        return state_dict
+
+    @classmethod
+    def build_model(cls, args, task):
+        """Build a new model instance."""
+        base_architecture(args)  # fill in defaults for any args that are missing
+        w2v_encoder = Wav2VecEncoder(args, task.target_dictionary)  # tgt_dict given: encoder projects to len(tgt_dict)
+        return cls(w2v_encoder, args)
+
+    def get_normalized_probs(self, net_output, log_probs):
+        """Get normalized probabilities (or log probs) from a net's output."""
+
+        logits = net_output["encoder_out"]  # the "encoder_out" entry of the dict returned by forward()
+        if log_probs:
+            return utils.log_softmax(logits.float(), dim=-1)
+        else:
+            return utils.softmax(logits.float(), dim=-1)
+
+    def forward(self, **kwargs):
+        x = self.w2v_encoder(**kwargs)  # returns the encoder's output dict unchanged
+        return x
+
+    # def max_positions(self):
+    #     return None
+
+
+@register_model("wav2vec_seq2seq")
+class TransformerModel(FairseqEncoderDecoderModel):  # Wav2VecEncoder paired with the TransformerDecoder below
+    def __init__(self, args, encoder, decoder):
+        super().__init__(encoder, decoder)
+
+    @staticmethod
+    def add_args(parser):
+        add_common_args(parser)
+
+        parser.add_argument(
+            "--decoder-embed-dim",
+            type=int,
+            metavar="N",
+            help="decoder embedding dimension",
+        )
+        parser.add_argument(
+            "--decoder-ffn-embed-dim",
+            type=int,
+            metavar="N",
+            help="decoder embedding dimension for FFN",
+        )
+        parser.add_argument(
+            "--decoder-layers", type=int, metavar="N", help="num decoder layers"
+        )
+        parser.add_argument(
+            "--decoder-layerdrop",
+            type=float,
+            metavar="D",
+            help="decoder layerdrop chance",
+        )
+        parser.add_argument(
+            "--decoder-attention-heads",
+            type=int,
+            metavar="N",
+            help="num decoder attention heads",
+        )
+        parser.add_argument(
+            "--decoder-learned-pos",
+            action="store_true",
+            help="use learned positional embeddings in the decoder",
+        )
+        parser.add_argument(
+            "--decoder-normalize-before",
+            action="store_true",
+            help="apply layernorm before each decoder block",
+        )
+        parser.add_argument(
+            "--no-token-positional-embeddings",
+            default=False,
+            action="store_true",
+            help="if set, disables positional embeddings (outside self attention)",
+        )
+
+        parser.add_argument(
+            "--decoder-dropout",
+            type=float,
+            metavar="D",
+            help="dropout probability in the decoder",
+        )
+        parser.add_argument(
+            "--decoder-attention-dropout",
+            type=float,
+            metavar="D",
+            help="dropout probability for attention weights inside the decoder",
+        )
+        parser.add_argument(
+            "--decoder-activation-dropout",
+            type=float,
+            metavar="D",
+            help="dropout probability after activation in FFN inside the decoder",
+        )
+
+        # fmt: on
+
+    @classmethod
+    def build_model(cls, args, task):
+        """Build a new model instance from *args* and the task's target dictionary."""
+
+        # make sure all arguments are present in older models
+        seq2seq_architecture(args)  # decoder defaults; it ends by calling base_architecture(args)
+
+        if not hasattr(args, "max_source_positions"):
+            args.max_source_positions = 2048
+        if not hasattr(args, "max_target_positions"):
+            args.max_target_positions = 2048
+
+        tgt_dict = task.target_dictionary
+
+        def build_embedding(dictionary, embed_dim):
+            num_embeddings = len(dictionary)
+            padding_idx = dictionary.pad()
+            emb = Embedding(num_embeddings, embed_dim, padding_idx)
+            return emb
+
+        decoder_embed_tokens = build_embedding(tgt_dict, args.decoder_embed_dim)
+
+        encoder = cls.build_encoder(args)
+        decoder = cls.build_decoder(args, tgt_dict, decoder_embed_tokens)
+        return cls(args, encoder, decoder)  # cls, not the literal class, so subclasses build themselves
+
+    @classmethod
+    def build_encoder(cls, args):
+        return Wav2VecEncoder(args)  # no tgt_dict: projects only if decoder_embed_dim differs from the encoder dim
+
+    @classmethod
+    def build_decoder(cls, args, tgt_dict, embed_tokens):
+        return TransformerDecoder(args, tgt_dict, embed_tokens)
+
+    def forward(self, **kwargs):
+        encoder_out = self.encoder(tbc=True, **kwargs)  # time-major (T x B x C), as reorder_encoder_out indexes it
+        decoder_out = self.decoder(encoder_out=encoder_out, **kwargs)
+        return decoder_out
+
+    def upgrade_state_dict_named(self, state_dict, name):
+        super().upgrade_state_dict_named(state_dict, name)
+        return state_dict
+
+
+class Wav2VecEncoder(FairseqEncoder):  # wraps a wav2vec 2.0 model and optionally projects its features
+    def __init__(self, args, tgt_dict=None):
+        self.apply_mask = args.apply_mask
+
+        arg_overrides = {  # fine-tuning values handed to load_checkpoint_to_cpu as overrides
+            "dropout": args.dropout,
+            "activation_dropout": args.activation_dropout,
+            "dropout_input": args.dropout_input,
+            "attention_dropout": args.attention_dropout,
+            "mask_length": args.mask_length,
+            "mask_prob": args.mask_prob,
+            "mask_selection": args.mask_selection,
+            "mask_other": args.mask_other,
+            "no_mask_overlap": args.no_mask_overlap,
+            "mask_channel_length": args.mask_channel_length,
+            "mask_channel_prob": args.mask_channel_prob,
+            "mask_channel_selection": args.mask_channel_selection,
+            "mask_channel_other": args.mask_channel_other,
+            "no_mask_channel_overlap": args.no_mask_channel_overlap,
+            "encoder_layerdrop": args.layerdrop,
+            "feature_grad_mult": args.feature_grad_mult,
+        }
+
+        if getattr(args, "w2v_args", None) is None:  # no stored pretraining args: read them from the checkpoint
+            state = checkpoint_utils.load_checkpoint_to_cpu(
+                args.w2v_path, arg_overrides
+            )
+            w2v_args = state["args"]
+        else:
+            state = None  # nothing to load weights from in this branch
+            w2v_args = args.w2v_args
+
+        assert args.normalize == w2v_args.normalize, 'Fine-tuning works best when data normalization is the same'
+
+        w2v_args.data = args.data  # build the pretraining task against the fine-tuning data path
+        task = tasks.setup_task(w2v_args)
+        model = task.build_model(w2v_args)
+
+        if state is not None and not args.no_pretrained_weights:
+            model.load_state_dict(state["model"], strict=True)
+
+        model.remove_pretraining_modules()
+
+        super().__init__(task.source_dictionary)
+
+        d = w2v_args.encoder_embed_dim
+
+        self.w2v_model = model
+
+        self.final_dropout = nn.Dropout(args.final_dropout)
+        self.freeze_finetune_updates = args.freeze_finetune_updates
+        self.num_updates = 0
+
+        if tgt_dict is not None:  # project straight to the target vocabulary
+            self.proj = Linear(d, len(tgt_dict))
+        elif getattr(args, 'decoder_embed_dim', d) != d:  # bridge to a decoder of a different width
+            self.proj = Linear(d, args.decoder_embed_dim)
+        else:
+            self.proj = None
+
+    def set_num_updates(self, num_updates):
+        """Set the number of parameters updates."""
+        super().set_num_updates(num_updates)
+        self.num_updates = num_updates
+
+    def forward(self, source, padding_mask, tbc=True, **kwargs):
+
+        w2v_args = {
+            "source": source,
+            "padding_mask": padding_mask,
+            "mask": self.apply_mask and self.training,  # masking only while training
+        }
+
+        ft = self.freeze_finetune_updates <= self.num_updates  # True once the freeze period has passed
+
+        with torch.no_grad() if not ft else contextlib.ExitStack():  # ExitStack() serves as a no-op context
+            x, padding_mask = self.w2v_model.extract_features(**w2v_args)
+
+            if tbc:
+                # B x T x C -> T x B x C
+                x = x.transpose(0, 1)
+
+        x = self.final_dropout(x)
+
+        if self.proj is not None:
+            x = self.proj(x)
+
+        return {
+            "encoder_out": x,  # T x B x C (left as B x T x C when tbc=False)
+            "encoder_padding_mask": padding_mask,  # B x T
+            "padding_mask": padding_mask,  # same object as "encoder_padding_mask"
+        }
+
+    def reorder_encoder_out(self, encoder_out, new_order):
+        if encoder_out["encoder_out"] is not None:
+            encoder_out["encoder_out"] = encoder_out["encoder_out"].index_select(
+                1, new_order
+            )
+        if encoder_out["encoder_padding_mask"] is not None:
+            mask = encoder_out["encoder_padding_mask"].index_select(0, new_order)
+            # forward() returns this mask under two keys; reorder both so neither goes stale
+            encoder_out["encoder_padding_mask"] = encoder_out["padding_mask"] = mask
+        return encoder_out
+
+    def max_positions(self):
+        """Maximum input length supported by the encoder."""
+        return None
+
+    def upgrade_state_dict_named(self, state_dict, name):
+        return state_dict
+
+
+class TransformerDecoder(FairseqIncrementalDecoder):
+    """
+    Transformer decoder consisting of *args.decoder_layers* layers. Each layer
+    is a :class:`TransformerDecoderLayer`.
+
+    Args:
+        args (argparse.Namespace): parsed command-line arguments
+        dictionary (~fairseq.data.Dictionary): decoding dictionary
+        embed_tokens (torch.nn.Embedding): token embedding (reused for output if shared)
+        no_encoder_attn (bool, optional): whether to attend to encoder outputs
+            (default: False).
+    """
+
+    def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False):
+        super().__init__(dictionary)
+
+        self.dropout = args.decoder_dropout
+        self.share_input_output_embed = args.share_decoder_input_output_embed
+
+        input_embed_dim = embed_tokens.embedding_dim
+        embed_dim = args.decoder_embed_dim
+        self.output_embed_dim = args.decoder_embed_dim
+        args.encoder_embed_dim = embed_dim  # NOTE(review): mutates the caller's args; runs before the deepcopy below
+
+        self.layerdrop = args.decoder_layerdrop
+
+        padding_idx = embed_tokens.padding_idx
+        self.max_target_positions = args.max_target_positions
+
+        self.embed_tokens = embed_tokens
+        self.embed_scale = math.sqrt(embed_dim)  # todo: try with input_embed_dim
+
+        self.project_in_dim = (  # only needed when the embedding width differs from the decoder width
+            Linear(input_embed_dim, embed_dim, bias=False)
+            if embed_dim != input_embed_dim
+            else None
+        )
+
+        self.embed_positions = (
+            PositionalEmbedding(
+                args.max_target_positions,
+                embed_dim,
+                padding_idx,
+                learned=args.decoder_learned_pos,
+            )
+            if not args.no_token_positional_embeddings
+            else None
+        )
+
+        args = copy.deepcopy(args)  # local copy: the dropout overrides below do not reach the caller's args
+        args.dropout = args.decoder_dropout
+        args.attention_dropout = args.decoder_attention_dropout
+        args.activation_dropout = args.decoder_activation_dropout
+
+        self.layers = nn.ModuleList([])
+        self.layers.extend(
+            [
+                TransformerDecoderLayer(args, no_encoder_attn)
+                for _ in range(args.decoder_layers)
+            ]
+        )
+
+        if not self.share_input_output_embed:  # separate output projection matrix
+            self.embed_out = nn.Parameter(
+                torch.Tensor(len(dictionary), self.output_embed_dim)
+            )
+            nn.init.normal_(self.embed_out, mean=0, std=self.output_embed_dim ** -0.5)
+
+        if args.decoder_normalize_before and not getattr(
+            args, "no_decoder_final_norm", False
+        ):
+            self.layer_norm = LayerNorm(embed_dim)
+        else:
+            self.layer_norm = None
+
+    def forward(
+        self, prev_output_tokens, encoder_out=None, incremental_state=None, **unused
+    ):
+        """
+        Args:
+            prev_output_tokens (LongTensor): previous decoder outputs of shape
+                `(batch, tgt_len)`, for teacher forcing
+            encoder_out (dict, optional): encoder output dict; its "encoder_out"
+                and "encoder_padding_mask" entries are passed to every layer
+            incremental_state (dict): dictionary used for storing state during
+                :ref:`Incremental decoding`
+
+        Returns:
+            tuple:
+                - the decoder's output of shape `(batch, tgt_len, vocab)`
+                - a dictionary with any model-specific outputs
+        """
+        prev_output_tokens = prev_output_tokens.long()
+        x, extra = self.extract_features(
+            prev_output_tokens, encoder_out, incremental_state
+        )
+        x = self.output_layer(x)
+        return x, extra
+
+    def extract_features(
+        self, prev_output_tokens, encoder_out=None, incremental_state=None, **unused
+    ):
+        """
+        Similar to *forward* but only return features.
+
+        Returns:
+            tuple:
+                - the decoder's features of shape `(batch, tgt_len, embed_dim)`
+                - a dict with "attn" (from the last executed layer) and "inner_states"
+        """
+
+        # embed positions
+        positions = (
+            self.embed_positions(
+                prev_output_tokens, incremental_state=incremental_state
+            )
+            if self.embed_positions is not None
+            else None
+        )
+
+        if incremental_state is not None:  # incremental decoding: keep only the newest step
+            prev_output_tokens = prev_output_tokens[:, -1:]
+            if positions is not None:
+                positions = positions[:, -1:]
+
+        # embed tokens and positions
+        x = self.embed_scale * self.embed_tokens(prev_output_tokens)
+
+        if self.project_in_dim is not None:
+            x = self.project_in_dim(x)
+
+        if positions is not None:
+            x += positions
+        x = F.dropout(x, p=self.dropout, training=self.training)
+
+        # B x T x C -> T x B x C
+        x = x.transpose(0, 1)
+        attn = None
+
+        inner_states = [x]
+
+        # decoder layers
+        for layer in self.layers:
+            dropout_probability = np.random.random()  # training only: layer is skipped when this is <= self.layerdrop
+            if not self.training or (dropout_probability > self.layerdrop):
+                x, attn, _ = layer(
+                    x,
+                    encoder_out["encoder_out"] if encoder_out is not None else None,
+                    encoder_out["encoder_padding_mask"]
+                    if encoder_out is not None
+                    else None,
+                    incremental_state,
+                    self_attn_mask=self.buffered_future_mask(x)  # no future mask during incremental decoding
+                    if incremental_state is None
+                    else None,
+                )
+                inner_states.append(x)
+
+        if self.layer_norm:
+            x = self.layer_norm(x)
+
+        # T x B x C -> B x T x C
+        x = x.transpose(0, 1)
+
+        return x, {"attn": attn, "inner_states": inner_states}
+
+    def output_layer(self, features, **kwargs):
+        """Project features to the vocabulary size."""
+        # project back to size of vocabulary
+        if self.share_input_output_embed:
+            return F.linear(features, self.embed_tokens.weight)
+        else:
+            return F.linear(features, self.embed_out)
+
+    def max_positions(self):
+        """Maximum output length supported by the decoder."""
+        if self.embed_positions is None:
+            return self.max_target_positions
+        return min(self.max_target_positions, self.embed_positions.max_positions)
+
+    def buffered_future_mask(self, tensor):  # cached -inf mask strictly above the diagonal, sized by tensor.size(0)
+        dim = tensor.size(0)
+        if (  # rebuild when absent, on a different device, or smaller than needed
+            not hasattr(self, "_future_mask")
+            or self._future_mask is None
+            or self._future_mask.device != tensor.device
+            or self._future_mask.size(0) < dim
+        ):
+            self._future_mask = torch.triu(
+                utils.fill_with_neg_inf(tensor.new(dim, dim)), 1
+            )
+        return self._future_mask[:dim, :dim]
+
+    def upgrade_state_dict_named(self, state_dict, name):
+        return state_dict
+
+
+def Embedding(num_embeddings, embedding_dim, padding_idx):  # factory: nn.Embedding with custom initialisation
+    m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
+    nn.init.normal_(m.weight, mean=0, std=embedding_dim ** -0.5)  # std shrinks with the embedding width
+    nn.init.constant_(m.weight[padding_idx], 0)  # padding row is zeroed after the normal init
+    return m
+
+
+def Linear(in_features, out_features, bias=True):  # factory: nn.Linear with Xavier-uniform weights
+    m = nn.Linear(in_features, out_features, bias)
+    nn.init.xavier_uniform_(m.weight)
+    if bias:  # a bias exists only when requested; it starts at zero
+        nn.init.constant_(m.bias, 0.0)
+    return m
+
+
+@register_model_architecture("wav2vec_ctc", "wav2vec_ctc")
+def base_architecture(args):  # fills in, in place, a default for every shared fine-tuning arg that is missing
+    args.no_pretrained_weights = getattr(args, "no_pretrained_weights", False)
+    args.dropout_input = getattr(args, "dropout_input", 0)
+    args.final_dropout = getattr(args, "final_dropout", 0)
+    args.apply_mask = getattr(args, "apply_mask", False)
+    args.dropout = getattr(args, "dropout", 0)
+    args.attention_dropout = getattr(args, "attention_dropout", 0)
+    args.activation_dropout = getattr(args, "activation_dropout", 0)
+
+    args.mask_length = getattr(args, "mask_length", 10)
+    args.mask_prob = getattr(args, "mask_prob", 0.5)
+    args.mask_selection = getattr(args, "mask_selection", "static")
+    args.mask_other = getattr(args, "mask_other", 0)
+    args.no_mask_overlap = getattr(args, "no_mask_overlap", False)
+    args.mask_channel_length = getattr(args, "mask_channel_length", 10)
+    args.mask_channel_prob = getattr(args, "mask_channel_prob", 0.5)
+    args.mask_channel_selection = getattr(args, "mask_channel_selection", "static")
+    args.mask_channel_other = getattr(args, "mask_channel_other", 0)
+    args.no_mask_channel_overlap = getattr(args, "no_mask_channel_overlap", False)
+
+    args.freeze_finetune_updates = getattr(args, "freeze_finetune_updates", 0)
+    args.feature_grad_mult = getattr(args, "feature_grad_mult", None) or 0  # the CLI flag defaults to None, not "missing"
+    args.layerdrop = getattr(args, "layerdrop", 0.0)
+
+
+@register_model_architecture("wav2vec_seq2seq", "wav2vec_seq2seq")
+def seq2seq_architecture(args):  # fills in missing decoder args in place, then the shared ones
+    args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 1024)
+    args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 4096)
+    args.decoder_layers = getattr(args, "decoder_layers", 10)
+    args.decoder_layerdrop = getattr(args, "decoder_layerdrop", 0)
+    args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 16)
+    args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False)
+    args.decoder_normalize_before = getattr(args, "decoder_normalize_before", False)
+    args.no_token_positional_embeddings = getattr(
+        args, "no_token_positional_embeddings", False
+    )
+    args.decoder_dropout = getattr(args, "decoder_dropout", 0)
+    args.decoder_attention_dropout = getattr(args, "decoder_attention_dropout", 0)
+    args.decoder_activation_dropout = getattr(args, "decoder_activation_dropout", 0)
+    args.share_decoder_input_output_embed = getattr(args, "share_decoder_input_output_embed", False)  # no CLI flag in add_args
+
+    base_architecture(args)  # defaults for the args registered by add_common_args
diff --git a/fairseq/modules/__init__.py b/fairseq/modules/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d526d4a92e706f386d815af5303636fc5f955789
--- /dev/null
+++ b/fairseq/modules/__init__.py
@@ -0,0 +1,75 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .adaptive_input import AdaptiveInput
+from .adaptive_softmax import AdaptiveSoftmax
+from .beamable_mm import BeamableMM
+from .character_token_embedder import CharacterTokenEmbedder
+from .conv_tbc import ConvTBC
+from .cross_entropy import cross_entropy
+from .downsampled_multihead_attention import DownsampledMultiHeadAttention
+from .dynamic_convolution import DynamicConv, DynamicConv1dTBC
+from .dynamic_crf_layer import DynamicCRF
+from .fairseq_dropout import FairseqDropout
+from .fp32_group_norm import Fp32GroupNorm
+from .gelu import gelu, gelu_accurate
+from .grad_multiply import GradMultiply
+from .gumbel_vector_quantizer import GumbelVectorQuantizer
+from .kmeans_vector_quantizer import KmeansVectorQuantizer
+from .layer_drop import LayerDropModuleList
+from .layer_norm import Fp32LayerNorm, LayerNorm
+from .learned_positional_embedding import LearnedPositionalEmbedding
+from .lightweight_convolution import LightweightConv, LightweightConv1dTBC
+from .linearized_convolution import LinearizedConvolution
+from .multihead_attention import MultiheadAttention
+from .positional_embedding import PositionalEmbedding
+from .same_pad import SamePad
+from .scalar_bias import ScalarBias
+from .sinusoidal_positional_embedding import SinusoidalPositionalEmbedding
+from .transformer_sentence_encoder_layer import TransformerSentenceEncoderLayer
+from .transformer_sentence_encoder import TransformerSentenceEncoder
+from .transpose_last import TransposeLast
+from .unfold import unfold1d
+from .transformer_layer import TransformerDecoderLayer, TransformerEncoderLayer
+from .vggblock import VGGBlock
+
+__all__ = [  # public names of this package; each one is imported above
+    'AdaptiveInput',
+    'AdaptiveSoftmax',
+    'BeamableMM',
+    'CharacterTokenEmbedder',
+    'ConvTBC',
+    'cross_entropy',
+    'DownsampledMultiHeadAttention',
+    'DynamicConv1dTBC',
+    'DynamicConv',
+    'DynamicCRF',
+    'FairseqDropout',
+    'Fp32GroupNorm',
+    'Fp32LayerNorm',
+    'gelu',
+    'gelu_accurate',
+    'GradMultiply',
+    'GumbelVectorQuantizer',
+    'KmeansVectorQuantizer',
+    'LayerDropModuleList',
+    'LayerNorm',
+    'LearnedPositionalEmbedding',
+    'LightweightConv1dTBC',
+    'LightweightConv',
+    'LinearizedConvolution',
+    'MultiheadAttention',
+    'PositionalEmbedding',
+    'SamePad',
+    'ScalarBias',
+    'SinusoidalPositionalEmbedding',
+    'TransformerSentenceEncoderLayer',
+    'TransformerSentenceEncoder',
+    'TransformerDecoderLayer',
+    'TransformerEncoderLayer',
+    'TransposeLast',
+    'VGGBlock',
+    'unfold1d',
+]
diff --git a/fairseq/modules/__pycache__/__init__.cpython-310.pyc b/fairseq/modules/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b7647b49cf378406593fe979970afac6cba61cee
Binary files /dev/null and b/fairseq/modules/__pycache__/__init__.cpython-310.pyc differ
diff --git a/fairseq/modules/__pycache__/adaptive_input.cpython-310.pyc b/fairseq/modules/__pycache__/adaptive_input.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0ea83f226dd60b971007625a5ab3e9886eb5a04e
Binary files /dev/null and b/fairseq/modules/__pycache__/adaptive_input.cpython-310.pyc differ
diff --git a/fairseq/modules/__pycache__/adaptive_softmax.cpython-310.pyc b/fairseq/modules/__pycache__/adaptive_softmax.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..45d089a17c1af4e5a7290c0c871b8d2ace7bb21c
Binary files /dev/null and b/fairseq/modules/__pycache__/adaptive_softmax.cpython-310.pyc differ
diff --git a/fairseq/modules/__pycache__/beamable_mm.cpython-310.pyc b/fairseq/modules/__pycache__/beamable_mm.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f2260aed078d5243054bfa8424c2c040a70adb55
Binary files /dev/null and b/fairseq/modules/__pycache__/beamable_mm.cpython-310.pyc differ
diff --git a/fairseq/modules/__pycache__/character_token_embedder.cpython-310.pyc b/fairseq/modules/__pycache__/character_token_embedder.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b83ee96b53db529ee2668eaf9bd385ee6d8d3444
Binary files /dev/null and b/fairseq/modules/__pycache__/character_token_embedder.cpython-310.pyc differ
diff --git a/fairseq/modules/__pycache__/conv_tbc.cpython-310.pyc b/fairseq/modules/__pycache__/conv_tbc.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6d6ca22fdaddf541c5f6cc27a97421003810f4d1
Binary files /dev/null and b/fairseq/modules/__pycache__/conv_tbc.cpython-310.pyc differ
diff --git a/fairseq/modules/__pycache__/cross_entropy.cpython-310.pyc b/fairseq/modules/__pycache__/cross_entropy.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1feb46386f0bc977010f5324d8468225b9059d54
Binary files /dev/null and b/fairseq/modules/__pycache__/cross_entropy.cpython-310.pyc differ
diff --git a/fairseq/modules/__pycache__/downsampled_multihead_attention.cpython-310.pyc b/fairseq/modules/__pycache__/downsampled_multihead_attention.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b169b3d38845383da383e9642d838d7b85d53c87
Binary files /dev/null and b/fairseq/modules/__pycache__/downsampled_multihead_attention.cpython-310.pyc differ
diff --git a/fairseq/modules/__pycache__/dynamic_convolution.cpython-310.pyc b/fairseq/modules/__pycache__/dynamic_convolution.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8c2d3e56b70e60cdf03a5fadb372eab5be8426b4
Binary files /dev/null and b/fairseq/modules/__pycache__/dynamic_convolution.cpython-310.pyc differ
diff --git a/fairseq/modules/__pycache__/dynamic_crf_layer.cpython-310.pyc b/fairseq/modules/__pycache__/dynamic_crf_layer.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cd1b970ae395a7b860014dacba319e4ea8142b10
Binary files /dev/null and b/fairseq/modules/__pycache__/dynamic_crf_layer.cpython-310.pyc differ
diff --git a/fairseq/modules/__pycache__/fairseq_dropout.cpython-310.pyc b/fairseq/modules/__pycache__/fairseq_dropout.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0170b7a20d98bd77900f16becb9cf60a4bec7b08
Binary files /dev/null and b/fairseq/modules/__pycache__/fairseq_dropout.cpython-310.pyc differ
diff --git a/fairseq/modules/__pycache__/fp32_group_norm.cpython-310.pyc b/fairseq/modules/__pycache__/fp32_group_norm.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..41dd550730c8c90f10cd89c6de32125298f4eb14
Binary files /dev/null and b/fairseq/modules/__pycache__/fp32_group_norm.cpython-310.pyc differ
diff --git a/fairseq/modules/__pycache__/gelu.cpython-310.pyc b/fairseq/modules/__pycache__/gelu.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..405941480b21b80d974ca02b95d0f032f3e727a0
Binary files /dev/null and b/fairseq/modules/__pycache__/gelu.cpython-310.pyc differ
diff --git a/fairseq/modules/__pycache__/grad_multiply.cpython-310.pyc b/fairseq/modules/__pycache__/grad_multiply.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ffe9ea33667d5930ab61c657d98bca13e1748297
Binary files /dev/null and b/fairseq/modules/__pycache__/grad_multiply.cpython-310.pyc differ
diff --git a/fairseq/modules/__pycache__/gumbel_vector_quantizer.cpython-310.pyc b/fairseq/modules/__pycache__/gumbel_vector_quantizer.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..19263504dc7d3d30f2ffe62dc88160733063243b
Binary files /dev/null and b/fairseq/modules/__pycache__/gumbel_vector_quantizer.cpython-310.pyc differ
diff --git a/fairseq/modules/__pycache__/kmeans_vector_quantizer.cpython-310.pyc b/fairseq/modules/__pycache__/kmeans_vector_quantizer.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f5c3a9c0cad80373e4f5c2db0ce2b90162b3fc0e
Binary files /dev/null and b/fairseq/modules/__pycache__/kmeans_vector_quantizer.cpython-310.pyc differ
diff --git a/fairseq/modules/__pycache__/layer_drop.cpython-310.pyc b/fairseq/modules/__pycache__/layer_drop.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a1fc4d693222120b4542e7855cc334647bb34fdc
Binary files /dev/null and b/fairseq/modules/__pycache__/layer_drop.cpython-310.pyc differ
diff --git a/fairseq/modules/__pycache__/layer_norm.cpython-310.pyc b/fairseq/modules/__pycache__/layer_norm.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..01a31d99764aa6f86a22861e0d7337064c1ebb35
Binary files /dev/null and b/fairseq/modules/__pycache__/layer_norm.cpython-310.pyc differ
diff --git a/fairseq/modules/__pycache__/learned_positional_embedding.cpython-310.pyc b/fairseq/modules/__pycache__/learned_positional_embedding.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8df51332596499ec34c1fadbb3e7b3a2b74a98b3
Binary files /dev/null and b/fairseq/modules/__pycache__/learned_positional_embedding.cpython-310.pyc differ
diff --git a/fairseq/modules/__pycache__/lightweight_convolution.cpython-310.pyc b/fairseq/modules/__pycache__/lightweight_convolution.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..eae3ab8b823c400254c4f882fc166e60e55a8c1d
Binary files /dev/null and b/fairseq/modules/__pycache__/lightweight_convolution.cpython-310.pyc differ
diff --git a/fairseq/modules/__pycache__/linearized_convolution.cpython-310.pyc b/fairseq/modules/__pycache__/linearized_convolution.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4e4bb6eaafe65009e5f4342df0b48452ddb000fa
Binary files /dev/null and b/fairseq/modules/__pycache__/linearized_convolution.cpython-310.pyc differ
diff --git a/fairseq/modules/__pycache__/multihead_attention.cpython-310.pyc b/fairseq/modules/__pycache__/multihead_attention.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..25b27da4cd349c39cb4a32df9583cbe975c5bdbc
Binary files /dev/null and b/fairseq/modules/__pycache__/multihead_attention.cpython-310.pyc differ
diff --git a/fairseq/modules/__pycache__/positional_embedding.cpython-310.pyc b/fairseq/modules/__pycache__/positional_embedding.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a1d0abd91491a14d982136090e199c4f551d355a
Binary files /dev/null and b/fairseq/modules/__pycache__/positional_embedding.cpython-310.pyc differ
diff --git a/fairseq/modules/__pycache__/quant_noise.cpython-310.pyc b/fairseq/modules/__pycache__/quant_noise.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3bfa36feab29401b7c09abcd257e9af3727b2dea
Binary files /dev/null and b/fairseq/modules/__pycache__/quant_noise.cpython-310.pyc differ
diff --git a/fairseq/modules/__pycache__/same_pad.cpython-310.pyc b/fairseq/modules/__pycache__/same_pad.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..63dc1d3862ca5d731a5bc44ca6cf51b3840dc945
Binary files /dev/null and b/fairseq/modules/__pycache__/same_pad.cpython-310.pyc differ
diff --git a/fairseq/modules/__pycache__/scalar_bias.cpython-310.pyc b/fairseq/modules/__pycache__/scalar_bias.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c71437cb379d5f42e07bccbeb714396f5a9ada78
Binary files /dev/null and b/fairseq/modules/__pycache__/scalar_bias.cpython-310.pyc differ
diff --git a/fairseq/modules/__pycache__/sinusoidal_positional_embedding.cpython-310.pyc b/fairseq/modules/__pycache__/sinusoidal_positional_embedding.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bc3a680863f1e7d932cfc85b397887a84328ee4d
Binary files /dev/null and b/fairseq/modules/__pycache__/sinusoidal_positional_embedding.cpython-310.pyc differ
diff --git a/fairseq/modules/__pycache__/transformer_layer.cpython-310.pyc b/fairseq/modules/__pycache__/transformer_layer.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e0cda59153fdef6be6be508ef84628eaa5c9eeb4
Binary files /dev/null and b/fairseq/modules/__pycache__/transformer_layer.cpython-310.pyc differ
diff --git a/fairseq/modules/__pycache__/transformer_sentence_encoder.cpython-310.pyc b/fairseq/modules/__pycache__/transformer_sentence_encoder.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8f3cc1839eb1c65842488302c76d08396b85b261
Binary files /dev/null and b/fairseq/modules/__pycache__/transformer_sentence_encoder.cpython-310.pyc differ
diff --git a/fairseq/modules/__pycache__/transformer_sentence_encoder_layer.cpython-310.pyc b/fairseq/modules/__pycache__/transformer_sentence_encoder_layer.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..37a64b8aaf1ab2d0750dd554313c49e30a45b78f
Binary files /dev/null and b/fairseq/modules/__pycache__/transformer_sentence_encoder_layer.cpython-310.pyc differ
diff --git a/fairseq/modules/__pycache__/transpose_last.cpython-310.pyc b/fairseq/modules/__pycache__/transpose_last.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b6f08e03254f64bb4725adb4315130988d763c62
Binary files /dev/null and b/fairseq/modules/__pycache__/transpose_last.cpython-310.pyc differ
diff --git a/fairseq/modules/__pycache__/unfold.cpython-310.pyc b/fairseq/modules/__pycache__/unfold.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e0c86516990ff72131faf40fa7ece0f3e5348cf7
Binary files /dev/null and b/fairseq/modules/__pycache__/unfold.cpython-310.pyc differ
diff --git a/fairseq/modules/__pycache__/vggblock.cpython-310.pyc b/fairseq/modules/__pycache__/vggblock.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e06080e8b4223e14217264fa10ca8b00ce902308
Binary files /dev/null and b/fairseq/modules/__pycache__/vggblock.cpython-310.pyc differ
diff --git a/fairseq/modules/adaptive_input.py b/fairseq/modules/adaptive_input.py
new file mode 100644
index 0000000000000000000000000000000000000000..4cfe8fca6605f4cb5a4b0134d45c362acd60e67c
--- /dev/null
+++ b/fairseq/modules/adaptive_input.py
@@ -0,0 +1,78 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+import torch
+from torch import nn
+from fairseq.modules.quant_noise import quant_noise
+
+from typing import List
+
+
+class AdaptiveInput(nn.Module):
+    """Adaptive input embedding: one ``nn.Embedding`` + bias-free ``nn.Linear`` per cutoff band, each projected to ``output_dim``."""
+    def __init__(
+        self,
+        vocab_size: int,
+        padding_idx: int,
+        initial_dim: int,
+        factor: float,
+        output_dim: int,
+        cutoff: List[int],
+        q_noise: float = 0,
+        qn_block_size: int = 8,
+    ):
+        super().__init__()
+
+        if vocab_size > cutoff[-1]:
+            cutoff = cutoff + [vocab_size]  # builds a new list; the vocabulary size becomes the last boundary
+        else:
+            assert vocab_size == cutoff[
+                -1], 'cannot specify cutoff larger than vocab size'
+
+        self.cutoff = cutoff
+        self.embedding_dim = output_dim
+        self.padding_idx = padding_idx
+
+        self.embeddings = nn.ModuleList()
+        for i in range(len(self.cutoff)):
+            prev = self.cutoff[i - 1] if i > 0 else 0
+            size = self.cutoff[i] - prev  # number of token ids covered by band i
+            dim = int(initial_dim // (factor ** i))  # band i embeds into initial_dim // factor**i dimensions
+            seq = nn.Sequential(
+                nn.Embedding(size, dim, self.padding_idx),
+                quant_noise(nn.Linear(dim, output_dim, bias=False), q_noise, qn_block_size),
+            )
+
+            self.embeddings.append(seq)
+            self.padding_idx = None  # so only the first band's nn.Embedding receives padding_idx
+        self.padding_idx = padding_idx  # restore the attribute cleared inside the loop
+
+        def init_weights(m):
+            if isinstance(m, nn.Embedding):
+                nn.init.normal_(m.weight, mean=0, std=m.weight.shape[1] ** -0.5)
+                nn.init.constant_(m.weight[padding_idx], 0)  # zeroes row `padding_idx` of every band's table
+            elif hasattr(m, 'weight'):
+                nn.init.xavier_uniform_(m.weight)
+
+        self.apply(init_weights)
+
+        self.register_buffer('_float_tensor', torch.FloatTensor(1))  # used by forward() to allocate its result
+
+    def weights_for_band(self, band: int):
+        return self.embeddings[band][0].weight, self.embeddings[band][1].weight  # (embedding, projection) weights
+
+    def forward(self, input: torch.Tensor):
+        result = self._float_tensor.new(input.shape + (self.embedding_dim,))  # written band by band below
+        for i in range(len(self.cutoff)):
+            mask = input.lt(self.cutoff[i])
+            if i > 0:
+                mask.mul_(input.ge(self.cutoff[i - 1]))  # keep only ids in [cutoff[i - 1], cutoff[i])
+                chunk_input = input[mask] - self.cutoff[i - 1]  # make ids relative to the start of the band
+            else:
+                chunk_input = input[mask]
+            if mask.any():
+                result[mask] = self.embeddings[i](chunk_input)
+        return result
diff --git a/fairseq/modules/adaptive_softmax.py b/fairseq/modules/adaptive_softmax.py
new file mode 100644
index 0000000000000000000000000000000000000000..96f8b75ad34da2bc2d0048babbd74034764938ea
--- /dev/null
+++ b/fairseq/modules/adaptive_softmax.py
@@ -0,0 +1,214 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import operator
+import functools
+
+import torch
+import torch.nn.functional as F
+from fairseq.modules.quant_noise import quant_noise
+from fairseq.modules.fairseq_dropout import FairseqDropout
+from torch import nn
+
+
+class TiedLinear(nn.Module):  # bias-free linear map over a weight tensor supplied by the caller
+    def __init__(self, weight, transpose):
+        super().__init__()
+        self.weight, self.transpose = weight, transpose
+
+    def forward(self, input):
+        shared = self.weight.t() if self.transpose else self.weight  # optionally use the transposed weight
+        return F.linear(input, shared)
+
+
+class TiedHeadModule(nn.Module):  # output head whose word scores come from a tied embedding matrix
+    def __init__(self, weights, input_dim, num_classes, q_noise, qn_block_size):
+        super().__init__()
+        tied_emb, _ = weights  # the second element of `weights` is not used by this module
+        self.num_words, emb_dim = tied_emb.size()
+
+        self.word_proj = quant_noise(TiedLinear(tied_emb, transpose=False), q_noise, qn_block_size)
+        if input_dim != emb_dim:  # insert a projection so the input matches the tied embedding width
+            self.word_proj = nn.Sequential(
+                quant_noise(nn.Linear(input_dim, emb_dim, bias=False), q_noise, qn_block_size),
+                self.word_proj,
+            )
+
+        self.class_proj = quant_noise(nn.Linear(input_dim, num_classes, bias=False), q_noise, qn_block_size)
+        self.out_dim = self.num_words + num_classes  # word scores first, then class scores
+
+        self.register_buffer('_float_tensor', torch.FloatTensor(1))  # used by forward() to allocate `out`
+
+    def forward(self, input):
+        inp_sz = functools.reduce(operator.mul, input.shape[:-1], 1)  # product of all leading dimensions
+        out = self._float_tensor.new(inp_sz, self.out_dim)
+        out[:, :self.num_words] = self.word_proj(input.view(inp_sz, -1))
+        out[:, self.num_words:] = self.class_proj(input.view(inp_sz, -1))
+        return out
+
+
+class AdaptiveSoftmax(nn.Module):
+    """
+    This is an implementation of the efficient softmax approximation for
+    graphical processing units (GPU), described in the paper "Efficient softmax
+    approximation for GPUs" (http://arxiv.org/abs/1609.04309).
+    """
+
+    def __init__(self, vocab_size, input_dim, cutoff, dropout, factor=4., adaptive_inputs=None, tie_proj=False,
+                 q_noise=0, qn_block_size=8):
+        super().__init__()
+
+        if vocab_size > cutoff[-1]:
+            cutoff = cutoff + [vocab_size]  # builds a new list; the vocabulary size becomes the last boundary
+        else:
+            assert vocab_size == cutoff[
+                -1], 'cannot specify cutoff larger than vocab size'
+
+        output_dim = cutoff[0] + len(cutoff) - 1  # head width: cutoff[0] words plus one entry per tail band
+
+        self.vocab_size = vocab_size
+        self.cutoff = cutoff
+        self.dropout_module = FairseqDropout(dropout, module_name=self.__class__.__name__)
+        self.input_dim = input_dim
+        self.factor = factor
+        self.q_noise = q_noise
+        self.qn_block_size = qn_block_size
+
+        self.lsm = nn.LogSoftmax(dim=1)
+
+        if adaptive_inputs is not None:
+            self.head = TiedHeadModule(adaptive_inputs.weights_for_band(0), input_dim, len(cutoff) - 1, self.q_noise, self.qn_block_size)
+        else:
+            self.head = quant_noise(nn.Linear(input_dim, output_dim, bias=False), self.q_noise, self.qn_block_size)
+
+        self._make_tail(adaptive_inputs, tie_proj)
+
+        def init_weights(m):
+            if hasattr(m, 'weight') and not isinstance(m, TiedLinear) and not isinstance(m, TiedHeadModule):
+                nn.init.xavier_uniform_(m.weight)  # tied modules are skipped so shared weights are not re-initialized
+
+        self.apply(init_weights)
+
+        self.register_buffer('version', torch.LongTensor([1]))  # its key is required by upgrade_state_dict_named
+
+    def _make_tail(self, adaptive_inputs=None, tie_proj=False):
+        self.tail = nn.ModuleList()
+        for i in range(len(self.cutoff) - 1):
+            dim = int(self.input_dim // self.factor ** (i + 1))  # hidden width of tail band i
+
+            tied_emb, tied_proj = adaptive_inputs.weights_for_band(i + 1) \
+                if adaptive_inputs is not None else (None, None)
+
+            if tied_proj is not None:
+                if tie_proj:
+                    proj = quant_noise(TiedLinear(tied_proj, transpose=True), self.q_noise, self.qn_block_size)
+                else:
+                    proj = quant_noise(nn.Linear(tied_proj.size(0), tied_proj.size(1), bias=False), self.q_noise, self.qn_block_size)
+            else:
+                proj = quant_noise(nn.Linear(self.input_dim, dim, bias=False), self.q_noise, self.qn_block_size)
+
+            if tied_emb is None:
+                out_proj = nn.Linear(dim, self.cutoff[i + 1] - self.cutoff[i], bias=False)
+            else:
+                out_proj = TiedLinear(tied_emb, transpose=False)
+
+            m = nn.Sequential(
+                proj,
+                nn.Dropout(self.dropout_module.p),
+                quant_noise(out_proj, self.q_noise, self.qn_block_size),
+            )
+
+            self.tail.append(m)
+
+    def upgrade_state_dict_named(self, state_dict, name):
+        version_name = name + '.version'
+        if version_name not in state_dict:  # checkpoints saved without the `version` buffer are rejected
+            raise Exception('This version of the model is no longer supported')
+
+    def adapt_target(self, target):
+        """
+        In order to be efficient, the AdaptiveSoftMax does not compute the
+        scores for all the word of the vocabulary for all the examples. It is
+        thus necessary to call the method adapt_target of the AdaptiveSoftMax
+        layer inside each forward pass.
+        """
+
+        target = target.view(-1)
+        new_target = [target.clone()]  # entry 0 is the head target; tail targets are appended per band
+        target_idxs = []
+
+        for i in range(len(self.cutoff) - 1):
+            mask = target.ge(self.cutoff[i]).mul(target.lt(self.cutoff[i + 1]))
+            new_target[0][mask] = self.cutoff[0] + i  # in the head, tail band i is the class at cutoff[0] + i
+
+            if mask.any():
+                target_idxs.append(mask.nonzero().squeeze(1))
+                new_target.append(target[mask].add(-self.cutoff[i]))  # ids relative to the start of the band
+            else:
+                target_idxs.append(None)
+                new_target.append(None)
+
+        return new_target, target_idxs
+
+    def forward(self, input, target):
+        """
+        Args:
+            input: (b x t x d)
+            target: (b x t)
+        Returns:
+            2 lists: output for each cutoff section and new targets by cut off
+        """
+
+        input = input.contiguous().view(-1, input.size(-1))
+        input = self.dropout_module(input)
+
+        new_target, target_idxs = self.adapt_target(target)
+        output = [self.head(input)]
+
+        for i in range(len(target_idxs)):
+            if target_idxs[i] is not None:
+                output.append(self.tail[i](input.index_select(0, target_idxs[i])))  # only rows whose target is in band i
+            else:
+                output.append(None)
+
+        return output, new_target
+
+    def get_log_prob(self, input, target):
+        """
+        Computes the log probabilities for all the words of the vocabulary,
+        given a 3D tensor of hidden vectors (bsz x length x dim).
+        """
+
+        bsz, length, dim = input.size()
+        input = input.contiguous().view(-1, dim)
+
+        if target is not None:
+            _, target_idxs = self.adapt_target(target)
+        else:
+            target_idxs = None
+
+        head_y = self.head(input)
+        log_probs = head_y.new_zeros(input.size(0), self.vocab_size)
+
+        head_sz = self.cutoff[0] + len(self.tail)
+        log_probs[:, :head_sz] = self.lsm(head_y)
+        tail_priors = log_probs[:, self.cutoff[0]: head_sz].clone()  # log-prob of each tail band, copied before being overwritten
+
+        for i in range(len(self.tail)):
+            start = self.cutoff[i]
+            end = self.cutoff[i + 1]
+
+            if target_idxs is None:  # no targets given: evaluate every tail band for every row
+                tail_out = log_probs[:, start:end]
+                tail_out.copy_(self.tail[i](input))
+                log_probs[:, start:end] = self.lsm(tail_out).add_(tail_priors[:, i, None])
+            elif target_idxs[i] is not None:  # targets given: only rows whose target lies in this band
+                idxs = target_idxs[i]
+                tail_out = log_probs[idxs, start:end]
+                tail_out.copy_(self.tail[i](input[idxs]))
+                log_probs[idxs, start:end] = self.lsm(tail_out).add_(tail_priors[idxs, i, None])
+
+        log_probs = log_probs.view(bsz, length, -1)
+        return log_probs
diff --git a/fairseq/modules/beamable_mm.py b/fairseq/modules/beamable_mm.py
new file mode 100644
index 0000000000000000000000000000000000000000..df77105a946258a58f3615fd51f399454223c0e2
--- /dev/null
+++ b/fairseq/modules/beamable_mm.py
@@ -0,0 +1,47 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import torch.nn as nn
+
+
+class BeamableMM(nn.Module):
+    """This module provides an optimized MM for beam decoding with attention.
+
+    It leverages the fact that the source-side of the input is replicated beam
+    times and the target-side of the input is of width one. This layer speeds up
+    inference by replacing the inputs {(bsz x 1 x nhu), (bsz x sz2 x nhu)}
+    with smaller inputs {(bsz/beam x beam x nhu), (bsz/beam x sz2 x nhu)}.
+    """
+    def __init__(self, beam_size=None):
+        super(BeamableMM, self).__init__()
+        self.beam_size = beam_size  # None disables the beam-specific fast path in forward()
+
+    def forward(self, input1, input2):
+        if (
+            not self.training and           # test mode
+            self.beam_size is not None and  # beam size is set
+            input1.dim() == 3 and           # only support batched input
+            input1.size(1) == 1             # single time step update
+        ):
+            bsz, beam = input1.size(0), self.beam_size
+
+            # bsz x 1 x nhu --> bsz/beam x beam x nhu
+            input1 = input1[:, 0, :].unfold(0, beam, beam).transpose(2, 1)
+
+            # bsz x sz2 x nhu --> bsz/beam x sz2 x nhu
+            input2 = input2.unfold(0, beam, beam)[:, :, :, 0]  # keeps the first entry of every group of `beam` rows
+
+            # use non batched operation if bsz = beam
+            if input1.size(0) == 1:
+                output = torch.mm(input1[0, :, :], input2[0, :, :])
+            else:
+                output = input1.bmm(input2)
+            return output.view(bsz, 1, -1)  # back to one row per original batch entry
+        else:
+            return input1.bmm(input2)  # fallback: plain batched matrix multiply
+
+    def set_beam_size(self, beam_size):
+        self.beam_size = beam_size
diff --git a/fairseq/modules/character_token_embedder.py b/fairseq/modules/character_token_embedder.py
new file mode 100644
index 0000000000000000000000000000000000000000..3abdaf4f28a5affb505a40cf07f5d95f550cceb4
--- /dev/null
+++ b/fairseq/modules/character_token_embedder.py
@@ -0,0 +1,204 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+from typing import List, Tuple
+
+import torch
+from torch import nn
+import torch.nn.functional as F
+
+from fairseq.data import Dictionary
+
+CHAR_PAD_IDX = 0
+CHAR_EOS_IDX = 257
+
+
+logger = logging.getLogger(__name__)
+
+
+class CharacterTokenEmbedder(torch.nn.Module):  # builds word embeddings from character ids via Conv1d filters
+    def __init__(
+            self,
+            vocab: Dictionary,
+            filters: List[Tuple[int, int]],
+            char_embed_dim: int,
+            word_embed_dim: int,
+            highway_layers: int,
+            max_char_len: int = 50,
+            char_inputs: bool = False
+    ):
+        super(CharacterTokenEmbedder, self).__init__()
+
+        self.onnx_trace = False
+        self.embedding_dim = word_embed_dim
+        self.max_char_len = max_char_len
+        self.char_embeddings = nn.Embedding(257, char_embed_dim, padding_idx=0)  # index 0 pads; set_vocab stores byte b as b + 1
+        self.symbol_embeddings = nn.Parameter(torch.FloatTensor(2, word_embed_dim))  # rows indexed by eos_idx / unk_idx
+        self.eos_idx, self.unk_idx = 0, 1
+        self.char_inputs = char_inputs
+
+        self.convolutions = nn.ModuleList()
+        for width, out_c in filters:  # each filter is a (kernel width, output channels) pair
+            self.convolutions.append(
+                nn.Conv1d(char_embed_dim, out_c, kernel_size=width)
+            )
+
+        last_dim = sum(f[1] for f in filters)  # width of the concatenated convolution outputs
+
+        self.highway = Highway(last_dim, highway_layers) if highway_layers > 0 else None
+
+        self.projection = nn.Linear(last_dim, word_embed_dim)
+
+        assert vocab is not None or char_inputs, "vocab must be set if not using char inputs"
+        self.vocab = None
+        if vocab is not None:
+            self.set_vocab(vocab, max_char_len)
+
+        self.reset_parameters()
+
+    def prepare_for_onnx_export_(self):
+        self.onnx_trace = True  # forward() then uses torch.where instead of masked assignment
+
+    def set_vocab(self, vocab, max_char_len):
+        word_to_char = torch.LongTensor(len(vocab), max_char_len)  # every row is assigned in the loop below
+
+        truncated = 0
+        for i in range(len(vocab)):
+            if i < vocab.nspecial:
+                char_idxs = [0] * max_char_len  # special symbols get an all-padding character row
+            else:
+                chars = vocab[i].encode()
+                # +1 for padding
+                char_idxs = [c + 1 for c in chars] + [0] * (max_char_len - len(chars))
+            if len(char_idxs) > max_char_len:
+                truncated += 1
+                char_idxs = char_idxs[:max_char_len]
+            word_to_char[i] = torch.LongTensor(char_idxs)
+
+        if truncated > 0:
+            logger.info('truncated {} words longer than {} characters'.format(truncated, max_char_len))
+
+        self.vocab = vocab
+        self.word_to_char = word_to_char
+
+    @property
+    def padding_idx(self):
+        return Dictionary().pad() if self.vocab is None else self.vocab.pad()
+
+    def reset_parameters(self):
+        nn.init.xavier_normal_(self.char_embeddings.weight)
+        nn.init.xavier_normal_(self.symbol_embeddings)
+        nn.init.xavier_uniform_(self.projection.weight)
+
+        nn.init.constant_(self.char_embeddings.weight[self.char_embeddings.padding_idx], 0.)  # re-zero the padding row
+        nn.init.constant_(self.projection.bias, 0.)
+
+    def forward(
+            self,
+            input: torch.Tensor,
+    ):
+        if self.char_inputs:
+            chars = input.view(-1, self.max_char_len)
+            pads = chars[:, 0].eq(CHAR_PAD_IDX)  # a token is treated as padding when its first character is the pad id
+            eos = chars[:, 0].eq(CHAR_EOS_IDX)
+            if eos.any():
+                if self.onnx_trace:
+                    chars = torch.where(eos.unsqueeze(1), chars.new_zeros(1), chars)
+                else:
+                    chars[eos] = 0  # NOTE: `chars` is a view of `input`, so this also modifies the caller's tensor
+
+            unk = None
+        else:
+            flat_words = input.view(-1)
+            chars = self.word_to_char[flat_words.type_as(self.word_to_char)].type_as(input)
+            pads = flat_words.eq(self.vocab.pad())
+            eos = flat_words.eq(self.vocab.eos())
+            unk = flat_words.eq(self.vocab.unk())
+
+        word_embs = self._convolve(chars)
+        if self.onnx_trace:
+            if pads.any():
+                word_embs = torch.where(pads.unsqueeze(1), word_embs.new_zeros(1), word_embs)
+            if eos.any():
+                word_embs = torch.where(eos.unsqueeze(1), self.symbol_embeddings[self.eos_idx], word_embs)
+            if unk is not None and unk.any():
+                word_embs = torch.where(unk.unsqueeze(1), self.symbol_embeddings[self.unk_idx], word_embs)
+        else:
+            if pads.any():
+                word_embs[pads] = 0
+            if eos.any():
+                word_embs[eos] = self.symbol_embeddings[self.eos_idx]
+            if unk is not None and unk.any():
+                word_embs[unk] = self.symbol_embeddings[self.unk_idx]
+
+        return word_embs.view(input.size()[:2] + (-1,))  # restore the first two input dimensions
+
+    def _convolve(
+            self,
+            char_idxs: torch.Tensor,
+    ):
+        char_embs = self.char_embeddings(char_idxs)
+        char_embs = char_embs.transpose(1, 2)  # BTC -> BCT
+
+        conv_result = []
+
+        for conv in self.convolutions:
+            x = conv(char_embs)
+            x, _ = torch.max(x, -1)  # max over the last (character position) dimension
+            x = F.relu(x)
+            conv_result.append(x)
+
+        x = torch.cat(conv_result, dim=-1)
+
+        if self.highway is not None:
+            x = self.highway(x)
+        x = self.projection(x)
+
+        return x
+
+
+class Highway(torch.nn.Module):
+    """
+    A `Highway layer <https://arxiv.org/abs/1505.00387>`_.
+    Adopted from the AllenNLP implementation.
+    """
+
+    def __init__(
+            self,
+            input_dim: int,
+            num_layers: int = 1
+    ):
+        super(Highway, self).__init__()
+        self.input_dim = input_dim
+        self.layers = nn.ModuleList([nn.Linear(input_dim, input_dim * 2)  # 2x width: transform half + gate half
+                                     for _ in range(num_layers)])
+        self.activation = nn.ReLU()
+
+        self.reset_parameters()
+
+    def reset_parameters(self):
+        for layer in self.layers:
+            # As per comment in AllenNLP:
+            # We should bias the highway layer to just carry its input forward.  We do that by
+            # setting the bias on `B(x)` to be positive, because that means `g` will be biased to
+            # be high, so we will carry the input forward.  The bias on `B(x)` is the second half
+            # of the bias vector in each Linear layer.
+            nn.init.constant_(layer.bias[self.input_dim:], 1)
+
+            nn.init.constant_(layer.bias[:self.input_dim], 0)
+            nn.init.xavier_normal_(layer.weight)
+
+    def forward(
+            self,
+            x: torch.Tensor
+    ):
+        for layer in self.layers:
+            projection = layer(x)
+            proj_x, gate = projection.chunk(2, dim=-1)  # first half: transformed input, second half: gate logits
+            proj_x = self.activation(proj_x)
+            gate = torch.sigmoid(gate)
+            x = gate * x + (gate.new_tensor([1]) - gate) * proj_x  # gate near 1 passes x through unchanged
+        return x
diff --git a/fairseq/modules/conv_tbc.py b/fairseq/modules/conv_tbc.py
new file mode 100644
index 0000000000000000000000000000000000000000..1aa3eff9dca20a07ba84438312823d16e2493c12
--- /dev/null
+++ b/fairseq/modules/conv_tbc.py
@@ -0,0 +1,36 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from torch.nn.modules.utils import _single
+
+
+class ConvTBC(torch.nn.Module):
+    """1D convolution over an input of shape (time x batch x channel)
+
+    The implementation uses gemm to perform the convolution. This implementation
+    is faster than cuDNN for small kernel sizes.
+    """
+    def __init__(self, in_channels, out_channels, kernel_size, padding=0):
+        super(ConvTBC, self).__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.kernel_size = _single(kernel_size)  # normalized to a 1-tuple
+        self.padding = _single(padding)  # normalized to a 1-tuple
+
+        self.weight = torch.nn.Parameter(torch.Tensor(  # allocated but not initialized anywhere in this class
+            self.kernel_size[0], in_channels, out_channels))
+        self.bias = torch.nn.Parameter(torch.Tensor(out_channels))  # likewise not initialized in this class
+
+    def forward(self, input):
+        return torch.conv_tbc(input.contiguous(), self.weight, self.bias, self.padding[0])
+
+    def __repr__(self):
+        s = ('{name}({in_channels}, {out_channels}, kernel_size={kernel_size}'
+             ', padding={padding}')
+        if self.bias is None:  # __init__ always creates a bias, so this only applies if it is set to None later
+            s += ', bias=False'
+        s += ')'
+        return s.format(name=self.__class__.__name__, **self.__dict__)
diff --git a/fairseq/modules/cross_entropy.py b/fairseq/modules/cross_entropy.py
new file mode 100644
index 0000000000000000000000000000000000000000..b46143f3af3fca88443b202b3416333eb1f8d6a4
--- /dev/null
+++ b/fairseq/modules/cross_entropy.py
@@ -0,0 +1,51 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+
+import torch
+import torch.nn.functional as F
+
+
+logger = logging.getLogger(__name__)
+
+
+def _cross_entropy_pytorch(logits, target, ignore_index=None, reduction='mean'):
+    """Return the NLL loss of a float32 log-softmax of ``logits``; ``ignore_index=None`` means PyTorch's default (-100)."""
+    ignore_index = -100 if ignore_index is None else ignore_index  # F.nll_loss requires an int, not None
+    lprobs = F.log_softmax(logits, dim=-1, dtype=torch.float32)
+    return F.nll_loss(lprobs, target, ignore_index=ignore_index, reduction=reduction)
+
+
+try:
+    import xentropy_cuda  # not referenced below; presumably an availability check for the compiled extension
+    from apex.contrib import xentropy
+
+    logger.info('using fused cross entropy')
+
+    def cross_entropy(logits, target, ignore_index=-100, reduction='mean'):
+        if logits.device == torch.device('cpu'):  # CPU tensors use the pure PyTorch implementation
+            return _cross_entropy_pytorch(logits, target, ignore_index, reduction)
+        else:
+            half_to_float = (logits.dtype == torch.half)
+            losses = xentropy.SoftmaxCrossEntropyLoss.apply(
+                logits, target, 0.0, ignore_index, half_to_float,
+            )
+            if reduction == 'sum':
+                return losses.sum()
+            elif reduction == 'mean':
+                if ignore_index >= 0:  # average over the targets that are not ignore_index
+                    return losses.sum() / target.ne(ignore_index).sum()
+                else:
+                    return losses.mean()
+            elif reduction == 'none':
+                return losses
+            else:
+                raise NotImplementedError
+
+except ImportError:  # either import failed: define cross_entropy on top of the PyTorch implementation
+
+    def cross_entropy(logits, target, ignore_index=-100, reduction='mean'):
+        return _cross_entropy_pytorch(logits, target, ignore_index, reduction)
diff --git a/fairseq/modules/cuda_utils.cu b/fairseq/modules/cuda_utils.cu
new file mode 100644
index 0000000000000000000000000000000000000000..516f1d92440e9e2c092f122e45d81b45cb135602
--- /dev/null
+++ b/fairseq/modules/cuda_utils.cu
@@ -0,0 +1,203 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ * 
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+
+template <typename U, typename V>	
+constexpr __host__ __device__ auto divUp(U a, V b) -> decltype(a + b) {	
+  return (a + b - 1) / b;	// a / b rounded up, for integer a >= 0 and b > 0
+}
+
+
+template<int FS, int SB, int padding_l, typename scalar_t>
+__inline__ __device__
+void zeroSharedMem(scalar_t* data) {
+  /*
+    Given an array of length FS + SB, zero out the first padding_l and last
+    (FS - padding_l) values in the array
+  */
+  // Each thread zeroes the slots derived from its own threadIdx.x; no barrier is issued here.
+  int tid = threadIdx.x;
+
+  if (FS < SB) {
+    // zero all at once if we have enough threads in a block to do all of them.
+    // NOTE(review): the upper range is data[SB-FS+padding_l ..], not the documented tail data[SB+padding_l ..] — confirm intent
+    if (tid < padding_l || tid > SB - FS + padding_l - 1) {
+      data[tid] = scalar_t(0.0);
+    }
+  } else {
+    // FS >= SB: otherwise zero out one block at a time, sweeping FS positions in strides of SB;
+    // positions < padding_l map to the head, the rest to the tail starting at data[SB + padding_l]
+    const int numIterations = divUp<int, int>(FS, SB);
+    for (int i = 0; i < numIterations; i++) {
+      int offset = i * SB;
+      if (tid + offset < padding_l) {
+        data[tid + offset] = scalar_t(0.0);
+      } else if (tid + offset < FS) {
+        data[SB + tid + offset] = scalar_t(0.0);
+      }
+    }
+  }
+}
+
+template<typename scalar_t>
+__inline__ __device__
+scalar_t warpReduce(scalar_t data) {
+  /*
+    Sum a value across a warp with XOR shuffles (offsets 16, 8, 4, 2, 1). After
+    the five steps every participating lane holds the sum of the original values.
+    Assumes SHFL_MASK (defined outside this file) names all participating lanes.
+    data - this lane's value to reduce (passed by value, not a pointer)
+  */
+  data += __shfl_xor_sync(SHFL_MASK, data, 16);
+  data += __shfl_xor_sync(SHFL_MASK, data, 8);
+  data += __shfl_xor_sync(SHFL_MASK, data, 4);
+  data += __shfl_xor_sync(SHFL_MASK, data, 2);
+  data += __shfl_xor_sync(SHFL_MASK, data, 1);
+  return data;
+}
+
+template<typename scalar_t>
+__inline__ __device__
+scalar_t blockReduce(scalar_t data) {
+  /*
+     Sum `data` over the threads of the block. Threads of warp 0 return the
+     block-wide total; threads of other warps return 0 (for blockDim.x <= 1024).
+
+     data - this thread's value to reduce (passed by value, not a pointer)
+  */
+  // one partial sum per warp (32 slots, so this presumably relies on blockDim.x <= 1024)
+  static __shared__ scalar_t warpSum[32];
+  const int tid = threadIdx.x;
+  int wid = tid / 32;
+  int lane = tid % 32;
+  // presumably guards warpSum against being overwritten while a previous call is still reading it
+  __syncthreads();
+
+  // reduce each warp then write to shared memory
+  scalar_t sum = warpReduce(data);
+  if (lane == 0) {
+    warpSum[wid] = sum;
+  }
+  // make every warp's partial sum visible before it is read back below
+  __syncthreads();
+  // NOTE(review): blockDim.x / 32 truncates, so a trailing partial warp would be left out — presumably blockDim.x is always a multiple of 32
+  scalar_t v;
+  // perform final sum of partial warp sums
+  if (tid < blockDim.x / 32) {
+    v = warpSum[lane];
+  } else {
+    v = scalar_t(0.0);
+  }
+
+  if (wid == 0) {
+    v = warpReduce(v);
+  }
+  __syncthreads();
+
+  return v;
+}
+
+void checkCudaStatus(cudaError_t status, int lineNumber = -1) {
+  // On any status other than cudaSuccess: print the error string and line number to stdout, then exit(1).
+  if (status != cudaSuccess) {
+    std::cout << cudaGetErrorString(status)
+              << " at line " << lineNumber << std::endl;
+    std::cout << "Exiting" << std::endl;
+    exit(1);
+  }
+}
+
+template<int FS, int SB, int padding_l, typename scalar_t>
+__device__
+void load_input_to_shared(const scalar_t* input, // global memory
+                          int inputOffset, int sequenceLength,
+                          int iteration, int numIterations,
+                          bool no_prev, scalar_t* output /* shared memory */) {
+  /*
+    Fill `output` with one tile of the sequence: each thread stores
+    input[inputOffset + tid] at output[padding_l + tid], surrounded by a left
+    overhang of padding_l elements (iterations > 0 only) and a right overhang of
+    (FS - padding_l). Positions past sequenceLength, and the last tile's right overhang, are 0.
+
+    input - pointer to start of channel sequence
+    inputOffset - how far in the sequence to start loading
+    sequenceLength - total length of sequence
+    iteration - which block of sequence we are loading
+    numIterations - total number of blocks to load
+    no_prev - true: read the left overhang from `input`; false: reuse the
+              values already in `output`, shifted down by SB
+    output - shared memory to write input to
+  */
+
+  const int tid = threadIdx.x;
+  // NOTE(review): the reuse path reads output[tid + SB] with no barrier before the final store to output[tid + padding_l] below — confirm callers synchronize
+  // Load the left "overhang" of input
+  if (iteration > 0) {
+    if (padding_l < SB) {
+
+      // load all at once
+      if (tid < padding_l) {
+        output[tid] = (no_prev) ? input[inputOffset - padding_l + tid] : output[tid + SB];
+      }
+    } else {
+      // (the local numIterations below shadows the function parameter of the same name)
+      // load in chunks of size SB
+      int numIterations = divUp<int, int>(padding_l, SB);
+      for (int i = 0; i < numIterations; i++) {
+        int offset = i * SB;
+        if ((tid + offset) < padding_l) {
+          output[tid + offset] = (no_prev) ? input[inputOffset - padding_l + tid + offset] : output[tid + offset + SB];
+        }
+      }
+    }
+  }
+
+  // Load the right "overhang" of input
+  if (iteration < (numIterations - 1)) {
+    const int elementsLeft = sequenceLength - (iteration+1) * SB;
+    // elementsLeft = number of sequence elements after this tile; overhang slots beyond it are zero-filled
+    if ((FS - padding_l) < SB) {
+
+      // load all at once
+      if (tid < (FS - padding_l)) {
+          output[padding_l + SB + tid] = (tid < elementsLeft) ? input[inputOffset + SB + tid] : scalar_t(0.0);
+      }
+    } else {
+
+      // load in chunks of size SB
+      int numIterations = divUp<int, int>(FS - padding_l, SB);
+      for (int i = 0; i < numIterations; i++) {
+        int offset = i * SB;
+        if ((tid + offset) < (FS - padding_l)) {
+          output[padding_l + SB + tid + offset] = ((tid + offset) < elementsLeft) ? input[inputOffset + SB + tid + offset] : scalar_t(0.0);
+        }
+      }
+    }
+  }
+
+  // We should also clear out the right "overhang"
+  if (iteration == (numIterations - 1)) {
+    if ((FS - padding_l) < SB) {
+
+      // clear out all at once
+      if (tid < (FS - padding_l)) {
+          output[padding_l + SB + tid] = scalar_t(0.0);
+      }
+    } else {
+
+      // clear in chunks of size SB
+      int numIterations = divUp<int, int>(FS - padding_l, SB);
+      for (int i = 0; i < numIterations; i++) {
+        int offset = i * SB;
+        if ((tid + offset) < (FS - padding_l)) {
+          output[padding_l + SB + tid + offset] = scalar_t(0.0);
+        }
+      }
+    }
+  }
+  output[tid + padding_l] = ((inputOffset + tid) < sequenceLength) ? input[inputOffset + tid] : scalar_t(0.0);
+}
diff --git a/fairseq/modules/downsampled_multihead_attention.py b/fairseq/modules/downsampled_multihead_attention.py
new file mode 100644
index 0000000000000000000000000000000000000000..eeaf9bbdd3c3315003c55ae8a743bac0db452a29
--- /dev/null
+++ b/fairseq/modules/downsampled_multihead_attention.py
@@ -0,0 +1,256 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+#
+
+import math
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from fairseq.modules.scalar_bias import scalar_bias
+from fairseq.modules.fairseq_dropout import FairseqDropout
+
+
+class SingleHeadAttention(nn.Module):
+    """Attention unit with optional gated projections and key/value downsampling.
+    Without downsampling it evaluates all `num_heads` heads in one batched pass.
+    """
+    def __init__(
+        self, out_channels, embed_dim, head_dim, head_index, dropout=0.,
+        bias=True, project_input=True, gated=False, downsample=False,
+        num_heads=1,
+    ):
+        super().__init__()
+        self.embed_dim = embed_dim
+        self.dropout_module = FairseqDropout(dropout, module_name=self.__class__.__name__)
+        self.head_index = head_index
+        self.head_dim = head_dim
+        self.project_input = project_input
+        self.gated = gated
+        self.downsample = downsample
+        self.num_heads = num_heads
+        self.projection = None
+        # key/value pipelines: an optional Downsample step followed by a (gated) linear projection
+        k_layers = []
+        v_layers = []
+        if self.downsample:
+            k_layers.append(Downsample(self.head_index))
+            v_layers.append(Downsample(self.head_index))
+            out_proj_size = self.head_dim
+        else:
+            out_proj_size = self.head_dim * self.num_heads
+        if self.gated:
+            k_layers.append(GatedLinear(self.embed_dim, out_proj_size, bias=bias))
+            self.in_proj_q = GatedLinear(self.embed_dim, out_proj_size, bias=bias)
+            v_layers.append(GatedLinear(self.embed_dim, out_proj_size, bias=bias))
+        else:
+            k_layers.append(Linear(self.embed_dim, out_proj_size, bias=bias))
+            self.in_proj_q = Linear(self.embed_dim, out_proj_size, bias=bias)
+            v_layers.append(Linear(self.embed_dim, out_proj_size, bias=bias))
+
+        self.in_proj_k = nn.Sequential(*k_layers)
+        self.in_proj_v = nn.Sequential(*v_layers)
+
+        if self.downsample:
+            self.out_proj = Linear(out_proj_size, self.head_dim, bias=bias)
+        else:
+            self.out_proj = Linear(out_proj_size, out_channels, bias=bias)
+
+        self.scaling = self.head_dim**-0.5
+
+    def forward(
+        self, query, key, value, mask_future_timesteps=False,
+        key_padding_mask=None, use_scalar_bias=False,
+    ):
+        """Input shape: Time x Batch x Channel
+        Self-attention can be implemented by passing in the same arguments for
+        query, key and value. Future timesteps can be masked with the
+        `mask_future_timesteps` argument. Padding elements can be excluded from
+        the key by passing a binary ByteTensor (`key_padding_mask`) with shape:
+        batch x src_len, where padding elements are indicated by 1s. Returns (attn, attn_weights).
+        """
+        src_len, bsz, out_channels = key.size()
+        tgt_len = query.size(0)
+        assert list(query.size()) == [tgt_len, bsz, out_channels]
+        assert key.size() == value.size()
+
+        if key_padding_mask is not None:
+            assert key_padding_mask.size(0) == bsz
+            assert key_padding_mask.size(1) == src_len
+
+        if self.downsample:
+            size = bsz
+        else:
+            size = bsz * self.num_heads
+
+        k = key
+        v = value
+        q = query
+        if self.project_input:
+            q = self.in_proj_q(q)
+            k = self.in_proj_k(k)
+            v = self.in_proj_v(v)
+            src_len = k.size()[0]
+        q = q * self.scaling  # out-of-place: q aliases the caller's query when project_input is False
+
+        if not self.downsample:
+            q = q.view(tgt_len, size, self.head_dim)
+            k = k.view(src_len, size, self.head_dim)
+            v = v.view(src_len, size, self.head_dim)
+
+        q = q.transpose(0, 1)
+        k = k.transpose(0, 1)
+        v = v.transpose(0, 1)
+        # raw attention scores, one (tgt_len x src_len) matrix per row of the batched dimension
+        attn_weights = torch.bmm(q, k.transpose(1, 2))
+        if mask_future_timesteps:
+            assert query.size() == key.size(), \
+                'mask_future_timesteps only applies to self-attention'
+            attn_weights *= torch.tril(
+                attn_weights.data.new([1]).expand(tgt_len, tgt_len).clone(),
+                diagonal=-1,
+            )[:, ::self.head_index + 1 if self.downsample else 1].unsqueeze(0)
+            attn_weights += torch.triu(
+                attn_weights.data.new([-math.inf]).expand(tgt_len, tgt_len).clone(),
+                diagonal=0
+            )[:, ::self.head_index + 1 if self.downsample else 1].unsqueeze(0)
+        tgt_size = tgt_len
+        if use_scalar_bias:
+            attn_weights = scalar_bias(attn_weights, 2)
+            v = scalar_bias(v, 1)
+            tgt_size += 1
+
+        if key_padding_mask is not None:
+            # don't attend to padding symbols
+            if key_padding_mask.max() > 0:
+                if self.downsample:
+                    attn_weights = attn_weights.view(bsz, 1, tgt_len, src_len)
+                else:
+                    attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)  # rows are batch-major then head, so split as (bsz, num_heads)
+                attn_weights = attn_weights.masked_fill(
+                    key_padding_mask.unsqueeze(1).unsqueeze(2),
+                    -math.inf,
+                )
+                attn_weights = attn_weights.view(size, tgt_len, src_len)
+        attn_weights = F.softmax(attn_weights, dim=-1)
+        attn_weights = self.dropout_module(attn_weights)
+
+        attn = torch.bmm(attn_weights, v)
+        if self.downsample:
+            attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, self.head_dim)
+        else:
+            attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, self.embed_dim)
+
+        attn = self.out_proj(attn)
+
+        return attn, attn_weights
+
+
+class DownsampledMultiHeadAttention(nn.ModuleList):
+    """Multi-headed attention with Gating and Downsampling.
+    Holds one SingleHeadAttention per head when downsampling, otherwise a single batched one.
+    """
+    def __init__(
+        self, out_channels, embed_dim, num_heads, dropout=0., bias=True,
+        project_input=True, gated=False, downsample=False,
+    ):
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.head_dim = embed_dim // num_heads
+        self.downsample = downsample
+        self.gated = gated
+        self.project_input = project_input
+        assert self.head_dim * num_heads == embed_dim
+        # downsample: head i keeps every (i + 1)-th key/value step; head outputs are concatenated then projected
+        if self.downsample:
+            attention_heads = []
+            for index in range(self.num_heads):
+                attention_heads.append(
+                    SingleHeadAttention(
+                        out_channels, self.embed_dim, self.head_dim, index,
+                        dropout, bias, self.project_input, self.gated,
+                        self.downsample, self.num_heads,
+                    )
+                )
+            super().__init__(modules=attention_heads)
+            self.out_proj = Linear(embed_dim, out_channels, bias=bias)
+        else:
+            # either we have a list of attention heads, or just one attention head
+            # if not being downsampled, we can do the heads with one linear layer instead of separate ones
+            super().__init__()
+            self.attention_module = SingleHeadAttention(
+                out_channels, self.embed_dim, self.head_dim, 1, dropout,
+                bias, self.project_input, self.gated, self.downsample, self.num_heads,
+            )
+
+    def forward(
+        self, query, key, value, mask_future_timesteps=False,
+        key_padding_mask=None, use_scalar_bias=False,
+    ):
+        src_len, bsz, embed_dim = key.size()
+        tgt_len = query.size(0)
+        assert embed_dim == self.embed_dim
+        assert list(query.size()) == [tgt_len, bsz, embed_dim]
+        assert key.size() == value.size()
+
+        tgt_size = tgt_len
+        if use_scalar_bias:
+            tgt_size += 1
+
+        attn = []
+        attn_weights = []
+        if self.downsample:
+            for attention_head_number in range(self.num_heads):
+                # call the forward of each attention head
+                _attn, _attn_weight = self[attention_head_number](
+                    query, key, value, mask_future_timesteps, key_padding_mask, use_scalar_bias,
+                )
+                attn.append(_attn)
+                attn_weights.append(_attn_weight)
+            full_attn = torch.cat(attn, dim=2)
+            full_attn = self.out_proj(full_attn)
+            return full_attn, attn_weights[0].clone()  # only head 0's attention weights are returned
+        else:
+            _attn, _attn_weight = self.attention_module(
+                query, key, value, mask_future_timesteps, key_padding_mask, use_scalar_bias,
+            )
+            attn.append(_attn)
+            attn_weights.append(_attn_weight)
+            full_attn = torch.cat(attn, dim=2)
+            full_attn_weights = torch.cat(attn_weights)
+            full_attn_weights = full_attn_weights.view(bsz, self.num_heads, tgt_size, src_len)
+            full_attn_weights = full_attn_weights.sum(dim=1) / self.num_heads  # average the weights over heads
+            return full_attn, full_attn_weights
+
+
+class Downsample(nn.Module):
+    """
+    Keeps every (index + 1)-th element along the first dimension, starting at element 0
+    """
+    def __init__(self, index):
+        super().__init__()
+        self.index = index
+
+    def forward(self, x):
+        return x[::self.index+1]
+
+
+def Linear(in_features, out_features, dropout=0., bias=True):
+    """Weight-normalized Linear layer (input: B x T x C); the bias is zero-initialised when present"""
+    m = nn.Linear(in_features, out_features, bias=bias)
+    m.weight.data.normal_(mean=0, std=math.sqrt((1 - dropout) / in_features))
+    if m.bias is not None: m.bias.data.zero_()  # bias=False leaves m.bias as None
+    return nn.utils.weight_norm(m)
+
+
+def GatedLinear(in_features, out_features, dropout=0., bias=True):
+    """Weight-normalized Linear layer (input: B x T x C) with interspersed GLU units"""
+    return nn.Sequential(
+        Linear(in_features, out_features*4, dropout, bias),
+        nn.GLU(),  # halves the feature size: 4 * out_features -> 2 * out_features
+        Linear(out_features*2, out_features*2, dropout, bias),
+        nn.GLU(),  # halves again: 2 * out_features -> out_features
+        Linear(out_features, out_features, dropout, bias)
+    )
diff --git a/fairseq/modules/dynamic_convolution.py b/fairseq/modules/dynamic_convolution.py
new file mode 100644
index 0000000000000000000000000000000000000000..5a8ecb99a8b6a9ccc114f77896e31ae4386dc7a7
--- /dev/null
+++ b/fairseq/modules/dynamic_convolution.py
@@ -0,0 +1,245 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from fairseq import utils
+from .unfold import unfold1d
+from fairseq.incremental_decoding_utils import with_incremental_state
+from fairseq.modules.fairseq_dropout import FairseqDropout
+
+
+def DynamicConv(input_size, kernel_size=1, padding_l=None, num_heads=1,
+                weight_dropout=0., weight_softmax=False,
+                renorm_padding=False, bias=False, conv_bias=False,
+                query_size=None, in_proj=False):  # factory: CUDA layer if importable, else DynamicConv1dTBC
+    if torch.cuda.is_available():
+        try:
+            from fairseq.modules.dynamicconv_layer import DynamicconvLayer
+            return DynamicconvLayer(input_size, kernel_size=kernel_size,  # NOTE(review): extra options are not forwarded here; its signature is not visible from this file
+                                    padding_l=padding_l, num_heads=num_heads,
+                                    weight_dropout=weight_dropout,
+                                    weight_softmax=weight_softmax, bias=bias)
+        except ImportError as e:
+            print(e)
+    return DynamicConv1dTBC(input_size, kernel_size=kernel_size, padding_l=padding_l,  # forward every option instead of silently dropping four of them
+                            num_heads=num_heads, weight_dropout=weight_dropout,
+                            weight_softmax=weight_softmax, renorm_padding=renorm_padding,
+                            bias=bias, conv_bias=conv_bias, query_size=query_size, in_proj=in_proj)
+
+
+def Linear(in_features, out_features, bias=True):  # nn.Linear with Xavier-uniform weights and, if present, a zero bias
+    m = nn.Linear(in_features, out_features, bias)
+    nn.init.xavier_uniform_(m.weight)
+    if bias:
+        nn.init.constant_(m.bias, 0.)
+    return m
+
+
+@with_incremental_state
+class DynamicConv1dTBC(nn.Module):
+    '''Dynamic lightweight convolution taking T x B x C inputs
+    Args:
+        input_size: # of channels of the input
+        kernel_size: width of the convolution window (number of taps per filter)
+        padding_l: padding to the left when using "same" padding
+        num_heads: number of heads; weight_linear predicts num_heads * kernel_size filter taps per position
+        weight_dropout: the drop rate of the DropConnect to drop the weight
+        weight_softmax: normalize the weight with softmax before the convolution
+        renorm_padding: re-normalize the filters to ignore the padded part (only the non-padding parts sum up to 1)
+        bias: use a bias in weight_linear (not forwarded when in_proj is set)
+        conv_bias: bias of the convolution
+        query_size: specified when feeding a different input as the query
+        in_proj: project the input and generate the filter together
+
+    Shape:
+        Input: TxBxC, i.e. (timesteps, batch_size, input_size)
+        Output: TxBxC, i.e. (timesteps, batch_size, input_size)
+
+    Attributes:
+        weight_linear: linear layer predicting the per-position filters
+            (and, with in_proj, the projected input as well)
+        conv_bias: the learnable bias of shape `(input_size)`, or None
+    '''
+    def __init__(self, input_size, kernel_size=1, padding_l=None, num_heads=1,
+                 weight_dropout=0., weight_softmax=False,
+                 renorm_padding=False, bias=False, conv_bias=False,
+                 query_size=None, in_proj=False):
+        super().__init__()
+        self.input_size = input_size
+        self.query_size = input_size if query_size is None else query_size
+        self.kernel_size = kernel_size
+        self.padding_l = padding_l
+        self.num_heads = num_heads
+        self.weight_dropout_module = FairseqDropout(weight_dropout, module_name=self.__class__.__name__)
+        self.weight_softmax = weight_softmax
+        self.renorm_padding = renorm_padding
+
+        if in_proj:
+            self.weight_linear = Linear(self.input_size, self.input_size + num_heads * kernel_size * 1)
+        else:
+            self.weight_linear = Linear(self.query_size, num_heads * kernel_size * 1, bias=bias)
+        if conv_bias:
+            self.conv_bias = nn.Parameter(torch.Tensor(input_size))
+        else:
+            self.conv_bias = None
+        self.reset_parameters()
+
+    @property
+    def in_proj(self):  # inferred from weight_linear's output width rather than stored as a flag
+        return self.weight_linear.out_features == self.input_size + self.num_heads * self.kernel_size
+
+    def reset_parameters(self):
+        self.weight_linear.reset_parameters()
+        if self.conv_bias is not None:
+            nn.init.constant_(self.conv_bias, 0.)
+
+    def forward(self, x, incremental_state=None, query=None, unfold=None):
+        '''Assuming the input, x, of the shape T x B x C and producing an output in the shape T x B x C
+        args:
+            x: Input of shape T x B x C, i.e. (timesteps, batch_size, input_size)
+            incremental_state: A dict to keep the state
+            unfold: unfold the input or not. If not, we use the matrix trick instead
+            query: use the specified query to predict the conv filters
+        '''
+        unfold = x.size(0) > 512 if unfold is None else unfold  # use unfold mode as default for long sequence to save memory
+        unfold = unfold or (incremental_state is not None)  # incremental decoding always takes the unfolded path
+        assert query is None or not self.in_proj  # a separate query cannot be combined with in_proj
+
+        if query is None:
+            query = x
+        if unfold:
+            output = self._forward_unfolded(x, incremental_state, query)
+        else:
+            output = self._forward_expanded(x, incremental_state, query)
+
+        if self.conv_bias is not None:
+            output = output + self.conv_bias.view(1, 1, -1)
+        return output
+
+    def _forward_unfolded(self, x, incremental_state, query):
+        '''The conventional implementation of convolutions.
+        Unfolding the input by having a window shifting to the right.'''
+        T, B, C = x.size()
+        K, H = self.kernel_size, self.num_heads
+        R = C // H
+        assert R * H == C == self.input_size
+
+        if self.in_proj:
+            proj = self.weight_linear(x)
+            x = proj.narrow(2, 0, self.input_size).contiguous()
+            weight = proj.narrow(2, self.input_size, H*K).contiguous().view(T*B*H, -1)
+        else:
+            weight = self.weight_linear(query).view(T*B*H, -1)
+
+        # renorm_padding is only implemented in _forward_expanded
+        assert not self.renorm_padding or incremental_state is not None
+
+        if incremental_state is not None:
+            input_buffer = self._get_input_buffer(incremental_state)
+            if input_buffer is None:
+                input_buffer = x.new()
+            x_unfold = torch.cat([input_buffer, x.unsqueeze(3)], dim=3)
+            if self.kernel_size > 1:  # keep only the last kernel_size - 1 steps for the next call
+                self._set_input_buffer(incremental_state, x_unfold[:, :, :, -self.kernel_size+1:])
+            x_unfold = x_unfold.view(T*B*H, R, -1)
+        else:
+            padding_l = self.padding_l
+            if K > T and padding_l == K-1:
+                weight = weight.narrow(1, K-T, T)
+                K, padding_l = T, T-1
+            # unfold the input: T x B x C --> T' x B x C x K
+            x_unfold = unfold1d(x, K, padding_l, 0)
+            x_unfold = x_unfold.view(T*B*H, R, K)
+
+        if self.weight_softmax and not self.renorm_padding:
+            weight = F.softmax(weight, dim=1)
+        weight = weight.narrow(1, 0, K)
+
+        if incremental_state is not None:
+            weight = weight[:, -x_unfold.size(2):]  # use only as many trailing taps as there are buffered steps
+            K = weight.size(1)
+
+        if self.weight_softmax and self.renorm_padding:
+            weight = F.softmax(weight, dim=1)
+
+        weight = self.weight_dropout_module(weight, inplace=False)
+
+        output = torch.bmm(x_unfold, weight.unsqueeze(2))  # T*B*H x R x 1
+        output = output.view(T, B, C)
+        return output
+
+    def _forward_expanded(self, x, incremental_stat, query):  # incremental_stat is not used in this method
+        '''Turn the convolution filters into band matrices and do matrix multiplication.
+        This is faster when the sequence is short, but less memory efficient.
+        This is not used in the decoder during inference.
+        '''
+        T, B, C = x.size()
+        K, H = self.kernel_size, self.num_heads
+        R = C // H
+        assert R * H == C == self.input_size
+        if self.in_proj:
+            proj = self.weight_linear(x)
+            x = proj.narrow(2, 0, self.input_size).contiguous()
+            weight = proj.narrow(2, self.input_size, H*K).contiguous().view(T*B*H, -1)
+        else:
+            weight = self.weight_linear(query).view(T*B*H, -1)
+
+        if not self.renorm_padding:
+            if self.weight_softmax:
+                weight = F.softmax(weight, dim=1)
+            weight = self.weight_dropout_module(weight, inplace=False)
+        weight = weight.narrow(1, 0, K).contiguous()
+        weight = weight.view(T, B*H, K).transpose(0, 1)
+
+        x = x.view(T, B*H, R).transpose(0, 1)
+        if self.weight_softmax and self.renorm_padding:
+            # turn the convolution filters into band matrices: row stride T+K puts row t's K taps at columns t..t+K-1
+            weight_expanded = weight.new(B*H, T, T+K-1).fill_(float('-inf'))
+            weight_expanded.as_strided((B*H, T, K), (T*(T+K-1), T+K, 1)).copy_(weight)
+            weight_expanded = weight_expanded.narrow(2, self.padding_l, T)
+            # normalize the weight over valid positions like self-attention
+            weight_expanded = F.softmax(weight_expanded, dim=2)
+            weight_expanded = self.weight_dropout_module(weight_expanded, inplace=False)
+        else:
+            P = self.padding_l
+            # For efficiency, we cut the kernel size and reduce the padding when the kernel is larger than the length
+            if K > T and P == K-1:
+                weight = weight.narrow(2, K-T, T)
+                K, P = T, T-1
+            # turn the convolution filters into band matrices: row stride T+K puts row t's K taps at columns t..t+K-1
+            weight_expanded = weight.new_zeros(B*H, T, T+K-1, requires_grad=False)
+            weight_expanded.as_strided((B*H, T, K), (T*(T+K-1), T+K, 1)).copy_(weight)
+            weight_expanded = weight_expanded.narrow(2, P, T)  # B*H x T x T
+        output = torch.bmm(weight_expanded, x)
+        output = output.transpose(0, 1).contiguous().view(T, B, C)
+        return output
+
+    def reorder_incremental_state(self, incremental_state, new_order):
+        input_buffer = self._get_input_buffer(incremental_state)
+        if input_buffer is not None:
+            input_buffer = input_buffer.index_select(1, new_order)  # reorder along dim 1 of the buffered input
+            self._set_input_buffer(incremental_state, input_buffer)
+
+    def _get_input_buffer(self, incremental_state):
+        return utils.get_incremental_state(self, incremental_state, 'input_buffer')
+
+    def _set_input_buffer(self, incremental_state, new_buffer):
+        return utils.set_incremental_state(self, incremental_state, 'input_buffer', new_buffer)
+
+    def extra_repr(self):
+        s = '{}, kernel_size={}, padding_l={}, num_heads={}, weight_softmax={}, conv_bias={}, renorm_padding={}, in_proj={}'.format(
+            self.input_size, self.kernel_size, self.padding_l,
+            self.num_heads, self.weight_softmax, self.conv_bias is not None, self.renorm_padding,
+            self.in_proj,
+        )
+
+        if self.query_size != self.input_size:
+            s += ', query_size={}'.format(self.query_size)
+        if self.weight_dropout_module.p > 0.:
+            s += ', weight_dropout={}'.format(self.weight_dropout_module.p)
+        return s
diff --git a/fairseq/modules/dynamic_crf_layer.py b/fairseq/modules/dynamic_crf_layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f5acf377261b5be80dc09b9b88e507e9f1c9ff7
--- /dev/null
+++ b/fairseq/modules/dynamic_crf_layer.py
@@ -0,0 +1,184 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+This file re-implements the low-rank and beam approximation of the CRF layer
+Proposed by:
+
+Sun, Zhiqing, et al.
+Fast Structured Decoding for Sequence Models
+https://arxiv.org/abs/1910.11555
+
+The CRF implementation is mainly borrowed from
+https://github.com/kmkurn/pytorch-crf/blob/master/torchcrf/__init__.py
+
+"""
+
+import numpy as np
+import torch
+import torch.nn as nn
+
+
+def logsumexp(x, dim=1):  # log-sum-exp along `dim`, computed on x.float() and cast back to x's type
+    return torch.logsumexp(x.float(), dim=dim).type_as(x)
+
+
+class DynamicCRF(nn.Module):
+    """Dynamic CRF layer is used to approximate the traditional
+       Conditional Random Fields (CRF)
+       $P(y | x) = 1/Z(x) exp(sum_i s(y_i, x) + sum_i t(y_{i-1}, y_i, x))$
+
+       where in this function, we assume the emission scores (s) are given,
+       and the transition score is a |V| x |V| matrix $M$
+
+       in the following two aspects:
+        (1) it used a low-rank approximation for the transition matrix:
+            $M = E_1 E_2^T$
+        (2) it used a beam to estimate the normalizing factor Z(x)
+    """
+
+    def __init__(self, num_embedding, low_rank=32, beam_size=64):
+        super().__init__()
+        # low-rank factors of the transition matrix: the score of a -> b is the dot product E1[a] . E2[b]
+        self.E1 = nn.Embedding(num_embedding, low_rank)
+        self.E2 = nn.Embedding(num_embedding, low_rank)
+        # bookkeeping: vocabulary size, factor rank, default beam width
+        self.vocb = num_embedding
+        self.rank = low_rank
+        self.beam = beam_size
+
+    def extra_repr(self):  # summary string of the sizes stored by __init__
+        return "vocab_size={}, low_rank={}, beam_size={}".format(
+            self.vocb, self.rank, self.beam)
+
+    def forward(self, emissions, targets, masks, beam=None):
+        """Compute the conditional log-likelihood of a sequence of target tokens given emission scores
+
+        Args:
+            emissions (`~torch.Tensor`): Emission score are usually the unnormalized decoder output
+                ``(batch_size, seq_len, vocab_size)``. We assume batch-first
+            targets (`~torch.LongTensor`): Sequence of target token indices
+                ``(batch_size, seq_len)``
+            masks (`~torch.ByteTensor`): Mask tensor with the same size as targets
+            beam (int, optional): number of top tokens kept per position; defaults to ``self.beam``
+
+        Returns:
+            `~torch.Tensor`: approximated log-likelihood
+        """
+        numerator = self._compute_score(emissions, targets, masks)
+        denominator = self._compute_normalizer(emissions, targets, masks, beam)
+        return numerator - denominator
+
+    def forward_decoder(self, emissions, masks=None, beam=None):
+        """Find the most likely output sequence using Viterbi algorithm.
+
+        Args:
+            emissions (`~torch.Tensor`): Emission score are usually the unnormalized decoder output
+                ``(batch_size, seq_len, vocab_size)``. We assume batch-first
+            masks (`~torch.ByteTensor`): Mask tensor with the same size as targets
+            beam (int, optional): number of top tokens kept per position; defaults to ``self.beam``
+
+        Returns:
+            `~torch.LongTensor`: decoded sequence from the CRF model
+        """
+        return self._viterbi_decode(emissions, masks, beam)
+
+    def _compute_score(self, emissions, targets, masks=None):  # unnormalized score of the target path, one value per batch row
+        batch_size, seq_len = targets.size()  # locals are unused below; the unpack requires targets to be 2-D
+        emission_scores = emissions.gather(2, targets[:, :, None])[:, :, 0]  # B x T
+        transition_scores = (self.E1(targets[:, :-1]) * self.E2(targets[:, 1:])).sum(2)
+        # position i >= 1 also receives the transition score from target i-1 to target i
+        scores = emission_scores
+        scores[:, 1:] += transition_scores
+        # masked-out positions (mask == 0) contribute nothing to the sum
+        if masks is not None:
+            scores = scores * masks.type_as(scores)
+        return scores.sum(-1)
+
+    def _compute_normalizer(self, emissions, targets=None, masks=None, beam=None):
+        # HACK: we include "target" which is a heuristic for training
+        # HACK: we use a beam of tokens to approximate the normalizing factor (which is bad?)
+        # Returns the beam-approximated log-partition value, one per batch row (K below is the beam size).
+        beam = beam if beam is not None else self.beam
+        batch_size, seq_len = emissions.size()[:2]
+        if targets is not None:
+            _emissions = emissions.scatter(2, targets[:, :, None], float('inf'))  # builtin float: np.float was removed in NumPy 1.24; +inf forces the target into the top-k
+            beam_targets = _emissions.topk(beam, 2)[1]
+            beam_emission_scores = emissions.gather(2, beam_targets)
+        else:
+            beam_emission_scores, beam_targets = emissions.topk(beam, 2)
+        beam_transition_score1 = self.E1(beam_targets[:, :-1])  # B x (T-1) x K x D
+        beam_transition_score2 = self.E2(beam_targets[:, 1:])   # B x (T-1) x K x D
+        beam_transition_matrix = torch.bmm(
+            beam_transition_score1.view(-1, beam, self.rank),
+            beam_transition_score2.view(-1, beam, self.rank).transpose(1, 2))
+        beam_transition_matrix = beam_transition_matrix.view(batch_size, -1, beam, beam)
+
+        # compute the normalizer in the log-space
+        score = beam_emission_scores[:, 0]  # B x K
+        for i in range(1, seq_len):
+            next_score = score[:, :, None] + beam_transition_matrix[:, i-1]
+            next_score = logsumexp(next_score, dim=1) + beam_emission_scores[:, i]
+
+            if masks is not None:
+                score = torch.where(masks[:, i:i+1], next_score, score)  # masked-out steps keep the previous score
+            else:
+                score = next_score
+
+        # Sum (log-sum-exp) over all possible tags
+        return logsumexp(score, dim=1)
+
+    def _viterbi_decode(self, emissions, masks=None, beam=None):
+        """Beam-restricted Viterbi search; returns ``(scores, tokens)``, each ``B x T``."""
+        # HACK: only the top-``beam`` tokens of every position are considered (which is bad?)
+        beam = beam if beam is not None else self.beam
+        batch_size, seq_len = emissions.size()[:2]
+        beam_emission_scores, beam_targets = emissions.topk(beam, 2)
+        beam_transition_score1 = self.E1(beam_targets[:, :-1])  # B x (T-1) x K x D
+        beam_transition_score2 = self.E2(beam_targets[:, 1:])   # B x (T-1) x K x D
+        beam_transition_matrix = torch.bmm(
+            beam_transition_score1.view(-1, beam, self.rank),
+            beam_transition_score2.view(-1, beam, self.rank).transpose(1, 2))
+        beam_transition_matrix = beam_transition_matrix.view(batch_size, -1, beam, beam)
+
+        traj_tokens, traj_scores = [], []
+        finalized_tokens, finalized_scores = [], []
+
+        # max recursion in log-space over the beam entries (best-path scores)
+        score = beam_emission_scores[:, 0]  # B x K
+        dummy = torch.arange(beam, device=score.device).expand(*score.size()).contiguous()
+        # ``dummy`` is the identity back-pointer used wherever a step is masked out
+        for i in range(1, seq_len):
+            traj_scores.append(score)
+            _score = score[:, :, None] + beam_transition_matrix[:, i-1]
+            _score, _index = _score.max(dim=1)
+            _score = _score + beam_emission_scores[:, i]
+            # where the mask is false: keep the old score and point each beam slot at itself
+            if masks is not None:
+                score = torch.where(masks[:, i: i+1], _score, score)
+                index = torch.where(masks[:, i: i+1], _index, dummy)
+            else:
+                score, index = _score, _index
+            traj_tokens.append(index)
+
+        # now running the back-tracing and find the best
+        best_score, best_index = score.max(dim=1)
+        finalized_tokens.append(best_index[:, None])
+        finalized_scores.append(best_score[:, None])
+
+        for idx, scs in zip(reversed(traj_tokens), reversed(traj_scores)):
+            previous_index = finalized_tokens[-1]
+            finalized_tokens.append(idx.gather(1, previous_index))
+            finalized_scores.append(scs.gather(1, previous_index))
+
+        finalized_tokens.reverse()
+        finalized_tokens = torch.cat(finalized_tokens, 1)
+        finalized_tokens = beam_targets.gather(2, finalized_tokens[:, :, None])[:, :, 0]  # beam slot -> token id
+        # the collected scores are cumulative along the path; differencing yields per-position values
+        finalized_scores.reverse()
+        finalized_scores = torch.cat(finalized_scores, 1)
+        finalized_scores[:, 1:] = finalized_scores[:, 1:] - finalized_scores[:, :-1]
+
+        return finalized_scores, finalized_tokens
diff --git a/fairseq/modules/dynamicconv_layer/__init__.py b/fairseq/modules/dynamicconv_layer/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..22dc6f403d2a0ecdb1b9e7e69ed96bd560e93b2c
--- /dev/null
+++ b/fairseq/modules/dynamicconv_layer/__init__.py
@@ -0,0 +1,6 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .dynamicconv_layer import DynamicconvLayer  # noqa
diff --git a/fairseq/modules/dynamicconv_layer/cuda_function_gen.py b/fairseq/modules/dynamicconv_layer/cuda_function_gen.py
new file mode 100644
index 0000000000000000000000000000000000000000..926d6ca846be37a2b7ac451ca62706763b53013b
--- /dev/null
+++ b/fairseq/modules/dynamicconv_layer/cuda_function_gen.py
@@ -0,0 +1,223 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+def gen_forward():
+
+    kernels = [3, 5, 7, 15, 31, 63, 127, 255]
+    blocks = [32, 64, 128, 256]
+
+    head = """
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include "dynamicconv_cuda.cuh"
+
+std::vector<at::Tensor> dynamicconv_cuda_forward(at::Tensor input, at::Tensor weight, int padding_l) {
+
+    at::DeviceGuard g(input.device());
+    const auto minibatch = input.size(0);
+    const auto numFeatures = input.size(1);
+    const auto sequenceLength = input.size(2);
+
+    const auto numHeads = weight.size(1);
+    const auto filterSize = weight.size(2);
+
+    const auto numFiltersInBlock = numFeatures / numHeads;
+    const dim3 blocks(minibatch, numFeatures);
+
+    auto output = at::zeros_like(input);
+    auto stream = at::cuda::getCurrentCUDAStream();
+"""
+
+    switch = """
+    switch(filterSize) {
+"""
+
+    case_k = """
+        case {k}:
+"""
+
+    main_block = """
+            if (padding_l == {pad}) {{
+                AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "dynamicconv_forward", ([&] {{
+                    dynamicconv_forward_kernel<{k}, {b_size}, {pad}, scalar_t>
+                    <<<blocks, {b_size}, 0, stream>>>(
+                            input.data<scalar_t>(),
+                            weight.data<scalar_t>(),
+                            minibatch,
+                            sequenceLength,
+                            numFeatures,
+                            numFiltersInBlock,
+                            numHeads,
+                            output.data<scalar_t>());
+                }}));
+            }} else
+"""
+
+    bad_padding = """
+            {
+                std::cout << "WARNING: Unsupported padding size - skipping forward pass" << std::endl;
+            }
+            break;\n
+"""
+
+    end = """
+        default:
+            std::cout << "WARNING: Unsupported filter length passed - skipping forward pass" << std::endl;
+    }
+
+    return {output};
+}
+"""
+
+    with open("dynamicconv_cuda_forward.cu", 'w') as forward:
+        forward.write(head)
+        forward.write(switch)
+        for k in kernels:
+            b_size = 32
+            for b in blocks:
+                if b > k:
+                    b_size = b
+                    break
+            forward.write(case_k.format(k=k))
+            for pad in [k // 2, k - 1]:
+                forward.write(main_block.format(k=k, b_size=b_size, pad=pad))
+            forward.write(bad_padding)
+        forward.write(end)
+
+
+def gen_backward():
+
+    kernels = [3, 5, 7, 15, 31, 63, 127, 255]
+    thresh = [512, 512, 512, 512, 512, 380, 256, 256]
+    min_block = [64, 64, 64, 64, 64, 64, 128, 256]
+    seqs = [32 * x for x in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]]
+
+    head = """
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include "dynamicconv_cuda.cuh"
+
+std::vector<at::Tensor> dynamicconv_cuda_backward(at::Tensor gradOutput, int padding_l, at::Tensor input, at::Tensor weight) {
+
+    at::DeviceGuard g(input.device());
+    const auto minibatch = input.size(0);
+    const auto numFeatures = input.size(1);
+    const auto sequenceLength = input.size(2);
+
+    const auto numHeads = weight.size(1);
+    const auto filterSize = weight.size(2);
+
+    const auto numFiltersInBlock = numFeatures / numHeads;
+    auto numChunks = 1;
+
+    auto gradInput = at::zeros_like(input);
+    auto gradWeight = at::zeros_like(weight);
+    auto stream = at::cuda::getCurrentCUDAStream();
+
+    dim3 blocks(minibatch, numHeads, numChunks);
+"""
+
+    sequence_if = """
+    if (sequenceLength < {seq}) {{
+        switch(filterSize) {{
+"""
+
+    case_k = """
+            case {k}:
+"""
+
+    chunks_reset = """
+                numChunks = int(ceilf(sequenceLength/float({b_size})));
+                blocks = dim3(minibatch, numHeads, numChunks);
+"""
+
+    main_block = """
+                if (padding_l == {p}) {{
+                    AT_DISPATCH_FLOATING_TYPES_AND_HALF(gradOutput.scalar_type(), "dynamicconv_backward", ([&] {{
+                        dynamicconv_backward_kernel<{k}, {b_size}, {p}, scalar_t>
+                        <<<blocks, {b_size}, 0, stream>>>(
+                                    gradOutput.data<scalar_t>(),
+                                    input.data<scalar_t>(),
+                                    weight.data<scalar_t>(),
+                                    minibatch,
+                                    sequenceLength,
+                                    numFeatures,
+                                    numFiltersInBlock,
+                                    numHeads,
+                                    gradWeight.data<scalar_t>(),
+                                    gradInput.data<scalar_t>());
+                    }}));
+                }} else
+"""
+
+    bad_padding = """
+                {
+                    std::cout << "WARNING: Unsupported padding size - skipping backward pass" << std::endl;
+                }
+                break;\n
+"""
+
+    bad_filter = """
+            default:
+                std::cout << "WARNING: Unsupported filter length passed - skipping backward pass" << std::endl;
+        }
+"""
+
+    con_else = """
+    } else
+"""
+
+    final_else = """
+    {
+        switch(filterSize) {
+"""
+
+    last_return = """
+    }
+    return {gradInput, gradWeight};
+}
+"""
+
+    with open("dynamicconv_cuda_backward.cu", 'w') as backward:
+        backward.write(head)
+        for seq in seqs:
+            backward.write(sequence_if.format(seq=seq))
+            for k, t, m in zip(kernels, thresh, min_block):
+                backward.write(case_k.format(k=k))
+                if seq <= t:
+                    b_size = seq
+                else:
+                    b_size = m
+                    backward.write(chunks_reset.format(b_size=b_size))
+                for p in [k // 2, k - 1]:
+                    backward.write(main_block.format(k=k, b_size=b_size, p=p))
+                backward.write(bad_padding)
+            backward.write(bad_filter)
+            backward.write(con_else)
+        backward.write(final_else)
+        for k, m in zip(kernels, min_block):
+            backward.write(case_k.format(k=k))
+            backward.write(chunks_reset.format(b_size=m))
+            for p in [k // 2, k - 1]:
+                backward.write(main_block.format(k=k, b_size=m, p=p))
+            backward.write(bad_padding)
+        backward.write(bad_filter)
+        backward.write(last_return)
+
+
+if __name__ == "__main__":  # script entry point: regenerate both launcher sources
+    gen_forward()  # writes dynamicconv_cuda_forward.cu to the current working directory
+    gen_backward()  # writes dynamicconv_cuda_backward.cu to the current working directory
diff --git a/fairseq/modules/dynamicconv_layer/dynamicconv_cuda.cpp b/fairseq/modules/dynamicconv_layer/dynamicconv_cuda.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ebd4df0e9608d769f31eadc6e0b487505f11b279
--- /dev/null
+++ b/fairseq/modules/dynamicconv_layer/dynamicconv_cuda.cpp
@@ -0,0 +1,56 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <torch/extension.h>
+#include <vector>
+
+std::vector<at::Tensor> dynamicconv_cuda_forward(  // declared only; defined outside this file
+    at::Tensor input,
+    at::Tensor filters,
+    int padding_l);
+// Backward launcher, also declared only; padding_l comes second here, unlike in the forward signature.
+std::vector<at::Tensor> dynamicconv_cuda_backward(
+    at::Tensor gradOutput,
+    int padding_l,
+    at::Tensor input,
+    at::Tensor filters);
+
+
+#define CHECK_CUDA(x) AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor")  // asserts x lives on a CUDA device
+#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")  // asserts x is contiguous
+#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)  // expands to two statements: use only as a full statement
+
+// Forward entry point bound to Python: validate both tensors, then hand
+// them to the CUDA launcher declared at the top of this file.
+std::vector<at::Tensor> dynamicconv_forward(
+    at::Tensor input, at::Tensor filters, int padding_l) {
+    // both operands must be contiguous CUDA tensors (see CHECK_INPUT)
+    CHECK_INPUT(input);
+    CHECK_INPUT(filters);
+
+    auto outputs = dynamicconv_cuda_forward(input, filters, padding_l);
+    return outputs;
+}
+
+// Backward entry point bound to Python: validate all three tensors, then
+// hand them to the CUDA launcher declared at the top of this file.
+std::vector<at::Tensor> dynamicconv_backward(
+    at::Tensor gradOutput, int padding_l,
+    at::Tensor input, at::Tensor filters) {
+    // every tensor operand must be a contiguous CUDA tensor (see CHECK_INPUT)
+    CHECK_INPUT(gradOutput);
+    CHECK_INPUT(input);
+    CHECK_INPUT(filters);
+
+    auto grads = dynamicconv_cuda_backward(gradOutput, padding_l, input, filters);
+    return grads;
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {  // Python bindings for the two checked wrappers above
+    m.def("forward", &dynamicconv_forward, "dynamicconv forward (CUDA)");  // exposed as <module>.forward
+    m.def("backward", &dynamicconv_backward, "dynamicconv backward (CUDA)");  // exposed as <module>.backward
+}
diff --git a/fairseq/modules/dynamicconv_layer/dynamicconv_cuda.cuh b/fairseq/modules/dynamicconv_layer/dynamicconv_cuda.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..2196259433aefc88f96cd5bbcae57740a9a8c2dc
--- /dev/null
+++ b/fairseq/modules/dynamicconv_layer/dynamicconv_cuda.cuh
@@ -0,0 +1,51 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ * 
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <ATen/ATen.h>
+#include <c10/cuda/CUDAStream.h>
+
+#include <cuda.h>
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+
+#include <algorithm>
+#include <functional>
+#include <iostream>
+#include <stdexcept>
+#include <utility>
+#include <vector>
+
+#include <stdlib.h>
+#include <assert.h>
+#include <math.h>
+
+#define SHFL_MASK 0xffffffff  // all 32 bits set; NOTE(review): presumably a full-warp shuffle mask for helpers in cuda_utils.cu - confirm
+
+template<int FS, int SB, int padding_l, typename scalar_t>  // FS: filter size; SB: expected blockDim.x
+__global__
+void dynamicconv_forward_kernel(const scalar_t* input,   // B * C * T
+                                const scalar_t* weight,  // B * H * FS * T
+                                int minibatch, 
+                                int sequenceLength,
+                                int numFeatures, 
+                                int numFiltersInBlock,   // features that share one head
+                                int numHeads,
+                                scalar_t* output);       // B * C * T
+
+template<int FS, int SB, int padding_l, typename scalar_t>  // FS: filter size; SB: expected blockDim.x
+__global__
+void dynamicconv_backward_kernel(
+    const scalar_t* gradOutput, // B * C * T
+    const scalar_t* input, // B * C * T
+    const scalar_t* weight, // B * H * k * T
+    int minibatch,
+    int sequenceLength,
+    int numFeatures,
+    int numFiltersInBlock,
+    int numHeads,
+    scalar_t* gradWeight, // B * H * k * T
+    scalar_t* gradInput); // B * C * T
diff --git a/fairseq/modules/dynamicconv_layer/dynamicconv_cuda_kernel.cu b/fairseq/modules/dynamicconv_layer/dynamicconv_cuda_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..300d35b6478080a9594a22e335988c321d43127f
--- /dev/null
+++ b/fairseq/modules/dynamicconv_layer/dynamicconv_cuda_kernel.cu
@@ -0,0 +1,168 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ * 
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include "dynamicconv_cuda.cuh"
+#include "dynamicconv_cuda_forward.cu"
+#include "dynamicconv_cuda_backward.cu"
+#include "../cuda_utils.cu"
+
+// FS is the filter size (kernels are specialized per filter size); SB must equal blockDim.x
+template<int FS, int SB, int padding_l, typename scalar_t>
+__global__
+void dynamicconv_forward_kernel(const scalar_t* input,
+                                const scalar_t* weight,
+                                int minibatch,
+                                int sequenceLength,
+                                int numFeatures,
+                                int numFiltersInBlock,
+                                int numHeads,
+                                scalar_t* output) {
+  assert(blockDim.x == SB);
+  // one block per (batch element, feature channel); threads index sequence positions
+  const int tid = threadIdx.x;
+  const int batchIdx = blockIdx.x;
+  const int featureIdx = blockIdx.y;
+  const int head = featureIdx / numFiltersInBlock;
+  // input and output are addressed as [batch][feature][time]
+  const int IOOffset = batchIdx * numFeatures * sequenceLength
+                       + featureIdx * sequenceLength;
+  const scalar_t* inputFeature = &input[IOOffset];
+  scalar_t* outputFeature = &output[IOOffset];
+  // per-thread copy of the FS filter taps for the current position
+  scalar_t filter[FS];
+  // shared staging buffer with room for SB + FS values
+  __shared__ scalar_t tempInput[SB + FS];
+  zeroSharedMem<FS, SB, padding_l>(tempInput);
+  // the sequence is walked in chunks of SB positions
+  const int numIterations = divUp<int, int>(sequenceLength, SB);
+
+  for (int i = 0; i < numIterations; ++i) {
+    __syncthreads();
+    const int inputOffset = i * SB;
+    load_input_to_shared<FS, SB, padding_l>(inputFeature, inputOffset,
+                                            sequenceLength, i,
+                                            numIterations, false, tempInput);
+    __syncthreads();
+    if (inputOffset + tid < sequenceLength) {
+      // weight is addressed as [batch][head][tap k][time]
+      #pragma unroll
+      for (int k = 0; k < FS; ++k) {
+        const int filterOffset = batchIdx * numHeads * FS * sequenceLength
+                                 + head * FS * sequenceLength
+                                 + k * sequenceLength
+                                 + i * SB + tid;
+        filter[k] = weight[filterOffset];
+      }
+      // dot product of the FS taps with the staged window that starts at tid
+      scalar_t out = scalar_t(0.0);
+      #pragma unroll
+      for (int k = 0; k < FS; ++k) {
+        out += filter[k] * tempInput[tid + k];
+      }
+
+      outputFeature[inputOffset + tid] = out;
+
+    }
+  }
+}
+
+template<int FS, int SB, int padding_l, typename scalar_t>
+__global__
+void dynamicconv_backward_kernel(
+    const scalar_t* gradOutput, // B * C * T
+    const scalar_t* input, // B * C * T
+    const scalar_t* weight, // B * H * k * T
+    int minibatch,
+    int sequenceLength,
+    int numFeatures,
+    int numFiltersInBlock,
+    int numHeads,
+    scalar_t* gradWeight, // B * H * k * T
+    scalar_t* gradInput) { // B * C * T
+
+  assert(blockDim.x == SB);
+
+  // each block operates on a single batch and filter head
+  const int tid = threadIdx.x;
+  const int batchIdx = blockIdx.x;
+  const int headIdx = blockIdx.y;
+  const int chunkIdx = blockIdx.z;
+  // blockIdx.z selects which SB-wide chunk of the sequence this block covers
+  const int numChunks = divUp<int, int>(sequenceLength, SB);
+  const int inputOffset = chunkIdx * SB;
+
+  // initialize shared memory for output gradient and input
+  __shared__ scalar_t tempGradOutput[SB + FS];
+  __shared__ scalar_t tempInput[SB + FS];
+  const int padding = FS - padding_l - 1;
+  // padding + padding_l == FS - 1: the gradient window uses the mirrored padding
+  zeroSharedMem<FS, SB, padding>(tempGradOutput);
+  zeroSharedMem<FS, SB, padding_l>(tempInput);
+
+  // initialize local filter and weight gradient sum arrays
+  scalar_t tempGradSum[FS];
+  scalar_t bfilter[FS];
+  for (int k = 0; k < FS; ++k) {
+    tempGradSum[k] = scalar_t(0.0);
+    // bfilter[k] takes tap (FS - k - 1) at time idxOffset, or zero outside the sequence
+    int idxOffset = inputOffset + tid + k - padding;
+    if (idxOffset >= 0 && idxOffset < sequenceLength) {
+      int bfilterOffset = batchIdx * numHeads * FS * sequenceLength
+                          + headIdx * FS * sequenceLength
+                          + (FS - k  - 1) * sequenceLength
+                          + idxOffset;
+      bfilter[k] = weight[bfilterOffset];
+    } else {
+      bfilter[k] = scalar_t(0.0);
+    }
+  }
+
+
+  // iterate over filter block
+  for (int featureIdx = 0; featureIdx < numFiltersInBlock; ++featureIdx) {
+    __syncthreads();
+
+    // load input and output gradient for this channel and chunk
+    const int IOOffset = batchIdx * numFeatures * sequenceLength
+                         + (headIdx * numFiltersInBlock + featureIdx) * sequenceLength;
+    const scalar_t* inputFeature = &input[IOOffset];
+    const scalar_t* gradOutputFeature = &gradOutput[IOOffset];
+    scalar_t* gradInputFeature = &gradInput[IOOffset];
+    // both loads stage into shared memory; the barrier below precedes the reads
+    load_input_to_shared<FS, SB, padding>(gradOutputFeature, inputOffset,
+                                            sequenceLength, chunkIdx,
+                                            numChunks, true, tempGradOutput);
+    load_input_to_shared<FS, SB, padding_l>(inputFeature, inputOffset,
+                                            sequenceLength, chunkIdx,
+                                            numChunks, true, tempInput);
+    __syncthreads();
+    // the sums run for every thread; only the stores further down are range-checked
+    // sum input and weight gradients
+    scalar_t out = scalar_t(0.0);
+    #pragma unroll
+    for (int k = 0; k < FS; ++k) {
+      tempGradSum[k] += tempInput[tid + k] * tempGradOutput[tid + padding];
+      out += bfilter[k] * tempGradOutput[tid + k];
+    }
+    // only positions inside the sequence store their input gradient
+    if (inputOffset + tid < sequenceLength) {
+      gradInputFeature[inputOffset + tid] = out;
+    }
+  }
+  // gradWeight is addressed as [batch][head][tap k][time]
+  const int gradOffset = batchIdx * numHeads * FS * sequenceLength
+               + headIdx * FS * sequenceLength;
+  scalar_t *gradWeightFeature = &gradWeight[gradOffset];
+
+  // write weight gradient
+  if (inputOffset + tid < sequenceLength) {
+    for (int k = 0; k < FS; ++k) {
+      const int outputOffset = k * sequenceLength + inputOffset + tid;
+      gradWeightFeature[outputOffset] = tempGradSum[k];
+    }
+  }
+}
diff --git a/fairseq/modules/dynamicconv_layer/dynamicconv_layer.py b/fairseq/modules/dynamicconv_layer/dynamicconv_layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..52cc1e8118885920912ae8d45cad0855dcf73090
--- /dev/null
+++ b/fairseq/modules/dynamicconv_layer/dynamicconv_layer.py
@@ -0,0 +1,216 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from torch import nn
+from torch.autograd import Function
+import torch.nn.functional as F
+
+import dynamicconv_cuda
+from fairseq import utils
+from fairseq.modules.unfold import unfold1d
+from fairseq.incremental_decoding_utils import with_incremental_state
+from fairseq.modules.fairseq_dropout import FairseqDropout
+
+
+class dynamicconvFunction(Function):
+
+    @staticmethod
+    def forward(ctx, x, weights, padding_l):
+        ctx.padding_l = padding_l
+        outputs = dynamicconv_cuda.forward(x, weights, padding_l)
+        variables = [x, weights]
+        ctx.save_for_backward(*variables)
+        return outputs[0]
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        outputs = dynamicconv_cuda.backward(
+                grad_output.contiguous(),
+                ctx.padding_l,
+                *ctx.saved_tensors)
+        grad_input, grad_weights = outputs
+        return grad_input, grad_weights, None
+
+
+@with_incremental_state
+class DynamicconvLayer(nn.Module):
+    def __init__(
+            self,
+            input_size,
+            kernel_size=1,
+            padding_l=None,
+            weight_softmax=False,
+            num_heads=1,
+            weight_dropout=0.,
+            bias=False,
+            renorm_padding=False,
+            conv_bias=False,
+            query_size=None,
+    ):
+
+        super(DynamicconvLayer, self).__init__()
+        self.input_size = input_size
+        self.query_size = input_size if query_size is None else query_size
+        self.kernel_size = kernel_size
+        self.padding_l = padding_l
+        self.num_heads = num_heads
+        self.weight_softmax = weight_softmax
+        self.weight_dropout_module = FairseqDropout(weight_dropout, module_name=self.__class__.__name__)
+        self.renorm_padding = renorm_padding
+        self.bias = bias
+
+        self.weight_linear = nn.Linear(input_size, num_heads * kernel_size, bias)
+        if conv_bias:
+            self.conv_bias = nn.Parameter(torch.Tensor(input_size))
+        else:
+            self.conv_bias = None
+        self.reset_parameters()
+
+    def reset_parameters(self):
+        nn.init.xavier_uniform_(self.weight_linear.weight)
+        if self.conv_bias is not None:
+            nn.init.constant_(self.conv_bias, 0.)
+            nn.init.constant_(self.weight_linaer.bias, 0.)
+
+    def forward(self, x, incremental_state=None, query=None, unfold=None):
+
+        T, B, C = x.size()
+        K, H = self.kernel_size, self.num_heads
+        # R = C // H
+
+        # during inference time, incremental BMM is faster
+        if incremental_state is not None:
+            unfold = x.size(0) > 512 if unfold is None else unfold  # use unfold mode as default for long sequence to save memory
+            unfold = unfold or (incremental_state is not None)
+            assert query is None
+
+            if query is None:
+                query = x
+            if unfold:
+                output = self._forward_unfolded(x, incremental_state, query)
+            else:
+                output = self._forward_expanded(x, incremental_state, query)
+
+            if self.conv_bias is not None:
+                output = output + self.conv_bias.view(1, 1, -1)
+
+            return output
+
+        # during training time, use CUDA kernel
+        else:
+            weight = self.weight_linear(x).view(T, B, H, K)
+            if self.weight_softmax:
+                weight = F.softmax(weight, dim=-1)
+            if self.weight_dropout_module.p:
+                weight = self.weight_dropout_module(weight)
+
+            weight = weight.permute(1, 2, 3, 0).contiguous()
+            self.filters = weight
+            x = x.permute(1, 2, 0).contiguous()
+            output = dynamicconvFunction.apply(x, weight, self.padding_l).permute(2, 0, 1)
+            if self.conv_bias is not None:
+                output = output + self.conv_bias.view(1, 1, -1)
+            return output
+
+    def reorder_incremental_state(self, incremental_state, new_order):
+        input_buffer = self._get_input_buffer(incremental_state)
+        if input_buffer is not None:
+            input_buffer = input_buffer.index_select(1, new_order)
+            self._set_input_buffer(incremental_state, input_buffer)
+
+    def _get_input_buffer(self, incremental_state):
+        return utils.get_incremental_state(self, incremental_state, 'input_buffer')
+
+    def _set_input_buffer(self, incremental_state, new_buffer):
+        return utils.set_incremental_state(self, incremental_state, 'input_buffer', new_buffer)
+
+    def _forward_unfolded(self, x, incremental_state, query):
+        '''The conventional implementation of convolutions.
+        Unfolding the input by having a window shifting to the right.'''
+        T, B, C = x.size()
+        K, H = self.kernel_size, self.num_heads
+        R = C // H
+        assert R * H == C == self.input_size
+
+        weight = self.weight_linear(query).view(T*B*H, -1)
+
+        # renorm_padding is only implemented in _forward_expanded
+        assert not self.renorm_padding or incremental_state is not None
+
+        if incremental_state is not None:
+            input_buffer = self._get_input_buffer(incremental_state)
+            if input_buffer is None:
+                input_buffer = x.new()
+            x_unfold = torch.cat([input_buffer, x.unsqueeze(3)], dim=3)
+            if self.kernel_size > 1:
+                self._set_input_buffer(incremental_state, x_unfold[:, :, :, -self.kernel_size+1:])
+            x_unfold = x_unfold.view(T*B*H, R, -1)
+        else:
+            padding_l = self.padding_l
+            if K > T and padding_l == K-1:
+                weight = weight.narrow(1, K-T, T)
+                K, padding_l = T, T-1
+            # unfold the input: T x B x C --> T' x B x C x K
+            x_unfold = unfold1d(x, K, padding_l, 0)
+            x_unfold = x_unfold.view(T*B*H, R, K)
+
+        if self.weight_softmax and not self.renorm_padding:
+            weight = F.softmax(weight, dim=1)
+        weight = weight.narrow(1, 0, K)
+
+        if incremental_state is not None:
+            weight = weight[:, -x_unfold.size(2):]
+            K = weight.size(1)
+
+        if self.weight_softmax and self.renorm_padding:
+            weight = F.softmax(weight, dim=1)
+
+        weight = self.weight_dropout_module(weight, inplace=False)
+
+        output = torch.bmm(x_unfold, weight.unsqueeze(2))  # T*B*H x R x 1
+        output = output.view(T, B, C)
+        return output
+
+    def _forward_expanded(self, x, incremental_stat, query):
+        '''Turn the convolution filters into band matrices and do matrix multiplication.
+        This is faster when the sequence is short, but less memory efficient.
+        This is not used in the decoder during inference.
+        '''
+        T, B, C = x.size()
+        K, H = self.kernel_size, self.num_heads
+        R = C // H
+        assert R * H == C == self.input_size
+        weight = self.weight_linear(query).view(T*B*H, -1)
+
+        if not self.renorm_padding:
+            if self.weight_softmax:
+                weight = F.softmax(weight, dim=1)
+            weight = self.weight_dropout_module(weight, inplace=False)
+        weight = weight.narrow(1, 0, K).contiguous()
+        weight = weight.view(T, B*H, K).transpose(0, 1)
+
+        x = x.view(T, B*H, R).transpose(0, 1)
+        if self.weight_softmax and self.renorm_padding:
+            # turn the convolution filters into band matrices
+            weight_expanded = weight.new(B*H, T, T+K-1).fill_(float('-inf'))
+            weight_expanded.as_strided((B*H, T, K), (T*(T+K-1), T+K, 1)).copy_(weight)
+            weight_expanded = weight_expanded.narrow(2, self.padding_l, T)
+            # normalize the weight over valid positions like self-attention
+            weight_expanded = F.softmax(weight_expanded, dim=2)
+            weight_expanded = self.weight_dropout_module(weight_expanded, inplace=False)
+        else:
+            P = self.padding_l
+            # For efficieny, we cut the kernel size and reduce the padding when the kernel is larger than the length
+            if K > T and P == K-1:
+                weight = weight.narrow(2, K-T, T)
+                K, P = T, T-1
+            # turn the convolution filters into band matrices
+            weight_expanded = weight.new_zeros(B*H, T, T+K-1, requires_grad=False)
+            weight_expanded.as_strided((B*H, T, K), (T*(T+K-1), T+K, 1)).copy_(weight)
+            weight_expanded = weight_expanded.narrow(2, P, T)  # B*H x T x T
+        output = torch.bmm(weight_expanded, x)
+        output = output.transpose(0, 1).contiguous().view(T, B, C)
+        return output
diff --git a/fairseq/modules/dynamicconv_layer/dynamiconv_cpu.cpp b/fairseq/modules/dynamicconv_layer/dynamiconv_cpu.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8a6af4285da3c40a01383541acf1f455ffc060fb
--- /dev/null
+++ b/fairseq/modules/dynamicconv_layer/dynamiconv_cpu.cpp
@@ -0,0 +1,35 @@
+#include <torch/torch.h>
+#include <vector>
+
+std::vector<float*> dynamicconv_cpu_forward(
+    float* input,
+    float* filters,
+    int padding_l);
+
+std::vector<float*> dynamicconv_cpu_backward(
+    float* gradOutput,
+    int padding_l,
+    float* input,
+    float* filters);
+
+std::vector<float*> dynamicconv_forward(
+    float* input,
+    float* filters,
+    int padding_l) {
+
+    return dynamicconv_cpu_forward(input, filters, padding_l);
+}
+
+std::vector<float*> dynamicconv_backward(
+    float* gradOutput,
+    int padding_l,
+    float* input,
+    float* filters) {
+
+    return dynamicconv_cpu_backward(gradOutput, padding_l, input, filters);
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+    m.def("forward", &dynamicconv_forward, "dynamicconv forward (CPU)");
+    m.def("backward", &dynamicconv_backward, "dynamicconv backward (CPU)");
+}
diff --git a/fairseq/modules/dynamicconv_layer/setup.py b/fairseq/modules/dynamicconv_layer/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..4d789c328371409bd82c9f0087efe6cff459f151
--- /dev/null
+++ b/fairseq/modules/dynamicconv_layer/setup.py
@@ -0,0 +1,23 @@
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from setuptools import setup
+from torch.utils.cpp_extension import CUDAExtension, BuildExtension
+
+setup(
+    name='dynamicconv_layer',
+    ext_modules=[
+        CUDAExtension(
+            name='dynamicconv_cuda',
+            sources=[
+                'dynamicconv_cuda.cpp',
+                'dynamicconv_cuda_kernel.cu',
+            ],
+        ),
+    ],
+    cmdclass={
+        'build_ext': BuildExtension
+    })
diff --git a/fairseq/modules/fairseq_dropout.py b/fairseq/modules/fairseq_dropout.py
new file mode 100644
index 0000000000000000000000000000000000000000..cbfacf477f4a0879a40dd15a641b2c3e86cc4ef7
--- /dev/null
+++ b/fairseq/modules/fairseq_dropout.py
@@ -0,0 +1,52 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+from typing import List, Optional
+
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+logger = logging.getLogger(__name__)
+
+
+class FairseqDropout(nn.Module):
+
+    def __init__(self, p, module_name=None):
+        super().__init__()
+        self.p = p
+        self.module_name = module_name
+        self.apply_during_inference = False
+
+    def forward(self, x, inplace: bool = False):
+        if self.training or self.apply_during_inference:
+            return F.dropout(x, p=self.p, training=True, inplace=inplace)
+        else:
+            return x
+
+    def make_generation_fast_(
+        self,
+        name: str,
+        retain_dropout: bool = False,
+        retain_dropout_modules: Optional[List[str]] = None,
+        **kwargs
+    ):
+        if retain_dropout:
+            if retain_dropout_modules is not None and self.module_name is None:
+                logger.warning(
+                    'Cannot enable dropout during inference for module {} '
+                    'because module_name was not set'.format(name)
+                )
+            elif (
+                retain_dropout_modules is None  # if None, apply to all modules
+                or self.module_name in retain_dropout_modules
+            ):
+                logger.info(
+                    'Enabling dropout during inference for module: {}'.format(name)
+                )
+                self.apply_during_inference = True
+            else:
+                logger.info('Disabling dropout for module: {}'.format(name))
diff --git a/fairseq/modules/fp32_group_norm.py b/fairseq/modules/fp32_group_norm.py
new file mode 100644
index 0000000000000000000000000000000000000000..d03aac022e30c8c14a600062d1d86429504ba003
--- /dev/null
+++ b/fairseq/modules/fp32_group_norm.py
@@ -0,0 +1,25 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+Group norm done in fp32 (for fp16 training)
+"""
+
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class Fp32GroupNorm(nn.GroupNorm):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def forward(self, input):
+        output = F.group_norm(
+            input.float(),  # normalize in fp32 regardless of the input dtype
+            self.num_groups,
+            self.weight.float() if self.weight is not None else None,
+            self.bias.float() if self.bias is not None else None,
+            self.eps,
+        )
+        return output.type_as(input)  # cast the result back to the input's dtype
diff --git a/fairseq/modules/gelu.py b/fairseq/modules/gelu.py
new file mode 100644
index 0000000000000000000000000000000000000000..a2f1ecff4a3ae3de3eb7d327b9163c46b18a15ed
--- /dev/null
+++ b/fairseq/modules/gelu.py
@@ -0,0 +1,25 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+See "Gaussian Error Linear Units (GELUs)" by Dan Hendrycks and Kevin Gimpel with
+the corresponding GitHub repo: https://github.com/hendrycks/GELUs
+"""
+
+import math
+
+import torch
+import torch.nn as nn
+
+
+def gelu_accurate(x):
+    if not hasattr(gelu_accurate, "_a"):
+        gelu_accurate._a = math.sqrt(2 / math.pi)
+    return (
+        0.5 * x * (1 + torch.tanh(gelu_accurate._a * (x + 0.044715 * torch.pow(x, 3))))
+    )
+
+
+def gelu(x: torch.Tensor) -> torch.Tensor:
+    return torch.nn.functional.gelu(x.float()).type_as(x)
diff --git a/fairseq/modules/grad_multiply.py b/fairseq/modules/grad_multiply.py
new file mode 100644
index 0000000000000000000000000000000000000000..08d15f55dfda9c61a1cf8641ea31424fe1d97f57
--- /dev/null
+++ b/fairseq/modules/grad_multiply.py
@@ -0,0 +1,18 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+
+
+class GradMultiply(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, x, scale):
+        ctx.scale = scale  # stash the multiplier for use in backward
+        res = x.new(x)
+        return res
+
+    @staticmethod
+    def backward(ctx, grad):
+        return grad * ctx.scale, None  # scaled gradient for x; no gradient for scale
diff --git a/fairseq/modules/gumbel_vector_quantizer.py b/fairseq/modules/gumbel_vector_quantizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..01ddd2298b7541bf4923d8df6d82d22ac5d1aadb
--- /dev/null
+++ b/fairseq/modules/gumbel_vector_quantizer.py
@@ -0,0 +1,198 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class GumbelVectorQuantizer(nn.Module):
+    def __init__(
+        self,
+        dim,
+        num_vars,
+        temp,
+        groups,
+        combine_groups,
+        vq_dim,
+        time_first,
+        activation=nn.GELU(),
+        weight_proj_depth=1,
+        weight_proj_factor=1,
+    ):
+        """Vector quantization using gumbel softmax
+
+        Args:
+            dim: input dimension (channels)
+            num_vars: number of quantized vectors per group
+            temp: temperature for training. this should be a tuple of 3 elements: (start, stop, decay factor)
+            groups: number of groups for vector quantization
+            combine_groups: whether to use the vectors for all groups
+            vq_dim: dimensionality of the resulting quantized vector
+            time_first: if true, expect input in BxTxC format, otherwise in BxCxT
+            activation: what activation to use (should be a module). this is only used if weight_proj_depth is > 1
+            weight_proj_depth: number of layers (with activation in between) to project input before computing logits
+            weight_proj_factor: this is used only if weight_proj_depth is > 1. scales the inner dimensionality of
+                                projections by this factor
+        """
+        super().__init__()
+
+        self.groups = groups
+        self.combine_groups = combine_groups
+        self.input_dim = dim
+        self.num_vars = num_vars
+        self.time_first = time_first
+
+        assert (
+            vq_dim % groups == 0
+        ), f"dim {vq_dim} must be divisible by groups {groups} for concatenation"
+
+        var_dim = vq_dim // groups
+        num_groups = groups if not combine_groups else 1
+
+        self.vars = nn.Parameter(torch.FloatTensor(1, num_groups * num_vars, var_dim))
+        nn.init.uniform_(self.vars)
+
+        if weight_proj_depth > 1:
+
+            def block(input_dim, output_dim):
+                return nn.Sequential(nn.Linear(input_dim, output_dim), activation)
+
+            inner_dim = self.input_dim * weight_proj_factor
+            self.weight_proj = nn.Sequential(
+                *[
+                    block(self.input_dim if i == 0 else inner_dim, inner_dim)
+                    for i in range(weight_proj_depth - 1)
+                ],
+                nn.Linear(inner_dim, groups * num_vars),
+            )
+        else:
+            self.weight_proj = nn.Linear(self.input_dim, groups * num_vars)
+            nn.init.normal_(self.weight_proj.weight, mean=0, std=1)
+            nn.init.zeros_(self.weight_proj.bias)
+
+        assert len(temp) == 3, temp
+
+        self.max_temp, self.min_temp, self.temp_decay = temp
+        self.curr_temp = self.max_temp
+        self.codebook_indices = None
+
+    def set_num_updates(self, num_updates):
+        self.curr_temp = max(
+            self.max_temp * self.temp_decay ** num_updates, self.min_temp
+        )
+    def get_codebook_indices(self):
+        if self.codebook_indices is None:
+            from itertools import product
+
+            p = [range(self.num_vars)] * self.groups
+            inds = list(product(*p))
+            self.codebook_indices = torch.tensor(
+                inds, dtype=torch.long, device=self.vars.device
+            ).flatten()
+
+            if not self.combine_groups:
+                self.codebook_indices = self.codebook_indices.view(
+                    self.num_vars ** self.groups, -1
+                )
+                for b in range(1, self.groups):
+                    self.codebook_indices[:, b] += self.num_vars * b
+                self.codebook_indices = self.codebook_indices.flatten()
+        return self.codebook_indices
+
+    def codebook(self):
+        indices = self.get_codebook_indices()
+        return (
+            self.vars.squeeze(0)
+                .index_select(0, indices)
+                .view(self.num_vars ** self.groups, -1)
+        )
+
+    def sample_from_codebook(self, b, n):
+        indices = self.get_codebook_indices()
+        indices = indices.view(-1, self.groups)
+        cb_size = indices.size(0)
+        assert (
+                n < cb_size
+        ), f"sample size {n} is greater than size of codebook {cb_size}"
+        sample_idx = torch.randint(low=0, high=cb_size, size=(b * n,))
+        indices = indices[sample_idx]
+
+        z = self.vars.squeeze(0).index_select(0, indices.flatten()).view(b, n, -1)
+        return z
+
+    def to_codebook_index(self, indices):
+        res = indices.new_full(indices.shape[:-1], 0)
+        for i in range(self.groups):
+            exponent = self.groups - i - 1
+            res += indices[..., i] * (self.num_vars ** exponent)
+        return res
+
+    def forward_idx(self, x):
+        res = self.forward(x, produce_targets=True)
+        return res["x"], res["targets"]
+
+    def forward(self, x, produce_targets=False):
+
+        result = {"num_vars": self.num_vars * self.groups}
+
+        if not self.time_first:
+            x = x.transpose(1, 2)
+
+        bsz, tsz, fsz = x.shape
+        x = x.reshape(-1, fsz)
+        x = self.weight_proj(x)
+        x = x.view(bsz * tsz * self.groups, -1)
+
+        _, k = x.max(-1)
+        hard_x = (
+            x.new_zeros(*x.shape)
+            .scatter_(-1, k.view(-1, 1), 1.0)
+            .view(bsz * tsz, self.groups, -1)
+        )
+        hard_probs = torch.mean(hard_x.float(), dim=0)
+        result["code_perplexity"] = torch.exp(
+            -torch.sum(hard_probs * torch.log(hard_probs + 1e-7), dim=-1)
+        ).sum()
+
+        avg_probs = torch.softmax(
+            x.view(bsz * tsz, self.groups, -1).float(), dim=-1
+        ).mean(dim=0)
+        result["prob_perplexity"] = torch.exp(
+            -torch.sum(avg_probs * torch.log(avg_probs + 1e-7), dim=-1)
+        ).sum()
+
+        result["temp"] = self.curr_temp
+
+        if self.training:
+            x = F.gumbel_softmax(x.float(), tau=self.curr_temp, hard=True).type_as(x)
+        else:
+            x = hard_x
+
+        x = x.view(bsz * tsz, -1)
+
+        vars = self.vars
+        if self.combine_groups:
+            vars = vars.repeat(1, self.groups, 1)
+
+        if produce_targets:
+            result["targets"] = (
+                x.view(bsz * tsz * self.groups, -1)
+                .argmax(dim=-1)
+                .view(bsz, tsz, self.groups)
+                .detach()
+            )
+
+        x = x.unsqueeze(-1) * vars
+        x = x.view(bsz * tsz, self.groups, self.num_vars, -1)
+        x = x.sum(-2)
+        x = x.view(bsz, tsz, -1)
+
+        if not self.time_first:
+            x = x.transpose(1, 2)  # BTC -> BCT
+
+        result["x"] = x
+
+        return result
diff --git a/fairseq/modules/kmeans_vector_quantizer.py b/fairseq/modules/kmeans_vector_quantizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..be56e6081bc836b77203236b495e4f6391e56020
--- /dev/null
+++ b/fairseq/modules/kmeans_vector_quantizer.py
@@ -0,0 +1,128 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import torch.nn as nn
+
+from fairseq.modules import Fp32GroupNorm
+
+
+class KmeansVectorQuantizer(nn.Module):
+    def __init__(
+        self, dim, num_vars, groups, combine_groups, vq_dim, time_first, gamma=0.25
+    ):
+        '''Vector quantization using straight pass-through estimator (i.e. kmeans)
+
+                Args:
+                    dim: input dimension (channels)
+                    num_vars: number of quantized vectors per group
+                    groups: number of groups for vector quantization
+                    combine_groups: whether to use the vectors for all groups
+                    vq_dim: dimensionality of the resulting quantized vector
+                    time_first: if true, expect input in BxTxC format, otherwise in BxCxT
+                    gamma: commitment loss coefficient
+                '''
+        super().__init__()
+
+        self.groups = groups
+        self.combine_groups = combine_groups
+        self.input_dim = dim
+        self.num_vars = num_vars
+        self.vq_dim = vq_dim
+        self.time_first = time_first
+
+        assert (
+            vq_dim % groups == 0
+        ), f"dim {vq_dim} must be divisible by groups {groups} for concatenation"
+
+        self.var_dim = vq_dim // groups
+        num_groups = groups if not combine_groups else 1
+
+        self.embedding = nn.Parameter(
+            0.01 * torch.randn(num_vars, num_groups, self.var_dim)
+        )
+        self.projection = nn.Sequential(
+            nn.Conv1d(dim, dim, kernel_size=1, groups=groups, bias=False),
+            Fp32GroupNorm(groups, dim),
+        )
+        self.gamma = gamma
+        self.mse_mean = nn.MSELoss(reduction="mean")
+
+    def _pass_grad(self, x, y):
+        """ Manually set gradient for backward pass.
+        for y = f(x), ensure that during the backward pass,
+        dL/dy = dL/dx regardless of f(x).
+        Returns:
+            y, with the gradient forced to be dL/dy = dL/dx.
+        """
+
+        return y.detach() + (x - x.detach())
+
+    @property
+    def expand_embedding(self):
+        if self.combine_groups:
+            return self.embedding.expand(self.num_vars, self.groups, self.var_dim)
+        return self.embedding
+
+    def forward_idx(self, x):
+        res = self.forward(x, produce_targets=True)
+        return res["x"], res["targets"]
+
+    def forward(self, x, produce_targets=False):
+
+        result = {"num_vars": self.num_vars}
+
+        if self.time_first:
+            x = x.transpose(1, 2)
+
+        bsz, fsz, tsz = x.shape
+
+        ze = self.projection(x)
+        ze_ = ze.view(bsz, self.groups, self.var_dim, tsz).permute(0, 3, 1, 2)
+        d = (
+            (ze_.unsqueeze(0) - self.expand_embedding.unsqueeze(1).unsqueeze(1))
+            .view(self.num_vars, bsz, tsz, self.groups, -1)
+            .norm(dim=-1, p=2)
+        )
+        idx = d.argmin(dim=0)
+        zq = (
+            torch.stack(
+                [
+                    self.expand_embedding[idx[..., group], group]
+                    for group in range(self.groups)
+                ],
+                dim=-2,
+            )
+            .view(bsz, tsz, self.groups * self.var_dim)
+            .permute(0, 2, 1)
+        )
+        assert ze.shape == zq.shape, (ze.shape, zq.shape)
+        x = self._pass_grad(ze, zq)
+
+        hard_x = (
+            idx.new_zeros(bsz*tsz*self.groups, self.num_vars)
+                .scatter_(-1, idx.view(-1, 1), 1.0)
+                .view(bsz * tsz, self.groups, -1)
+        )
+        hard_probs = torch.mean(hard_x.float(), dim=0)
+        result["code_perplexity"] = torch.exp(
+            -torch.sum(hard_probs * torch.log(hard_probs + 1e-7), dim=-1)
+        ).sum()
+
+        if produce_targets:
+            result["targets"] = idx
+
+        if self.time_first:
+            x = x.transpose(1, 2)  # BCT -> BTC
+        result["x"] = x
+
+        ze = ze.float()
+        zq = zq.float()
+        latent_loss = self.mse_mean(zq, ze.detach())
+        commitment_loss = self.mse_mean(ze, zq.detach())
+
+        result["kmeans_loss"] = latent_loss + self.gamma * commitment_loss
+
+        return result
diff --git a/fairseq/modules/layer_drop.py b/fairseq/modules/layer_drop.py
new file mode 100644
index 0000000000000000000000000000000000000000..8961d8bcbc492c40c6b30973234416ce5a414f5a
--- /dev/null
+++ b/fairseq/modules/layer_drop.py
@@ -0,0 +1,44 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+LayerDrop as described in https://arxiv.org/abs/1909.11556.
+"""
+
+import torch
+import torch.nn as nn
+
+
+class LayerDropModuleList(nn.ModuleList):
+    """
+    A LayerDrop implementation based on :class:`torch.nn.ModuleList`.
+
+    We refresh the choice of which layers to drop every time we iterate
+    over the LayerDropModuleList instance. During evaluation we always
+    iterate over all layers.
+
+    Usage::
+
+        layers = LayerDropModuleList(p=0.5, modules=[layer1, layer2, layer3])
+        for layer in layers:  # this might iterate over layers 1 and 3
+            x = layer(x)
+        for layer in layers:  # this might iterate over all layers
+            x = layer(x)
+        for layer in layers:  # this might not iterate over any layers
+            x = layer(x)
+
+    Args:
+        p (float): probability of dropping out each layer
+        modules (iterable, optional): an iterable of modules to add
+    """
+
+    def __init__(self, p, modules=None):
+        super().__init__(modules)
+        self.p = p
+
+    def __iter__(self):
+        dropout_probs = torch.empty(len(self)).uniform_()  # one uniform draw per layer
+        for i, m in enumerate(super().__iter__()):
+            if not self.training or (dropout_probs[i] > self.p):  # eval: keep all layers
+                yield m
diff --git a/fairseq/modules/layer_norm.py b/fairseq/modules/layer_norm.py
new file mode 100644
index 0000000000000000000000000000000000000000..4fee32d4fcfa8ff087765ae29028839e090a0288
--- /dev/null
+++ b/fairseq/modules/layer_norm.py
@@ -0,0 +1,47 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+try:
+    from apex.normalization import FusedLayerNorm as _FusedLayerNorm
+
+    has_fused_layernorm = True
+
+    class FusedLayerNorm(_FusedLayerNorm):
+        @torch.jit.unused
+        def forward(self, x):
+            if not x.is_cuda:
+                return super().forward(x)
+            else:
+                with torch.cuda.device(x.device):
+                    return super().forward(x)
+
+except ImportError:
+    has_fused_layernorm = False
+
+
+def LayerNorm(normalized_shape, eps=1e-5, elementwise_affine=True, export=False):
+    if not export and torch.cuda.is_available() and has_fused_layernorm:
+        return FusedLayerNorm(normalized_shape, eps, elementwise_affine)
+    return torch.nn.LayerNorm(normalized_shape, eps, elementwise_affine)
+
+
+class Fp32LayerNorm(nn.LayerNorm):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def forward(self, input):
+        output = F.layer_norm(
+            input.float(),
+            self.normalized_shape,
+            self.weight.float() if self.weight is not None else None,
+            self.bias.float() if self.bias is not None else None,
+            self.eps,
+        )
+        return output.type_as(input)
diff --git a/fairseq/modules/learned_positional_embedding.py b/fairseq/modules/learned_positional_embedding.py
new file mode 100644
index 0000000000000000000000000000000000000000..378d0f707183dd344dbb9288dda394b11053acf0
--- /dev/null
+++ b/fairseq/modules/learned_positional_embedding.py
@@ -0,0 +1,61 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Dict, Optional
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from fairseq import utils
+from torch import Tensor
+
+
+class LearnedPositionalEmbedding(nn.Embedding):
+    """
+    This module learns positional embeddings up to a fixed maximum size.
+    Padding ids are ignored by either offsetting based on padding_idx
+    or by setting padding_idx to None and ensuring that the appropriate
+    position ids are passed to the forward function.
+    """
+
+    def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int):
+        super().__init__(num_embeddings, embedding_dim, padding_idx)
+        self.onnx_trace = False
+        if self.padding_idx is not None:
+            self.max_positions = self.num_embeddings - self.padding_idx - 1
+        else:
+            self.max_positions = self.num_embeddings
+
+    def forward(
+        self,
+        input: Tensor,
+        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
+        positions: Optional[Tensor] = None,
+    ):
+        """Input is expected to be of size [bsz x seqlen]."""
+        assert (positions is None) or (
+            self.padding_idx is None
+        ), "If positions is pre-computed then padding_idx should not be set."
+
+        if positions is None:
+            if incremental_state is not None:
+                # positions is the same for every token when decoding a single step
+                # Without the int() cast, it doesn't work in some cases when exporting to ONNX
+                positions = torch.zeros(
+                    (1, 1), device=input.device, dtype=input.dtype
+                ).fill_(int(self.padding_idx + input.size(1)))
+            else:
+                positions = utils.make_positions(
+                    input, self.padding_idx, onnx_trace=self.onnx_trace
+                )
+        return F.embedding(
+            positions,
+            self.weight,
+            self.padding_idx,
+            self.max_norm,
+            self.norm_type,
+            self.scale_grad_by_freq,
+            self.sparse,
+        )
diff --git a/fairseq/modules/lightconv_layer/__init__.py b/fairseq/modules/lightconv_layer/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3b2a99c1227f827768911e5e22e79f6865ffbfd3
--- /dev/null
+++ b/fairseq/modules/lightconv_layer/__init__.py
@@ -0,0 +1,6 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .lightconv_layer import LightconvLayer  # noqa
diff --git a/fairseq/modules/lightconv_layer/cuda_function_gen.py b/fairseq/modules/lightconv_layer/cuda_function_gen.py
new file mode 100644
index 0000000000000000000000000000000000000000..afec9e19e7176a19c5e60389cb1bb0250c84de4b
--- /dev/null
+++ b/fairseq/modules/lightconv_layer/cuda_function_gen.py
@@ -0,0 +1,289 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+def gen_forward():
+
+    kernels = [3, 5, 7, 15, 31, 63, 127, 255]
+    seqs = [32 * x for x in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]]
+
+    head = """
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include "lightconv_cuda.cuh"
+
+std::vector<at::Tensor> lightconv_cuda_forward(at::Tensor input, at::Tensor filters, int padding_l) {
+
+    at::DeviceGuard g(input.device());
+    const auto minibatch = input.size(0);
+    const auto numFeatures = input.size(1);
+    const auto sequenceLength = input.size(2);
+
+    const auto numHeads = filters.size(0);
+    const auto filterSize = filters.size(1);
+
+    const auto numFiltersInBlock = numFeatures / numHeads;
+
+    const dim3 blocks(minibatch, numFeatures);
+
+    auto output = at::zeros_like(input);
+    auto stream = at::cuda::getCurrentCUDAStream();
+"""
+
+    sequence_if = """
+    if (sequenceLength <= {seq}) {{
+        switch(filterSize) {{
+"""
+
+    case_k = """
+            case {k}:
+"""
+
+    main_block = """
+                if (padding_l == {pad}) {{
+                    AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "lightconv_forward", ([&] {{
+                        lightconv_forward_kernel<{k}, {b_size}, {pad}, scalar_t>
+                        <<<blocks, {b_size}, 0, stream>>>(
+                                input.data<scalar_t>(),
+                                filters.data<scalar_t>(),
+                                minibatch,
+                                sequenceLength,
+                                numFeatures,
+                                numFiltersInBlock,
+                                output.data<scalar_t>());
+                    }}));
+                }} else
+"""
+
+    bad_padding = """
+                {
+                    std::cout << "WARNING: Unsupported padding size - skipping forward pass" << std::endl;
+                }
+                break;
+"""
+
+    bad_filter = """
+            default:
+                std::cout << "WARNING: Unsupported filter length passed - skipping forward pass" << std::endl;
+        }
+"""
+
+    con_else = """
+    } else
+"""
+
+    final_else = """
+    {
+        switch(filterSize) {
+"""
+
+    final_return = """
+    }
+
+    return {output};
+}
+"""
+
+    with open("lightconv_cuda_forward.cu", 'w') as forward:
+        forward.write(head)
+        for seq in seqs:
+            forward.write(sequence_if.format(seq=seq))
+            for k in kernels:
+                forward.write(case_k.format(k=k))
+                for pad in [k // 2, k - 1]:
+                    forward.write(main_block.format(k=k, b_size=seq, pad=pad))
+                forward.write(bad_padding)
+            forward.write(bad_filter)
+            forward.write(con_else)
+
+        forward.write(final_else)
+        for k in kernels:
+            forward.write(case_k.format(k=k))
+            for pad in [k // 2, k - 1]:
+                forward.write(main_block.format(k=k, b_size=seq, pad=pad))
+            forward.write(bad_padding)
+        forward.write(bad_filter)
+        forward.write(final_return)
+
+
+def gen_backward():
+
+    head = """
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include "lightconv_cuda.cuh"
+
+std::vector<at::Tensor> lightconv_cuda_backward(
+        at::Tensor gradOutput,
+        int padding_l,
+        at::Tensor input,
+        at::Tensor filters) {
+
+    // gradWrtInput
+    const int minibatch = input.size(0);
+    const int numFeatures = input.size(1);
+    const int sequenceLength = input.size(2);
+
+    const int numHeads = filters.size(0);
+    const int filterSize = filters.size(1);
+
+    const dim3 gradBlocks(minibatch, numFeatures);
+    const dim3 weightGradFirstpassShortBlocks(minibatch, numHeads);
+    const dim3 weightGradSecondpassBlocks(numHeads, filterSize);
+
+    const int numFiltersInBlock = numFeatures / numHeads;
+
+    auto gradInput = at::zeros_like(input);
+    auto gradFilters = at::zeros_like(filters);
+
+    at::DeviceGuard g(input.device());
+    auto stream = at::cuda::getCurrentCUDAStream();
+
+    switch(filterSize) {
+"""
+
+    sequence_if = """
+            if (sequenceLength <= {seq}) {{
+"""
+
+    case_k = """
+        case {k}:
+"""
+
+    main_block = """
+                if (padding_l == {p}) {{
+                    AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "lightconv_backward", ([&] {{
+                        lightconv_grad_wrt_input_kernel<{k}, {b_size}, {p}, scalar_t>
+                        <<<gradBlocks, {b_size}, 0, stream>>>(
+                                gradOutput.data<scalar_t>(),
+                                filters.data<scalar_t>(),
+                                minibatch,
+                                sequenceLength,
+                                numFeatures,
+                                numFiltersInBlock,
+                                gradInput.data<scalar_t>());
+
+"""
+
+    weight_grad_short = """
+                        at::Tensor tempSumGradFilters = at::zeros({{minibatch, numHeads, filterSize}}, input.options().dtype(at::kFloat));
+                        lightconv_grad_wrt_weights_firstpass_short_kernel<{k}, {b_size}, {p}, scalar_t>
+                        <<<weightGradFirstpassShortBlocks, {b_size}, 0, stream>>>(
+                                input.data<scalar_t>(),
+                                gradOutput.data<scalar_t>(),
+                                minibatch,
+                                sequenceLength,
+                                numFeatures,
+                                numFiltersInBlock,
+                                numHeads,
+                                tempSumGradFilters.data<float>()
+                        );
+
+                        lightconv_grad_wrt_weights_secondpass_short_kernel<{k}, {b_size}, scalar_t>
+                        <<<weightGradSecondpassBlocks, {b_size}, 0, stream>>>(
+                                tempSumGradFilters.data<float>(),
+                                minibatch,
+                                numFiltersInBlock,
+                                gradFilters.data<scalar_t>()
+                        );
+                    }}));
+                }} else
+"""
+
+    weight_grad = """
+                        at::Tensor tempSumGradFilters = at::zeros({{minibatch, numFeatures, filterSize}}, input.options().dtype(at::kFloat));
+                        lightconv_grad_wrt_weights_firstpass_kernel<{k}, {b_size}, {p}, scalar_t>
+                        <<<gradBlocks, {b_size}, 0, stream>>>(
+                                input.data<scalar_t>(),
+                                gradOutput.data<scalar_t>(),
+                                minibatch,
+                                sequenceLength,
+                                numFeatures,
+                                numFiltersInBlock,
+                                tempSumGradFilters.data<float>()
+                        );
+
+                        lightconv_grad_wrt_weights_secondpass_kernel<{k}, {b_size}, scalar_t>
+                        <<<weightGradSecondpassBlocks, {b_size}, 0, stream>>>(
+                                tempSumGradFilters.data<float>(),
+                                minibatch,
+                                numFiltersInBlock,
+                                gradFilters.data<scalar_t>()
+                        );
+                    }}));
+                }} else
+"""
+
+    bad_padding = """
+                {
+                    std::cout << "WARNING: Unsupported padding size - skipping backward pass" << std::endl;
+                }
+"""
+
+    breakout = """
+                break;
+"""
+
+    bad_filter = """
+        default:
+            std::cout << "WARNING: Unsupported filter length passed - skipping backward pass" << std::endl;
+"""
+
+    con_else = """
+            } else
+"""
+
+    final_else = """
+    {
+        switch(filterSize) {
+"""
+
+    last_return = """
+    }
+    return {gradInput, gradFilters};
+}
+"""
+
+    kernels = [3, 5, 7, 15, 31, 63, 127, 255]
+    seqs = [32 * x for x in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]]
+    thresh = [32, 32, 64, 128, 256, -1, -1, -1]
+    max_mem = [-1, -1, -1, -1, -1, 192, 96, 64]
+
+    with open("lightconv_cuda_backward.cu", 'w') as backward:
+        backward.write(head)
+        for (k, t, mem) in zip(kernels, thresh, max_mem):
+            backward.write(case_k.format(k=k))
+            for seq in seqs:
+                if (t == -1 or seq <= t) and (mem == -1 or seq < mem):
+                    backward.write(sequence_if.format(seq=seq))
+                    for p in [k // 2, k - 1]:
+                        backward.write(main_block.format(k=k, b_size=seq, p=p))
+                        backward.write(weight_grad_short.format(k=k, b_size=seq, p=p))
+                    backward.write(bad_padding)
+                else:
+                    for p in [k // 2, k - 1]:
+                        backward.write(main_block.format(k=k, b_size=32, p=p))
+                        backward.write(weight_grad.format(k=k, b_size=32, p=p))
+                    backward.write(bad_padding)
+                    backward.write(breakout)
+                    break
+                backward.write(con_else)
+        backward.write(bad_filter)
+        backward.write(last_return)
+
+
+if __name__ == "__main__":
+    gen_forward()
+    gen_backward()
diff --git a/fairseq/modules/lightconv_layer/lightconv_cuda.cpp b/fairseq/modules/lightconv_layer/lightconv_cuda.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4bf6b5ad365d604bd91eda384bb422857b640744
--- /dev/null
+++ b/fairseq/modules/lightconv_layer/lightconv_cuda.cpp
@@ -0,0 +1,54 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <torch/extension.h>
+#include <vector>
+
+std::vector<at::Tensor> lightconv_cuda_forward(
+    at::Tensor input,
+    at::Tensor filters,
+    int padding_l);
+
+std::vector<at::Tensor> lightconv_cuda_backward(
+    at::Tensor gradOutput,
+    int padding_l,
+    at::Tensor input,
+    at::Tensor filters);
+
+
+#define CHECK_CUDA(x) AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor")
+#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
+#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
+
+std::vector<at::Tensor> lightconv_forward(
+    at::Tensor input,
+    at::Tensor filters,
+    int padding_l) {
+    // Check that both tensors are CUDA and contiguous, then hand off to the CUDA implementation.
+    CHECK_INPUT(input);
+    CHECK_INPUT(filters);
+
+    return lightconv_cuda_forward(input, filters, padding_l);
+}
+
+std::vector<at::Tensor> lightconv_backward(
+    at::Tensor gradOutput,
+    int padding_l,
+    at::Tensor input,
+    at::Tensor filters) {
+    // Check that all three tensors are CUDA and contiguous, then hand off to the CUDA implementation.
+    CHECK_INPUT(gradOutput);
+    CHECK_INPUT(input);
+    CHECK_INPUT(filters);
+
+    return lightconv_cuda_backward(gradOutput, padding_l, input, filters);
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {  // Python-visible entry points: forward / backward
+    m.def("forward", &lightconv_forward, "lightconv forward (CUDA)");
+    m.def("backward", &lightconv_backward, "lightconv backward (CUDA)");
+}
diff --git a/fairseq/modules/lightconv_layer/lightconv_cuda.cuh b/fairseq/modules/lightconv_layer/lightconv_cuda.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..3cae57b68fc96872a5047a7a0d081b78456e8fae
--- /dev/null
+++ b/fairseq/modules/lightconv_layer/lightconv_cuda.cuh
@@ -0,0 +1,83 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ * 
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <ATen/ATen.h>
+#include <c10/cuda/CUDAStream.h>
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <algorithm>
+#include <functional>
+#include <iostream>
+#include <stdexcept>
+#include <utility>
+#include <vector>
+
+#include <stdlib.h>
+#include <assert.h>
+
+#define SHFL_MASK 0xffffffff
+
+template<int FS, int SB, int padding_l, typename scalar_t>
+__global__
+void lightconv_forward_kernel(const scalar_t* input,
+                              const scalar_t* filters,
+                              int minibatch, int sequenceLength,
+                              int numFeatures, int numFiltersInBlock,
+                              scalar_t* output);
+
+template<int FS, int SB, int padding_l, typename scalar_t>
+__global__
+void lightconv_grad_wrt_input_kernel(
+    const scalar_t* input, 
+    const scalar_t* filters,
+    int minibatch,
+    int sequenceLength,
+    int numFeatures,
+    int numFiltersInBlock,
+    scalar_t* output);
+
+template<int FS, int SB, int padding_l, typename scalar_t>
+__global__
+void lightconv_grad_wrt_weights_firstpass_short_kernel(
+    const scalar_t* input,
+    const scalar_t* gradInput,
+    int minibatch,
+    int sequenceLength,
+    int numFeatures,
+    int numFiltersInBlock,
+    int numHeads,
+    float* output);
+
+template<int FS, int SB, typename scalar_t>
+__global__
+void lightconv_grad_wrt_weights_secondpass_short_kernel(
+    const float* input,
+    const int minibatch, 
+    const int numFiltersInBlock,
+    scalar_t* output);
+
+template<int FS, int SB, int padding_l, typename scalar_t>
+__global__
+void lightconv_grad_wrt_weights_firstpass_kernel(
+    const scalar_t* input,
+    const scalar_t* gradInput,
+    int minibatch,
+    int sequenceLength,
+    int numFeatures,
+    int numFiltersInBlock,
+    float* output);
+
+template<int FS, int SB, typename scalar_t>
+__global__
+void lightconv_grad_wrt_weights_secondpass_kernel(
+    const float* input,
+    const int minibatch, 
+    const int numFiltersInBlock,
+    scalar_t* output);
+
diff --git a/fairseq/modules/lightconv_layer/lightconv_cuda_kernel.cu b/fairseq/modules/lightconv_layer/lightconv_cuda_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..8ee83a56c89754c2abbe717b269d07ca9e64eef2
--- /dev/null
+++ b/fairseq/modules/lightconv_layer/lightconv_cuda_kernel.cu
@@ -0,0 +1,375 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ * 
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include "lightconv_cuda.cuh"
+#include "lightconv_cuda_forward.cu"
+#include "lightconv_cuda_backward.cu"
+#include "../cuda_utils.cu"
+
+template<int FS, int SB, int padding_l, typename scalar_t>
+__global__
+void lightconv_forward_kernel(const scalar_t* input,
+                              const scalar_t* filters,
+                              int minibatch, int sequenceLength,
+                              int numFeatures, int numFiltersInBlock,
+                              scalar_t* output) {
+  // One block per (batch, feature) row; thread tid produces output position i * SB + tid of each tile.
+  const int tid = threadIdx.x;
+  const int batchIdx = blockIdx.x;
+  const int featureIdx = blockIdx.y;
+  const int filterIdx = featureIdx / numFiltersInBlock;
+  // Offsets index input/output as [batch][feature][position]; minibatch is not read in this kernel.
+  const int IOOffset = numFeatures * sequenceLength * batchIdx + featureIdx * sequenceLength;
+  const scalar_t* inputFeature = &input[IOOffset];
+  scalar_t* outputFeature = &output[IOOffset];
+  const scalar_t* inputFilter = &filters[filterIdx * FS];
+
+  assert(blockDim.x == SB);
+  // Per-thread copy of the FS filter taps shared by numFiltersInBlock consecutive features
+  scalar_t filter[FS];
+  #pragma unroll
+  for (int i = 0; i < FS; ++i) {
+    filter[i] = inputFilter[i];
+  }
+
+  __shared__ scalar_t temp[SB + FS];
+  zeroSharedMem<FS, SB, padding_l>(temp);
+  // NOTE(review): lightconv_grad_wrt_input_kernel calls __syncthreads() after zeroing; this kernel does not - confirm the helpers make that safe
+  const int numIterations = divUp<int, int>(sequenceLength, SB);
+
+  for (int i = 0; i < numIterations; ++i) {
+    // Read input into shared memory
+    const int inputOffset = i * SB;
+
+    load_input_to_shared<FS, SB, padding_l>(inputFeature, inputOffset, sequenceLength,
+                                 i, numIterations, (numIterations == 1), temp);
+
+    __syncthreads();
+    // Dot product of the FS taps with the window that starts at temp[tid]
+    scalar_t out = 0;
+    #pragma unroll
+    for (int j = 0; j < FS; ++j) {
+      out += filter[j] * temp[tid + j];
+    }
+
+    // Write output
+    const int outputOffset = inputOffset;
+    if ((outputOffset + tid) < sequenceLength) {
+      outputFeature[outputOffset + tid] = out;
+    }
+    // Barrier so every thread has finished reading temp before it is passed to the next load
+    __syncthreads();
+  }
+}
+
+template<int FS, int SB, int padding_l, typename scalar_t>
+__global__
+void lightconv_grad_wrt_input_kernel(
+    const scalar_t* input,
+    const scalar_t* filters,
+    int minibatch,
+    int sequenceLength,
+    int numFeatures,
+    int numFiltersInBlock,
+    scalar_t* output) {
+  // NOTE(review): `input` is presumably the gradient w.r.t. the forward output - confirm at the call site
+  // input grad kernel is similar to forward kernel
+  const int tid = threadIdx.x;
+  const int batchIdx = blockIdx.x;
+  const int featureIdx = blockIdx.y;
+  const int filterIdx = featureIdx / numFiltersInBlock;
+  // Offsets index input/output as [batch][feature][position]; minibatch is not read in this kernel.
+  const int IOOffset = numFeatures * sequenceLength * batchIdx + featureIdx * sequenceLength;
+  const scalar_t* inputFeature = &input[IOOffset];
+  scalar_t* outputFeature = &output[IOOffset];
+  const scalar_t* inputFilter = &filters[filterIdx * FS];
+
+  assert(blockDim.x == SB);
+
+  scalar_t filter[FS];
+  // Differences from the forward kernel: the taps are loaded in reverse, the padding is
+  // mirrored (FS - padding_l - 1), and `false` is passed where forward passes (numIterations == 1)
+  #pragma unroll
+  for (int i = 0; i < FS; ++i) {
+    filter[i] = inputFilter[FS - i - 1];
+  }
+
+  __shared__ scalar_t temp[SB + FS];
+  const int padding = FS - padding_l - 1;
+  zeroSharedMem<FS, SB, padding>(temp);
+
+  __syncthreads();
+
+  const int numIterations = divUp<int, int>(sequenceLength, SB);
+
+  for (int i = 0; i < numIterations; ++i) {
+    // Read input into shared memory
+    const int inputOffset = i * SB;
+
+    load_input_to_shared<FS, SB, padding>(inputFeature, inputOffset, sequenceLength,
+                                 i, numIterations, false, temp);
+
+    __syncthreads();
+    // Dot product of the reversed taps with the window that starts at temp[tid]
+    scalar_t out = 0;
+    #pragma unroll
+    for (int j = 0; j < FS; ++j) {
+      out += filter[j] * temp[tid + j];
+    }
+
+    // Write output
+    const int outputOffset = inputOffset;
+    if ((outputOffset + tid) < sequenceLength) {
+      outputFeature[outputOffset + tid] = out;
+    }
+    // Barrier so every thread has finished reading temp before it is passed to the next load
+    __syncthreads();
+  }
+}
+
+// This is by far the most expensive kernel in terms of time taken.
+// Can be 16x slower than the forward or grad_wrt_input when filter size is 31
+template<int FS, int SB, int padding_l, typename scalar_t>
+__global__
+void lightconv_grad_wrt_weights_firstpass_short_kernel(
+    const scalar_t* input,
+    const scalar_t* gradInput,
+    int minibatch,
+    int sequenceLength,
+    int numFeatures,
+    int numFiltersInBlock,
+    int numHeads,
+    float* output) {
+  // One block per (batch, filter): writes FS partial weight gradients for that pair. numHeads is not read here.
+  const int tid = threadIdx.x;
+  const int batchIdx = blockIdx.x;
+  const int filterIdx = blockIdx.y;
+
+  const int numIterations = divUp<int, int>(sequenceLength, SB);
+  // output is indexed as [filter][filterWeight][batch]
+  float* tempOutputGradWeight = &output[filterIdx * FS * minibatch];
+
+  assert(blockDim.x == SB);
+
+  __shared__ scalar_t tempInput[SB + FS];
+  __shared__ scalar_t tempGradInput[SB + FS];
+
+  // local weight accumulation
+  float accumWeights[FS];
+
+  // Initialize memory
+  for (int i = 0; i < FS; ++i) {
+    accumWeights[i] = float(0.0);
+  }
+
+
+  // loop over each sequence within filterblock
+  for (int idxInFilterBlock = 0; idxInFilterBlock < numFiltersInBlock; ++idxInFilterBlock) {
+    // input and gradInput share the same [batch][feature][position] offset
+    const int featureOffset = batchIdx * numFeatures * sequenceLength + (filterIdx * numFiltersInBlock + idxInFilterBlock) * sequenceLength;
+    const scalar_t* inputFeature = &input[featureOffset];
+    const scalar_t* gradInputFeature = &gradInput[featureOffset];
+
+    zeroSharedMem<FS, SB, padding_l>(tempInput);
+    zeroSharedMem<FS, SB, (FS/2)>(tempGradInput);
+    __syncthreads();
+
+    for (int i = 0; i < numIterations; ++i) {
+
+      const int inputOffset = i * SB;
+
+      load_input_to_shared<FS, SB, padding_l>(inputFeature, inputOffset, sequenceLength,
+                                    i, numIterations, false, tempInput);
+      load_input_to_shared<FS, SB, (FS/2)>(gradInputFeature, inputOffset, sequenceLength,
+                                    i, numIterations, false, tempGradInput);
+
+      __syncthreads();
+      // The gradient tile is loaded with padding FS/2, so this thread's element sits at FS/2 + tid
+      const int gradIndex = (FS/2) + tid;
+      scalar_t tempGrad = tempGradInput[gradIndex];
+
+      #pragma unroll
+      for (int j = 0; j < FS; j++) {
+        const int inputIndex = tid + j;
+        accumWeights[j] += tempInput[inputIndex] * tempGrad;
+      }
+
+      __syncthreads();
+
+    }
+
+  }
+
+  // Row-major sum
+  for (int filterWeightIdx = 0; filterWeightIdx < FS; ++filterWeightIdx) {
+    // threads with tid >= sequenceLength contribute zero to the reduction
+    float temp;
+    if (tid < sequenceLength) {
+        temp = accumWeights[filterWeightIdx];
+    } else {
+        temp = float(0.0);
+    }
+
+    const int outputOffset = filterWeightIdx * minibatch + batchIdx;
+    // blockReduce is defined elsewhere; presumably it combines temp across the block - TODO confirm
+    temp = blockReduce(temp);
+
+    if (tid == 0) {
+      tempOutputGradWeight[outputOffset] = temp;
+    }
+  }
+}
+
+template<int FS, int SB, typename scalar_t>
+__global__
+void lightconv_grad_wrt_weights_secondpass_short_kernel(
+    const float* input,
+    const int minibatch,
+    const int numFiltersInBlock,
+    scalar_t* output) {
+  // One block per (filter, filterWeight): folds `minibatch` float partials into one output value.
+  assert(blockDim.x == SB);
+
+  const int tid = threadIdx.x;
+
+  const int filterIdx = blockIdx.x;
+  const int filterWeightIdx = blockIdx.y;
+  // input is indexed as [filter][filterWeight][batch]; numFiltersInBlock is not read in this kernel
+  const int inputOffset = filterIdx * FS * minibatch +
+                          filterWeightIdx * minibatch;
+  const float* tempInput = &input[inputOffset];
+
+  // each thread sums the entries tid, tid + SB, tid + 2*SB, ... into a local float
+  int readIndex = tid;
+
+  float sum = 0.0;
+  while (readIndex < minibatch) {
+    sum += tempInput[readIndex];
+    readIndex += SB;
+  }
+  // blockReduce is defined elsewhere; presumably it combines sum across the block - TODO confirm
+  float temp = blockReduce(sum);
+
+  if (tid == 0) {
+    output[blockIdx.x * FS + blockIdx.y] = temp;
+  }
+}
+
+// This is by far the most expensive kernel in terms of time taken.
+// Can be 16x slower than the forward or grad_wrt_input when filter size is 31
+template<int FS, int SB, int padding_l, typename scalar_t>
+__global__
+void lightconv_grad_wrt_weights_firstpass_kernel(
+    const scalar_t* input,
+    const scalar_t* gradInput,
+    int minibatch,
+    int sequenceLength,
+    int numFeatures,
+    int numFiltersInBlock,
+    float* output) {
+  // One block per (batch, feature): writes FS partial weight gradients for that pair.
+  assert(blockDim.x == SB);
+
+  const int tid = threadIdx.x;
+  const int batchIdx = blockIdx.x;
+  const int featureIdx = blockIdx.y;
+  const int filterIdx = featureIdx / numFiltersInBlock;
+  const int idxInFilterBlock = featureIdx % numFiltersInBlock;
+
+  const int numIterations = divUp<int, int>(sequenceLength, SB);
+
+  float temp;
+
+  __shared__ scalar_t tempInput[SB + FS];
+  __shared__ scalar_t tempGradInput[SB + FS];
+  zeroSharedMem<FS, SB, padding_l>(tempInput);
+  zeroSharedMem<FS, SB, (FS/2)>(tempGradInput);
+  __syncthreads();
+
+  float accumWeights[FS];
+
+  for (int i = 0; i < FS; ++i) {
+    accumWeights[i] = float(0.0);
+  }
+  // input and gradInput share the same [batch][feature][position] offset
+  const int IOOffset = batchIdx * numFeatures * sequenceLength + featureIdx * sequenceLength;
+  const scalar_t* inputFeature = &input[IOOffset];
+  const scalar_t* gradInputFeature = &gradInput[IOOffset];
+  float* tempOutputGradWeight = &output[filterIdx * FS * minibatch * numFiltersInBlock];
+
+  for (int i = 0; i < numIterations; ++i) {
+    const int inputOffset = i * SB;
+
+    load_input_to_shared<FS, SB, padding_l>(inputFeature, inputOffset, sequenceLength,
+                                 i, numIterations, false, tempInput);
+    load_input_to_shared<FS, SB, (FS/2)>(gradInputFeature, inputOffset, sequenceLength,
+                                 i, numIterations, false, tempGradInput);
+    __syncthreads();
+    // The gradient tile is loaded with padding FS/2, so this thread's element sits at tid + FS/2
+    #pragma unroll
+    for (int j = 0; j < FS; ++j) {
+      accumWeights[j] += tempInput[tid + j] * tempGradInput[tid + (FS/2)];
+    }
+
+    __syncthreads();
+  }
+
+  // Row-major sum
+  for (int filterWeightIdx = 0; filterWeightIdx < FS; ++filterWeightIdx) {
+
+    // threads with tid >= sequenceLength contribute zero to the reduction
+    if (tid < sequenceLength) {
+      temp = accumWeights[filterWeightIdx];
+    } else {
+      temp = float(0.0);
+    }
+    // blockReduce is defined elsewhere; presumably it combines temp across the block - TODO confirm
+    temp = blockReduce(temp);
+    // within this filter's slice, output is indexed as [filterWeight][batch][idxInFilterBlock]
+    const int outputOffset = filterWeightIdx * minibatch * numFiltersInBlock +
+                             batchIdx * numFiltersInBlock +
+                             idxInFilterBlock;
+
+    if (tid == 0) {
+      tempOutputGradWeight[outputOffset] = temp;
+    }
+  }
+}
+
+template<int FS, int SB, typename scalar_t>
+__global__
+void lightconv_grad_wrt_weights_secondpass_kernel(
+    const float* input,
+    const int minibatch,
+    const int numFiltersInBlock,
+    scalar_t* output) {
+  // One block per (filter, filterWeight): folds minibatch * numFiltersInBlock float partials into one output value.
+  assert(blockDim.x == SB);
+  const int tid = threadIdx.x;
+
+  // What is the id within a minibatch
+  const int filterIdx = blockIdx.x;
+  const int filterWeightIdx = blockIdx.y;
+  // each (filter, filterWeight) pair owns a contiguous run of minibatch * numFiltersInBlock floats
+  const int inputOffset = filterIdx * FS * minibatch * numFiltersInBlock +
+                          filterWeightIdx * minibatch * numFiltersInBlock;
+  const float* tempInput = &input[inputOffset];
+  // each thread sums the entries tid, tid + SB, tid + 2*SB, ... into a local float
+  int readIndex = tid;
+
+  float sum = float(0.0);
+  while (readIndex < (minibatch * numFiltersInBlock)) {
+    sum += tempInput[readIndex];
+    readIndex += SB;
+  }
+  // blockReduce is defined elsewhere; presumably it combines sum across the block - TODO confirm
+  float temp = blockReduce(sum);
+
+  if (tid == 0) {
+    output[blockIdx.x * FS + blockIdx.y] = temp;
+  }
+}
diff --git a/fairseq/modules/lightconv_layer/lightconv_layer.py b/fairseq/modules/lightconv_layer/lightconv_layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..9b4c9a951eadb628619a68957258586472d208f4
--- /dev/null
+++ b/fairseq/modules/lightconv_layer/lightconv_layer.py
@@ -0,0 +1,129 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from torch import nn
+from torch.autograd import Function
+import torch.nn.functional as F
+
+import lightconv_cuda
+from fairseq import utils
+from fairseq.incremental_decoding_utils import with_incremental_state
+from fairseq.modules.fairseq_dropout import FairseqDropout
+
+
+class lightconvFunction(Function):
+
+    @staticmethod
+    def forward(ctx, x, weights, padding_l):
+        ctx.padding_l = padding_l
+        outputs = lightconv_cuda.forward(x, weights, padding_l)
+        variables = [x, weights]
+        ctx.save_for_backward(*variables)
+        return outputs[0]
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        outputs = lightconv_cuda.backward(
+                grad_output.contiguous(),
+                ctx.padding_l,
+                *ctx.saved_tensors)
+        grad_input, grad_weights = outputs
+        return grad_input, grad_weights, None
+
+
+@with_incremental_state
+class LightconvLayer(nn.Module):
+    def __init__(
+            self,
+            input_size,
+            kernel_size=1,
+            padding_l=None,
+            weight_softmax=False,
+            num_heads=1,
+            weight_dropout=0.,
+            bias=False,
+    ):
+        super(LightconvLayer, self).__init__()
+        self.input_size = input_size
+        self.kernel_size = kernel_size
+        self.padding_l = padding_l
+        self.num_heads = num_heads
+        self.weight_softmax = weight_softmax
+        self.weight_dropout_module = FairseqDropout(weight_dropout, module_name=self.__class__.__name__)
+
+        self.weight = nn.Parameter(torch.Tensor(num_heads, kernel_size))
+        if bias:
+            self.bias = nn.Parameter(torch.Tensor(input_size))
+        else:
+            self.bias = None
+        self.reset_parameters()
+
+    def upgrade_state_dict_named(self, state_dict, name):
+        prefix = name + '.' if name != '' else ''
+        for k, v in state_dict.items():
+            if k.endswith(prefix + 'weight'):
+                if v.dim() == 3 and v.size(1) == 1:
+                    state_dict[k] = v.squeeze(1)
+
+    def reset_parameters(self):
+        nn.init.xavier_uniform_(self.weight)
+        if self.bias is not None:
+            nn.init.constant_(self.bias, 0.)
+
+    def forward(self, x, incremental_state=None):
+
+        # during inference time, incremental BMM is faster
+        if incremental_state is not None:
+            T, B, C = x.size()
+            K, H = self.kernel_size, self.num_heads
+            R = C // H
+            input_buffer = self._get_input_buffer(incremental_state)
+            if input_buffer is None:
+                input_buffer = x.new()
+            x_unfold = torch.cat([input_buffer, x.unsqueeze(3)], dim=3)
+            if self.kernel_size > 1:
+                self._set_input_buffer(incremental_state, x_unfold[:, :, :, -self.kernel_size+1:])
+            x_unfold = x_unfold.view(T*B*H, R, -1)
+
+            weight = self.weight
+            if self.weight_softmax:
+                weight = F.softmax(weight.float(), dim=1).type_as(weight)
+
+            weight = weight[:, -x_unfold.size(2):]
+
+            K = weight.size(1)
+
+            weight = weight.view(1, H, K).expand(T*B, H, K).contiguous().view(T*B*H, K, 1)
+
+            weight = self.weight_dropout_module(weight)
+            output = torch.bmm(x_unfold, weight)  # T*B*H x R x 1
+            output = output.view(T, B, C)
+            return output
+
+        # during training time, use CUDA kernel
+        else:
+            x = x.permute(1, 2, 0).contiguous()
+            weight = self.weight
+            if self.weight_softmax:
+                weight = F.softmax(self.weight, -1)
+            if self.weight_dropout_module.p:
+                weight = self.weight_dropout_module(weight)
+            return lightconvFunction.apply(x, weight, self.padding_l).permute(2, 0, 1)
+
+    def reorder_incremental_state(self, incremental_state, new_order):
+        input_buffer = self._get_input_buffer(incremental_state)
+        if input_buffer is not None:
+            input_buffer = input_buffer.index_select(1, new_order)
+            self._set_input_buffer(incremental_state, input_buffer)
+
+    def _get_input_buffer(self, incremental_state):
+        return utils.get_incremental_state(self, incremental_state, 'input_buffer')
+
+    def _set_input_buffer(self, incremental_state, new_buffer):
+        return utils.set_incremental_state(self, incremental_state, 'input_buffer', new_buffer)
+
+    def half(self):
+        return self._apply(lambda t: t.half() if t.is_floating_point() else t)
diff --git a/fairseq/modules/lightconv_layer/setup.py b/fairseq/modules/lightconv_layer/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..0eac1df03cd1eeee8bea56f110c6b9f2f97c0dc5
--- /dev/null
+++ b/fairseq/modules/lightconv_layer/setup.py
@@ -0,0 +1,20 @@
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from setuptools import setup
+from torch.utils.cpp_extension import CUDAExtension, BuildExtension
+
+setup(
+    name='lightconv_layer',
+    ext_modules=[
+        CUDAExtension('lightconv_cuda', [  # module name that lightconv_layer.py imports
+            'lightconv_cuda.cpp',  # pybind11 bindings and input checks
+            'lightconv_cuda_kernel.cu',  # #includes lightconv_cuda_forward.cu and lightconv_cuda_backward.cu
+        ]),
+    ],
+    cmdclass={
+        'build_ext': BuildExtension
+    })
diff --git a/fairseq/modules/lightweight_convolution.py b/fairseq/modules/lightweight_convolution.py
new file mode 100644
index 0000000000000000000000000000000000000000..3d4cddb134c0367770380977952d1617efeccf06
--- /dev/null
+++ b/fairseq/modules/lightweight_convolution.py
@@ -0,0 +1,256 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from fairseq import utils
+from fairseq.modules.unfold import unfold1d
+from fairseq.incremental_decoding_utils import with_incremental_state
+from fairseq.modules.fairseq_dropout import FairseqDropout
+
+
+def LightweightConv(input_size, kernel_size=1, padding_l=None, num_heads=1,
+                    weight_dropout=0., weight_softmax=False, bias=False):
+    if torch.cuda.is_available():
+        try:
+            from fairseq.modules.lightconv_layer import LightconvLayer
+            return LightconvLayer(input_size, kernel_size=kernel_size,
+                                  padding_l=padding_l, num_heads=num_heads,
+                                  weight_dropout=weight_dropout,
+                                  weight_softmax=weight_softmax, bias=bias)
+        except ImportError as e:
+            print(e)
+    return LightweightConv1dTBC(input_size, kernel_size=kernel_size,
+                                padding_l=padding_l, num_heads=num_heads,
+                                weight_dropout=weight_dropout,
+                                weight_softmax=weight_softmax, bias=bias)
+
+
+class LightweightConv1d(nn.Module):
+    '''Lightweight Convolution assuming the input is BxCxT
+    This is just an example that explains LightConv clearer than the TBC version.
+    We don't use this module in the model.
+
+    Args:
+        input_size: # of channels of the input and output
+        kernel_size: width of the convolution kernel (last dimension of the weight)
+        padding: padding argument passed to `F.conv1d`
+        num_heads: number of heads used; the weight is `(num_heads, 1, kernel_size)`
+        weight_softmax: normalize the weight with softmax before the convolution
+        bias, weight_dropout: add a per-channel bias / drop rate applied to the weight
+
+    Shape:
+        Input: BxCxT, i.e. (batch_size, input_size, timesteps)
+        Output: BxCxT, i.e. (batch_size, input_size, timesteps)
+
+    Attributes:
+        weight: the learnable weights of the module of shape
+            `(num_heads, 1, kernel_size)`
+        bias: the learnable bias of the module of shape `(input_size)`
+    '''
+
+    def __init__(self, input_size, kernel_size=1, padding=0, num_heads=1,
+                 weight_softmax=False, bias=False, weight_dropout=0.):
+        super().__init__()
+        self.input_size = input_size
+        self.kernel_size = kernel_size
+        self.num_heads = num_heads
+        self.padding = padding
+        self.weight_softmax = weight_softmax
+        self.weight = nn.Parameter(torch.Tensor(num_heads, 1, kernel_size))
+
+        if bias:
+            self.bias = nn.Parameter(torch.Tensor(input_size))
+        else:
+            self.bias = None
+        self.weight_dropout_module = FairseqDropout(weight_dropout, module_name=self.__class__.__name__)
+        self.reset_parameters()
+
+    def reset_parameters(self):  # Xavier-uniform weight, zero bias
+        nn.init.xavier_uniform_(self.weight)
+        if self.bias is not None:
+            nn.init.constant_(self.bias, 0.)
+
+    def forward(self, input):
+        '''
+        input size: B x C x T
+        output size: B x C x T
+        '''
+        B, C, T = input.size()
+        H = self.num_heads
+
+        weight = self.weight
+        if self.weight_softmax:
+            weight = F.softmax(weight, dim=-1)
+
+        weight = self.weight_dropout_module(weight)
+        # Merge every C/H entries into the batch dimension (C = self.input_size)
+        # B x C x T -> (B * C/H) x H x T
+        # One can also expand the weight to C x 1 x K by a factor of C/H
+        # and do not reshape the input instead, which is slow though
+        input = input.view(-1, H, T)
+        output = F.conv1d(input, weight, padding=self.padding, groups=self.num_heads)  # one group per head
+        output = output.view(B, C, T)  # requires the convolution to keep the length T
+        if self.bias is not None:
+            output = output + self.bias.view(1, -1, 1)
+
+        return output
+
+
+@with_incremental_state
+class LightweightConv1dTBC(nn.Module):
+    '''Lightweight Convolution assuming the input is TxBxC
+    Args:
+        input_size: # of channels of the input
+        kernel_size: width of the convolution kernel (last dimension of the weight)
+        padding_l: padding to the left when using "same" padding
+        num_heads: number of heads used. The weight is of shape (num_heads, 1, kernel_size)
+        weight_dropout: the drop rate of the DropConnect to drop the weight
+        weight_softmax: normalize the weight with softmax before the convolution
+        bias: use bias
+
+    Shape:
+        Input: TxBxC, i.e. (timesteps, batch_size, input_size)
+        Output: TxBxC, i.e. (timesteps, batch_size, input_size)
+
+    Attributes:
+        weight: the learnable weights of the module of shape
+            `(num_heads, 1, kernel_size)`
+        bias:   the learnable bias of the module of shape `(input_size)`
+    '''
+    def __init__(self, input_size, kernel_size=1, padding_l=None, num_heads=1,
+                 weight_dropout=0., weight_softmax=False, bias=False):
+        super().__init__()
+        self.input_size = input_size
+        self.kernel_size = kernel_size
+        self.padding_l = padding_l
+        self.num_heads = num_heads
+        self.weight_dropout_module = FairseqDropout(weight_dropout, module_name=self.__class__.__name__)
+        self.weight_softmax = weight_softmax
+
+        self.weight = nn.Parameter(torch.Tensor(num_heads, 1, kernel_size))
+        if bias:
+            self.bias = nn.Parameter(torch.Tensor(input_size))
+        else:
+            self.bias = None
+
+        self.reset_parameters()
+        self.onnx_trace = False
+
+    def reset_parameters(self):  # Xavier-uniform weight, zero bias
+        nn.init.xavier_uniform_(self.weight)
+        if self.bias is not None:
+            nn.init.constant_(self.bias, 0.)
+
+    def forward(self, x, incremental_state=None, unfold=False):
+        '''Assuming the input, x, of the shape T x B x C and producing an output in the shape T x B x C
+        args:
+            x: Input of shape T x B x C, i.e. (timesteps, batch_size, input_size)
+            incremental_state: A dict to keep the state
+            unfold: unfold the input or not. If not, we use the matrix trick instead
+        '''
+        unfold = unfold or (incremental_state is not None)  # incremental decoding always takes the unfolded path
+
+        if unfold:
+            output = self._forward_unfolded(x, incremental_state)
+        else:
+            output = self._forward_expanded(x, incremental_state)
+
+        if self.bias is not None:
+            output = output + self.bias.view(1, 1, -1)
+        return output
+
+    def prepare_for_onnx_export_(self):  # makes the softmax calls below pass onnx_trace=True
+        self.onnx_trace = True
+
+    def _forward_unfolded(self, x, incremental_state):
+        '''The conventional implementation of convolutions.
+        Unfolding the input by having a window shifting to the right.'''
+        T, B, C = x.size()
+        K, H = self.kernel_size, self.num_heads
+        R = C // H
+        assert R * H == C == self.input_size
+
+        weight = self.weight.view(H, K)
+        if incremental_state is not None:
+            input_buffer = self._get_input_buffer(incremental_state)
+            if input_buffer is None:
+                input_buffer = x.new()
+            x_unfold = torch.cat([input_buffer, x.unsqueeze(3)], dim=3)
+            if self.kernel_size > 1:
+                self._set_input_buffer(incremental_state, x_unfold[:, :, :, -self.kernel_size+1:])  # keep the last K-1 steps
+            x_unfold = x_unfold.view(T*B*H, R, -1)
+        else:
+            # unfold the input: T x B x C --> T' x B x C x K
+            x_unfold = unfold1d(x, self.kernel_size, self.padding_l, 0)
+            x_unfold = x_unfold.view(T*B*H, R, K)
+
+        if self.weight_softmax:
+            weight = utils.softmax(weight, dim=1, onnx_trace=self.onnx_trace).type_as(weight)
+
+        if incremental_state is not None:
+            weight = weight[:, -x_unfold.size(2):]  # keep only as many trailing taps as there are buffered steps
+            K = weight.size(1)
+
+        weight = weight.view(1, H, K).expand(T*B, H, K).contiguous().view(T*B*H, K, 1)
+
+        weight = self.weight_dropout_module(weight)
+        output = torch.bmm(x_unfold, weight)  # T*B*H x R x 1
+        output = output.view(T, B, C)
+        return output
+
+    def _forward_expanded(self, x, incremental_state):
+        '''Turn the convolution filters into band matrices and do matrix multiplication.
+        This is faster when the sequence is short, but less memory efficient.
+        This is not used in the decoder during inference.
+        '''
+        T, B, C = x.size()
+        K, H = self.kernel_size, self.num_heads
+        R = C // H
+        assert R * H == C == self.input_size
+
+        weight = self.weight.view(H, K)
+        if self.weight_softmax:
+            weight = utils.softmax(weight, dim=1, onnx_trace=self.onnx_trace).type_as(weight)
+        weight = weight.view(1, H, K).expand(T*B, H, K).contiguous()
+        weight = weight.view(T, B*H, K).transpose(0, 1)
+
+        x = x.view(T, B*H, R).transpose(0, 1)
+        P = self.padding_l  # NOTE(review): the constructor default is None, but narrow() below needs an int - confirm callers always set it
+        if K > T and P == K-1:
+            weight = weight.narrow(2, K-T, T)  # kernel longer than the sequence: keep only the last T taps
+            K, P = T, T-1
+        # turn the convolution filters into band matrices
+        weight_expanded = weight.new_zeros(B*H, T, T+K-1, requires_grad=False)
+        weight_expanded.as_strided((B*H, T, K), (T*(T+K-1), T+K, 1)).copy_(weight)  # row stride T+K = row length + 1, so each row's taps start one column later
+        weight_expanded = weight_expanded.narrow(2, P, T)  # keep T columns starting at column P
+        weight_expanded = self.weight_dropout_module(weight_expanded)
+
+        output = torch.bmm(weight_expanded, x)
+        output = output.transpose(0, 1).contiguous().view(T, B, C)
+        return output
+
+    def reorder_incremental_state(self, incremental_state, new_order):  # reindex the cached buffer along dim 1
+        input_buffer = self._get_input_buffer(incremental_state)
+        if input_buffer is not None:
+            input_buffer = input_buffer.index_select(1, new_order)
+            self._set_input_buffer(incremental_state, input_buffer)
+
+    def _get_input_buffer(self, incremental_state):
+        return utils.get_incremental_state(self, incremental_state, 'input_buffer')
+
+    def _set_input_buffer(self, incremental_state, new_buffer):
+        return utils.set_incremental_state(self, incremental_state, 'input_buffer', new_buffer)
+
+    def extra_repr(self):  # weight_dropout is listed only when it is greater than zero
+        s = '{}, kernel_size={}, padding_l={}, num_heads={}, weight_softmax={}, bias={}'.format(
+            self.input_size, self.kernel_size, self.padding_l,
+            self.num_heads, self.weight_softmax, self.bias is not None
+        )
+        if self.weight_dropout_module.p > 0.:
+            s += ', weight_dropout={}'.format(self.weight_dropout_module.p)
+        return s
diff --git a/fairseq/modules/linearized_convolution.py b/fairseq/modules/linearized_convolution.py
new file mode 100644
index 0000000000000000000000000000000000000000..3dd4b151c14bf4cb8968567f2574026a033f2be3
--- /dev/null
+++ b/fairseq/modules/linearized_convolution.py
@@ -0,0 +1,100 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import torch.nn.functional as F
+
+from fairseq import utils
+from .conv_tbc import ConvTBC
+from fairseq.incremental_decoding_utils import with_incremental_state
+
+
+@with_incremental_state
+class LinearizedConvolution(ConvTBC):
+    """An optimized version of nn.Conv1d.
+
+    At training time, this module uses ConvTBC, which is an optimized version
+    of Conv1d. At inference time, it optimizes incremental generation (i.e.,
+    one time step at a time) by replacing the convolutions with linear layers.
+    Note that the input order changes from training to inference.
+    """
+
+    def __init__(self, in_channels, out_channels, kernel_size, **kwargs):
+        super().__init__(in_channels, out_channels, kernel_size, **kwargs)
+        self._linearized_weight = None  # lazily built cache, see _get_linearized_weight
+        self.register_backward_hook(self._clear_linearized_weight)  # the hook resets the cache to None
+
+    def state_dict(self, destination=None, prefix='', keep_vars=False):
+        state = ConvTBC.state_dict(self, destination, prefix, keep_vars=keep_vars)
+        # don't store redundant _linearized_weight in checkpoints
+        if prefix + '_linearized_weight' in state:
+            del state[prefix + '_linearized_weight']
+        return state
+
+    def upgrade_state_dict_named(self, state_dict, name):  # strip the cached weight from a loaded state dict
+        prefix = name + '.' if name != '' else ''
+        if prefix + '_linearized_weight' in state_dict:
+            del state_dict[prefix + '_linearized_weight']
+
+    def forward(self, input, incremental_state=None):
+        """
+        Args:
+            incremental_state: Used to buffer signal; if not None, then input is
+                expected to contain a single frame. If the input order changes
+                between time steps, call reorder_incremental_state.
+        Input:
+            Time x Batch x Channel during training
+            Batch x Time x Channel during inference
+        """
+        if incremental_state is None:  # full-sequence path: plain ConvTBC forward
+            output = super().forward(input)
+            if self.kernel_size[0] > 1 and self.padding[0] > 0:
+                # remove future timesteps added by padding
+                output = output[:-self.padding[0], :, :]
+            return output
+
+        # reshape weight
+        weight = self._get_linearized_weight()
+        kw = self.kernel_size[0]
+
+        bsz = input.size(0)  # input: bsz x len x dim
+        if kw > 1:
+            input = input.data  # use the underlying tensor, outside of autograd tracking
+            input_buffer = self._get_input_buffer(incremental_state)
+            if input_buffer is None:
+                input_buffer = input.new(bsz, kw, input.size(2)).zero_()  # first step: zero-filled window of kw frames
+                self._set_input_buffer(incremental_state, input_buffer)
+            else:
+                # shift buffer
+                input_buffer[:, :-1, :] = input_buffer[:, 1:, :].clone()
+            # append next input
+            input_buffer[:, -1, :] = input[:, -1, :]
+            input = input_buffer
+        with torch.no_grad():
+            output = F.linear(input.view(bsz, -1), weight, self.bias)  # the flattened window times the flattened kernel
+        return output.view(bsz, 1, -1)
+
+    def reorder_incremental_state(self, incremental_state, new_order):
+        input_buffer = self._get_input_buffer(incremental_state)
+        if input_buffer is not None:
+            input_buffer = input_buffer.index_select(0, new_order)  # buffer is bsz x kw x dim, so dim 0 is the batch
+            self._set_input_buffer(incremental_state, input_buffer)
+
+    def _get_input_buffer(self, incremental_state):
+        return utils.get_incremental_state(self, incremental_state, 'input_buffer')
+
+    def _set_input_buffer(self, incremental_state, new_buffer):
+        return utils.set_incremental_state(self, incremental_state, 'input_buffer', new_buffer)
+
+    def _get_linearized_weight(self):  # build the flattened weight on first use, then reuse it
+        if self._linearized_weight is None:
+            kw = self.kernel_size[0]
+            weight = self.weight.transpose(2, 1).transpose(1, 0).contiguous()  # reorder to (out_channels, kw, in_channels)
+            assert weight.size() == (self.out_channels, kw, self.in_channels)
+            self._linearized_weight = torch.nn.Parameter(weight.view(self.out_channels, -1))  # NOTE: assigning a Parameter registers it on the module, hence the filtering in state_dict
+        return self._linearized_weight
+
+    def _clear_linearized_weight(self, *args):  # registered as a backward hook in __init__; hook arguments are ignored
+        self._linearized_weight = None
diff --git a/fairseq/modules/multihead_attention.py b/fairseq/modules/multihead_attention.py
new file mode 100644
index 0000000000000000000000000000000000000000..e33dd450ee837f6a3150efac6d4c05edc637eb04
--- /dev/null
+++ b/fairseq/modules/multihead_attention.py
@@ -0,0 +1,477 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+from typing import Dict, Optional, Tuple
+
+import torch
+import torch.nn.functional as F
+from torch import Tensor, nn
+from torch.nn import Parameter
+
+from fairseq import utils
+from fairseq.incremental_decoding_utils import with_incremental_state
+from fairseq.modules.fairseq_dropout import FairseqDropout
+from fairseq.modules.quant_noise import quant_noise
+
+
+@with_incremental_state
+class MultiheadAttention(nn.Module):
+    """Multi-headed attention.
+
+    See "Attention Is All You Need" for more details.
+    """
+
+    def __init__(
+        self,
+        embed_dim,
+        num_heads,
+        kdim=None,
+        vdim=None,
+        dropout=0.0,
+        bias=True,
+        add_bias_kv=False,
+        add_zero_attn=False,
+        self_attention=False,
+        encoder_decoder_attention=False,
+        q_noise=0.0,
+        qn_block_size=8,
+    ):
+        super().__init__()
+        self.embed_dim = embed_dim
+        self.kdim = kdim if kdim is not None else embed_dim
+        self.vdim = vdim if vdim is not None else embed_dim
+        self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim
+
+        self.num_heads = num_heads
+        self.dropout_module = FairseqDropout(
+            dropout, module_name=self.__class__.__name__
+        )
+
+        self.head_dim = embed_dim // num_heads
+        assert (
+            self.head_dim * num_heads == self.embed_dim
+        ), "embed_dim must be divisible by num_heads"
+        self.scaling = self.head_dim ** -0.5
+
+        self.self_attention = self_attention
+        self.encoder_decoder_attention = encoder_decoder_attention
+
+        assert not self.self_attention or self.qkv_same_dim, (
+            "Self-attention requires query, key and " "value to be of the same size"
+        )
+
+        self.k_proj = quant_noise(nn.Linear(self.kdim, embed_dim, bias=bias), q_noise, qn_block_size)
+        self.v_proj = quant_noise(nn.Linear(self.vdim, embed_dim, bias=bias), q_noise, qn_block_size)
+        self.q_proj = quant_noise(nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size)
+
+        self.out_proj = quant_noise(nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size)
+
+        if add_bias_kv:
+            self.bias_k = Parameter(torch.Tensor(1, 1, embed_dim))
+            self.bias_v = Parameter(torch.Tensor(1, 1, embed_dim))
+        else:
+            self.bias_k = self.bias_v = None
+
+        self.add_zero_attn = add_zero_attn
+
+        self.reset_parameters()
+
+        self.onnx_trace = False
+        self.tpu = False
+
+    def prepare_for_onnx_export_(self):
+        self.onnx_trace = True
+
+    def prepare_for_tpu_(self, **kwargs):
+        self.tpu = True
+
+    def reset_parameters(self):
+        if self.qkv_same_dim:
+            # Empirically observed the convergence to be much better with
+            # the scaled initialization
+            nn.init.xavier_uniform_(self.k_proj.weight, gain=1 / math.sqrt(2))
+            nn.init.xavier_uniform_(self.v_proj.weight, gain=1 / math.sqrt(2))
+            nn.init.xavier_uniform_(self.q_proj.weight, gain=1 / math.sqrt(2))
+        else:
+            nn.init.xavier_uniform_(self.k_proj.weight)
+            nn.init.xavier_uniform_(self.v_proj.weight)
+            nn.init.xavier_uniform_(self.q_proj.weight)
+
+        nn.init.xavier_uniform_(self.out_proj.weight)
+        if self.out_proj.bias is not None:
+            nn.init.constant_(self.out_proj.bias, 0.)
+        if self.bias_k is not None:
+            nn.init.xavier_normal_(self.bias_k)
+        if self.bias_v is not None:
+            nn.init.xavier_normal_(self.bias_v)
+
+    def forward(
+        self,
+        query,
+        key: Optional[Tensor],
+        value: Optional[Tensor],
+        key_padding_mask: Optional[Tensor] = None,
+        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
+        need_weights: bool = True,
+        static_kv: bool = False,
+        attn_mask: Optional[Tensor] = None,
+        before_softmax: bool = False,
+        need_head_weights: bool = False,
+    ) -> Tuple[Tensor, Optional[Tensor]]:
+        """Input shape: Time x Batch x Channel
+
+        Args:
+            key_padding_mask (ByteTensor, optional): mask to exclude
+                keys that are pads, of shape `(batch, src_len)`, where
+                padding elements are indicated by 1s.
+            need_weights (bool, optional): return the attention weights,
+                averaged over heads (default: True).
+            attn_mask (ByteTensor, optional): typically used to
+                implement causal attention, where the mask prevents the
+                attention from looking forward in time (default: None).
+            before_softmax (bool, optional): return the raw attention
+                weights and values before the attention softmax.
+            need_head_weights (bool, optional): return the attention
+                weights for each head. Implies *need_weights*. Default:
+                return the average attention weights over all heads.
+        """
+        if need_head_weights:
+            need_weights = True
+
+        tgt_len, bsz, embed_dim = query.size()
+        assert embed_dim == self.embed_dim
+        assert list(query.size()) == [tgt_len, bsz, embed_dim]
+
+        if (
+            not self.onnx_trace
+            and not self.tpu  # don't use PyTorch version on TPUs
+            and incremental_state is None
+            and not static_kv
+            # A workaround for quantization to work. Otherwise JIT compilation
+            # treats bias in linear module as method.
+            and not torch.jit.is_scripting()
+        ):
+            assert key is not None and value is not None
+            return F.multi_head_attention_forward(
+                query,
+                key,
+                value,
+                self.embed_dim,
+                self.num_heads,
+                torch.empty([0]),
+                torch.cat((self.q_proj.bias, self.k_proj.bias, self.v_proj.bias)),  # NOTE(review): raises if the projections were built with bias=False (biases are None) — confirm
+                self.bias_k,
+                self.bias_v,
+                self.add_zero_attn,
+                self.dropout_module.p,
+                self.out_proj.weight,
+                self.out_proj.bias,
+                self.training or self.dropout_module.apply_during_inference,
+                key_padding_mask,
+                need_weights,
+                attn_mask,
+                use_separate_proj_weight=True,
+                q_proj_weight=self.q_proj.weight,
+                k_proj_weight=self.k_proj.weight,
+                v_proj_weight=self.v_proj.weight,
+            )
+
+        if incremental_state is not None:
+            saved_state = self._get_input_buffer(incremental_state)
+            if saved_state is not None and "prev_key" in saved_state:
+                # previous time steps are cached - no need to recompute
+                # key and value if they are static
+                if static_kv:
+                    assert self.encoder_decoder_attention and not self.self_attention
+                    key = value = None
+        else:
+            saved_state = None
+
+        if self.self_attention:
+            q = self.q_proj(query)
+            k = self.k_proj(query)
+            v = self.v_proj(query)
+        elif self.encoder_decoder_attention:
+            # encoder-decoder attention
+            q = self.q_proj(query)
+            if key is None:
+                assert value is None
+                k = v = None
+            else:
+                k = self.k_proj(key)
+                v = self.v_proj(key)  # v is computed from key here, not from the value argument
+
+        else:
+            assert key is not None and value is not None
+            q = self.q_proj(query)
+            k = self.k_proj(key)
+            v = self.v_proj(value)
+        q *= self.scaling
+
+        if self.bias_k is not None:
+            assert self.bias_v is not None
+            k = torch.cat([k, self.bias_k.repeat(1, bsz, 1)])
+            v = torch.cat([v, self.bias_v.repeat(1, bsz, 1)])
+            if attn_mask is not None:
+                attn_mask = torch.cat(
+                    [attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1
+                )
+            if key_padding_mask is not None:
+                key_padding_mask = torch.cat(
+                    [
+                        key_padding_mask,
+                        key_padding_mask.new_zeros(key_padding_mask.size(0), 1),
+                    ],
+                    dim=1,
+                )
+
+        q = (
+            q.contiguous()
+            .view(tgt_len, bsz * self.num_heads, self.head_dim)
+            .transpose(0, 1)
+        )
+        if k is not None:
+            k = (
+                k.contiguous()
+                .view(-1, bsz * self.num_heads, self.head_dim)
+                .transpose(0, 1)
+            )
+        if v is not None:
+            v = (
+                v.contiguous()
+                .view(-1, bsz * self.num_heads, self.head_dim)
+                .transpose(0, 1)
+            )
+
+        if saved_state is not None:
+            # saved states are stored with shape (bsz, num_heads, seq_len, head_dim)
+            if "prev_key" in saved_state:
+                _prev_key = saved_state["prev_key"]
+                assert _prev_key is not None
+                prev_key = _prev_key.view(bsz * self.num_heads, -1, self.head_dim)
+                if static_kv:
+                    k = prev_key
+                else:
+                    assert k is not None
+                    k = torch.cat([prev_key, k], dim=1)
+            if "prev_value" in saved_state:
+                _prev_value = saved_state["prev_value"]
+                assert _prev_value is not None
+                prev_value = _prev_value.view(bsz * self.num_heads, -1, self.head_dim)
+                if static_kv:
+                    v = prev_value
+                else:
+                    assert v is not None
+                    v = torch.cat([prev_value, v], dim=1)
+            prev_key_padding_mask: Optional[Tensor] = None
+            if "prev_key_padding_mask" in saved_state:
+                prev_key_padding_mask = saved_state["prev_key_padding_mask"]
+            assert k is not None and v is not None
+            key_padding_mask = MultiheadAttention._append_prev_key_padding_mask(
+                key_padding_mask=key_padding_mask,
+                prev_key_padding_mask=prev_key_padding_mask,
+                batch_size=bsz,
+                src_len=k.size(1),
+                static_kv=static_kv,
+            )
+
+            saved_state["prev_key"] = k.view(bsz, self.num_heads, -1, self.head_dim)
+            saved_state["prev_value"] = v.view(bsz, self.num_heads, -1, self.head_dim)
+            saved_state["prev_key_padding_mask"] = key_padding_mask
+            # In this branch incremental_state is never None
+            assert incremental_state is not None
+            incremental_state = self._set_input_buffer(incremental_state, saved_state)
+        assert k is not None
+        src_len = k.size(1)
+
+        # This is part of a workaround to get around fork/join parallelism
+        # not supporting Optional types.
+        if key_padding_mask is not None and key_padding_mask.dim() == 0:
+            key_padding_mask = None
+
+        if key_padding_mask is not None:
+            assert key_padding_mask.size(0) == bsz
+            assert key_padding_mask.size(1) == src_len
+
+        if self.add_zero_attn:
+            assert v is not None
+            src_len += 1
+            k = torch.cat([k, k.new_zeros((k.size(0), 1) + k.size()[2:])], dim=1)
+            v = torch.cat([v, v.new_zeros((v.size(0), 1) + v.size()[2:])], dim=1)
+            if attn_mask is not None:
+                attn_mask = torch.cat(
+                    [attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1
+                )
+            if key_padding_mask is not None:
+                key_padding_mask = torch.cat(
+                    [
+                        key_padding_mask,
+                        torch.zeros(key_padding_mask.size(0), 1).type_as(
+                            key_padding_mask
+                        ),
+                    ],
+                    dim=1,
+                )
+
+        attn_weights = torch.bmm(q, k.transpose(1, 2))
+        attn_weights = MultiheadAttention.apply_sparse_mask(attn_weights, tgt_len, src_len, bsz)
+
+        assert list(attn_weights.size()) == [bsz * self.num_heads, tgt_len, src_len]
+
+        if attn_mask is not None:
+            attn_mask = attn_mask.unsqueeze(0)
+            if self.onnx_trace:
+                attn_mask = attn_mask.repeat(attn_weights.size(0), 1, 1)
+            attn_weights += attn_mask
+
+        if key_padding_mask is not None:
+            # don't attend to padding symbols
+            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+            if not self.tpu:
+                attn_weights = attn_weights.masked_fill(
+                    key_padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool),
+                    float("-inf")
+                )
+            else:
+                attn_weights = attn_weights.transpose(0, 2)
+                attn_weights = attn_weights.masked_fill(key_padding_mask, float('-inf'))
+                attn_weights = attn_weights.transpose(0, 2)
+            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+
+        if before_softmax:
+            return attn_weights, v
+
+        attn_weights_float = utils.softmax(
+            attn_weights, dim=-1, onnx_trace=self.onnx_trace
+        )
+        attn_weights = attn_weights_float.type_as(attn_weights)
+        attn_probs = self.dropout_module(attn_weights)
+
+        assert v is not None
+        attn = torch.bmm(attn_probs, v)
+        assert list(attn.size()) == [bsz * self.num_heads, tgt_len, self.head_dim]
+        if self.onnx_trace and attn.size(1) == 1:
+            # when ONNX tracing a single decoder step (sequence length == 1)
+            # the transpose is a no-op copy before view, thus unnecessary
+            attn = attn.contiguous().view(tgt_len, bsz, embed_dim)
+        else:
+            attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim)
+        attn = self.out_proj(attn)
+        attn_weights: Optional[Tensor] = None
+        if need_weights:
+            attn_weights = attn_weights_float.view(
+                bsz, self.num_heads, tgt_len, src_len
+            ).transpose(1, 0)
+            if not need_head_weights:
+                # average attention weights over heads
+                attn_weights = attn_weights.mean(dim=0)
+
+        return attn, attn_weights
+
+    @staticmethod
+    def _append_prev_key_padding_mask(
+        key_padding_mask: Optional[Tensor],
+        prev_key_padding_mask: Optional[Tensor],
+        batch_size: int,
+        src_len: int,
+        static_kv: bool,
+    ) -> Optional[Tensor]:
+        # saved key padding masks have shape (bsz, seq_len)
+        if prev_key_padding_mask is not None and static_kv:
+            new_key_padding_mask = prev_key_padding_mask
+        elif prev_key_padding_mask is not None and key_padding_mask is not None:
+            new_key_padding_mask = torch.cat(
+                [prev_key_padding_mask.float(), key_padding_mask.float()], dim=1
+            )
+        # During incremental decoding, as the padding token enters and
+        # leaves the frame, there will be a time when prev or current
+        # is None
+        elif prev_key_padding_mask is not None:
+            filler = torch.zeros(
+                (batch_size, src_len - prev_key_padding_mask.size(1)),
+                device=prev_key_padding_mask.device,
+            )
+            new_key_padding_mask = torch.cat(
+                [prev_key_padding_mask.float(), filler.float()], dim=1
+            )
+        elif key_padding_mask is not None:
+            filler = torch.zeros(
+                (batch_size, src_len - key_padding_mask.size(1)),
+                device=key_padding_mask.device,
+            )
+            new_key_padding_mask = torch.cat(
+                [filler.float(), key_padding_mask.float()], dim=1
+            )
+        else:
+            new_key_padding_mask = prev_key_padding_mask  # both masks are None on this path
+        return new_key_padding_mask
+
+    @torch.jit.export
+    def reorder_incremental_state(
+        self, incremental_state: Dict[str, Dict[str, Optional[Tensor]]], new_order: Tensor
+    ):
+        """Reorder buffered internal state (for incremental generation)."""
+        input_buffer = self._get_input_buffer(incremental_state)
+        if input_buffer is not None:
+            for k in input_buffer.keys():
+                input_buffer_k = input_buffer[k]
+                if input_buffer_k is not None:
+                    if self.encoder_decoder_attention and input_buffer_k.size(0) == new_order.size(0):
+                        break  # leaves the loop, so every remaining key is skipped as well
+                    input_buffer[k] = input_buffer_k.index_select(0, new_order)
+            incremental_state = self._set_input_buffer(incremental_state, input_buffer)
+        return incremental_state
+
+    def _get_input_buffer(
+        self, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]]
+    ) -> Dict[str, Optional[Tensor]]:
+        result = self.get_incremental_state(incremental_state, "attn_state")
+        if result is not None:
+            return result
+        else:
+            empty_result: Dict[str, Optional[Tensor]] = {}
+            return empty_result
+
+    def _set_input_buffer(
+        self,
+        incremental_state: Dict[str, Dict[str, Optional[Tensor]]],
+        buffer: Dict[str, Optional[Tensor]],
+    ):
+        return self.set_incremental_state(incremental_state, "attn_state", buffer)
+
+    def apply_sparse_mask(attn_weights, tgt_len: int, src_len: int, bsz: int):  # no self: forward calls this through the class; returns the weights unchanged
+        return attn_weights
+
+    def upgrade_state_dict_named(self, state_dict, name):
+        prefix = name + "." if name != "" else ""
+        items_to_add = {}
+        keys_to_remove = []
+        for k in state_dict.keys():
+            if k.endswith(prefix + "in_proj_weight"):
+                # in_proj_weight used to be q + k + v with same dimensions
+                dim = int(state_dict[k].shape[0] / 3)
+                items_to_add[prefix + "q_proj.weight"] = state_dict[k][:dim]
+                items_to_add[prefix + "k_proj.weight"] = state_dict[k][dim : 2 * dim]
+                items_to_add[prefix + "v_proj.weight"] = state_dict[k][2 * dim :]
+
+                keys_to_remove.append(k)
+
+                k_bias = prefix + "in_proj_bias"
+                if k_bias in state_dict.keys():
+                    dim = int(state_dict[k].shape[0] / 3)  # recomputed from the weight entry: same value as above
+                    items_to_add[prefix + "q_proj.bias"] = state_dict[k_bias][:dim]
+                    items_to_add[prefix + "k_proj.bias"] = state_dict[k_bias][
+                        dim : 2 * dim
+                    ]
+                    items_to_add[prefix + "v_proj.bias"] = state_dict[k_bias][2 * dim :]
+
+                    keys_to_remove.append(prefix + "in_proj_bias")
+
+        for k in keys_to_remove:
+            del state_dict[k]
+
+        for key, value in items_to_add.items():
+            state_dict[key] = value
diff --git a/fairseq/modules/positional_embedding.py b/fairseq/modules/positional_embedding.py
new file mode 100644
index 0000000000000000000000000000000000000000..511460fcb711ac6016d635f26beee44ae3d630af
--- /dev/null
+++ b/fairseq/modules/positional_embedding.py
@@ -0,0 +1,32 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch.nn as nn
+from .learned_positional_embedding import LearnedPositionalEmbedding
+from .sinusoidal_positional_embedding import SinusoidalPositionalEmbedding
+
+
+def PositionalEmbedding(  # factory: returns a learned or a sinusoidal positional embedding module
+        num_embeddings: int,
+        embedding_dim: int,
+        padding_idx: int,
+        learned: bool = False,
+):
+    if learned:
+        # if padding_idx is specified then offset the embedding ids by
+        # this index and adjust num_embeddings appropriately
+        # TODO: The right place for this offset would be inside
+        # LearnedPositionalEmbedding. Move this there for a cleaner implementation.
+        if padding_idx is not None:
+            num_embeddings = num_embeddings + padding_idx + 1
+        m = LearnedPositionalEmbedding(num_embeddings, embedding_dim, padding_idx)
+        nn.init.normal_(m.weight, mean=0, std=embedding_dim ** -0.5)
+        if padding_idx is not None:
+            nn.init.constant_(m.weight[padding_idx], 0)  # the padding row is reset to zeros after the normal init
+    else:
+        m = SinusoidalPositionalEmbedding(
+            embedding_dim, padding_idx, init_size=num_embeddings + padding_idx + 1,  # NOTE: unlike the learned branch, padding_idx=None is not guarded here
+        )
+    return m
diff --git a/fairseq/modules/quant_noise.py b/fairseq/modules/quant_noise.py
new file mode 100644
index 0000000000000000000000000000000000000000..b38ea263d32337affc612b6ca185254bf59e8bf0
--- /dev/null
+++ b/fairseq/modules/quant_noise.py
@@ -0,0 +1,90 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import torch.nn as nn
+
+
+def quant_noise(module, p, block_size):
+    """
+    Wraps modules and applies quantization noise to the weights for
+    subsequent quantization with Iterative Product Quantization as
+    described in "Training with Quantization Noise for Extreme Model Compression"
+
+    Args:
+        - module: nn.Module
+        - p: amount of Quantization Noise
+        - block_size: size of the blocks for subsequent quantization with iPQ
+
+    Remarks:
+        - Module weights must have the right sizes wrt the block size
+        - Only Linear, Embedding and Conv2d modules are supported for the moment
+        - For more detail on how to quantize by blocks with convolutional weights,
+          see "And the Bit Goes Down: Revisiting the Quantization of Neural Networks"
+        - We implement the simplest form of noise here as stated in the paper
+          which consists in randomly dropping blocks
+    """
+
+    # if no quantization noise, don't register hook
+    if p <= 0:
+        return module
+
+    # supported modules
+    assert isinstance(module, (nn.Linear, nn.Embedding, nn.Conv2d))
+
+    # test whether module.weight has the right sizes wrt block_size
+    is_conv = module.weight.ndim == 4  # a 4-D weight is treated as a convolution, anything else as a 2-D matrix
+
+    # 2D matrix
+    if not is_conv:
+        assert module.weight.size(1) % block_size == 0, "Input features must be a multiple of block sizes"
+
+    # 4D matrix
+    else:
+        # 1x1 convolutions
+        if module.kernel_size == (1, 1):
+            assert module.in_channels % block_size == 0, "Input channels must be a multiple of block sizes"
+        # regular convolutions
+        else:
+            k = module.kernel_size[0] * module.kernel_size[1]
+            assert k % block_size == 0, "Kernel size must be a multiple of block size"
+
+    def _forward_pre_hook(mod, input):  # runs before every forward call of the wrapped module
+        # no noise for evaluation
+        if mod.training:
+            if not is_conv:
+                # gather weight and sizes
+                weight = mod.weight
+                in_features = weight.size(1)
+                out_features = weight.size(0)
+
+                # split weight matrix into blocks and randomly drop selected blocks
+                mask = torch.zeros(in_features // block_size * out_features, device=weight.device)
+                mask.bernoulli_(p)
+                mask = mask.repeat_interleave(block_size, -1).view(-1, in_features)
+
+            else:
+                # gather weight and sizes
+                weight = mod.weight
+                in_channels = mod.in_channels
+                out_channels = mod.out_channels
+
+                # split weight matrix into blocks and randomly drop selected blocks
+                if mod.kernel_size == (1, 1):
+                    mask = torch.zeros(int(in_channels // block_size * out_channels), device=weight.device)
+                    mask.bernoulli_(p)
+                    mask = mask.repeat_interleave(block_size, -1).view(-1, in_channels)
+                else:
+                    mask = torch.zeros(weight.size(0), weight.size(1), device=weight.device)
+                    mask.bernoulli_(p)
+                    mask = mask.unsqueeze(2).unsqueeze(3).repeat(1, 1, mod.kernel_size[0], mod.kernel_size[1])
+
+            # scale weights and apply mask
+            mask = mask.to(torch.bool)  # x.bool() is not currently supported in TorchScript
+            s = 1 / (1 - p)  # NOTE: divides by zero when p == 1
+            mod.weight.data = s * weight.masked_fill(mask, 0)  # replaces the stored weight data with the masked, rescaled copy
+
+    module.register_forward_pre_hook(_forward_pre_hook)
+    return module
diff --git a/fairseq/modules/quantization/__init__.py b/fairseq/modules/quantization/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/fairseq/modules/quantization/pq/__init__.py b/fairseq/modules/quantization/pq/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5b10b51b1b0ca21aaec96344f86a0ab9df0c22f8
--- /dev/null
+++ b/fairseq/modules/quantization/pq/__init__.py
@@ -0,0 +1,6 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .utils import SizeTracker, quantize_model_  # NOQA
diff --git a/fairseq/modules/quantization/pq/em.py b/fairseq/modules/quantization/pq/em.py
new file mode 100644
index 0000000000000000000000000000000000000000..420d8afda25da3fb4ad34ddc284d969bf4f09dae
--- /dev/null
+++ b/fairseq/modules/quantization/pq/em.py
@@ -0,0 +1,211 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import os
+import random
+import logging
+from collections import Counter
+
+import torch
+
+
+class EM:
+    """
+    EM (k-means) algorithm used to quantize the columns of W to minimize
+
+                         ||W - W_hat||^2
+
+    Args:
+        - W: weight matrix of size (in_features x out_features)
+        - n_iter: number of k-means iterations
+        - n_centroids: number of centroids (size of codebook)
+        - eps: scale of the random shift applied to the centroids when an
+          empty cluster is re-seeded from the most populated one
+        - max_tentatives: budget of re-seeding attempts for empty clusters
+        - verbose: log the objective after each iteration
+
+    Remarks:
+        - If one cluster is empty, the most populated cluster is split into
+          two clusters
+    """
+
+    def __init__(
+        self, W, n_centroids=256, n_iter=20, eps=1e-6, max_tentatives=30, verbose=True
+    ):
+        self.W = W
+        self.n_centroids = n_centroids
+        self.n_iter = n_iter
+        self.eps = eps
+        self.max_tentatives = max_tentatives
+        self.verbose = verbose
+        self.centroids = torch.Tensor()  # (n_centroids x in_features) once initialized
+        self.assignments = torch.Tensor()  # (out_features) once step() or assign() ran
+        self.objective = []  # one reconstruction error per call to step()
+
+    def initialize_centroids(self):
+        """
+        Initializes the centroids by sampling random columns from W
+        (with replacement, so duplicated centroids are possible).
+        """
+
+        out_features = self.W.size(1)
+        indices = torch.randint(low=0, high=out_features, size=(self.n_centroids,))
+        # advanced indexing copies, so the centroids do not alias W
+        self.centroids = self.W[:, indices].t()  # (n_centroids x in_features)
+
+    def step(self, i):
+        """
+        There are two standard steps for each iteration: expectation (E) and
+        maximization (M). The E-step (assignment) is performed with an exhaustive
+        search and the M-step (centroid computation) is performed with
+        the exact solution (mean of the columns assigned to each centroid).
+
+        Args:
+            - i: step number, only used for logging
+
+        Raises:
+            - EmptyClusterResolveError: if empty clusters are still present
+              once the re-seeding attempts are exhausted
+        """
+
+        # assignments (E-step)
+        distances = self.compute_distances()  # (n_centroids x out_features)
+        self.assignments = torch.argmin(distances, dim=0)  # (out_features)
+        n_empty_clusters = self.resolve_empty_clusters()
+
+        # centroids (M-step)
+        for k in range(self.n_centroids):
+            W_k = self.W[:, self.assignments == k]  # (in_features x size_of_cluster_k)
+            self.centroids[k] = W_k.mean(dim=1)  # (in_features)
+
+        # book-keeping
+        obj = (self.centroids[self.assignments].t() - self.W).norm(p=2).item()
+        self.objective.append(obj)
+        if self.verbose:
+            logging.info(
+                f"Iteration: {i},\t"
+                f"objective: {obj:.6f},\t"
+                f"resolved empty clusters: {n_empty_clusters}"
+            )
+
+    def resolve_empty_clusters(self):
+        """
+        If one cluster is empty, the most populated cluster is split into
+        two clusters by shifting the respective centroids. This is done
+        iteratively for a fixed number of tentatives.
+
+        Returns the number of empty clusters found before any re-seeding and
+        raises EmptyClusterResolveError if some remain after max_tentatives.
+        """
+
+        # cluster populations, computed on-device instead of one .item() per column
+        counts = torch.bincount(self.assignments, minlength=self.n_centroids)
+        empty_clusters = (counts == 0).nonzero().flatten().tolist()
+        n_empty_clusters = len(empty_clusters)
+
+        tentatives = 0
+        while len(empty_clusters) > 0:
+            # give up only while clusters are still empty (a successful last attempt returns)
+            if tentatives > self.max_tentatives:
+                logging.info(
+                    f"Could not resolve all empty clusters, {len(empty_clusters)} remaining"
+                )
+                raise EmptyClusterResolveError
+            tentatives += 1
+
+            # given an empty cluster, find most populated cluster and split it into two
+            k = random.choice(empty_clusters)
+            m = counts.argmax().item()
+            e = torch.randn_like(self.centroids[m]) * self.eps
+            self.centroids[k] = self.centroids[m] + e
+            self.centroids[m] -= e
+
+            # recompute assignments and cluster populations
+            distances = self.compute_distances()  # (n_centroids x out_features)
+            self.assignments = torch.argmin(distances, dim=0)  # (out_features)
+            counts = torch.bincount(self.assignments, minlength=self.n_centroids)
+            empty_clusters = (counts == 0).nonzero().flatten().tolist()
+
+        return n_empty_clusters
+
+    def compute_distances(self):
+        """
+        For every centroid m, computes
+
+                          ||M - m[None, :]||_2
+
+        Returns:
+            - distances of size (n_centroids x out_features)
+
+        Remarks:
+            - Without chunking, the sizes in the broadcasting are modified as:
+              (n_centroids x in_features x out_features) -> (n_centroids x out_features)
+            - On RuntimeError (e.g. out of memory) the centroids are split into
+              twice as many chunks and the computation is retried; the error is
+              re-raised once every chunk already holds a single centroid
+        """
+
+        nb_centroids_chunks = 1
+        while True:
+            try:
+                chunks = self.centroids.chunk(nb_centroids_chunks, dim=0)
+                return torch.cat(
+                    [(self.W[None, :, :] - c[:, :, None]).norm(p=2, dim=1) for c in chunks],
+                    dim=0,
+                )
+            except RuntimeError:
+                # finer chunking cannot help any more: do not retry forever
+                if nb_centroids_chunks >= self.centroids.size(0):
+                    raise
+                nb_centroids_chunks *= 2
+
+    def assign(self):
+        """
+        Assigns each column of W to its closest centroid, thus essentially
+        performing the E-step of step().
+
+        Remarks:
+            - The function must be called after the centroids were learnt or
+              loaded using self.load(); it overwrites self.assignments
+        """
+
+        distances = self.compute_distances()  # (n_centroids x out_features)
+        self.assignments = torch.argmin(distances, dim=0)  # (out_features)
+
+    def save(self, path, layer):
+        """
+        Saves centroids, assignments and objective history.
+
+        Args:
+            - path: folder used to save centroids and assignments
+            - layer: name of the layer, used as prefix of the file names
+        """
+
+        for name in ("centroids", "assignments", "objective"):
+            torch.save(
+                getattr(self, name), os.path.join(path, "{}_{}.pth".format(layer, name))
+            )
+
+    def load(self, path, layer):
+        """
+        Loads centroids, assignments and objective history from a given path.
+
+        Args:
+            - path: folder used to load centroids and assignments
+            - layer: name of the layer, used as prefix of the file names
+
+        Remarks:
+            - torch.load unpickles the files, which can execute arbitrary code:
+              only load from trusted folders
+        """
+
+        for name in ("centroids", "assignments", "objective"):
+            file_name = "{}_{}.pth".format(layer, name)
+            # torch.load unpickles: trusted files only
+            setattr(self, name, torch.load(os.path.join(path, file_name)))
+
+
+class EmptyClusterResolveError(Exception):
+    """Raised when EM still has empty clusters after all its re-seeding attempts."""
diff --git a/fairseq/modules/quantization/pq/modules/__init__.py b/fairseq/modules/quantization/pq/modules/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..f52f6f37a6861a87becb7b269117cb9e24285b9e
--- /dev/null
+++ b/fairseq/modules/quantization/pq/modules/__init__.py
@@ -0,0 +1,8 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .qconv import PQConv2d  # NOQA
+from .qlinear import PQLinear  # NOQA
+from .qemb import PQEmbedding  # NOQA
diff --git a/fairseq/modules/quantization/pq/modules/qconv.py b/fairseq/modules/quantization/pq/modules/qconv.py
new file mode 100644
index 0000000000000000000000000000000000000000..d15ec192e8cda6265a198e583a9bf7fb194dd129
--- /dev/null
+++ b/fairseq/modules/quantization/pq/modules/qconv.py
@@ -0,0 +1,115 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn.modules.utils import _pair
+
+
+class PQConv2d(nn.Module):
+    """
+    Quantized counterpart of nn.Conv2d module. Stores the centroids, the assignments
+    and the non-quantized biases. The full weight is re-instantiated at each forward
+    pass and autograd automatically computes the gradients with respect to the
+    centroids.
+
+    Args:
+        - centroids: centroids of size n_centroids x block_size
+        - assignments: assignments of the centroids to the subvectors
+          of size self.out_channels x n_blocks
+        - bias: the non-quantized bias, must be either torch.Tensor or None
+
+    Remarks:
+        - We refer the reader to the official documentation of the nn.Conv2d module
+          for the other arguments and the behavior of the module.
+        - Performance tests on GPU show that this implementation is 10% slower than
+          the non-quantized nn.Conv2d module for a standard training loop.
+        - During the backward, the gradients are averaged by cluster and not summed.
+          This explains the hook registered to the centroids.
+    """
+
+    def __init__(
+        self,
+        centroids,
+        assignments,
+        bias,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        padding=0,
+        dilation=1,
+        groups=1,
+        padding_mode="zeros",
+    ):
+        super(PQConv2d, self).__init__()
+        self.block_size = centroids.size(1)
+        self.n_centroids = centroids.size(0)
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.kernel_size = _pair(kernel_size)
+        self.stride = _pair(stride)
+        self.padding = _pair(padding)
+        self.dilation = _pair(dilation)
+        self.groups = groups
+        self.padding_mode = padding_mode
+        # check compatibility; groups first because the block check uses in_channels // groups
+        if in_channels % groups != 0:
+            raise ValueError("in_channels must be divisible by groups")
+        if out_channels % groups != 0:
+            raise ValueError("out_channels must be divisible by groups")
+        if in_channels // groups * np.prod(self.kernel_size) % self.block_size != 0:
+            raise ValueError("Wrong PQ sizes")
+        if len(assignments) % out_channels != 0:
+            raise ValueError("Wrong PQ sizes")
+        # define parameters
+        self.centroids = nn.Parameter(centroids, requires_grad=True)
+        self.register_buffer("assignments", assignments)
+        # one count per centroid, including centroids that never appear in assignments
+        counts = torch.bincount(assignments, minlength=self.n_centroids)
+        self.register_buffer("counts", counts.type_as(centroids))
+        self.register_parameter("bias", None if bias is None else nn.Parameter(bias))
+        # register hook for averaging gradients per centroids instead of summing;
+        # unused centroids receive a zero gradient, hence the clamp to avoid 0 / 0
+        self.centroids.register_hook(lambda x: x / self.counts.clamp(min=1)[:, None])
+
+    @property
+    def weight(self):
+        """Full weight of size (out_channels x in_channels // groups x k_h x k_w)."""
+        # gather one centroid per block
+        blocks = self.centroids[self.assignments]
+        return (
+            blocks.reshape(-1, self.out_channels, self.block_size)
+            .permute(1, 0, 2)
+            .reshape(self.out_channels, self.in_channels // self.groups, *self.kernel_size)
+        )
+
+    def forward(self, x):
+        """
+        Convolves x with the weight rebuilt from the centroids.
+        NOTE: self.padding_mode is not used, F.conv2d always pads with zeros.
+        """
+
+        weight = self.weight  # rebuilt at every call
+        return F.conv2d(
+            x, weight, self.bias, self.stride, self.padding, self.dilation, self.groups
+        )
+
+    def extra_repr(self):
+        """Displays the conv sizes, the non-default options and the PQ codebook sizes."""
+        s = "{in_channels}, {out_channels}, kernel_size={kernel_size}, stride={stride}"
+        # (condition, text) pairs: the text is displayed when the condition holds
+        optional = [
+            (self.padding != (0,) * len(self.padding), ", padding={padding}"),
+            (self.dilation != (1,) * len(self.dilation), ", dilation={dilation}"),
+            (self.groups != 1, ", groups={groups}"),
+            (self.bias is None, ", bias=False"),
+            (self.padding_mode != "zeros", ", padding_mode={padding_mode}"),
+        ]
+        s += "".join(text for shown, text in optional if shown)
+        s += ", n_centroids={n_centroids}, block_size={block_size}"
+        return s.format(**self.__dict__)
diff --git a/fairseq/modules/quantization/pq/modules/qemb.py b/fairseq/modules/quantization/pq/modules/qemb.py
new file mode 100644
index 0000000000000000000000000000000000000000..98d856d04e5876dc7cc19067808f98b3106a2558
--- /dev/null
+++ b/fairseq/modules/quantization/pq/modules/qemb.py
@@ -0,0 +1,87 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class PQEmbedding(nn.Module):
+    """
+    Quantized counterpart of nn.Embedding module. Stores the centroids and
+    the assignments. The full weight is re-instantiated at each forward
+    pass.
+
+    Args:
+        - centroids: centroids of size n_centroids x block_size
+        - assignments: assignments of the centroids to the subvectors
+          of size self.num_embeddings x n_blocks
+        - _weight: not used by this module
+
+    Remarks:
+        - We refer the reader to the official documentation of the nn.Embedding module
+          for the other arguments and the behavior of the module
+        - Performance tests on GPU show that this implementation is 10% slower than
+          the non-quantized nn.Embedding module for a standard training loop.
+    """
+
+    def __init__(self, centroids, assignments, num_embeddings, embedding_dim,
+                 padding_idx=None, max_norm=None, norm_type=2.,
+                 scale_grad_by_freq=False, sparse=False, _weight=None):
+        super(PQEmbedding, self).__init__()
+        self.n_centroids = centroids.size(0)
+        self.block_size = centroids.size(1)
+        self.num_embeddings = num_embeddings
+        self.embedding_dim = embedding_dim
+        # a negative padding_idx is converted to its non-negative equivalent
+        if padding_idx is not None and padding_idx > 0:
+            assert padding_idx < self.num_embeddings, 'Padding_idx must be within num_embeddings'
+        elif padding_idx is not None and padding_idx < 0:
+            assert padding_idx >= -self.num_embeddings, 'Padding_idx must be within num_embeddings'
+            padding_idx = self.num_embeddings + padding_idx
+        self.padding_idx = padding_idx
+        self.max_norm = max_norm
+        self.norm_type = norm_type
+        self.scale_grad_by_freq = scale_grad_by_freq
+        self.sparse = sparse
+        # check compatibility
+        if self.embedding_dim % self.block_size != 0:
+            raise ValueError("Wrong PQ sizes")
+        if len(assignments) % self.num_embeddings != 0:
+            raise ValueError("Wrong PQ sizes")
+        # define parameters
+        self.centroids = nn.Parameter(centroids, requires_grad=True)
+        self.register_buffer("assignments", assignments)
+        self.register_buffer("counts", torch.bincount(assignments).type_as(centroids))
+
+    @property
+    def weight(self):
+        """Full embedding matrix of size (num_embeddings x n_blocks * block_size)."""
+        blocks = self.centroids[self.assignments]  # one centroid per block
+        # (n_blocks x num_embeddings x block_size) -> (num_embeddings x n_blocks x block_size)
+        blocks = blocks.reshape(-1, self.num_embeddings, self.block_size).permute(1, 0, 2)
+        # concatenate the blocks of each embedding vector
+        return blocks.flatten(1, 2)
+
+    def forward(self, input):
+        weight = self.weight  # rebuilt from the centroids at every call
+        return F.embedding(input, weight, self.padding_idx, self.max_norm,
+                           self.norm_type, self.scale_grad_by_freq, self.sparse)
+
+    def extra_repr(self):
+        """Displays the embedding sizes, the non-default options and the PQ codebook sizes."""
+        # (condition, text) pairs: the text is displayed when the condition holds
+        optional = [
+            (self.padding_idx is not None, ', padding_idx={padding_idx}'),
+            (self.max_norm is not None, ', max_norm={max_norm}'),
+            (self.norm_type != 2, ', norm_type={norm_type}'),
+            (self.scale_grad_by_freq is not False, ', scale_grad_by_freq={scale_grad_by_freq}'),
+            (self.sparse is not False, ', sparse=True'),
+        ]
+        s = '{num_embeddings}, {embedding_dim}'
+        s += ''.join(text for shown, text in optional if shown)
+        s += ', n_centroids={n_centroids}, block_size={block_size}'
+
+        return s.format(**self.__dict__)
diff --git a/fairseq/modules/quantization/pq/modules/qlinear.py b/fairseq/modules/quantization/pq/modules/qlinear.py
new file mode 100644
index 0000000000000000000000000000000000000000..9bdd25a8685bb7c7b32e1f02372aaeb26d8ba53a
--- /dev/null
+++ b/fairseq/modules/quantization/pq/modules/qlinear.py
@@ -0,0 +1,71 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class PQLinear(nn.Module):
+    """
+    Quantized counterpart of nn.Linear module. Stores the centroids, the assignments
+    and the non-quantized biases. The full weight is re-instantiated at each forward
+    pass.
+
+    Args:
+        - centroids: centroids of size n_centroids x block_size
+        - assignments: assignments of the centroids to the subvectors
+          of size self.out_features x n_blocks
+        - bias: the non-quantized bias, either a torch.Tensor or None
+
+    Remarks:
+        - We refer the reader to the official documentation of the nn.Linear module
+          for the other arguments and the behavior of the module
+        - Performance tests on GPU show that this implementation is 15% slower than
+          the non-quantized nn.Linear module for a standard training loop.
+    """
+
+    def __init__(self, centroids, assignments, bias, in_features, out_features):
+        super(PQLinear, self).__init__()
+        self.block_size = centroids.size(1)
+        self.n_centroids = centroids.size(0)
+        self.in_features = in_features
+        self.out_features = out_features
+        # check compatibility: whole blocks per row, same number of blocks for every row
+        if self.in_features % self.block_size != 0:
+            raise ValueError("Wrong PQ sizes")
+        if len(assignments) % self.out_features != 0:
+            raise ValueError("Wrong PQ sizes")
+        # define parameters (only the centroids and the bias are trainable)
+        self.centroids = nn.Parameter(centroids, requires_grad=True)
+        self.register_buffer("assignments", assignments)
+        self.register_buffer("counts", torch.bincount(assignments).type_as(centroids))
+        if bias is not None:
+            self.bias = nn.Parameter(bias)
+        else:
+            self.register_parameter("bias", None)
+
+    @property
+    def weight(self):
+        """Full weight of size (out_features x n_blocks * block_size)."""
+        blocks = self.centroids[self.assignments]  # one centroid per block
+        return (
+            blocks.reshape(-1, self.out_features, self.block_size)
+            .permute(1, 0, 2).flatten(1, 2)
+        )
+
+    def forward(self, x):
+        """
+        Applies the affine map defined by the rebuilt weight and the bias.
+        """
+
+        return F.linear(x, self.weight, self.bias)
+
+    def extra_repr(self):
+        # built piecewise: a backslash continuation inside one f-string would
+        # embed the indentation of the source code in the output
+        bias = self.bias is not None
+        s = f"in_features={self.in_features}, out_features={self.out_features}, "
+        return s + f"n_centroids={self.n_centroids}, block_size={self.block_size}, bias={bias}"
diff --git a/fairseq/modules/quantization/pq/pq.py b/fairseq/modules/quantization/pq/pq.py
new file mode 100644
index 0000000000000000000000000000000000000000..eddc2eb34602403f10979f54cd23a45bc2f104d5
--- /dev/null
+++ b/fairseq/modules/quantization/pq/pq.py
@@ -0,0 +1,128 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .em import EM, EmptyClusterResolveError
+
+
+class PQ(EM):
+    """
+    Quantizes the layer weights W with the standard Product Quantization
+    technique. This learns a codebook of codewords or centroids of size
+    block_size from W. For further reference on using PQ to quantize
+    neural networks, see "And the Bit Goes Down: Revisiting the Quantization
+    of Neural Networks", Stock et al., ICLR 2020.
+
+    PQ is performed in two steps:
+    (1) The matrix W (weights of a fully-connected or convolutional layer)
+        is reshaped to (block_size, -1).
+            - If W is fully-connected (2D), its rows are split into
+              blocks of size block_size.
+            - If W is convolutional (4D), each flattened filter is split
+              into blocks of size block_size.
+    (2) We apply the standard EM/k-means algorithm to the resulting reshaped matrix.
+
+    Args:
+        - W: weight to quantize, 2D (out x in) or 4D (out x in x k_h x k_w)
+        - block_size: size of the blocks (subvectors)
+        - n_centroids: number of centroids
+        - n_iter: number of k-means iterations
+        - eps: for cluster reassignment when an empty cluster is found
+        - max_tentatives: for cluster reassignment when an empty cluster is found
+        - verbose: print information after each iteration
+
+    Remarks:
+        - block_size must be compatible with the shape of W
+    """
+
+    def __init__(
+        self,
+        W,
+        block_size,
+        n_centroids=256,
+        n_iter=20,
+        eps=1e-6,
+        max_tentatives=30,
+        verbose=True,
+    ):
+        self.block_size = block_size  # read by _reshape, hence set before EM.__init__
+        W_reshaped = self._reshape(W)  # (block_size x n_blocks * out_features)
+        super(PQ, self).__init__(
+            W_reshaped,
+            n_centroids=n_centroids,
+            n_iter=n_iter,
+            eps=eps,
+            max_tentatives=max_tentatives,
+            verbose=verbose,
+        )
+
+    def _reshape(self, W):
+        """
+        Reshapes the matrix W as explained in step (1).
+
+        Returns a matrix of size (block_size x n_blocks * out_features), one
+        column per block. Raises AssertionError if block_size does not divide
+        the row size and NotImplementedError if W is neither 2D nor 4D.
+        """
+
+        # fully connected: by convention the weight has size out_features x in_features
+        if len(W.size()) == 2:
+            self.out_features, self.in_features = W.size()
+            assert (
+                self.in_features % self.block_size == 0
+            ), "Linear: in_features must be a multiple of block_size"
+            n_rows = self.out_features
+
+        # convolutional: every filter is flattened before being split into blocks
+        elif len(W.size()) == 4:
+            self.out_channels, self.in_channels, self.k_h, self.k_w = W.size()
+            assert (
+                self.in_channels * self.k_h * self.k_w
+            ) % self.block_size == 0, (
+                "Conv2d: in_channels * k_h * k_w must be a multiple of block_size"
+            )
+            n_rows = self.out_channels
+
+        # not implemented
+        else:
+            raise NotImplementedError(W.size())
+
+        # (n_rows x n_blocks x block_size) -> (block_size x n_blocks * n_rows)
+        return W.reshape(n_rows, -1, self.block_size).permute(2, 1, 0).flatten(1, 2)
+
+    def encode(self):
+        """
+        Performs self.n_iter EM steps, or fewer: the iterations stop early,
+        keeping the state reached so far, if empty clusters cannot be resolved.
+        """
+        self.initialize_centroids()
+        for i in range(self.n_iter):
+            try:
+                self.step(i)
+            except EmptyClusterResolveError:
+                break
+
+    def decode(self):
+        """
+        Returns the encoded full weight matrix, i.e. the centroid assigned to
+        every block, arranged as the W given to the constructor (2D or 4D).
+        Must be called after the encode function.
+        """
+
+        # k_h is only set by _reshape for 4D (convolutional) weights
+        is_conv = "k_h" in self.__dict__
+        n_rows = self.out_channels if is_conv else self.out_features
+        # (n_blocks * n_rows x block_size) -> (n_rows x n_blocks x block_size)
+        blocks = (
+            self.centroids[self.assignments]
+            .reshape(-1, n_rows, self.block_size)
+            .permute(1, 0, 2)
+        )
+
+        # convolutional case
+        if is_conv:
+            return blocks.reshape(self.out_channels, self.in_channels, self.k_h, self.k_w)
+
+        # fully connected case
+        return blocks.flatten(1, 2)
diff --git a/fairseq/modules/quantization/pq/utils.py b/fairseq/modules/quantization/pq/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..57aaa1b7a39cbdcf2684a464fd16ae60df29fae5
--- /dev/null
+++ b/fairseq/modules/quantization/pq/utils.py
@@ -0,0 +1,335 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import re
+from operator import attrgetter, itemgetter
+
+import numpy as np
+import torch.nn as nn
+import torch.distributed as dist
+
+from .modules import PQConv2d, PQLinear, PQEmbedding
+from .pq import PQ
+
+
+def quantize_model_(
+    model,
+    size_tracker,
+    layers_to_quantize,
+    block_sizes_config,
+    n_centroids_config,
+    step=0,
+    n_iter=15,
+    eps=1e-6,
+    max_tentatives=100,
+    verbose=True,
+):
+    """
+    Quantize one stage of a model in-place: every layer selected by
+    layers_to_quantize[step] is replaced by its quantized counterpart
+    (PQLinear, PQEmbedding or PQConv2d), after which the centroids can be
+    finetuned in a standard training loop. Biases are not quantized.
+
+    Args:
+        - model: a nn.Module
+        - size_tracker: useful for tracking quantization statistics
+        - layers_to_quantize: a list containing regexps for
+          filtering the layers to quantize at each stage according
+          to their name (as in model.named_parameters())
+        - block_sizes_config: dict like
+          {
+              'Conv2d': ('kernel_size', {'(3, 3)': 9, '(1, 1)': 4}),
+              'Linear': ('in_features', {'*': 8})
+          }
+          i.e. conv2d layers with a 3x3 kernel use a block size of 9
+          and all Linear layers a block size of 8 (see get_param)
+        - n_centroids_config: dict with the same structure, giving the
+          number of centroids, e.g. {'Linear': ('in_features', {'*': 256})}
+        - step: index of the stage, i.e. of the regexp to use in
+          layers_to_quantize
+        - n_iter, eps, max_tentatives: forwarded to the PQ quantizer
+          (k-means iterations and empty cluster handling)
+        - verbose: log the progress (only on rank 0 when distributed)
+
+    Returns:
+        - the names of the layers quantized at this stage
+    """
+
+    quantized_layers = get_layers(model, layers_to_quantize[step])
+
+    # book-keeping: only the master process logs
+    is_master_process = not dist.is_initialized() or dist.get_rank() == 0
+    verbose = verbose and is_master_process
+
+    for layer in quantized_layers:
+
+        # get block size and centroids
+        module = attrgetter(layer)(model)
+        block_size = get_param(module, layer, block_sizes_config)
+        n_centroids = get_param(module, layer, n_centroids_config)
+        if verbose:
+            logging.info(
+                f"Quantizing layer {layer} with block size {block_size} and {n_centroids} centroids"
+            )
+
+        # quantize layer; weight is a detached copy, also passed to
+        # size_tracker at the end of the iteration
+        weight = module.weight.data.clone()
+        has_bias = any(name == 'bias' for name, _ in module.named_parameters())
+        bias = module.bias.data.clone() if has_bias else None
+        quantizer = PQ(
+            weight,
+            block_size,
+            n_centroids=n_centroids,
+            n_iter=n_iter,
+            eps=eps,
+            max_tentatives=max_tentatives,
+            verbose=verbose,
+        )
+
+        # quantization performed on all GPUs with same seed
+        quantizer.encode()
+        centroids = quantizer.centroids.contiguous()
+        assignments = quantizer.assignments.contiguous()
+
+        # broadcast results to make sure weights are up-to-date
+        # (rank 0 is the source)
+        if dist.is_initialized():
+            dist.broadcast(centroids, 0)
+            dist.broadcast(assignments, 0)
+
+        # instantiate the quantized counterpart; the constructor arguments
+        # are read from the attributes of the original module
+        attrs = module.__dict__
+        if isinstance(module, nn.Linear):
+            quantized_module = PQLinear(
+                centroids,
+                assignments,
+                bias,
+                attrs["in_features"],
+                attrs["out_features"],
+            )
+        elif isinstance(module, nn.Embedding):
+            # NOTE(review): padding_idx and the other nn.Embedding options are
+            # not forwarded to PQEmbedding - confirm that this is intended
+            quantized_module = PQEmbedding(
+                centroids,
+                assignments,
+                attrs["num_embeddings"],
+                attrs["embedding_dim"],
+            )
+        elif isinstance(module, nn.Conv2d):
+            quantized_module = PQConv2d(
+                centroids,
+                assignments,
+                bias,
+                attrs["in_channels"],
+                attrs["out_channels"],
+                attrs["kernel_size"],
+                stride=attrs["stride"],
+                padding=attrs["padding"],
+                dilation=attrs["dilation"],
+                groups=attrs["groups"],
+                padding_mode=attrs["padding_mode"],
+            )
+        else:
+            raise ValueError(f"Module {module} not yet supported for quantization")
+
+        # replace layer by its quantized counterpart
+        attrsetter(layer)(model, quantized_module)
+
+        # update statistics
+        size_tracker.update(weight, block_size, n_centroids)
+
+    # return name of quantized layers
+    return quantized_layers
+
+
+def get_layers(model, filter_regexp):
+    """
+    Filters out the layers according to a regexp. Note that
+    we omit biases.
+
+    Args:
+        - model: a nn.Module
+        - filter_regexp: a regexp to filter the layers to keep
+          according to their name in model.named_parameters().
+          For instance, the regexp:
+
+             down_layers\\.[123456]\\.(conv[12]|identity\\.conv)
+
+          is keeping blocks down_layers from 1 to 6, and inside
+          each block is keeping conv1, conv2 and identity.conv.
+
+    Remarks:
+        - We add (module\\.)? at the beginning of the regexp to
+          account for the possible use of nn.parallel.DataParallel
+    """
+
+    # optional DataParallel prefix, then the user regexp
+    pattern = re.compile("(module\\.)?" + "(" + filter_regexp + ")")
+
+    layers = []
+    for name, _ in model.named_parameters():
+        # remove biases (any parameter name containing "bias")
+        if "bias" in name:
+            continue
+        # remove .weight in all other names (or .weight_orig is spectral norm)
+        name = name.replace(".weight_orig", "").replace(".weight", "")
+        # the regexp is anchored at the beginning of the name only
+        if pattern.match(name):
+            layers.append(name)
+
+    return layers
+
+
+def get_param(module, layer_name, param_config):
+    """
+    Given a quantization configuration, get the right parameter
+    for the module to be quantized.
+
+    Args:
+        - module: a nn.Module
+        - layer_name: the name of the layer
+        - param_config: a dict like
+          {
+              'Conv2d': ('kernel_size', {'(3, 3)': 9, '(1, 1)': 4}),
+              'Linear': ('in_features', {'*': 8})
+          }
+          For instance, all conv2d layers with kernel size 3x3 have
+          a block size of 9 and all Linear layers are quantized with
+          a block size of 8, irrespective of their size.
+
+    Returns:
+        - the entry of the configuration that applies to the module
+
+    Raises:
+        - KeyError: if the type of the module is not in param_config, or if
+          neither a specific entry nor the '*' wildcard applies to the module
+
+    Remarks:
+        - if the feature is 'fuzzy_name', the keys are looked for in layer_name:
+          the first key (in dict order) contained in layer_name is used.
+          In the following example, conv.expand layers will have a block
+          size of 9 while conv.reduce will have a block size of 4 and all
+          other layers will have a block size of 2.
+          {
+              'Conv2d': ('fuzzy_name', {'expand': 9, 'reduce': 4, '*': 2}),
+              'Linear': ('fuzzy_name', {'classifier': 8, 'projection': 4})
+          }
+    """
+
+    layer_type = module.__class__.__name__
+    if layer_type not in param_config:
+        raise KeyError(f"Layer type {layer_type} not in config for layer {module}")
+
+    feature, params = param_config[layer_type]
+
+    if feature == "fuzzy_name":
+        # keys of the configuration contained in the name of the layer
+        candidates = [name for name in params if name in layer_name]
+    else:
+        # exact lookup of the stringified attribute of the module
+        value = str(getattr(module, feature))
+        candidates = [value] if value in params else []
+
+    if candidates:
+        return params[candidates[0]]
+
+    # no specific entry: fall back on the wildcard, if any
+    if "*" in params:
+        return params["*"]
+    if feature == "fuzzy_name":
+        raise KeyError(f"name={layer_name} not in config for {module}")
+    raise KeyError(f"{feature}={value} not in config for layer {module}")
+
+
+class SizeTracker(object):
+    """
+    Running estimate of the size of a network compressed with iPQ.
+
+    Args:
+        - model: a nn.Module
+
+    Remarks:
+        - The compressed size is the sum of three components
+          for each layer in the network:
+              (1) Storing the centroids given by iPQ in fp16
+              (2) Storing the assignments of the blocks in int8
+              (3) Storing all non-compressed elements such as biases
+        - This cost is only valid if we use 256 centroids (then
+          indexing can indeed be done with int8).
+    """
+
+    def __init__(self, model):
+        self.model = model
+        self.n_quantized_layers = 0
+        self.size_index = 0
+        self.size_centroids = 0
+        self.size_non_compressed_model = self.compute_size()
+        self.size_non_quantized = self.size_non_compressed_model
+
+    def compute_size(self):
+        """
+        Returns the size of the model parameters in MB (4 bytes each).
+        """
+
+        # total number of scalar parameters
+        n_params = sum(p.numel() for _, p in self.model.named_parameters())
+
+        return n_params * 4 / 1024 / 1024
+
+    def update(self, W, block_size, n_centroids):
+        """
+        Accounts for one more layer, with weight W, being quantized.
+        """
+
+        # a block of block_size weights is indexed with log2(n_centroids) bits
+        index_bits_per_weight = np.log2(n_centroids) / block_size
+        self.n_quantized_layers += 1
+
+        # number of weights of the layer
+        n_weights = W.numel()
+
+        # assignments of the layer (in MB)
+        self.size_index += index_bits_per_weight * n_weights / 8 / 1024 / 1024
+
+        # centroids of the layer, stored in float16 (in MB)
+        self.size_centroids += n_centroids * block_size * 2 / 1024 / 1024
+
+        # the layer stops counting as non-quantized 4-byte weights (in MB)
+        self.size_non_quantized -= n_weights * 4 / 1024 / 1024
+
+    def __repr__(self):
+        quantized_total = (
+            self.size_index + self.size_centroids + self.size_non_quantized
+        )
+        ratio = self.size_non_compressed_model / quantized_total  # NOQA
+        return (
+            f"Non-compressed model size: {self.size_non_compressed_model:.2f} MB. "
+            f"After quantizing {self.n_quantized_layers} layers, size "
+            f"(indexing + centroids + other): {self.size_index:.2f} MB + "
+            f"{self.size_centroids:.2f} MB + {self.size_non_quantized:.2f} MB = "
+            f"{quantized_total:.2f} MB, compression ratio: {ratio:.2f}x"
+        )
+
+
+def attrsetter(*items):
+    """
+    Setter counterpart of operator.attrgetter: returns g(obj, val), which
+    assigns val to every dotted attribute path in items, starting at obj.
+    """
+
+    def g(obj, val):
+        for path in items:
+            # walk every component but the last, then set the last one
+            *parents, leaf = path.split(".")
+            owner = obj
+            for name in parents:
+                owner = getattr(owner, name)
+            setattr(owner, leaf, val)
+
+    return g
diff --git a/fairseq/modules/quantization/quantization_options.py b/fairseq/modules/quantization/quantization_options.py
new file mode 100644
index 0000000000000000000000000000000000000000..b46d682c0edaeaaf2a230e51d50da2a32d4bda98
--- /dev/null
+++ b/fairseq/modules/quantization/quantization_options.py
@@ -0,0 +1,44 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+def parse_config_yaml(yaml_data):
+    """Merge a parsed yaml quantization config into the default options.
+
+    Each top-level key present in yaml_data replaces the matching default
+    wholesale; keys absent from yaml_data keep their default value.
+    """
+    options = {
+        "n_centroids": {
+            "Linear": ["in_features", {"*": 256}],
+            "Embedding": ["embedding_dim", {"*": 256}],
+        },
+        "block_sizes": {
+            "Linear": ["fuzzy_name", {"fc": 8, "attn": 4, "emb": 4}],
+            "Embedding": ["fuzzy_name", {"emb": 8}],
+        },
+        "layers_to_quantize": [
+            "decoder\\.layers\\.\\d+\\.fc[12]",
+            "decoder\\.embed_tokens\\.embeddings\\.[012]\\.[01]",
+            "decoder\\.layers\\.\\d+\\.self_attn\\.(k_proj|v_proj|q_proj|out_proj)",
+        ],
+    }
+
+    # per-layer-type sections: {key, value} entries become (key, value) tuples
+    for section in ("n_centroids", "block_sizes"):
+        if section in yaml_data:
+            options[section] = {
+                layer: convert_yaml_to_tuple(layer_data)
+                for layer, layer_data in yaml_data[section].items()
+            }
+    if "layers_to_quantize" in yaml_data:
+        options["layers_to_quantize"] = yaml_data["layers_to_quantize"]
+    return options
+
+
+def convert_yaml_to_tuple(yaml_dictionary):
+    """Return the `key` and `value` entries of a yaml mapping as a 2-tuple."""
+    key, value = yaml_dictionary["key"], yaml_dictionary["value"]
+    return key, value
diff --git a/fairseq/modules/quantization/scalar/__init__.py b/fairseq/modules/quantization/scalar/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..143834f3d036780eb6844c82f0c6f2d10cfe2f61
--- /dev/null
+++ b/fairseq/modules/quantization/scalar/__init__.py
@@ -0,0 +1,6 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .utils import quantize_model_  # NOQA
diff --git a/fairseq/modules/quantization/scalar/modules/__init__.py b/fairseq/modules/quantization/scalar/modules/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ead4669611f95d3616ccf1cf2adba631281680e2
--- /dev/null
+++ b/fairseq/modules/quantization/scalar/modules/__init__.py
@@ -0,0 +1,9 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .qconv import IntConv2d  # NOQA
+from .qlinear import IntLinear  # NOQA
+from .qemb import IntEmbedding  # NOQA
+from .qact import ActivationQuantizer  # NOQA
diff --git a/fairseq/modules/quantization/scalar/modules/qact.py b/fairseq/modules/quantization/scalar/modules/qact.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9f79011c15a463d63eeca880904d68b751ef167
--- /dev/null
+++ b/fairseq/modules/quantization/scalar/modules/qact.py
@@ -0,0 +1,80 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+
+from ..ops import emulate_int
+
+
+class ActivationQuantizer:
+    """
+    Fake scalar quantization of the activations using a forward hook.
+
+    Args:
+        - module: a nn.Module for which we quantize the *post-activations*
+        - p: proportion of activations to quantize, set by default to 1
+        - update_step: to recompute quantization parameters
+        - bits: number of bits for quantization
+        - method: choose among {"tensor", "histogram", "channel"}
+        - clamp_threshold: stored on the instance; the hook below does not read it
+
+    Remarks:
+        - Parameters scale and zero_point are recomputed every update_step
+          forward pass to reduce the overhead
+        - For the list of quantization methods and number of bits, see ops.py
+        - To remove the hook from the module, simply call self.handle.remove()
+        - At test time, the activations are fully quantized
+        - We use the straight-through estimator so that the gradients
+          back-propagate nicely in the network, this is implemented with
+          the detach() trick
+        - The activations are clamped to the quantization range, i.e. to
+          [-scale * zero_point, scale * (2 ** bits - 1 - zero_point)]
+    """
+    def __init__(self, module, p=1, update_step=1000, bits=8,
+                 method="histogram", clamp_threshold=5):
+        self.module = module
+        self.p = p
+        self.update_step = update_step
+        self.counter = 0
+        self.bits = bits
+        self.method = method
+        self.clamp_threshold = clamp_threshold
+        self.handle = None
+        self.register_hook()
+
+    def register_hook(self):
+        # forward hook: its return value is used in place of the module output y
+        def quantize_hook(module, x, y):
+
+            # reset the quantization parameters every update_step forward passes
+            if self.counter % self.update_step == 0:
+                self.scale = None
+                self.zero_point = None
+            self.counter += 1
+
+            # train with QuantNoise and evaluate the fully quantized network
+            p = self.p if self.module.training else 1
+
+            # quantize activations (scale and zero_point are returned for reuse)
+            y_q, self.scale, self.zero_point = emulate_int(
+                y.detach(),
+                bits=self.bits,
+                method=self.method,
+                scale=self.scale,
+                zero_point=self.zero_point,
+            )
+
+            # mask is 1 with probability 1 - p; the noise is zeroed where it is 1
+            mask = torch.zeros_like(y)
+            mask.bernoulli_(1 - p)
+            noise = (y_q - y).masked_fill(mask.bool(), 0)
+
+            # using straight-through estimator (STE): the noise is detached
+            clamp_low = - self.scale * self.zero_point
+            clamp_high = self.scale * (2 ** self.bits - 1 - self.zero_point)
+            return torch.clamp(y, clamp_low.item(), clamp_high.item()) + noise.detach()
+
+        # register hook and keep the handle so that it can be removed later
+        self.handle = self.module.register_forward_hook(quantize_hook)
diff --git a/fairseq/modules/quantization/scalar/modules/qconv.py b/fairseq/modules/quantization/scalar/modules/qconv.py
new file mode 100644
index 0000000000000000000000000000000000000000..d718c9b90d7610223a6ad8211edda6f0c31cfd41
--- /dev/null
+++ b/fairseq/modules/quantization/scalar/modules/qconv.py
@@ -0,0 +1,146 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import torch.nn.functional as F
+from torch.nn.modules.conv import _ConvNd
+from torch.nn.modules.utils import _pair
+
+from ..ops import emulate_int
+
+
+class IntConv2d(_ConvNd):
+    """
+    Quantized counterpart of the nn.Conv2d module that applies QuantNoise during training.
+
+    Args:
+        - standard nn.Conv2d parameters
+        - p: amount of noise to inject (0 = no quantization, 1 = quantize all the weights)
+        - bits: number of bits
+        - method: choose among {"tensor", "histogram", "channel"}
+        - update_step: recompute scale and zero_point every update_steps iterations
+
+    Remarks:
+        - We use the straight-through estimator so that the gradients
+          back-propagate nicely in the network, this is implemented with
+          the detach() trick
+        - Parameters scale and zero_point are recomputed every update_step
+          forward pass to reduce the overhead
+        - At test time, the weights are fully quantized
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        padding=0,
+        dilation=1,
+        groups=1,
+        bias=True,
+        padding_mode="zeros",
+        p=0,
+        bits=8,
+        method="histogram",
+        update_step=1000,
+    ):
+        kernel_size = _pair(kernel_size)
+        stride = _pair(stride)
+        padding = _pair(padding)
+        dilation = _pair(dilation)
+        super(IntConv2d, self).__init__(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            padding,
+            dilation,
+            False,
+            _pair(0),
+            groups,
+            bias,
+            padding_mode,
+        )
+
+        # quantization parameters
+        self.p = p
+        self.bits = bits
+        self.method = method
+        self.update_step = update_step
+        self.counter = 0
+
+    def _conv_forward(self, input, weight):
+        if self.padding_mode != "zeros":  # NOTE(review): _padding_repeated_twice is private to _ConvNd; verify torch version
+            return F.conv2d(
+                F.pad(input, self._padding_repeated_twice, mode=self.padding_mode),
+                weight,
+                self.bias,
+                self.stride,
+                _pair(0),
+                self.dilation,
+                self.groups,
+            )
+        return F.conv2d(
+            input,
+            weight,
+            self.bias,
+            self.stride,
+            self.padding,
+            self.dilation,
+            self.groups,
+        )
+
+    def forward(self, input):
+        # train with QuantNoise and evaluate the fully quantized network
+        p = self.p if self.training else 1
+
+        # reset the quantization parameters every update_step forward passes
+        if self.counter % self.update_step == 0:
+            self.scale = None
+            self.zero_point = None
+        self.counter += 1
+
+        # quantize weight (scale and zero_point are returned for reuse)
+        weight_quantized, self.scale, self.zero_point = emulate_int(
+            self.weight.detach(),
+            bits=self.bits,
+            method=self.method,
+            scale=self.scale,
+            zero_point=self.zero_point,
+        )
+
+        # mask is 1 with probability 1 - p; the noise is zeroed where it is 1
+        mask = torch.zeros_like(self.weight)
+        mask.bernoulli_(1 - p)
+        noise = (weight_quantized - self.weight).masked_fill(mask.bool(), 0)
+
+        # using straight-through estimator (STE): the noise is detached
+        clamp_low = - self.scale * self.zero_point
+        clamp_high = self.scale * (2 ** self.bits - 1 - self.zero_point)
+        weight = torch.clamp(self.weight, clamp_low.item(), clamp_high.item()) + noise.detach()
+
+        # return output
+        output = self._conv_forward(input, weight)
+        return output
+
+    def extra_repr(self):
+        return (
+            "in_channels={}, out_channels={}, kernel_size={}, stride={}, "
+            "padding={}, dilation={}, groups={}, bias={}, quant_noise={}, "
+            "bits={}, method={}".format(
+                self.in_channels,
+                self.out_channels,
+                self.kernel_size,
+                self.stride,
+                self.padding,
+                self.dilation,
+                self.groups,
+                self.bias is not None,
+                self.p,
+                self.bits,
+                self.method,
+            )
+        )
diff --git a/fairseq/modules/quantization/scalar/modules/qemb.py b/fairseq/modules/quantization/scalar/modules/qemb.py
new file mode 100644
index 0000000000000000000000000000000000000000..835b2782a756e83d3990d321fe0e43eadfd071d2
--- /dev/null
+++ b/fairseq/modules/quantization/scalar/modules/qemb.py
@@ -0,0 +1,132 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from ..ops import emulate_int
+
+
+class IntEmbedding(nn.Module):
+    """
+    Quantized counterpart of the nn.Embedding module that applies QuantNoise during training.
+
+    Args:
+        - num_embeddings: number of tokens
+        - embedding_dim: embedding dimension
+        - p: amount of noise to inject (0 = no quantization, 1 = quantize all the weights)
+        - bits: number of bits
+        - method: choose among {"tensor", "histogram", "channel"}
+        - update_step: recompute scale and zero_point every update_steps iterations
+
+    Remarks:
+        - We use the straight-through estimator so that the gradients
+          back-propagate nicely in the network, this is implemented with
+          the detach() trick
+        - Parameters scale and zero_point are recomputed every update_step
+          forward pass to reduce the overhead
+        - At test time, the weights are fully quantized
+    """
+
+    def __init__(
+        self,
+        num_embeddings,
+        embedding_dim,
+        padding_idx=None,
+        max_norm=None,
+        norm_type=2.,
+        scale_grad_by_freq=False,
+        sparse=False,
+        _weight=None,
+        p=0,
+        update_step=1000,
+        bits=8,
+        method="histogram",
+    ):
+        super(IntEmbedding, self).__init__()
+        self.num_embeddings = num_embeddings
+        self.embedding_dim = embedding_dim
+        if padding_idx is not None:
+            if padding_idx > 0:
+                assert padding_idx < self.num_embeddings, 'Padding_idx must be within num_embeddings'
+            elif padding_idx < 0:
+                assert padding_idx >= -self.num_embeddings, 'Padding_idx must be within num_embeddings'
+                padding_idx = self.num_embeddings + padding_idx
+        self.padding_idx = padding_idx
+        self.max_norm = max_norm
+        self.norm_type = norm_type
+        self.scale_grad_by_freq = scale_grad_by_freq
+        if _weight is None:
+            self.weight = nn.Parameter(torch.Tensor(num_embeddings, embedding_dim))
+            self.reset_parameters()
+        else:
+            assert list(_weight.shape) == [num_embeddings, embedding_dim], \
+                'Shape of weight does not match num_embeddings and embedding_dim'
+            self.weight = nn.Parameter(_weight)
+        self.sparse = sparse
+
+        # quantization parameters
+        self.p = p
+        self.bits = bits
+        self.method = method
+        self.update_step = update_step
+        self.counter = 0
+
+    def reset_parameters(self):
+        nn.init.normal_(self.weight)
+        if self.padding_idx is not None:
+            with torch.no_grad():
+                self.weight[self.padding_idx].fill_(0)
+
+    def forward(self, input):
+        # train with QuantNoise and evaluate the fully quantized network
+        p = self.p if self.training else 1
+
+        # reset the quantization parameters every update_step forward passes
+        if self.counter % self.update_step == 0:
+            self.scale = None
+            self.zero_point = None
+        self.counter += 1
+
+        # quantize weight (scale and zero_point are returned for reuse)
+        weight_quantized, self.scale, self.zero_point = emulate_int(
+            self.weight.detach(),
+            bits=self.bits,
+            method=self.method,
+            scale=self.scale,
+            zero_point=self.zero_point,
+        )
+
+        # mask is 1 with probability 1 - p; the noise is zeroed where it is 1
+        mask = torch.zeros_like(self.weight)
+        mask.bernoulli_(1 - p)
+        noise = (weight_quantized - self.weight).masked_fill(mask.bool(), 0)
+
+        # using straight-through estimator (STE): the noise is detached
+        clamp_low = - self.scale * self.zero_point
+        clamp_high = self.scale * (2 ** self.bits - 1 - self.zero_point)
+        weight = torch.clamp(self.weight, clamp_low.item(), clamp_high.item()) + noise.detach()
+
+        # return output
+        output = F.embedding(
+            input, weight, self.padding_idx, self.max_norm,
+            self.norm_type, self.scale_grad_by_freq, self.sparse)
+        return output
+
+    def extra_repr(self):
+        s = '{num_embeddings}, {embedding_dim}'
+        if self.padding_idx is not None:
+            s += ', padding_idx={padding_idx}'
+        if self.max_norm is not None:
+            s += ', max_norm={max_norm}'
+        if self.norm_type != 2:
+            s += ', norm_type={norm_type}'
+        if self.scale_grad_by_freq is not False:
+            s += ', scale_grad_by_freq={scale_grad_by_freq}'
+        if self.sparse is not False:
+            s += ', sparse=True'
+        s += ', quant_noise={p}, bits={bits}, method={method}'  # leading ', ' keeps fields separated
+        return s.format(**self.__dict__)
diff --git a/fairseq/modules/quantization/scalar/modules/qlinear.py b/fairseq/modules/quantization/scalar/modules/qlinear.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d4b27dc6cfbeab0115272b80a0629fa6784e258
--- /dev/null
+++ b/fairseq/modules/quantization/scalar/modules/qlinear.py
@@ -0,0 +1,110 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from ..ops import emulate_int
+
+
+class IntLinear(nn.Module):
+    """
+    Quantized counterpart of the nn.Linear module that applies QuantNoise during training.
+
+    Args:
+        - in_features: input features
+        - out_features: output features
+        - bias: bias or not
+        - p: amount of noise to inject (0 = no quantization, 1 = quantize all the weights)
+        - bits: number of bits
+        - method: choose among {"tensor", "histogram", "channel"}
+        - update_step: recompute scale and zero_point every update_steps iterations
+
+    Remarks:
+        - We use the straight-through estimator so that the gradients
+          back-propagate nicely in the network, this is implemented with
+          the detach() trick.
+        - Parameters scale and zero_point are recomputed every update_step
+          forward pass to reduce the overhead
+        - At test time, the weights are fully quantized
+    """
+
+    def __init__(
+        self,
+        in_features,
+        out_features,
+        bias=True,
+        p=0,
+        update_step=3000,
+        bits=8,
+        method="histogram",
+    ):
+        super(IntLinear, self).__init__()
+        self.in_features = int(in_features)
+        self.out_features = int(out_features)
+        self.weight = torch.nn.Parameter(torch.Tensor(out_features, in_features))
+        self.chosen_bias = bias
+        if self.chosen_bias:
+            self.bias = torch.nn.Parameter(torch.Tensor(out_features))
+        else:
+            self.register_parameter("bias", None)
+        self.reset_parameters()
+
+        # quantization parameters
+        self.p = p
+        self.bits = bits
+        self.method = method
+        self.update_step = update_step
+        self.counter = 0
+
+    def reset_parameters(self):
+        nn.init.xavier_uniform_(self.weight)
+        if self.chosen_bias:
+            nn.init.constant_(self.bias, 0.0)
+        return
+
+    def forward(self, input):
+        # train with QuantNoise and evaluate the fully quantized network
+        p = self.p if self.training else 1
+
+        # reset the quantization parameters every update_step forward passes
+        if self.counter % self.update_step == 0:
+            self.scale = None
+            self.zero_point = None
+        self.counter += 1
+
+        # quantize weight (scale and zero_point are returned for reuse)
+        weight_quantized, self.scale, self.zero_point = emulate_int(
+            self.weight.detach(),
+            bits=self.bits,
+            method=self.method,
+            scale=self.scale,
+            zero_point=self.zero_point,
+        )
+
+        # mask is 1 with probability 1 - p; the noise is zeroed where it is 1
+        mask = torch.zeros_like(self.weight)
+        mask.bernoulli_(1 - p)
+        noise = (weight_quantized - self.weight).masked_fill(mask.bool(), 0)
+
+        # using straight-through estimator (STE): the noise is detached
+        clamp_low = - self.scale * self.zero_point
+        clamp_high = self.scale * (2 ** self.bits - 1 - self.zero_point)
+        weight = torch.clamp(self.weight, clamp_low.item(), clamp_high.item()) + noise.detach()
+
+        # return output
+        output = F.linear(input, weight, self.bias)
+        return output
+
+    def extra_repr(self):
+        return "in_features={}, out_features={}, bias={}, quant_noise={}, bits={}, method={}".format(
+            self.in_features,
+            self.out_features,
+            self.bias is not None,
+            self.p,
+            self.bits,
+            self.method,
+        )
diff --git a/fairseq/modules/quantization/scalar/ops.py b/fairseq/modules/quantization/scalar/ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..90bc737cc840821504d1a894070482ac952dced4
--- /dev/null
+++ b/fairseq/modules/quantization/scalar/ops.py
@@ -0,0 +1,47 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+
+
+def emulate_int(w, bits, method, scale=None, zero_point=None):
+    quantizer = globals()[f"emulate_int{bits}_{method}"]  # module-level function picked by name
+    return quantizer(w, scale=scale, zero_point=zero_point)
+
+
+def quantize(w, scale, zero_point):
+    return (torch.round(w / scale + zero_point).clamp(0, 255) - zero_point) * scale
+
+
+def emulate_int8_histogram(w, scale=None, zero_point=None):
+    """Emulate int8 quantization of w; returns (quantized w, scale, zero_point).
+    When scale is None, a HistogramObserver estimates both and they are moved to w's device."""
+    if scale is None:
+        obs = torch.quantization.observer.HistogramObserver()
+        obs(w.float())
+        scale, zero_point = (q.to(w.device).type_as(w) for q in obs.calculate_qparams())
+    return quantize(w, scale, zero_point), scale, zero_point
+
+
+def emulate_int8_channel(w, scale=None, zero_point=None):
+    """Emulate int8 quantization of w, per channel of the last axis (qparams follow w's device)."""
+    if scale is None:
+        obs = torch.quantization.observer.PerChannelMinMaxObserver(
+            ch_axis=-1, qscheme=torch.per_channel_symmetric
+        )
+        obs(w)
+        scale, zero_point, ch_axis = obs.get_qparams()
+        scale, zero_point = scale.to(w.device).type_as(w), zero_point.to(w.device).type_as(w)
+    return quantize(w, scale, zero_point), scale, zero_point
+
+
+def emulate_int8_tensor(w, scale=None, zero_point=None):
+    """Emulate int8 quantization of w; returns (quantized w, scale, zero_point).
+    When scale is None, a MinMaxObserver estimates both and they are moved to w's device."""
+    if scale is None:
+        obs = torch.quantization.observer.MinMaxObserver()
+        obs(w)
+        scale, zero_point = (q.to(w.device).type_as(w) for q in obs.calculate_qparams())
+    return quantize(w, scale, zero_point), scale, zero_point
diff --git a/fairseq/modules/quantization/scalar/utils.py b/fairseq/modules/quantization/scalar/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..4071f7b80aa101ad60702e3a187c85880c37a62b
--- /dev/null
+++ b/fairseq/modules/quantization/scalar/utils.py
@@ -0,0 +1,67 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+from operator import attrgetter
+
+import torch.nn as nn
+import torch.distributed as dist
+
+from ..pq.utils import get_layers, attrsetter
+from .modules import IntConv2d, IntLinear, IntEmbedding, ActivationQuantizer
+
+
+MAPPING = {nn.Linear: IntLinear, nn.Embedding: IntEmbedding, nn.Conv2d: IntConv2d}
+
+
+def quantize_model_(model, p=0.2, bits=8, update_step=3000):
+    """
+    Replaces, in place, the supported modules of model with their scalar
+    quantized counterpart and registers hooks to quantize the
+    post-activations of those modules.
+
+    Args:
+        - model: a nn.Module
+        - p: amount of noise (0 for no noise, 1 to quantize all the weights/activations)
+        - bits: number of bits
+        - update_step: update quantization parameters every update_step steps
+
+    Returns:
+        the names of all candidate layers, including the ones skipped as
+        unsupported.
+    """
+    # only the master process logs
+    is_master_process = (not dist.is_initialized()) or dist.get_rank() == 0
+
+    # candidate layers: every parameter name with its ".weight" part removed
+    quantized_layers = get_layers(model, "(.*?)")
+
+    for layer in quantized_layers:
+        module = attrgetter(layer)(model)
+        if is_master_process:
+            logging.info(f"Quantizing layer {layer} with bits={bits} and QuantNoise={p}")
+
+        # exact type lookup: a subclass of a supported module (which passes
+        # isinstance but is not a MAPPING key) is skipped instead of raising
+        QuantizedModule = MAPPING.get(type(module))
+        if QuantizedModule is None:
+            if is_master_process:
+                logging.info(f"Module {module} not yet supported for quantization")
+            continue
+
+        # build the counterpart without calling __init__; it reuses module's attributes.
+        # __dict__ is copied so that the original module is left untouched
+        quantized_module = QuantizedModule.__new__(QuantizedModule)
+        params = dict(module.__dict__)
+        params.update(p=p, update_step=update_step, bits=bits, method="histogram", counter=0)
+        quantized_module.__dict__.update(params)
+
+        # activation quantization: the constructor registers the forward hook
+        ActivationQuantizer(quantized_module, p=0, bits=bits, method="histogram")
+
+        # replace layer by its quantized counterpart
+        attrsetter(layer)(model, quantized_module)
+
+    return quantized_layers
diff --git a/fairseq/modules/same_pad.py b/fairseq/modules/same_pad.py
new file mode 100644
index 0000000000000000000000000000000000000000..b46f94d6357888bde46035d8fcd57ceff5d24a88
--- /dev/null
+++ b/fairseq/modules/same_pad.py
@@ -0,0 +1,18 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from torch import nn
+
+
+class SamePad(nn.Module):
+    """Removes the last element along the third axis of the input when kernel_size is even."""
+
+    def __init__(self, kernel_size):
+        super().__init__()
+        self.remove = kernel_size % 2 == 0
+
+    def forward(self, x):
+        return x[:, :, :-1] if self.remove else x
diff --git a/fairseq/modules/scalar_bias.py b/fairseq/modules/scalar_bias.py
new file mode 100644
index 0000000000000000000000000000000000000000..c96247c75914fabb8a2b7ff731bb82b588f72690
--- /dev/null
+++ b/fairseq/modules/scalar_bias.py
@@ -0,0 +1,31 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+#
+
+import torch
+
+
+class ScalarBias(torch.autograd.Function):
+    """
+    Prepends one slice filled with bias_init along dim, used in self-attention
+    to let the model optionally attend to this slice instead of the past
+    """
+
+    @staticmethod
+    def forward(ctx, input, dim, bias_init):
+        padded_size = list(input.size())
+        padded_size[dim] += 1
+        output = input.new(*padded_size).fill_(bias_init)
+        output.narrow(dim, 1, padded_size[dim] - 1).copy_(input)  # index 0 keeps bias_init
+        ctx.dim = dim
+        return output
+
+    @staticmethod
+    def backward(ctx, grad):
+        return grad.narrow(ctx.dim, 1, grad.size(ctx.dim) - 1), None, None
+
+
+def scalar_bias(input, dim, bias_init=0):  # functional form of ScalarBias, bias_init defaults to 0
+    return ScalarBias.apply(input, dim, bias_init)
diff --git a/fairseq/modules/sinusoidal_positional_embedding.py b/fairseq/modules/sinusoidal_positional_embedding.py
new file mode 100644
index 0000000000000000000000000000000000000000..857830faf7cb64950021947e2c5babcb906c48d3
--- /dev/null
+++ b/fairseq/modules/sinusoidal_positional_embedding.py
@@ -0,0 +1,105 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+from typing import Any, Optional
+
+import torch
+import torch.onnx.operators
+from fairseq import utils
+from torch import Tensor, nn
+
+
+class SinusoidalPositionalEmbedding(nn.Module):
+    """This module produces sinusoidal positional embeddings of any length.
+
+    Padding symbols are ignored.
+    """
+
+    def __init__(self, embedding_dim, padding_idx, init_size=1024):
+        super().__init__()
+        self.embedding_dim = embedding_dim
+        self.padding_idx = padding_idx
+        self.weights = SinusoidalPositionalEmbedding.get_embedding(
+            init_size, embedding_dim, padding_idx
+        )
+        self.onnx_trace = False
+        self.register_buffer("_float_tensor", torch.FloatTensor(1))
+        self.max_positions = int(1e5)
+
+    def prepare_for_onnx_export_(self):
+        self.onnx_trace = True
+
+    @staticmethod
+    def get_embedding(
+        num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None
+    ):
+        """Build a (num_embeddings, embedding_dim) sinusoidal table.
+
+        This matches the implementation in tensor2tensor, but differs slightly
+        from the description in Section 3.5 of "Attention Is All You Need".
+        Each row holds the sines first, then the cosines; the padding_idx row,
+        when given, is zeroed.
+        """
+        half_dim = embedding_dim // 2
+        # Clamp the divisor: half_dim == 1 would otherwise divide by zero.
+        emb = math.log(10000) / max(half_dim - 1, 1)
+        emb = torch.exp(torch.arange(half_dim, dtype=torch.float) * -emb)
+        positions = torch.arange(num_embeddings, dtype=torch.float).unsqueeze(1)
+        emb = positions * emb.unsqueeze(0)
+        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1)
+        if embedding_dim % 2 == 1:
+            # odd width: append one zero column to reach embedding_dim
+            emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
+        if padding_idx is not None:
+            emb[padding_idx, :] = 0
+        return emb
+
+    def forward(
+        self,
+        input,
+        incremental_state: Optional[Any] = None,
+        timestep: Optional[Tensor] = None,
+        positions: Optional[Any] = None,
+    ):
+        """Input is expected to be of size [bsz x seqlen]."""
+        bspair = torch.onnx.operators.shape_as_tensor(input)
+        bsz, seq_len = bspair[0], bspair[1]
+        max_pos = self.padding_idx + 1 + seq_len
+        if self.weights is None or max_pos > self.weights.size(0):
+            # recompute/expand embeddings if needed
+            self.weights = SinusoidalPositionalEmbedding.get_embedding(
+                max_pos, self.embedding_dim, self.padding_idx
+            )
+        self.weights = self.weights.to(self._float_tensor)
+
+        if incremental_state is not None:
+            # positions is the same for every token when decoding a single step
+            pos = timestep.view(-1)[0] + 1 if timestep is not None else seq_len
+            if self.onnx_trace:
+                return (
+                    self.weights.index_select(index=self.padding_idx + pos, dim=0)
+                    .unsqueeze(1)
+                    .repeat(bsz, 1, 1)
+                )
+            return self.weights[self.padding_idx + pos, :].expand(bsz, 1, -1)
+
+        positions = utils.make_positions(
+            input, self.padding_idx, onnx_trace=self.onnx_trace
+        )
+        if self.onnx_trace:
+            flat_embeddings = self.weights.detach().index_select(0, positions.view(-1))
+            embedding_shape = torch.cat(
+                (bsz.view(1), seq_len.view(1), torch.tensor([-1], dtype=torch.long))
+            )
+            embeddings = torch.onnx.operators.reshape_from_tensor_shape(
+                flat_embeddings, embedding_shape
+            )
+            return embeddings
+        return (
+            self.weights.index_select(0, positions.view(-1))
+            .view(bsz, seq_len, -1)
+            .detach()
+        )
diff --git a/fairseq/modules/sparse_multihead_attention.py b/fairseq/modules/sparse_multihead_attention.py
new file mode 100644
index 0000000000000000000000000000000000000000..61430195c253fc0346814f7ba92e2a588553e99d
--- /dev/null
+++ b/fairseq/modules/sparse_multihead_attention.py
@@ -0,0 +1,104 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+import torch
+from .multihead_attention import MultiheadAttention
+
+
+class SparseMultiheadAttention(MultiheadAttention):
+    """ Sparse Multi-Headed Attention.
+
+    "Generating Long Sequences with Sparse Transformers". Implements
+    fixed factorized self attention, where l=stride and c=expressivity.
+    A(1) includes all words in the stride window and A(2) takes a summary of c
+    words from the end of each stride window.
+    If is_bidirectional=False, we do not include any words past the current word,
+    as in the paper.
+    """
+
+    def __init__(self, embed_dim, num_heads, kdim=None, vdim=None, dropout=0., bias=True,
+                 add_bias_kv=False, add_zero_attn=False, self_attention=False,
+                 encoder_decoder_attention=False, stride=32, expressivity=8, is_bidirectional=True):
+
+        super().__init__(
+            embed_dim, num_heads, kdim, vdim, dropout, bias, add_bias_kv,
+            add_zero_attn, self_attention, encoder_decoder_attention
+        )
+
+        self.is_bidirectional = is_bidirectional
+        self.stride = stride
+        self.expressivity = expressivity
+        assert(self.stride > 0 and self.stride >= self.expressivity)
+
+    # Used for Ai(2) calculations - beginning of [l-c, l] range
+    def compute_checkpoint(self, word_index):
+        if word_index % self.stride == 0 and word_index != 0:
+            checkpoint_index = word_index - self.expressivity
+        else:
+            checkpoint_index = (
+                math.floor(word_index / self.stride) * self.stride
+                + self.stride - self.expressivity
+            )
+        return checkpoint_index
+
+    # Computes Ai(2)
+    def compute_subset_summaries(self, absolute_max):
+        checkpoint_index = self.compute_checkpoint(0)
+        subset_two = set()
+        while checkpoint_index <= absolute_max-1:
+            summary = set(range(checkpoint_index, min(
+                checkpoint_index+self.expressivity+1, absolute_max)
+            ))
+            subset_two = subset_two.union(summary)
+            checkpoint_index = self.compute_checkpoint(checkpoint_index+self.stride)
+        return subset_two
+
+    # Sparse Transformer Fixed Attention Pattern: https://arxiv.org/pdf/1904.10509.pdf
+    def compute_fixed_attention_subset(self, word_index, tgt_len):
+        # +1s account for range function; [min, max) -> [min, max]
+        if not self.is_bidirectional:
+            absolute_max = word_index + 1
+        else:
+            absolute_max = tgt_len
+
+        # Subset 1 - whole window
+        rounded_index = math.floor((word_index + self.stride) / self.stride) * self.stride
+        if word_index % self.stride == 0 and word_index != 0:
+            subset_one = set(range(word_index-self.stride, min(absolute_max, word_index+1)))
+        else:
+            subset_one = set(range(max(0, rounded_index - self.stride), min(
+                absolute_max, rounded_index+1))
+            )
+
+        # Subset 2 - summary per window
+        # If bidirectional, subset 2 is the same for every index
+        subset_two = set()
+        if not self.is_bidirectional:
+            subset_two = self.compute_subset_summaries(absolute_max)
+
+        return subset_one.union(subset_two)
+
+    # Compute sparse mask - if bidirectional, can pre-compute and store
+    def buffered_sparse_mask(self, tensor, tgt_len, src_len):
+        assert(tgt_len > self.stride)
+        sparse_mask = torch.empty((tgt_len, src_len)).float().fill_(float('-inf'))
+
+        # If bidirectional, subset 2 is the same for every index
+        subset_summaries = set()
+        if self.is_bidirectional:
+            subset_summaries = self.compute_subset_summaries(tgt_len)
+
+        for i in range(tgt_len):
+            fixed_attention_subset = self.compute_fixed_attention_subset(i, tgt_len)
+            fixed_attention_subset = fixed_attention_subset.union(subset_summaries)
+            included_word_indices = torch.LongTensor(list(fixed_attention_subset))
+            sparse_mask[i].index_fill_(0, included_word_indices, 0)
+        return sparse_mask.type_as(tensor)
+
+    def apply_sparse_mask(self, attn_weights, tgt_len, src_len, bsz):
+        """Add the 0/-inf sparse mask to attn_weights in place and return it."""
+        mask = self.buffered_sparse_mask(attn_weights, tgt_len, src_len).unsqueeze(0)
+        return attn_weights.add_(mask.expand(bsz * self.num_heads, tgt_len, src_len))
diff --git a/fairseq/modules/sparse_transformer_sentence_encoder.py b/fairseq/modules/sparse_transformer_sentence_encoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..3d50d5a88289d8d24530af300d63cbae829f110f
--- /dev/null
+++ b/fairseq/modules/sparse_transformer_sentence_encoder.py
@@ -0,0 +1,79 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch.nn as nn
+from fairseq.modules import TransformerSentenceEncoder
+from fairseq.modules.sparse_transformer_sentence_encoder_layer import SparseTransformerSentenceEncoderLayer
+
+
+class SparseTransformerSentenceEncoder(TransformerSentenceEncoder):
+    """
+    Sparse implementation of the TransformerSentenceEncoder
+    - see SparseMultiheadAttention
+    """
+
+    def __init__(
+        self,
+        padding_idx: int,
+        vocab_size: int,
+        num_encoder_layers: int = 6,
+        embedding_dim: int = 768,
+        ffn_embedding_dim: int = 3072,
+        num_attention_heads: int = 8,
+        dropout: float = 0.1,
+        attention_dropout: float = 0.1,
+        activation_dropout: float = 0.1,
+        max_seq_len: int = 256,
+        num_segments: int = 2,
+        use_position_embeddings: bool = True,
+        offset_positions_by_padding: bool = True,
+        encoder_normalize_before: bool = False,
+        apply_bert_init: bool = False,
+        activation_fn: str = "relu",
+        learned_pos_embedding: bool = True,
+        embed_scale: float = None,
+        freeze_embeddings: bool = False,
+        n_trans_layers_to_freeze: int = 0,
+        export: bool = False,
+        is_bidirectional: bool = True,
+        stride: int = 32,
+        expressivity: int = 8,
+    ) -> None:
+
+        super().__init__(
+            padding_idx, vocab_size, num_encoder_layers, embedding_dim,
+            ffn_embedding_dim, num_attention_heads, dropout, attention_dropout,
+            activation_dropout, max_seq_len, num_segments, use_position_embeddings,
+            offset_positions_by_padding, encoder_normalize_before, apply_bert_init,
+            activation_fn, learned_pos_embedding, embed_scale, freeze_embeddings,
+            n_trans_layers_to_freeze, export
+        )
+
+        self.layers = nn.ModuleList(
+            [
+                SparseTransformerSentenceEncoderLayer(
+                    embedding_dim=self.embedding_dim,
+                    ffn_embedding_dim=ffn_embedding_dim,
+                    num_attention_heads=num_attention_heads,
+                    dropout=dropout,
+                    attention_dropout=attention_dropout,
+                    activation_dropout=activation_dropout,
+                    activation_fn=activation_fn,
+                    export=export,
+                    is_bidirectional=is_bidirectional,
+                    stride=stride,
+                    expressivity=expressivity,
+                )
+                for _ in range(num_encoder_layers)
+            ]
+        )
+
+        def freeze_module_params(m):
+            if m is not None:
+                for p in m.parameters():
+                    p.requires_grad = False
+
+        for layer in range(n_trans_layers_to_freeze):
+            freeze_module_params(self.layers[layer])
diff --git a/fairseq/modules/sparse_transformer_sentence_encoder_layer.py b/fairseq/modules/sparse_transformer_sentence_encoder_layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..21c2fe4d5ae0b4279b13ff355365e698300c7c90
--- /dev/null
+++ b/fairseq/modules/sparse_transformer_sentence_encoder_layer.py
@@ -0,0 +1,45 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from fairseq.modules import TransformerSentenceEncoderLayer
+from fairseq.modules.sparse_multihead_attention import SparseMultiheadAttention
+
+
+class SparseTransformerSentenceEncoderLayer(TransformerSentenceEncoderLayer):
+    """
+    Implements a Sparse Transformer Encoder Layer (see SparseMultiheadAttention)
+    """
+
+    def __init__(
+        self,
+        embedding_dim: int = 768,
+        ffn_embedding_dim: int = 3072,
+        num_attention_heads: int = 8,
+        dropout: float = 0.1,
+        attention_dropout: float = 0.1,
+        activation_dropout: float = 0.1,
+        activation_fn: str = 'relu',
+        export: bool = False,
+        is_bidirectional: bool = True,
+        stride: int = 32,
+        expressivity: int = 8,
+    ) -> None:
+
+        super().__init__(
+            embedding_dim, ffn_embedding_dim, num_attention_heads, dropout,
+            attention_dropout, activation_dropout, activation_fn, export
+        )
+
+        self.self_attn = SparseMultiheadAttention(
+            self.embedding_dim,
+            num_attention_heads,
+            dropout=attention_dropout,
+            add_bias_kv=False,
+            add_zero_attn=False,
+            self_attention=True,
+            is_bidirectional=is_bidirectional,
+            stride=stride,
+            expressivity=expressivity,
+        )
diff --git a/fairseq/modules/transformer_layer.py b/fairseq/modules/transformer_layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..037d8e88aeb0e82e224fe3290ee337bb89e48e48
--- /dev/null
+++ b/fairseq/modules/transformer_layer.py
@@ -0,0 +1,391 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Dict, List, Optional
+
+import torch
+import torch.nn as nn
+from fairseq import utils
+from fairseq.modules import LayerNorm, MultiheadAttention
+from fairseq.modules.quant_noise import quant_noise
+from fairseq.modules.fairseq_dropout import FairseqDropout
+from torch import Tensor
+
+
+class TransformerEncoderLayer(nn.Module):
+    """Encoder layer block.
+
+    In the original paper each operation (multi-head attention or FFN) is
+    postprocessed with: `dropout -> add residual -> layernorm`. In the
+    tensor2tensor code they suggest that learning is more robust when
+    preprocessing each layer with layernorm and postprocessing with:
+    `dropout -> add residual`. We default to the approach in the paper, but the
+    tensor2tensor approach can be enabled by setting
+    *args.encoder_normalize_before* to ``True``.
+
+    Args:
+        args (argparse.Namespace): parsed command-line arguments
+    """
+
+    def __init__(self, args):
+        super().__init__()
+        self.embed_dim = args.encoder_embed_dim
+        self.quant_noise = getattr(args, "quant_noise_pq", 0)
+        self.quant_noise_block_size = getattr(args, "quant_noise_pq_block_size", 8)
+        self.self_attn = self.build_self_attention(self.embed_dim, args)
+        self.self_attn_layer_norm = LayerNorm(self.embed_dim)
+        self.dropout_module = FairseqDropout(args.dropout, module_name=self.__class__.__name__)
+        self.activation_fn = utils.get_activation_fn(
+            activation=getattr(args, "activation_fn", "relu")
+        )
+        activation_dropout_p = getattr(args, "activation_dropout", 0)
+        if activation_dropout_p == 0:
+            # for backwards compatibility with models that use args.relu_dropout
+            activation_dropout_p = getattr(args, "relu_dropout", 0)
+        self.activation_dropout_module = FairseqDropout(
+            float(activation_dropout_p), module_name=self.__class__.__name__
+        )
+        self.normalize_before = args.encoder_normalize_before
+        self.fc1 = self.build_fc1(
+            self.embed_dim, args.encoder_ffn_embed_dim, self.quant_noise, self.quant_noise_block_size
+        )
+        self.fc2 = self.build_fc2(
+            args.encoder_ffn_embed_dim, self.embed_dim, self.quant_noise, self.quant_noise_block_size
+        )
+
+        self.final_layer_norm = LayerNorm(self.embed_dim)
+
+    def build_fc1(self, input_dim, output_dim, q_noise, qn_block_size):
+        return quant_noise(nn.Linear(input_dim, output_dim), p=q_noise, block_size=qn_block_size)
+
+    def build_fc2(self, input_dim, output_dim, q_noise, qn_block_size):
+        return quant_noise(nn.Linear(input_dim, output_dim), p=q_noise, block_size=qn_block_size)
+
+    def build_self_attention(self, embed_dim, args):
+        return MultiheadAttention(
+            embed_dim,
+            args.encoder_attention_heads,
+            dropout=args.attention_dropout,
+            self_attention=True,
+            q_noise=self.quant_noise,
+            qn_block_size=self.quant_noise_block_size,
+        )
+
+    def upgrade_state_dict_named(self, state_dict, name):
+        """
+        Rename layer norm states from `...layer_norms.0.weight` to
+        `...self_attn_layer_norm.weight` and `...layer_norms.1.weight` to
+        `...final_layer_norm.weight`
+        """
+        layer_norm_map = {"0": "self_attn_layer_norm", "1": "final_layer_norm"}
+        for old, new in layer_norm_map.items():
+            for m in ("weight", "bias"):
+                k = "{}.layer_norms.{}.{}".format(name, old, m)
+                if k in state_dict:
+                    state_dict["{}.{}.{}".format(name, new, m)] = state_dict[k]
+                    del state_dict[k]
+
+    def forward(self, x, encoder_padding_mask, attn_mask: Optional[Tensor] = None):
+        """
+        Args:
+            x (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)`
+            encoder_padding_mask (ByteTensor): binary ByteTensor of shape
+                `(batch, seq_len)` where padding elements are indicated by ``1``.
+            attn_mask (ByteTensor): binary tensor of shape `(tgt_len, src_len)`,
+                where `tgt_len` is the length of output and `src_len` is the
+                length of input, though here both are equal to `seq_len`.
+                `attn_mask[tgt_i, src_j] = 1` means that when calculating the
+                embedding for `tgt_i`, we exclude (mask out) `src_j`. This is
+                useful for strided self-attention.
+
+        Returns:
+            encoded output of shape `(seq_len, batch, embed_dim)`
+        """
+        # anything in original attn_mask = 1, becomes -1e8
+        # anything in original attn_mask = 0, becomes 0
+        # Note that we cannot use -inf here, because at some edge cases,
+        # the attention weight (before softmax) for some padded element in query
+        # will become -inf, which results in NaN in model parameters
+        if attn_mask is not None:
+            attn_mask = attn_mask.masked_fill(attn_mask.to(torch.bool), -1e8)
+
+        residual = x
+        if self.normalize_before:
+            x = self.self_attn_layer_norm(x)
+        x, _ = self.self_attn(
+            query=x,
+            key=x,
+            value=x,
+            key_padding_mask=encoder_padding_mask,
+            attn_mask=attn_mask,
+        )
+        x = self.dropout_module(x)
+        x = residual + x
+        if not self.normalize_before:
+            x = self.self_attn_layer_norm(x)
+
+        residual = x
+        if self.normalize_before:
+            x = self.final_layer_norm(x)
+
+        x = self.activation_fn(self.fc1(x))
+        x = self.activation_dropout_module(x)
+        x = self.fc2(x)
+        x = self.dropout_module(x)
+        x = residual + x
+        if not self.normalize_before:
+            x = self.final_layer_norm(x)
+        return x
+
+
+class TransformerDecoderLayer(nn.Module):
+    """Decoder layer block.
+
+    In the original paper each operation (multi-head attention, encoder
+    attention or FFN) is postprocessed with: `dropout -> add residual ->
+    layernorm`. In the tensor2tensor code they suggest that learning is more
+    robust when preprocessing each layer with layernorm and postprocessing with:
+    `dropout -> add residual`. We default to the approach in the paper, but the
+    tensor2tensor approach can be enabled by setting
+    *args.decoder_normalize_before* to ``True``.
+
+    Args:
+        args (argparse.Namespace): parsed command-line arguments
+        no_encoder_attn (bool, optional): whether to attend to encoder outputs
+            (default: False).
+    """
+
+    def __init__(
+        self, args, no_encoder_attn=False, add_bias_kv=False, add_zero_attn=False
+    ):
+        super().__init__()
+        self.embed_dim = args.decoder_embed_dim
+        self.dropout_module = FairseqDropout(args.dropout, module_name=self.__class__.__name__)
+        self.quant_noise = getattr(args, "quant_noise_pq", 0)
+        self.quant_noise_block_size = getattr(args, "quant_noise_pq_block_size", 8)
+
+        self.cross_self_attention = getattr(args, "cross_self_attention", False)
+
+        self.self_attn = self.build_self_attention(
+            self.embed_dim,
+            args,
+            add_bias_kv=add_bias_kv,
+            add_zero_attn=add_zero_attn,
+        )
+        self.activation_fn = utils.get_activation_fn(
+            activation=getattr(args, "activation_fn", "relu")
+        )
+        activation_dropout_p = getattr(args, "activation_dropout", 0)
+        if activation_dropout_p == 0:
+            # for backwards compatibility with models that use args.relu_dropout
+            activation_dropout_p = getattr(args, "relu_dropout", 0)
+        self.activation_dropout_module = FairseqDropout(
+            float(activation_dropout_p), module_name=self.__class__.__name__)
+        self.normalize_before = args.decoder_normalize_before
+
+        # use layerNorm rather than FusedLayerNorm for exporting.
+        # char_inputs can be used to determine this.
+        # TODO  remove this once we update apex with the fix
+        export = getattr(args, "char_inputs", False)
+        self.self_attn_layer_norm = LayerNorm(self.embed_dim, export=export)
+
+        if no_encoder_attn:
+            self.encoder_attn = None
+            self.encoder_attn_layer_norm = None
+        else:
+            self.encoder_attn = self.build_encoder_attention(self.embed_dim, args)
+            self.encoder_attn_layer_norm = LayerNorm(self.embed_dim, export=export)
+
+        self.fc1 = self.build_fc1(
+            self.embed_dim, args.decoder_ffn_embed_dim, self.quant_noise, self.quant_noise_block_size
+        )
+        self.fc2 = self.build_fc2(
+            args.decoder_ffn_embed_dim, self.embed_dim, self.quant_noise, self.quant_noise_block_size
+        )
+
+        self.final_layer_norm = LayerNorm(self.embed_dim, export=export)
+        self.need_attn = True
+
+        self.onnx_trace = False
+
+    def build_fc1(self, input_dim, output_dim, q_noise, qn_block_size):
+        return quant_noise(nn.Linear(input_dim, output_dim), q_noise, qn_block_size)
+
+    def build_fc2(self, input_dim, output_dim, q_noise, qn_block_size):
+        return quant_noise(nn.Linear(input_dim, output_dim), q_noise, qn_block_size)
+
+    def build_self_attention(self, embed_dim, args, add_bias_kv=False, add_zero_attn=False):
+        return MultiheadAttention(
+            embed_dim,
+            args.decoder_attention_heads,
+            dropout=args.attention_dropout,
+            add_bias_kv=add_bias_kv,
+            add_zero_attn=add_zero_attn,
+            self_attention=not getattr(args, "cross_self_attention", False),
+            q_noise=self.quant_noise,
+            qn_block_size=self.quant_noise_block_size,
+        )
+
+    def build_encoder_attention(self, embed_dim, args):
+        return MultiheadAttention(
+            embed_dim,
+            args.decoder_attention_heads,
+            kdim=getattr(args, "encoder_embed_dim", None),
+            vdim=getattr(args, "encoder_embed_dim", None),
+            dropout=args.attention_dropout,
+            encoder_decoder_attention=True,
+            q_noise=self.quant_noise,
+            qn_block_size=self.quant_noise_block_size,
+        )
+
+    def prepare_for_onnx_export_(self):
+        self.onnx_trace = True
+
+    def forward(
+        self,
+        x,
+        encoder_out: Optional[torch.Tensor] = None,
+        encoder_padding_mask: Optional[torch.Tensor] = None,
+        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
+        prev_self_attn_state: Optional[List[torch.Tensor]] = None,
+        prev_attn_state: Optional[List[torch.Tensor]] = None,
+        self_attn_mask: Optional[torch.Tensor] = None,
+        self_attn_padding_mask: Optional[torch.Tensor] = None,
+        need_attn: bool = False,
+        need_head_weights: bool = False,
+    ):
+        """
+        Args:
+            x (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)`
+            encoder_padding_mask (ByteTensor, optional): binary
+                ByteTensor of shape `(batch, src_len)` where padding
+                elements are indicated by ``1``.
+            need_attn (bool, optional): return attention weights
+            need_head_weights (bool, optional): return attention weights
+                for each head (default: return average over heads).
+
+        Returns:
+            encoded output of shape `(seq_len, batch, embed_dim)`
+        """
+        if need_head_weights:
+            need_attn = True
+
+        residual = x
+        if self.normalize_before:
+            x = self.self_attn_layer_norm(x)
+        if prev_self_attn_state is not None:
+            prev_key, prev_value = prev_self_attn_state[:2]
+            saved_state: Dict[str, Optional[Tensor]] = {
+                "prev_key": prev_key,
+                "prev_value": prev_value,
+            }
+            if len(prev_self_attn_state) >= 3:
+                saved_state["prev_key_padding_mask"] = prev_self_attn_state[2]
+            assert incremental_state is not None
+            self.self_attn._set_input_buffer(incremental_state, saved_state)
+        _self_attn_input_buffer = self.self_attn._get_input_buffer(incremental_state)
+        if self.cross_self_attention and not (
+            incremental_state is not None
+            and _self_attn_input_buffer is not None
+            and "prev_key" in _self_attn_input_buffer
+        ):
+            if self_attn_mask is not None:
+                assert encoder_out is not None
+                self_attn_mask = torch.cat(
+                    (x.new_zeros(x.size(0), encoder_out.size(0)), self_attn_mask), dim=1
+                )
+            if self_attn_padding_mask is not None:
+                if encoder_padding_mask is None:
+                    assert encoder_out is not None
+                    encoder_padding_mask = self_attn_padding_mask.new_zeros(
+                        encoder_out.size(1), encoder_out.size(0)
+                    )
+                self_attn_padding_mask = torch.cat(
+                    (encoder_padding_mask, self_attn_padding_mask), dim=1
+                )
+            assert encoder_out is not None
+            y = torch.cat((encoder_out, x), dim=0)
+        else:
+            y = x
+
+        x, attn = self.self_attn(
+            query=x,
+            key=y,
+            value=y,
+            key_padding_mask=self_attn_padding_mask,
+            incremental_state=incremental_state,
+            need_weights=False,
+            attn_mask=self_attn_mask,
+        )
+        x = self.dropout_module(x)
+        x = residual + x
+        if not self.normalize_before:
+            x = self.self_attn_layer_norm(x)
+
+        if self.encoder_attn is not None:
+            residual = x
+            if self.normalize_before:
+                x = self.encoder_attn_layer_norm(x)
+            if prev_attn_state is not None:
+                prev_key, prev_value = prev_attn_state[:2]
+                saved_state: Dict[str, Optional[Tensor]] = {
+                    "prev_key": prev_key,
+                    "prev_value": prev_value,
+                }
+                if len(prev_attn_state) >= 3:
+                    saved_state["prev_key_padding_mask"] = prev_attn_state[2]
+                assert incremental_state is not None
+                self.encoder_attn._set_input_buffer(incremental_state, saved_state)
+
+            x, attn = self.encoder_attn(
+                query=x,
+                key=encoder_out,
+                value=encoder_out,
+                key_padding_mask=encoder_padding_mask,
+                incremental_state=incremental_state,
+                static_kv=True,
+                need_weights=need_attn or (not self.training and self.need_attn),
+                need_head_weights=need_head_weights,
+            )
+            x = self.dropout_module(x)
+            x = residual + x
+            if not self.normalize_before:
+                x = self.encoder_attn_layer_norm(x)
+
+        residual = x
+        if self.normalize_before:
+            x = self.final_layer_norm(x)
+
+        x = self.activation_fn(self.fc1(x))
+        x = self.activation_dropout_module(x)
+        x = self.fc2(x)
+        x = self.dropout_module(x)
+        x = residual + x
+        if not self.normalize_before:
+            x = self.final_layer_norm(x)
+        if self.onnx_trace and incremental_state is not None:
+            saved_state = self.self_attn._get_input_buffer(incremental_state)
+            assert saved_state is not None
+            if self_attn_padding_mask is not None:
+                self_attn_state = [
+                    saved_state["prev_key"],
+                    saved_state["prev_value"],
+                    saved_state["prev_key_padding_mask"],
+                ]
+            else:
+                self_attn_state = [saved_state["prev_key"], saved_state["prev_value"]]
+            return x, attn, self_attn_state
+        return x, attn, None
+
+    def make_generation_fast_(self, need_attn: bool = False, **kwargs):
+        self.need_attn = need_attn
+
+
+def Linear(in_features, out_features, bias=True):
+    m = nn.Linear(in_features, out_features, bias)
+    nn.init.xavier_uniform_(m.weight)
+    if bias:
+        nn.init.constant_(m.bias, 0.0)
+    return m
diff --git a/fairseq/modules/transformer_sentence_encoder.py b/fairseq/modules/transformer_sentence_encoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..8a6994181b84c3f3dac318b2539b282cd0f42590
--- /dev/null
+++ b/fairseq/modules/transformer_sentence_encoder.py
@@ -0,0 +1,278 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Optional, Tuple
+
+import torch
+import torch.nn as nn
+from fairseq.modules import (
+    FairseqDropout,
+    LayerDropModuleList,
+    LayerNorm,
+    MultiheadAttention,
+    PositionalEmbedding,
+    TransformerSentenceEncoderLayer,
+)
+from fairseq.modules.quant_noise import quant_noise as apply_quant_noise_
+
+
+def init_bert_params(module):
+    """
+    Initialize the weights specific to the BERT Model (for ``nn.Module.apply``).
+    ``module`` is modified in place; every rule draws from normal(mean=0.0, std=0.02).
+    Modules of any other type are left unchanged.
+        1. nn.Linear: weight is re-drawn and the bias, if any, is set to zero.
+        2. nn.Embedding: weight is re-drawn and the row at padding_idx,
+           if any, is set to zero.
+        3. MultiheadAttention: the q_proj, k_proj and v_proj weights are
+           re-drawn; this rule does not touch their biases.
+    """
+
+    def _normal(weight):
+        weight.data.normal_(mean=0.0, std=0.02)
+
+    if isinstance(module, nn.Linear):
+        _normal(module.weight)
+        if module.bias is not None:
+            module.bias.data.zero_()
+    if isinstance(module, nn.Embedding):
+        _normal(module.weight)
+        if module.padding_idx is not None:
+            module.weight.data[module.padding_idx].zero_()
+    if isinstance(module, MultiheadAttention):
+        for proj in (module.q_proj, module.k_proj, module.v_proj):
+            _normal(proj.weight)
+
+
+class TransformerSentenceEncoder(nn.Module):
+    """
+    Implementation for a Bi-directional Transformer based Sentence Encoder used
+    in BERT/XLM style pre-trained models.
+
+    This first computes the token embedding using the token embedding matrix,
+    position embeddings (if specified) and segment embeddings
+    (if specified). After applying the specified number of
+    TransformerSentenceEncoderLayers, it outputs all the internal states of the
+    encoder as well as the final representation associated with the first
+    token (usually CLS token).
+
+    Input:
+        - tokens: B x T matrix representing sentences
+        - segment_labels: B x T matrix representing segment label for tokens
+
+    Output:
+        - a tuple of the following:
+            - a list of internal model states used to compute the
+              predictions where each tensor has shape T x B x C
+              (only the final state if last_state_only; stacked if traceable)
+            - sentence representation of the first input token, in format B x C.
+    """
+
+    def __init__(
+        self,
+        padding_idx: int,
+        vocab_size: int,
+        num_encoder_layers: int = 6,
+        embedding_dim: int = 768,
+        ffn_embedding_dim: int = 3072,
+        num_attention_heads: int = 8,
+        dropout: float = 0.1,
+        attention_dropout: float = 0.1,
+        activation_dropout: float = 0.1,
+        layerdrop: float = 0.0,
+        max_seq_len: int = 256,
+        num_segments: int = 2,
+        use_position_embeddings: bool = True,
+        offset_positions_by_padding: bool = True,
+        encoder_normalize_before: bool = False,
+        apply_bert_init: bool = False,
+        activation_fn: str = "relu",
+        learned_pos_embedding: bool = True,
+        embed_scale: Optional[float] = None,
+        freeze_embeddings: bool = False,
+        n_trans_layers_to_freeze: int = 0,
+        export: bool = False,
+        traceable: bool = False,
+        q_noise: float = 0.0,
+        qn_block_size: int = 8,
+    ) -> None:
+        """Build the embeddings and layer stack, then optionally BERT-init and freeze."""
+        super().__init__()
+        self.padding_idx = padding_idx
+        self.vocab_size = vocab_size
+        self.dropout_module = FairseqDropout(dropout, module_name=self.__class__.__name__)
+        self.layerdrop = layerdrop
+        self.max_seq_len = max_seq_len
+        self.embedding_dim = embedding_dim
+        self.num_segments = num_segments
+        self.use_position_embeddings = use_position_embeddings
+        self.apply_bert_init = apply_bert_init
+        self.learned_pos_embedding = learned_pos_embedding
+        self.traceable = traceable
+        self.tpu = False  # whether we're on TPU
+
+        self.embed_tokens = self.build_embedding(
+            self.vocab_size, self.embedding_dim, self.padding_idx
+        )
+        self.embed_scale = embed_scale  # if not None, forward() multiplies token embeddings by it
+
+        if q_noise > 0:
+            self.quant_noise = apply_quant_noise_(
+                nn.Linear(self.embedding_dim, self.embedding_dim, bias=False),
+                q_noise,
+                qn_block_size,
+            )
+        else:
+            self.quant_noise = None  # forward() then skips this projection
+
+        self.segment_embeddings = (
+            nn.Embedding(self.num_segments, self.embedding_dim, padding_idx=None)
+            if self.num_segments > 0
+            else None
+        )
+
+        self.embed_positions = (
+            PositionalEmbedding(
+                self.max_seq_len,
+                self.embedding_dim,
+                padding_idx=(self.padding_idx if offset_positions_by_padding else None),
+                learned=self.learned_pos_embedding,
+            )
+            if self.use_position_embeddings
+            else None
+        )
+
+        if self.layerdrop > 0.0:
+            self.layers = LayerDropModuleList(p=self.layerdrop)
+        else:
+            self.layers = nn.ModuleList([])
+        self.layers.extend([
+            self.build_transformer_sentence_encoder_layer(
+                embedding_dim=self.embedding_dim,
+                ffn_embedding_dim=ffn_embedding_dim,
+                num_attention_heads=num_attention_heads,
+                dropout=self.dropout_module.p,  # reuse the rate given to self.dropout_module
+                attention_dropout=attention_dropout,
+                activation_dropout=activation_dropout,
+                activation_fn=activation_fn,
+                export=export,
+                q_noise=q_noise,
+                qn_block_size=qn_block_size,
+            )
+            for _ in range(num_encoder_layers)
+        ])
+
+        if encoder_normalize_before:
+            self.emb_layer_norm = LayerNorm(self.embedding_dim, export=export)
+        else:
+            self.emb_layer_norm = None
+
+        # Apply initialization of model params after building the model
+        if self.apply_bert_init:
+            self.apply(init_bert_params)
+
+        def freeze_module_params(m):
+            if m is not None:
+                for p in m.parameters():
+                    p.requires_grad = False
+
+        if freeze_embeddings:
+            freeze_module_params(self.embed_tokens)
+            freeze_module_params(self.segment_embeddings)
+            freeze_module_params(self.embed_positions)
+            freeze_module_params(self.emb_layer_norm)
+
+        for layer in range(n_trans_layers_to_freeze):  # freezes the first N layers of the stack
+            freeze_module_params(self.layers[layer])
+
+    def build_embedding(self, vocab_size, embedding_dim, padding_idx):
+        return nn.Embedding(vocab_size, embedding_dim, padding_idx)
+
+    def build_transformer_sentence_encoder_layer(
+        self,
+        embedding_dim,
+        ffn_embedding_dim,
+        num_attention_heads,
+        dropout,
+        attention_dropout,
+        activation_dropout,
+        activation_fn,
+        export,
+        q_noise,
+        qn_block_size,
+    ):
+        return TransformerSentenceEncoderLayer(
+            embedding_dim=embedding_dim,
+            ffn_embedding_dim=ffn_embedding_dim,
+            num_attention_heads=num_attention_heads,
+            dropout=dropout,
+            attention_dropout=attention_dropout,
+            activation_dropout=activation_dropout,
+            activation_fn=activation_fn,
+            export=export,
+            q_noise=q_noise,
+            qn_block_size=qn_block_size,
+        )
+
+    def prepare_for_tpu_(self, **kwargs):
+        self.tpu = True
+
+    def forward(
+        self,
+        tokens: torch.Tensor,
+        segment_labels: Optional[torch.Tensor] = None,
+        last_state_only: bool = False,
+        positions: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:  # NOTE: first item is a list unless self.traceable
+        """Encode ``tokens`` and return (inner states, first-token representation)."""
+        # compute padding mask. This is needed for multi-head attention
+        padding_mask = tokens.eq(self.padding_idx)
+        if not self.traceable and not self.tpu and not padding_mask.any():
+            padding_mask = None  # nothing to mask in this batch
+
+        x = self.embed_tokens(tokens)
+
+        if self.embed_scale is not None:
+            x *= self.embed_scale
+
+        if self.embed_positions is not None:
+            x += self.embed_positions(tokens, positions=positions)
+
+        if self.segment_embeddings is not None and segment_labels is not None:
+            x += self.segment_embeddings(segment_labels)
+
+        if self.quant_noise is not None:
+            x = self.quant_noise(x)
+
+        if self.emb_layer_norm is not None:
+            x = self.emb_layer_norm(x)
+
+        x = self.dropout_module(x)
+
+        # account for padding: zero the embeddings at padded positions
+        if padding_mask is not None:
+            x *= 1 - padding_mask.unsqueeze(-1).type_as(x)
+
+        # B x T x C -> T x B x C
+        x = x.transpose(0, 1)
+
+        inner_states = []
+        if not last_state_only:
+            inner_states.append(x)
+
+        for layer in self.layers:
+            x, _ = layer(x, self_attn_padding_mask=padding_mask)
+            if not last_state_only:
+                inner_states.append(x)
+
+        sentence_rep = x[0, :, :]  # first time step of the last layer's output, B x C
+
+        if last_state_only:
+            inner_states = [x]
+
+        if self.traceable:
+            return torch.stack(inner_states), sentence_rep  # one tensor instead of a list
+        else:
+            return inner_states, sentence_rep
diff --git a/fairseq/modules/transformer_sentence_encoder_layer.py b/fairseq/modules/transformer_sentence_encoder_layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..383938f68fae6920bb79f4a48bc3bbcf708cc80d
--- /dev/null
+++ b/fairseq/modules/transformer_sentence_encoder_layer.py
@@ -0,0 +1,139 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Callable, Optional
+
+import torch
+import torch.nn as nn
+
+from fairseq import utils
+from fairseq.modules import (
+    LayerNorm,
+    MultiheadAttention,
+)
+from fairseq.modules.quant_noise import quant_noise
+from fairseq.modules.fairseq_dropout import FairseqDropout
+
+
+
+class TransformerSentenceEncoderLayer(nn.Module):
+    """
+    Implements a Transformer Encoder Layer used in BERT/XLM style pre-trained
+    models: self-attention then a feed-forward block, each followed by LayerNorm.
+    """
+
+    def __init__(
+        self,
+        embedding_dim: int = 768,
+        ffn_embedding_dim: int = 3072,
+        num_attention_heads: int = 8,
+        dropout: float = 0.1,
+        attention_dropout: float = 0.1,
+        activation_dropout: float = 0.1,
+        activation_fn: str = 'relu',
+        export: bool = False,
+        q_noise: float = 0.0,
+        qn_block_size: int = 8,
+        init_fn: Optional[Callable] = None,
+    ) -> None:
+        super().__init__()
+
+        if init_fn is not None:
+            init_fn()  # caller-supplied hook, run before any submodule is built
+
+        # Initialize parameters
+        self.embedding_dim = embedding_dim
+        self.dropout_module = FairseqDropout(dropout, module_name=self.__class__.__name__)
+        self.activation_dropout_module = FairseqDropout(activation_dropout, module_name=self.__class__.__name__)
+
+        # Initialize blocks
+        self.activation_fn = utils.get_activation_fn(activation_fn)
+        self.self_attn = self.build_self_attention(
+            self.embedding_dim,
+            num_attention_heads,
+            dropout=attention_dropout,
+            self_attention=True,
+            q_noise=q_noise,
+            qn_block_size=qn_block_size,
+        )
+
+        # layer norm associated with the self attention layer
+        self.self_attn_layer_norm = LayerNorm(self.embedding_dim, export=export)
+
+        self.fc1 = self.build_fc1(
+            self.embedding_dim,
+            ffn_embedding_dim,
+            q_noise=q_noise,
+            qn_block_size=qn_block_size,
+        )
+        self.fc2 = self.build_fc2(
+            ffn_embedding_dim,
+            self.embedding_dim,
+            q_noise=q_noise,
+            qn_block_size=qn_block_size,
+        )
+
+        # layer norm associated with the position wise feed-forward NN
+        self.final_layer_norm = LayerNorm(self.embedding_dim, export=export)
+
+    def build_fc1(self, input_dim, output_dim, q_noise, qn_block_size):
+        return quant_noise(
+            nn.Linear(input_dim, output_dim), q_noise, qn_block_size
+        )
+
+    def build_fc2(self, input_dim, output_dim, q_noise, qn_block_size):
+        return quant_noise(
+            nn.Linear(input_dim, output_dim), q_noise, qn_block_size
+        )
+
+    def build_self_attention(
+        self,
+        embed_dim,
+        num_attention_heads,
+        dropout,
+        self_attention,
+        q_noise,
+        qn_block_size,
+    ):
+        return MultiheadAttention(
+            embed_dim,
+            num_attention_heads,
+            dropout=dropout,
+            self_attention=self_attention,  # honor the argument (__init__ passes True)
+            q_noise=q_noise,
+            qn_block_size=qn_block_size,
+        )
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        self_attn_mask: Optional[torch.Tensor] = None,
+        self_attn_padding_mask: Optional[torch.Tensor] = None,
+    ):
+        """
+        Post-norm layer: LayerNorm is applied after each residual connection
+        (self-attention first, then the feed-forward block). Returns ``(x, attn)``.
+        """
+        residual = x
+        x, attn = self.self_attn(
+            query=x,
+            key=x,
+            value=x,
+            key_padding_mask=self_attn_padding_mask,
+            need_weights=False,
+            attn_mask=self_attn_mask,
+        )
+        x = self.dropout_module(x)
+        x = residual + x
+        x = self.self_attn_layer_norm(x)
+
+        residual = x
+        x = self.activation_fn(self.fc1(x))
+        x = self.activation_dropout_module(x)
+        x = self.fc2(x)
+        x = self.dropout_module(x)
+        x = residual + x
+        x = self.final_layer_norm(x)
+        return x, attn
diff --git a/fairseq/modules/transpose_last.py b/fairseq/modules/transpose_last.py
new file mode 100644
index 0000000000000000000000000000000000000000..e578b3ec5097bfac5c976b207ea46bec1d9bd4f5
--- /dev/null
+++ b/fairseq/modules/transpose_last.py
@@ -0,0 +1,20 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+transpose last 2 dimensions of the input
+"""
+
+import torch.nn as nn
+
+
+class TransposeLast(nn.Module):
+    """Swap the last two dimensions of the input, optionally after indexing into it."""
+    def __init__(self, deconstruct_idx=None):
+        super().__init__()
+        self.deconstruct_idx = deconstruct_idx  # if set, forward() first takes x[idx]
+
+    def forward(self, x):
+        idx = self.deconstruct_idx
+        return (x if idx is None else x[idx]).transpose(-2, -1)
diff --git a/fairseq/modules/unfold.py b/fairseq/modules/unfold.py
new file mode 100644
index 0000000000000000000000000000000000000000..3a142db69868ff1e36241c7f032d0f886b6b9428
--- /dev/null
+++ b/fairseq/modules/unfold.py
@@ -0,0 +1,17 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch.nn.functional as F
+
+
+def unfold1d(x, kernel_size, padding_l, pad_value=0):
+    '''unfold T x B x C to T x B x C x K (a strided view over the T-padded input)'''
+    if kernel_size <= 1:
+        return x.unsqueeze(3)
+    T, B, C = x.size()
+    padding_r = kernel_size - 1 - padding_l
+    # F.pad reads pairs from the last dim backwards, so only the T axis is padded
+    x = F.pad(x, (0, 0, 0, 0, padding_l, padding_r), value=pad_value)
+    return x.as_strided((T, B, C, kernel_size), (B * C, C, 1, B * C))
diff --git a/fairseq/modules/vggblock.py b/fairseq/modules/vggblock.py
new file mode 100644
index 0000000000000000000000000000000000000000..ee5ee19a34816c7350c21fba7c4907fec8ca7a61
--- /dev/null
+++ b/fairseq/modules/vggblock.py
@@ -0,0 +1,116 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+from collections.abc import Iterable
+from itertools import repeat
+
+import torch
+import torch.nn as nn
+
+
+def _pair(v):  # normalize a scalar-or-pair argument into a pair
+    if not isinstance(v, Iterable):
+        return (v, v)
+    assert len(v) == 2, "len(v) != 2"
+    return v
+
+
+def infer_conv_output_dim(conv_op, input_dim, sample_inchannel):
+    """
+    Probe ``conv_op`` with a random (10, sample_inchannel, 200, input_dim) batch.
+
+    Returns (C * W, W) of the output: the flattened feature size per time step
+    and the per-channel width.
+    """
+    sample_bsz, sample_seq_len = 10, 200
+    probe = torch.randn(sample_bsz, sample_inchannel, sample_seq_len, input_dim)
+    # N x C x H x W -> N x H x C x W
+    out = conv_op(probe).transpose(1, 2)
+    bsz, seq = out.shape[:2]
+    per_channel_dim = out.size(3)
+    return out.contiguous().view(bsz, seq, -1).size(-1), per_channel_dim
+
+
+class VGGBlock(torch.nn.Module):
+    """
+    VGG motivated cnn module https://arxiv.org/pdf/1409.1556.pdf
+
+    Args:
+        in_channels: (int) number of input channels (typically 1)
+        out_channels: (int) number of output channels
+        conv_kernel_size: size of the convolving kernel (a number or a pair)
+        pooling_kernel_size: the size of the pooling window to take a max over
+            (a number or a pair); None builds the block without a pooling layer
+        num_conv_layers: (int) number of convolution layers
+        input_dim: (int) input dimension
+        conv_stride: the stride of the convolving kernel.
+            Can be a single number or a tuple (sH, sW)  Default: 1
+        padding: implicit paddings on both sides of the input.
+            Can be a single number or a tuple (padH, padW). Default: kernel_size // 2
+        layer_norm: (bool) if layer norm is going to be applied. Default: False
+
+    Shape:
+        Input: BxCxTxfeat, i.e. (batch_size, input_size, timesteps, features)
+        Output: BxCxTxfeat, i.e. (batch_size, output_size, timesteps, features)
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        conv_kernel_size,
+        pooling_kernel_size,
+        num_conv_layers,
+        input_dim,
+        conv_stride=1,
+        padding=None,
+        layer_norm=False,
+    ):
+        assert input_dim is not None, "Need input_dim for LayerNorm and infer_conv_output_dim"
+        super(VGGBlock, self).__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.conv_kernel_size = _pair(conv_kernel_size)
+        # keep None as None: _pair(None) would yield (None, None) and defeat the check below
+        self.pooling_kernel_size = (
+            None if pooling_kernel_size is None else _pair(pooling_kernel_size)
+        )
+        self.num_conv_layers = num_conv_layers
+        self.padding = (
+            tuple(e // 2 for e in self.conv_kernel_size) if padding is None else _pair(padding)
+        )
+        self.conv_stride = _pair(conv_stride)
+
+        self.layers = nn.ModuleList()
+        for layer in range(num_conv_layers):
+            conv_in = in_channels if layer == 0 else out_channels
+            conv_op = nn.Conv2d(
+                conv_in, out_channels, self.conv_kernel_size,
+                stride=self.conv_stride, padding=self.padding,
+            )
+            self.layers.append(conv_op)
+            if layer_norm:
+                # probe the conv to size the LayerNorm over the feature axis
+                _, per_channel_dim = infer_conv_output_dim(conv_op, input_dim, conv_in)
+                self.layers.append(nn.LayerNorm(per_channel_dim))
+                input_dim = per_channel_dim
+            self.layers.append(nn.ReLU())
+
+        if self.pooling_kernel_size is not None:
+            pool_op = nn.MaxPool2d(kernel_size=self.pooling_kernel_size, ceil_mode=True)
+            self.layers.append(pool_op)
+            self.total_output_dim, self.output_dim = infer_conv_output_dim(
+                pool_op, input_dim, out_channels
+            )
+        else:
+            # no pooling: like the branch above, treat input_dim as the current feature width
+            self.total_output_dim, self.output_dim = out_channels * input_dim, input_dim
+
+    def forward(self, x):
+        for layer in self.layers:
+            x = layer(x)
+        return x
diff --git a/fairseq/nan_detector.py b/fairseq/nan_detector.py
new file mode 100644
index 0000000000000000000000000000000000000000..89ea982f69af94875be6855130e3e612f0e5eb21
--- /dev/null
+++ b/fairseq/nan_detector.py
@@ -0,0 +1,89 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import torch
+
+logger = logging.getLogger(__name__)
+
+
+class NanDetector:
+    """
+    Context manager that hooks every submodule of a model and logs the first NaN or Inf seen
+    in a forward output and/or in the values given to backward hooks, with the module name.
+    """
+
+    def __init__(self, model, forward=True, backward=True):
+        self.forward, self.backward = forward, backward
+        self.fhooks, self.bhooks = [], []
+        self.reset()
+
+        for name, mod in model.named_modules():
+            mod.__module_name = name  # name-mangled tag, read back in _apply()
+            self.add_hooks(mod)
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, exc_traceback):
+        self.close()
+
+    def add_hooks(self, module):
+        """Register the enabled forward/backward hooks on ``module``."""
+        if self.forward:
+            self.fhooks.append(module.register_forward_hook(self.fhook_fn))
+        if self.backward:
+            self.bhooks.append(module.register_backward_hook(self.bhook_fn))
+
+    def reset(self):
+        """Clear the 'already reported' flags so the hooks check again."""
+        self.has_printed_f = False
+        self.has_printed_b = False
+
+    def _detect(self, tensor, name, backward):
+        """Return a message if ``tensor`` holds a NaN/Inf, else None."""
+        # single value tensors (like the loss) will not provide much info
+        if tensor.numel() < 2:
+            return None
+        with torch.no_grad():
+            if torch.isnan(tensor).any():
+                err = "NaN"
+            elif torch.isinf(tensor).any():
+                err = "Inf"
+            else:
+                return None
+        return f"{err} detected in output of {name}, shape: {tensor.shape}, {'backward' if backward else 'forward'}"
+
+    def _apply(self, module, inp, x, backward):
+        """Check ``x`` (a tensor, or nested dict/list/tuple of tensors) and log any hit."""
+        if torch.is_tensor(x):
+            err = self._detect(x, module.__module_name, backward)
+            if err is None:
+                return
+            if isinstance(inp, tuple) and len(inp) > 0:
+                inp = inp[0]
+            if torch.is_tensor(inp) and not backward:
+                err += f" input max: {inp.max().item()}, input min: {inp.min().item()}"
+            logger.warning(err)
+            setattr(self, 'has_printed_b' if backward else 'has_printed_f', True)
+        elif isinstance(x, dict):
+            for v in x.values():
+                self._apply(module, inp, v, backward)
+        elif isinstance(x, (list, tuple)):
+            for v in x:
+                self._apply(module, inp, v, backward)
+
+    def fhook_fn(self, module, inp, output):
+        if not self.has_printed_f:
+            self._apply(module, inp, output, backward=False)
+
+    def bhook_fn(self, module, inp, output):
+        if not self.has_printed_b:
+            self._apply(module, inp, output, backward=True)
+
+    def close(self):
+        """Remove every hook this detector registered."""
+        for hook in self.fhooks + self.bhooks:
+            hook.remove()
diff --git a/fairseq/optim/__init__.py b/fairseq/optim/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b8334d8c25b788aea5780079648d55052f64d5c
--- /dev/null
+++ b/fairseq/optim/__init__.py
@@ -0,0 +1,33 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import importlib
+import os
+
+from fairseq import registry
+from fairseq.optim.fairseq_optimizer import FairseqOptimizer
+from fairseq.optim.fp16_optimizer import FP16Optimizer, MemoryEfficientFP16Optimizer
+from fairseq.optim.bmuf import FairseqBMUF  # noqa
+
+
+__all__ = [
+    'FairseqOptimizer',
+    'FP16Optimizer',
+    'MemoryEfficientFP16Optimizer',
+]
+
+
+build_optimizer, register_optimizer, OPTIMIZER_REGISTRY = registry.setup_registry(
+    '--optimizer',
+    base_class=FairseqOptimizer,
+    default='nag',  # presumably the fallback when --optimizer is not passed; see setup_registry
+)
+
+
+# import every non-underscore .py file in optim/ so its @register_optimizer decorator runs
+for file in os.listdir(os.path.dirname(__file__)):
+    if file.endswith('.py') and not file.startswith('_'):
+        module = file[:file.find('.py')]  # strip the extension (cuts at the first '.py')
+        importlib.import_module('fairseq.optim.' + module)
diff --git a/fairseq/optim/__pycache__/__init__.cpython-310.pyc b/fairseq/optim/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b9ab0231c84df08851d5c6f4c179d6cf129fb6d6
Binary files /dev/null and b/fairseq/optim/__pycache__/__init__.cpython-310.pyc differ
diff --git a/fairseq/optim/__pycache__/adadelta.cpython-310.pyc b/fairseq/optim/__pycache__/adadelta.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a4129ee70ebf74c8e82ba39db34c3f1243797be0
Binary files /dev/null and b/fairseq/optim/__pycache__/adadelta.cpython-310.pyc differ
diff --git a/fairseq/optim/__pycache__/adafactor.cpython-310.pyc b/fairseq/optim/__pycache__/adafactor.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d0d6d4fa3f136d954fca4f9a694c338e12b39d1a
Binary files /dev/null and b/fairseq/optim/__pycache__/adafactor.cpython-310.pyc differ
diff --git a/fairseq/optim/__pycache__/adagrad.cpython-310.pyc b/fairseq/optim/__pycache__/adagrad.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..03fa6e7012330d6731f583c64d2c93f0b7e88095
Binary files /dev/null and b/fairseq/optim/__pycache__/adagrad.cpython-310.pyc differ
diff --git a/fairseq/optim/__pycache__/adam.cpython-310.pyc b/fairseq/optim/__pycache__/adam.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0c15fb2ea707c606c24c0833365fdb03541aca0b
Binary files /dev/null and b/fairseq/optim/__pycache__/adam.cpython-310.pyc differ
diff --git a/fairseq/optim/__pycache__/adamax.cpython-310.pyc b/fairseq/optim/__pycache__/adamax.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0dc9db59a20e3bada2772c95e234b69bba742aac
Binary files /dev/null and b/fairseq/optim/__pycache__/adamax.cpython-310.pyc differ
diff --git a/fairseq/optim/__pycache__/bmuf.cpython-310.pyc b/fairseq/optim/__pycache__/bmuf.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..29eed3c91781060bf0b29889d754dec4d61e7fa8
Binary files /dev/null and b/fairseq/optim/__pycache__/bmuf.cpython-310.pyc differ
diff --git a/fairseq/optim/__pycache__/dynamic_loss_scaler.cpython-310.pyc b/fairseq/optim/__pycache__/dynamic_loss_scaler.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e2729d79800d541a9432a707f0e3f681445f46b5
Binary files /dev/null and b/fairseq/optim/__pycache__/dynamic_loss_scaler.cpython-310.pyc differ
diff --git a/fairseq/optim/__pycache__/fairseq_optimizer.cpython-310.pyc b/fairseq/optim/__pycache__/fairseq_optimizer.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cfb2da269c132a6f486191faecb4b39c1d41c75d
Binary files /dev/null and b/fairseq/optim/__pycache__/fairseq_optimizer.cpython-310.pyc differ
diff --git a/fairseq/optim/__pycache__/fp16_optimizer.cpython-310.pyc b/fairseq/optim/__pycache__/fp16_optimizer.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4421b5b1c55128be5b02e9e994f4801621ea29d1
Binary files /dev/null and b/fairseq/optim/__pycache__/fp16_optimizer.cpython-310.pyc differ
diff --git a/fairseq/optim/__pycache__/fused_adam.cpython-310.pyc b/fairseq/optim/__pycache__/fused_adam.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5e36b1a7e75c3ce5cb9b1b53acba942a147c91cb
Binary files /dev/null and b/fairseq/optim/__pycache__/fused_adam.cpython-310.pyc differ
diff --git a/fairseq/optim/__pycache__/fused_lamb.cpython-310.pyc b/fairseq/optim/__pycache__/fused_lamb.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..66a245e719e26594d2ce9265f4d82b5af0447548
Binary files /dev/null and b/fairseq/optim/__pycache__/fused_lamb.cpython-310.pyc differ
diff --git a/fairseq/optim/__pycache__/nag.cpython-310.pyc b/fairseq/optim/__pycache__/nag.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8a49e8e1575bef56f71c3f9b8d337feff603ec32
Binary files /dev/null and b/fairseq/optim/__pycache__/nag.cpython-310.pyc differ
diff --git a/fairseq/optim/__pycache__/sgd.cpython-310.pyc b/fairseq/optim/__pycache__/sgd.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bb18d98abae412c9f471859695bebc9059bf9c32
Binary files /dev/null and b/fairseq/optim/__pycache__/sgd.cpython-310.pyc differ
diff --git a/fairseq/optim/adadelta.py b/fairseq/optim/adadelta.py
new file mode 100644
index 0000000000000000000000000000000000000000..0a76e27fe41dd5966b8bcd61768f511dcf4e5d30
--- /dev/null
+++ b/fairseq/optim/adadelta.py
@@ -0,0 +1,47 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch.optim
+
+from . import FairseqOptimizer, register_optimizer
+
+
+@register_optimizer('adadelta')
+class Adadelta(FairseqOptimizer):
+    """Fairseq optimizer wrapper around ``torch.optim.Adadelta`` (name: 'adadelta')."""
+    def __init__(self, args, params):
+        super().__init__(args)
+        self._optimizer = torch.optim.Adadelta(params, **self.optimizer_config)
+
+    @staticmethod
+    def add_args(parser):
+        """Add optimizer-specific arguments to the parser."""
+        # fmt: off
+        parser.add_argument('--adadelta-rho', type=float, default=0.9, metavar='RHO',
+                            help='coefficient used for computing a running average of squared gradients')
+        parser.add_argument('--adadelta-eps', type=float, default=1e-6, metavar='EPS',
+                            help='term added to the denominator to improve numerical stability')
+        parser.add_argument('--weight-decay', '--wd', default=0.0, type=float, metavar='WD',
+                            help='weight decay')
+        parser.add_argument('--anneal-eps', action='store_true', help='flag to anneal eps')
+        # fmt: on
+
+    @property
+    def optimizer_config(self):
+        """
+        Keyword arguments for ``torch.optim.Adadelta``; they also override the
+        optimizer args stored in a checkpoint (e.g. to resume with a new lr).
+        """
+        args = self.args
+        return dict(
+            lr=args.lr[0],
+            rho=args.adadelta_rho,
+            eps=args.adadelta_eps,
+            weight_decay=args.weight_decay,
+        )
+
+    @property
+    def supports_flat_params(self):
+        return True
diff --git a/fairseq/optim/adafactor.py b/fairseq/optim/adafactor.py
new file mode 100644
index 0000000000000000000000000000000000000000..f52ec0f139b91ed55272011b4fa459a73af16546
--- /dev/null
+++ b/fairseq/optim/adafactor.py
@@ -0,0 +1,237 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+import torch
+import torch.optim
+
+from . import FairseqOptimizer, register_optimizer
+
+
+@register_optimizer('adafactor')
+class FairseqAdafactor(FairseqOptimizer):  # fairseq wrapper around the Adafactor optimizer defined in this module
+    def __init__(self, args, params):
+        super().__init__(args)
+        self._optimizer = Adafactor(params, **self.optimizer_config)  # kwargs derived from parsed args
+
+    @staticmethod
+    def add_args(parser):
+        """Add optimizer-specific arguments to the parser."""
+        # fmt: off
+        parser.add_argument('--adafactor-eps', default='(1e-30, 1e-3)', metavar="E",
+                            help='epsilons for Adafactor optimizer')
+        parser.add_argument('--clip-threshold', type=float, default=1.0, metavar="C",
+                            help='threshold for clipping update root mean square')
+        parser.add_argument('--decay-rate', type=float, default=-0.8, metavar="D",
+                            help='decay rate of the second moment estimator')
+        parser.add_argument('--beta1', type=float, default=None, metavar="B",
+                            help='beta for first moment estimator. Optional')
+        parser.add_argument('--weight-decay', '--wd', default=0.0, type=float, metavar='WD',
+                            help='weight decay')
+        parser.add_argument('--scale-parameter', action='store_true',
+                            help='scale learning rate by root mean square of parameter')
+        parser.add_argument('--relative-step', action='store_true',
+                            help='set learning rate to inverse square root of timestep, '
+                                 'otherwise use external learning rate')
+        parser.add_argument('--warmup-init', action='store_true',
+                            help='use relative step for warm-up learning rate schedule')
+        # fmt: on
+
+    @property
+    def optimizer_config(self):
+        """
+        Return a kwarg dictionary that will be used to override optimizer
+        args stored in checkpoints. This allows us to load a checkpoint and
+        resume training using a different set of optimizer args, e.g., with a
+        different learning rate. Only ``args.lr[0]`` is used as the lr.
+        Note : Convergence issues empirically observed with fp16 on.
+               Might require search for appropriate configuration.
+        """
+        return {
+            'lr': self.args.lr[0],
+            'eps': eval(self.args.adafactor_eps),  # NOTE(review): eval() of a CLI string; ast.literal_eval would be safer
+            'clip_threshold': self.args.clip_threshold,
+            'decay_rate': self.args.decay_rate,
+            'beta1': self.args.beta1,
+            'weight_decay': self.args.weight_decay,
+            'scale_parameter': self.args.scale_parameter,  # defaults to False
+            'relative_step': self.args.relative_step,  # defaults to False
+            'warmup_init': self.args.warmup_init,
+        }
+
+
+class Adafactor(torch.optim.Optimizer):
+    """Implements Adafactor algorithm.
+
+    This implementation is based on:
+    `Adafactor: Adaptive Learning Rates with Sublinear Memory Cost`
+    (see https://arxiv.org/abs/1804.04235)
+
+    Note that this optimizer internally adjusts the learning rate
+    depending on the *scale_parameter*, *relative_step* and
+    *warmup_init* options. To use a manual (external) learning rate
+    schedule you should set `scale_parameter=False` and
+    `relative_step=False`.
+
+    Arguments:
+        params (iterable): iterable of parameters to optimize or dicts defining
+            parameter groups
+        lr (float, optional): external learning rate (default: None)
+        eps (tuple[float, float]): regularization constants for square gradient
+            and parameter scale respectively (default: (1e-30, 1e-3))
+        clip_threshold (float): threshold of root mean square of
+            final gradient update (default: 1.0)
+        decay_rate (float): coefficient used to compute running averages of square
+            gradient (default: -0.8)
+        beta1 (float): coefficient used for computing running averages of gradient
+            (default: None)
+        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
+        scale_parameter (bool): if True, learning rate is scaled by root mean square of
+            parameter (default: True)
+        relative_step (bool): if True, time-dependent learning rate is computed
+            instead of external learning rate (default: True)
+        warmup_init (bool): time-dependent learning rate computation depends on
+            whether warm-up initialization is being used (default: False)
+    """
+
+    def __init__(self, params, lr=None, eps=(1e-30, 1e-3), clip_threshold=1.0,
+                 decay_rate=-0.8, beta1=None, weight_decay=0.0, scale_parameter=True,
+                 relative_step=True, warmup_init=False):
+        if lr is not None and relative_step:  # the two ways of choosing a step size are mutually exclusive
+            raise ValueError('Cannot combine manual lr and relative_step options')
+        if warmup_init and not relative_step:
+            raise ValueError('warmup_init requires relative_step=True')
+
+        defaults = dict(lr=lr, eps=eps, clip_threshold=clip_threshold, decay_rate=decay_rate,
+                        beta1=beta1, weight_decay=weight_decay, scale_parameter=scale_parameter,
+                        relative_step=relative_step, warmup_init=warmup_init)
+        super(Adafactor, self).__init__(params, defaults)
+
+    @property
+    def supports_memory_efficient_fp16(self):
+        return True  # step() up-casts fp16/bf16 grads and params itself
+
+    @property
+    def supports_flat_params(self):
+        return False
+
+    def _get_lr(self, param_group, param_state):  # effective step size for one parameter
+        rel_step_sz = param_group['lr']
+        if param_group['relative_step']:
+            min_step = 1e-6 * param_state['step'] if param_group['warmup_init'] else 1e-2
+            rel_step_sz = min(min_step, 1.0/math.sqrt(param_state['step']))
+        param_scale = 1.0
+        if param_group['scale_parameter']:
+            param_scale = max(param_group['eps'][1], param_state['RMS'])  # eps[1] floors the scale
+        return param_scale * rel_step_sz
+
+    def _get_options(self, param_group, param_shape):  # returns (factored, use_first_moment)
+        factored = len(param_shape) >= 2  # only tensors with >= 2 dims keep row/col statistics
+        use_first_moment = param_group['beta1'] is not None
+        return factored, use_first_moment
+
+    def _rms(self, tensor):  # root mean square over all elements
+        return tensor.norm(2) / (tensor.numel() ** 0.5)
+
+    def _approx_sq_grad(self, exp_avg_sq_row, exp_avg_sq_col):  # rebuild 1/sqrt(v) from row/col stats
+        r_factor = (  # row factor, shaped (..., rows, 1)
+            exp_avg_sq_row / exp_avg_sq_row.mean(dim=-1, keepdim=True)
+        ).rsqrt_().unsqueeze(-1)
+        c_factor = exp_avg_sq_col.unsqueeze(-2).rsqrt()  # column factor, shaped (..., 1, cols)
+        return torch.mul(r_factor, c_factor)  # broadcast outer product; unlike torch.mm also valid for >2-D params
+
+    def step(self, closure=None):
+        """Performs a single optimization step.
+
+        Arguments:
+            closure (callable, optional): A closure that reevaluates the model
+                and returns the loss.
+        """
+        loss = None
+        if closure is not None:
+            loss = closure()
+
+        for group in self.param_groups:
+            for p in group['params']:
+                if p.grad is None:
+                    continue
+                grad = p.grad.data
+                if grad.dtype in {torch.float16, torch.bfloat16}:
+                    grad = grad.float()  # run the statistics in fp32
+                if grad.is_sparse:
+                    raise RuntimeError('Adafactor does not support sparse gradients.')
+
+                state = self.state[p]
+                grad_shape = grad.shape
+
+                factored, use_first_moment = self._get_options(group, grad_shape)
+                # State Initialization
+                if len(state) == 0:
+                    state['step'] = 0
+
+                    if use_first_moment:
+                        # Exponential moving average of gradient values
+                        state['exp_avg'] = torch.zeros_like(grad)
+                    if factored:
+                        state['exp_avg_sq_row'] = torch.zeros(grad_shape[:-1]).to(grad)
+                        state['exp_avg_sq_col'] = torch.zeros(grad_shape[:-2] + grad_shape[-1:]).to(grad)
+                    else:
+                        state['exp_avg_sq'] = torch.zeros_like(grad)
+
+                    state['RMS'] = 0
+                else:  # existing state: align its dtype/device with the (possibly up-cast) grad
+                    if use_first_moment:
+                        state['exp_avg'] = state['exp_avg'].to(grad)
+                    if factored:
+                        state['exp_avg_sq_row'] = state['exp_avg_sq_row'].to(grad)
+                        state['exp_avg_sq_col'] = state['exp_avg_sq_col'].to(grad)
+                    else:
+                        state['exp_avg_sq'] = state['exp_avg_sq'].to(grad)
+
+                p_data_fp32 = p.data
+                if p.data.dtype in {torch.float16, torch.bfloat16}:
+                    p_data_fp32 = p_data_fp32.float()  # fp32 working copy; written back at the end
+
+                state['step'] += 1
+                state['RMS'] = self._rms(p_data_fp32)
+                group['lr'] = self._get_lr(group, state)
+
+                beta2t = 1.0 - math.pow(state['step'], group['decay_rate'])  # 0 on the first step
+                update = (grad**2) + group['eps'][0]
+                if factored:
+                    exp_avg_sq_row = state['exp_avg_sq_row']
+                    exp_avg_sq_col = state['exp_avg_sq_col']
+
+                    exp_avg_sq_row.mul_(beta2t).add_(update.mean(dim=-1), alpha=1.0 - beta2t)
+                    exp_avg_sq_col.mul_(beta2t).add_(update.mean(dim=-2), alpha=1.0 - beta2t)
+
+                    # Approximation of exponential moving average of square of gradient
+                    update = self._approx_sq_grad(exp_avg_sq_row, exp_avg_sq_col)
+                    update.mul_(grad)
+                else:
+                    exp_avg_sq = state['exp_avg_sq']
+
+                    exp_avg_sq.mul_(beta2t).add_(update, alpha=1.0 - beta2t)
+                    update = exp_avg_sq.rsqrt().mul_(grad)
+
+                update.div_(  # shrink the update only when its RMS exceeds clip_threshold
+                    (self._rms(update) / group['clip_threshold']).clamp_(min=1.0)
+                )
+                update.mul_(group['lr'])
+
+                if use_first_moment:
+                    exp_avg = state['exp_avg']
+                    exp_avg.mul_(group['beta1']).add_(update, alpha=1 - group['beta1'])
+                    update = exp_avg
+
+                if group['weight_decay'] != 0:
+                    p_data_fp32.add_(p_data_fp32, alpha=-group['weight_decay'] * group['lr'])
+
+                p_data_fp32.add_(-update)
+
+                if p.data.dtype in {torch.float16, torch.bfloat16}:
+                    p.data.copy_(p_data_fp32)  # write the fp32 result back into the low-precision param
+
+        return loss
diff --git a/fairseq/optim/adagrad.py b/fairseq/optim/adagrad.py
new file mode 100644
index 0000000000000000000000000000000000000000..57f83258cfd177c5381210115f679a501241c6ed
--- /dev/null
+++ b/fairseq/optim/adagrad.py
@@ -0,0 +1,40 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch.optim
+
+from . import FairseqOptimizer, register_optimizer
+
+
+@register_optimizer('adagrad')
+class Adagrad(FairseqOptimizer):  # fairseq wrapper that builds a torch.optim.Adagrad
+    def __init__(self, args, params):
+        super().__init__(args)
+        self._optimizer = torch.optim.Adagrad(params, **self.optimizer_config)  # kwargs derived from parsed args
+
+    @staticmethod
+    def add_args(parser):
+        """Add optimizer-specific arguments to the parser."""
+        # fmt: off
+        parser.add_argument('--weight-decay', '--wd', default=0.0, type=float, metavar='WD',
+                            help='weight decay')
+        # fmt: on
+
+    @property
+    def optimizer_config(self):
+        """
+        Return the keyword arguments used to construct torch.optim.Adagrad.
+        The same dictionary is meant to override optimizer args stored in
+        checkpoints, so a checkpoint can be resumed with different settings
+        (e.g. a different learning rate). Only ``args.lr[0]`` is used as lr.
+        """
+        return {
+            'lr': self.args.lr[0],
+            'weight_decay': self.args.weight_decay,
+        }
+
+    @property
+    def supports_flat_params(self):
+        return True  # always reports True for this wrapper
diff --git a/fairseq/optim/adam.py b/fairseq/optim/adam.py
new file mode 100644
index 0000000000000000000000000000000000000000..d5783b258cfe3a3d9e16411ad7012f2146c30136
--- /dev/null
+++ b/fairseq/optim/adam.py
@@ -0,0 +1,209 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import math
+import types
+
+import torch
+import torch.optim
+import torch.distributed as dist
+
+from fairseq.optim import FairseqOptimizer, register_optimizer
+from fairseq.optim.fused_adam import get_fused_adam_class
+
+logger = logging.getLogger(__name__)
+
+
+@register_optimizer('adam')
+class FairseqAdam(FairseqOptimizer):
+    """Adam optimizer for fairseq.
+
+    Important note: this optimizer corresponds to the "AdamW" variant of
+    Adam in its weight decay behavior. As such, it is most closely
+    analogous to torch.optim.AdamW from PyTorch.
+    """
+
+    def __init__(self, args, params):
+        super().__init__(args)
+        fused_adam_cls = get_fused_adam_class()  # presumably None when no fused kernel is importable; verify in fused_adam
+        use_fused_adam = (
+            not getattr(args, 'use_old_adam', False)  # --use-old-adam opts out of the fused path
+            and fused_adam_cls is not None
+            and torch.cuda.is_available()
+        )
+        if getattr(args, 'tpu', False):
+            # on TPUs we use the Adam defined here, since it
+            # automatically casts gradients to FP32
+            self._optimizer = Adam(params, **self.optimizer_config)
+        elif use_fused_adam:
+            logger.info('using FusedAdam')
+            self._optimizer = fused_adam_cls(params, **self.optimizer_config)
+        else:  # fallback: the pure-PyTorch Adam from this module
+            self._optimizer = Adam(params, **self.optimizer_config)
+
+    @staticmethod
+    def add_args(parser):
+        """Add optimizer-specific arguments to the parser."""
+        # fmt: off
+        parser.add_argument('--adam-betas', default='(0.9, 0.999)', metavar='B',
+                            help='betas for Adam optimizer')
+        parser.add_argument('--adam-eps', type=float, default=1e-8, metavar='D',
+                            help='epsilon for Adam optimizer')
+        parser.add_argument('--weight-decay', '--wd', default=0.0, type=float, metavar='WD',
+                            help='weight decay')
+        # Maintain backward compatibility with old checkpoints that have stored
+        # optimizer state as fairseq.optim.adam.Adam.
+        parser.add_argument(
+            "--use-old-adam",
+            action='store_true',
+            default=False,
+            help="Use fairseq.optim.adam.Adam",
+        )
+        # fmt: on
+
+    @property
+    def optimizer_config(self):
+        """
+        Return the keyword arguments used to construct the Adam optimizer.
+        The same dictionary is meant to override optimizer args stored in
+        checkpoints, so a checkpoint can be resumed with different settings
+        (e.g. a different learning rate). Only ``args.lr[0]`` is used as lr.
+        """
+        return {
+            'lr': self.args.lr[0],
+            'betas': eval(self.args.adam_betas),  # NOTE(review): eval() of a CLI string; ast.literal_eval would be safer
+            'eps': self.args.adam_eps,
+            'weight_decay': self.args.weight_decay,
+        }
+
+    def average_params(self):
+        """Reduce Params is only used during BMUF distributed training."""
+        state_dict = self.optimizer.state_dict()
+        total_gpus = float(dist.get_world_size())
+
+        for _, value in state_dict["state"].items():
+            value["exp_avg"] /= total_gpus  # pre-divide so the SUM all-reduce below yields the mean
+            value["exp_avg_sq"] /= total_gpus
+            dist.all_reduce(value["exp_avg"], op=dist.ReduceOp.SUM)
+            dist.all_reduce(value["exp_avg_sq"], op=dist.ReduceOp.SUM)
+
+
+class Adam(torch.optim.Optimizer):
+    r"""Implements Adam algorithm.
+
+    This implementation is modified from torch.optim.Adam based on:
+    `Fixed Weight Decay Regularization in Adam`
+    (see https://arxiv.org/abs/1711.05101)
+
+    It has been proposed in `Adam: A Method for Stochastic Optimization`_.
+
+    Arguments:
+        params (iterable): iterable of parameters to optimize or dicts defining
+            parameter groups
+        lr (float, optional): learning rate (default: 1e-3)
+        betas (Tuple[float, float], optional): coefficients used for computing
+            running averages of gradient and its square (default: (0.9, 0.999))
+        eps (float, optional): term added to the denominator to improve
+            numerical stability (default: 1e-8)
+        weight_decay (float, optional): decoupled (AdamW-style) weight decay (default: 0)
+        amsgrad (boolean, optional): whether to use the AMSGrad variant of this
+            algorithm from the paper `On the Convergence of Adam and Beyond`_ (default: False)
+
+    .. _Adam\: A Method for Stochastic Optimization:
+        https://arxiv.org/abs/1412.6980
+    .. _On the Convergence of Adam and Beyond:
+        https://openreview.net/forum?id=ryQu7f-RZ
+    """
+
+    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
+                 weight_decay=0, amsgrad=False):
+        defaults = dict(lr=lr, betas=betas, eps=eps,
+                        weight_decay=weight_decay, amsgrad=amsgrad)
+        super(Adam, self).__init__(params, defaults)
+
+    @property
+    def supports_memory_efficient_fp16(self):
+        return True  # step() up-casts fp16/bf16 grads and params itself
+
+    @property
+    def supports_flat_params(self):
+        return True
+
+    def step(self, closure=None):
+        """Performs a single optimization step.
+
+        Arguments:
+            closure (callable, optional): A closure that reevaluates the model
+                and returns the loss.
+        """
+        loss = None
+        if closure is not None:
+            loss = closure()
+
+        for group in self.param_groups:
+            for p in group['params']:
+                if p.grad is None:
+                    continue
+                grad = p.grad.data
+                if grad.dtype in {torch.float16, torch.bfloat16}:
+                    grad = grad.float()  # run the moment updates in fp32
+                if grad.is_sparse:
+                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
+                amsgrad = group['amsgrad']
+
+                p_data_fp32 = p.data
+                if p.data.dtype in {torch.float16, torch.bfloat16}:
+                    p_data_fp32 = p_data_fp32.float()  # fp32 working copy; written back at the end
+
+                state = self.state[p]
+
+                # State initialization
+                if len(state) == 0:
+                    state['step'] = 0
+                    # Exponential moving average of gradient values
+                    state['exp_avg'] = torch.zeros_like(p_data_fp32)
+                    # Exponential moving average of squared gradient values
+                    state['exp_avg_sq'] = torch.zeros_like(p_data_fp32)
+                    if amsgrad:
+                        # Maintains max of all exp. moving avg. of sq. grad. values
+                        state['max_exp_avg_sq'] = torch.zeros_like(p_data_fp32)
+                else:  # existing state: align its dtype/device with the fp32 working copy
+                    state['exp_avg'] = state['exp_avg'].to(p_data_fp32)
+                    state['exp_avg_sq'] = state['exp_avg_sq'].to(p_data_fp32)
+                    if amsgrad:
+                        state['max_exp_avg_sq'] = state['max_exp_avg_sq'].to(p_data_fp32)
+
+                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
+                if amsgrad:
+                    max_exp_avg_sq = state['max_exp_avg_sq']
+                beta1, beta2 = group['betas']
+
+                state['step'] += 1
+
+                # Decay the first and second moment running average coefficient
+                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
+                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
+                if amsgrad:
+                    # Maintains the maximum of all 2nd moment running avg. till now
+                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
+                    # Use the max. for normalizing running avg. of gradient
+                    denom = max_exp_avg_sq.sqrt().add_(group['eps'])
+                else:
+                    denom = exp_avg_sq.sqrt().add_(group['eps'])
+
+                bias_correction1 = 1 - beta1 ** state['step']
+                bias_correction2 = 1 - beta2 ** state['step']
+                step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1
+
+                if group['weight_decay'] != 0:  # decay is applied to the weights directly, not folded into grad
+                    p_data_fp32.add_(p_data_fp32, alpha=-group['weight_decay'] * group['lr'])
+
+                p_data_fp32.addcdiv_(exp_avg, denom, value=-step_size)
+
+                if p.data.dtype in {torch.float16, torch.bfloat16}:
+                    p.data.copy_(p_data_fp32)  # write the fp32 result back into the low-precision param
+
+        return loss
diff --git a/fairseq/optim/adamax.py b/fairseq/optim/adamax.py
new file mode 100644
index 0000000000000000000000000000000000000000..856215a3ba97ed03ddf5741fed692d0dc32af947
--- /dev/null
+++ b/fairseq/optim/adamax.py
@@ -0,0 +1,158 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import torch.optim
+
+from . import FairseqOptimizer, register_optimizer
+
+
+@register_optimizer('adamax')
+class FairseqAdamax(FairseqOptimizer):  # fairseq wrapper around the Adamax optimizer defined in this module
+    def __init__(self, args, params):
+        super().__init__(args)
+        self._optimizer = Adamax(params, **self.optimizer_config)  # kwargs derived from parsed args
+
+    @staticmethod
+    def add_args(parser):
+        """Add optimizer-specific arguments to the parser."""
+        # fmt: off
+        parser.add_argument('--adamax-betas', default='(0.9, 0.999)', metavar='B',
+                            help='betas for Adam optimizer')  # NOTE(review): help text says "Adam" but this configures Adamax
+        parser.add_argument('--adamax-eps', type=float, default=1e-8, metavar='D',
+                            help='epsilon for Adam optimizer')
+        parser.add_argument('--weight-decay', '--wd', default=0.0, type=float, metavar='WD',
+                            help='weight decay')
+        parser.add_argument('--no-bias-correction', default=False, action='store_true',
+                            help='disable bias correction')
+        # fmt: on
+
+    @property
+    def optimizer_config(self):
+        """
+        Return the keyword arguments used to construct the Adamax optimizer.
+        The same dictionary is meant to override optimizer args stored in
+        checkpoints, so a checkpoint can be resumed with different settings
+        (e.g. a different learning rate). Only ``args.lr[0]`` is used as lr.
+        """
+        return {
+            'lr': self.args.lr[0],
+            'betas': eval(self.args.adamax_betas),  # NOTE(review): eval() of a CLI string; ast.literal_eval would be safer
+            'eps': self.args.adamax_eps,
+            'weight_decay': self.args.weight_decay,
+            'bias_correction': not self.args.no_bias_correction,  # CLI flag is negative, kwarg is positive
+        }
+
+
+class Adamax(torch.optim.Optimizer):
+    """Implements Adamax algorithm (a variant of Adam based on infinity norm).
+
+    It has been proposed in `Adam: A Method for Stochastic Optimization`__.
+
+    Compared to the version in PyTorch, this version implements a fix for weight decay.
+
+    Arguments:
+        params (iterable): iterable of parameters to optimize or dicts defining
+            parameter groups
+        lr (float, optional): learning rate (default: 2e-3)
+        betas (Tuple[float, float], optional): coefficients used for computing
+            running averages of gradient and its square
+        eps (float, optional): term added to the denominator to improve
+            numerical stability (default: 1e-8)
+        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
+        bias_correction (bool, optional): enable bias correction (default: True)
+
+    __ https://arxiv.org/abs/1412.6980
+    """
+
+    def __init__(self, params, lr=2e-3, betas=(0.9, 0.999), eps=1e-8,
+                 weight_decay=0, bias_correction=True):
+        if not 0.0 <= lr:  # reject negative hyper-parameters up front
+            raise ValueError("Invalid learning rate: {}".format(lr))
+        if not 0.0 <= eps:
+            raise ValueError("Invalid epsilon value: {}".format(eps))
+        if not 0.0 <= betas[0] < 1.0:
+            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
+        if not 0.0 <= betas[1] < 1.0:
+            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
+        if not 0.0 <= weight_decay:
+            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
+
+        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay,
+                        bias_correction=bias_correction)
+        super(Adamax, self).__init__(params, defaults)
+
+    @property
+    def supports_memory_efficient_fp16(self):
+        return True  # step() up-casts fp16/bf16 grads and params itself
+
+    @property
+    def supports_flat_params(self):
+        return True
+
+    def step(self, closure=None):
+        """Performs a single optimization step.
+
+        Arguments:
+            closure (callable, optional): A closure that reevaluates the model
+                and returns the loss.
+        """
+        loss = None
+        if closure is not None:
+            loss = closure()
+
+        for group in self.param_groups:
+            for p in group['params']:
+                if p.grad is None:
+                    continue
+                grad = p.grad.data.float()  # no copy when the grad is already fp32, so `grad` may alias p.grad
+                if grad.is_sparse:
+                    raise RuntimeError('Adamax does not support sparse gradients')
+
+                p_data_fp32 = p.data
+                if p.data.dtype in {torch.float16, torch.bfloat16}:
+                    p_data_fp32 = p_data_fp32.float()  # fp32 working copy; written back at the end
+
+                state = self.state[p]
+
+                # State initialization
+                if len(state) == 0:
+                    state['step'] = 0
+                    state['exp_avg'] = torch.zeros_like(p_data_fp32)
+                    state['exp_inf'] = torch.zeros_like(p_data_fp32)
+                else:  # existing state: align its dtype/device with the fp32 working copy
+                    state['exp_avg'] = state['exp_avg'].to(p_data_fp32)
+                    state['exp_inf'] = state['exp_inf'].to(p_data_fp32)
+
+                exp_avg, exp_inf = state['exp_avg'], state['exp_inf']
+                beta1, beta2 = group['betas']
+                eps = group['eps']
+
+                state['step'] += 1
+
+                # Update biased first moment estimate.
+                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
+
+                # Update the exponentially weighted infinity norm.
+                torch.max(
+                    exp_inf.mul_(beta2),
+                    grad.abs(),  # out-of-place: abs_() would overwrite the caller's fp32 p.grad
+                    out=exp_inf,
+                )
+
+                step_size = group['lr']
+                if group['bias_correction']:
+                    bias_correction = 1 - beta1 ** state['step']
+                    step_size /= bias_correction
+
+                if group['weight_decay'] != 0:  # decay is applied to the weights directly, not folded into grad
+                    p_data_fp32.add_(p_data_fp32, alpha=-group['weight_decay'] * group['lr'])
+
+                p_data_fp32.addcdiv_(exp_avg, exp_inf.add(eps), value=-step_size)
+
+                if p.data.dtype in {torch.float16, torch.bfloat16}:
+                    p.data.copy_(p_data_fp32)  # write the fp32 result back into the low-precision param
+
+        return loss
diff --git a/fairseq/optim/bmuf.py b/fairseq/optim/bmuf.py
new file mode 100644
index 0000000000000000000000000000000000000000..be7bdd74a777a626dbb8035b6c21c0ed3af75a67
--- /dev/null
+++ b/fairseq/optim/bmuf.py
@@ -0,0 +1,230 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import torch.distributed as dist
+
+from . import FairseqOptimizer
+
+
+class FairseqBMUF(FairseqOptimizer):
+    """
+    Implements incremental block distributed data parallelism similar to
+    https://ieeexplore.ieee.org/document/7472805
+
+    Paper title: Scalable training of deep learning machines by incremental
+    block training with intra-block parallel optimization and blockwise
+    model-update filtering
+    """
+
+    def __init__(self, args, optimizer):
+
+        super().__init__(args)
+        self._optimizer = optimizer
+        self._num_updates = 0
+        self.sync_iter = self.args.global_sync_iter
+        self.block_momentum = self.args.block_momentum
+        self.block_lr = self.args.block_lr
+        self._reset_local_data()
+        self.warmup_iteration = self.args.warmup_iterations
+        self.use_nbm = self.args.use_nbm
+        self.initial_state = self._optimizer.state_dict()
+        self.average_sync = self.args.average_sync
+        self.world_size = self.args.distributed_world_size
+
+    @staticmethod
+    def add_args(parser):
+        """Add optimizer-specific arguments to the parser."""
+        parser.add_argument(
+            "--block-lr", default=1, type=float, help="block learning rate for bmuf"
+        )
+        parser.add_argument(
+            "--block-momentum",
+            default=0.875,
+            type=float,
+            help="block momentum for bmuf",
+        )
+        parser.add_argument(
+            "--global-sync-iter",
+            default=50,
+            type=int,
+            help="Iteration for syncing global model",
+        )
+        parser.add_argument(
+            "--warmup-iterations",
+            default=500,
+            type=int,
+            help="warmup iterations for model to broadcast",
+        )
+        parser.add_argument(
+            "--use-nbm",
+            default=False,
+            action="store_true",
+            help="Specify whether you want to use classical BM / Nesterov BM",
+        )
+        parser.add_argument(
+            "--average-sync",
+            default=False,
+            action="store_true",
+            help="Specify whether you want to average the local momentum after each sync",
+        )
+
+    @property
+    def optimizer(self):
+        return self._optimizer.optimizer
+
+    @property
+    def optimizer_config(self):
+        return self._optimizer.optimizer_config
+
+    def get_lr(self):
+        return self._optimizer.get_lr()
+
+    def set_lr(self, lr):
+        self._optimizer.set_lr(lr)
+
+    def state_dict(self):
+        return self._optimizer.state_dict()
+
+    def load_state_dict(self, state_dict, optimizer_overrides=None):
+        self._optimizer.load_state_dict(state_dict, optimizer_overrides)
+        self.initial_state = self._optimizer.state_dict()
+
+    def multiply_grads(self, c):
+        """Multiplies grads by a constant *c*."""
+        self._optimizer.multiply_grads(c)
+
+    def clip_grad_norm(self, max_norm, aggregate_norm_fn=None):
+        """Clips gradient norm."""
+        return self._optimizer.clip_grad_norm(max_norm, aggregate_norm_fn)
+
+    def average_params(self):
+        self._optimizer.average_params()
+
+    def _block_sync(self):
+        if self.world_size <= 1:
+            return
+        # Update the global model using local models from all GPUs
+        # (Step-1) Calculate grad between previously synced model and
+        # current local model
+        if self.block_momentum != 0:
+            self._calc_grad()
+
+        # (Step-2) Average gradient from all GPUs
+        self._avg_grad_from_all_gpus()
+
+        # (Step-3) Calculate global momentum and update the global model
+        if self.block_momentum != 0:
+            self._update_global_model()
+
+        # (Step-4) Average local optimizer params
+        if self.average_sync:
+            self.average_params()
+
+    def _is_warmup_end(self):
+        """Return True when the number of updates equals ``warmup_iteration``,
+        i.e. on the step where warmup ends."""
+        num_updates = self.get_num_updates()
+        return num_updates == self.warmup_iteration
+
+    def _is_bmuf_iter(self):
+        """Return True when a block sync is due: the update count is past
+        warmup and is an exact multiple of ``sync_iter``."""
+        num_updates = self.get_num_updates()
+        if num_updates <= self.warmup_iteration:
+            return False
+        return num_updates % self.sync_iter == 0
+
+    def _warmup_sync(self, root_rank=0):
+        """Bring every worker in line with *root_rank* once warmup is over.
+
+        Broadcasts parameters, refreshes optimizer state, resets BMUF buffers.
+        """
+        if self.world_size <= 1:
+            return
+        for p in self.params:
+            dist.broadcast(p.data, src=root_rank)
+        if not self.average_sync:
+            self._optimizer.load_state_dict(self.initial_state)
+        else:
+            self._optimizer.average_params()
+        self._reset_local_data()
+
+    def step(self, closure=None):
+        """Step the wrapped optimizer, then run a warmup or block sync if one is due."""
+        self._optimizer.step(closure)
+        self.set_num_updates(self.get_num_updates() + 1)
+        if self._is_warmup_end():  # update count just reached warmup_iteration
+            self._warmup_sync()
+        elif self._is_bmuf_iter():  # past warmup and on a multiple of sync_iter
+            self._block_sync()
+
+    def zero_grad(self):
+        """Clears the gradients of all optimized parameters (via the wrapped optimizer)."""
+        self._optimizer.zero_grad()
+
+    def get_num_updates(self):
+        """Return the number of parameter updates recorded in ``_num_updates``."""
+        return self._num_updates
+
+    def set_num_updates(self, num_updates):
+        """Set the recorded number of parameter updates to *num_updates*."""
+        self._num_updates = num_updates
+
+    @torch.no_grad()
+    def _reset_local_data(self):
+        """(Step-0) Rebuild the BMUF buffers from the current parameters.
+
+        ``global_params`` snapshots the weights for the next block sync's
+        delta; ``smoothed_grads`` and ``grads`` restart at zero.
+        """
+        self.global_params = [torch.zeros_like(p.data).copy_(p.data) for p in self.params]
+        self.smoothed_grads = [p.data.new_zeros(p.data.shape) for p in self.params]
+        self.grads = [p.data.new_zeros(p.data.shape) for p in self.params]
+
+    @torch.no_grad()
+    def _calc_grad(self):
+        """Store ``global_param - param`` for every parameter in ``self.grads``.
+
+        ``global_params`` holds the weights kept at the last synchronisation,
+        so each entry is the previous model minus the current local model.
+        """
+        pairs = zip(self.params, self.global_params)
+        for i, (local, synced) in enumerate(pairs):
+            self.grads[i] = synced - local.data
+
+    def _avg_grad_from_all_gpus(self):  # in-place average over all workers
+        for index, param in enumerate(self.params):
+            sync_para = param.data if self.block_momentum == 0 else self.grads[index]  # weights if BM is 0, else deltas
+            sync_para /= float(dist.get_world_size())  # pre-divide so that the SUM below yields the mean
+            dist.all_reduce(sync_para, op=dist.ReduceOp.SUM)
+
+    @torch.no_grad()
+    def _update_global_model(self):
+        """Apply block momentum to the averaged deltas and rewrite the weights.
+
+        For every parameter:
+            smoothed_grad(t) = BM * smoothed_grad(t-1) + BM_lr * grad(t)
+            param = global_param - smoothed_grad(t)
+        and, when ``use_nbm`` is set, ``BM * smoothed_grad(t)`` is subtracted
+        once more. The result also becomes the new ``global_param``.
+        """
+        state = zip(self.params, self.global_params, self.smoothed_grads, self.grads)
+        for i, (local, synced, prev_smoothed, avg_delta) in enumerate(state):
+            # ``synced`` is the last synchronized parameter. The momentum
+            # buffer is local, but it is always computed from synchronized
+            # gradients, so all processes hold the same value and the
+            # resulting parameter is again a globally synchronized copy.
+            momentum = self.block_momentum * prev_smoothed + self.block_lr * avg_delta
+            local.data.copy_(synced - momentum)
+
+            # A Nesterov momentum here is to do a partial weight update
+            # before calculating the gradient
+            if self.use_nbm:
+                local.data.copy_(local.data - self.block_momentum * momentum)
+
+            # backup for the next synchronization.
+            self.smoothed_grads[i] = momentum
+            synced.copy_(local.data)
diff --git a/fairseq/optim/dynamic_loss_scaler.py b/fairseq/optim/dynamic_loss_scaler.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d1f0b2c050f03e8a30abecf03b2cece75e5dc27
--- /dev/null
+++ b/fairseq/optim/dynamic_loss_scaler.py
@@ -0,0 +1,63 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+class DynamicLossScaler(object):
+    """Loss scale that grows after overflow-free windows and shrinks on overflow."""
+
+    def __init__(
+        self, init_scale=2.**15, scale_factor=2., scale_window=2000,
+        tolerance=0.05, threshold=None, min_loss_scale=1e-4
+    ):
+        self.loss_scale = init_scale
+        self.scale_factor = scale_factor
+        self.scale_window = scale_window
+        self.tolerance = tolerance
+        self.threshold = threshold
+        self.min_loss_scale = min_loss_scale
+        self._iter = self._overflows_since_rescale = 0  # step counter / overflow tally
+        self._last_overflow_iter = self._last_rescale_iter = -1
+
+    def scale(self, outputs):
+        return self.loss_scale * outputs
+
+    def update(self):
+        """Count one step; grow the scale every ``scale_window`` steps after the last overflow."""
+        steps_since_overflow = self._iter - self._last_overflow_iter
+        if steps_since_overflow % self.scale_window == 0:
+            self.loss_scale *= self.scale_factor
+            self._last_rescale_iter = self._iter
+        self._iter += 1
+
+    def _decrease_loss_scale(self):
+        """Divide the scale by ``scale_factor``, but not below ``threshold`` if set."""
+        shrunk = self.loss_scale / self.scale_factor
+        self.loss_scale = shrunk if self.threshold is None else max(shrunk, self.threshold)
+
+    def check_overflow(self, grad_norm):
+        """Raise ``OverflowError`` if *grad_norm* is inf/NaN, first shrinking the
+        scale when the overflow rate since the last rescale reaches ``tolerance``."""
+        if not (grad_norm == float('inf') or grad_norm != grad_norm):
+            return  # finite norm (NaN is the only value unequal to itself)
+
+        scale_before = self.loss_scale
+        steps_since_rescale = self._iter - self._last_rescale_iter
+        self._last_overflow_iter = self._iter
+        self._overflows_since_rescale += 1
+        if self._overflows_since_rescale / float(steps_since_rescale) >= self.tolerance:
+            self._decrease_loss_scale()
+            self._last_rescale_iter = self._iter
+            self._overflows_since_rescale = 0
+
+        if self.loss_scale <= self.min_loss_scale:
+            # FloatingPointError is uncommon enough for callers to catch it and stop training.
+            self.loss_scale = scale_before
+            raise FloatingPointError((
+                'Minimum loss scale reached ({}). Your loss is probably exploding. '
+                'Try lowering the learning rate, using gradient clipping or '
+                'increasing the batch size.'
+            ).format(self.min_loss_scale))
+
+        self._iter += 1
+        raise OverflowError('setting loss scale to: ' + str(self.loss_scale))
diff --git a/fairseq/optim/fairseq_optimizer.py b/fairseq/optim/fairseq_optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1b9c76edb3ce0ae172205c2c4cd96764b939831
--- /dev/null
+++ b/fairseq/optim/fairseq_optimizer.py
@@ -0,0 +1,133 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+
+from fairseq import utils
+
+
+class FairseqOptimizer(object):
+    """Base wrapper exposing a common interface over a ``torch.optim.Optimizer``
+    that subclasses store in ``self._optimizer``."""
+
+    def __init__(self, args):
+        super().__init__()
+        self.args = args
+
+    @staticmethod
+    def add_args(parser):
+        """Add optimizer-specific arguments to the parser."""
+
+    @property
+    def optimizer(self):
+        """Return the ``torch.optim.Optimizer`` stored in ``self._optimizer``.
+
+        Raises ``NotImplementedError`` when ``_optimizer`` is missing and
+        ``ValueError`` when it is not a ``torch.optim.Optimizer``.
+        """
+        if hasattr(self, '_optimizer'):
+            inner = self._optimizer
+            if isinstance(inner, torch.optim.Optimizer):
+                return inner
+            raise ValueError('_optimizer must be an instance of torch.optim.Optimizer')
+        raise NotImplementedError
+
+    @property
+    def optimizer_config(self):
+        """
+        Return a kwarg dictionary that will be used to override optimizer
+        args stored in checkpoints, so that training can resume from a
+        checkpoint with a different set of optimizer args (e.g., with a
+        different learning rate).
+        """
+        raise NotImplementedError
+
+    @property
+    def params(self):
+        """Return an iterable of the parameters held by the optimizer."""
+        for group in self.param_groups:
+            yield from group['params']
+
+    @property
+    def param_groups(self):
+        return self.optimizer.param_groups
+
+    def __getstate__(self):
+        return self._optimizer.__getstate__()
+
+    def get_lr(self):
+        """Return the learning rate of the first param group."""
+        return self.param_groups[0]['lr']
+
+    def set_lr(self, lr):
+        """Set the learning rate of every param group."""
+        for group in self.param_groups:
+            group['lr'] = lr
+
+    def state_dict(self):
+        """Return the optimizer's state dict."""
+        return self.optimizer.state_dict()
+
+    def load_state_dict(self, state_dict, optimizer_overrides=None):
+        """Load an optimizer state dict.
+
+        Non-empty *optimizer_overrides* are written into every param group
+        after loading, so that the configuration of the current run (e.g.,
+        learning rate) wins over the one stored in *state_dict*.
+        """
+        self.optimizer.load_state_dict(state_dict)
+        if optimizer_overrides is None or len(optimizer_overrides) == 0:
+            return
+        # override learning rate, momentum, etc. with latest values
+        for group in self.param_groups:
+            group.update(optimizer_overrides)
+
+    def backward(self, loss):
+        """Computes the sum of gradients of the given tensor w.r.t. graph leaves."""
+        loss.backward()
+
+    def multiply_grads(self, c):
+        """Multiplies, in place, every existing grad by a constant *c*."""
+        for grad in (p.grad for p in self.params if p.grad is not None):
+            grad.data.mul_(c)
+
+    def clip_grad_norm(self, max_norm, aggregate_norm_fn=None):
+        """Clips gradient norm."""
+        return utils.clip_grad_norm_(self.params, max_norm, aggregate_norm_fn)
+
+    def step(self, closure=None, scale=1.):
+        """Performs a single optimization step.
+
+        *scale* is forwarded only if the optimizer supports stepping with it.
+        """
+        extra = {'scale': scale} if self.supports_step_with_scale else {}
+        self.optimizer.step(closure, **extra)
+
+    def zero_grad(self):
+        """Clears the gradients of all optimized parameters."""
+        for param in self.params:
+            param.grad = None
+        self.optimizer.zero_grad()
+
+    @property
+    def supports_memory_efficient_fp16(self):
+        """Flag read from the underlying optimizer; False if it defines none."""
+        return getattr(self.optimizer, 'supports_memory_efficient_fp16', False)
+
+    @property
+    def supports_step_with_scale(self):
+        """Flag read from the underlying optimizer; False if it defines none."""
+        return getattr(self.optimizer, 'supports_step_with_scale', False)
+
+    @property
+    def supports_flat_params(self):
+        """
+        Whether the optimizer supports collapsing of the model
+        parameters/gradients into a single contiguous Tensor.
+        """
+        return getattr(self.optimizer, 'supports_flat_params', False)
+
+    def average_params(self):
+        """No-op hook; the base class has nothing to average."""
diff --git a/fairseq/optim/fp16_optimizer.py b/fairseq/optim/fp16_optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..37e94965bbaa2ae6501af080ade8df5d7787ab4d
--- /dev/null
+++ b/fairseq/optim/fp16_optimizer.py
@@ -0,0 +1,413 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from itertools import chain
+
+import torch
+
+from fairseq import optim, utils
+
+from .dynamic_loss_scaler import DynamicLossScaler
+
+
+class _FP16OptimizerMixin(object):
+    """Mixin that runs the optimizer on an FP32 copy of the FP16 parameters."""
+
+    def __init__(self, *args, **kwargs):
+        # forward __init__ call to the next class in mro(method resolution order)
+        super().__init__(*args, **kwargs)
+
+    @property
+    def has_flat_params(self):
+        """True when ``fp32_params`` is one flat tensor rather than a list."""
+        return torch.is_tensor(self.fp32_params)
+
+    @classmethod
+    def build_fp32_params(cls, params, flatten=True):
+        """Create the FP32 copy of *params*, with a ``grad`` buffer attached.
+
+        Returns one flat ``torch.nn.Parameter`` when *flatten* is true, else
+        a list holding one FP32 ``torch.nn.Parameter`` per entry of *params*.
+        """
+        if not flatten:
+            fp32_params = []
+            for p in params:
+                p32 = torch.nn.Parameter(p.data.float())
+                p32.grad = torch.zeros_like(p32.data)
+                fp32_params.append(p32)
+            return fp32_params
+
+        sizes = [p.data.numel() for p in params]
+        total_param_size = sum(sizes)
+        flat = torch.zeros(total_param_size, dtype=torch.float, device=params[0].device)
+        start = 0
+        for p, numel in zip(params, sizes):
+            flat[start:start+numel].copy_(p.data.view(-1))
+            start += numel
+        flat = torch.nn.Parameter(flat)
+        flat.grad = flat.data.new(total_param_size)
+        return flat
+
+    def state_dict(self):
+        """Return the FP32 optimizer's state dict, plus ``loss_scale`` if scaling."""
+        state = self.fp32_optimizer.state_dict()
+        if self.scaler is not None:
+            state['loss_scale'] = self.scaler.loss_scale
+        return state
+
+    def load_state_dict(self, state_dict, optimizer_overrides=None):
+        """Load an optimizer state dict, restoring ``loss_scale`` when present.
+
+        *optimizer_overrides* is handed on to the FP32 optimizer unchanged.
+        """
+        if self.scaler is not None and 'loss_scale' in state_dict:
+            self.scaler.loss_scale = state_dict['loss_scale']
+        self.fp32_optimizer.load_state_dict(state_dict, optimizer_overrides)
+
+    def backward(self, loss):
+        """Computes the sum of gradients of the given tensor w.r.t. graph leaves.
+
+        Unlike :func:`fairseq.optim.FairseqOptimizer.backward`, the loss is
+        first scaled by the dynamic loss scaler (if any) to avoid gradient
+        underflow, and the FP32 grads are flagged as needing a sync.
+        """
+        scaled = loss if self.scaler is None else self.scaler.scale(loss)
+        scaled.backward()
+        self._needs_sync = True
+
+    def _sync_fp16_grads_to_fp32(self, multiply_grads=1.):
+        """Copy the FP16 grads into the FP32 grads, scaled by *multiply_grads*
+        (and un-scaled by the loss scale); no-op unless ``_needs_sync``."""
+        if not self._needs_sync:
+            return
+        if self.scaler is not None:
+            # correct for dynamic loss scaler
+            multiply_grads /= self.scaler.loss_scale
+
+        if self.has_flat_params:
+            flat_grad = self.fp32_params.grad.data
+            start = 0
+            for p in self.fp16_params:
+                if not p.requires_grad:
+                    continue
+                grad_data = p.grad.data if p.grad is not None else p.data.new_zeros(p.data.shape)
+                numel = grad_data.numel()
+                flat_grad[start:start+numel].copy_(grad_data.view(-1))
+                start += numel
+            flat_grad.mul_(multiply_grads)
+        else:
+            for p, p32 in zip(self.fp16_params, self.fp32_params):
+                if not p.requires_grad:
+                    continue
+                if p.grad is None:
+                    p32.grad = torch.zeros_like(p.data, dtype=torch.float)
+                else:
+                    p32.grad.data.copy_(p.grad.data)
+                    p32.grad.data.mul_(multiply_grads)
+        self._needs_sync = False
+
+    def _sync_fp32_grads_to_fp16(self):
+        """Copy the FP32 *parameters* (not grads) back into the FP16 model."""
+        if self.has_flat_params:
+            flat = self.fp32_params.data
+            start = 0
+            for p in self.fp16_params:
+                if p.requires_grad:
+                    end = start + p.data.numel()
+                    p.data.copy_(flat[start:end].view_as(p.data))
+                    start = end
+        else:
+            for p, p32 in zip(self.fp16_params, self.fp32_params):
+                if p.requires_grad:
+                    p.data.copy_(p32.data)
+
+    def multiply_grads(self, c):
+        """Multiplies grads by a constant ``c``."""
+        if self._needs_sync:
+            # fold the factor into the pending FP16 -> FP32 copy
+            self._sync_fp16_grads_to_fp32(c)
+            return
+        fp32 = [self.fp32_params] if self.has_flat_params else self.fp32_params
+        for p32 in fp32:
+            p32.grad.data.mul_(c)
+
+    def clip_grad_norm(self, max_norm, aggregate_norm_fn=None):
+        """Clips gradient norm and updates dynamic loss scaler."""
+        self._sync_fp16_grads_to_fp32()
+        norm = utils.clip_grad_norm_(self.fp32_params, max_norm, aggregate_norm_fn)
+        if self.scaler is not None:
+            # detect overflow and adjust loss scale
+            self.scaler.check_overflow(norm)
+        return norm
+
+    def step(self, closure=None):
+        """Performs a single optimization step.
+
+        Syncs grads to FP32, steps, updates the scaler, copies weights to FP16.
+        """
+        self._sync_fp16_grads_to_fp32()
+        self.fp32_optimizer.step(closure)
+        if self.scaler is not None:
+            self.scaler.update()
+        self._sync_fp32_grads_to_fp16()
+
+    def zero_grad(self):
+        """Clears the gradients of all optimized parameters."""
+        for p in self.fp16_params:
+            p.grad = None
+        fp32 = [self.fp32_params] if self.has_flat_params else self.fp32_params
+        for p32 in fp32:
+            p32.grad.zero_()
+        self._needs_sync = False
+
+
+class FP16Optimizer(_FP16OptimizerMixin, optim.FairseqOptimizer):
+    """
+    Wrap an *optimizer* to support FP16 (mixed precision) training. The
+    wrapped optimizer updates an FP32 copy of the FP16 parameters.
+    """
+
+    def __init__(self, args, params, fp32_optimizer, fp32_params):
+        """Wrap *fp32_optimizer*; *fp32_params* is the FP32 copy of *params*."""
+        super().__init__(args)
+        self.fp16_params = params
+        self.fp32_optimizer = fp32_optimizer
+        self.fp32_params = fp32_params
+
+        scale_window = getattr(args, 'fp16_scale_window', None)
+        if scale_window is None:
+            if len(args.update_freq) > 1:
+                raise ValueError(
+                    '--fp16-scale-window must be given explicitly when using a '
+                    'custom --update-freq schedule'
+                )
+            # default: 2**14 divided by the data-parallel size and the update frequency
+            data_parallel_size = int(args.distributed_world_size / args.model_parallel_size)
+            scale_window = int(2**14 / data_parallel_size / args.update_freq[0])
+
+        # disable loss scaling for bfloat16
+        self.scaler = None
+        if not getattr(args, 'bf16', False):
+            self.scaler = DynamicLossScaler(
+                init_scale=args.fp16_init_scale,
+                scale_window=scale_window,
+                tolerance=args.fp16_scale_tolerance,
+                threshold=args.threshold_loss_scale,
+                min_loss_scale=args.min_loss_scale
+            )
+
+    @classmethod
+    def build_optimizer(cls, args, params):
+        """Build the FP32 copy of *params*, its optimizer, and the wrapper.
+
+        Args:
+            args (argparse.Namespace): fairseq args
+            params (iterable): iterable of parameters to optimize
+        """
+        # bf16 never flattens: mixed precision is faster on TPUs without flat grads
+        no_flatten = getattr(args, 'bf16', False) or getattr(args, 'fp16_no_flatten_grads', False)
+        flatten = not no_flatten
+        fp32_params = cls.build_fp32_params(params, flatten=flatten)
+        optimizer_params = [fp32_params] if flatten else fp32_params
+        fp32_optimizer = optim.build_optimizer(args, optimizer_params)
+        if flatten and not fp32_optimizer.supports_flat_params:
+            raise RuntimeError(
+                'chosen optimizer does not support flat params, '
+                'please set --fp16-no-flatten-grads'
+            )
+        return cls(args, params, fp32_optimizer, fp32_params)
+
+    @property
+    def optimizer(self):
+        return self.fp32_optimizer.optimizer
+
+    @property
+    def optimizer_config(self):
+        return self.fp32_optimizer.optimizer_config
+
+    def get_lr(self):
+        return self.fp32_optimizer.get_lr()
+
+    def set_lr(self, lr):
+        self.fp32_optimizer.set_lr(lr)
+
+
+class _MemoryEfficientFP16OptimizerMixin(object):
+    """Mixin for FP16 training that defers grad scaling via ``_multiply_factor``."""
+
+    def __init__(self, *args, **kwargs):
+        # forward __init__ call to the next class in MRO (method resolution order)
+        super().__init__(*args, **kwargs)
+        # defined up front so it exists even before the first zero_grad()
+        self._multiply_factor = 1.
+
+    @property
+    def has_flat_params(self):
+        return False
+
+    def state_dict(self):
+        """Return the wrapped optimizer's state dict, plus ``loss_scale`` if scaling."""
+        state_dict = self.wrapped_optimizer.state_dict()
+        if self.scaler is not None:
+            state_dict['loss_scale'] = self.scaler.loss_scale
+        return state_dict
+
+    def load_state_dict(self, state_dict, optimizer_overrides=None):
+        """Load an optimizer state dict (restoring ``loss_scale`` when present);
+        *optimizer_overrides* is handed on to the wrapped optimizer."""
+        if 'loss_scale' in state_dict and self.scaler is not None:
+            self.scaler.loss_scale = state_dict['loss_scale']
+        self.wrapped_optimizer.load_state_dict(state_dict, optimizer_overrides)
+
+        # Hack: PyTorch automatically casts the optimizer state to match the
+        # type of the current parameters. But with --memory-efficient-fp16 the
+        # params are FP16 while the optimizer state is FP32 and we don't want
+        # to cast. A workaround is to manually copy back the original state
+        # after the optimizer has been loaded.
+        groups = self.optimizer.param_groups
+        saved_groups = state_dict['param_groups']
+        id_map = dict(zip(
+            chain(*(g['params'] for g in saved_groups)),
+            chain(*(g['params'] for g in groups)),
+        ))
+        for k, v in state_dict['state'].items():
+            if k in id_map:
+                self.optimizer.state[id_map[k]] = v
+
+    def backward(self, loss):
+        """Computes the sum of gradients of the given tensor w.r.t. graph leaves.
+
+        Compared to :func:`fairseq.optim.FairseqOptimizer.backward`, this
+        function additionally dynamically scales the loss to avoid gradient
+        underflow.
+        """
+        if self.scaler is not None:
+            loss = self.scaler.scale(loss)
+        loss.backward()
+
+    def _unscale_grads(self):
+        """Apply the pending ``_multiply_factor`` to the grads and reset it to 1."""
+        if self._multiply_factor != 1.:
+            self.wrapped_optimizer.multiply_grads(self._multiply_factor)
+            self._multiply_factor = 1.
+
+    def multiply_grads(self, c):
+        """Multiplies grads by a constant *c* (deferred via ``_multiply_factor``)."""
+        self._multiply_factor *= c
+
+    def clip_grad_norm(self, max_norm, aggregate_norm_fn=None):
+        """Clips gradient norm and updates dynamic loss scaler.
+
+        Clipping is folded into ``_multiply_factor``; a *max_norm* of 0 or
+        less disables it.
+        """
+        max_norm = float(max_norm)
+        grad_norm = self._multiply_factor * self.wrapped_optimizer.clip_grad_norm(0, aggregate_norm_fn)
+
+        if self.scaler is not None:
+            grad_norm_cpu = float(grad_norm)
+            if grad_norm_cpu > max_norm > 0.:
+                self._multiply_factor *= max_norm / grad_norm_cpu
+
+            # detect overflow and adjust loss scale
+            self.scaler.check_overflow(grad_norm_cpu)
+        elif max_norm > 0.:
+            # without this guard max_norm=0 gives clip_coef=0 and wipes the grads
+            clip_coef = (max_norm / (grad_norm + 1e-6)).clamp_(max=1)
+            self._multiply_factor *= clip_coef
+
+        return grad_norm
+
+    def step(self, closure=None):
+        """Performs a single optimization step."""
+        if self.supports_step_with_scale:
+            # NOTE(msb) optimizer divides by scale factor
+            self.wrapped_optimizer.step(closure, scale=(1. / self._multiply_factor))
+        else:
+            self._unscale_grads()
+            self.wrapped_optimizer.step(closure)
+
+        if self.scaler is not None:
+            self.scaler.update()
+
+    def zero_grad(self):
+        """Clears the gradients and resets the pending multiply factor."""
+        self.wrapped_optimizer.zero_grad()
+        # reset even without a scaler, else clip factors pile up across steps
+        scale = 1. if self.scaler is None else float(self.scaler.loss_scale)
+        self._multiply_factor = 1. / scale
+
+
+class MemoryEfficientFP16Optimizer(_MemoryEfficientFP16OptimizerMixin, optim.FairseqOptimizer):
+    """
+    Wrap an *optimizer* to support FP16 (mixed precision) training.
+
+    Compared to :class:`fairseq.optim.FP16Optimizer`, this version does not
+    maintain an FP32 copy of the model. We instead expect the optimizer to
+    convert the gradients to FP32 internally and sync the results back to the
+    FP16 model params. This significantly reduces memory usage but slightly
+    increases the time spent in the optimizer.
+
+    Since this wrapper depends on specific functionality in the wrapped
+    optimizer (i.e., on-the-fly conversion of grads to FP32), only certain
+    optimizers can be wrapped. This is determined by the
+    *supports_memory_efficient_fp16* property.
+    """
+
+    def __init__(self, args, params, optimizer):
+        if not optimizer.supports_memory_efficient_fp16:
+            raise ValueError('Unsupported optimizer: {}'.format(optimizer.__class__.__name__))
+
+        super().__init__(args)
+        self.wrapped_optimizer = optimizer
+
+        if getattr(args, 'fp16_scale_window', None) is None:
+            if len(args.update_freq) > 1:
+                raise ValueError(
+                    '--fp16-scale-window must be given explicitly when using a '
+                    'custom --update-freq schedule'
+                )
+            data_parallel_size = int(args.distributed_world_size / args.model_parallel_size)
+            # int(): DynamicLossScaler.update() checks `... % scale_window == 0`,
+            # which a fractional window would practically never satisfy
+            scale_window = int(2**14 / data_parallel_size / args.update_freq[0])
+        else:
+            scale_window = args.fp16_scale_window
+
+        if not getattr(args, 'bf16', False):
+            self.scaler = DynamicLossScaler(
+                init_scale=args.fp16_init_scale,
+                scale_window=scale_window,
+                tolerance=args.fp16_scale_tolerance,
+                threshold=args.threshold_loss_scale,
+                min_loss_scale=args.min_loss_scale
+            )
+        else:
+            # disable loss scaling for bfloat16
+            self.scaler = None
+
+    @classmethod
+    def build_optimizer(cls, args, params):
+        """
+        Args:
+            args (argparse.Namespace): fairseq args
+            params (iterable): iterable of parameters to optimize
+        """
+        fp16_optimizer = optim.build_optimizer(args, params)
+        return cls(args, params, fp16_optimizer)
+
+    @property
+    def optimizer(self):
+        return self.wrapped_optimizer.optimizer
+
+    @property
+    def optimizer_config(self):
+        return self.wrapped_optimizer.optimizer_config
+
+    def get_lr(self):
+        return self.wrapped_optimizer.get_lr()
+
+    def set_lr(self, lr):
+        self.wrapped_optimizer.set_lr(lr)
diff --git a/fairseq/optim/fused_adam.py b/fairseq/optim/fused_adam.py
new file mode 100644
index 0000000000000000000000000000000000000000..9024451aff6df740d699001f5f7c4ca4e2cf3111
--- /dev/null
+++ b/fairseq/optim/fused_adam.py
@@ -0,0 +1,312 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import types
+
+import torch
+
+
+def get_fused_adam_class():
+    """
+    Pick the FusedAdam implementation to use with apex.
+
+    The ``fused_adam_cuda`` extension is tried first. It is the "deprecated"
+    interface in recent versions of apex, but a bit faster than the main
+    interface since we don't use the apex optimizer; it can be installed by
+    passing the `--deprecated_fused_adam` option when building apex. Otherwise
+    we fall back to ``apex.optimizers.FusedAdam`` (``FusedAdamV2``).
+    Returns ``FusedAdamV1``, ``FusedAdamV2`` or ``None`` if neither is usable.
+    """
+    global fused_adam_cuda
+    import importlib
+    try:
+        fused_adam_cuda = importlib.import_module("fused_adam_cuda")
+    except ImportError:
+        pass
+    else:
+        return FusedAdamV1
+    try:
+        from apex.optimizers import FusedAdam as _FusedAdam  # noqa
+        from apex.multi_tensor_apply import multi_tensor_applier
+    except ImportError:
+        return None
+    return FusedAdamV2 if multi_tensor_applier.available else None
+
+
+class FusedAdamV1(torch.optim.Optimizer):
+    """
+    Implements Adam algorithm. Currently GPU-only. Requires Apex to be installed via
+    ``python setup.py install --cuda_ext --cpp_ext``.
+
+    It has been proposed in `Adam: A Method for Stochastic Optimization`_.
+
+    Compared to the original version in Apex, the fairseq version casts grads
+    and params to FP32 internally to support ``--memory-efficient-fp16``.
+
+    Arguments:
+        params (iterable): iterable of parameters to optimize or dicts defining
+            parameter groups.
+        lr (float, optional): learning rate. (default: 1e-3)
+        bias_correction (boolean, optional): forwarded to the fused kernel. (default: True)
+        betas (Tuple[float, float], optional): coefficients used for computing
+            running averages of gradient and its square. (default: (0.9, 0.999))
+        eps (float, optional): term added to the denominator to improve
+            numerical stability. (default: 1e-8)
+        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
+        max_grad_norm (float, optional): norm threshold used in ``step``. (default: 0)
+        amsgrad (boolean, optional): whether to use the AMSGrad variant of this
+            algorithm from the paper `On the Convergence of Adam and Beyond`_
+            (default: False) NOT SUPPORTED in FusedAdam!
+        eps_inside_sqrt (boolean, optional): in the 'update parameters' step,
+            adds eps to the bias-corrected second moment estimate before
+            evaluating square root instead of adding it to the square root of
+            second moment estimate as in the original paper. (default: False)
+    .. _Adam: A Method for Stochastic Optimization:
+        https://arxiv.org/abs/1412.6980
+    .. _On the Convergence of Adam and Beyond:
+        https://openreview.net/forum?id=ryQu7f-RZ
+    """
+
+    def __init__(self, params,
+                 lr=1e-3, bias_correction=True,
+                 betas=(0.9, 0.999), eps=1e-8, eps_inside_sqrt=False,
+                 weight_decay=0., max_grad_norm=0., amsgrad=False):
+        global fused_adam_cuda
+        import importlib
+        fused_adam_cuda = importlib.import_module("fused_adam_cuda")
+
+        if amsgrad:
+            raise RuntimeError('FusedAdam does not support the AMSGrad variant.')
+        defaults = {
+            'lr': lr,
+            'bias_correction': bias_correction,
+            'betas': betas,
+            'eps': eps,
+            'weight_decay': weight_decay,
+            'max_grad_norm': max_grad_norm,
+        }
+        super().__init__(params, defaults)
+        self.eps_mode = 0 if eps_inside_sqrt else 1
+
+    @property
+    def supports_memory_efficient_fp16(self):
+        return True
+
+    @property
+    def supports_flat_params(self):
+        return True
+
+    @property
+    def supports_step_with_scale(self):
+        return True
+
+    def step(self, closure=None, grads=None, scale=1., grad_norms=None):
+        """Performs a single optimization step.
+        Arguments:
+            closure (callable, optional): A closure that reevaluates the model
+                and returns the loss.
+            grads (list of tensors, optional): weight gradient to use for the
+                optimizer update. If gradients have type torch.half, parameters
+                are expected to be in type torch.float. (default: None)
+            grad_norms (list, optional): one gradient norm per parameter group;
+                only read for groups whose ``max_grad_norm`` is positive, and
+                expected to already include ``scale``. (default: None)
+            scale (float, optional): factor to divide gradient tensor values
+                by before applying to weights. (default: 1)
+        """
+        loss = None
+        if closure is not None:
+            loss = closure()
+
+        if grads is None:
+            grads_group = [None] * len(self.param_groups)
+        # backward compatibility
+        # assuming a list/generator of parameter means single group
+        elif isinstance(grads, types.GeneratorType) or not isinstance(grads[0], list):
+            grads_group = [grads]
+        else:
+            grads_group = grads
+
+        if grad_norms is None:
+            grad_norms = [None]*len(self.param_groups)
+
+        for group, grads_this_group, grad_norm in zip(self.param_groups, grads_group, grad_norms):
+            if grads_this_group is None:
+                grads_this_group = [None]*len(group['params'])
+
+            # compute combined scale factor for this group
+            combined_scale = scale
+            if group.get('max_grad_norm', 0) > 0:
+                # norm is in fact norm*scale
+                clip = ((grad_norm / scale) + 1e-6) / group['max_grad_norm']
+                if clip > 1:
+                    combined_scale = clip * scale
+
+            bias_correction = 1 if group.get('bias_correction', 1) else 0
+
+            for p, grad in zip(group['params'], grads_this_group):
+                # note: p.grad should not ever be set for correct
+                # operation of mixed precision optimizer that sometimes
+                # sends None gradients
+                if p.grad is None and grad is None:
+                    continue
+                if grad is None:
+                    grad = p.grad.data
+                if grad.is_sparse:
+                    raise RuntimeError(
+                        'FusedAdam does not support sparse gradients, '
+                        'please consider SparseAdam instead'
+                    )
+
+                p_data_fp32 = p.data.float()
+
+                state = self.state[p]
+
+                # State initialization
+                if len(state) == 0:
+                    state['step'] = 0
+                    # Exponential moving average of gradient values
+                    state['exp_avg'] = torch.zeros_like(p_data_fp32)
+                    # Exponential moving average of squared gradient values
+                    state['exp_avg_sq'] = torch.zeros_like(p_data_fp32)
+                else:
+                    state['exp_avg'] = state['exp_avg'].to(p_data_fp32)
+                    state['exp_avg_sq'] = state['exp_avg_sq'].to(p_data_fp32)
+
+                exp_avg = state['exp_avg']
+                exp_avg_sq = state['exp_avg_sq']
+                beta1, beta2 = group['betas']
+
+                state['step'] += 1
+
+                out_p = p.data
+                with torch.cuda.device(p.device):
+                    fused_adam_cuda.adam(p_data_fp32,
+                                         out_p,
+                                         exp_avg,
+                                         exp_avg_sq,
+                                         grad,
+                                         group['lr'],
+                                         beta1,
+                                         beta2,
+                                         group['eps'],
+                                         combined_scale,
+                                         state['step'],
+                                         self.eps_mode,
+                                         bias_correction,
+                                         group['weight_decay'])
+
+        return loss
+
+
+try:
+    from apex.optimizers import FusedAdam
+    from apex.multi_tensor_apply import multi_tensor_applier
+
+    class FusedAdamV2(FusedAdam):
+        """
+        Compared to the original version in Apex, the fairseq version casts grads
+        and params to FP32 internally to support ``--memory-efficient-fp16``.
+        """
+
+        def __init__(self, *args, **kwargs):
+            super().__init__(*args, **kwargs)
+            if not hasattr(self, 'multi_tensor_adam'):
+                raise Exception('Apex installation is outdated. Please install an updated version of apex.')
+
+        @property
+        def supports_memory_efficient_fp16(self):
+            return True
+
+        @property
+        def supports_flat_params(self):
+            return True
+
+        def step(self, closure=None, grads=None, output_params=None, scale=None, grad_norms=None):
+            """Performs a single optimization step."""
+            loss = None
+            if closure is not None:
+                loss = closure()
+
+            for group in self.param_groups:
+                bias_correction = 1 if group['bias_correction'] else 0
+                beta1, beta2 = group['betas']
+
+                # assume same step across group now to simplify things
+                # per parameter step can be easily support by making it tensor, or pass list into kernel
+                group['step'] = group.get('step', 0) + 1
+
+                # create lists for multi-tensor apply
+                g_16, p_16, orig_p_16, m_16, v_16 = [], [], [], [], []
+                g_32, p_32, m_32, v_32 = [], [], [], []
+
+                for p in group['params']:
+                    if p.grad is None:
+                        continue
+                    if p.grad.data.is_sparse:
+                        raise RuntimeError(
+                            'FusedAdam does not support sparse gradients, '
+                            'please consider SparseAdam instead'
+                        )
+
+                    state = self.state[p]
+                    # State initialization
+                    if len(state) == 0:
+                        # Exponential moving average of gradient values
+                        state['exp_avg'] = torch.zeros_like(p.data, dtype=torch.float)
+                        # Exponential moving average of squared gradient values
+                        state['exp_avg_sq'] = torch.zeros_like(p.data, dtype=torch.float)
+                    else:
+                        state['exp_avg'] = state['exp_avg'].to(device=p.data.device, dtype=torch.float)
+                        state['exp_avg_sq'] = state['exp_avg_sq'].to(device=p.data.device, dtype=torch.float)
+
+                    if p.dtype == torch.float16:
+                        g_16.append(p.grad.data.float())
+                        p_16.append(p.data.float())
+                        orig_p_16.append(p.data)
+                        m_16.append(state['exp_avg'])
+                        v_16.append(state['exp_avg_sq'])
+                    elif p.dtype == torch.float32:
+                        g_32.append(p.grad.data)
+                        p_32.append(p.data)
+                        m_32.append(state['exp_avg'])
+                        v_32.append(state['exp_avg_sq'])
+                    else:
+                        raise RuntimeError('FusedAdam only support fp16 and fp32.')
+
+                if not g_16 and not g_32:
+                    # nothing to update (empty group or no gradients): skip the kernels
+                    continue
+                with torch.cuda.device(group['params'][-1].device):
+                    if(len(g_16) > 0):
+                        multi_tensor_applier(self.multi_tensor_adam,
+                                             self._dummy_overflow_buf,
+                                             [g_16, p_16, m_16, v_16],
+                                             group['lr'],
+                                             beta1,
+                                             beta2,
+                                             group['eps'],
+                                             group['step'],
+                                             self.adam_w_mode,
+                                             bias_correction,
+                                             group['weight_decay'])
+                        for orig_p, p_fp32 in zip(orig_p_16, p_16):
+                            orig_p.copy_(p_fp32.data)
+                    if(len(g_32) > 0):
+                        multi_tensor_applier(self.multi_tensor_adam,
+                                             self._dummy_overflow_buf,
+                                             [g_32, p_32, m_32, v_32],
+                                             group['lr'],
+                                             beta1,
+                                             beta2,
+                                             group['eps'],
+                                             group['step'],
+                                             self.adam_w_mode,
+                                             bias_correction,
+                                             group['weight_decay'])
+
+            return loss
+except ImportError:
+    pass
diff --git a/fairseq/optim/fused_lamb.py b/fairseq/optim/fused_lamb.py
new file mode 100644
index 0000000000000000000000000000000000000000..f9b0409c5333a9ca2bf2fe70b3d222a57dcdd0cc
--- /dev/null
+++ b/fairseq/optim/fused_lamb.py
@@ -0,0 +1,50 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from fairseq.optim import FairseqOptimizer, register_optimizer
+
+
+@register_optimizer('lamb')
+class FairseqLAMB(FairseqOptimizer):
+    """LAMB optimizer."""
+
+    def __init__(self, args, params):
+        super().__init__(args)
+        try:
+            from apex.optimizers import FusedLAMB
+            self._optimizer = FusedLAMB(params, **self.optimizer_config)
+        except ImportError:
+            raise ImportError('Please install apex to use LAMB optimizer')
+
+    @staticmethod
+    def add_args(parser):
+        """Add optimizer-specific arguments to the parser."""
+        # fmt: off
+        parser.add_argument('--lamb-betas', default='(0.9, 0.999)', metavar='B',
+                            help='betas for LAMB optimizer')
+        parser.add_argument('--lamb-eps', type=float, default=1e-8, metavar='D',
+                            help='epsilon for LAMB optimizer')
+        parser.add_argument('--weight-decay', '--wd', default=0.0, type=float, metavar='WD',
+                            help='weight decay')
+        # fmt: on
+
+    @property
+    def optimizer_config(self):
+        """
+        Return a kwarg dictionary that will be used to override optimizer
+        args stored in checkpoints, so training can resume from a checkpoint
+        with different optimizer args (e.g. a different learning rate).
+        """
+        import ast  # local import; only needed to parse --lamb-betas
+        return {
+            'lr': self.args.lr[0],
+            'betas': ast.literal_eval(self.args.lamb_betas),  # parse literal, never execute
+            'eps': self.args.lamb_eps,
+            'weight_decay': self.args.weight_decay,
+        }
+
+    @property
+    def supports_flat_params(self):
+        return False
diff --git a/fairseq/optim/lr_scheduler/__init__.py b/fairseq/optim/lr_scheduler/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..edd0a6a13e1a10e91a8653371a420b36bcb2cc27
--- /dev/null
+++ b/fairseq/optim/lr_scheduler/__init__.py
@@ -0,0 +1,23 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import importlib
+import os
+
+from fairseq import registry
+from fairseq.optim.lr_scheduler.fairseq_lr_scheduler import FairseqLRScheduler
+
+
+build_lr_scheduler, register_lr_scheduler, LR_SCHEDULER_REGISTRY = registry.setup_registry(
+    '--lr-scheduler',
+    base_class=FairseqLRScheduler,
+    default='fixed',
+)
+
+# automatically import any Python files in the optim/lr_scheduler/ directory
+for file in os.listdir(os.path.dirname(__file__)):
+    if file.endswith('.py') and not file.startswith('_'):
+        module = file[:-len('.py')]  # strip only the trailing extension
+        importlib.import_module('fairseq.optim.lr_scheduler.' + module)
diff --git a/fairseq/optim/lr_scheduler/__pycache__/__init__.cpython-310.pyc b/fairseq/optim/lr_scheduler/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5ff0cd34b65ec35fdfdc0923c5d1f7a2ebc660cb
Binary files /dev/null and b/fairseq/optim/lr_scheduler/__pycache__/__init__.cpython-310.pyc differ
diff --git a/fairseq/optim/lr_scheduler/__pycache__/cosine_lr_scheduler.cpython-310.pyc b/fairseq/optim/lr_scheduler/__pycache__/cosine_lr_scheduler.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dcc046b845c58af19d025431930033c71d18cc71
Binary files /dev/null and b/fairseq/optim/lr_scheduler/__pycache__/cosine_lr_scheduler.cpython-310.pyc differ
diff --git a/fairseq/optim/lr_scheduler/__pycache__/fairseq_lr_scheduler.cpython-310.pyc b/fairseq/optim/lr_scheduler/__pycache__/fairseq_lr_scheduler.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2eca6a51b616673c8527fce24e9a6ca6a7290949
Binary files /dev/null and b/fairseq/optim/lr_scheduler/__pycache__/fairseq_lr_scheduler.cpython-310.pyc differ
diff --git a/fairseq/optim/lr_scheduler/__pycache__/fixed_schedule.cpython-310.pyc b/fairseq/optim/lr_scheduler/__pycache__/fixed_schedule.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..19452785fd577d1f2eb88af6996e5f3d50610726
Binary files /dev/null and b/fairseq/optim/lr_scheduler/__pycache__/fixed_schedule.cpython-310.pyc differ
diff --git a/fairseq/optim/lr_scheduler/__pycache__/inverse_square_root_schedule.cpython-310.pyc b/fairseq/optim/lr_scheduler/__pycache__/inverse_square_root_schedule.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b741628a674fb0b26fc5995c1710d23db3d40777
Binary files /dev/null and b/fairseq/optim/lr_scheduler/__pycache__/inverse_square_root_schedule.cpython-310.pyc differ
diff --git a/fairseq/optim/lr_scheduler/__pycache__/polynomial_decay_schedule.cpython-310.pyc b/fairseq/optim/lr_scheduler/__pycache__/polynomial_decay_schedule.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f412212e2f58dc6ce71a25075536f00620a56d03
Binary files /dev/null and b/fairseq/optim/lr_scheduler/__pycache__/polynomial_decay_schedule.cpython-310.pyc differ
diff --git a/fairseq/optim/lr_scheduler/__pycache__/reduce_lr_on_plateau.cpython-310.pyc b/fairseq/optim/lr_scheduler/__pycache__/reduce_lr_on_plateau.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1ec8a3c4feab9a7a753dafa6fba812fa68def743
Binary files /dev/null and b/fairseq/optim/lr_scheduler/__pycache__/reduce_lr_on_plateau.cpython-310.pyc differ
diff --git a/fairseq/optim/lr_scheduler/__pycache__/tri_stage_lr_scheduler.cpython-310.pyc b/fairseq/optim/lr_scheduler/__pycache__/tri_stage_lr_scheduler.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..de654e1b6720ecfa8dab1cd4e1c7d7dea82e36c7
Binary files /dev/null and b/fairseq/optim/lr_scheduler/__pycache__/tri_stage_lr_scheduler.cpython-310.pyc differ
diff --git a/fairseq/optim/lr_scheduler/__pycache__/triangular_lr_scheduler.cpython-310.pyc b/fairseq/optim/lr_scheduler/__pycache__/triangular_lr_scheduler.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a2c1f885567c9d3dacd33756cd919a3e4cfd98d9
Binary files /dev/null and b/fairseq/optim/lr_scheduler/__pycache__/triangular_lr_scheduler.cpython-310.pyc differ
diff --git a/fairseq/optim/lr_scheduler/cosine_lr_scheduler.py b/fairseq/optim/lr_scheduler/cosine_lr_scheduler.py
new file mode 100644
index 0000000000000000000000000000000000000000..9137e11b786a860cd6ab9fdcf35523c088989781
--- /dev/null
+++ b/fairseq/optim/lr_scheduler/cosine_lr_scheduler.py
@@ -0,0 +1,118 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+
+from . import FairseqLRScheduler, register_lr_scheduler
+
+
+@register_lr_scheduler('cosine')
+class CosineSchedule(FairseqLRScheduler):
+    """Assign LR based on a cyclical schedule that follows the cosine function.
+
+    See https://arxiv.org/pdf/1608.03983.pdf for details.
+
+    We also support a warmup phase where we linearly increase the learning rate
+    from some initial learning rate (``--warmup-init-lr``) until the configured
+    max learning rate (``--max-lr``).
+
+    During warmup::
+
+      lrs = torch.linspace(args.warmup_init_lr, args.max_lr, args.warmup_updates)
+      lr = lrs[update_num]
+
+    After warmup::
+
+      lr = lr_min + 0.5*(lr_max - lr_min)*(1 + cos(pi * t_curr / t_i))
+
+    where ``t_curr`` is the number of updates into the current period and
+    ``t_i`` is the length of that period, scaled by ``t_mult`` after each
+    period; ``lr_min``/``lr_max`` are multiplied by ``lr_shrink`` per period.
+    """
+
+    def __init__(self, args, optimizer):
+        super().__init__(args, optimizer)
+        if len(args.lr) > 1:
+            raise ValueError(
+                'Cannot use a fixed learning rate schedule with cosine.'
+                ' Consider --lr-scheduler=fixed instead.'
+            )
+
+        warmup_end_lr = args.max_lr
+        if args.warmup_init_lr < 0:
+            args.warmup_init_lr = args.lr[0]
+
+        self.min_lr = args.lr[0]
+        self.max_lr = args.max_lr
+
+        assert self.max_lr > self.min_lr, 'max_lr must be more than lr'
+
+        self.t_mult = args.t_mult
+        self.period = args.lr_period_updates
+
+        if self.period <= 0:
+            assert args.max_update >= 0, 'Either --max_update or --lr-period-updates must be set'
+            self.period = args.max_update - args.warmup_updates
+
+        if args.warmup_updates > 0:
+            # linearly warmup for the first args.warmup_updates
+            self.lr_step = (warmup_end_lr - args.warmup_init_lr) / args.warmup_updates
+        else:
+            self.lr_step = 1  # placeholder; only read in the warmup branch of step_update
+
+        self.warmup_updates = args.warmup_updates
+        self.lr_shrink = args.lr_shrink
+
+        # initial learning rate
+        self.lr = args.warmup_init_lr
+        self.optimizer.set_lr(self.lr)
+
+    @staticmethod
+    def add_args(parser):
+        """Add arguments to the parser for this LR scheduler."""
+        # fmt: off
+        parser.add_argument('--warmup-updates', default=0, type=int, metavar='N',
+                            help='warmup the learning rate linearly for the first N updates')
+        parser.add_argument('--warmup-init-lr', default=-1, type=float, metavar='LR',
+                            help='initial learning rate during warmup phase; default is args.lr')
+        parser.add_argument('--max-lr', type=float, metavar='LR',
+                            help='max learning rate, must be more than args.lr')
+        parser.add_argument('--t-mult', default=1, type=float, metavar='LR',
+                            help='factor to grow the length of each period')
+        parser.add_argument('--lr-period-updates', default=-1, type=float, metavar='LR',
+                            help='initial number of updates per period')
+        parser.add_argument('--lr-shrink', default=0.1, type=float, metavar='LS',
+                            help='shrink factor for annealing')
+        # fmt: on
+
+    def step(self, epoch, val_loss=None):
+        """Update the learning rate at the end of the given epoch."""
+        super().step(epoch, val_loss)
+        # we don't change the learning rate at epoch boundaries
+        return self.optimizer.get_lr()
+
+    def step_update(self, num_updates):
+        """Update the learning rate after each update."""
+        if num_updates < self.args.warmup_updates:
+            self.lr = self.args.warmup_init_lr + num_updates * self.lr_step
+        else:
+            curr_updates = num_updates - self.args.warmup_updates
+            if self.t_mult != 1:
+                i = math.floor(math.log(1 - curr_updates / self.period * (1 - self.t_mult), self.t_mult))
+                t_i = self.t_mult ** i * self.period
+                t_curr = curr_updates - (1 - self.t_mult ** i) / (1 - self.t_mult) * self.period
+            else:
+                i = math.floor(curr_updates / self.period)  # index of the current period
+                t_i = self.period
+                t_curr = curr_updates - (self.period * i)  # updates into that period
+
+            lr_shrink = self.lr_shrink ** i
+            min_lr = self.min_lr * lr_shrink
+            max_lr = self.max_lr * lr_shrink
+
+            self.lr = min_lr + 0.5 * (max_lr - min_lr) * (1 + math.cos(math.pi * t_curr / t_i))
+
+        self.optimizer.set_lr(self.lr)
+        return self.lr
diff --git a/fairseq/optim/lr_scheduler/fairseq_lr_scheduler.py b/fairseq/optim/lr_scheduler/fairseq_lr_scheduler.py
new file mode 100644
index 0000000000000000000000000000000000000000..8b7884829a6311deea1f1160b452791f1485d4d3
--- /dev/null
+++ b/fairseq/optim/lr_scheduler/fairseq_lr_scheduler.py
@@ -0,0 +1,42 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .. import FairseqOptimizer
+
+
+class FairseqLRScheduler(object):
+    """
+    Base class for LR schedulers; tracks the lowest validation loss seen.
+    """
+
+    def __init__(self, args, optimizer):
+        super().__init__()
+        if not isinstance(optimizer, FairseqOptimizer):
+            raise ValueError('optimizer must be an instance of FairseqOptimizer')
+        self.args, self.optimizer, self.best = args, optimizer, None
+
+    @staticmethod
+    def add_args(parser):
+        """Add arguments to the parser for this LR scheduler."""
+
+    def state_dict(self):
+        """Return the LR scheduler state dict."""
+        return dict(best=self.best)
+
+    def load_state_dict(self, state_dict):
+        """Load an LR scheduler state dict."""
+        self.best = state_dict['best']
+
+    def step(self, epoch, val_loss=None):
+        """Update the learning rate at the end of the given epoch."""
+        if val_loss is None:
+            return
+        # remember the lowest validation loss reported so far
+        if self.best is None or val_loss < self.best:
+            self.best = val_loss
+
+    def step_update(self, num_updates):
+        """Update the learning rate after each update."""
+        return self.optimizer.get_lr()
diff --git a/fairseq/optim/lr_scheduler/fixed_schedule.py b/fairseq/optim/lr_scheduler/fixed_schedule.py
new file mode 100644
index 0000000000000000000000000000000000000000..cc10db16388e1befc893c1cd7c496c5e7da4892e
--- /dev/null
+++ b/fairseq/optim/lr_scheduler/fixed_schedule.py
@@ -0,0 +1,61 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from . import FairseqLRScheduler, register_lr_scheduler
+
+
+@register_lr_scheduler('fixed')
+class FixedSchedule(FairseqLRScheduler):
+    """Decay the LR on a fixed schedule."""
+
+    def __init__(self, args, optimizer):
+        super().__init__(args, optimizer)
+
+        # a missing or None warmup_updates is normalised to 0 (no warmup)
+        args.warmup_updates = getattr(args, 'warmup_updates', 0) or 0
+
+        self.lr = args.lr[0]
+        # fraction of self.lr applied while warming up; 1 when there is no warmup
+        self.warmup_factor = 1. / args.warmup_updates if args.warmup_updates > 0 else 1
+
+    @staticmethod
+    def add_args(parser):
+        """Add arguments to the parser for this LR scheduler."""
+        # fmt: off
+        parser.add_argument('--force-anneal', '--fa', type=int, metavar='N',
+                            help='force annealing at specified epoch')
+        parser.add_argument('--lr-shrink', default=0.1, type=float, metavar='LS',
+                            help='shrink factor for annealing, lr_new = (lr * lr_shrink)')
+        parser.add_argument('--warmup-updates', default=0, type=int, metavar='N',
+                            help='warmup the learning rate linearly for the first N updates')
+        # fmt: on
+
+    def get_next_lr(self, epoch):
+        """Return the LR for *epoch*: the ``--lr`` list, then ``--lr-shrink`` annealing."""
+        lrs = self.args.lr
+        anneal_at = self.args.force_anneal
+        if anneal_at is None or epoch < anneal_at:
+            # use fixed LR schedule
+            return lrs[min(epoch, len(lrs) - 1)]
+        # anneal based on lr_shrink
+        return lrs[-1] * self.args.lr_shrink ** (epoch + 1 - anneal_at)
+
+    def step(self, epoch, val_loss=None):
+        """Update the learning rate at the end of the given epoch."""
+        super().step(epoch, val_loss)
+        self.lr = self.get_next_lr(epoch)
+        self.optimizer.set_lr(self.warmup_factor * self.lr)
+        return self.optimizer.get_lr()
+
+    def step_update(self, num_updates):
+        """Update the learning rate after each update."""
+        warmup = self.args.warmup_updates
+        if warmup > 0 and num_updates < warmup:
+            self.warmup_factor = (num_updates + 1) / float(warmup)
+            new_lr = self.warmup_factor * self.lr
+        else:
+            new_lr = self.lr
+        self.optimizer.set_lr(new_lr)
+        return self.optimizer.get_lr()
diff --git a/fairseq/optim/lr_scheduler/inverse_square_root_schedule.py b/fairseq/optim/lr_scheduler/inverse_square_root_schedule.py
new file mode 100644
index 0000000000000000000000000000000000000000..f98a7c3b997e2aa2f77911da25686e9bdccbad5b
--- /dev/null
+++ b/fairseq/optim/lr_scheduler/inverse_square_root_schedule.py
@@ -0,0 +1,73 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from . import FairseqLRScheduler, register_lr_scheduler
+
+
+@register_lr_scheduler('inverse_sqrt')
+class InverseSquareRootSchedule(FairseqLRScheduler):
+    """Decay the LR based on the inverse square root of the update number.
+
+    We also support a warmup phase where we linearly increase the learning rate
+    from some initial learning rate (``--warmup-init-lr``) until the configured
+    learning rate (``--lr``). Thereafter we decay proportional to the number of
+    updates, with a decay factor set to align with the configured learning rate.
+
+    During warmup::
+
+      lrs = torch.linspace(args.warmup_init_lr, args.lr, args.warmup_updates)
+      lr = lrs[update_num]
+
+    After warmup::
+
+      decay_factor = args.lr * sqrt(args.warmup_updates)
+      lr = decay_factor / sqrt(update_num)
+    """
+
+    def __init__(self, args, optimizer):
+        super().__init__(args, optimizer)
+        if len(args.lr) > 1:
+            raise ValueError(
+                'Cannot use a fixed learning rate schedule with inverse_sqrt.'
+                ' Consider --lr-scheduler=fixed instead.'
+            )
+        warmup_end_lr = args.lr[0]
+        if args.warmup_init_lr < 0:
+            args.warmup_init_lr = 0 if args.warmup_updates > 0 else warmup_end_lr
+
+        # linear warmup step; max() keeps --warmup-updates 0 (no warmup) from dividing by zero
+        self.lr_step = (warmup_end_lr - args.warmup_init_lr) / max(args.warmup_updates, 1)
+
+        # then, decay prop. to the inverse square root of the update number
+        self.decay_factor = warmup_end_lr * max(args.warmup_updates, 1)**0.5
+
+        # initial learning rate
+        self.lr = args.warmup_init_lr
+        self.optimizer.set_lr(self.lr)
+
+    @staticmethod
+    def add_args(parser):
+        """Add arguments to the parser for this LR scheduler."""
+        # fmt: off
+        parser.add_argument('--warmup-updates', default=4000, type=int, metavar='N',
+                            help='warmup the learning rate linearly for the first N updates')
+        parser.add_argument('--warmup-init-lr', default=-1, type=float, metavar='LR',
+                            help='initial learning rate during warmup phase; default is args.lr')
+        # fmt: on
+
+    def step(self, epoch, val_loss=None):
+        """Update the learning rate at the end of the given epoch."""
+        super().step(epoch, val_loss)
+        # we don't change the learning rate at epoch boundaries
+        return self.optimizer.get_lr()
+
+    def step_update(self, num_updates):
+        """Update the learning rate after each update."""
+        if num_updates < self.args.warmup_updates:
+            self.lr = self.args.warmup_init_lr + num_updates*self.lr_step
+        else:
+            self.lr = self.decay_factor * max(num_updates, 1)**-0.5  # max(): no 0**-0.5 at update 0
+        self.optimizer.set_lr(self.lr)
+        return self.lr
diff --git a/fairseq/optim/lr_scheduler/polynomial_decay_schedule.py b/fairseq/optim/lr_scheduler/polynomial_decay_schedule.py
new file mode 100644
index 0000000000000000000000000000000000000000..aff57f9b93d0221c3685467be50cde258e965327
--- /dev/null
+++ b/fairseq/optim/lr_scheduler/polynomial_decay_schedule.py
@@ -0,0 +1,70 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from . import FairseqLRScheduler, register_lr_scheduler
+
+
+@register_lr_scheduler('polynomial_decay')
+class PolynomialDecaySchedule(FairseqLRScheduler):
+    """Polynomial LR decay with an optional linear warmup.
+
+    For the first ``--warmup-updates`` updates the rate is scaled linearly up
+    to the configured ``--lr``. After that it is interpolated from that rate
+    down to ``--end-learning-rate`` along a polynomial of degree ``--power``,
+    and it stays at ``--end-learning-rate`` from update
+    ``--total-num-update`` onwards.
+    """
+
+    def __init__(self, args, optimizer):
+        """Read the schedule settings from *args* and set the starting LR."""
+        super().__init__(args, optimizer)
+
+        # a missing or None --warmup-updates is normalised to 0
+        args.warmup_updates = getattr(args, 'warmup_updates', 0) or 0
+
+        self.lr = args.lr[0]
+        has_warmup = args.warmup_updates > 0
+        self.warmup_factor = 1. / args.warmup_updates if has_warmup else 1
+        self.end_learning_rate = args.end_learning_rate
+        self.total_num_update = args.total_num_update
+        self.power = args.power
+        self.optimizer.set_lr(self.warmup_factor * self.lr)
+
+    @staticmethod
+    def add_args(parser):
+        """Register this scheduler's command-line options on *parser*."""
+        parser.add_argument(
+            '--force-anneal', '--fa', type=int, metavar='N',
+            help='force annealing at specified epoch',
+        )
+        parser.add_argument(
+            '--warmup-updates', default=0, type=int, metavar='N',
+            help='warmup the learning rate linearly for the first N updates',
+        )
+        parser.add_argument('--end-learning-rate', default=0.0, type=float)
+        parser.add_argument('--power', default=1.0, type=float)
+        parser.add_argument('--total-num-update', default=1000000, type=int)
+
+    def get_next_lr(self, epoch):
+        """Return the base LR to use for *epoch*.
+
+        While ``--force-anneal`` is unset or not yet reached this is the entry
+        of ``--lr`` at index *epoch* (clamped to the last entry); afterwards it
+        is whatever the optimizer currently reports.
+        """
+        schedule = self.args.lr
+        anneal_from = self.args.force_anneal
+        if anneal_from is not None and not (epoch < anneal_from):
+            # past the forced-anneal epoch: keep the optimizer's current LR
+            return self.optimizer.get_lr()
+        # still on the fixed per-epoch schedule
+        return schedule[min(epoch, len(schedule) - 1)]
+
+    def step(self, epoch, val_loss=None):
+        """Pick the base LR for the finished epoch and apply the warmup factor."""
+        super().step(epoch, val_loss)
+        self.lr = self.get_next_lr(epoch)
+        scaled_lr = self.warmup_factor * self.lr
+        self.optimizer.set_lr(scaled_lr)
+        return self.optimizer.get_lr()
+
+    def step_update(self, num_updates):
+        """Set the LR for update *num_updates* and return the optimizer's LR."""
+        warmup = self.args.warmup_updates
+        if warmup > 0 and num_updates <= warmup:
+            # linear warmup; the factor is stored because step() reuses it
+            self.warmup_factor = num_updates / float(warmup)
+            new_lr = self.warmup_factor * self.lr
+        elif num_updates >= self.total_num_update:
+            # schedule exhausted: hold the final rate
+            new_lr = self.end_learning_rate
+        else:
+            # polynomial interpolation between the base and the final rate
+            span = self.lr - self.end_learning_rate
+            remaining = 1 - (num_updates - warmup) / (self.total_num_update - warmup)
+            new_lr = span * remaining ** (self.power) + self.end_learning_rate
+        self.optimizer.set_lr(new_lr)
+        return self.optimizer.get_lr()
diff --git a/fairseq/optim/lr_scheduler/reduce_lr_on_plateau.py b/fairseq/optim/lr_scheduler/reduce_lr_on_plateau.py
new file mode 100644
index 0000000000000000000000000000000000000000..8128cf0eb81e24e7ac2c838ddc7bedd9feb5df65
--- /dev/null
+++ b/fairseq/optim/lr_scheduler/reduce_lr_on_plateau.py
@@ -0,0 +1,110 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch.optim.lr_scheduler
+
+from . import FairseqLRScheduler, register_lr_scheduler
+
+
+@register_lr_scheduler('reduce_lr_on_plateau')
+class ReduceLROnPlateau(FairseqLRScheduler):
+    """
+    Shrink the LR by a constant factor whenever the validation loss stops
+    improving, delegating the plateau detection to
+    ``torch.optim.lr_scheduler.ReduceLROnPlateau``.
+
+    An optional warmup phase comes first: the learning rate is increased
+    linearly from ``--warmup-init-lr`` to the configured ``--lr`` over
+    ``--warmup-updates`` updates, and only afterwards is it governed by the
+    plateau rule.
+
+    During warmup::
+
+      lrs = torch.linspace(
+          args.warmup_init_lr, args.lr, args.warmup_updates
+      )
+      lr = lrs[update_num]
+    """
+
+    def __init__(self, args, optimizer):
+        """Build the wrapped torch scheduler and set the initial LR.
+
+        Raises:
+            ValueError: if more than one value was given for ``--lr``.
+        """
+        super().__init__(args, optimizer)
+        if len(args.lr) > 1:
+            raise ValueError(
+                'Cannot use a fixed learning rate schedule with reduce_lr_on_plateau.'
+                ' Consider --lr-scheduler=fixed instead.'
+            )
+        self.lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
+            self.optimizer.optimizer,
+            patience=args.lr_patience,
+            factor=args.lr_shrink,
+            mode='max' if args.maximize_best_checkpoint_metric else 'min',
+            threshold=args.lr_threshold,
+        )
+
+        peak_lr = args.lr[0]
+        # a negative --warmup-init-lr means "not given": start from 0 when
+        # warming up, otherwise start directly at the configured LR
+        if args.warmup_init_lr < 0:
+            args.warmup_init_lr = 0 if args.warmup_updates > 0 else peak_lr
+
+        has_warmup = args.warmup_updates > 0
+        if has_warmup:
+            # per-update increment of the linear warmup
+            self.lr_step = (peak_lr - args.warmup_init_lr) / args.warmup_updates
+        # True from the start when there is no warmup; otherwise flipped by
+        # step_update() once the warmup updates are over
+        self.warmup_end = not has_warmup
+
+        # self.lr is only meaningful during init and the warmup period
+        self.lr = args.warmup_init_lr
+        self.optimizer.set_lr(self.lr)
+
+    @staticmethod
+    def add_args(parser):
+        """Register this scheduler's command-line options on *parser*."""
+        # fmt: off
+        parser.add_argument(
+            '--lr-shrink', default=0.1, type=float, metavar='LS',
+            help='shrink factor for annealing, lr_new = (lr * lr_shrink)')
+        parser.add_argument(
+            '--lr-threshold', default=1e-4, type=float, metavar='LT',
+            help='threshold for measuring the new optimum, '
+                 'to only focus on significant changes')
+        parser.add_argument(
+            '--lr-patience', default=0, type=int,
+            help='number of epochs with no improvement after which '
+                 'learning rate will be reduced')
+        parser.add_argument(
+            '--warmup-updates', default=0, type=int, metavar='N',
+            help='warmup the learning rate linearly for the first N updates')
+        parser.add_argument(
+            '--warmup-init-lr', default=-1, type=float, metavar='LR',
+            help='initial learning rate during warmup phase; default is args.lr')
+        # fmt: on
+
+    def state_dict(self):
+        """Return the wrapped scheduler's ``best`` and ``last_epoch`` values."""
+        wrapped = self.lr_scheduler
+        return {
+            'best': wrapped.best,
+            'last_epoch': wrapped.last_epoch,
+        }
+
+    def load_state_dict(self, state_dict):
+        """Restore ``best`` (and ``last_epoch`` when present) from *state_dict*."""
+        wrapped = self.lr_scheduler
+        wrapped.best = state_dict['best']
+        if 'last_epoch' in state_dict:
+            wrapped.last_epoch = state_dict['last_epoch']
+
+    def step(self, epoch, val_loss=None):
+        """
+        Feed *val_loss* to the wrapped plateau scheduler once warmup is over;
+        during warmup, or when no loss is given, only record *epoch*.
+        Returns the optimizer's current LR.
+        """
+        if val_loss is None or not self.warmup_end:
+            self.lr_scheduler.last_epoch = epoch
+        else:
+            self.lr_scheduler.step(val_loss)
+        return self.optimizer.get_lr()
+
+    def step_update(self, num_updates):
+        """
+        Apply the linear warmup for update *num_updates* (if configured) and
+        return the optimizer's current LR."""
+        warmup = self.args.warmup_updates
+        if warmup > 0:
+            if num_updates <= warmup:
+                self.lr = self.args.warmup_init_lr + num_updates*self.lr_step
+                self.optimizer.set_lr(self.lr)
+            elif not self.warmup_end:
+                # first update after warmup: hand control to step()
+                self.warmup_end = True
+        # without warmup there is nothing to do per update
+        return self.optimizer.get_lr()
diff --git a/fairseq/optim/lr_scheduler/tri_stage_lr_scheduler.py b/fairseq/optim/lr_scheduler/tri_stage_lr_scheduler.py
new file mode 100644
index 0000000000000000000000000000000000000000..3460fa1226ed256750409a501a8ebb3a6c0806c2
--- /dev/null
+++ b/fairseq/optim/lr_scheduler/tri_stage_lr_scheduler.py
@@ -0,0 +1,163 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from . import FairseqLRScheduler, register_lr_scheduler
+import math
+
+
+@register_lr_scheduler('tri_stage')
+class TriStageLRSchedule(FairseqLRScheduler):
+    """Tri-stage learning rate scheduler.
+
+    Implement the learning rate scheduler in https://arxiv.org/pdf/1904.08779.pdf
+
+    Similar to the inverse_square_root scheduler, but tri_stage learning rate
+    employs three stages of LR scheduling:
+
+        - warmup stage, starting from `lr` * `init_lr_scale`, linearly
+          increased to `lr` in `warmup_steps` iterations
+
+        - hold stage, after `warmup_steps`, keep the LR as `lr` for `hold_steps`
+          iterations
+
+        - decay stage, after hold stage, decay LR exponentially to
+          `lr` * `final_lr_scale` in `decay_steps`;
+          after that LR is kept as `final_lr_scale` * `lr`
+
+    During warmup::
+
+      init_lr = args.init_lr_scale * args.lr
+      lrs = torch.linspace(init_lr, args.lr, args.warmup_steps)
+      lr = lrs[update_num]
+
+    During hold::
+
+      lr = args.lr
+
+    During decay::
+
+      decay_factor = - math.log(args.final_lr_scale) / args.decay_steps
+      lr = args.lr * exp(- (update_num - warmup_steps - hold_steps) * decay_factor)
+
+    After that::
+
+      lr = args.lr * args.final_lr_scale
+    """
+
+    def __init__(self, args, optimizer):
+        """Precompute the stage boundaries and rates and set the initial LR.
+
+        Raises:
+            ValueError: if more than one value was given for ``--lr``.
+        """
+        super().__init__(args, optimizer)
+        if len(args.lr) > 1:
+            raise ValueError(
+                'Cannot use a fixed learning rate schedule with tri-stage lr.'
+                ' Consider --lr-scheduler=fixed instead.'
+            )
+
+        # calculate LR at each point
+        self.peak_lr = args.lr[0]
+        self.init_lr = args.init_lr_scale * args.lr[0]
+        self.final_lr = args.final_lr_scale * args.lr[0]
+
+        # remember the steps at each stage
+        self.warmup_steps = args.warmup_steps
+        self.hold_steps = args.hold_steps
+        self.decay_steps = args.decay_steps
+
+        # linear slope of the warmup; a zero-length warmup has no slope
+        self.warmup_rate = (
+            (self.peak_lr - self.init_lr) / self.warmup_steps if self.warmup_steps != 0
+            else 0
+        )
+        # exponential rate of the decay stage; a zero-length decay stage is
+        # guarded the same way as the warmup above instead of dividing by zero
+        self.decay_factor = (
+            -math.log(args.final_lr_scale) / self.decay_steps if self.decay_steps != 0
+            else 0
+        )
+
+        # initial learning rate
+        self.lr = self.init_lr
+        self.optimizer.set_lr(self.lr)
+
+    @staticmethod
+    def add_args(parser):
+        """Add arguments to the parser for this LR scheduler."""
+        # fmt: off
+        parser.add_argument(
+            '--warmup-steps',
+            default=4000,
+            type=int,
+            metavar='N',
+            help='warmup the learning rate linearly for the first N updates'
+        )
+        parser.add_argument(
+            '--hold-steps',
+            default=20000,
+            type=int,
+            metavar='N',
+            help='steps in hold stage.'
+        )
+        parser.add_argument(
+            '--decay-steps',
+            default=60000,
+            type=int,
+            metavar='N',
+            help='steps in decay stages'
+        )
+        parser.add_argument(
+            '--init-lr-scale',
+            default=0.01,
+            type=float,
+            help="""
+    initial learning rate scale during warmup phase; default is 0.01""")
+        parser.add_argument(
+            '--final-lr-scale',
+            default=0.01,
+            type=float,
+            help="final learning rate scale; default to 0.01"
+        )
+        # fmt: on
+
+    def _decide_stage(self, update_step):
+        """
+        Return ``(stage, steps_in_stage)`` for *update_step*, where stage is
+        0 (warmup), 1 (hold), 2 (decay) or 3 (constant final LR) and
+        steps_in_stage counts updates since that stage began.
+        """
+        if update_step < self.warmup_steps:
+            # warmup stage
+            return 0, update_step
+
+        offset = self.warmup_steps
+
+        if update_step < offset + self.hold_steps:
+            # hold stage
+            return 1, update_step - offset
+
+        offset += self.hold_steps
+
+        # inclusive bound: the last decay step evaluates to exactly
+        # peak_lr * final_lr_scale, i.e. the constant-stage value
+        if update_step <= offset + self.decay_steps:
+            # decay stage
+            return 2, update_step - offset
+
+        offset += self.decay_steps
+
+        # still here ? constant lr stage
+        return 3, update_step - offset
+
+    def step(self, epoch, val_loss=None):
+        """Update the learning rate at the end of the given epoch."""
+        super().step(epoch, val_loss)
+        # we don't change the learning rate at epoch boundaries
+        return self.optimizer.get_lr()
+
+    def step_update(self, num_updates):
+        """Set and return the learning rate for update *num_updates*.
+
+        Raises:
+            ValueError: if ``_decide_stage`` reports an unknown stage.
+        """
+        stage, steps_in_stage = self._decide_stage(num_updates)
+        if stage == 0:
+            self.lr = self.init_lr + self.warmup_rate * steps_in_stage
+        elif stage == 1:
+            self.lr = self.peak_lr
+        elif stage == 2:
+            self.lr = self.peak_lr * math.exp(-self.decay_factor * steps_in_stage)
+        elif stage == 3:
+            self.lr = self.final_lr
+        else:
+            raise ValueError("Undefined stage")
+
+        self.optimizer.set_lr(self.lr)
+
+        return self.lr
diff --git a/fairseq/optim/lr_scheduler/triangular_lr_scheduler.py b/fairseq/optim/lr_scheduler/triangular_lr_scheduler.py
new file mode 100644
index 0000000000000000000000000000000000000000..fed0cf7ef119852c84825eaae89b35226991eedc
--- /dev/null
+++ b/fairseq/optim/lr_scheduler/triangular_lr_scheduler.py
@@ -0,0 +1,74 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+
+from . import FairseqLRScheduler, register_lr_scheduler
+
+
+@register_lr_scheduler('triangular')
+class TriangularSchedule(FairseqLRScheduler):
+    """Cyclical LR that ramps linearly between ``--lr`` and ``--max-lr``.
+
+    One cycle spans ``2 * stepsize`` updates, where ``stepsize`` is half of
+    ``--lr-period-updates``. For each completed cycle the peak is multiplied
+    by ``--lr-shrink``; the floor is shrunk too when ``--shrink-min`` is set.
+
+    See https://arxiv.org/pdf/1506.01186.pdf for details.
+    """
+
+    def __init__(self, args, optimizer):
+        """Store the cycle parameters and start at the minimum LR.
+
+        Raises:
+            ValueError: if more than one value was given for ``--lr``.
+        """
+        super().__init__(args, optimizer)
+        if len(args.lr) > 1:
+            raise ValueError(
+                'Cannot use a fixed learning rate schedule with triangular.'
+                ' Consider --lr-scheduler=fixed instead.'
+            )
+
+        base_lr = args.lr[0]
+        assert args.max_lr > base_lr, 'max_lr must be more than lr'
+
+        self.min_lr = base_lr
+        self.max_lr = args.max_lr
+        # half a cycle: the number of updates spent rising (or falling)
+        self.stepsize = args.lr_period_updates // 2
+        self.lr_shrink = args.lr_shrink
+        self.shrink_min = args.shrink_min
+
+        # the schedule starts at the bottom of the triangle
+        self.lr = self.min_lr
+        self.optimizer.set_lr(self.lr)
+
+    @staticmethod
+    def add_args(parser):
+        """Register this scheduler's command-line options on *parser*."""
+        # fmt: off
+        parser.add_argument(
+            '--max-lr', required=True, type=float, metavar='LR',
+            help='max learning rate, must be more than args.lr')
+        parser.add_argument(
+            '--lr-period-updates', default=5000, type=float, metavar='LR',
+            help='initial number of updates per period (cycle length)')
+        parser.add_argument(
+            '--lr-shrink', default=0.1, type=float, metavar='LS',
+            help='shrink factor for annealing')
+        parser.add_argument(
+            '--shrink-min', action='store_true',
+            help='if set, also shrinks min lr')
+        # fmt: on
+
+    def step(self, epoch, val_loss=None):
+        """Run the base-class epoch step; the LR itself is left untouched."""
+        super().step(epoch, val_loss)
+        current_lr = self.optimizer.get_lr()
+        return current_lr
+
+    def step_update(self, num_updates):
+        """Set and return the triangular LR for update *num_updates*."""
+        # index of the cycle this update falls into
+        cycle = math.floor(num_updates / (2 * self.stepsize))
+
+        # bounds of the current cycle after shrinking
+        shrink = self.lr_shrink ** cycle
+        peak = self.max_lr * shrink
+        floor = self.min_lr * shrink if self.shrink_min else self.min_lr
+
+        # distance from the cycle's midpoint, in units of stepsize
+        distance = abs(num_updates / self.stepsize - 2 * (cycle + 1) + 1)
+        self.lr = floor + (peak - floor) * max(0, (1 - distance))
+
+        self.optimizer.set_lr(self.lr)
+        return self.lr
diff --git a/fairseq/optim/nag.py b/fairseq/optim/nag.py
new file mode 100644
index 0000000000000000000000000000000000000000..d9b7fb8019575a5cfa3522c8ebe2f9982a55bdd9
--- /dev/null
+++ b/fairseq/optim/nag.py
@@ -0,0 +1,103 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from torch.optim.optimizer import Optimizer, required
+
+from . import FairseqOptimizer, register_optimizer
+
+
+@register_optimizer('nag')
+class FairseqNAG(FairseqOptimizer):
+    """fairseq wrapper, registered as ``nag``, around the :class:`NAG` optimizer."""
+
+    def __init__(self, args, params):
+        """Create the wrapped :class:`NAG` instance from ``optimizer_config``."""
+        super().__init__(args)
+        config = self.optimizer_config
+        self._optimizer = NAG(params, **config)
+
+    @staticmethod
+    def add_args(parser):
+        """Register the optimizer's command-line options on *parser*."""
+        # fmt: off
+        parser.add_argument(
+            '--momentum', default=0.99, type=float, metavar='M',
+            help='momentum factor')
+        parser.add_argument(
+            '--weight-decay', '--wd', default=0.0, type=float, metavar='WD',
+            help='weight decay')
+        # fmt: on
+
+    @property
+    def optimizer_config(self):
+        """
+        Keyword arguments for the wrapped optimizer, taken from ``self.args``.
+
+        These override the optimizer args stored in a checkpoint, so training
+        can be resumed with different settings (e.g. another learning rate).
+        """
+        settings = self.args
+        return dict(
+            lr=settings.lr[0],
+            momentum=settings.momentum,
+            weight_decay=settings.weight_decay,
+        )
+
+
+class NAG(Optimizer):
+    """Momentum optimizer whose buffer stores learning-rate-scaled steps.
+
+    Each parameter group carries ``lr``, ``momentum``, ``weight_decay`` and
+    ``lr_old`` (the rate used by the previous step). ``lr_old`` is needed to
+    rescale the momentum buffer when the learning rate changes between steps.
+    """
+
+    def __init__(self, params, lr=required, momentum=0, weight_decay=0):
+        """Set up the parameter groups; ``lr_old`` starts out equal to ``lr``."""
+        defaults = dict(lr=lr, lr_old=lr, momentum=momentum, weight_decay=weight_decay)
+        super(NAG, self).__init__(params, defaults)
+
+    @property
+    def supports_memory_efficient_fp16(self):
+        return True
+
+    @property
+    def supports_flat_params(self):
+        return True
+
+    def step(self, closure=None):
+        """Performs a single optimization step.
+
+        Arguments:
+            closure (callable, optional): A closure that reevaluates the model
+                and returns the loss.
+
+        Returns:
+            the value returned by *closure*, or None when no closure is given.
+        """
+        loss = None
+        if closure is not None:
+            loss = closure()
+
+        for group in self.param_groups:
+            weight_decay = group['weight_decay']
+            momentum = group['momentum']
+            lr = group['lr']
+            lr_old = group.get('lr_old', lr)
+            # Ratio that rescales the buffer to the new learning rate. When
+            # the previous step ran with lr == 0 (e.g. a warmup starting at
+            # zero) the ratio is undefined; fall back to lr instead of
+            # dividing by zero.
+            lr_correct = lr / lr_old if lr_old > 0 else lr
+
+            for p in group['params']:
+                if p.grad is None:
+                    continue
+
+                # do the arithmetic in fp32 even for half-precision params
+                p_data_fp32 = p.data
+                if p_data_fp32.dtype in {torch.float16, torch.bfloat16}:
+                    p_data_fp32 = p_data_fp32.float()
+
+                d_p = p.grad.data.float()
+                param_state = self.state[p]
+                if 'momentum_buffer' not in param_state:
+                    param_state['momentum_buffer'] = torch.zeros_like(d_p)
+                else:
+                    # match the gradient's dtype/device
+                    param_state['momentum_buffer'] = param_state['momentum_buffer'].to(d_p)
+
+                buf = param_state['momentum_buffer']
+
+                if weight_decay != 0:
+                    p_data_fp32.mul_(1 - lr * weight_decay)
+                p_data_fp32.add_(buf, alpha=momentum * momentum * lr_correct)
+                p_data_fp32.add_(d_p, alpha=-(1 + momentum) * lr)
+
+                buf.mul_(momentum * lr_correct).add_(d_p, alpha=-lr)
+
+                # the fp32 tensor is a copy for half-precision params, so
+                # write the result back
+                if p.data.dtype in {torch.float16, torch.bfloat16}:
+                    p.data.copy_(p_data_fp32)
+
+            group['lr_old'] = lr
+
+        return loss
diff --git a/fairseq/optim/sgd.py b/fairseq/optim/sgd.py
new file mode 100644
index 0000000000000000000000000000000000000000..8c4e3e0a809f308941cbe28504197acb8a72d88d
--- /dev/null
+++ b/fairseq/optim/sgd.py
@@ -0,0 +1,43 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch.optim
+
+from . import FairseqOptimizer, register_optimizer
+
+
+@register_optimizer('sgd')
+class SGD(FairseqOptimizer):
+    """fairseq wrapper, registered as ``sgd``, around ``torch.optim.SGD``."""
+
+    def __init__(self, args, params):
+        """Create the wrapped ``torch.optim.SGD`` from ``optimizer_config``."""
+        super().__init__(args)
+        config = self.optimizer_config
+        self._optimizer = torch.optim.SGD(params, **config)
+
+    @staticmethod
+    def add_args(parser):
+        """Register the optimizer's command-line options on *parser*."""
+        # fmt: off
+        parser.add_argument(
+            '--momentum', default=0.0, type=float, metavar='M',
+            help='momentum factor')
+        parser.add_argument(
+            '--weight-decay', '--wd', default=0.0, type=float, metavar='WD',
+            help='weight decay')
+        # fmt: on
+
+    @property
+    def optimizer_config(self):
+        """
+        Keyword arguments for the wrapped optimizer, taken from ``self.args``.
+
+        These override the optimizer args stored in a checkpoint, so training
+        can be resumed with different settings (e.g. another learning rate).
+        """
+        settings = self.args
+        return dict(
+            lr=settings.lr[0],
+            momentum=settings.momentum,
+            weight_decay=settings.weight_decay,
+        )
+
+    @property
+    def supports_flat_params(self):
+        return True
diff --git a/fairseq/options.py b/fairseq/options.py
new file mode 100644
index 0000000000000000000000000000000000000000..e889821ee6c7d483f517b8806a9e46705e203185
--- /dev/null
+++ b/fairseq/options.py
@@ -0,0 +1,675 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import sys
+from typing import Callable, List, Optional
+
+import torch
+
+from fairseq import scoring, utils
+from fairseq.data.indexed_dataset import get_available_dataset_impl
+
+
+def get_preprocessing_parser(default_task="translation"):
+    """Return the "Preprocessing" parser with the preprocess args added.
+
+    Args:
+        default_task: task name forwarded to ``get_parser``.
+    """
+    preprocess_parser = get_parser("Preprocessing", default_task)
+    add_preprocess_args(preprocess_parser)
+    return preprocess_parser
+
+
+def get_training_parser(default_task="translation"):
+    """Return the "Trainer" parser.
+
+    The dataset (train mode), distributed-training, model, optimization and
+    checkpoint argument groups are added, in that order.
+
+    Args:
+        default_task: task name forwarded to ``get_parser``.
+    """
+    train_parser = get_parser("Trainer", default_task)
+    add_dataset_args(train_parser, train=True)
+    # the remaining groups all take just the parser
+    for add_group in (
+        add_distributed_training_args,
+        add_model_args,
+        add_optimization_args,
+        add_checkpoint_args,
+    ):
+        add_group(train_parser)
+    return train_parser
+
+
+def get_generation_parser(interactive=False, default_task="translation"):
+    """Return the "Generation" parser.
+
+    Args:
+        interactive: when true, the interactive args are added as well.
+        default_task: task name forwarded to ``get_parser``.
+    """
+    gen_parser = get_parser("Generation", default_task)
+    add_dataset_args(gen_parser, gen=True)
+    add_distributed_training_args(gen_parser, default_world_size=1)
+    add_generation_args(gen_parser)
+    if not interactive:
+        return gen_parser
+    add_interactive_args(gen_parser)
+    return gen_parser
+
+
+def get_interactive_generation_parser(default_task="translation"):
+    """Return the generation parser with the interactive args enabled."""
+    interactive_parser = get_generation_parser(
+        interactive=True,
+        default_task=default_task,
+    )
+    return interactive_parser
+
+
+def get_eval_lm_parser(default_task="language_modeling"):
+    """Return the "Evaluate Language Model" parser.
+
+    Args:
+        default_task: task name forwarded to ``get_parser``.
+    """
+    lm_parser = get_parser("Evaluate Language Model", default_task)
+    add_dataset_args(lm_parser, gen=True)
+    add_distributed_training_args(lm_parser, default_world_size=1)
+    add_eval_lm_args(lm_parser)
+    return lm_parser
+
+
+def get_validation_parser(default_task=None):
+    """Return the "Validation" parser.
+
+    The common evaluation args are placed in an "Evaluation" argument group.
+
+    Args:
+        default_task: task name forwarded to ``get_parser``.
+    """
+    valid_parser = get_parser("Validation", default_task)
+    add_dataset_args(valid_parser, train=True)
+    add_distributed_training_args(valid_parser, default_world_size=1)
+    eval_group = valid_parser.add_argument_group("Evaluation")
+    add_common_eval_args(eval_group)
+    return valid_parser
+
+
+def csv_str_list(x):
+    """Split the comma-separated string *x* into a list of its fields."""
+    fields = x.split(',')
+    return fields
+
+
+def eval_str_list(x, type=float):
+    """Turn *x* into a list whose items have been converted with *type*.
+
+    ``None`` is passed through. A string is first evaluated as a Python
+    expression. If converting item by item raises ``TypeError`` (for example
+    because the value is a scalar), a one-element list ``[type(x)]`` is
+    returned instead.
+    """
+    if x is None:
+        return None
+    # NOTE(review): eval() executes arbitrary expressions; only safe as long
+    # as the string comes from a trusted command line / config.
+    value = eval(x) if isinstance(x, str) else x
+    try:
+        converted = [type(item) for item in value]
+    except TypeError:
+        converted = [type(value)]
+    return converted
+
+
+def eval_str_dict(x, type=dict):
+    """Evaluate *x* when it is a string; return any other value unchanged.
+
+    ``None`` is passed through. The *type* parameter is accepted but not
+    used by the body.
+    """
+    if x is None:
+        return None
+    # NOTE(review): eval() executes arbitrary expressions; only safe as long
+    # as the string comes from a trusted command line / config.
+    return eval(x) if isinstance(x, str) else x
+
+
+def eval_bool(x, default=False):
+    """Evaluate the string *x* and return its truth value.
+
+    *default* is returned when *x* is ``None`` or when the evaluation raises
+    ``TypeError`` (e.g. *x* is not something ``eval`` accepts).
+    """
+    if x is None:
+        return default
+    # NOTE(review): eval() executes arbitrary expressions; only safe as long
+    # as the string comes from a trusted command line / config.
+    try:
+        verdict = bool(eval(x))
+    except TypeError:
+        verdict = default
+    return verdict
+
+
+def parse_args_and_arch(
+    parser: argparse.ArgumentParser,
+    input_args: Optional[List[str]] = None,
+    parse_known: bool = False,
+    suppress_defaults: bool = False,
+    modify_parser: Optional[Callable[[argparse.ArgumentParser], None]] = None,
+):
+    """
+    Parse the command line in two passes: first to find out which
+    architecture/task/registry components were selected, then again after
+    those components have added their own arguments to *parser*.
+
+    Args:
+        parser (ArgumentParser): the parser
+        input_args (List[str]): strings to parse, defaults to sys.argv
+        parse_known (bool): only parse known arguments, similar to
+            `ArgumentParser.parse_known_args`
+        suppress_defaults (bool): parse while ignoring all default values
+        modify_parser (Optional[Callable[[ArgumentParser], None]]):
+            function to modify the parser, e.g., to set default values
+
+    Returns:
+        the parsed ``argparse.Namespace``; when *parse_known* is true, a
+        ``(namespace, extra)`` tuple where *extra* holds the unrecognised
+        argument strings.
+    """
+    if suppress_defaults:
+        # Parse args without any default values. This requires us to parse
+        # twice, once to identify all the necessary task/model args, and a second
+        # time with all defaults set to None.
+        parsed = parse_args_and_arch(
+            parser,
+            input_args=input_args,
+            parse_known=parse_known,
+            suppress_defaults=False,
+        )
+        # with parse_known the recursive call returns (args, extra); only the
+        # namespace is needed to learn the full set of argument names
+        args = parsed[0] if parse_known else parsed
+        suppressed_parser = argparse.ArgumentParser(add_help=False, parents=[parser])
+        suppressed_parser.set_defaults(**{k: None for k in vars(args)})
+        if parse_known:
+            # unknown arguments must be tolerated here as well
+            args, extra = suppressed_parser.parse_known_args(input_args)
+        else:
+            args = suppressed_parser.parse_args(input_args)
+            extra = None
+        # keep only the values that were explicitly given
+        explicit = argparse.Namespace(
+            **{k: v for k, v in vars(args).items() if v is not None}
+        )
+        if parse_known:
+            return explicit, extra
+        return explicit
+
+    from fairseq.models import ARCH_MODEL_REGISTRY, ARCH_CONFIG_REGISTRY
+
+    # Before creating the true parser, we need to import optional user module
+    # in order to eagerly import custom tasks, optimizers, architectures, etc.
+    usr_parser = argparse.ArgumentParser(add_help=False, allow_abbrev=False)
+    usr_parser.add_argument("--user-dir", default=None)
+    usr_args, _ = usr_parser.parse_known_args(input_args)
+    utils.import_user_module(usr_args)
+
+    if modify_parser is not None:
+        modify_parser(parser)
+
+    # The parser doesn't know about model/criterion/optimizer-specific args, so
+    # we parse twice. First we parse the model/criterion/optimizer, then we
+    # parse a second time after adding the *-specific arguments.
+    # If input_args is given, we will parse those args instead of sys.argv.
+    args, _ = parser.parse_known_args(input_args)
+
+    # Add model-specific args to parser.
+    if hasattr(args, "arch"):
+        model_specific_group = parser.add_argument_group(
+            "Model-specific configuration",
+            # Only include attributes which are explicitly given as command-line
+            # arguments or which have default values.
+            argument_default=argparse.SUPPRESS,
+        )
+        ARCH_MODEL_REGISTRY[args.arch].add_args(model_specific_group)
+
+    # Add *-specific args to parser.
+    from fairseq.registry import REGISTRIES
+
+    for registry_name, REGISTRY in REGISTRIES.items():
+        choice = getattr(args, registry_name, None)
+        if choice is not None:
+            cls = REGISTRY["registry"][choice]
+            if hasattr(cls, "add_args"):
+                cls.add_args(parser)
+    if hasattr(args, "task"):
+        from fairseq.tasks import TASK_REGISTRY
+
+        TASK_REGISTRY[args.task].add_args(parser)
+    if getattr(args, "use_bmuf", False):
+        # hack to support extra args for block distributed data parallelism
+        from fairseq.optim.bmuf import FairseqBMUF
+
+        FairseqBMUF.add_args(parser)
+
+    # Modify the parser a second time, since defaults may have been reset
+    if modify_parser is not None:
+        modify_parser(parser)
+
+    # Parse a second time.
+    if parse_known:
+        args, extra = parser.parse_known_args(input_args)
+    else:
+        args = parser.parse_args(input_args)
+        extra = None
+
+    # Post-process args.
+    # validation batch limits fall back to the training limits
+    if hasattr(args, "max_sentences_valid") and args.max_sentences_valid is None:
+        args.max_sentences_valid = args.max_sentences
+    if hasattr(args, "max_tokens_valid") and args.max_tokens_valid is None:
+        args.max_tokens_valid = args.max_tokens
+    # the memory-efficient variants imply the matching precision flag
+    if getattr(args, "memory_efficient_fp16", False):
+        args.fp16 = True
+    if getattr(args, "memory_efficient_bf16", False):
+        args.bf16 = True
+    args.tpu = getattr(args, "tpu", False)
+    args.bf16 = getattr(args, "bf16", False)
+    # --bf16 implies --tpu
+    if args.bf16:
+        args.tpu = True
+    if args.tpu and args.fp16:
+        raise ValueError("Cannot combine --fp16 and --tpu, use --bf16 on TPUs")
+
+    if getattr(args, "seed", None) is None:
+        args.seed = 1  # default seed for training
+        args.no_seed_provided = True
+    else:
+        args.no_seed_provided = False
+
+    # Apply architecture configuration.
+    if hasattr(args, "arch"):
+        ARCH_CONFIG_REGISTRY[args.arch](args)
+
+    if parse_known:
+        return args, extra
+    else:
+        return args
+
+
+def get_parser(desc, default_task="translation"):
+    """Create the base argument parser holding the options shared by all tools.
+
+    Args:
+        desc: not used by this function.
+        default_task: default value for the ``--task`` option.
+
+    Returns:
+        argparse.ArgumentParser: parser populated with the common options, one
+        option per entry of ``fairseq.registry.REGISTRIES`` and ``--task``.
+    """
+    # The optional user module may define custom tasks, optimizers,
+    # architectures, etc., so it is imported eagerly before the true parser
+    # is created.
+    bootstrap_parser = argparse.ArgumentParser(add_help=False, allow_abbrev=False)
+    bootstrap_parser.add_argument("--user-dir", default=None)
+    bootstrap_args, _ = bootstrap_parser.parse_known_args()
+    utils.import_user_module(bootstrap_args)
+
+    parser = argparse.ArgumentParser(allow_abbrev=False)
+    add = parser.add_argument
+    # fmt: off
+    add("--no-progress-bar", action="store_true", help="disable progress bar")
+    add("--log-interval", type=int, default=100, metavar="N",
+        help="log progress every N batches (when progress bar is disabled)")
+    add("--log-format", default=None, help="log format to use",
+        choices=["json", "none", "simple", "tqdm"])
+    add("--tensorboard-logdir", metavar="DIR", default="",
+        help="path to save logs for tensorboard, should match --logdir "
+             "of running tensorboard (default: no tensorboard logging)")
+    add("--seed", default=None, type=int, metavar="N",
+        help="pseudo random number generator seed")
+    add("--cpu", action="store_true", help="use CPU instead of CUDA")
+    add("--tpu", action="store_true", help="use TPU instead of CUDA")
+    add("--bf16", action="store_true", help="use bfloat16; implies --tpu")
+    add("--fp16", action="store_true", help="use FP16")
+    add("--memory-efficient-bf16", action="store_true",
+        help="use a memory-efficient version of BF16 training; implies --bf16")
+    add("--memory-efficient-fp16", action="store_true",
+        help="use a memory-efficient version of FP16 training; implies --fp16")
+    add("--fp16-no-flatten-grads", action="store_true",
+        help="don't flatten FP16 grads tensor")
+    add("--fp16-init-scale", default=2 ** 7, type=int,
+        help="default FP16 loss scale")
+    add("--fp16-scale-window", type=int,
+        help="number of updates before increasing loss scale")
+    add("--fp16-scale-tolerance", default=0.0, type=float,
+        help="pct of updates that can overflow before decreasing the loss scale")
+    add("--min-loss-scale", default=1e-4, type=float, metavar="D",
+        help="minimum FP16 loss scale, after which training is stopped")
+    add("--threshold-loss-scale", type=float,
+        help="threshold FP16 loss scale from below")
+    add("--user-dir", default=None,
+        help="path to a python module containing custom extensions (tasks and/or architectures)")
+    add("--empty-cache-freq", default=0, type=int,
+        help="how often to clear the PyTorch CUDA cache (0 to disable)")
+    add("--all-gather-list-size", default=16384, type=int,
+        help="number of bytes reserved for gathering stats from workers")
+    add("--model-parallel-size", type=int, metavar="N", default=1,
+        help="total number of GPUs to parallelize model over")
+    add("--checkpoint-suffix", default="",
+        help="suffix to add to the checkpoint file name")
+    add("--quantization-config-path", default=None,
+        help="path to quantization config file")
+    add("--profile", action="store_true", help="enable autograd profiler emit_nvtx")
+
+    # One option per registered registry, e.g. "foo_bar" becomes "--foo-bar".
+    from fairseq.registry import REGISTRIES
+    for name, entry in REGISTRIES.items():
+        option = "--" + name.replace("_", "-")
+        add(option, default=entry["default"], choices=entry["registry"].keys())
+
+    # Task definitions can be found under fairseq/tasks/
+    from fairseq.tasks import TASK_REGISTRY
+    add("--task", metavar="TASK", default=default_task,
+        choices=TASK_REGISTRY.keys(),
+        help="task")
+    # fmt: on
+    return parser
+
+
+def add_preprocess_args(parser):
+    """Add the "Preprocessing" argument group to ``parser``.
+
+    Every option, including ``--dataset-impl``, is registered on the group so
+    that they are listed together in ``--help``.
+
+    Args:
+        parser: the argparse parser to extend.
+
+    Returns:
+        The ``parser`` that was passed in (not the group).
+    """
+    group = parser.add_argument_group("Preprocessing")
+    # fmt: off
+    group.add_argument("-s", "--source-lang", default=None, metavar="SRC",
+                       help="source language")
+    group.add_argument("-t", "--target-lang", default=None, metavar="TARGET",
+                       help="target language")
+    group.add_argument("--trainpref", metavar="FP", default=None,
+                       help="train file prefix")
+    group.add_argument("--validpref", metavar="FP", default=None,
+                       help="comma separated, valid file prefixes")
+    group.add_argument("--testpref", metavar="FP", default=None,
+                       help="comma separated, test file prefixes")
+    group.add_argument("--align-suffix", metavar="FP", default=None,
+                       help="alignment file suffix")
+    group.add_argument("--destdir", metavar="DIR", default="data-bin",
+                       help="destination dir")
+    group.add_argument("--thresholdtgt", metavar="N", default=0, type=int,
+                       help="map words appearing less than threshold times to unknown")
+    group.add_argument("--thresholdsrc", metavar="N", default=0, type=int,
+                       help="map words appearing less than threshold times to unknown")
+    group.add_argument("--tgtdict", metavar="FP",
+                       help="reuse given target dictionary")
+    group.add_argument("--srcdict", metavar="FP",
+                       help="reuse given source dictionary")
+    group.add_argument("--nwordstgt", metavar="N", default=-1, type=int,
+                       help="number of target words to retain")
+    group.add_argument("--nwordssrc", metavar="N", default=-1, type=int,
+                       help="number of source words to retain")
+    group.add_argument("--alignfile", metavar="ALIGN", default=None,
+                       help="an alignment file (optional)")
+    # Registered on the group (not the bare parser) so it shows up under
+    # "Preprocessing" in --help; parsing behaviour is the same either way.
+    group.add_argument('--dataset-impl', metavar='FORMAT', default='mmap',
+                       choices=get_available_dataset_impl(),
+                       help='output dataset implementation')
+    group.add_argument("--joined-dictionary", action="store_true",
+                       help="Generate joined dictionary")
+    group.add_argument("--only-source", action="store_true",
+                       help="Only process the source language")
+    group.add_argument("--padding-factor", metavar="N", default=8, type=int,
+                       help="Pad dictionary size to be multiple of N")
+    group.add_argument("--workers", metavar="N", default=1, type=int,
+                       help="number of parallel workers")
+    # fmt: on
+    return parser
+
+
+def add_dataset_args(parser, train=False, gen=False):
+    """Add the "Dataset and data loading" argument group to ``parser``.
+
+    Args:
+        parser: the argparse parser to extend.
+        train: when true, also add the training/validation subset options.
+        gen: when true, also add the generation subset and sharding options.
+
+    Returns:
+        The created argument group.
+    """
+    group = parser.add_argument_group("Dataset and data loading")
+    # fmt: off
+    group.add_argument('--num-workers', default=1, type=int, metavar='N',
+                       help='how many subprocesses to use for data loading')
+    group.add_argument('--skip-invalid-size-inputs-valid-test', action='store_true',
+                       help='ignore too long or too short lines in valid and test set')
+    group.add_argument('--max-tokens', type=int, metavar='N',
+                       help='maximum number of tokens in a batch')
+    group.add_argument('--max-sentences', '--batch-size', type=int, metavar='N',
+                       help='maximum number of sentences in a batch')
+    group.add_argument('--required-batch-size-multiple', default=8, type=int, metavar='N',
+                       help='batch size will either be less than this value, '
+                            'or a multiple of this value')
+    # Registered on the group (not the bare parser) so it is listed with the
+    # other dataset options in --help; parsing behaviour is the same either way.
+    group.add_argument('--dataset-impl', metavar='FORMAT',
+                       choices=get_available_dataset_impl(),
+                       help='output dataset implementation')
+    group.add_argument('--data-buffer-size', default=10, type=int, metavar='N',
+                       help='number of batches to preload')
+    if train:
+        group.add_argument('--train-subset', default='train', metavar='SPLIT',
+                           help='data subset to use for training (e.g. train, valid, test)')
+        group.add_argument('--valid-subset', default='valid', metavar='SPLIT',
+                           help='comma separated list of data subsets to use for validation'
+                                ' (e.g. train, valid, test)')
+        group.add_argument('--validate-interval', type=int, default=1, metavar='N',
+                           help='validate every N epochs')
+        group.add_argument('--validate-interval-updates', type=int, default=0, metavar='N',
+                           help='validate every N updates')
+        group.add_argument('--validate-after-updates', type=int, default=0, metavar='N',
+                           help='dont validate until reaching this many updates')
+        group.add_argument('--fixed-validation-seed', default=None, type=int, metavar='N',
+                           help='specified random seed for validation')
+        group.add_argument('--disable-validation', action='store_true',
+                           help='disable validation')
+        group.add_argument('--max-tokens-valid', type=int, metavar='N',
+                           help='maximum number of tokens in a validation batch'
+                                ' (defaults to --max-tokens)')
+        group.add_argument('--max-sentences-valid', type=int, metavar='N',
+                           help='maximum number of sentences in a validation batch'
+                                ' (defaults to --max-sentences)')
+        group.add_argument('--curriculum', default=0, type=int, metavar='N',
+                           help='don\'t shuffle batches for first N epochs')
+    if gen:
+        group.add_argument('--gen-subset', default='test', metavar='SPLIT',
+                           help='data subset to generate (train, valid, test)')
+        group.add_argument('--num-shards', default=1, type=int, metavar='N',
+                           help='shard generation over N shards')
+        group.add_argument('--shard-id', default=0, type=int, metavar='ID',
+                           help='id of the shard to generate (id < num_shards)')
+    # fmt: on
+    return group
+
+
+def add_distributed_training_args(parser, default_world_size=None):
+    """Add the "Distributed training" argument group to ``parser``.
+
+    Args:
+        parser: the argparse parser to extend.
+        default_world_size: default for ``--distributed-world-size``; when
+            ``None`` the number of visible CUDA devices (at least 1) is used.
+
+    Returns:
+        The created argument group.
+    """
+    group = parser.add_argument_group("Distributed training")
+    # Number of visible GPUs, never below 1; computed once and shared by the
+    # world-size and per-node defaults.
+    visible_gpus = max(1, torch.cuda.device_count())
+    # fmt: off
+    if default_world_size is None:
+        default_world_size = visible_gpus
+    group.add_argument('--distributed-world-size', type=int, metavar='N',
+                       default=default_world_size,
+                       help='total number of GPUs across all nodes (default: all visible GPUs)')
+    group.add_argument('--distributed-rank', default=0, type=int,
+                       help='rank of the current worker')
+    group.add_argument('--distributed-backend', default='nccl', type=str,
+                       help='distributed backend')
+    group.add_argument('--distributed-init-method', default=None, type=str,
+                       help='typically tcp://hostname:port that will be used to '
+                            'establish initial connection')
+    group.add_argument('--distributed-port', default=-1, type=int,
+                       help='port number (not required if using --distributed-init-method)')
+    group.add_argument('--device-id', '--local_rank', default=0, type=int,
+                       help='which GPU to use (usually configured automatically)')
+    group.add_argument('--distributed-no-spawn', action='store_true',
+                       help='do not spawn multiple processes even if multiple GPUs are visible')
+    # "c10d" is PyTorch's DDP implementation and provides the fastest
+    # training. "no_c10d" is a more robust, but slightly slower DDP
+    # implementation. Try this if you get warning messages about
+    # inconsistent gradients between workers, or if some of your model
+    # parameters are not always used.
+    group.add_argument('--ddp-backend', default='c10d', type=str,
+                       choices=['c10d', 'no_c10d'],
+                       help='DistributedDataParallel backend')
+    group.add_argument('--bucket-cap-mb', default=25, type=int, metavar='MB',
+                       help='bucket size for reduction')
+    group.add_argument('--fix-batches-to-gpus', action='store_true',
+                       help='don\'t shuffle batches between GPUs; this reduces overall '
+                            'randomness and may affect precision but avoids the cost of '
+                            're-reading the data')
+    # The flag is off by default and passing it turns the feature on, so the
+    # help text says "enable" (it previously read "disable").
+    group.add_argument('--find-unused-parameters', default=False, action='store_true',
+                       help='enable unused parameter detection (not applicable to '
+                            'no_c10d ddp-backend)')
+    group.add_argument('--fast-stat-sync', default=False, action='store_true',
+                       help='[deprecated] this is now defined per Criterion')
+    group.add_argument('--broadcast-buffers', default=False, action='store_true',
+                       help='Copy non-trainable parameters between GPUs, such as '
+                            'batchnorm population statistics')
+
+    group.add_argument('--distributed-wrapper', default='DDP', type=str,
+                       choices=['DDP', 'SlowMo'],
+                       help='DistributedDataParallel backend')
+    # Add arguments for SlowMo - these will be used when SlowMo is enabled via above
+    group.add_argument('--slowmo-momentum', default=None, type=float,
+                       help='SlowMo momentum term; by default use 0.0 for 16 GPUs, '
+                            '0.2 for 32 GPUs; 0.5 for 64 GPUs, 0.6 for > 64 GPUs')
+    group.add_argument('--slowmo-algorithm', default='LocalSGD', choices=['LocalSGD', 'SGP'],
+                       help='whether to use LocalSGD or SGP')
+    group.add_argument('--localsgd-frequency', default=3, type=int,
+                       help='Local SGD allreduce frequency')
+    group.add_argument('--nprocs-per-node', type=int, metavar='N',
+                       default=visible_gpus,
+                       help='number of GPUs in each node. An allreduce operation across GPUs in '
+                            'a node is very fast. Hence, we do allreduce across GPUs in a node, '
+                            'and gossip across different nodes')
+    # fmt: on
+    return group
+
+
+def add_optimization_args(parser):
+    """Register the "Optimization" argument group on ``parser`` and return it."""
+    optim_group = parser.add_argument_group("Optimization")
+    add = optim_group.add_argument
+    # fmt: off
+    add("--max-epoch", "--me", default=0, type=int, metavar="N",
+        help="force stop training at specified epoch")
+    add("--max-update", "--mu", default=0, type=int, metavar="N",
+        help="force stop training at specified update")
+    add("--stop-time-hours", default=0, type=float, metavar="N",
+        help="force stop training after specified cumulative time (if >0)")
+    add("--clip-norm", default=0.0, type=float, metavar="NORM",
+        help="clip threshold of gradients")
+    add("--sentence-avg", action="store_true",
+        help="normalize gradients by the number of sentences in a batch"
+             " (default is to normalize by number of tokens)")
+    # Both list-valued options have string defaults and are converted through
+    # eval_str_list.
+    add("--update-freq", default="1", metavar="N1,N2,...,N_K",
+        type=lambda raw: eval_str_list(raw, type=int),
+        help="update parameters every N_i batches, when in epoch i")
+    add("--lr", "--learning-rate", default="0.25", type=eval_str_list,
+        metavar="LR_1,LR_2,...,LR_N",
+        help="learning rate for the first N epochs; all epochs >N using LR_N"
+             " (note: this may be interpreted differently depending on --lr-scheduler)")
+    add("--min-lr", default=-1, type=float, metavar="LR",
+        help="stop training when the learning rate reaches this minimum")
+    add("--use-bmuf", default=False, action="store_true",
+        help="specify global optimizer for syncing models on different GPUs/shards")
+    # fmt: on
+    return optim_group
+
+
+def add_checkpoint_args(parser):
+    """Add the "Checkpointing" argument group to ``parser``.
+
+    Args:
+        parser: the argparse parser to extend.
+
+    Returns:
+        The created argument group.
+    """
+    group = parser.add_argument_group("Checkpointing")
+    # fmt: off
+    group.add_argument('--save-dir', metavar='DIR', default='checkpoints',
+                       help='path to save checkpoints')
+    group.add_argument('--restore-file', default='checkpoint_last.pt',
+                       help='filename from which to load checkpoint '
+                            '(default: <save-dir>/checkpoint_last.pt)')
+    group.add_argument('--finetune-from-model', default=None, type=str,
+                       help='finetune from a pretrained model; '
+                            'note that meters and lr scheduler will be reset')
+    group.add_argument('--reset-dataloader', action='store_true',
+                       help='if set, does not reload dataloader state from the checkpoint')
+    group.add_argument('--reset-lr-scheduler', action='store_true',
+                       help='if set, does not load lr scheduler state from the checkpoint')
+    group.add_argument('--reset-meters', action='store_true',
+                       help='if set, does not load meters from the checkpoint')
+    group.add_argument('--reset-optimizer', action='store_true',
+                       help='if set, does not load optimizer state from the checkpoint')
+    group.add_argument('--optimizer-overrides', default="{}", type=str, metavar='DICT',
+                       help='a dictionary used to override optimizer args when loading a checkpoint')
+    group.add_argument('--save-interval', type=int, default=1, metavar='N',
+                       help='save a checkpoint every N epochs')
+    group.add_argument('--save-interval-updates', type=int, default=0, metavar='N',
+                       help='save a checkpoint (and validate) every N updates')
+    group.add_argument('--keep-interval-updates', type=int, default=-1, metavar='N',
+                       help='keep the last N checkpoints saved with --save-interval-updates')
+    group.add_argument('--keep-last-epochs', type=int, default=-1, metavar='N',
+                       help='keep last N epoch checkpoints')
+    group.add_argument('--keep-best-checkpoints', type=int, default=-1, metavar='N',
+                       help='keep best N checkpoints based on scores')
+    group.add_argument('--no-save', action='store_true',
+                       help='don\'t save models or checkpoints')
+    group.add_argument('--no-epoch-checkpoints', action='store_true',
+                       help='only store last and best checkpoints')
+    group.add_argument('--no-last-checkpoints', action='store_true',
+                       help='don\'t store last checkpoints')
+    group.add_argument('--no-save-optimizer-state', action='store_true',
+                       help='don\'t save optimizer-state as part of checkpoint')
+    group.add_argument('--best-checkpoint-metric', type=str, default='loss',
+                       help='metric to use for saving "best" checkpoints')
+    group.add_argument('--maximize-best-checkpoint-metric', action='store_true',
+                       help='select the largest metric value for saving "best" checkpoints')
+    group.add_argument('--patience', type=int, default=-1, metavar='N',
+                       help=('early stop training if valid performance doesn\'t '
+                             'improve for N consecutive validation runs; note '
+                             'that this is influenced by --validate-interval'))
+    # fmt: on
+    return group
+
+
+def add_common_eval_args(group):
+    """Add the evaluation options shared by LM evaluation and generation.
+
+    Args:
+        group: an argparse argument group (or parser) to add the options to.
+
+    Returns:
+        None; ``group`` is modified in place.
+    """
+    # fmt: off
+    group.add_argument('--path', metavar='FILE',
+                       help='path(s) to model file(s), colon separated')
+    # A bare --remove-bpe (no value) stores the const '@@ '.
+    group.add_argument('--remove-bpe', '--post-process', nargs='?', const='@@ ', default=None,
+                       help='remove BPE tokens before scoring (can be set to sentencepiece)')
+    group.add_argument('--quiet', action='store_true',
+                       help='only print final scores')
+    group.add_argument('--model-overrides', default="{}", type=str, metavar='DICT',
+                       help='a dictionary used to override model args at generation '
+                            'that were used during model training')
+    group.add_argument('--results-path', metavar='RESDIR', type=str, default=None,
+                       help='path to save eval results (optional)')
+    # fmt: on
+
+
+def add_eval_lm_args(parser):
+    """Register the "LM Evaluation" group: common eval options plus LM-specific ones."""
+    lm_group = parser.add_argument_group("LM Evaluation")
+    add_common_eval_args(lm_group)
+    add = lm_group.add_argument
+    # fmt: off
+    add("--output-word-probs", action="store_true",
+        help="if set, outputs words and their predicted log probabilities to standard output")
+    add("--output-word-stats", action="store_true",
+        help="if set, outputs word statistics such as word count, average probability, etc")
+    add("--context-window", default=0, type=int, metavar="N",
+        help="ensures that every evaluated token has access to a context of at least this size,"
+             " if possible")
+    add("--softmax-batch", default=sys.maxsize, type=int, metavar="N",
+        help="if BxT is more than this, will batch the softmax over vocab to this amount of tokens"
+             " in order to fit into GPU memory")
+    # fmt: on
+
+
+def add_generation_args(parser):
+    """Add the "Generation" argument group to ``parser``.
+
+    The group receives the common evaluation options first, followed by the
+    search, sampling, iterative-refinement and decoding-format options.
+
+    Args:
+        parser: the argparse parser to extend.
+
+    Returns:
+        The created argument group.
+    """
+    group = parser.add_argument_group("Generation")
+    add_common_eval_args(group)
+    # fmt: off
+    group.add_argument('--beam', default=5, type=int, metavar='N',
+                       help='beam size')
+    group.add_argument('--nbest', default=1, type=int, metavar='N',
+                       help='number of hypotheses to output')
+    group.add_argument('--max-len-a', default=0, type=float, metavar='N',
+                       help=('generate sequences of maximum length ax + b, '
+                             'where x is the source length'))
+    group.add_argument('--max-len-b', default=200, type=int, metavar='N',
+                       help=('generate sequences of maximum length ax + b, '
+                             'where x is the source length'))
+    group.add_argument('--min-len', default=1, type=float, metavar='N',
+                       help=('minimum generation length'))
+    group.add_argument('--match-source-len', default=False, action='store_true',
+                       help=('generations should match the source length'))
+    group.add_argument('--no-early-stop', action='store_true',
+                       help='deprecated')
+    group.add_argument('--unnormalized', action='store_true',
+                       help='compare unnormalized hypothesis scores')
+    group.add_argument('--no-beamable-mm', action='store_true',
+                       help='don\'t use BeamableMM in attention layers')
+    group.add_argument('--lenpen', default=1, type=float,
+                       help='length penalty: <1.0 favors shorter, >1.0 favors longer sentences')
+    group.add_argument('--unkpen', default=0, type=float,
+                       help='unknown word penalty: <0 produces more unks, >0 produces fewer')
+    group.add_argument('--replace-unk', nargs='?', const=True, default=None,
+                       help='perform unknown replacement (optionally with alignment dictionary)')
+    group.add_argument('--sacrebleu', action='store_true',
+                       help='score with sacrebleu')
+    group.add_argument('--score-reference', action='store_true',
+                       help='just score the reference translation')
+    group.add_argument('--prefix-size', default=0, type=int, metavar='PS',
+                       help='initialize generation by target prefix of given length')
+    group.add_argument('--no-repeat-ngram-size', default=0, type=int, metavar='N',
+                       help='ngram blocking such that this size ngram cannot be repeated in the generation')
+    group.add_argument('--sampling', action='store_true',
+                       help='sample hypotheses instead of using beam search')
+    group.add_argument('--sampling-topk', default=-1, type=int, metavar='PS',
+                       help='sample from top K likely next words instead of all words')
+    group.add_argument('--sampling-topp', default=-1.0, type=float, metavar='PS',
+                       help='sample from the smallest set whose cumulative probability mass exceeds p for next words')
+    group.add_argument('--temperature', default=1., type=float, metavar='N',
+                       help='temperature for generation')
+    group.add_argument('--diverse-beam-groups', default=-1, type=int, metavar='N',
+                       help='number of groups for Diverse Beam Search')
+    group.add_argument('--diverse-beam-strength', default=0.5, type=float, metavar='N',
+                       help='strength of diversity penalty for Diverse Beam Search')
+    group.add_argument('--diversity-rate', default=-1.0, type=float, metavar='N',
+                       help='strength of diversity penalty for Diverse Siblings Search')
+    group.add_argument('--print-alignment', action='store_true',
+                       help='if set, uses attention feedback to compute and print alignment to source tokens')
+    group.add_argument('--print-step', action='store_true')
+
+    # arguments for iterative refinement generator
+    group.add_argument('--iter-decode-eos-penalty', default=0.0, type=float, metavar='N',
+                       help='if > 0.0, it penalized early-stopping in decoding.')
+    group.add_argument('--iter-decode-max-iter', default=10, type=int, metavar='N',
+                       help='maximum iterations for iterative refinement.')
+    group.add_argument('--iter-decode-force-max-iter', action='store_true',
+                       help='if set, run exact the maximum number of iterations without early stop')
+    group.add_argument('--iter-decode-with-beam', default=1, type=int, metavar='N',
+                       help='if > 1, model will generate translations varying by the lengths.')
+    # No trailing comma after this call: it used to turn the statement into a
+    # throwaway one-element tuple.
+    group.add_argument('--iter-decode-with-external-reranker', action='store_true',
+                       help='if set, the last checkpoint are assumed to be a reranker to rescore the translations')
+    group.add_argument('--retain-iter-history', action='store_true',
+                       help='if set, decoding returns the whole history of iterative refinement')
+    group.add_argument('--retain-dropout', action='store_true',
+                       help='Use dropout at inference time')
+    group.add_argument('--retain-dropout-modules', default=None, nargs='+', type=str,
+                       help='if set, only retain dropout for the specified modules; '
+                            'if not set, then dropout will be retained for all modules')
+
+    # special decoding format for advanced decoding.
+    group.add_argument('--decoding-format', default=None, type=str, choices=['unigram', 'ensemble', 'vote', 'dp', 'bs'])
+    # fmt: on
+    return group
+
+
+def add_interactive_args(parser):
+    """Register the "Interactive" argument group (buffer size and input file)."""
+    interactive_group = parser.add_argument_group("Interactive")
+    add = interactive_group.add_argument
+    # fmt: off
+    add("--buffer-size", default=0, type=int, metavar="N",
+        help="read this many sentences into a buffer before processing them")
+    add("--input", default="-", type=str, metavar="FILE",
+        help="file to read from; use - for stdin")
+    # fmt: on
+
+
+def add_model_args(parser):
+    """Register the "Model configuration" group with its ``--arch``/``-a`` option.
+
+    Returns the created argument group.
+    """
+    model_group = parser.add_argument_group("Model configuration")
+    # fmt: off
+
+    # Model definitions can be found under fairseq/models/
+    #
+    # The model architecture can be specified in several ways.
+    # In increasing order of priority:
+    # 1) model defaults (lowest priority)
+    # 2) --arch argument
+    # 3) --encoder/decoder-* arguments (highest priority)
+    from fairseq.models import ARCH_MODEL_REGISTRY
+    model_group.add_argument(
+        "--arch", "-a",
+        default="fconv",
+        metavar="ARCH",
+        choices=ARCH_MODEL_REGISTRY.keys(),
+        help="Model Architecture",
+    )
+    # fmt: on
+    return model_group
diff --git a/fairseq/pdb.py b/fairseq/pdb.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1ce3c46bca47fd8a47272770d6c61fcd9c13f75
--- /dev/null
+++ b/fairseq/pdb.py
@@ -0,0 +1,47 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import multiprocessing
+import os
+import pdb
+import sys
+
+
+__all__ = ['set_trace']
+
+
+# One-element list used as a mutable slot for the file object opened on the
+# stdin descriptor; MultiprocessingPdb._cmdloop fills it in on first use.
+_stdin = [None]
+# Held by MultiprocessingPdb._cmdloop for the whole debugger command loop.
+_stdin_lock = multiprocessing.Lock()
+# File descriptor of sys.stdin captured at import time, or None when
+# fileno() fails for any reason.
+try:
+    _stdin_fd = sys.stdin.fileno()
+except Exception:
+    _stdin_fd = None
+
+
+class MultiprocessingPdb(pdb.Pdb):
+    """Pdb subclass intended for use inside multiprocessing workers.
+
+    Usage: `from fairseq import pdb; pdb.set_trace()`
+    """
+
+    def __init__(self):
+        super().__init__(nosigint=True)
+
+    def _cmdloop(self):
+        """Run the command loop under the module lock, reading from the captured stdin."""
+        saved_stdin = sys.stdin
+        with _stdin_lock:
+            try:
+                if _stdin_fd is not None:
+                    # Open a file object on the captured descriptor once and
+                    # reuse it for every later session.
+                    if not _stdin[0]:
+                        _stdin[0] = os.fdopen(_stdin_fd)
+                    sys.stdin = _stdin[0]
+                self.cmdloop()
+            finally:
+                # Always put back whatever sys.stdin was on entry.
+                sys.stdin = saved_stdin
+
+
+def set_trace():
+    """Start a MultiprocessingPdb session in the frame of the caller."""
+    debugger = MultiprocessingPdb()
+    caller_frame = sys._getframe().f_back
+    debugger.set_trace(caller_frame)
diff --git a/fairseq/quantization_utils.py b/fairseq/quantization_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..a7f5ade9b31738393c78ee803fb8d66d825e69e1
--- /dev/null
+++ b/fairseq/quantization_utils.py
@@ -0,0 +1,142 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+
+from fairseq.modules.quantization import pq, quantization_options, scalar
+
+
+# Module-level logger, named after this module.
+logger = logging.getLogger(__name__)
+
+
+def quantize_model_scalar(model, args):
+    """Apply scalar quantization to ``model`` when ``args`` asks for it.
+
+    ``args.quant_noise_scalar`` is read (0 when the attribute is missing); a
+    positive value triggers ``scalar.quantize_model_``, which edits the model
+    in place. The same ``model`` object is returned either way.
+    """
+    noise_rate = getattr(args, 'quant_noise_scalar', 0)
+    if not (noise_rate > 0):
+        return model
+    scalar.quantize_model_(model, p=noise_rate, bits=8, update_step=1000)
+    return model
+
+
+class Quantizer(object):
+    """Schedule iterative product quantization (PQ) over a training run.
+
+    The parsed config supplies ``n_centroids``, ``block_sizes`` and
+    ``layers_to_quantize``; each entry of ``layers_to_quantize`` is one
+    quantization step. Training is divided into equally long phases, measured
+    either in epochs or in updates, with one phase per step.
+    """
+
+    def __init__(self, config_path, max_epoch, max_update):
+        """Load the quantization config and derive the step schedule.
+
+        Args:
+            config_path: path to a YAML config file; when falsy, an empty dict
+                is handed to ``quantization_options.parse_config_yaml``.
+            max_epoch: total number of epochs; > 0 selects epoch scheduling.
+            max_update: total number of updates; > 0 selects update scheduling.
+
+        Raises:
+            ImportError: if the ``yaml`` module (PyYAML) cannot be imported.
+            AssertionError: if ``layers_to_quantize`` is empty, if the given
+                limit is not divisible by the number of steps, or unless
+                exactly one of ``max_epoch`` / ``max_update`` is positive.
+        """
+        try:
+            import yaml
+        except ImportError as err:
+            # The ``yaml`` module is distributed on PyPI as "pyyaml".
+            raise ImportError('Please install yaml with: pip install pyyaml') from err
+
+        # parse config
+        if config_path:
+            with open(config_path) as config_file:
+                config = quantization_options.parse_config_yaml(
+                    yaml.safe_load(config_file)
+                )
+        else:
+            config = quantization_options.parse_config_yaml({})
+
+        self.n_centroids_config = config["n_centroids"]
+        self.block_sizes_config = config["block_sizes"]
+        self.layers_to_quantize = config["layers_to_quantize"]
+
+        # We assume that training will run for a fixed number of epochs
+        # (or updates) and that we should train for equal durations
+        # between iterations of PQ.
+        num_iterations = len(self.layers_to_quantize)
+        # Fail with a clear message instead of a ZeroDivisionError below.
+        assert num_iterations > 0, (
+            'for iterative PQ, layers_to_quantize must not be empty'
+        )
+        if max_epoch > 0:
+            assert max_epoch % num_iterations == 0, (
+                'for iterative PQ, --max-epoch (={}) must be evenly divisible by '
+                'len(layers_to_quantize) (={})'.format(max_epoch, num_iterations)
+            )
+            self.epoch_schedule = max_epoch // num_iterations
+        else:
+            self.epoch_schedule = None
+        if max_update > 0:
+            assert max_update % num_iterations == 0, (
+                'for iterative PQ, --max-update (={}) must be evenly divisible by '
+                'len(layers_to_quantize) (={})'.format(max_update, num_iterations)
+            )
+            self.update_schedule = max_update // num_iterations
+        else:
+            self.update_schedule = None
+        # XOR: this also fails when neither limit was given, so the message
+        # covers both cases.
+        assert (self.epoch_schedule is not None) ^ (self.update_schedule is not None), \
+            'for iterative PQ, must specify exactly one of --max-update and --max-epoch'
+
+        # 0 is a special value for quantization step, which will force
+        # the first call to begin_epoch() to call step()
+        self.quantization_step = 0
+
+    def set_trainer(self, trainer):
+        """Attach the trainer and create a size tracker for its model."""
+        self.trainer = trainer
+        self.size_tracker = pq.SizeTracker(self.trainer.get_model())
+
+    def step(self):
+        """Move to the next stage of quantization."""
+        if self.quantization_step >= len(self.layers_to_quantize):
+            # Maybe we just finished the last training step or we loaded
+            # a checkpoint for an iterative PQ model which previously
+            # finished training. Either way, don't quantize again.
+            return
+
+        logger.info(
+            'quantizing model (step={}; layers_to_quantize[step]={})'.format(
+                self.quantization_step, self.layers_to_quantize[self.quantization_step]
+            )
+        )
+        quantized_layers = pq.quantize_model_(
+            self.trainer.get_model(),
+            self.size_tracker,
+            self.layers_to_quantize,
+            self.block_sizes_config,
+            self.n_centroids_config,
+            step=self.quantization_step,
+        )
+        logger.info('quantized layers: {}'.format(quantized_layers))
+        logger.info(self.size_tracker)
+
+        self.quantization_step += 1
+
+        # reinitialize the Trainer since model parameters have changed
+        self.trainer.reinitialize()
+
+    def begin_epoch(self, epoch):
+        """Called at the beginning of each epoch (epochs start at 1)."""
+        if (
+            (
+                self.epoch_schedule is not None
+                and epoch > 0
+                and (epoch - 1) % self.epoch_schedule == 0
+            )
+            # we always step once in the beginning, even if using
+            # update-based quantization
+            or self.quantization_step == 0
+        ):
+            self.step()
+
+    def step_update(self, num_updates):
+        """Called at the end of each step."""
+        if (
+            self.update_schedule is not None
+            and num_updates > 0
+            and num_updates % self.update_schedule == 0
+        ):
+            self.step()
+
+    def state_dict(self):
+        """Return the config, schedule and current step as a plain dict."""
+        return {
+            'n_centroids_config': self.n_centroids_config,
+            'block_sizes_config': self.block_sizes_config,
+            'layers_to_quantize': self.layers_to_quantize,
+            'epoch_schedule': self.epoch_schedule,
+            'update_schedule': self.update_schedule,
+            'quantization_step': self.quantization_step,
+        }
+
+    def load_state_dict(self, state_dict):
+        """Restore the fields written by ``state_dict()``."""
+        self.n_centroids_config = state_dict['n_centroids_config']
+        self.block_sizes_config = state_dict['block_sizes_config']
+        self.layers_to_quantize = state_dict['layers_to_quantize']
+        self.epoch_schedule = state_dict['epoch_schedule']
+        self.update_schedule = state_dict['update_schedule']
+        self.quantization_step = state_dict['quantization_step']
diff --git a/fairseq/registry.py b/fairseq/registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..ed24258c57713075fd42383a8bfa9461c45cf1b1
--- /dev/null
+++ b/fairseq/registry.py
@@ -0,0 +1,80 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+
+
+# Table of every registry created through setup_registry, keyed by the
+# normalized registry name ('--' prefix removed, '-' replaced with '_').
+REGISTRIES = {}
+
+
+def setup_registry(
+    registry_name: str,
+    base_class=None,
+    default=None,
+):
+    """Create a named registry and return its ``(build, register, table)`` triple.
+
+    Args:
+        registry_name: command-line style name; it must start with ``--``.
+            The name is normalized by dropping the ``--`` prefix and
+            replacing ``-`` with ``_``; the normalized name is the attribute
+            looked up on ``args`` by the returned builder.
+        base_class: when given, every registered class must be a subclass.
+        default: default choice recorded in ``REGISTRIES`` for this registry.
+
+    Returns:
+        A tuple ``(build_x, register_x, REGISTRY)``: the builder function,
+        the class-decorator factory and the name -> class mapping. When the
+        registry already exists, the triple created by the first call is
+        returned (previously ``None`` was returned, which broke callers that
+        unpack three values); ``base_class`` and ``default`` of the repeated
+        call are ignored.
+    """
+    assert registry_name.startswith('--')
+    registry_name = registry_name[2:].replace('-', '_')
+
+    # maintain a registry of all registries
+    if registry_name in REGISTRIES:
+        # registry already exists: hand back the handles created the first
+        # time so that the return value can always be unpacked
+        existing = REGISTRIES[registry_name]
+        return existing['build'], existing['register'], existing['registry']
+
+    REGISTRY = {}
+    REGISTRY_CLASS_NAMES = set()
+
+    def build_x(args, *extra_args, **extra_kwargs):
+        """Instantiate the class selected by ``args.<registry_name>``.
+
+        Returns ``None`` when no choice is set on ``args``. A class-level
+        ``build_<registry_name>`` attribute is preferred over the plain
+        constructor. Defaults declared by the class's ``add_args`` are filled
+        into ``args`` before building.
+        """
+        choice = getattr(args, registry_name, None)
+        if choice is None:
+            return None
+        cls = REGISTRY[choice]
+        if hasattr(cls, 'build_' + registry_name):
+            builder = getattr(cls, 'build_' + registry_name)
+        else:
+            builder = cls
+        set_defaults(args, cls)
+        return builder(args, *extra_args, **extra_kwargs)
+
+    def register_x(name):
+        """Return a class decorator that registers the class under ``name``."""
+
+        def register_x_cls(cls):
+            if name in REGISTRY:
+                raise ValueError('Cannot register duplicate {} ({})'.format(registry_name, name))
+            if cls.__name__ in REGISTRY_CLASS_NAMES:
+                raise ValueError(
+                    'Cannot register {} with duplicate class name ({})'.format(
+                        registry_name, cls.__name__,
+                    )
+                )
+            if base_class is not None and not issubclass(cls, base_class):
+                raise ValueError('{} must extend {}'.format(cls.__name__, base_class.__name__))
+            REGISTRY[name] = cls
+            REGISTRY_CLASS_NAMES.add(cls.__name__)
+            return cls
+
+        return register_x_cls
+
+    REGISTRIES[registry_name] = {
+        'registry': REGISTRY,
+        'default': default,
+        # kept so that a repeated setup_registry call can return them
+        'build': build_x,
+        'register': register_x,
+    }
+
+    return build_x, register_x, REGISTRY
+
+
+def set_defaults(args, cls):
+    """Fill ``args`` with the argument defaults that ``cls.add_args`` declares.
+
+    Attributes already present on ``args`` are never overwritten. Classes
+    without an ``add_args`` attribute are ignored.
+    """
+    if not hasattr(cls, 'add_args'):
+        return
+
+    parser = argparse.ArgumentParser(argument_default=argparse.SUPPRESS, allow_abbrev=False)
+    cls.add_args(parser)
+
+    # same collection rule as argparse.py: the first action that declares a
+    # real (non-suppressed) default for a destination wins
+    declared = {}
+    for action in parser._actions:
+        if action.dest is argparse.SUPPRESS or action.dest in declared:
+            continue
+        if action.default is argparse.SUPPRESS:
+            continue
+        declared[action.dest] = action.default
+
+    for name, value in declared.items():
+        if not hasattr(args, name):
+            setattr(args, name, value)
diff --git a/fairseq/scoring/__init__.py b/fairseq/scoring/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..6e5cc287ba393e83ed2da5c2a0fb8e156654853f
--- /dev/null
+++ b/fairseq/scoring/__init__.py
@@ -0,0 +1,22 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+import importlib
+import os
+
+from fairseq import registry
+
+
+# Create the scoring registry: `build_scoring` instantiates the scorer named by
+# `args.scoring`, `register_scoring` is the class decorator used by the scorer
+# modules, and `SCORING_REGISTRY` maps registered names to classes.
+build_scoring, register_scoring, SCORING_REGISTRY = registry.setup_registry(
+    "--scoring", default="bleu"
+)
+
+
+# automatically import any Python files in the current directory
+# (importing a module runs its @register_scoring decorators; files whose
+# names start with "_" are skipped)
+for file in os.listdir(os.path.dirname(__file__)):
+    if file.endswith(".py") and not file.startswith("_"):
+        module = file[: file.find(".py")]
+        importlib.import_module("fairseq.scoring." + module)
diff --git a/fairseq/scoring/__pycache__/__init__.cpython-310.pyc b/fairseq/scoring/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..eace372b9c5541cd35bef5afad465381cceb604b
Binary files /dev/null and b/fairseq/scoring/__pycache__/__init__.cpython-310.pyc differ
diff --git a/fairseq/scoring/__pycache__/bleu.cpython-310.pyc b/fairseq/scoring/__pycache__/bleu.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..10670697ca22d2f7d65e2caf1ecc44114cba4fa7
Binary files /dev/null and b/fairseq/scoring/__pycache__/bleu.cpython-310.pyc differ
diff --git a/fairseq/scoring/__pycache__/scoring_utils.cpython-310.pyc b/fairseq/scoring/__pycache__/scoring_utils.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..901a4cd98a9f3d97ef365bc09677e24658d57b78
Binary files /dev/null and b/fairseq/scoring/__pycache__/scoring_utils.cpython-310.pyc differ
diff --git a/fairseq/scoring/__pycache__/wer.cpython-310.pyc b/fairseq/scoring/__pycache__/wer.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6ad6fbccc45824b2b528456b452d50b4818d145b
Binary files /dev/null and b/fairseq/scoring/__pycache__/wer.cpython-310.pyc differ
diff --git a/fairseq/scoring/bleu.py b/fairseq/scoring/bleu.py
new file mode 100644
index 0000000000000000000000000000000000000000..40f3440d82f0638c138f131d3a9bdf0e3d2b2a33
--- /dev/null
+++ b/fairseq/scoring/bleu.py
@@ -0,0 +1,141 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import ctypes
+import math
+import torch
+
+from fairseq.scoring import register_scoring
+
+# libbleu is shipped as a shared object; when it cannot be imported, print a
+# build hint to stderr and re-raise the ImportError.
+try:
+    from fairseq import libbleu
+except ImportError as e:
+    import sys
+
+    sys.stderr.write("ERROR: missing libbleu.so. run `pip install --editable .`\n")
+    raise e
+
+
+# Load the same file through ctypes; the bleu_* functions used by Scorer are
+# called on this handle.
+C = ctypes.cdll.LoadLibrary(libbleu.__file__)
+
+
+class BleuStat(ctypes.Structure):
+    """ctypes record of BLEU statistics that is passed by reference to libbleu.
+
+    Holds the reference and prediction lengths plus, for n = 1..4, a
+    ``match<n>`` / ``count<n>`` pair that ``Scorer.precision`` divides to
+    obtain the n-gram precision.
+    """
+
+    # NOTE(review): field order and types presumably have to match the struct
+    # declared on the C side -- confirm before changing anything here.
+    _fields_ = [
+        ("reflen", ctypes.c_size_t),
+        ("predlen", ctypes.c_size_t),
+        ("match1", ctypes.c_size_t),
+        ("count1", ctypes.c_size_t),
+        ("match2", ctypes.c_size_t),
+        ("count2", ctypes.c_size_t),
+        ("match3", ctypes.c_size_t),
+        ("count3", ctypes.c_size_t),
+        ("match4", ctypes.c_size_t),
+        ("count4", ctypes.c_size_t),
+    ]
+
+
+@register_scoring("sacrebleu")
+class SacrebleuScorer(object):
+    """Corpus-level BLEU computed by the ``sacrebleu`` package on raw strings.
+
+    Sentences are collected with ``add_string``; the corpus score is computed
+    on demand by ``score`` / ``result_string``.
+    """
+
+    def __init__(self, *unused):
+        # imported here so that sacrebleu is only needed when this scorer is
+        # actually instantiated
+        import sacrebleu
+
+        self.sacrebleu = sacrebleu
+        self.reset()
+
+    def reset(self, one_init=False):
+        """Forget all collected sentences.
+
+        Raises:
+            NotImplementedError: if ``one_init`` is true.
+        """
+        if one_init:
+            raise NotImplementedError
+        self.ref = []
+        self.sys = []
+
+    def add_string(self, ref, pred):
+        """Record one reference string and the matching system output."""
+        self.ref.append(ref)
+        self.sys.append(pred)
+
+    def _corpus_bleu(self, order):
+        """Return sacrebleu's result object for the collected sentences.
+
+        Raises:
+            NotImplementedError: if ``order`` is not 4.
+        """
+        if order != 4:
+            raise NotImplementedError
+        return self.sacrebleu.corpus_bleu(self.sys, [self.ref])
+
+    def score(self, order=4):
+        """Return the numeric corpus BLEU score.
+
+        The score is read from the sacrebleu result object itself; the
+        previous implementation read ``.score`` from the formatted string
+        and therefore always raised ``AttributeError``.
+        """
+        return self._corpus_bleu(order).score
+
+    def result_string(self, order=4):
+        """Return sacrebleu's formatted result line."""
+        return self._corpus_bleu(order).format()
+
+
+@register_scoring("bleu")
+class Scorer(object):
+    """BLEU scorer over token-id tensors, backed by the libbleu C library.
+
+    Statistics are accumulated in a ``BleuStat`` record that is passed by
+    reference to the C functions ``bleu_zero_init`` / ``bleu_one_init`` /
+    ``bleu_add``.
+
+    Args:
+        pad: padding token id, forwarded to ``bleu_add``.
+        eos: end-of-sentence token id, forwarded to ``bleu_add``.
+        unk: unknown-word token id; reference positions holding it are
+            masked so that they never match a prediction.
+    """
+
+    def __init__(self, pad, eos, unk):
+        self.stat = BleuStat()
+        self.pad = pad
+        self.eos = eos
+        self.unk = unk
+        self.reset()
+
+    def reset(self, one_init=False):
+        """Re-initialize the statistics via ``bleu_one_init`` or ``bleu_zero_init``."""
+        if one_init:
+            C.bleu_one_init(ctypes.byref(self.stat))
+        else:
+            C.bleu_zero_init(ctypes.byref(self.stat))
+
+    def add(self, ref, pred):
+        """Accumulate the statistics of one reference/prediction pair.
+
+        Args:
+            ref: reference token ids, a ``torch.IntTensor`` without negative
+                values.
+            pred: predicted token ids, a ``torch.IntTensor``.
+
+        Raises:
+            TypeError: if ``ref`` or ``pred`` is not a ``torch.IntTensor``.
+        """
+        if not isinstance(ref, torch.IntTensor):
+            raise TypeError("ref must be a torch.IntTensor (got {})".format(type(ref)))
+        if not isinstance(pred, torch.IntTensor):
+            raise TypeError("pred must be a torch.IntTensor(got {})".format(type(pred)))
+
+        # don't match unknown words
+        rref = ref.clone()
+        assert not rref.lt(0).any()
+        rref[rref.eq(self.unk)] = -999
+
+        # the C side receives raw data pointers, so both buffers are made
+        # contiguous and flattened first
+        rref = rref.contiguous().view(-1)
+        pred = pred.contiguous().view(-1)
+
+        C.bleu_add(
+            ctypes.byref(self.stat),
+            ctypes.c_size_t(rref.size(0)),
+            ctypes.c_void_p(rref.data_ptr()),
+            ctypes.c_size_t(pred.size(0)),
+            ctypes.c_void_p(pred.data_ptr()),
+            ctypes.c_int(self.pad),
+            ctypes.c_int(self.eos),
+        )
+
+    def score(self, order=4):
+        """Return BLEU: brevity penalty times the geometric mean of the first
+        ``order`` n-gram precisions, scaled to 0-100."""
+        # a zero precision contributes -inf, which makes exp() below return 0
+        psum = sum(
+            math.log(p) if p > 0 else float("-Inf") for p in self.precision()[:order]
+        )
+        return self.brevity() * math.exp(psum / order) * 100
+
+    def precision(self):
+        """Return the four n-gram precisions (0 where no n-gram was counted)."""
+        def ratio(a, b):
+            return a / b if b > 0 else 0
+
+        return [
+            ratio(self.stat.match1, self.stat.count1),
+            ratio(self.stat.match2, self.stat.count2),
+            ratio(self.stat.match3, self.stat.count3),
+            ratio(self.stat.match4, self.stat.count4),
+        ]
+
+    def brevity(self):
+        """Return the brevity penalty ``min(1, exp(1 - reflen / predlen))``.
+
+        With no predicted tokens the division is undefined; the penalty is
+        then 0 when there are reference tokens (the limit of the formula)
+        and 1 when the reference is empty as well.
+        """
+        if self.stat.predlen == 0:
+            return 1 if self.stat.reflen == 0 else 0
+        r = self.stat.reflen / self.stat.predlen
+        return min(1, math.exp(1 - r))
+
+    def result_string(self, order=4):
+        """Return a one-line summary: score, per-order precisions, brevity
+        penalty, length ratio and both lengths.
+
+        The length ratio is reported as 0 when the reference length is 0
+        instead of raising ``ZeroDivisionError``.
+        """
+        assert order <= 4, "BLEU scores for order > 4 aren't supported"
+        fmt = "BLEU{} = {:2.2f}, {:2.1f}"
+        for _ in range(1, order):
+            fmt += "/{:2.1f}"
+        fmt += " (BP={:.3f}, ratio={:.3f}, syslen={}, reflen={})"
+        bleup = [p * 100 for p in self.precision()[:order]]
+        length_ratio = (
+            self.stat.predlen / self.stat.reflen if self.stat.reflen > 0 else 0
+        )
+        return fmt.format(
+            order,
+            self.score(order=order),
+            *bleup,
+            self.brevity(),
+            length_ratio,
+            self.stat.predlen,
+            self.stat.reflen
+        )
diff --git a/fairseq/scoring/scoring_utils.py b/fairseq/scoring/scoring_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..0b710d5bb87a9487c3a5554de310f7dda8d1393b
--- /dev/null
+++ b/fairseq/scoring/scoring_utils.py
@@ -0,0 +1,22 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from fairseq import utils
+from . import bleu, build_scoring
+
+
+def build_scorer(args, tgt_dict):
+    """Build the scorer selected by ``args.scoring``.
+
+    The deprecated ``args.sacrebleu`` flag is still honoured: it emits a
+    deprecation warning and forces ``args.scoring`` to ``"sacrebleu"``.
+    The ``"bleu"`` scorer is constructed directly from the dictionary's
+    special symbols; every other choice goes through ``build_scoring``.
+    """
+    if args.sacrebleu:
+        utils.deprecation_warning(
+            "--sacrebleu is deprecated. Please use --scoring sacrebleu instead."
+        )
+        args.scoring = "sacrebleu"
+
+    if args.scoring != "bleu":
+        return build_scoring(args)
+
+    pad_idx = tgt_dict.pad()
+    eos_idx = tgt_dict.eos()
+    unk_idx = tgt_dict.unk()
+    return bleu.Scorer(pad_idx, eos_idx, unk_idx)
diff --git a/fairseq/scoring/wer.py b/fairseq/scoring/wer.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f4521f6cd02f876ab7c4f3f5ef0f058460f0974
--- /dev/null
+++ b/fairseq/scoring/wer.py
@@ -0,0 +1,32 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import editdistance
+
+from fairseq.scoring import register_scoring
+
+
+@register_scoring("wer")
+class WerScorer(object):
+    """Word error rate: word-level edit distance divided by reference length."""
+
+    def __init__(self, *unused):
+        self.reset()
+
+    def reset(self):
+        """Clear the accumulated edit distance and reference word count."""
+        self.distance = 0
+        self.target_length = 0
+
+    def add_string(self, ref, pred):
+        """Accumulate one reference/hypothesis pair.
+
+        Both strings are split on whitespace. The number of *reference*
+        words is added to the denominator; the previous code had the two
+        arguments swapped and normalized by the hypothesis length.
+        """
+        ref_items = ref.split()
+        pred_items = pred.split()
+        self.distance += editdistance.eval(ref_items, pred_items)
+        self.target_length += len(ref_items)
+
+    def result_string(self):
+        """Return the score formatted as ``"WER: <score>"``."""
+        return f"WER: {self.score()}"
+
+    def score(self):
+        """Return WER in percent, or 0 when no reference words were seen."""
+        return (
+            100.0 * self.distance / self.target_length if self.target_length > 0 else 0
+        )
diff --git a/fairseq/search.py b/fairseq/search.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e18581a978e678b4152427c0b8328c35e0b1005
--- /dev/null
+++ b/fairseq/search.py
@@ -0,0 +1,341 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+from typing import Optional, List
+
+import torch
+import torch.nn as nn
+from torch import Tensor
+
+
+class Search(nn.Module):
+    """Base class of the search strategies; subclasses implement ``step``.
+
+    Caches the special symbol indices and the size of ``tgt_dict``.
+    """
+
+    def __init__(self, tgt_dict):
+        super().__init__()
+        self.pad = tgt_dict.pad()
+        self.unk = tgt_dict.unk()
+        self.eos = tgt_dict.eos()
+        self.vocab_size = len(tgt_dict)
+        # placeholder value until set_src_lengths is called
+        self.src_lengths = torch.tensor(-1)
+
+    def step(self, step, lprobs, scores):
+        """Take a single search step.
+
+        Args:
+            step: the current search step, starting at 0
+            lprobs: (bsz x input_beam_size x vocab_size)
+                the model's log-probabilities over the vocabulary at the current step
+            scores: (bsz x input_beam_size x step)
+                the historical model scores of each hypothesis up to this point
+
+        Return: A tuple of (scores, indices, beams) where:
+            scores: (bsz x output_beam_size)
+                the scores of the chosen elements; output_beam_size can be
+                larger than input_beam_size, e.g., we may return
+                2*input_beam_size to account for EOS
+            indices: (bsz x output_beam_size)
+                the indices of the chosen elements
+            beams: (bsz x output_beam_size)
+                the hypothesis ids of the chosen elements, in the range [0, input_beam_size)
+        """
+        raise NotImplementedError
+
+    @torch.jit.export
+    def set_src_lengths(self, src_lengths):
+        """Store the source lengths for strategies that read ``self.src_lengths``."""
+        self.src_lengths = src_lengths
+
+
+class BeamSearch(Search):
+    """Standard beam search: keep the highest-scoring continuations per sentence."""
+
+    def __init__(self, tgt_dict):
+        super().__init__(tgt_dict)
+
+    @torch.jit.export
+    def step(self, step: int, lprobs, scores: Optional[Tensor]):
+        """Select the top candidates over all (beam, token) pairs of each sentence.
+
+        Args:
+            step: current step, starting at 0
+            lprobs: (bsz x beam_size x vocab_size) log-probabilities
+            scores: cumulative scores of the hypotheses; must not be None
+                when ``step > 0`` (only column ``step - 1`` is read)
+
+        Returns:
+            ``(scores_buf, indices_buf, beams_buf)``: the cumulative scores
+            of the selected candidates, their token indices and the index of
+            the beam each candidate extends.
+        """
+        bsz, beam_size, vocab_size = lprobs.size()
+
+        if step == 0:
+            # at the first step all hypotheses are equally likely, so use
+            # only the first beam
+            lprobs = lprobs[:, ::beam_size, :].contiguous()
+        else:
+            # make probs contain cumulative scores for each hypothesis
+            assert scores is not None
+            lprobs = lprobs + scores[:, :, step - 1].unsqueeze(-1)
+
+        top_prediction = torch.topk(
+            lprobs.view(bsz, -1),
+            k=min(
+                # Take the best 2 x beam_size predictions. We'll choose the first
+                # beam_size of these which don't predict eos to continue with.
+                beam_size * 2,
+                lprobs.view(bsz, -1).size(1) - 1,  # -1 so we never select pad
+            ),
+        )
+        scores_buf = top_prediction[0]
+        indices_buf = top_prediction[1]
+        # the flattened index encodes beam * vocab_size + token
+        beams_buf = indices_buf // vocab_size
+        indices_buf = indices_buf.fmod(vocab_size)
+        return scores_buf, indices_buf, beams_buf
+
+
+class LengthConstrainedBeamSearch(Search):
+    """Beam search whose EOS log-probability is overridden by length bounds.
+
+    Per sentence the bounds are ``min_len_a * src_len + min_len_b`` and
+    ``max_len_a * src_len + max_len_b``, where ``src_len`` comes from
+    ``self.src_lengths``.
+    """
+
+    def __init__(self, tgt_dict, min_len_a, min_len_b, max_len_a, max_len_b):
+        super().__init__(tgt_dict)
+        self.min_len_a, self.min_len_b = min_len_a, min_len_b
+        self.max_len_a, self.max_len_b = max_len_a, max_len_b
+        self.beam = BeamSearch(tgt_dict)
+        # flag checked by SequenceGenerator to decide whether it has to
+        # provide the source lengths
+        self.needs_src_lengths = True
+
+    def step(self, step: int, lprobs, scores):
+        """Override the EOS column of ``lprobs`` in place, then run a beam step.
+
+        EOS is set to ``-inf`` for sentences still below their minimum length
+        and to 0 for sentences that reached their maximum length.
+        """
+        below_minimum = step < self.min_len_a * self.src_lengths + self.min_len_b
+        at_maximum = step >= self.max_len_a * self.src_lengths + self.max_len_b
+        lprobs[below_minimum, :, self.eos] = -math.inf
+        lprobs[at_maximum, :, self.eos] = 0
+        return self.beam.step(step, lprobs, scores)
+
+
+class DiverseBeamSearch(Search):
+    """Diverse Beam Search.
+
+    See "Diverse Beam Search: Decoding Diverse Solutions from Neural Sequence
+    Models" for details.
+
+    We only implement the Hamming Diversity penalty here, which performed best
+    in the original paper.
+    """
+
+    def __init__(self, tgt_dict, num_groups, diversity_strength):
+        super().__init__(tgt_dict)
+        self.num_groups = num_groups
+        # stored negated so that it can be used directly as the multiplier of
+        # the penalty added to the log-probabilities
+        self.diversity_strength = -diversity_strength
+        self.beam = BeamSearch(tgt_dict)
+
+    @torch.jit.export
+    def step(self, step: int, lprobs, scores):
+        """Run one beam-search step per group, penalizing tokens that earlier
+        groups already selected at this step.
+
+        Beams are assigned to groups in an interleaved fashion (beam ``i``
+        belongs to group ``i % num_groups``), and the per-group results are
+        interleaved again in the returned tensors.
+
+        Raises:
+            ValueError: if the beam size is not divisible by ``num_groups``.
+        """
+        bsz, beam_size, vocab_size = lprobs.size()
+        if beam_size % self.num_groups != 0:
+            raise ValueError(
+                "DiverseBeamSearch requires --beam to be divisible by the number of groups"
+            )
+
+        # initialize diversity penalty
+        diversity_buf = torch.zeros(lprobs[:, 0, :].size()).to(lprobs)
+
+        scores_G, indices_G, beams_G = [], [], []
+        for g in range(self.num_groups):
+            lprobs_g = lprobs[:, g :: self.num_groups, :]
+            scores_g = scores[:, g :: self.num_groups, :] if step > 0 else None
+
+            # apply diversity penalty
+            if g > 0:
+                # keyword form of torch.add; the positional
+                # (input, alpha, other) signature is deprecated
+                lprobs_g = torch.add(
+                    lprobs_g,
+                    other=diversity_buf.unsqueeze(1),
+                    alpha=self.diversity_strength,
+                )
+            else:
+                lprobs_g = lprobs_g.contiguous()
+
+            scores_buf, indices_buf, beams_buf = self.beam.step(
+                step, lprobs_g, scores_g
+            )
+            # map the group-local beam index back to the global beam index
+            beams_buf.mul_(self.num_groups).add_(g)
+
+            scores_G.append(scores_buf.clone())
+            indices_G.append(indices_buf.clone())
+            beams_G.append(beams_buf.clone())
+
+            # update diversity penalty
+            diversity_buf.scatter_add_(
+                1, indices_buf, torch.ones(indices_buf.size()).to(diversity_buf)
+            )
+
+        # interleave results from different groups
+        scores_buf = torch.stack(scores_G, dim=2).view(bsz, -1)
+        indices_buf = torch.stack(indices_G, dim=2).view(bsz, -1)
+        beams_buf = torch.stack(beams_G, dim=2).view(bsz, -1)
+        return scores_buf, indices_buf, beams_buf
+
+
+class Sampling(Search):
+    """Sample the next token instead of taking the top-scoring candidates.
+
+    Top-P sampling is used when ``sampling_topp > 0``, otherwise top-k
+    sampling when ``sampling_topk > 0``, otherwise sampling over the whole
+    vocabulary.
+    """
+
+    sampling_topk: int
+    sampling_topp: float
+
+    def __init__(self, tgt_dict, sampling_topk=-1, sampling_topp=-1.0):
+        super().__init__(tgt_dict)
+        self.sampling_topk = sampling_topk
+        self.sampling_topp = sampling_topp
+
+    def _sample_topp(self, lprobs):
+        """Sample among the smallest set of elements whose cumulative probability mass exceeds p.
+
+        See `"The Curious Case of Neural Text Degeneration"
+        (Holtzman et al., 2019) <https://arxiv.org/abs/1904.09751>`_.
+
+        Note that ``lprobs`` is exponentiated in place.
+
+        Args:
+            lprobs: (bsz x input_beam_size x vocab_size)
+                the model's log-probabilities over the vocabulary at the current step
+
+        Return: A tuple of (trimed_probs, truncated_indices) where:
+            trimed_probs: (bsz x input_beam_size x ?)
+                the model's probabilities over the elements selected to sample from. The
+                width of the third dimension is determined by top-P.
+            truncated_indices: (bsz x input_beam_size x ?)
+                the indices of the chosen elements.
+        """
+        probs = lprobs.exp_()
+
+        # sort the last dimension (vocab dimension) in descending order
+        sorted_probs, sorted_indices = probs.sort(descending=True)
+
+        # compute a mask to indicate the words to be included in the top-P set.
+        cumsum_probs = sorted_probs.cumsum(dim=2)
+        mask = cumsum_probs.lt(self.sampling_topp)
+
+        # note that mask was computed by 'lt'. One more word needs to be included
+        # so that the cumulative probability mass can exceed p.
+        cumsum_mask = mask.cumsum(dim=2)
+        last_included = cumsum_mask[:, :, -1:]
+        last_included.clamp_(0, mask.size()[2] - 1)
+        mask = mask.scatter_(2, last_included, 1)
+
+        # truncate unnecessary dims.
+        max_dim = last_included.max()
+        truncated_mask = mask[:, :, : max_dim + 1]
+        truncated_probs = sorted_probs[:, :, : max_dim + 1]
+        truncated_indices = sorted_indices[:, :, : max_dim + 1]
+
+        # trim the words that are not in top-P by setting their probabilities
+        # to 0, so that they would not be sampled later.
+        trim_mask = ~truncated_mask
+        trimed_probs = truncated_probs.masked_fill_(trim_mask, 0)
+        return trimed_probs, truncated_indices
+
+    @torch.jit.export
+    def step(self, step: int, lprobs, scores):
+        """Sample one token per hypothesis.
+
+        At step 0 only the first beam is used and ``beam_size`` tokens are
+        drawn from it (with replacement); afterwards one token is drawn for
+        every beam. The log-probabilities are exponentiated in place.
+
+        Returns:
+            ``(scores_buf, indices_buf, beams_buf)``: the log-probability of
+            each sampled token (made cumulative with ``scores`` after step
+            0), the sampled vocabulary indices, and the beam each sample
+            extends (all zeros at step 0, the identity mapping afterwards).
+        """
+        bsz, beam_size, vocab_size = lprobs.size()
+
+        if step == 0:
+            # at the first step all hypotheses are equally likely, so use
+            # only the first beam
+            lprobs = lprobs[:, ::beam_size, :].contiguous()
+
+        if self.sampling_topp > 0:
+            # only sample from the smallest set of words whose cumulative probability mass exceeds p
+            probs, top_indices = self._sample_topp(lprobs)
+        elif self.sampling_topk > 0:
+            # only sample from top-k candidates
+            lprobs, top_indices = lprobs.topk(self.sampling_topk)
+            probs = lprobs.exp_()
+        else:
+            probs = lprobs.exp_()
+
+            # dummy data to be consistent with true branch for type check
+            top_indices = torch.empty(0).to(probs)
+        # sample
+        if step == 0:
+            indices_buf = torch.multinomial(
+                probs.view(bsz, -1), beam_size, replacement=True,
+            ).view(bsz, beam_size)
+        else:
+            indices_buf = torch.multinomial(
+                probs.view(bsz * beam_size, -1),
+                1,
+                replacement=True,
+            ).view(bsz, beam_size)
+
+        if step == 0:
+            # expand to beam size
+            probs = probs.expand(bsz, beam_size, -1)
+
+        # gather scores
+        scores_buf = torch.gather(
+            probs, dim=2, index=indices_buf.unsqueeze(-1)
+        )
+        scores_buf = scores_buf.log_().view(bsz, -1)
+
+        # remap indices if using top-k or top-P sampling
+        # (the sampled positions index into the truncated candidate list,
+        # not into the vocabulary)
+        if self.sampling_topk > 0 or self.sampling_topp > 0:
+            indices_buf = torch.gather(
+                top_indices.expand(bsz, beam_size, -1),
+                dim=2,
+                index=indices_buf.unsqueeze(-1),
+            ).squeeze(2)
+
+        if step == 0:
+            beams_buf = indices_buf.new_zeros(bsz, beam_size)
+        else:
+            beams_buf = torch.arange(0, beam_size).to(indices_buf).repeat(bsz, 1)
+            # make scores cumulative
+            scores_buf.add_(
+                torch.gather(scores[:, :, step - 1], dim=1, index=beams_buf)
+            )
+
+        return scores_buf, indices_buf, beams_buf
+
+
+class DiverseSiblingsSearch(Search):
+    """
+    Beam search with diverse siblings.
+
+    See "A Simple, Fast Diverse Decoding Algorithm for Neural Generation" for details.
+    https://arxiv.org/abs/1611.08562
+
+    1/ Calculate hypotheses for each beam
+    2/ Intra-sibling ordering
+    3/ Rewrite scores
+    4/ Choose top K hypotheses
+
+    if diversity_rate == 0 is equivalent to BeamSearch
+    """
+
+    def __init__(self, tgt_dict, diversity_rate):
+        super().__init__(tgt_dict)
+        self.diversity_rate = diversity_rate
+        self.beam = BeamSearch(tgt_dict)
+
+    def step(self, step: int, lprobs, scores):
+        """Pick candidates after penalizing each beam's continuations by rank.
+
+        Step 0 is delegated to plain beam search. Afterwards ``lprobs`` is
+        made cumulative in place, the top ``k`` continuations of every beam
+        are taken, the continuation ranked ``r`` (1-based) within its beam
+        is penalized by ``r * diversity_rate``, and the best ``k`` candidates
+        over all beams are returned.
+
+        Returns:
+            ``(final_scores, final_indices, final_beams)``: penalized scores,
+            vocabulary indices and originating beam of the chosen candidates.
+        """
+        bsz, beam_size, vocab_size = lprobs.size()
+        k = min(
+            # Take the best 2 x beam_size predictions. We'll choose the first
+            # beam_size of these which don't predict eos to continue with.
+            beam_size * 2,
+            lprobs.view(bsz, -1).size(1) - 1,  # -1 so we never select pad
+        )
+        s_list: List[Tensor]
+        i_list: List[Tensor]
+        # per-beam output buffers that torch.topk fills below
+        s_list = [torch.empty(0).to(lprobs) for i in range(beam_size)]
+        i_list = [torch.LongTensor().to(device=lprobs.device) for i in range(beam_size)]
+        # penalty for the 1st, 2nd, ..., k-th best sibling of a beam
+        sibling_score = torch.arange(1, k + 1).to(lprobs) * self.diversity_rate
+
+        if step == 0:
+            return self.beam.step(step, lprobs, scores)
+        lprobs.add_(scores[:, :, step - 1].unsqueeze(-1))
+
+        # 1/ Calculate hypotheses for each beam
+        for i in range(beam_size):
+            torch.topk(lprobs[:, i, :].view(bsz, -1), k, out=(s_list[i], i_list[i]))
+            i_list[i].fmod_(vocab_size)
+
+            # 2/ Intra-sibling ordering by default from topk + 3/ Rewrite scores
+            s_list[i].sub_(sibling_score)
+
+        # 4/ Choose top K hypotheses
+        indices = torch.stack(i_list, dim=1).view(bsz, -1)
+
+        final_scores = torch.empty(0).to(lprobs)
+        final_indices = torch.LongTensor().to(device=lprobs.device)
+        final_beams = torch.LongTensor().to(device=lprobs.device)
+        (final_scores, final_indices) = torch.topk(
+            torch.stack(s_list, dim=1).view(bsz, -1),
+            k,
+        )
+
+        # candidates are laid out beam-major with k entries per beam
+        final_beams = final_indices // k
+
+        # translate positions in the candidate list into vocabulary indices
+        for i in range(bsz):
+            final_indices[i] = indices[i][final_indices[i]]
+
+        return final_scores, final_indices, final_beams
diff --git a/fairseq/sequence_generator.py b/fairseq/sequence_generator.py
new file mode 100644
index 0000000000000000000000000000000000000000..42012fbbb129033d848eabe0fe79f83f27e62089
--- /dev/null
+++ b/fairseq/sequence_generator.py
@@ -0,0 +1,919 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+from typing import Dict, List, Optional
+
+import torch
+import torch.nn as nn
+from fairseq import search, utils
+from fairseq.data import data_utils
+from fairseq.models import FairseqIncrementalDecoder
+from fairseq.models.fairseq_encoder import EncoderOut
+from torch import Tensor
+
+
+class SequenceGenerator(nn.Module):
+    def __init__(
+        self,
+        models,
+        tgt_dict,
+        beam_size=1,
+        max_len_a=0,
+        max_len_b=200,
+        min_len=1,
+        normalize_scores=True,
+        len_penalty=1.0,
+        unk_penalty=0.0,
+        temperature=1.0,
+        match_source_len=False,
+        no_repeat_ngram_size=0,
+        search_strategy=None,
+        eos=None,
+        symbols_to_strip_from_output=None,
+    ):
+        """Generates translations of a given source sentence.
+
+        Args:
+            models (List[~fairseq.models.FairseqModel]): ensemble of models,
+                currently support fairseq.models.TransformerModel for scripting;
+                an existing EnsembleModel is used as is
+            tgt_dict: target dictionary providing pad/unk/eos and its length
+            beam_size (int, optional): beam width (default: 1); capped at
+                the dictionary size - 1
+            max_len_a/b (int, optional): generate sequences of maximum length
+                ax + b, where x is the source length
+            min_len (int, optional): the minimum length of the generated output
+                (not including end-of-sentence)
+            normalize_scores (bool, optional): normalize scores by the length
+                of the output (default: True)
+            len_penalty (float, optional): length penalty, where <1.0 favors
+                shorter, >1.0 favors longer sentences (default: 1.0)
+            unk_penalty (float, optional): unknown word penalty, where <0
+                produces more unks, >0 produces fewer (default: 0.0)
+            temperature (float, optional): temperature, where values
+                >1.0 produce more uniform samples and values <1.0 produce
+                sharper samples (default: 1.0); must be greater than 0
+            match_source_len (bool, optional): outputs should match the source
+                length (default: False)
+            search_strategy (optional): search object to use; plain beam
+                search when omitted
+            eos (int, optional): end-of-sentence index overriding the one
+                of ``tgt_dict``
+            symbols_to_strip_from_output (set, optional): extra symbols to
+                strip; eos is always added to the set
+        """
+        super().__init__()
+        self.model = models if isinstance(models, EnsembleModel) else EnsembleModel(models)
+
+        self.pad = tgt_dict.pad()
+        self.unk = tgt_dict.unk()
+        if eos is None:
+            self.eos = tgt_dict.eos()
+        else:
+            self.eos = eos
+
+        if symbols_to_strip_from_output is None:
+            self.symbols_to_strip_from_output = {self.eos}
+        else:
+            self.symbols_to_strip_from_output = symbols_to_strip_from_output.union(
+                {self.eos}
+            )
+
+        self.vocab_size = len(tgt_dict)
+        # the max beam size is the dictionary size - 1, since we never select pad
+        self.beam_size = min(beam_size, self.vocab_size - 1)
+        self.max_len_a = max_len_a
+        self.max_len_b = max_len_b
+        self.min_len = min_len
+
+        self.normalize_scores = normalize_scores
+        self.len_penalty = len_penalty
+        self.unk_penalty = unk_penalty
+        self.temperature = temperature
+        self.match_source_len = match_source_len
+        self.no_repeat_ngram_size = no_repeat_ngram_size
+        assert temperature > 0, "--temperature must be greater than 0"
+
+        if search_strategy is None:
+            self.search = search.BeamSearch(tgt_dict)
+        else:
+            self.search = search_strategy
+        # We only need to set src_lengths in LengthConstrainedBeamSearch.
+        # As a module attribute, setting it would break in multithread
+        # settings when the model is shared.
+        self.should_set_src_lengths = getattr(self.search, 'needs_src_lengths', False)
+
+        self.model.eval()
+
+    def cuda(self):
+        """Call ``cuda()`` on the wrapped model and return ``self`` for chaining."""
+        self.model.cuda()
+        return self
+
+    @torch.no_grad()
+    def forward(
+        self,
+        sample: Dict[str, Dict[str, Tensor]],
+        prefix_tokens: Optional[Tensor] = None,
+        bos_token: Optional[int] = None,
+    ):
+        """Generate a batch of translations (thin wrapper around ``_generate``).
+
+        Args:
+            sample (dict): batch
+            prefix_tokens (torch.LongTensor, optional): force decoder to begin
+                with these tokens
+            bos_token (int, optional): beginning of sentence token
+                (default: self.eos)
+        """
+        hypotheses = self._generate(
+            sample, prefix_tokens=prefix_tokens, bos_token=bos_token
+        )
+        return hypotheses
+
+    # TODO(myleott): unused, deprecate after pytorch-translate migration
+    def generate_batched_itr(self, data_itr, beam_size=None, cuda=False, timer=None):
+        """Iterate over a batched dataset and yield individual translations.
+
+        Samples without a ``"net_input"`` entry are skipped.
+
+        Args:
+            data_itr: iterable of sample dicts
+            beam_size: accepted but not used by this method
+            cuda (bool, optional): use GPU for generation
+            timer (StopwatchMeter, optional): time generations
+
+        Yields:
+            ``(id, src, ref, hypos)`` for every sentence of every sample,
+            where ``src`` and ``ref`` have their padding stripped and ``ref``
+            is ``None`` when the sample has no target.
+        """
+        for sample in data_itr:
+            s = utils.move_to_cuda(sample) if cuda else sample
+            if "net_input" not in s:
+                continue
+            net_input = s["net_input"]
+            # model.forward normally channels prev_output_tokens into the decoder
+            # separately, but SequenceGenerator directly calls model.encoder
+            encoder_input = {
+                k: v for k, v in net_input.items() if k != "prev_output_tokens"
+            }
+            if timer is not None:
+                timer.start()
+            with torch.no_grad():
+                # _generate reads sample["net_input"], so the filtered input
+                # is wrapped again; the previous call self.generate(encoder_input)
+                # did not match generate(models, sample) and raised TypeError
+                hypos = self._generate({"net_input": encoder_input})
+            if timer is not None:
+                timer.stop(sum(len(h[0]["tokens"]) for h in hypos))
+            for i, sample_id in enumerate(s["id"].data):
+                # remove padding
+                src = utils.strip_pad(net_input["src_tokens"].data[i, :], self.pad)
+                ref = (
+                    utils.strip_pad(s["target"].data[i, :], self.pad)
+                    if s["target"] is not None
+                    else None
+                )
+                yield sample_id, src, ref, hypos[i]
+
+    @torch.no_grad()
+    def generate(self, models, sample: Dict[str, Dict[str, Tensor]], **kwargs):
+        """Generate translations. Match the api of other fairseq generators.
+
+        Args:
+            models (List[~fairseq.models.FairseqModel]): ensemble of models;
+                accepted to match the shared generator API but not read by
+                this method
+            sample (dict): batch
+            prefix_tokens (torch.LongTensor, optional): force decoder to begin
+                with these tokens
+            bos_token (int, optional): beginning of sentence token
+                (default: self.eos)
+        """
+        # remaining keyword arguments are forwarded unchanged to _generate
+        return self._generate(sample, **kwargs)
+
+    def _generate(
+        self,
+        sample: Dict[str, Dict[str, Tensor]],
+        prefix_tokens: Optional[Tensor] = None,
+        bos_token: Optional[int] = None,
+    ):  # beam search; returns, for each input sentence, its finalized hypotheses sorted best-first
+        incremental_states = torch.jit.annotate(
+            List[Dict[str, Dict[str, Optional[Tensor]]]],
+            [
+                torch.jit.annotate(Dict[str, Dict[str, Optional[Tensor]]], {})
+                for i in range(self.model.models_size)
+            ],
+        )  # one (initially empty) incremental-state dict per ensemble member
+        net_input = sample["net_input"]
+
+        if 'src_tokens' in net_input:
+            src_tokens = net_input['src_tokens']
+            # source length = number of tokens that are neither EOS nor pad
+            src_lengths = (src_tokens.ne(self.eos) & src_tokens.ne(self.pad)).long().sum(dim=1)
+        elif 'source' in net_input:
+            src_tokens = net_input['source']
+            src_lengths = net_input['padding_mask'].size(-1) - net_input['padding_mask'].sum(-1) if net_input['padding_mask'] is not None else torch.tensor(src_tokens.size(-1))  # NOTE(review): this fallback is a 0-dim tensor, unlike the per-sentence lengths otherwise — confirm later indexing copes
+        else:
+            raise Exception('expected src_tokens or source in net input')
+
+        # bsz: size of dim 0 of the input (sentences in the batch); src_len: size of dim 1
+        input_size = src_tokens.size()
+        bsz, src_len = input_size[0], input_size[1]
+        beam_size = self.beam_size
+
+        max_len: int = -1
+        if self.match_source_len:
+            max_len = src_lengths.max().item()
+        else:
+            max_len = min(
+                int(self.max_len_a * src_len + self.max_len_b),
+                # exclude the EOS marker
+                self.model.max_decoder_positions() - 1,
+            )
+        assert (
+            self.min_len <= max_len
+        ), "min_len cannot be larger than max_len, please adjust these!"
+        # compute the encoder output once per sentence; it is expanded per beam just below
+        encoder_outs = self.model.forward_encoder(net_input)
+
+        # placeholder of indices for bsz * beam_size to hold tokens and accumulative scores
+        new_order = torch.arange(bsz).view(-1, 1).repeat(1, beam_size).view(-1)
+        new_order = new_order.to(src_tokens.device).long()
+        encoder_outs = self.model.reorder_encoder_out(encoder_outs, new_order)
+        # ensure encoder_outs is a List.
+        assert encoder_outs is not None
+
+        # initialize buffers
+        scores = (
+            torch.zeros(bsz * beam_size, max_len + 1).to(src_tokens).float()
+        )  # +1 for eos; pad is never chosen for scoring
+        tokens = (
+            torch.zeros(bsz * beam_size, max_len + 2)
+            .to(src_tokens)
+            .long()
+            .fill_(self.pad)
+        )  # +2 for eos and pad
+        tokens[:, 0] = self.eos if bos_token is None else bos_token  # position 0 holds the BOS token (EOS by default)
+        attn: Optional[Tensor] = None
+
+        # A list that indicates candidates that should be ignored.
+        # For example, suppose we're sampling and have already finalized 2/5
+        # samples. Then cands_to_ignore would mark 2 positions as being ignored,
+        # so that we only finalize the remaining 3 samples.
+        cands_to_ignore = (
+            torch.zeros(bsz, beam_size).to(src_tokens).eq(-1)
+        )  # forward and backward-compatible False mask
+
+        # list of completed sentences
+        finalized = torch.jit.annotate(
+            List[List[Dict[str, Tensor]]],
+            [torch.jit.annotate(List[Dict[str, Tensor]], []) for i in range(bsz)],
+        )  # contains lists of dictionaries of information about the hypothesis being finalized at each step
+
+        finished = [
+            False for i in range(bsz)
+        ]  # a boolean array indicating if the sentence at the index is finished or not
+        num_remaining_sent = bsz  # number of sentences remaining
+
+        # number of candidate hypos per step
+        cand_size = 2 * beam_size  # 2 x beam size in case half are EOS
+
+        # offset arrays for converting between different indexing schemes
+        bbsz_offsets = (torch.arange(0, bsz) * beam_size).unsqueeze(1).type_as(tokens)
+        cand_offsets = torch.arange(0, cand_size).type_as(tokens)
+
+        reorder_state: Optional[Tensor] = None
+        batch_idxs: Optional[Tensor] = None
+        for step in range(max_len + 1):  # one extra step for EOS marker
+            # reorder decoder internal states based on the prev choice of beams
+            # (skipped on the first step: reorder_state is still None until beams are selected)
+            if reorder_state is not None:
+                if batch_idxs is not None:
+                    # update beam indices to take into account removed sentences
+                    corr = batch_idxs - torch.arange(batch_idxs.numel()).type_as(
+                        batch_idxs
+                    )
+                    reorder_state.view(-1, beam_size).add_(
+                        corr.unsqueeze(-1) * beam_size
+                    )
+                self.model.reorder_incremental_state(incremental_states, reorder_state)
+                encoder_outs = self.model.reorder_encoder_out(
+                    encoder_outs, reorder_state
+                )
+
+            lprobs, avg_attn_scores = self.model.forward_decoder(
+                tokens[:, : step + 1],
+                encoder_outs,
+                incremental_states,
+                self.temperature,
+            )
+            lprobs[lprobs != lprobs] = torch.tensor(-math.inf).to(lprobs)  # NaN != NaN, so this replaces NaNs with -inf
+
+            lprobs[:, self.pad] = -math.inf  # never select pad
+            lprobs[:, self.unk] -= self.unk_penalty  # apply unk penalty
+
+            # handle max length constraint: only EOS remains selectable
+            if step >= max_len:
+                lprobs[:, : self.eos] = -math.inf
+                lprobs[:, self.eos + 1 :] = -math.inf
+
+            # handle prefix tokens (possibly with different lengths)
+            if (
+                prefix_tokens is not None
+                and step < prefix_tokens.size(1)
+                and step < max_len
+            ):
+                lprobs, tokens, scores = self._prefix_tokens(
+                    step, lprobs, scores, tokens, prefix_tokens, beam_size
+                )
+            elif step < self.min_len:
+                # minimum length constraint (does not apply if using prefix_tokens)
+                lprobs[:, self.eos] = -math.inf
+
+            # Record attention scores, only support avg_attn_scores is a Tensor
+            if avg_attn_scores is not None:
+                if attn is None:
+                    attn = torch.empty(
+                        bsz * beam_size, avg_attn_scores.size(1), max_len + 2
+                    ).to(scores)
+                attn[:, :, step + 1].copy_(avg_attn_scores)
+
+            scores = scores.type_as(lprobs)
+            eos_bbsz_idx = torch.empty(0).to(
+                tokens
+            )  # indices of hypothesis ending with eos (finished sentences)
+            eos_scores = torch.empty(0).to(
+                scores
+            )  # scores of hypothesis ending with eos (finished sentences)
+
+            if self.should_set_src_lengths:
+                self.search.set_src_lengths(src_lengths)
+
+            if self.no_repeat_ngram_size > 0:
+                lprobs = self._no_repeat_ngram(tokens, lprobs, bsz, beam_size, step)
+
+            cand_scores, cand_indices, cand_beams = self.search.step(
+                step,
+                lprobs.view(bsz, -1, self.vocab_size),
+                scores.view(bsz, beam_size, -1)[:, :, :step],
+            )
+
+            # cand_bbsz_idx contains beam indices for the top candidate
+            # hypotheses, with a range of values: [0, bsz*beam_size),
+            # and dimensions: [bsz, cand_size]
+            cand_bbsz_idx = cand_beams.add(bbsz_offsets)
+
+            # finalize hypotheses that end in eos
+            eos_mask = cand_indices.eq(self.eos) & cand_scores.ne(-math.inf)
+            eos_mask[:, :beam_size][cands_to_ignore] = torch.tensor(0).to(eos_mask)
+
+            # only consider eos when it's among the top beam_size indices
+            eos_bbsz_idx = torch.masked_select(
+                cand_bbsz_idx[:, :beam_size], mask=eos_mask[:, :beam_size]
+            )
+
+            finalized_sents: List[int] = []
+            if eos_bbsz_idx.numel() > 0:
+                eos_scores = torch.masked_select(
+                    cand_scores[:, :beam_size], mask=eos_mask[:, :beam_size]
+                )
+                finalized_sents = self.finalize_hypos(
+                    step,
+                    eos_bbsz_idx,
+                    eos_scores,
+                    tokens,
+                    scores,
+                    finalized,
+                    finished,
+                    beam_size,
+                    attn,
+                    src_lengths,
+                    max_len,
+                )
+                num_remaining_sent -= len(finalized_sents)
+
+            assert num_remaining_sent >= 0
+            if num_remaining_sent == 0:
+                break
+            assert step < max_len
+
+            if len(finalized_sents) > 0:
+                new_bsz = bsz - len(finalized_sents)
+
+                # construct batch_idxs which holds indices of batches to keep for the next pass
+                batch_mask = torch.ones(bsz).to(cand_indices)
+                batch_mask[
+                    torch.tensor(finalized_sents).to(cand_indices)
+                ] = torch.tensor(0).to(batch_mask)
+                batch_idxs = batch_mask.nonzero().squeeze(-1)
+
+                eos_mask = eos_mask[batch_idxs]
+                cand_beams = cand_beams[batch_idxs]
+                bbsz_offsets.resize_(new_bsz, 1)
+                cand_bbsz_idx = cand_beams.add(bbsz_offsets)
+                cand_scores = cand_scores[batch_idxs]
+                cand_indices = cand_indices[batch_idxs]
+
+                if prefix_tokens is not None:
+                    prefix_tokens = prefix_tokens[batch_idxs]
+                src_lengths = src_lengths[batch_idxs]
+                cands_to_ignore = cands_to_ignore[batch_idxs]
+
+                scores = scores.view(bsz, -1)[batch_idxs].view(new_bsz * beam_size, -1)
+                tokens = tokens.view(bsz, -1)[batch_idxs].view(new_bsz * beam_size, -1)
+                if attn is not None:
+                    attn = attn.view(bsz, -1)[batch_idxs].view(
+                        new_bsz * beam_size, attn.size(1), -1
+                    )
+                bsz = new_bsz
+            else:
+                batch_idxs = None
+            # set active_mask so that values >= cand_size indicate eos (or ignored) hypos
+            # and values < cand_size indicate candidate active hypos.
+            # After, the min values per row are the top candidate active hypos
+
+            # Same as (cands_to_ignore | eos_mask[:, :beam_size]), written via De Morgan since the element wise or is not supported in torchscript.
+
+            eos_mask[:, :beam_size] = ~((~cands_to_ignore) & (~eos_mask[:, :beam_size]))
+            active_mask = torch.add(
+                eos_mask.type_as(cand_offsets) * cand_size,
+                cand_offsets[: eos_mask.size(1)],
+            )
+
+            # get the top beam_size active hypotheses, which are just the hypos
+            # with the smallest values in active_mask
+            new_cands_to_ignore, active_hypos = torch.topk(
+                active_mask, k=beam_size, dim=1, largest=False
+            )
+
+            # update cands_to_ignore to ignore any finalized hypos
+            cands_to_ignore = new_cands_to_ignore.ge(cand_size)[:, :beam_size]
+            assert (~cands_to_ignore).any(dim=1).all()
+
+            active_bbsz_idx = torch.gather(cand_bbsz_idx, dim=1, index=active_hypos)
+            active_scores = torch.gather(cand_scores, dim=1, index=active_hypos)
+
+            active_bbsz_idx = active_bbsz_idx.view(-1)
+            active_scores = active_scores.view(-1)
+
+            # copy tokens and scores for active hypotheses
+            tokens[:, : step + 1] = torch.index_select(
+                tokens[:, : step + 1], dim=0, index=active_bbsz_idx
+            )
+            tokens.view(bsz, beam_size, -1)[:, :, step + 1] = torch.gather(
+                cand_indices, dim=1, index=active_hypos
+            )
+            if step > 0:
+                scores[:, :step] = torch.index_select(
+                    scores[:, :step], dim=0, index=active_bbsz_idx
+                )
+            scores.view(bsz, beam_size, -1)[:, :, step] = torch.gather(
+                cand_scores, dim=1, index=active_hypos
+            )
+
+            # copy attention for active hypotheses
+            if attn is not None:
+                attn[:, :, : step + 2] = torch.index_select(
+                    attn[:, :, : step + 2], dim=0, index=active_bbsz_idx
+                )
+
+            # reorder incremental state in decoder
+            reorder_state = active_bbsz_idx
+
+        # sort by score descending
+        for sent in range(len(finalized)):
+            # make into beam container
+            BCList = [
+                BeamContainer(elem["score"].item(), elem) for elem in finalized[sent]
+            ]
+            BCList.sort()
+            BCList.reverse()  # ascending sort followed by reverse gives best-first order
+            finalized[sent] = torch.jit.annotate(
+                List[Dict[str, Tensor]], [x.elem for x in BCList]
+            )
+
+        return finalized
+
+    def _prefix_tokens(
+        self, step: int, lprobs, scores, tokens, prefix_tokens, beam_size: int
+    ):
+        """Constrain `lprobs` at `step` so hypotheses follow `prefix_tokens`; pad entries are left unconstrained."""
+        prefix_toks = prefix_tokens[:, step].unsqueeze(-1).repeat(1, beam_size).view(-1)  # repeated once per beam
+        prefix_lprobs = lprobs.gather(-1, prefix_toks.unsqueeze(-1))
+        prefix_mask = prefix_toks.ne(self.pad)
+        lprobs[prefix_mask] = torch.tensor(-math.inf).to(lprobs)  # forbid every token on constrained rows ...
+        lprobs[prefix_mask] = lprobs[prefix_mask].scatter(  # ... then restore the prefix token's own log-prob
+            -1, prefix_toks[prefix_mask].unsqueeze(-1), prefix_lprobs[prefix_mask]
+        )
+        # if prefix includes eos, then we should make sure tokens and
+        # scores are the same across all beams
+        eos_mask = prefix_toks.eq(self.eos)
+        if eos_mask.any():
+            # validate that the first beam matches the prefix
+            first_beam = tokens[eos_mask].view(-1, beam_size, tokens.size(-1))[
+                :, 0, 1 : step + 1
+            ]
+            eos_mask_batch_dim = eos_mask.view(-1, beam_size)[:, 0]  # one flag per sentence instead of per beam
+            target_prefix = prefix_tokens[eos_mask_batch_dim][:, :step]
+            assert (first_beam == target_prefix).all()
+
+            # copy tokens, scores and lprobs from the first beam to all beams
+            tokens = self.replicate_first_beam(tokens, eos_mask_batch_dim, beam_size)
+            scores = self.replicate_first_beam(scores, eos_mask_batch_dim, beam_size)
+            lprobs = self.replicate_first_beam(lprobs, eos_mask_batch_dim, beam_size)
+        return lprobs, tokens, scores
+
+    def replicate_first_beam(self, tensor, mask, beam_size: int):
+        per_sent = tensor.view(-1, beam_size, tensor.size(-1))  # (sentences, beams, last dim)
+        per_sent[mask] = per_sent[mask][:, :1, :]  # broadcast beam 0 over every beam of the masked sentences
+        return per_sent.view(-1, per_sent.size(-1))
+
+    def finalize_hypos(
+        self,
+        step: int,
+        bbsz_idx,
+        eos_scores,
+        tokens,
+        scores,
+        finalized: List[List[Dict[str, Tensor]]],
+        finished: List[bool],
+        beam_size: int,
+        attn: Optional[Tensor],
+        src_lengths,
+        max_len: int,
+    ):
+        """Finalize hypotheses: append them to `finalized` and mark sentences in `finished` (both in place).
+        Returns the `unfin_idx` values (a list of ints) of the sentences that became finished at this step.
+        Args:
+            bbsz_idx (Tensor): rows of `tokens`/`scores` to finalize; `eos_scores` holds one score per row
+        """
+        assert bbsz_idx.numel() == eos_scores.numel()
+
+        # clone relevant token and attention tensors
+        tokens_clone = tokens.index_select(0, bbsz_idx)[
+            :, 1 : step + 2
+        ]  # skip the first index, which is EOS
+
+        tokens_clone[:, step] = self.eos  # the last kept position is always written as EOS
+        attn_clone = (
+            attn.index_select(0, bbsz_idx)[:, :, 1 : step + 2]
+            if attn is not None
+            else None
+        )
+
+        # compute scores per token position
+        pos_scores = scores.index_select(0, bbsz_idx)[:, : step + 1]
+        pos_scores[:, step] = eos_scores
+        # convert from cumulative to per-position scores
+        pos_scores[:, 1:] = pos_scores[:, 1:] - pos_scores[:, :-1]
+
+        # normalize sentence-level scores (in place: the caller's eos_scores tensor is modified)
+        if self.normalize_scores:
+            eos_scores /= (step + 1) ** self.len_penalty
+
+        cum_unfin: List[int] = []  # for each unfinished sentence: how many finished sentences precede it
+        prev = 0
+        for f in finished:
+            if f:
+                prev += 1
+            else:
+                cum_unfin.append(prev)
+
+        # set() is not supported in script export, so a dict with None values stands in for a set
+        sents_seen: Dict[str, Optional[Tensor]] = {}
+        for i in range(bbsz_idx.size()[0]):
+            idx = bbsz_idx[i]
+            score = eos_scores[i]
+            unfin_idx = idx // beam_size  # sentence index within the current (shrunken) batch
+            sent = unfin_idx + cum_unfin[unfin_idx]  # index into `finalized` / `finished`
+            # Cannot create dict for key type '(int, int)' in torchscript.
+            # The workaround is to cast int to string
+            seen = str(sent.item()) + "_" + str(unfin_idx.item())
+            if seen not in sents_seen:
+                sents_seen[seen] = None
+
+            if self.match_source_len and step > src_lengths[unfin_idx]:
+                score = torch.tensor(-math.inf).to(score)  # step exceeds the source length: score forced to -inf
+
+            if len(finalized[sent]) < beam_size:  # keep at most beam_size hypotheses per sentence
+                if attn_clone is not None:
+                    # attention rows for this hypothesis, taken as-is from attn_clone
+                    hypo_attn = attn_clone[i]
+                else:
+                    hypo_attn = torch.empty(0)
+                finalized[sent].append(
+                    {
+                        "tokens": tokens_clone[i],
+                        "score": score,
+                        "attention": hypo_attn,  # src_len x tgt_len
+                        "alignment": torch.empty(0),
+                        "positional_scores": pos_scores[i],
+                    }
+                )
+
+        newly_finished: List[int] = []
+        for seen in sents_seen.keys():
+            # check termination conditions for this sentence
+            sent: int = int(float(seen.split("_")[0]))  # decode the "sent_unfinidx" string key built above
+            unfin_idx: int = int(float(seen.split("_")[1]))
+            if not finished[sent] and self.is_finished(
+                step, unfin_idx, max_len, len(finalized[sent]), beam_size
+            ):
+                finished[sent] = True
+                newly_finished.append(unfin_idx)
+        return newly_finished
+
+    def is_finished(
+        self,
+        step: int,
+        unfin_idx: int,
+        max_len: int,
+        finalized_sent_len: int,
+        beam_size: int,
+    ):
+        """
+        Return True once a sentence needs no further decoding: either it has
+        `beam_size` finalized hypotheses or `step` has reached `max_len`.
+        `unfin_idx` is accepted but not consulted.
+        """
+        assert finalized_sent_len <= beam_size
+        beam_is_full = finalized_sent_len == beam_size
+        hit_length_limit = step == max_len
+        return beam_is_full or hit_length_limit
+
+    def calculate_banned_tokens(
+        self,
+        tokens,
+        step: int,
+        gen_ngrams: List[Dict[str, List[int]]],
+        no_repeat_ngram_size: int,
+        bbsz_idx: int,
+    ):
+        """Return the tokens recorded in `gen_ngrams` as following hypothesis `bbsz_idx`'s current context."""
+        # the most recent (no_repeat_ngram_size - 1) tokens form the lookup key
+        context_start = step + 2 - no_repeat_ngram_size
+        context: List[int] = tokens[bbsz_idx, context_start : step + 1].tolist()
+        lookup_key = ",".join([str(tok) for tok in context])
+        return gen_ngrams[bbsz_idx].get(lookup_key, torch.jit.annotate(List[int], []))
+
+    def transpose_list(self, l: List[List[int]]):
+        # Turn rows into columns, truncating to the shortest row.
+        # Built from list comprehensions because GeneratorExp isn't supported in TS.
+        shortest = min([len(row) for row in l])  # noqa
+        return [[row[col] for row in l] for col in range(shortest)]
+
+    def _no_repeat_ngram(self, tokens, lprobs, bsz: int, beam_size: int, step: int):
+        """Set to -inf (in place) the log-prob of every token that would repeat an
+        n-gram already present in a hypothesis; returns `lprobs`."""
+        num_hypos = bsz * beam_size
+        # for each beam and batch sentence, generate a list of previous ngrams
+        gen_ngrams: List[Dict[str, List[int]]] = [
+            torch.jit.annotate(Dict[str, List[int]], {}) for bbsz_idx in range(num_hypos)
+        ]
+        cpu_tokens = tokens.cpu()
+        for bbsz_idx in range(num_hypos):
+            gen_tokens: List[int] = cpu_tokens[bbsz_idx].tolist()
+            for ngram in self.transpose_list(
+                [gen_tokens[i:] for i in range(self.no_repeat_ngram_size)]
+            ):
+                key = ",".join([str(x) for x in ngram[:-1]])
+                gen_ngrams[bbsz_idx][key] = gen_ngrams[bbsz_idx].get(
+                    key, torch.jit.annotate(List[int], [])
+                ) + [ngram[-1]]
+
+        if step + 2 - self.no_repeat_ngram_size >= 0:
+            # enough tokens have been generated to form a lookup context
+            banned_tokens = [
+                self.calculate_banned_tokens(
+                    tokens, step, gen_ngrams, self.no_repeat_ngram_size, bbsz_idx
+                )
+                for bbsz_idx in range(num_hypos)
+            ]
+        else:
+            # no banned tokens if we haven't generated no_repeat_ngram_size tokens yet
+            banned_tokens = [torch.jit.annotate(List[int], []) for bbsz_idx in range(num_hypos)]
+        # Build -inf with lprobs' own dtype/device; a hard-coded float32 value does not match fp16 lprobs.
+        neg_inf = torch.tensor(-math.inf).to(lprobs)
+        for bbsz_idx in range(num_hypos):
+            lprobs[bbsz_idx][torch.tensor(banned_tokens[bbsz_idx]).long()] = neg_inf
+        return lprobs
+
+
+class EnsembleModel(nn.Module):
+    """Wraps a list of models so they can be run together; log-probs and attention are averaged."""
+
+    def __init__(self, models):
+        super().__init__()
+        self.models_size = len(models)
+        # method '__len__' is not supported in ModuleList for torch script
+        self.single_model = models[0]
+        self.models = nn.ModuleList(models)
+
+        self.has_incremental: bool = False
+        if all(
+            hasattr(m, "decoder") and isinstance(m.decoder, FairseqIncrementalDecoder)
+            for m in models
+        ):  # incremental decoding is used only when every model supports it
+            self.has_incremental = True
+
+    def forward(self):  # no-op; the exported forward_encoder / forward_decoder methods do the work
+        pass
+
+    def has_encoder(self):  # decided from the first model only
+        return hasattr(self.single_model, "encoder")
+
+    def has_incremental_states(self):
+        return self.has_incremental
+
+    def max_decoder_positions(self):  # the most restrictive limit across the ensemble
+        return min([m.max_decoder_positions() for m in self.models])
+
+    @torch.jit.export
+    def forward_encoder(self, net_input: Dict[str, Tensor]):  # one encoder output per model, or None without an encoder
+        if not self.has_encoder():
+            return None
+        return [
+            model.encoder.forward_torchscript(net_input)
+            for model in self.models
+        ]
+
+    @torch.jit.export
+    def forward_decoder(
+        self,
+        tokens,
+        encoder_outs: List[EncoderOut],
+        incremental_states: List[Dict[str, Dict[str, Optional[Tensor]]]],
+        temperature: float = 1.0,
+    ):  # returns (log-probs for the last position, attention for the last position or None)
+        log_probs = []
+        avg_attn: Optional[Tensor] = None
+        encoder_out: Optional[EncoderOut] = None
+        for i, model in enumerate(self.models):
+            if self.has_encoder():
+                encoder_out = encoder_outs[i]
+            # decode each model
+            if self.has_incremental_states():
+                decoder_out = model.decoder.forward(
+                    tokens,
+                    encoder_out=encoder_out,
+                    incremental_state=incremental_states[i],
+                )
+            else:
+                decoder_out = model.decoder.forward(tokens, encoder_out=encoder_out)
+
+            attn: Optional[Tensor] = None
+            decoder_len = len(decoder_out)
+            if decoder_len > 1 and decoder_out[1] is not None:
+                if isinstance(decoder_out[1], Tensor):
+                    attn = decoder_out[1]
+                else:
+                    attn_holder = decoder_out[1]["attn"]  # either a Tensor, a sequence whose first item is used, or None
+                    if isinstance(attn_holder, Tensor):
+                        attn = attn_holder
+                    elif attn_holder is not None:
+                        attn = attn_holder[0]
+                if attn is not None:
+                    attn = attn[:, -1, :]  # keep only the last position along dim 1
+
+            decoder_out_tuple = (
+                decoder_out[0][:, -1:, :].div_(temperature),  # in-place temperature scaling of the last position
+                None if decoder_len <= 1 else decoder_out[1],
+            )
+
+            probs = model.get_normalized_probs(
+                decoder_out_tuple, log_probs=True, sample=None
+            )
+            probs = probs[:, -1, :]
+            if self.models_size == 1:  # single model: nothing to average
+                return probs, attn
+
+            log_probs.append(probs)
+            if attn is not None:
+                if avg_attn is None:
+                    avg_attn = attn
+                else:
+                    avg_attn.add_(attn)  # in-place: accumulates into the first model's attention tensor
+        avg_probs = torch.logsumexp(torch.stack(log_probs, dim=0), dim=0) - math.log(
+            self.models_size
+        )  # log of the mean probability across models
+        if avg_attn is not None:
+            avg_attn.div_(self.models_size)
+        return avg_probs, avg_attn
+
+    @torch.jit.export
+    def reorder_encoder_out(self, encoder_outs: Optional[List[EncoderOut]], new_order):
+        """
+        Reorder each model's encoder output according to *new_order*.
+
+        Args:
+            encoder_outs: per-model encoder outputs (as returned by forward_encoder)
+            new_order (LongTensor): desired order
+
+        Returns:
+            a new list of reordered outputs; empty if there is no encoder
+        """
+        new_outs: List[EncoderOut] = []
+        if not self.has_encoder():
+            return new_outs
+        for i, model in enumerate(self.models):
+            assert encoder_outs is not None
+            new_outs.append(
+                model.encoder.reorder_encoder_out(encoder_outs[i], new_order)
+            )
+        return new_outs
+
+    @torch.jit.export
+    def reorder_incremental_state(
+        self,
+        incremental_states: List[Dict[str, Dict[str, Optional[Tensor]]]],
+        new_order,
+    ):  # delegates to each decoder with its own state dict; does nothing without incremental decoding
+        if not self.has_incremental_states():
+            return
+        for i, model in enumerate(self.models):
+            model.decoder.reorder_incremental_state_scripting(
+                incremental_states[i], new_order
+            )
+
+
+class SequenceGeneratorWithAlignment(SequenceGenerator):
+    def __init__(self, models, tgt_dict, left_pad_target=False, **kwargs):
+        """Generates translations of a given source sentence.
+
+        Produces alignments following "Jointly Learning to Align and
+        Translate with Transformer Models" (Garg et al., EMNLP 2019).
+
+        Args:
+            left_pad_target (bool, optional): Whether or not the
+                hypothesis should be left padded or not when they are
+                teacher forced for generating alignments.
+        """
+        super().__init__(EnsembleModelWithAlignment(models), tgt_dict, **kwargs)
+        self.left_pad_target = left_pad_target
+
+    @torch.no_grad()
+    def generate(self, models, sample, **kwargs):
+        """Generate hypotheses, then fill in each hypothesis' "alignment" entry.
+
+        `models` is unused; indexing assumes `beam_size` hypotheses per sentence.
+        """
+        finalized = super()._generate(sample, **kwargs)
+
+        src_tokens = sample["net_input"]["src_tokens"]
+        bsz = src_tokens.shape[0]
+        beam_size = self.beam_size
+        src_tokens, src_lengths, prev_output_tokens, tgt_tokens = self._prepare_batch_for_alignment(
+            sample, finalized
+        )
+        if any(getattr(m, "full_context_alignment", False) for m in self.model.models):
+            attn = self.model.forward_align(src_tokens, src_lengths, prev_output_tokens)
+        else:
+            attn = [
+                finalized[i // beam_size][i % beam_size]["attention"].transpose(1, 0)
+                for i in range(bsz * beam_size)
+            ]
+
+        # Compare the device *type*: a torch.device object is not a str.
+        if src_tokens.device.type != "cpu":
+            src_tokens = src_tokens.to('cpu')
+            tgt_tokens = tgt_tokens.to('cpu')
+            attn = [i.to('cpu') for i in attn]
+
+        # Process the attn matrix to extract hard alignments.
+        for i in range(bsz * beam_size):
+            alignment = utils.extract_hard_alignment(
+                attn[i], src_tokens[i], tgt_tokens[i], self.pad, self.eos
+            )
+            finalized[i // beam_size][i % beam_size]["alignment"] = alignment
+        return finalized
+
+    def _prepare_batch_for_alignment(self, sample, hypothesis):
+        """Repeat the source batch once per beam and collate the hypothesis tokens."""
+        net_input = sample["net_input"]
+        src_tokens = net_input["src_tokens"]
+        bsz = src_tokens.shape[0]
+        src_tokens = (
+            src_tokens[:, None, :]
+            .expand(-1, self.beam_size, -1)
+            .contiguous()
+            .view(bsz * self.beam_size, -1)
+        )
+        src_lengths = (
+            net_input["src_lengths"][:, None]
+            .expand(-1, self.beam_size)
+            .contiguous()
+            .view(bsz * self.beam_size)
+        )
+        hypo_tokens = [beam["tokens"] for example in hypothesis for beam in example]
+        # same tokens collated twice: once with move_eos_to_beginning=True, once without
+        prev_output_tokens = data_utils.collate_tokens(
+            hypo_tokens, self.pad, self.eos, self.left_pad_target, move_eos_to_beginning=True
+        )
+        tgt_tokens = data_utils.collate_tokens(
+            hypo_tokens, self.pad, self.eos, self.left_pad_target, move_eos_to_beginning=False
+        )
+        return src_tokens, src_lengths, prev_output_tokens, tgt_tokens
+
+
+class EnsembleModelWithAlignment(EnsembleModel):
+    """Ensemble wrapper that can also return attention averaged over its models."""
+
+    def __init__(self, models):
+        super().__init__(models)
+
+    def forward_align(self, src_tokens, src_lengths, prev_output_tokens):
+        """Run each model's full forward pass and average their "attn" outputs."""
+        attn_sum = None
+        for member in self.models:
+            member_attn = member(src_tokens, src_lengths, prev_output_tokens)[1]["attn"]
+            if attn_sum is None:
+                attn_sum = member_attn
+            else:
+                attn_sum.add_(member_attn)  # in-place accumulation into the first model's tensor
+        if len(self.models) > 1:
+            attn_sum.div_(len(self.models))
+        return attn_sum
+
+
+@torch.jit.script
+class BeamContainer(object):  # TorchScript-compiled (score, elem) pair that orders by score
+    def __init__(self, score: float, elem: Dict[str, Tensor]):
+        self.score = score  # key used by __lt__
+        self.elem = elem  # payload carried alongside the score
+
+    def __lt__(self, other):
+        # type: (BeamContainer) -> bool
+        # Due to https://github.com/pytorch/pytorch/issues/20388,
+        # this has to use old style type annotations
+        # Match original behavior of sorted function when two scores are equal.
+        return self.score <= other.score  # "<=" rather than "<": equal scores also compare as less-than
diff --git a/fairseq/sequence_scorer.py b/fairseq/sequence_scorer.py
new file mode 100644
index 0000000000000000000000000000000000000000..343c29acc2292d2d1a86cbed3af035c039b4c36f
--- /dev/null
+++ b/fairseq/sequence_scorer.py
@@ -0,0 +1,133 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import sys
+
+from fairseq import utils
+
+
+class SequenceScorer(object):
+    """Scores the target for a given source sentence."""
+
+    def __init__(
+        self, tgt_dict, softmax_batch=None, compute_alignment=False, eos=None,
+        symbols_to_strip_from_output=None,
+    ):
+        self.pad = tgt_dict.pad()
+        self.eos = tgt_dict.eos() if eos is None else eos  # explicit eos overrides the dictionary's
+        self.softmax_batch = softmax_batch or sys.maxsize  # None/0 -> effectively no chunking
+        assert self.softmax_batch > 0  # rejects negative values
+        self.compute_alignment = compute_alignment
+        self.symbols_to_strip_from_output = (
+            symbols_to_strip_from_output.union({self.eos})  # eos is always in the strip set
+            if symbols_to_strip_from_output is not None else {self.eos})
+
+    @torch.no_grad()
+    def generate(self, models, sample, **kwargs):
+        """Score the reference targets of a batch with an ensemble of models.
+
+        ``sample`` must hold ``net_input`` and ``target``; an optional
+        ``start_indices`` gives per-sentence scoring offsets. Returns, per
+        sentence, a one-element list holding the hypothesis dict.
+        """
+        net_input = sample['net_input']
+
+        def batch_for_softmax(dec_out, target):
+            # assumes decoder_out[0] is the only thing needed (may not be correct for future models!)
+            first, rest = dec_out[0], dec_out[1:]
+            bsz, tsz, dim = first.shape
+            if bsz * tsz < self.softmax_batch:
+                yield dec_out, target, True
+            else:
+                flat = first.contiguous().view(1, -1, dim)
+                flat_tgt = target.contiguous().view(flat.shape[:-1])
+                s = 0
+                while s < flat.size(1):
+                    e = s + self.softmax_batch
+                    yield (flat[:, s:e],) + rest, flat_tgt[:, s:e], False
+                    s = e
+
+        def gather_target_probs(probs, target):
+            return probs.gather(dim=2, index=target.unsqueeze(-1))
+
+        orig_target = sample['target']
+
+        # score with each model; ensembles average probabilities and take the log afterwards
+        avg_probs = None
+        avg_attn = None
+        for model in models:
+            model.eval()
+            decoder_out = model(**net_input)
+            attn = decoder_out[1] if len(decoder_out) > 1 else None
+            if isinstance(attn, dict):
+                attn = attn.get('attn', None)
+
+            batched = batch_for_softmax(decoder_out, orig_target)
+            probs, idx = None, 0
+            for bd, tgt, is_single in batched:
+                sample['target'] = tgt
+                curr_prob = model.get_normalized_probs(bd, log_probs=len(models) == 1, sample=sample).data
+                if is_single:
+                    probs = gather_target_probs(curr_prob, orig_target)
+                else:
+                    if probs is None:
+                        probs = curr_prob.new(orig_target.numel())
+                    step = curr_prob.size(0) * curr_prob.size(1)
+                    end = step + idx
+                    tgt_probs = gather_target_probs(curr_prob.view(tgt.shape + (curr_prob.size(-1),)), tgt)
+                    probs[idx:end] = tgt_probs.view(-1)
+                    idx = end
+                sample['target'] = orig_target
+
+            probs = probs.view(sample['target'].shape)
+
+            if avg_probs is None:
+                avg_probs = probs
+            else:
+                avg_probs.add_(probs)
+            if attn is not None and torch.is_tensor(attn):
+                attn = attn.data
+                if avg_attn is None:
+                    avg_attn = attn
+                else:
+                    avg_attn.add_(attn)
+        if len(models) > 1:
+            avg_probs.div_(len(models))
+            avg_probs.log_()
+            if avg_attn is not None:
+                avg_attn.div_(len(models))
+
+        bsz = avg_probs.size(0)
+        hypos = []
+        start_idxs = sample['start_indices'] if 'start_indices' in sample else [0] * bsz
+        for i in range(bsz):
+            # remove padding from ref
+            ref = utils.strip_pad(sample['target'][i, start_idxs[i]:], self.pad)
+            tgt_len = ref.numel()
+            avg_probs_i = avg_probs[i][start_idxs[i]:start_idxs[i] + tgt_len]
+            score_i = avg_probs_i.sum() / tgt_len
+            if avg_attn is not None:
+                avg_attn_i = avg_attn[i]
+                if self.compute_alignment:
+                    alignment = utils.extract_hard_alignment(
+                        avg_attn_i,
+                        sample['net_input']['src_tokens'][i],
+                        sample['target'][i],
+                        self.pad,
+                        self.eos,
+                    )
+                else:
+                    alignment = None
+            else:
+                avg_attn_i = alignment = None
+            hypos.append([{
+                'tokens': ref,
+                'score': score_i,
+                'attention': avg_attn_i,
+                'alignment': alignment,
+                'positional_scores': avg_probs_i,
+            }])
+        return hypos
diff --git a/fairseq/tasks/__init__.py b/fairseq/tasks/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1bb404f1c31cb8ce02bdf88e5063da209853151
--- /dev/null
+++ b/fairseq/tasks/__init__.py
@@ -0,0 +1,82 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import importlib
+import os
+
+from .fairseq_task import FairseqTask
+
+TASK_REGISTRY = {}
+TASK_CLASS_NAMES = set()
+
+
+def setup_task(args, **kwargs):
+    return TASK_REGISTRY[args.task].setup_task(args, **kwargs)  # KeyError if args.task is unregistered
+
+
+def register_task(name):
+    """
+    New tasks can be added to fairseq with the
+    :func:`~fairseq.tasks.register_task` function decorator.
+
+    For example::
+
+        @register_task('classification')
+        class ClassificationTask(FairseqTask):
+            (...)
+
+    .. note::
+
+        All Tasks must implement the :class:`~fairseq.tasks.FairseqTask`
+        interface.
+
+    A registered task is later looked up by this name (see :func:`get_task`).
+
+    Args:
+        name (str): the name of the task
+    """
+
+    def register_task_cls(cls):
+        if name in TASK_REGISTRY:
+            raise ValueError('Cannot register duplicate task ({})'.format(name))
+        if not issubclass(cls, FairseqTask):
+            raise ValueError('Task ({}: {}) must extend FairseqTask'.format(name, cls.__name__))
+        if cls.__name__ in TASK_CLASS_NAMES:
+            raise ValueError('Cannot register task with duplicate class name ({})'.format(cls.__name__))
+        TASK_REGISTRY[name] = cls
+        TASK_CLASS_NAMES.add(cls.__name__)  # class names must be unique across all tasks
+        return cls  # returned unchanged, so the decorated class stays usable as-is
+
+    return register_task_cls
+
+
+def get_task(name):
+    return TASK_REGISTRY[name]  # registered task class; KeyError if `name` was never registered
+
+
+# automatically import any Python files in the tasks/ directory
+tasks_dir = os.path.dirname(__file__)
+for file in os.listdir(tasks_dir):
+    path = os.path.join(tasks_dir, file)
+    if (
+        not file.startswith('_')
+        and not file.startswith('.')
+        and (file.endswith('.py') or os.path.isdir(path))
+    ):
+        # strip only the trailing '.py'; find() would cut at an earlier '.py' inside the name
+        task_name = file[:-len('.py')] if file.endswith('.py') else file
+        importlib.import_module('fairseq.tasks.' + task_name)
+        # expose `task_parser` for sphinx
+        if task_name in TASK_REGISTRY:
+            parser = argparse.ArgumentParser(add_help=False)
+            group_task = parser.add_argument_group('Task name')
+            # fmt: off
+            group_task.add_argument('--task', metavar=task_name,
+                                    help='Enable this task with: ``--task=' + task_name + '``')
+            # fmt: on
+            group_args = parser.add_argument_group('Additional command-line arguments')
+            TASK_REGISTRY[task_name].add_args(group_args)
+            globals()[task_name + '_parser'] = parser
diff --git a/fairseq/tasks/__pycache__/__init__.cpython-310.pyc b/fairseq/tasks/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..653ec077abf75177851b98f582cd9ecb36895712
Binary files /dev/null and b/fairseq/tasks/__pycache__/__init__.cpython-310.pyc differ
diff --git a/fairseq/tasks/__pycache__/audio_pretraining.cpython-310.pyc b/fairseq/tasks/__pycache__/audio_pretraining.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..960eaca3d163910a3aaedcd8e69bea500ac4bb0c
Binary files /dev/null and b/fairseq/tasks/__pycache__/audio_pretraining.cpython-310.pyc differ
diff --git a/fairseq/tasks/__pycache__/cross_lingual_lm.cpython-310.pyc b/fairseq/tasks/__pycache__/cross_lingual_lm.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6a9bfc6d9be049c602dd8a74cc37ed5d60de26a2
Binary files /dev/null and b/fairseq/tasks/__pycache__/cross_lingual_lm.cpython-310.pyc differ
diff --git a/fairseq/tasks/__pycache__/denoising.cpython-310.pyc b/fairseq/tasks/__pycache__/denoising.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..497d4fe8918655f2bc1f96a5f738bb603d8d5431
Binary files /dev/null and b/fairseq/tasks/__pycache__/denoising.cpython-310.pyc differ
diff --git a/fairseq/tasks/__pycache__/fairseq_task.cpython-310.pyc b/fairseq/tasks/__pycache__/fairseq_task.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9ac8525a2ba586c024236f1739c9fd8ece597c17
Binary files /dev/null and b/fairseq/tasks/__pycache__/fairseq_task.cpython-310.pyc differ
diff --git a/fairseq/tasks/__pycache__/language_modeling.cpython-310.pyc b/fairseq/tasks/__pycache__/language_modeling.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7e14ac0b417382b48d3297c6a1ee9925babb233a
Binary files /dev/null and b/fairseq/tasks/__pycache__/language_modeling.cpython-310.pyc differ
diff --git a/fairseq/tasks/__pycache__/legacy_masked_lm.cpython-310.pyc b/fairseq/tasks/__pycache__/legacy_masked_lm.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a09de8871a6922957d2d4eafb59fc4b902a07961
Binary files /dev/null and b/fairseq/tasks/__pycache__/legacy_masked_lm.cpython-310.pyc differ
diff --git a/fairseq/tasks/__pycache__/masked_lm.cpython-310.pyc b/fairseq/tasks/__pycache__/masked_lm.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..eca76e50e07c2dc2431ec6fdf0a1930615eca4b2
Binary files /dev/null and b/fairseq/tasks/__pycache__/masked_lm.cpython-310.pyc differ
diff --git a/fairseq/tasks/__pycache__/multilingual_denoising.cpython-310.pyc b/fairseq/tasks/__pycache__/multilingual_denoising.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6eb542676312ef936419040681b3f4c0f9a10f7a
Binary files /dev/null and b/fairseq/tasks/__pycache__/multilingual_denoising.cpython-310.pyc differ
diff --git a/fairseq/tasks/__pycache__/multilingual_masked_lm.cpython-310.pyc b/fairseq/tasks/__pycache__/multilingual_masked_lm.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4a4bea3529db67f6eb18f2a641ea4a8fcb3a629e
Binary files /dev/null and b/fairseq/tasks/__pycache__/multilingual_masked_lm.cpython-310.pyc differ
diff --git a/fairseq/tasks/__pycache__/multilingual_translation.cpython-310.pyc b/fairseq/tasks/__pycache__/multilingual_translation.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a1ac49efe92aed8de7eedf2ef20702358d4fc6a8
Binary files /dev/null and b/fairseq/tasks/__pycache__/multilingual_translation.cpython-310.pyc differ
diff --git a/fairseq/tasks/__pycache__/semisupervised_translation.cpython-310.pyc b/fairseq/tasks/__pycache__/semisupervised_translation.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2944900a1f8c3318dd84ca4dea1b94d93fecb2ff
Binary files /dev/null and b/fairseq/tasks/__pycache__/semisupervised_translation.cpython-310.pyc differ
diff --git a/fairseq/tasks/__pycache__/sentence_prediction.cpython-310.pyc b/fairseq/tasks/__pycache__/sentence_prediction.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7ebf0068051f05e0b75b22c906eb653117e824d4
Binary files /dev/null and b/fairseq/tasks/__pycache__/sentence_prediction.cpython-310.pyc differ
diff --git a/fairseq/tasks/__pycache__/sentence_ranking.cpython-310.pyc b/fairseq/tasks/__pycache__/sentence_ranking.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..78445f2c6f58807c3a95a0b59594b989feae258b
Binary files /dev/null and b/fairseq/tasks/__pycache__/sentence_ranking.cpython-310.pyc differ
diff --git a/fairseq/tasks/__pycache__/translation.cpython-310.pyc b/fairseq/tasks/__pycache__/translation.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6f82d9a3a2da52d7f7c942b4b53cdf8a4e8b1362
Binary files /dev/null and b/fairseq/tasks/__pycache__/translation.cpython-310.pyc differ
diff --git a/fairseq/tasks/__pycache__/translation_from_pretrained_bart.cpython-310.pyc b/fairseq/tasks/__pycache__/translation_from_pretrained_bart.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..94c53a91abe573cdf6380d8ed26c4116bfb765ec
Binary files /dev/null and b/fairseq/tasks/__pycache__/translation_from_pretrained_bart.cpython-310.pyc differ
diff --git a/fairseq/tasks/__pycache__/translation_from_pretrained_xlm.cpython-310.pyc b/fairseq/tasks/__pycache__/translation_from_pretrained_xlm.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0a3a0b03b42fed1ea56821f82c16fab0897155ac
Binary files /dev/null and b/fairseq/tasks/__pycache__/translation_from_pretrained_xlm.cpython-310.pyc differ
diff --git a/fairseq/tasks/__pycache__/translation_lev.cpython-310.pyc b/fairseq/tasks/__pycache__/translation_lev.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..677b8b14d980c2671d7a3094500554be4401f66a
Binary files /dev/null and b/fairseq/tasks/__pycache__/translation_lev.cpython-310.pyc differ
diff --git a/fairseq/tasks/__pycache__/translation_multi_simple_epoch.cpython-310.pyc b/fairseq/tasks/__pycache__/translation_multi_simple_epoch.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..252602de4fea2b6450b71b31f7047fbd90aec229
Binary files /dev/null and b/fairseq/tasks/__pycache__/translation_multi_simple_epoch.cpython-310.pyc differ
diff --git a/fairseq/tasks/audio_pretraining.py b/fairseq/tasks/audio_pretraining.py
new file mode 100644
index 0000000000000000000000000000000000000000..46d164ba9869f90700076b539fd089144583f691
--- /dev/null
+++ b/fairseq/tasks/audio_pretraining.py
@@ -0,0 +1,137 @@
+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the LICENSE file in
+# the root directory of this source tree. An additional grant of patent rights
+# can be found in the PATENTS file in the same directory.
+
+import os
+import sys
+
+from fairseq.data import FileAudioDataset, Dictionary, AddTargetDataset
+from . import FairseqTask, register_task
+
+
+class LabelEncoder(object):
+    def __init__(self, dictionary):
+        self.dictionary = dictionary  # used by __call__ to encode each label line
+
+    def __call__(self, label):
+        return self.dictionary.encode_line(
+            label, append_eos=False, add_if_not_exist=False  # no EOS; unknown symbols are not added
+        )
+
+
+@register_task("audio_pretraining")
+class AudioPretrainingTask(FairseqTask):
+    """
+
+    """
+
+    @staticmethod
+    def add_args(parser):
+        """Add task-specific arguments to the parser."""
+        parser.add_argument("data", help="path to data directory")
+        parser.add_argument(
+            "--sample-rate",
+            default=16000,
+            type=int,
+            help="target sample rate. audio files will be up/down sampled to this rate",
+        )
+        parser.add_argument(
+            "--normalize",
+            action="store_true",
+            help="if set, normalizes input to have 0 mean and unit variance",
+        )
+        parser.add_argument(
+            "--max-sample-size",
+            default=None,
+            type=int,
+            help="max sample size to crop to for batching. default = min sample length",
+        )
+        parser.add_argument(
+            "--min-sample-size",
+            default=None,
+            type=int,
+            help="min sample size to crop to for batching. default = same as --max-sample-size",
+        )
+
+        parser.add_argument(
+            "--enable-padding",
+            action="store_true",
+            help="pad shorter samples instead of cropping",
+        )
+
+        parser.add_argument(
+            "--labels",
+            type=str,
+            default=None,
+            help="extension of the label file to load, if any",
+        )
+
+    def __init__(self, args, source_dictionary=None):
+        super().__init__(args)
+        self._target_dictionary = None
+        self._source_dictionary = source_dictionary
+        self.is_ctc = args.criterion == "ctc"
+
+    @classmethod
+    def setup_task(cls, args, **kwargs):
+        """Setup the task (e.g., load dictionaries).
+
+        Args:
+            args (argparse.Namespace): parsed command-line arguments
+        """
+        return cls(args)
+
+    def load_dataset(self, split, **kwargs):
+        """Load a given dataset split.
+
+        Args:
+            split (str): name of the split (e.g., train, valid, test)
+        """
+        manifest = os.path.join(self.args.data, "{}.tsv".format(split))
+        self.datasets[split] = FileAudioDataset(
+            manifest,
+            sample_rate=self.args.sample_rate,
+            max_sample_size=self.args.max_sample_size,
+            min_sample_size=self.args.max_sample_size,  # NOTE(review): passes max, not --min-sample-size; confirm intended
+            min_length=self.args.min_sample_size,  # --min-sample-size is forwarded as min_length
+            pad=self.args.labels is not None or self.args.enable_padding,  # labels force padding on
+            normalize=self.args.normalize,
+        )
+
+        if self.args.labels:
+            dict_path = os.path.join(self.args.data, f"dict.{self.args.labels}.txt")
+            self._target_dictionary = Dictionary.load(dict_path)
+            label_path = os.path.join(self.args.data, f"{split}.{self.args.labels}")
+            labels = []
+            with open(label_path, "r") as f:
+                for line in f:
+                    labels.append(line)  # raw lines, trailing newline included
+
+            process_label = LabelEncoder(self.target_dictionary)
+
+            self.datasets[split] = AddTargetDataset(  # wraps the audio dataset built above
+                self.datasets[split],
+                labels,
+                pad=self.target_dictionary.pad(),
+                eos=self.target_dictionary.eos(),
+                batch_targets=True,
+                process_label=process_label,
+                add_to_input=not self.is_ctc,  # False when args.criterion == "ctc"
+            )
+
+    @property
+    def source_dictionary(self):
+        return self._source_dictionary
+
+    @property
+    def target_dictionary(self):
+        """Return the label :class:`~fairseq.data.Dictionary` set by
+        :meth:`load_dataset`, or ``None`` if no labels were loaded."""
+        return self._target_dictionary
+
+    def max_positions(self):
+        """Maximum input length supported by the encoder."""
+        return (sys.maxsize, sys.maxsize)
diff --git a/fairseq/tasks/cross_lingual_lm.py b/fairseq/tasks/cross_lingual_lm.py
new file mode 100644
index 0000000000000000000000000000000000000000..3589492f1115641712489b7982876fda3cc39317
--- /dev/null
+++ b/fairseq/tasks/cross_lingual_lm.py
@@ -0,0 +1,170 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from collections import OrderedDict
+import itertools
+import logging
+import os
+
+import numpy as np
+
+from fairseq import tokenizer
+from fairseq.data.legacy.masked_lm_dictionary import MaskedLMDictionary
+
+from fairseq.data import (
+    Dictionary,
+    ConcatDataset,
+    data_utils,
+    TokenBlockDataset,
+)
+from fairseq.data.legacy.masked_lm_dataset import MaskedLMDataset
+from fairseq.data.multi_corpus_sampled_dataset import MultiCorpusSampledDataset
+from fairseq.tasks import FairseqTask, register_task
+from fairseq import utils
+
+logger = logging.getLogger(__name__)
+
+
+@register_task('cross_lingual_lm')
+class CrossLingualLMTask(FairseqTask):
+    """
+    Task for training cross-lingual language models.
+
+    For more details look at: https://arxiv.org/pdf/1901.07291.pdf
+
+    Args:
+        dictionary (Dictionary): the dictionary for the input of the task
+    """
+
+    @staticmethod
+    def add_args(parser):
+        """Add task-specific arguments to the parser."""
+        parser.add_argument('data', help='colon separated path to data directories list, \
+                            will be iterated upon during epochs in round-robin manner')
+        parser.add_argument('--tokens-per-sample', default=512, type=int,
+                            help='max number of total tokens over all segments'
+                                 ' per sample')
+        parser.add_argument('--monolingual-langs', default='en', type=str,
+                            help='comma separated list of languages for which we'
+                                 ' want to train XLM on')
+        parser.add_argument('--shuffle', action='store_true',
+                            help='shuffle each monolingual dataset while'
+                            ' training')
+
+    def __init__(self, args, dictionary):
+        super().__init__(args)
+        self.dictionary = dictionary
+        self.seed = args.seed
+        self.distributed_world_size = args.distributed_world_size
+        self.langs2id = self._lang_to_id(args.monolingual_langs)
+
+    def _lang_to_id(
+            self,
+            languages: str
+    ):
+        """
+        Build a map from languages to ids. These ids are used as segment labels
+        for cross-lingual LM training.
+        """
+        lang2id = {}
+        langs = [l.strip() for l in languages.split(',')]
+        for id, lang in enumerate(langs):
+            lang2id[lang] = id
+        return lang2id
+
+    @classmethod
+    def load_dictionary(cls, filename):
+        return MaskedLMDictionary.load(filename)
+
+    @classmethod
+    def build_dictionary(cls, filenames, workers=1, threshold=-1, nwords=-1, padding_factor=8):
+        d = MaskedLMDictionary()
+        for filename in filenames:
+            Dictionary.add_file_to_dictionary(filename, d, tokenizer.tokenize_line, workers)
+        d.finalize(threshold=threshold, nwords=nwords, padding_factor=padding_factor)
+        return d
+
+    @property
+    def target_dictionary(self):
+        return self.dictionary
+
+    @classmethod
+    def setup_task(cls, args, **kwargs):
+        """Setup the task."""
+        dictionary = MaskedLMDictionary.load(os.path.join(args.data, 'dict.txt'))
+        logger.info('dictionary: {} types'.format(len(dictionary)))
+        return cls(args, dictionary)
+
+    def _load_single_lang_dataset(self, split, epoch):
+        loaded_datasets = []
+
+        paths = utils.split_paths(self.args.data)
+        assert len(paths) > 0
+        data_path = paths[(epoch - 1) % len(paths)]
+
+        for k in itertools.count():
+            split_k = split + (str(k) if k > 0 else '')
+            path = os.path.join(data_path, split_k)
+
+            ds = data_utils.load_indexed_dataset(path, self.dictionary, self.args.dataset_impl)
+            if ds is None:
+                if k > 0:
+                    break
+                else:
+                    raise FileNotFoundError('Dataset not found: {} ({})'.format(split, data_path))
+
+            # Since we append each block with the classification_token,
+            # we need to effectively create blocks of length
+            # tokens_per_sample-1
+            loaded_datasets.append(
+                TokenBlockDataset(
+                    ds, ds.sizes, self.args.tokens_per_sample - 1,
+                    pad=self.dictionary.pad(), eos=self.dictionary.eos(),
+                )
+            )
+
+            logger.info('{} {} {} examples'.format(data_path, split_k, len(loaded_datasets[-1])))
+
+        if len(loaded_datasets) == 1:
+            dataset = loaded_datasets[0]
+            sizes = dataset.sizes
+        else:
+            dataset = ConcatDataset(loaded_datasets)
+            sizes = np.concatenate([ds.sizes for ds in loaded_datasets])
+
+        return dataset, sizes
+
+    def load_dataset(self, split, epoch=1, combine=False, **kwargs):
+        """Load a given dataset split, one masked-LM dataset per language.
+
+        Args:
+            split (str): name of the split (e.g., train, valid, test)
+            epoch (int): 1-based epoch; picks the data path round-robin
+        """
+        dataset_map = OrderedDict()
+        for lang, lang_id in self.langs2id.items():
+            # Datasets are expected to be in "split.lang" format (Eg: train.en)
+            language_split = '{}.{}'.format(split, lang)
+            block_dataset, sizes = self._load_single_lang_dataset(split=language_split, epoch=epoch)
+            dataset_map[lang] = MaskedLMDataset(
+                dataset=block_dataset,
+                sizes=sizes,
+                vocab=self.dictionary,
+                pad_idx=self.dictionary.pad(),
+                mask_idx=self.dictionary.mask(),
+                classif_token_idx=self.dictionary.eos(),
+                sep_token_idx=self.dictionary.eos(),
+                shuffle=getattr(self.args, 'shuffle', False),
+                has_pairs=False,
+                segment_id=lang_id,
+                seed=self.seed,
+            )
+
+        self.datasets[split] = MultiCorpusSampledDataset(dataset_map)
+        # Wrap the index like _load_single_lang_dataset does; a bare `epoch - 1`
+        # raises IndexError once epoch exceeds the number of data paths.
+        paths = utils.split_paths(self.args.data)
+        logger.info('{} {} {} examples'.format(
+            paths[(epoch - 1) % len(paths)], split, len(self.datasets[split])))
diff --git a/fairseq/tasks/denoising.py b/fairseq/tasks/denoising.py
new file mode 100644
index 0000000000000000000000000000000000000000..28beb517f2c6902edf51bd697a9dbe3e8d110b75
--- /dev/null
+++ b/fairseq/tasks/denoising.py
@@ -0,0 +1,171 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import os
+
+from fairseq.data import (
+    data_utils,
+    Dictionary,
+    AppendTokenDataset,
+    DenoisingDataset,
+    PrependTokenDataset,
+    StripTokenDataset,
+    TokenBlockDataset,
+)
+from fairseq.data.encoders.utils import get_whole_word_mask
+from fairseq.tasks import FairseqTask, register_task
+from fairseq import utils
+
+
+logger = logging.getLogger(__name__)
+
+
+@register_task('denoising')
+class DenoisingTask(FairseqTask):
+    """
+    Denoising task for applying sequence to sequence denoising. (ie. BART)
+    """
+
+    @staticmethod
+    def add_args(parser):
+        """Add task-specific arguments to the parser."""
+        parser.add_argument('data', help='path to data directory')
+        parser.add_argument('--tokens-per-sample', default=512, type=int,
+                            help='max number of total tokens over all segments'
+                                 ' per sample for dataset')
+        parser.add_argument(
+            '--sample-break-mode', default="complete_doc", type=str,
+            help='mode for breaking sentence',
+        )
+        parser.add_argument(
+            '--mask', default=0.0, type=float,
+            help='fraction of words/subwords that will be masked',
+        )
+        parser.add_argument(
+            '--mask-random', default=0.0, type=float,
+            help='instead of using [MASK], use random token this often'
+        )
+        parser.add_argument(
+            '--insert', default=0.0, type=float,
+            help='insert this percentage of additional random tokens',
+        )
+        parser.add_argument(
+            '--permute', default=0.0, type=float,
+            help='take this proportion of subwords and permute them',
+        )
+        parser.add_argument(
+            '--rotate', default=0.5, type=float,
+            help='rotate this proportion of inputs',
+        )
+        parser.add_argument(
+            '--poisson-lambda', default=3.0, type=float,
+            help='lambda of the Poisson distribution for span lengths (used with --mask-length=span-poisson)'
+        )
+        parser.add_argument(
+            '--permute-sentences', default=0.0, type=float,
+            help='shuffle this proportion of sentences in all inputs'
+        )
+        parser.add_argument(
+            '--mask-length', default="subword", type=str,
+            choices=['subword', 'word', 'span-poisson'],
+            help='mask length to choose'
+        )
+        parser.add_argument(
+            '--replace-length', default=-1, type=int,
+            help='when masking N tokens, replace with 0, 1, or N tokens (use -1 for N)'
+        )
+        parser.add_argument(
+            '--max-source-positions', default=1024, type=int, metavar='N',
+            help='max number of tokens in the source sequence'
+        )
+        parser.add_argument(
+            '--max-target-positions', default=1024, type=int, metavar='N',
+            help='max number of tokens in the target sequence'
+        )
+
+    def __init__(self, args, dictionary):
+        super().__init__(args)
+        self.dictionary = dictionary
+        self.seed = args.seed
+
+        # add mask token
+        self.mask_idx = self.dictionary.add_symbol('<mask>')
+
+    @classmethod
+    def setup_task(cls, args, **kwargs):
+        """Setup the task.
+        """
+        dictionary = Dictionary.load(os.path.join(args.data, 'dict.txt'))
+        logger.info('dictionary: {} types'.format(len(dictionary)))
+        if not hasattr(args, 'shuffle_instance'):
+            args.shuffle_instance = False
+        return cls(args, dictionary)
+
+    def load_dataset(self, split, epoch=1, combine=False, **kwargs):
+        """Load a given dataset split.
+
+        Args:
+            split (str): name of the split (e.g., train, valid, test)
+        """
+        paths = utils.split_paths(self.args.data)
+        assert len(paths) > 0
+        data_path = paths[(epoch - 1) % len(paths)]
+        split_path = os.path.join(data_path, split)
+
+        dataset = data_utils.load_indexed_dataset(
+            split_path,
+            self.dictionary,
+            self.args.dataset_impl,
+            combine=combine,
+        )
+        if dataset is None:
+            raise FileNotFoundError('Dataset not found: {} ({})'.format(split, split_path))
+
+        dataset = StripTokenDataset(dataset, self.dictionary.eos())
+
+        # create continuous blocks of tokens
+        dataset = TokenBlockDataset(
+                dataset,
+                dataset.sizes,
+                self.args.tokens_per_sample - 2,  # one less for <s> and one for </s>
+                pad=self.dictionary.pad(),
+                eos=self.dictionary.eos(),
+                break_mode=self.args.sample_break_mode,
+                document_sep_len=0
+        )
+
+        # prepend beginning-of-sentence token (<s>, equiv. to [CLS] in BERT)
+        dataset = PrependTokenDataset(dataset, self.source_dictionary.bos())
+        dataset = AppendTokenDataset(dataset, self.source_dictionary.eos())
+
+        mask_whole_words = get_whole_word_mask(self.args, self.source_dictionary) \
+            if self.args.mask_length != 'subword' else None
+
+        self.datasets[split] = DenoisingDataset(
+            dataset, dataset.sizes, self.dictionary, self.mask_idx,
+            mask_whole_words, shuffle=self.args.shuffle_instance,
+            seed=self.seed, args=self.args
+        )
+        logger.info(
+            "Split: {0}, Loaded {1} samples of denoising_dataset".format(
+                split,
+                len(self.datasets[split]),
+            )
+        )
+
+    def max_positions(self):
+        """Return the max sentence length allowed by the task."""
+        return (self.args.max_source_positions, self.args.max_target_positions)
+
+    @property
+    def source_dictionary(self):
+        """Return the source :class:`~fairseq.data.Dictionary`."""
+        return self.dictionary
+
+    @property
+    def target_dictionary(self):
+        """Return the target :class:`~fairseq.data.Dictionary`."""
+        return self.dictionary
diff --git a/fairseq/tasks/fairseq_task.py b/fairseq/tasks/fairseq_task.py
new file mode 100644
index 0000000000000000000000000000000000000000..59663b531d59b9b523836cedb29cbd0e7cf88e0b
--- /dev/null
+++ b/fairseq/tasks/fairseq_task.py
@@ -0,0 +1,465 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import os
+import warnings
+
+
+import torch
+
+from fairseq import metrics, search, tokenizer, utils
+from fairseq.data import data_utils, FairseqDataset, iterators, Dictionary
+
+logger = logging.getLogger(__name__)
+
+
+class FairseqTask(object):
+    """
+    Tasks store dictionaries and provide helpers for loading/iterating over
+    Datasets, initializing the Model/Criterion and calculating the loss.
+    """
+
+    @staticmethod
+    def add_args(parser):
+        """Add task-specific arguments to the parser."""
+        pass
+
+    @staticmethod
+    def logging_outputs_can_be_summed(criterion) -> bool:
+        """
+        Whether the logging outputs returned by `train_step` and `valid_step` can
+        be summed across workers prior to calling `aggregate_logging_outputs`.
+        Setting this to True will improve distributed training speed.
+        """
+        return criterion.logging_outputs_can_be_summed()
+
+    def __init__(self, args):
+        self.args = args
+        self.datasets = {}
+        self.dataset_to_epoch_iter = {}
+
+    @classmethod
+    def load_dictionary(cls, filename):
+        """Load the dictionary from the filename
+
+        Args:
+            filename (str): the filename
+        """
+        return Dictionary.load(filename)
+
+    @classmethod
+    def build_dictionary(
+        cls, filenames, workers=1, threshold=-1, nwords=-1, padding_factor=8
+    ):
+        """Build the dictionary
+
+        Args:
+            filenames (list): list of filenames
+            workers (int): number of concurrent workers
+            threshold (int): defines the minimum word count
+            nwords (int): defines the total number of words in the final dictionary,
+                including special symbols
+            padding_factor (int): can be used to pad the dictionary size to be a
+                multiple of 8, which is important on some hardware (e.g., Nvidia
+                Tensor Cores).
+        """
+        d = Dictionary()
+        for filename in filenames:
+            Dictionary.add_file_to_dictionary(
+                filename, d, tokenizer.tokenize_line, workers
+            )
+        d.finalize(threshold=threshold, nwords=nwords, padding_factor=padding_factor)
+        return d
+
+    @classmethod
+    def setup_task(cls, args, **kwargs):
+        """Setup the task (e.g., load dictionaries).
+
+        Args:
+            args (argparse.Namespace): parsed command-line arguments
+        """
+        return cls(args, **kwargs)
+
+    def has_sharded_data(self, split):
+        return (os.pathsep in getattr(self.args, 'data', ''))
+
+    def load_dataset(self, split, combine=False, **kwargs):
+        """Load a given dataset split.
+
+        Args:
+            split (str): name of the split (e.g., train, valid, test)
+        """
+        raise NotImplementedError
+
+    def dataset(self, split):
+        """
+        Return a loaded dataset split.
+
+        Args:
+            split (str): name of the split (e.g., train, valid, test)
+
+        Returns:
+            a :class:`~fairseq.data.FairseqDataset` corresponding to *split*
+        """
+        from fairseq.data import FairseqDataset
+
+        if split not in self.datasets:
+            raise KeyError("Dataset not loaded: " + split)
+        if not isinstance(self.datasets[split], FairseqDataset):
+            raise TypeError("Datasets are expected to be of type FairseqDataset")
+        return self.datasets[split]
+
+    def filter_indices_by_size(self,
+                               indices,
+                               dataset,
+                               max_positions,
+                               ignore_invalid_inputs):
+        """
+        Filter examples that are too large
+
+        Args:
+            indices (np.array): original array of sample indices
+            dataset (~fairseq.data.FairseqDataset): dataset to batch
+            max_positions: max sentence length supported by the model,
+                forwarded to ``dataset.filter_indices_by_size``.
+            ignore_invalid_inputs (bool): if True, log a warning and skip
+                samples with invalid sizes instead of raising an Exception.
+        Returns:
+            np.array: array of filtered sample indices
+        """
+        indices, ignored = dataset.filter_indices_by_size(indices, max_positions)
+        if len(ignored) > 0:
+            if not ignore_invalid_inputs:
+                raise Exception((
+                    'Size of sample #{} is invalid (={}) since max_positions={}, '
+                    'skip this example with --skip-invalid-size-inputs-valid-test'
+                ).format(ignored[0], dataset.size(ignored[0]), max_positions))
+            logger.warning((
+                '{} samples have invalid sizes and will be skipped, '
+                'max_positions={}, first few sample ids={}'
+            ).format(len(ignored), max_positions, ignored[:10]))
+        return indices
+
+    def get_batch_iterator(
+        self,
+        dataset,
+        max_tokens=None,
+        max_sentences=None,
+        max_positions=None,
+        ignore_invalid_inputs=False,
+        required_batch_size_multiple=1,
+        seed=1,
+        num_shards=1,
+        shard_id=0,
+        num_workers=0,
+        epoch=1
+    ):
+        """
+        Get an iterator that yields batches of data from the given dataset.
+
+        Args:
+            dataset (~fairseq.data.FairseqDataset): dataset to batch
+            max_tokens (int, optional): max number of tokens in each batch
+                (default: None).
+            max_sentences (int, optional): max number of sentences in each
+                batch (default: None).
+            max_positions (optional): max sentence length supported by the
+                model (default: None).
+            ignore_invalid_inputs (bool, optional): don't raise Exception for
+                sentences that are too long (default: False).
+            required_batch_size_multiple (int, optional): require batch size to
+                be a multiple of N (default: 1).
+            seed (int, optional): seed for random number generator for
+                reproducibility (default: 1).
+            num_shards (int, optional): shard the data iterator into N
+                shards (default: 1).
+            shard_id (int, optional): which shard of the data iterator to
+                return (default: 0).
+            num_workers (int, optional): how many subprocesses to use for data
+                loading. 0 means the data will be loaded in the main process
+                (default: 0).
+            epoch (int, optional): the epoch to start the iterator from
+                (default: 1).
+        Returns:
+            ~fairseq.iterators.EpochBatchIterator: a batched iterator over the
+                given dataset split
+        """
+        # For the default fairseq task the datasets are not dynamic, so the
+        # same iterator is cached and returned across epochs. Tasks with
+        # dynamic datasets can override this method.
+        if dataset in self.dataset_to_epoch_iter:
+            return self.dataset_to_epoch_iter[dataset]
+
+        assert isinstance(dataset, FairseqDataset)
+
+        # initialize the dataset with the correct starting epoch
+        dataset.set_epoch(epoch)
+
+        # get indices ordered by example size
+        with data_utils.numpy_seed(seed):
+            indices = dataset.ordered_indices()
+
+        # filter examples that are too large
+        if max_positions is not None:
+            indices = self.filter_indices_by_size(indices,
+                                                  dataset,
+                                                  max_positions,
+                                                  ignore_invalid_inputs)
+
+        # create mini-batches with given size constraints
+        batch_sampler = dataset.batch_by_size(
+            indices,
+            max_tokens=max_tokens,
+            max_sentences=max_sentences,
+            required_batch_size_multiple=required_batch_size_multiple,
+        )
+
+        # return a reusable, sharded iterator
+        epoch_iter = iterators.EpochBatchIterator(
+            dataset=dataset,
+            collate_fn=dataset.collater,
+            batch_sampler=batch_sampler,
+            seed=seed,
+            num_shards=num_shards,
+            shard_id=shard_id,
+            num_workers=num_workers,
+            epoch=epoch,
+            buffer_size=getattr(self.args, 'data_buffer_size', 0)
+        )
+        self.dataset_to_epoch_iter[dataset] = epoch_iter
+        return epoch_iter
+
+    def build_model(self, args):
+        """
+        Build the :class:`~fairseq.models.BaseFairseqModel` instance for this
+        task.
+
+        Args:
+            args (argparse.Namespace): parsed command-line arguments
+
+        Returns:
+            a :class:`~fairseq.models.BaseFairseqModel` instance
+        """
+        from fairseq import models, quantization_utils
+        model = models.build_model(args, self)
+        if getattr(args, 'tpu', False):
+            model.prepare_for_tpu_()
+        model = quantization_utils.quantize_model_scalar(model, args)
+        return model
+
+    def build_criterion(self, args):
+        """
+        Build the :class:`~fairseq.criterions.FairseqCriterion` instance for
+        this task.
+
+        Args:
+            args (argparse.Namespace): parsed command-line arguments
+
+        Returns:
+            a :class:`~fairseq.criterions.FairseqCriterion` instance
+        """
+        from fairseq import criterions
+
+        return criterions.build_criterion(args, self)
+
+    def build_generator(
+        self, models, args,
+        seq_gen_cls=None, extra_gen_cls_kwargs=None
+    ):
+        if getattr(args, "score_reference", False):
+            from fairseq.sequence_scorer import SequenceScorer
+
+            return SequenceScorer(
+                self.target_dictionary,
+                compute_alignment=getattr(args, "print_alignment", False),
+            )
+
+        from fairseq.sequence_generator import (
+            SequenceGenerator,
+            SequenceGeneratorWithAlignment,
+        )
+
+        # Choose search strategy. Defaults to Beam Search.
+        sampling = getattr(args, "sampling", False)
+        sampling_topk = getattr(args, "sampling_topk", -1)
+        sampling_topp = getattr(args, "sampling_topp", -1.0)
+        diverse_beam_groups = getattr(args, "diverse_beam_groups", -1)
+        diverse_beam_strength = getattr(args, "diverse_beam_strength", 0.5)
+        match_source_len = getattr(args, "match_source_len", False)
+        diversity_rate = getattr(args, "diversity_rate", -1)
+        if (
+            sum(
+                int(cond)
+                for cond in [
+                    sampling,
+                    diverse_beam_groups > 0,
+                    match_source_len,
+                    diversity_rate > 0,
+                ]
+            )
+            > 1
+        ):
+            raise ValueError("Provided Search parameters are mutually exclusive.")
+        assert sampling_topk < 0 or sampling, "--sampling-topk requires --sampling"
+        assert sampling_topp < 0 or sampling, "--sampling-topp requires --sampling"
+
+        if sampling:
+            search_strategy = search.Sampling(
+                self.target_dictionary, sampling_topk, sampling_topp
+            )
+        elif diverse_beam_groups > 0:
+            search_strategy = search.DiverseBeamSearch(
+                self.target_dictionary, diverse_beam_groups, diverse_beam_strength
+            )
+        elif match_source_len:
+            # this is useful for tagging applications where the output
+            # length should match the input length, so we hardcode the
+            # length constraints for simplicity
+            search_strategy = search.LengthConstrainedBeamSearch(
+                self.target_dictionary,
+                min_len_a=1,
+                min_len_b=0,
+                max_len_a=1,
+                max_len_b=0,
+            )
+        elif diversity_rate > -1:
+            search_strategy = search.DiverseSiblingsSearch(
+                self.target_dictionary, diversity_rate
+            )
+        else:
+            search_strategy = search.BeamSearch(self.target_dictionary)
+
+        if seq_gen_cls is None:
+            if getattr(args, "print_alignment", False):
+                seq_gen_cls = SequenceGeneratorWithAlignment
+            else:
+                seq_gen_cls = SequenceGenerator
+        extra_gen_cls_kwargs = extra_gen_cls_kwargs or {}
+        return seq_gen_cls(
+            models,
+            self.target_dictionary,
+            beam_size=getattr(args, "beam", 5),
+            max_len_a=getattr(args, "max_len_a", 0),
+            max_len_b=getattr(args, "max_len_b", 200),
+            min_len=getattr(args, "min_len", 1),
+            normalize_scores=(not getattr(args, "unnormalized", False)),
+            len_penalty=getattr(args, "lenpen", 1),
+            unk_penalty=getattr(args, "unkpen", 0),
+            temperature=getattr(args, "temperature", 1.0),
+            match_source_len=getattr(args, "match_source_len", False),
+            no_repeat_ngram_size=getattr(args, "no_repeat_ngram_size", 0),
+            search_strategy=search_strategy,
+            **extra_gen_cls_kwargs,
+        )
+
+    def train_step(
+        self, sample, model, criterion, optimizer, update_num, ignore_grad=False
+    ):
+        """
+        Do forward and backward, and return the loss as computed by *criterion*
+        for the given *model* and *sample*.
+
+        Args:
+            sample (dict): the mini-batch. The format is defined by the
+                :class:`~fairseq.data.FairseqDataset`.
+            model (~fairseq.models.BaseFairseqModel): the model
+            criterion (~fairseq.criterions.FairseqCriterion): the criterion
+            optimizer (~fairseq.optim.FairseqOptimizer): the optimizer
+            update_num (int): the current update
+            ignore_grad (bool): multiply loss by 0 if this is set to True
+
+        Returns:
+            tuple:
+                - the loss
+                - the sample size, which is used as the denominator for the
+                  gradient
+                - logging outputs to display while training
+        """
+        model.train()
+        model.set_num_updates(update_num)
+        with torch.autograd.profiler.record_function("forward"):
+            loss, sample_size, logging_output = criterion(model, sample)
+        if ignore_grad:
+            loss *= 0
+        with torch.autograd.profiler.record_function("backward"):
+            optimizer.backward(loss)
+        return loss, sample_size, logging_output
+
+    def valid_step(self, sample, model, criterion):
+        model.eval()
+        with torch.no_grad():
+            loss, sample_size, logging_output = criterion(model, sample)
+        return loss, sample_size, logging_output
+
+    def inference_step(self, generator, models, sample, prefix_tokens=None):
+        with torch.no_grad():
+            return generator.generate(models, sample, prefix_tokens=prefix_tokens)
+
+    def begin_epoch(self, epoch, model):
+        """Hook function called before the start of each epoch."""
+        pass
+
+    def aggregate_logging_outputs(self, logging_outputs, criterion):
+        """[deprecated] Aggregate logging outputs from data parallel training."""
+        utils.deprecation_warning(
+            "The aggregate_logging_outputs API is deprecated. "
+            "Please use the reduce_metrics API instead."
+        )
+        with metrics.aggregate() as agg:
+            self.reduce_metrics(logging_outputs, criterion)
+            return agg.get_smoothed_values()
+
+    def reduce_metrics(self, logging_outputs, criterion):
+        """Aggregate logging outputs from data parallel training."""
+        # backward compatibility for tasks that override aggregate_logging_outputs
+        base_func = FairseqTask.aggregate_logging_outputs
+        self_func = getattr(self, "aggregate_logging_outputs").__func__
+        if self_func is not base_func:
+            utils.deprecation_warning(
+                "Tasks should implement the reduce_metrics API. "
+                "Falling back to deprecated aggregate_logging_outputs API."
+            )
+            agg_logging_outputs = self.aggregate_logging_outputs(
+                logging_outputs, criterion
+            )
+            for k, v in agg_logging_outputs.items():
+                metrics.log_scalar(k, v)
+            return
+
+        if not any("ntokens" in log for log in logging_outputs):
+            warnings.warn(
+                "ntokens not found in Criterion logging outputs, cannot log wpb or wps"
+            )
+        else:
+            ntokens = sum(log.get("ntokens", 0) for log in logging_outputs)
+            metrics.log_scalar("wpb", ntokens, priority=180, round=1)
+            metrics.log_speed("wps", ntokens, priority=90, round=1)
+
+        if not any("nsentences" in log for log in logging_outputs):
+            warnings.warn(
+                "nsentences not found in Criterion logging outputs, cannot log bsz"
+            )
+        else:
+            nsentences = sum(log.get("nsentences", 0) for log in logging_outputs)
+            metrics.log_scalar("bsz", nsentences, priority=190, round=1)
+
+        criterion.__class__.reduce_metrics(logging_outputs)
+
+    def max_positions(self):
+        """Return the max input length allowed by the task."""
+        return None
+
+    @property
+    def source_dictionary(self):
+        """Return the source :class:`~fairseq.data.Dictionary` (if applicable
+        for this task)."""
+        raise NotImplementedError
+
+    @property
+    def target_dictionary(self):
+        """Return the target :class:`~fairseq.data.Dictionary` (if applicable
+        for this task)."""
+        raise NotImplementedError
diff --git a/fairseq/tasks/language_modeling.py b/fairseq/tasks/language_modeling.py
new file mode 100644
index 0000000000000000000000000000000000000000..a4a98e07bc5b3e9d21cbacf24815e0771b348456
--- /dev/null
+++ b/fairseq/tasks/language_modeling.py
@@ -0,0 +1,290 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import os
+
+import numpy as np
+import torch
+
+from fairseq import utils
+from fairseq.data import (
+    AppendTokenDataset,
+    data_utils,
+    Dictionary,
+    IdDataset,
+    MonolingualDataset,
+    NestedDictionaryDataset,
+    NumelDataset,
+    PadDataset,
+    PrependTokenDataset,
+    StripTokenDataset,
+    TokenBlockDataset,
+    TransformEosDataset,
+    TruncatedDictionary,
+)
+from fairseq.data.shorten_dataset import maybe_shorten_dataset
+from fairseq.tasks import FairseqTask, register_task
+
+
+logger = logging.getLogger(__name__)
+
+
+@register_task("language_modeling")
+class LanguageModelingTask(FairseqTask):
+    """
+    Train a language model.
+
+    Args:
+        dictionary (~fairseq.data.Dictionary): the dictionary for the input of
+            the language model
+        output_dictionary (~fairseq.data.Dictionary): the dictionary for the
+            output of the language model. In most cases it will be the same as
+            *dictionary*, but could possibly be a more limited version of the
+            dictionary (if ``--output-dictionary-size`` is used).
+        targets (List[str]): list of the target types that the language model
+            should predict.  Each entry is one of "self", "future" or "past".
+            Defaults to ``["future"]``.
+
+    .. note::
+
+        The language modeling task is compatible with :mod:`fairseq-train`,
+        :mod:`fairseq-generate`, :mod:`fairseq-interactive` and
+        :mod:`fairseq-eval-lm`.
+
+    The language modeling task provides the following additional command-line
+    arguments:
+
+    .. argparse::
+        :ref: fairseq.tasks.language_modeling_parser
+        :prog:
+    """
+
+    @staticmethod
+    def add_args(parser):
+        """Add task-specific arguments to the parser."""
+        # fmt: off
+        parser.add_argument('data', help='path to data directory')
+        parser.add_argument('--sample-break-mode', default='none',
+                            choices=['none', 'complete', 'complete_doc', 'eos'],
+                            help='If omitted or "none", fills each sample with tokens-per-sample '
+                                 'tokens. If set to "complete", splits samples only at the end '
+                                 'of sentence, but may include multiple sentences per sample. '
+                                 '"complete_doc" is similar but respects doc boundaries. '
+                                 'If set to "eos", includes only one sentence per sample.')
+        parser.add_argument('--tokens-per-sample', default=1024, type=int,
+                            help='max number of tokens per sample for LM dataset')
+        parser.add_argument('--output-dictionary-size', default=-1, type=int,
+                            help='limit the size of output dictionary')
+        parser.add_argument('--self-target', action='store_true',
+                            help='include self target')
+        parser.add_argument('--future-target', action='store_true',
+                            help='include future target')
+        parser.add_argument('--past-target', action='store_true',
+                            help='include past target')
+        parser.add_argument('--add-bos-token', action='store_true',
+                            help='prepend beginning of sentence token (<s>)')
+        parser.add_argument('--max-target-positions', type=int, metavar='N',
+                            help='max number of tokens in the target sequence')
+        parser.add_argument('--shorten-method', default='none',
+                            choices=['none', 'truncate', 'random_crop'],
+                            help='if not none, shorten sequences that exceed --tokens-per-sample')
+        parser.add_argument('--shorten-data-split-list', default='',
+                            help='comma-separated list of dataset splits to apply shortening to, '
+                                 'e.g., "train,valid" (default: all dataset splits)')
+        # fmt: on
+
+    def __init__(self, args, dictionary, output_dictionary=None, targets=None):
+        super().__init__(args)
+        self.dictionary = dictionary
+        self.output_dictionary = output_dictionary or dictionary
+
+        if targets is None:
+            targets = ["future"]
+        self.targets = targets
+
+    @classmethod
+    def setup_dictionary(cls, args, **kwargs):
+        dictionary = None
+        output_dictionary = None
+        if args.data:
+            paths = utils.split_paths(args.data)
+            assert len(paths) > 0
+            dictionary = Dictionary.load(os.path.join(paths[0], "dict.txt"))
+            logger.info("dictionary: {} types".format(len(dictionary)))
+            output_dictionary = dictionary
+            if args.output_dictionary_size >= 0:
+                output_dictionary = TruncatedDictionary(
+                    dictionary, args.output_dictionary_size
+                )
+        return (dictionary, output_dictionary)
+
+    @classmethod
+    def setup_task(cls, args, **kwargs):
+        """Setup the task (e.g., load dictionaries).
+
+        Args:
+            args (argparse.Namespace): parsed command-line arguments
+        """
+        dictionary, output_dictionary = cls.setup_dictionary(args, **kwargs)
+
+        # upgrade old checkpoints
+        if hasattr(args, "exclude_self_target"):
+            args.self_target = not args.exclude_self_target
+
+        targets = []
+        if getattr(args, "self_target", False):
+            targets.append("self")
+        if getattr(args, "future_target", False):
+            targets.append("future")
+        if getattr(args, "past_target", False):
+            targets.append("past")
+        if len(targets) == 0:
+            # standard language modeling
+            targets = ["future"]
+
+        return cls(args, dictionary, output_dictionary, targets=targets)
+
+    def build_model(self, args):
+        model = super().build_model(args)
+
+        for target in self.targets:
+            if target not in model.supported_targets:
+                raise ValueError(
+                    "Unsupported language modeling target: {}".format(target)
+                )
+
+        return model
+
+    def load_dataset(self, split, epoch=1, combine=False, **kwargs):
+        """Load a given dataset split.
+
+        Args:
+            split (str): name of the split (e.g., train, valid, test)
+        """
+        paths = utils.split_paths(self.args.data)
+        assert len(paths) > 0
+
+        data_path = paths[(epoch - 1) % len(paths)]
+        split_path = os.path.join(data_path, split)
+
+        dataset = data_utils.load_indexed_dataset(
+            split_path, self.dictionary, self.args.dataset_impl, combine=combine
+        )
+        if dataset is None:
+            raise FileNotFoundError(
+                "Dataset not found: {} ({})".format(split, split_path)
+            )
+
+        dataset = maybe_shorten_dataset(
+            dataset,
+            split,
+            self.args.shorten_data_split_list,
+            self.args.shorten_method,
+            self.args.tokens_per_sample,
+            self.args.seed,
+        )
+
+        dataset = TokenBlockDataset(
+            dataset,
+            dataset.sizes,
+            self.args.tokens_per_sample,
+            pad=self.dictionary.pad(),
+            eos=self.dictionary.eos(),
+            break_mode=self.args.sample_break_mode,
+            include_targets=True,
+        )
+
+        add_eos_for_other_targets = (
+            self.args.sample_break_mode is not None
+            and self.args.sample_break_mode != "none"
+        )
+
+        self.datasets[split] = self._initialize_dataset(
+            dataset=dataset,
+            sizes=dataset.sizes,
+            src_vocab=self.dictionary,
+            tgt_vocab=self.output_dictionary,
+            add_eos_for_other_targets=add_eos_for_other_targets,
+            shuffle=True,
+            targets=self.targets,
+            add_bos_token=self.args.add_bos_token,
+        )
+
+    def _initialize_dataset(self, **kwargs):
+        return MonolingualDataset(**kwargs)
+
+    def build_dataset_for_inference(self, src_tokens, src_lengths, **kwargs):
+        """
+        Generate batches for inference. We prepend an eos token to src_tokens
+        (or bos if `--add-bos-token` is set) and we append a <pad> to target.
+        This is convenient both for generation with a prefix and LM scoring.
+        """
+        dataset = StripTokenDataset(
+            TokenBlockDataset(
+                src_tokens,
+                src_lengths,
+                block_size=None,  # ignored for "eos" break mode
+                pad=self.source_dictionary.pad(),
+                eos=self.source_dictionary.eos(),
+                break_mode="eos",
+            ),
+            # remove eos from (end of) target sequence
+            self.source_dictionary.eos(),
+        )
+        src_dataset = PrependTokenDataset(
+            dataset,
+            token=(
+                self.source_dictionary.bos()
+                if getattr(self.args, "add_bos_token", False)
+                else self.source_dictionary.eos()
+            ),
+        )
+        tgt_dataset = AppendTokenDataset(
+            dataset,
+            token=self.source_dictionary.pad()
+        )
+        return NestedDictionaryDataset(
+            {
+                "id": IdDataset(),
+                "net_input": {
+                    "src_tokens": PadDataset(src_dataset, pad_idx=self.source_dictionary.pad(), left_pad=False),
+                    "src_lengths": NumelDataset(src_dataset, reduce=False),
+                },
+                "target": PadDataset(tgt_dataset, pad_idx=self.source_dictionary.pad(), left_pad=False),
+            },
+            sizes=[np.array(src_lengths)],
+        )
+
+    def inference_step(self, generator, models, sample, prefix_tokens=None):
+        with torch.no_grad():
+            # Generation will always be conditioned on bos_token
+            if getattr(self.args, "add_bos_token", False):
+                bos_token = self.source_dictionary.bos()
+            else:
+                bos_token = self.source_dictionary.eos()
+
+            # SequenceGenerator doesn't use src_tokens directly, we need to
+            # pass the `prefix_tokens` argument instead
+            if prefix_tokens is None and sample["net_input"]["src_tokens"].nelement():
+                prefix_tokens = sample["net_input"]["src_tokens"]
+                if prefix_tokens[:, 0].eq(bos_token).all():
+                    prefix_tokens = prefix_tokens[:, 1:]
+
+            return generator.generate(
+                models, sample, prefix_tokens=prefix_tokens, bos_token=bos_token,
+            )
+
+    @property
+    def source_dictionary(self):
+        """Return the :class:`~fairseq.data.Dictionary` for the language
+        model."""
+        return self.dictionary
+
+    @property
+    def target_dictionary(self):
+        """Return the output :class:`~fairseq.data.Dictionary` for the
+        language model."""
+        return self.output_dictionary
diff --git a/fairseq/tasks/legacy_masked_lm.py b/fairseq/tasks/legacy_masked_lm.py
new file mode 100644
index 0000000000000000000000000000000000000000..40e27249539b4ff4b2cc9035cf442234733d9d43
--- /dev/null
+++ b/fairseq/tasks/legacy_masked_lm.py
@@ -0,0 +1,144 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import itertools
+import logging
+import os
+
+import numpy as np
+
+from fairseq import tokenizer
+from fairseq.data import (
+    ConcatDataset,
+    indexed_dataset,
+    data_utils,
+)
+
+from fairseq.data import Dictionary
+from fairseq.data.legacy.block_pair_dataset import BlockPairDataset
+from fairseq.data.legacy.masked_lm_dataset import MaskedLMDataset
+from fairseq.data.legacy.masked_lm_dictionary import BertDictionary
+from fairseq.tasks import FairseqTask, register_task
+from fairseq import utils
+
+
+logger = logging.getLogger(__name__)
+
+
+@register_task('legacy_masked_lm')
+class LegacyMaskedLMTask(FairseqTask):
+    """
+    Task for training Masked LM (BERT) model.
+    Args:
+        dictionary (Dictionary): the dictionary for the input of the task
+    """
+
+    @staticmethod
+    def add_args(parser):
+        """Add task-specific arguments to the parser (data paths, sample size, break mode, shuffling)."""
+        parser.add_argument('data', help='colon separated path to data directories list, \
+                            will be iterated upon during epochs in round-robin manner')
+        parser.add_argument('--tokens-per-sample', default=512, type=int,
+                            help='max number of total tokens over all segments'
+                                 ' per sample for BERT dataset')
+        parser.add_argument('--break-mode', default="doc", type=str, help='mode for breaking sentence')  # forwarded to BlockPairDataset(break_mode=...)
+        parser.add_argument('--shuffle-dataset', action='store_true', default=False)  # forwarded to MaskedLMDataset(shuffle=...)
+
+    def __init__(self, args, dictionary):
+        super().__init__(args)
+        self.dictionary = dictionary  # vocabulary handed to the datasets built in load_dataset
+        self.seed = args.seed  # base seed for numpy_seed() and MaskedLMDataset in load_dataset
+
+    @classmethod
+    def load_dictionary(cls, filename):
+        return BertDictionary.load(filename)  # same dictionary class that setup_task loads from dict.txt
+
+    @classmethod
+    def build_dictionary(cls, filenames, workers=1, threshold=-1, nwords=-1, padding_factor=8):
+        vocab = BertDictionary()  # start from a fresh BERT dictionary
+        for path in filenames:  # feed every file in, split with tokenizer.tokenize_line
+            Dictionary.add_file_to_dictionary(path, vocab, tokenizer.tokenize_line, workers)
+        vocab.finalize(threshold=threshold, nwords=nwords, padding_factor=padding_factor)
+        return vocab
+
+    @property
+    def target_dictionary(self):
+        return self.dictionary  # targets use the same dictionary as the task input
+
+    @classmethod
+    def setup_task(cls, args, **kwargs):
+        """Build the task from the ``dict.txt`` found in the first path of ``args.data``."""
+        data_dirs = utils.split_paths(args.data)
+        assert len(data_dirs) > 0
+        # Only the first directory is consulted for the vocabulary.
+        vocab_file = os.path.join(data_dirs[0], 'dict.txt')
+        vocab = BertDictionary.load(vocab_file)
+        logger.info('dictionary: {} types'.format(len(vocab)))
+        return cls(args, vocab)
+
+    def load_dataset(self, split, epoch=1, combine=False):
+        """Load ``split`` into ``self.datasets`` as a MaskedLMDataset.
+
+        ``epoch`` picks the data directory round-robin; ``combine`` also loads
+        ``<split>1``, ``<split>2``, ...; FileNotFoundError if ``<split>`` is missing.
+        """
+        loaded_datasets = []
+
+        paths = utils.split_paths(self.args.data)
+        assert len(paths) > 0
+        data_path = paths[(epoch - 1) % len(paths)]
+        logger.info("data_path: {}".format(data_path))
+
+        for k in itertools.count():
+            split_k = split + (str(k) if k > 0 else '')
+            path = os.path.join(data_path, split_k)
+            ds = indexed_dataset.make_dataset(
+                path,
+                impl=self.args.dataset_impl,
+                fix_lua_indexing=True,
+                dictionary=self.dictionary,
+            )
+
+            if ds is None:
+                if k > 0:
+                    break
+                else:
+                    raise FileNotFoundError('Dataset not found: {} ({})'.format(split, data_path))
+
+            with data_utils.numpy_seed(self.seed + k):
+                loaded_datasets.append(
+                    BlockPairDataset(
+                        ds,
+                        self.dictionary,
+                        ds.sizes,
+                        self.args.tokens_per_sample,
+                        break_mode=self.args.break_mode,
+                        doc_break_size=1,
+                    )
+                )
+
+            logger.info('{} {} {} examples'.format(data_path, split_k, len(loaded_datasets[-1])))
+
+            if not combine:
+                break
+
+        if len(loaded_datasets) == 1:
+            dataset = loaded_datasets[0]
+            sizes = dataset.sizes
+        else:
+            dataset = ConcatDataset(loaded_datasets)
+            sizes = np.concatenate([ds.sizes for ds in loaded_datasets])
+
+        self.datasets[split] = MaskedLMDataset(
+            dataset=dataset,
+            sizes=sizes,
+            vocab=self.dictionary,
+            pad_idx=self.dictionary.pad(),
+            mask_idx=self.dictionary.mask(),
+            classif_token_idx=self.dictionary.cls(),
+            sep_token_idx=self.dictionary.sep(),
+            shuffle=self.args.shuffle_dataset,
+            seed=self.seed,
+        )
diff --git a/fairseq/tasks/masked_lm.py b/fairseq/tasks/masked_lm.py
new file mode 100644
index 0000000000000000000000000000000000000000..4d7ea54b644d1a3b18b32eb186b57fafb8e86e06
--- /dev/null
+++ b/fairseq/tasks/masked_lm.py
@@ -0,0 +1,210 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import os
+
+import numpy as np
+
+from fairseq.data import (
+    data_utils,
+    Dictionary,
+    IdDataset,
+    MaskTokensDataset,
+    NestedDictionaryDataset,
+    NumelDataset,
+    NumSamplesDataset,
+    PadDataset,
+    PrependTokenDataset,
+    SortDataset,
+    TokenBlockDataset,
+)
+from fairseq.data.shorten_dataset import maybe_shorten_dataset
+from fairseq.tasks import FairseqTask, register_task
+from fairseq.data.encoders.utils import get_whole_word_mask
+from fairseq import utils
+
+
+logger = logging.getLogger(__name__)
+
+
+@register_task('masked_lm')
+class MaskedLMTask(FairseqTask):
+    """Task for training masked language models (e.g., BERT, RoBERTa)."""
+
+    @staticmethod
+    def add_args(parser):
+        """Add task-specific arguments (data, sample breaking, masking, shortening) to the parser."""
+        parser.add_argument('data', help='colon separated path to data directories list, \
+                            will be iterated upon during epochs in round-robin manner')
+        parser.add_argument('--sample-break-mode', default='complete',
+                            choices=['none', 'complete', 'complete_doc', 'eos'],
+                            help='If set to "none", fills each sample with tokens-per-sample '
+                                 'tokens. If set to "complete", splits samples only at the end '
+                                 'of sentence, but may include multiple sentences per sample. '
+                                 '"complete_doc" is similar but respects doc boundaries. '
+                                 'If set to "eos", includes only one sentence per sample.')
+        parser.add_argument('--tokens-per-sample', default=512, type=int,
+                            help='max number of total tokens over all segments '
+                                 'per sample for BERT dataset')
+        parser.add_argument('--mask-prob', default=0.15, type=float,
+                            help='probability of replacing a token with mask')
+        parser.add_argument('--leave-unmasked-prob', default=0.1, type=float,
+                            help='probability that a masked token is unmasked')
+        parser.add_argument('--random-token-prob', default=0.1, type=float,
+                            help='probability of replacing a token with a random token')
+        parser.add_argument('--freq-weighted-replacement', default=False, action='store_true',
+                            help='sample random replacement words based on word frequencies')
+        parser.add_argument('--mask-whole-words', default=False, action='store_true',
+                            help='mask whole words; you may also want to set --bpe')
+        parser.add_argument('--shorten-method', default='none',
+                            choices=['none', 'truncate', 'random_crop'],
+                            help='if not none, shorten sequences that exceed --tokens-per-sample')
+        parser.add_argument('--shorten-data-split-list', default='',
+                            help='comma-separated list of dataset splits to apply shortening to, '
+                                 'e.g., "train,valid" (default: all dataset splits)')
+
+    def __init__(self, args, dictionary):
+        """Keep the dictionary and seed, and register ``<mask>`` with the dictionary."""
+        super().__init__(args)
+        self.seed = args.seed
+        self.dictionary = dictionary
+        # Keep the index returned by add_symbol; load_dataset passes it on as mask_idx.
+        self.mask_idx = dictionary.add_symbol('<mask>')
+
+    @classmethod
+    def setup_task(cls, args, **kwargs):
+        paths = utils.split_paths(args.data)  # args.data may list several data directories
+        assert len(paths) > 0
+        dictionary = Dictionary.load(os.path.join(paths[0], 'dict.txt'))  # only the first directory's dict.txt is read
+        logger.info('dictionary: {} types'.format(len(dictionary)))
+        return cls(args, dictionary)  # **kwargs is accepted but not used here
+
+    def load_dataset(self, split, epoch=1, combine=False, **kwargs):
+        """Load a given dataset split and store it in ``self.datasets[split]``.
+
+        ``epoch`` picks the data directory round-robin and seeds the shuffle;
+        ``combine`` is forwarded to ``data_utils.load_indexed_dataset``.
+        """
+        paths = utils.split_paths(self.args.data)
+        assert len(paths) > 0
+        data_path = paths[(epoch - 1) % len(paths)]
+        split_path = os.path.join(data_path, split)
+
+        dataset = data_utils.load_indexed_dataset(
+            split_path,
+            self.source_dictionary,
+            self.args.dataset_impl,
+            combine=combine,
+        )
+        if dataset is None:
+            raise FileNotFoundError('Dataset not found: {} ({})'.format(split, split_path))
+
+        dataset = maybe_shorten_dataset(
+            dataset,
+            split,
+            self.args.shorten_data_split_list,
+            self.args.shorten_method,
+            self.args.tokens_per_sample,
+            self.args.seed,
+        )
+
+        # create continuous blocks of tokens
+        dataset = TokenBlockDataset(
+            dataset,
+            dataset.sizes,
+            self.args.tokens_per_sample - 1,  # one less for <s>
+            pad=self.source_dictionary.pad(),
+            eos=self.source_dictionary.eos(),
+            break_mode=self.args.sample_break_mode,
+        )
+        logger.info('loaded {} blocks from: {}'.format(len(dataset), split_path))
+
+        # prepend beginning-of-sentence token (<s>, equiv. to [CLS] in BERT)
+        dataset = PrependTokenDataset(dataset, self.source_dictionary.bos())
+
+        # create masked input and targets; the whole-word mask is only built on request
+        mask_whole_words = get_whole_word_mask(self.args, self.source_dictionary) \
+            if self.args.mask_whole_words else None
+
+        src_dataset, tgt_dataset = MaskTokensDataset.apply_mask(
+            dataset,
+            self.source_dictionary,
+            pad_idx=self.source_dictionary.pad(),
+            mask_idx=self.mask_idx,
+            seed=self.args.seed,
+            mask_prob=self.args.mask_prob,
+            leave_unmasked_prob=self.args.leave_unmasked_prob,
+            random_token_prob=self.args.random_token_prob,
+            freq_weighted_replacement=self.args.freq_weighted_replacement,
+            mask_whole_words=mask_whole_words,
+        )
+
+        # the permutation depends on the epoch, so each epoch gets its own order
+        with data_utils.numpy_seed(self.args.seed + epoch):
+            shuffle = np.random.permutation(len(src_dataset))
+
+        self.datasets[split] = SortDataset(
+            NestedDictionaryDataset(
+                {
+                    'id': IdDataset(),
+                    'net_input': {
+                        'src_tokens': PadDataset(
+                            src_dataset,
+                            pad_idx=self.source_dictionary.pad(),
+                            left_pad=False,
+                        ),
+                        'src_lengths': NumelDataset(src_dataset, reduce=False),
+                    },
+                    'target': PadDataset(
+                        tgt_dataset,
+                        pad_idx=self.source_dictionary.pad(),
+                        left_pad=False,
+                    ),
+                    'nsentences': NumSamplesDataset(),
+                    'ntokens': NumelDataset(src_dataset, reduce=True),
+                },
+                sizes=[src_dataset.sizes],
+            ),
+            sort_order=[  # presumably size-ordered with the permutation breaking ties — confirm in SortDataset
+                shuffle,
+                src_dataset.sizes,
+            ],
+        )
+
+    def build_dataset_for_inference(self, src_tokens, src_lengths, sort=True):
+        """Build the nested inference dataset from ``src_tokens`` and ``src_lengths``."""
+        vocab = self.source_dictionary
+
+        # One sentence per block (break_mode='eos'), then padding and a leading <s>.
+        blocks = TokenBlockDataset(
+            src_tokens,
+            src_lengths,
+            self.args.tokens_per_sample - 1,  # one less for <s>
+            pad=vocab.pad(),
+            eos=vocab.eos(),
+            break_mode='eos',
+        )
+        padded = PadDataset(blocks, pad_idx=vocab.pad(), left_pad=False)
+        tokens = PrependTokenDataset(padded, vocab.bos())
+
+        net_input = {
+            'src_tokens': tokens,
+            'src_lengths': NumelDataset(tokens, reduce=False),
+        }
+        batch = NestedDictionaryDataset(
+            {'id': IdDataset(), 'net_input': net_input},
+            sizes=src_lengths,
+        )
+        if not sort:
+            return batch
+        return SortDataset(batch, sort_order=[src_lengths])
+
+    @property
+    def source_dictionary(self):
+        return self.dictionary  # one dictionary serves as both source and target vocabulary
+
+    @property
+    def target_dictionary(self):
+        return self.dictionary  # same object that source_dictionary returns
diff --git a/fairseq/tasks/multilingual_denoising.py b/fairseq/tasks/multilingual_denoising.py
new file mode 100644
index 0000000000000000000000000000000000000000..18ee717fff0b84154dff36f44dd8f6eb33f96241
--- /dev/null
+++ b/fairseq/tasks/multilingual_denoising.py
@@ -0,0 +1,225 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import os
+
+import numpy as np
+
+from fairseq.data import (
+    data_utils,
+    Dictionary,
+    AppendTokenDataset,
+    ConcatDataset,
+    DenoisingDataset,
+    PrependTokenDataset,
+    ResamplingDataset,
+    SortDataset,
+    TokenBlockDataset,
+)
+from .denoising import DenoisingTask
+from fairseq.data.encoders.utils import get_whole_word_mask
+from fairseq.tasks import register_task
+
+
+logger = logging.getLogger(__name__)
+
+
+@register_task('multilingual_denoising')
+class MultilingualDenoisingTask(DenoisingTask):
+
+    @staticmethod
+    def add_args(parser):
+        DenoisingTask.add_args(parser)  # parent task's options first, then the multilingual ones below
+        parser.add_argument('--multilang-sampling-alpha', type=float, default=1.0,
+                            help='smoothing alpha for sample ratios across multiple datasets')
+        parser.add_argument('--add-lang-token', default=False, action='store_true')  # setup_task adds one '[lang]' symbol per language
+        parser.add_argument('--langs', type=str, help="language ids we are considering", default=None)  # comma-separated; None = every sub-directory of the data path
+        parser.add_argument('--no-whole-word-mask-langs', type=str, default='', metavar='N',
+                            help='languages without spacing between words dont support whole word masking')  # split on ',' in load_dataset
+
+    @classmethod
+    def setup_task(cls, args, **kwargs):
+        """Load ``dict.txt`` from the first data path, optionally add one
+        ``[lang]`` symbol per language, and instantiate the task."""
+        paths = args.data.split(':')
+        assert len(paths) > 0
+        data_path = paths[0]
+        vocab = Dictionary.load(os.path.join(data_path, 'dict.txt'))
+
+        # Without --langs, every sub-directory of the data path is a language.
+        if args.langs is not None:
+            languages = args.langs.split(',')
+        else:
+            languages = sorted(
+                entry for entry in os.listdir(data_path)
+                if os.path.isdir(os.path.join(data_path, entry))
+            )
+
+        for lang in (languages if args.add_lang_token else []):
+            vocab.add_symbol('[{}]'.format(lang))
+
+        logger.info("dictionary: {} types".format(len(vocab)))
+        if not hasattr(args, 'shuffle_instance'):
+            args.shuffle_instance = False
+        return cls(args, vocab)
+
+    def __init__(self, args, dictionary):
+        super().__init__(args, dictionary)
+        self.dictionary = dictionary  # NOTE(review): parent __init__ presumably sets this too — confirm
+        self.seed = args.seed
+
+        # add mask token; the index returned by add_symbol is later passed to DenoisingDataset
+        self.mask_idx = self.dictionary.add_symbol('<mask>')
+        self.langs = args.langs  # comma-separated string or None (see load_dataset)
+        self.args = args
+
+    def _get_sample_prob(self, dataset_lens):
+        """
+        Return per-language sampling probabilities: each language's share of
+        the data raised to ``multilang_sampling_alpha``, renormalised to sum to 1.
+        """
+        share = dataset_lens / dataset_lens.sum()
+        tempered = share ** self.args.multilang_sampling_alpha
+        normaliser = tempered.sum()
+        return tempered / normaliser
+
+    def load_dataset(self, split, epoch=1, combine=False, **kwargs):
+        """Load ``split`` for every language and store the shuffled concatenation.
+
+        The train subset is resampled per language (see ``_get_sample_prob``);
+        other splits are additionally registered as ``<split>_<lang>``.
+        """
+        paths = self.args.data.split(':')
+        assert len(paths) > 0
+        data_path = paths[(epoch - 1) % len(paths)]
+        # Per-language split paths are built inside the loop further down.
+
+        if self.langs is None:
+            languages = sorted([
+                name for name in os.listdir(data_path)
+                if os.path.isdir(os.path.join(data_path, name))
+            ])
+        else:
+            languages = self.langs.split(',')
+            for name in languages:
+                p = os.path.join(data_path, name)
+                assert os.path.exists(p), "data not found: {}".format(p)
+
+        logger.info("Training on {0} languages: {1}".format(len(languages), languages))
+        # Format the mapping into the message itself; passed as a separate
+        # logging argument it never shows up in the output.
+        mapping = {lang: idx for idx, lang in enumerate(languages)}
+        logger.info("Language to id mapping: {}".format(mapping))
+
+        mask_whole_words = get_whole_word_mask(self.args, self.dictionary)
+        language_without_segmentations = self.args.no_whole_word_mask_langs.split(',')
+        lang_datasets = []
+        for language in languages:
+            split_path = os.path.join(data_path, language, split)
+
+            dataset = data_utils.load_indexed_dataset(
+                split_path,
+                self.source_dictionary,
+                self.args.dataset_impl,
+                combine=combine,
+            )
+            if dataset is None:
+                raise FileNotFoundError('Dataset not found: {} ({})'.format(split, split_path))
+
+            end_token = self.source_dictionary.index('[{}]'.format(language)) \
+                if self.args.add_lang_token else self.source_dictionary.eos()
+
+            # create continuous blocks of tokens
+            dataset = TokenBlockDataset(
+                dataset,
+                dataset.sizes,
+                self.args.tokens_per_sample - 2,  # two less: <s> and end_token are added below
+                pad=self.source_dictionary.pad(),
+                eos=end_token,
+                break_mode=self.args.sample_break_mode,
+            )
+            logger.info('loaded {} blocks from: {}'.format(len(dataset), split_path))
+
+            # prepend beginning-of-sentence token (<s>, equiv. to [CLS] in BERT)
+            dataset = PrependTokenDataset(dataset, self.source_dictionary.bos())
+            dataset = AppendTokenDataset(dataset, end_token)
+
+            lang_mask_whole_words = mask_whole_words if language not in language_without_segmentations else None
+            lang_dataset = DenoisingDataset(
+                dataset,
+                dataset.sizes,
+                self.dictionary,
+                self.mask_idx,
+                lang_mask_whole_words,
+                shuffle=self.args.shuffle_instance,
+                seed=self.seed,
+                args=self.args,
+                eos=None if not self.args.add_lang_token else self.source_dictionary.index('[{}]'.format(language)),
+            )
+            lang_datasets.append(lang_dataset)
+
+        dataset_lengths = np.array(
+            [len(d) for d in lang_datasets],
+            dtype=float,
+        )
+        logger.info(
+            'loaded total {} blocks for all languages'.format(
+                int(dataset_lengths.sum()),
+            )
+        )
+        if split == self.args.train_subset:
+            # For train subset, additionally up or down sample languages.
+            sample_probs = self._get_sample_prob(dataset_lengths)
+            logger.info(
+                "Sample probability by language: {}".format({
+                    lang: "{0:.4f}".format(sample_probs[idx])
+                    for idx, lang in enumerate(languages)
+                })
+            )
+            size_ratio = (sample_probs * dataset_lengths.sum()) / dataset_lengths
+            logger.info(
+                "Up/Down Sampling ratio by language: {}".format({
+                    lang: "{0:.2f}".format(size_ratio[idx])
+                    for idx, lang in enumerate(languages)
+                })
+            )
+
+            resampled_lang_datasets = [
+                ResamplingDataset(
+                    lang_dataset,
+                    size_ratio=ratio,
+                    seed=self.args.seed,
+                    epoch=epoch,
+                    replace=ratio >= 1.0,
+                )
+                for lang_dataset, ratio in zip(lang_datasets, size_ratio)
+            ]
+            dataset = ConcatDataset(
+                resampled_lang_datasets,
+            )
+        else:
+            dataset = ConcatDataset(lang_datasets)
+            lang_splits = [split]
+            for lang_id, lang_dataset in enumerate(lang_datasets):
+                split_name = split + '_' + languages[lang_id]
+                lang_splits.append(split_name)
+                self.datasets[split_name] = lang_dataset
+
+            if split in self.args.valid_subset:
+                self.args.valid_subset = self.args.valid_subset.replace(
+                    split, ','.join(lang_splits)
+                )
+
+        with data_utils.numpy_seed(self.args.seed + epoch):
+            shuffle = np.random.permutation(len(dataset))
+
+        self.datasets[split] = SortDataset(
+            dataset,
+            sort_order=[
+                shuffle,
+                dataset.sizes,
+            ],
+        )
diff --git a/fairseq/tasks/multilingual_masked_lm.py b/fairseq/tasks/multilingual_masked_lm.py
new file mode 100644
index 0000000000000000000000000000000000000000..248724bd56a02f7a9d9093c2e6183e9623203565
--- /dev/null
+++ b/fairseq/tasks/multilingual_masked_lm.py
@@ -0,0 +1,317 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import os
+
+import numpy as np
+import torch
+
+from fairseq.data import (
+    data_utils,
+    Dictionary,
+    encoders,
+    ConcatDataset,
+    IdDataset,
+    MaskTokensDataset,
+    NestedDictionaryDataset,
+    NumelDataset,
+    NumSamplesDataset,
+    PadDataset,
+    PrependTokenDataset,
+    RawLabelDataset,
+    ResamplingDataset,
+    SortDataset,
+    TokenBlockDataset,
+)
+from fairseq.tasks import FairseqTask, register_task
+from fairseq import utils
+
+
+logger = logging.getLogger(__name__)
+
+
+@register_task('multilingual_masked_lm')
+class MultiLingualMaskedLMTask(FairseqTask):
+    """Task for training masked language models (e.g., BERT, RoBERTa)."""
+
+    @staticmethod
+    def add_args(parser):
+        """Add task-specific arguments (data, sample breaking, masking, language sampling) to the parser."""
+        parser.add_argument('data', help='colon separated path to data directories list, \
+                            will be iterated upon during epochs in round-robin manner')
+        parser.add_argument('--sample-break-mode', default='complete',
+                            choices=['none', 'complete', 'complete_doc', 'eos'],
+                            help='If set to "none", fills each sample with tokens-per-sample '
+                                 'tokens. If set to "complete", splits samples only at the end '
+                                 'of sentence, but may include multiple sentences per sample. '
+                                 '"complete_doc" is similar but respects doc boundaries. '
+                                 'If set to "eos", includes only one sentence per sample.')
+        parser.add_argument('--tokens-per-sample', default=512, type=int,
+                            help='max number of total tokens over all segments '
+                                 'per sample for BERT dataset')
+        parser.add_argument('--mask-prob', default=0.15, type=float,
+                            help='probability of replacing a token with mask')
+        parser.add_argument('--leave-unmasked-prob', default=0.1, type=float,
+                            help='probability that a masked token is unmasked')
+        parser.add_argument('--random-token-prob', default=0.1, type=float,
+                            help='probability of replacing a token with a random token')
+        parser.add_argument('--freq-weighted-replacement', action='store_true',
+                            help='sample random replacement words based on word frequencies')
+        parser.add_argument('--mask-whole-words', default=False, action='store_true',
+                            help='mask whole words; you may also want to set --bpe')
+        parser.add_argument('--multilang-sampling-alpha', type=float, default=1.0,
+                            help='smoothing alpha for sample ratios across multiple datasets')
+
+    def __init__(self, args, dictionary):
+        """Keep the dictionary and seed, and register ``<mask>`` with the dictionary."""
+        super().__init__(args)
+        self.seed = args.seed
+        self.dictionary = dictionary
+        # Keep the index returned by add_symbol; load_dataset passes it on as mask_idx.
+        self.mask_idx = dictionary.add_symbol('<mask>')
+
+    @classmethod
+    def setup_task(cls, args, **kwargs):
+        paths = utils.split_paths(args.data)  # args.data may list several data directories
+        assert len(paths) > 0
+        dictionary = Dictionary.load(os.path.join(paths[0], 'dict.txt'))  # only the first directory's dict.txt is read
+        logger.info('dictionary: {} types'.format(len(dictionary)))
+        return cls(args, dictionary)  # **kwargs is accepted but not used here
+
+    def _get_whole_word_mask(self):
+        """Return a ByteTensor marking word-initial dictionary entries, or ``None``
+        when whole-word masking is disabled or ``build_bpe`` returns ``None``."""
+        mask_whole_words = None  # also covers the "flag set but no BPE" case
+        if self.args.mask_whole_words:
+            bpe = encoders.build_bpe(self.args)
+            if bpe is not None:
+
+                def is_beginning_of_word(i):
+                    if i < self.source_dictionary.nspecial:
+                        # special elements are always considered beginnings
+                        return True
+                    tok = self.source_dictionary[i]
+                    if tok.startswith('madeupword'):
+                        return True
+                    try:
+                        return bpe.is_beginning_of_word(tok)
+                    except ValueError:
+                        return True
+
+                mask_whole_words = torch.ByteTensor(list(
+                    map(is_beginning_of_word, range(len(self.source_dictionary)))
+                ))
+        return mask_whole_words
+
+    def _get_sample_prob(self, dataset_lens):
+        """
+        Return per-language sampling probabilities: each language's share of
+        the data raised to ``multilang_sampling_alpha``, renormalised to sum to 1.
+        """
+        share = dataset_lens / dataset_lens.sum()
+        tempered = share ** self.args.multilang_sampling_alpha
+        normaliser = tempered.sum()
+        return tempered / normaliser
+
+    def load_dataset(self, split, epoch=1, combine=False, **kwargs):
+        """Load ``split`` for every language and store the shuffled concatenation.
+
+        The train subset is resampled per language (see ``_get_sample_prob``);
+        other splits are additionally registered as ``<split>_<lang>``.
+        """
+        paths = utils.split_paths(self.args.data)
+        assert len(paths) > 0
+        data_path = paths[(epoch - 1) % len(paths)]
+
+        languages = sorted(
+            name for name in os.listdir(data_path)
+            if os.path.isdir(os.path.join(data_path, name))
+        )
+
+        logger.info("Training on {0} languages: {1}".format(len(languages), languages))
+        # Format the mapping into the message itself; passed as a separate
+        # logging argument it never shows up in the output.
+        mapping = {lang: idx for idx, lang in enumerate(languages)}
+        logger.info("Language to id mapping: {}".format(mapping))
+
+        mask_whole_words = self._get_whole_word_mask()
+        lang_datasets = []
+        for lang_id, language in enumerate(languages):
+            split_path = os.path.join(data_path, language, split)
+
+            dataset = data_utils.load_indexed_dataset(
+                split_path,
+                self.source_dictionary,
+                self.args.dataset_impl,
+                combine=combine,
+            )
+            if dataset is None:
+                raise FileNotFoundError('Dataset not found: {} ({})'.format(split, split_path))
+
+            # create continuous blocks of tokens
+            dataset = TokenBlockDataset(
+                dataset,
+                dataset.sizes,
+                self.args.tokens_per_sample - 1,  # one less for <s>
+                pad=self.source_dictionary.pad(),
+                eos=self.source_dictionary.eos(),
+                break_mode=self.args.sample_break_mode,
+            )
+            logger.info('loaded {} blocks from: {}'.format(len(dataset), split_path))
+
+            # prepend beginning-of-sentence token (<s>, equiv. to [CLS] in BERT)
+            dataset = PrependTokenDataset(dataset, self.source_dictionary.bos())
+
+            src_dataset, tgt_dataset = MaskTokensDataset.apply_mask(
+                dataset,
+                self.source_dictionary,
+                pad_idx=self.source_dictionary.pad(),
+                mask_idx=self.mask_idx,
+                seed=self.args.seed,
+                mask_prob=self.args.mask_prob,
+                leave_unmasked_prob=self.args.leave_unmasked_prob,
+                random_token_prob=self.args.random_token_prob,
+                freq_weighted_replacement=self.args.freq_weighted_replacement,
+                mask_whole_words=mask_whole_words,
+            )
+
+            lang_dataset = NestedDictionaryDataset(
+                {
+                    'net_input': {
+                        'src_tokens': PadDataset(
+                            src_dataset,
+                            pad_idx=self.source_dictionary.pad(),
+                            left_pad=False,
+                        ),
+                        'src_lengths': NumelDataset(src_dataset, reduce=False),
+                    },
+                    'target': PadDataset(
+                        tgt_dataset,
+                        pad_idx=self.source_dictionary.pad(),
+                        left_pad=False,
+                    ),
+                    'nsentences': NumSamplesDataset(),
+                    'ntokens': NumelDataset(src_dataset, reduce=True),
+                    'lang_id': RawLabelDataset([lang_id] * src_dataset.sizes.shape[0]),
+                },
+                sizes=[src_dataset.sizes],
+            )
+            lang_datasets.append(lang_dataset)
+
+        # Blocks per language, kept as floats for the ratio arithmetic below.
+        dataset_lengths = np.array(
+            [len(d) for d in lang_datasets],
+            dtype=float,
+        )
+        logger.info(
+            'loaded total {} blocks for all languages'.format(
+                int(dataset_lengths.sum()),
+            )
+        )
+        if split == self.args.train_subset:
+            # For train subset, additionally up or down sample languages.
+            sample_probs = self._get_sample_prob(dataset_lengths)
+            prob_by_lang = {
+                lang: "{0:.4f}".format(sample_probs[idx])
+                for idx, lang in enumerate(languages)
+            }
+            logger.info("Sample probability by language: {}".format(prob_by_lang))
+            size_ratio = (sample_probs * dataset_lengths.sum()) / dataset_lengths
+            ratio_by_lang = {
+                lang: "{0:.2f}".format(size_ratio[idx])
+                for idx, lang in enumerate(languages)
+            }
+            logger.info("Up/Down Sampling ratio by language: {}".format(ratio_by_lang))
+
+            resampled_lang_datasets = [
+                ResamplingDataset(
+                    lang_dataset,
+                    size_ratio=ratio,
+                    seed=self.args.seed,
+                    epoch=epoch,
+                    replace=ratio >= 1.0,
+                )
+                for lang_dataset, ratio in zip(lang_datasets, size_ratio)
+            ]
+            dataset = ConcatDataset(resampled_lang_datasets)
+        else:
+            dataset = ConcatDataset(lang_datasets)
+            lang_splits = [split]
+            for lang_id, lang_dataset in enumerate(lang_datasets):
+                split_name = split + '_' + languages[lang_id]
+                lang_splits.append(split_name)
+                self.datasets[split_name] = lang_dataset
+
+            # [TODO]: This is hacky for now to print validation ppl for each
+            # language individually. Maybe need task API changes to allow it
+            # in more generic ways.
+            if split in self.args.valid_subset:
+                self.args.valid_subset = self.args.valid_subset.replace(
+                    split, ','.join(lang_splits)
+                )
+
+        with data_utils.numpy_seed(self.args.seed + epoch):
+            shuffle = np.random.permutation(len(dataset))
+
+        self.datasets[split] = SortDataset(
+            dataset,
+            sort_order=[
+                shuffle,
+                dataset.sizes,
+            ],
+        )
+
+    def build_dataset_for_inference(self, src_tokens, src_lengths, sort=True):
+        """Build the dataset structure used at inference from raw source tokens.
+
+        Returns a NestedDictionaryDataset, wrapped in a SortDataset keyed on
+        ``src_lengths`` unless ``sort`` is false.
+        """
+        vocab = self.source_dictionary
+        blocks = TokenBlockDataset(
+            src_tokens,
+            src_lengths,
+            self.args.tokens_per_sample - 1,  # one less for <s>
+            pad=vocab.pad(),
+            eos=vocab.eos(),
+            break_mode='eos',
+        )
+        padded = PadDataset(blocks, pad_idx=vocab.pad(), left_pad=False)
+        tokens = PrependTokenDataset(padded, vocab.bos())
+        net_input = {
+            'src_tokens': tokens,
+            'src_lengths': NumelDataset(tokens, reduce=False),
+        }
+        nested = NestedDictionaryDataset(
+            {'id': IdDataset(), 'net_input': net_input}, sizes=src_lengths,
+        )
+        if not sort:
+            return nested
+        return SortDataset(nested, sort_order=[src_lengths])
+
+    def get_batch_iterator(
+        self, dataset, max_tokens=None, max_sentences=None, max_positions=None,
+        ignore_invalid_inputs=False, required_batch_size_multiple=1,
+        seed=1, num_shards=1, shard_id=0, num_workers=0, epoch=1,
+    ):
+        """Delegate to the parent implementation, resetting ``dataset_to_epoch_iter`` around the call."""
+        # Recreate the epoch iterator every epoch: the underlying datasets are dynamic due to sampling.
+        self.dataset_to_epoch_iter = {}
+        epoch_iter = super().get_batch_iterator(
+            dataset, max_tokens, max_sentences, max_positions,
+            ignore_invalid_inputs, required_batch_size_multiple,
+            seed, num_shards, shard_id, num_workers, epoch,
+        )
+        self.dataset_to_epoch_iter = {}
+        return epoch_iter
+
+    @property
+    def source_dictionary(self):
+        return self.dictionary  # the source side reads the task's `dictionary` attribute
+
+    @property
+    def target_dictionary(self):
+        return self.dictionary  # the target side reads the task's `dictionary` attribute
diff --git a/fairseq/tasks/multilingual_translation.py b/fairseq/tasks/multilingual_translation.py
new file mode 100644
index 0000000000000000000000000000000000000000..59634131fcddb233c63c4cafd59b3502bb3c0c8f
--- /dev/null
+++ b/fairseq/tasks/multilingual_translation.py
@@ -0,0 +1,358 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from collections import OrderedDict
+import logging
+import os
+
+import contextlib
+import torch
+
+from fairseq import metrics, options
+from fairseq.data import (
+    Dictionary,
+    LanguagePairDataset,
+    RoundRobinZipDatasets,
+    TransformEosLangPairDataset,
+)
+from fairseq.models import FairseqMultiModel
+from fairseq.tasks.translation import load_langpair_dataset
+
+from . import FairseqTask, register_task
+from fairseq import utils
+
+logger = logging.getLogger(__name__)
+
+
+def _lang_token(lang: str) -> str:
+    return f'__{lang}__'  # language token spelling, e.g. 'en' -> '__en__'
+
+
+def _lang_token_index(dic: Dictionary, lang: str):
+    """Look up the index of the language token for ``lang`` in ``dic``; it must not be the unk index."""
+    token_idx = dic.index(_lang_token(lang))
+    missing_msg = 'cannot find language token for lang {}'.format(lang)
+    assert token_idx != dic.unk_index, missing_msg
+    return token_idx
+
+
+@register_task('multilingual_translation')
+class MultilingualTranslationTask(FairseqTask):
+    """A task for training multiple translation models simultaneously.
+
+    We iterate round-robin over batches from multiple language pairs, ordered
+    according to the `--lang-pairs` argument.
+
+    The training loop is roughly:
+
+        for i in range(len(epoch)):
+            for lang_pair in args.lang_pairs:
+                batch = next_batch_for_lang_pair(lang_pair)
+                loss = criterion(model_for_lang_pair(lang_pair), batch)
+                loss.backward()
+            optimizer.step()
+
+    In practice, `next_batch_for_lang_pair` is abstracted in a FairseqDataset
+    (e.g., `RoundRobinZipDatasets`) and `model_for_lang_pair` is a model that
+    implements the `FairseqMultiModel` interface.
+
+    During inference it is required to specify a single `--source-lang` and
+    `--target-lang`, which indicates the inference language direction.
+    `--lang-pairs`, `--encoder-langtok`, `--decoder-langtok` have to be set to
+    the same value as training.
+    """
+
+    @staticmethod
+    def add_args(parser):
+        """Add task-specific arguments to the parser."""
+        # fmt: off
+        parser.add_argument('data', metavar='DIR', help='path to data directory')
+        parser.add_argument('--lang-pairs', default=None, metavar='PAIRS',
+                            help='comma-separated list of language pairs (in training order): en-de,en-fr,de-fr')
+        parser.add_argument('-s', '--source-lang', default=None, metavar='SRC',
+                            help='source language (only needed for inference)')
+        parser.add_argument('-t', '--target-lang', default=None, metavar='TARGET',
+                            help='target language (only needed for inference)')
+        parser.add_argument('--left-pad-source', default='True', type=str, metavar='BOOL',
+                            help='pad the source on the left (default: True)')
+        parser.add_argument('--left-pad-target', default='False', type=str, metavar='BOOL',
+                            help='pad the target on the left (default: False)')
+        parser.add_argument('--max-source-positions', default=1024, type=int, metavar='N',
+                            help='max number of tokens in the source sequence')
+        parser.add_argument('--max-target-positions', default=1024, type=int, metavar='N',
+                            help='max number of tokens in the target sequence')
+        parser.add_argument('--upsample-primary', default=1, type=int,
+                            help='amount to upsample primary dataset')
+        parser.add_argument('--encoder-langtok', default=None, type=str, choices=['src', 'tgt'],
+                            metavar='SRCTGT',
+                            help='replace beginning-of-sentence in source sentence with source or target '
+                                 'language token. (src/tgt)')
+        parser.add_argument('--decoder-langtok', action='store_true',
+                            help='replace beginning-of-sentence in target sentence with target language token')
+        # fmt: on
+
+    def __init__(self, args, dicts, training):
+        super().__init__(args)
+        self.dicts = dicts
+        self.training = training
+        if training:
+            self.lang_pairs = args.lang_pairs
+        else:
+            self.lang_pairs = ['{}-{}'.format(args.source_lang, args.target_lang)]
+        # eval_lang_pairs for multilingual translation is usually all of the
+        # lang_pairs. However for other multitask settings or when we want to
+        # optimize for certain languages we want to use a different subset. Thus
+        # the eval_lang_pairs class variable is provided for classes that extend
+        # this class.
+        self.eval_lang_pairs = self.lang_pairs
+        # model_lang_pairs will be used to build encoder-decoder model pairs in
+        # models.build_model(). This lets multitask-style sub-classes build
+        # models for pairs other than the input lang_pairs.
+        self.model_lang_pairs = self.lang_pairs
+        self.langs = list(dicts.keys())
+
+    @classmethod
+    def setup_task(cls, args, **kwargs):
+        dicts, training = cls.prepare(args, **kwargs)
+        return cls(args, dicts, training)
+
+    @classmethod
+    def prepare(cls, args, **kargs):
+        """Normalize ``args`` in place, load one dictionary per language, return ``(dicts, training)``."""
+        args.left_pad_source = options.eval_bool(args.left_pad_source)
+        args.left_pad_target = options.eval_bool(args.left_pad_target)
+
+        if args.lang_pairs is None:
+            raise ValueError('--lang-pairs is required. List all the language pairs in the training objective.')
+        if isinstance(args.lang_pairs, str):
+            args.lang_pairs = args.lang_pairs.split(',')
+        sorted_langs = sorted({x for lang_pair in args.lang_pairs for x in lang_pair.split('-')})
+        # giving a source or target language switches the task to inference mode
+        training = args.source_lang is None and args.target_lang is None
+
+        # load dictionaries; the data paths do not depend on the language
+        paths = utils.split_paths(args.data)
+        assert len(paths) > 0
+        dicts = OrderedDict()
+        for lang in sorted_langs:
+            dicts[lang] = cls.load_dictionary(os.path.join(paths[0], 'dict.{}.txt'.format(lang)))
+            # every dictionary must share the special-symbol indices of the first one
+            first_dict = dicts[sorted_langs[0]]
+            assert dicts[lang].pad() == first_dict.pad()
+            assert dicts[lang].eos() == first_dict.eos()
+            assert dicts[lang].unk() == first_dict.unk()
+            if args.encoder_langtok is not None or args.decoder_langtok:
+                for lang_to_add in sorted_langs:
+                    dicts[lang].add_symbol(_lang_token(lang_to_add))
+            logger.info('[{}] dictionary: {} types'.format(lang, len(dicts[lang])))
+        return dicts, training
+
+    def get_encoder_langtok(self, src_lang, tgt_lang):
+        """Return the index passed as the new source EOS: plain EOS, or a src/tgt language token."""
+        if self.args.encoder_langtok is None:
+            return self.dicts[src_lang].eos()
+        # 'src' selects the source language token, any other value the target's
+        tok_lang = src_lang if self.args.encoder_langtok == 'src' else tgt_lang
+        return _lang_token_index(self.dicts[src_lang], tok_lang)
+
+    def get_decoder_langtok(self, tgt_lang):
+        """Return the index passed as the new target BOS: the language token if enabled, else EOS."""
+        tgt_dict = self.dicts[tgt_lang]
+        return _lang_token_index(tgt_dict, tgt_lang) if self.args.decoder_langtok else tgt_dict.eos()
+
+    def alter_dataset_langtok(self, lang_pair_dataset,
+                              src_eos=None, src_lang=None, tgt_eos=None, tgt_lang=None):
+        """Wrap the dataset in a TransformEosLangPairDataset when a language-token option is enabled."""
+        if self.args.encoder_langtok is None and not self.args.decoder_langtok:
+            return lang_pair_dataset
+        new_src_eos = None
+        if self.args.encoder_langtok is not None and src_eos is not None \
+           and src_lang is not None and tgt_lang is not None:
+            new_src_eos = self.get_encoder_langtok(src_lang, tgt_lang)
+        else:
+            src_eos = None
+
+        new_tgt_bos = None
+        if self.args.decoder_langtok and tgt_eos is not None and tgt_lang is not None:
+            new_tgt_bos = self.get_decoder_langtok(tgt_lang)
+        else:
+            tgt_eos = None
+
+        return TransformEosLangPairDataset(
+            lang_pair_dataset,
+            src_eos=src_eos,
+            new_src_eos=new_src_eos,
+            tgt_bos=tgt_eos,
+            new_tgt_bos=new_tgt_bos,
+        )
+
+    def load_dataset(self, split, epoch=1, **kwargs):
+        """Load a dataset split."""
+        paths = utils.split_paths(self.args.data)
+        assert len(paths) > 0
+        data_path = paths[(epoch - 1) % len(paths)]
+
+        def language_pair_dataset(lang_pair):
+            src, tgt = lang_pair.split('-')
+            langpair_dataset = load_langpair_dataset(
+                data_path, split, src, self.dicts[src], tgt, self.dicts[tgt],
+                combine=True, dataset_impl=self.args.dataset_impl,
+                upsample_primary=self.args.upsample_primary,
+                left_pad_source=self.args.left_pad_source,
+                left_pad_target=self.args.left_pad_target,
+                max_source_positions=self.args.max_source_positions,
+                max_target_positions=self.args.max_target_positions,
+            )
+            return self.alter_dataset_langtok(
+                langpair_dataset,
+                src_eos=self.dicts[src].eos(),
+                src_lang=src,
+                tgt_eos=self.dicts[tgt].eos(),
+                tgt_lang=tgt,
+            )
+
+        self.datasets[split] = RoundRobinZipDatasets(
+            OrderedDict([
+                (lang_pair, language_pair_dataset(lang_pair))
+                for lang_pair in self.lang_pairs
+            ]),
+            eval_key=None if self.training else "%s-%s" % (self.args.source_lang, self.args.target_lang),
+        )
+
+    def build_dataset_for_inference(self, src_tokens, src_lengths):
+        lang_pair = "%s-%s" % (self.args.source_lang, self.args.target_lang)
+        return RoundRobinZipDatasets(
+            OrderedDict([(
+                lang_pair,
+                self.alter_dataset_langtok(
+                    LanguagePairDataset(
+                        src_tokens, src_lengths,
+                        self.source_dictionary
+                    ),
+                    src_eos=self.source_dictionary.eos(),
+                    src_lang=self.args.source_lang,
+                    tgt_eos=self.target_dictionary.eos(),
+                    tgt_lang=self.args.target_lang,
+                ),
+            )]),
+            eval_key=lang_pair,
+        )
+
+    def build_model(self, args):
+        """Build the model after checking that its args match the task's; it must be a FairseqMultiModel."""
+        def check_args():
+            messages = []
+            if set(self.args.lang_pairs) != set(args.lang_pairs):
+                messages.append('--lang-pairs should include all the language pairs {}.'.format(args.lang_pairs))
+            if self.args.encoder_langtok != args.encoder_langtok:
+                messages.append('--encoder-langtok should be {}.'.format(args.encoder_langtok))
+            if self.args.decoder_langtok != args.decoder_langtok:
+                messages.append('--decoder-langtok should {}be set.'.format('' if args.decoder_langtok else 'not '))
+            if messages:
+                raise ValueError(' '.join(messages))
+
+        # Check if task args are consistent with model args
+        check_args()
+
+        from fairseq import models
+        model = models.build_model(args, self)
+        if not isinstance(model, FairseqMultiModel):
+            raise ValueError('MultilingualTranslationTask requires a FairseqMultiModel architecture')
+        return model
+
+    def train_step(self, sample, model, criterion, optimizer, update_num, ignore_grad=False):
+        """Run forward/backward for each language pair with a non-empty batch and aggregate the results."""
+        model.train()
+        from collections import defaultdict
+        agg_loss, agg_sample_size, agg_logging_output = 0., 0., defaultdict(float)
+        curr_lang_pairs = [
+            lang_pair
+            for lang_pair in self.model_lang_pairs
+            if sample[lang_pair] is not None and len(sample[lang_pair]) != 0
+        ]
+
+        for idx, lang_pair in enumerate(curr_lang_pairs):
+            def maybe_no_sync():
+                if (
+                    self.args.distributed_world_size > 1
+                    and hasattr(model, 'no_sync')
+                    and idx < len(curr_lang_pairs) - 1
+                ):
+                    return model.no_sync()
+                return contextlib.ExitStack()  # dummy contextmanager
+            with maybe_no_sync():
+                loss, sample_size, logging_output = criterion(model.models[lang_pair], sample[lang_pair])
+                if ignore_grad:
+                    loss *= 0
+                optimizer.backward(loss)
+            agg_loss += loss.detach().item()
+            # TODO make summing of the sample sizes configurable
+            agg_sample_size += sample_size
+            for k in logging_output:
+                agg_logging_output[k] += logging_output[k]
+                agg_logging_output[f"{lang_pair}:{k}"] += logging_output[k]
+        return agg_loss, agg_sample_size, agg_logging_output
+
+    def valid_step(self, sample, model, criterion):
+        model.eval()
+        with torch.no_grad():
+            from collections import defaultdict
+            agg_loss, agg_sample_size, agg_logging_output = 0., 0., defaultdict(float)
+            for lang_pair in self.eval_lang_pairs:
+                if lang_pair not in sample or sample[lang_pair] is None or len(sample[lang_pair]) == 0:
+                    continue
+                loss, sample_size, logging_output = criterion(model.models[lang_pair], sample[lang_pair])
+                agg_loss += loss.data.item()
+                # TODO make summing of the sample sizes configurable
+                agg_sample_size += sample_size
+                for k in logging_output:
+                    agg_logging_output[k] += logging_output[k]
+                    agg_logging_output[f"{lang_pair}:{k}"] += logging_output[k]
+        return agg_loss, agg_sample_size, agg_logging_output
+
+    def inference_step(self, generator, models, sample, prefix_tokens=None):
+        with torch.no_grad():
+            if self.args.decoder_langtok:
+                bos_token = _lang_token_index(self.target_dictionary, self.args.target_lang)
+            else:
+                bos_token = self.target_dictionary.eos()
+            return generator.generate(
+                models,
+                sample,
+                prefix_tokens=prefix_tokens,
+                bos_token=bos_token,
+            )
+
+    def reduce_metrics(self, logging_outputs, criterion):
+        with metrics.aggregate():
+            # pass 'sample_size', 'nsentences', 'ntokens' stats to fairseq_task
+            super().reduce_metrics(logging_outputs, criterion)
+            for k in ['sample_size', 'nsentences', 'ntokens']:
+                metrics.log_scalar(k, sum(l[k] for l in logging_outputs))
+
+    @property
+    def source_dictionary(self):
+        if self.training:
+            return next(iter(self.dicts.values()))
+        else:
+            return self.dicts[self.args.source_lang]
+
+    @property
+    def target_dictionary(self):
+        if self.training:
+            return next(iter(self.dicts.values()))
+        else:
+            return self.dicts[self.args.target_lang]
+
+    def max_positions(self):
+        """Return the max sentence length allowed by the task."""
+        if not self.datasets:
+            return {'%s-%s' % (self.args.source_lang, self.args.target_lang):
+                    (self.args.max_source_positions, self.args.max_target_positions)}
+        return OrderedDict([
+            (key, (self.args.max_source_positions, self.args.max_target_positions))
+            for split in self.datasets.keys()
+            for key in self.datasets[split].datasets.keys()
+        ])
diff --git a/fairseq/tasks/semisupervised_translation.py b/fairseq/tasks/semisupervised_translation.py
new file mode 100644
index 0000000000000000000000000000000000000000..c81d3628862f9b7ccfaf76a336b878f67a3a8f9e
--- /dev/null
+++ b/fairseq/tasks/semisupervised_translation.py
@@ -0,0 +1,385 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from collections import OrderedDict
+import logging
+import os
+
+from fairseq.data import (
+    BacktranslationDataset,
+    data_utils,
+    indexed_dataset,
+    IndexedCachedDataset,
+    IndexedDataset,
+    IndexedRawTextDataset,
+    LanguagePairDataset,
+    NoisingDataset,
+    RoundRobinZipDatasets,
+)
+from fairseq.models import FairseqMultiModel
+from fairseq.sequence_generator import SequenceGenerator
+
+from .multilingual_translation import MultilingualTranslationTask
+from . import register_task
+from fairseq import utils
+
+logger = logging.getLogger(__name__)
+
+
+def _get_bt_dataset_key(lang_pair: str) -> str:
+    return "bt:" + lang_pair  # e.g. 'en-de' -> 'bt:en-de'
+
+
+def _get_denoising_dataset_key(lang_pair: str) -> str:
+    return "denoising:" + lang_pair  # e.g. 'en-en' -> 'denoising:en-en'
+
+
+# ported from UnsupervisedMT
+def parse_lambda_config(x):
+    """
+    Parse the configuration of lambda coefficient (for scheduling).
+    x = "3"                  # lambda will be a constant equal to x
+    x = "0:1,1000:0"         # lambda will start from 1 and linearly decrease
+                             # to 0 during the first 1000 iterations
+    x = "0:0,1000:0,2000:1"  # lambda will be equal to 0 for the first 1000
+                             # iterations, then will linearly increase to 1 until iteration 2000
+    Returns (initial value, schedule); schedule is None for a constant, else [(step, value), ...].
+    """
+    parts = x.split(',')
+    if len(parts) == 1:
+        return float(x), None
+    pairs = [p.split(':') for p in parts]  # literal ':' separator; os.pathsep is ';' on Windows
+    assert all(len(p) == 2 for p in pairs)
+    assert all(k.isdigit() for k, _ in pairs)
+    assert all(int(a[0]) < int(b[0]) for a, b in zip(pairs, pairs[1:]))
+    return float(pairs[0][1]), [(int(k), float(v)) for k, v in pairs]
+
+
+@register_task('semisupervised_translation')
+class SemisupervisedTranslationTask(MultilingualTranslationTask):
+    """A task for training multiple translation models simultaneously.
+
+    We iterate round-robin over batches from multiple language pairs, ordered
+    according to the `--lang-pairs` argument.
+
+    The training loop is roughly:
+
+        for i in range(len(epoch)):
+            for lang_pair in args.lang_pairs:
+                batch = next_batch_for_lang_pair(lang_pair)
+                loss = criterion(model_for_lang_pair(lang_pair), batch)
+                loss.backward()
+            optimizer.step()
+
+    In practice, `next_batch_for_lang_pair` is abstracted in a FairseqDataset
+    (e.g., `RoundRobinZipDatasets`) and `model_for_lang_pair` is a model that
+    implements the `FairseqMultiModel` interface.
+
+    During inference it is required to specify a single `--source-lang` and
+    `--target-lang`, instead of `--lang-pairs`.
+    """
+
+    @staticmethod
+    def add_args(parser):
+        """Add task-specific arguments to the parser."""
+        # fmt: off
+        MultilingualTranslationTask.add_args(parser)
+        parser.add_argument('--lambda-parallel-config', default="1.0", type=str, metavar='CONFIG',
+                            help='cross-entropy reconstruction coefficient (parallel data). '
+                                 'use fixed weight during training if set to floating point number. '
+                                 'use piecewise linear function over number of updates to schedule the '
+                                 'weight with the format: step0:w0,step1:w1,...')
+        parser.add_argument('--lambda-denoising-config', default="0.0", type=str, metavar='CONFIG',
+                            help='cross-entropy reconstruction coefficient (denoising autoencoding). '
+                                 'use fixed weight during training if set to floating point number. '
+                                 'use piecewise linear function over number of updates to schedule the '
+                                 'weight with the format: step0:w0,step1:w1,...')
+        parser.add_argument('--lambda-otf-bt-config', default="0.0", type=str, metavar='CONFIG',
+                            help='cross-entropy reconstruction coefficient (on-the-fly back-translation parallel data). '
+                                 'use fixed weight during training if set to floating point number. '
+                                 'use piecewise linear function over number of updates to schedule the '
+                                 'weight with the format: step0:w0,step1:w1,...')
+        parser.add_argument('--bt-max-len-a', default=1.1, type=float, metavar='N',
+                            help='generate back-translated sequences of maximum length ax + b, where x is the '
+                                 'source length')
+        parser.add_argument('--bt-max-len-b', default=10.0, type=float, metavar='N',
+                            help='generate back-translated sequences of maximum length ax + b, where x is the '
+                                 'source length')
+        parser.add_argument('--bt-beam-size', default=1, type=int, metavar='N',
+                            help='beam size used in beam search of online back-translation')
+        parser.add_argument('--max-word-shuffle-distance', default=3.0, type=float, metavar='N',
+                            help='maximum word shuffle distance for denoising autoencoding data generation')
+        parser.add_argument('--word-dropout-prob', default=0.1, type=float, metavar='N',
+                            help='word dropout probability for denoising autoencoding data generation')
+        parser.add_argument('--word-blanking-prob', default=0.2, type=float, metavar='N',
+                            help='word blanking probability for denoising autoencoding data generation')
+        # fmt: on
+
+    def __init__(self, args, dicts, training):
+        """Parse the three lambda schedules and, if denoising is enabled, add tgt-tgt model pairs."""
+        super().__init__(args, dicts, training)
+        self.lambda_parallel, self.lambda_parallel_steps = parse_lambda_config(args.lambda_parallel_config)
+        self.lambda_otf_bt, self.lambda_otf_bt_steps = parse_lambda_config(args.lambda_otf_bt_config)
+        self.lambda_denoising, self.lambda_denoising_steps = parse_lambda_config(args.lambda_denoising_config)
+        if self.lambda_denoising > 0.0 or self.lambda_denoising_steps is not None:
+            # sorted: iteration order of a set of strings is not stable across processes
+            tgt_langs = sorted({lang_pair.split('-')[1] for lang_pair in args.lang_pairs})
+            denoising_lang_pairs = ["%s-%s" % (tgt, tgt) for tgt in tgt_langs]
+            self.model_lang_pairs = self.model_lang_pairs + denoising_lang_pairs
+        self.backtranslate_datasets = {}
+        self.backtranslators = {}
+
+    @classmethod
+    def setup_task(cls, args, **kwargs):
+        """Build the task from the (dicts, training) pair of MultilingualTranslationTask.prepare."""
+        return cls(args, *MultilingualTranslationTask.prepare(args, **kwargs))
+
+    def load_dataset(self, split, epoch=1, **kwargs):
+        """Load a dataset split."""
+        paths = utils.split_paths(self.args.data)
+        assert len(paths) > 0
+        data_path = paths[(epoch - 1) % len(paths)]
+
+        def split_exists(split, src, tgt, lang):
+            if src is not None:
+                filename = os.path.join(data_path, '{}.{}-{}.{}'.format(split, src, tgt, lang))
+            else:
+                filename = os.path.join(data_path, '{}.{}-None.{}'.format(split, src, tgt))
+            return indexed_dataset.dataset_exists(filename, impl=self.args.dataset_impl)
+
+        def load_indexed_dataset(path, dictionary):
+            return data_utils.load_indexed_dataset(path, dictionary, self.args.dataset_impl)
+
+        # load parallel datasets
+        src_datasets, tgt_datasets = {}, {}
+        if (self.lambda_parallel > 0.0 or self.lambda_parallel_steps is not None or not split.startswith("train")):
+            for lang_pair in self.lang_pairs:
+                src, tgt = lang_pair.split('-')
+                if split_exists(split, src, tgt, src):
+                    prefix = os.path.join(data_path, '{}.{}-{}.'.format(split, src, tgt))
+                elif split_exists(split, tgt, src, src):
+                    prefix = os.path.join(data_path, '{}.{}-{}.'.format(split, tgt, src))
+                else:
+                    continue
+                src_datasets[lang_pair] = load_indexed_dataset(prefix + src, self.dicts[src])
+                tgt_datasets[lang_pair] = load_indexed_dataset(prefix + tgt, self.dicts[tgt])
+                logger.info('parallel-{} {} {} examples'.format(data_path, split, len(src_datasets[lang_pair])))
+            if len(src_datasets) == 0:
+                raise FileNotFoundError('Dataset not found: {} ({})'.format(split, data_path))
+
+        # back translation datasets
+        backtranslate_datasets = {}
+        if (self.lambda_otf_bt > 0.0 or self.lambda_otf_bt_steps is not None) and split.startswith("train"):
+            for lang_pair in self.lang_pairs:
+                src, tgt = lang_pair.split('-')
+                if not split_exists(split, tgt, None, tgt):
+                    raise FileNotFoundError('Dataset not found: backtranslation {} ({})'.format(split, data_path))
+                filename = os.path.join(data_path, '{}.{}-None.{}'.format(split, tgt, tgt))
+                dataset = load_indexed_dataset(filename, self.dicts[tgt])
+                lang_pair_dataset_tgt = LanguagePairDataset(
+                    dataset,
+                    dataset.sizes,
+                    self.dicts[tgt],
+                    left_pad_source=self.args.left_pad_source,
+                    left_pad_target=self.args.left_pad_target,
+                )
+                lang_pair_dataset = LanguagePairDataset(
+                    dataset,
+                    dataset.sizes,
+                    src_dict=self.dicts[src],
+                    tgt=dataset,
+                    tgt_sizes=dataset.sizes,
+                    tgt_dict=self.dicts[tgt],
+                    left_pad_source=self.args.left_pad_source,
+                    left_pad_target=self.args.left_pad_target,
+                )
+                backtranslate_datasets[lang_pair] = BacktranslationDataset(
+                    tgt_dataset=self.alter_dataset_langtok(
+                        lang_pair_dataset_tgt,
+                        src_eos=self.dicts[tgt].eos(),
+                        src_lang=tgt,
+                        tgt_lang=src,
+                    ),
+                    backtranslation_fn=self.backtranslators[lang_pair],
+                    src_dict=self.dicts[src], tgt_dict=self.dicts[tgt],
+                    output_collater=self.alter_dataset_langtok(
+                        lang_pair_dataset=lang_pair_dataset,
+                        src_eos=self.dicts[src].eos(),
+                        src_lang=src,
+                        tgt_eos=self.dicts[tgt].eos(),
+                        tgt_lang=tgt,
+                    ).collater,
+                )
+                logger.info('backtranslate-{}: {} {} {} examples'.format(
+                    tgt, data_path, split, len(backtranslate_datasets[lang_pair]),
+                ))
+                self.backtranslate_datasets[lang_pair] = backtranslate_datasets[lang_pair]
+
+        # denoising autoencoder
+        noising_datasets = {}
+        if (self.lambda_denoising > 0.0 or self.lambda_denoising_steps is not None) and split.startswith("train"):
+            for lang_pair in self.lang_pairs:
+                _, tgt = lang_pair.split('-')
+                if not split_exists(split, tgt, None, tgt):
+                    continue
+                filename = os.path.join(data_path, '{}.{}-None.{}'.format(split, tgt, tgt))
+                tgt_dataset1 = load_indexed_dataset(filename, self.dicts[tgt])
+                tgt_dataset2 = load_indexed_dataset(filename, self.dicts[tgt])
+                noising_dataset = NoisingDataset(
+                    tgt_dataset1,
+                    self.dicts[tgt],
+                    seed=1,
+                    max_word_shuffle_distance=self.args.max_word_shuffle_distance,
+                    word_dropout_prob=self.args.word_dropout_prob,
+                    word_blanking_prob=self.args.word_blanking_prob,
+                )
+                noising_datasets[lang_pair] = self.alter_dataset_langtok(
+                    LanguagePairDataset(
+                        noising_dataset,
+                        tgt_dataset1.sizes,
+                        self.dicts[tgt],
+                        tgt_dataset2,
+                        tgt_dataset2.sizes,
+                        self.dicts[tgt],
+                        left_pad_source=self.args.left_pad_source,
+                        left_pad_target=self.args.left_pad_target,
+                    ),
+                    src_eos=self.dicts[tgt].eos(),
+                    src_lang=tgt,
+                    tgt_eos=self.dicts[tgt].eos(),
+                    tgt_lang=tgt,
+                )
+                logger.info('denoising-{}: {} {} {} examples'.format(
+                    tgt, data_path, split, len(noising_datasets[lang_pair]),
+                ))
+
+        def language_pair_dataset(lang_pair):
+            src, tgt = lang_pair.split('-')
+            src_dataset, tgt_dataset = src_datasets[lang_pair], tgt_datasets[lang_pair]
+            return self.alter_dataset_langtok(
+                LanguagePairDataset(
+                    src_dataset, src_dataset.sizes, self.dicts[src],
+                    tgt_dataset, tgt_dataset.sizes, self.dicts[tgt],
+                    left_pad_source=self.args.left_pad_source,
+                    left_pad_target=self.args.left_pad_target,
+                ),
+                self.dicts[src].eos(),
+                src,
+                self.dicts[tgt].eos(),
+                tgt,
+            )
+
+        self.datasets[split] = RoundRobinZipDatasets(
+            OrderedDict([
+                (lang_pair, language_pair_dataset(lang_pair))
+                for lang_pair in src_datasets.keys()
+            ] + [
+                (_get_bt_dataset_key(lang_pair), dataset)
+                for lang_pair, dataset in backtranslate_datasets.items()
+            ] + [
+                (_get_denoising_dataset_key(lang_pair), dataset)
+                for lang_pair, dataset in noising_datasets.items()
+            ]),
+            eval_key=None if self.training else "%s-%s" % (self.args.source_lang, self.args.target_lang),
+        )
+
+    def build_model(self, args):
+        from fairseq import models
+        model = models.build_model(args, self)
+        if not isinstance(model, FairseqMultiModel):
+            raise ValueError('SemisupervisedTranslationTask requires a FairseqMultiModel architecture')
+
+        # create SequenceGenerator for each model that has backtranslation dependency on it
+        self.sequence_generators = {}
+        if (self.lambda_otf_bt > 0.0 or self.lambda_otf_bt_steps is not None) and self.training:
+            for lang_pair in self.lang_pairs:
+                src, tgt = lang_pair.split('-')
+                key = '{}-{}'.format(tgt, src)  # the reverse-direction (tgt -> src) model does the generation
+                self.sequence_generators[key] = SequenceGenerator(
+                    [model.models[key]],
+                    tgt_dict=self.dicts[src],
+                    beam_size=args.bt_beam_size,
+                    max_len_a=args.bt_max_len_a,
+                    max_len_b=args.bt_max_len_b,
+                )
+                decoder_lang_tok_idx = self.get_decoder_langtok(src)
+                # default arguments bind this iteration's model, bos token and generator to the function
+                def backtranslate_fn(
+                    sample, model=model.models[key],
+                    bos_token=decoder_lang_tok_idx,
+                    sequence_generator=self.sequence_generators[key],
+                ):
+                    return sequence_generator.generate(
+                        [model],
+                        sample,
+                        bos_token=bos_token,
+                    )
+                self.backtranslators[lang_pair] = backtranslate_fn
+
+        return model
+
+    def train_step(self, sample, model, criterion, optimizer, update_num, ignore_grad=False):
+        """Run forward/backward for each enabled objective (parallel, back-translation, denoising).
+
+        Returns the summed weighted loss, the summed sample size and the aggregated logging output dict.
+        """
+        model.train()
+        if update_num > 0:
+            self.update_step(update_num)
+
+        agg_loss, agg_sample_size, agg_logging_output = 0., 0., {}
+        def forward_backward(model, samples, logging_output_key, weight):
+            nonlocal agg_loss, agg_sample_size, agg_logging_output
+            if samples is None or len(samples) == 0:
+                return
+            loss, sample_size, logging_output = criterion(model, samples)
+            loss *= 0 if ignore_grad else weight  # zero the loss (backward is still called) when ignore_grad is set
+            optimizer.backward(loss)
+            agg_loss += loss.detach().item()
+            # TODO make summing of the sample sizes configurable
+            agg_sample_size += sample_size
+            # accumulate with .get(): the aggregate starts empty, so a bare `+=` would raise KeyError
+            for k in logging_output:
+                agg_logging_output[k] = agg_logging_output.get(k, 0) + logging_output[k]
+                agg_logging_output[logging_output_key] = agg_logging_output.get(logging_output_key, 0) + logging_output[k]
+
+        if self.lambda_parallel > 0.0:
+            for lang_pair in self.lang_pairs:
+                forward_backward(model.models[lang_pair], sample[lang_pair], lang_pair, self.lambda_parallel)
+
+        if self.lambda_otf_bt > 0.0:
+            for lang_pair in self.lang_pairs:
+                sample_key = _get_bt_dataset_key(lang_pair)
+                forward_backward(model.models[lang_pair], sample[sample_key], sample_key, self.lambda_otf_bt)
+
+        if self.lambda_denoising > 0.0:
+            for lang_pair in self.lang_pairs:
+                _, tgt = lang_pair.split('-')
+                sample_key = _get_denoising_dataset_key(lang_pair)
+                forward_backward(model.models['{0}-{0}'.format(tgt)], sample[sample_key], sample_key, self.lambda_denoising)
+
+        return agg_loss, agg_sample_size, agg_logging_output
+
+    def update_step(self, num_updates):
+        """Recompute the scheduled lambda weights for update ``num_updates``.
+
+        Only weights whose ``*_steps`` schedule is not None are touched.
+        """
+        def lambda_step_func(config, n_iter):
+            """Piecewise-linear interpolation of a ``[(step, value), ...]`` schedule at ``n_iter``."""
+            hits = [idx for idx in range(len(config) - 1) if config[idx][0] <= n_iter < config[idx + 1][0]]
+            if not hits:
+                # outside every segment: only valid at or past the final breakpoint
+                assert n_iter >= config[-1][0]
+                return config[-1][1]
+            assert len(hits) == 1
+            (x_a, y_a), (x_b, y_b) = config[hits[0]], config[hits[0] + 1]
+            return y_a + (n_iter - x_a) * float(y_b - y_a) / float(x_b - x_a)
+
+        # same order as the attributes are declared: parallel, denoising, on-the-fly backtranslation
+        for name in ('lambda_parallel', 'lambda_denoising', 'lambda_otf_bt'):
+            schedule = getattr(self, name + '_steps')
+            if schedule is not None:
+                setattr(self, name, lambda_step_func(schedule, num_updates))
diff --git a/fairseq/tasks/sentence_prediction.py b/fairseq/tasks/sentence_prediction.py
new file mode 100644
index 0000000000000000000000000000000000000000..b50c9922cc9c6472fbc66bb0d248444cd16f46cd
--- /dev/null
+++ b/fairseq/tasks/sentence_prediction.py
@@ -0,0 +1,249 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import os
+
+import numpy as np
+
+from fairseq import utils
+from fairseq.data import (
+    ConcatSentencesDataset,
+    data_utils,
+    Dictionary,
+    IdDataset,
+    NestedDictionaryDataset,
+    NumSamplesDataset,
+    NumelDataset,
+    OffsetTokensDataset,
+    PrependTokenDataset,
+    RawLabelDataset,
+    RightPadDataset,
+    RollDataset,
+    SortDataset,
+    StripTokenDataset,
+)
+from fairseq.data.shorten_dataset import maybe_shorten_dataset
+from fairseq.tasks import FairseqTask, register_task
+
+
+logger = logging.getLogger(__name__)
+
+
+@register_task('sentence_prediction')
+class SentencePredictionTask(FairseqTask):
+    """
+    Sentence (or sentence pair) prediction (classification or regression) task.
+
+    Args:
+        data_dictionary (Dictionary): the dictionary for the input of the task
+        label_dictionary (Dictionary): the dictionary for the labels
+    """
+
+    @staticmethod
+    def add_args(parser):
+        """Add task-specific arguments to the parser."""
+        parser.add_argument('data', metavar='FILE',
+                            help='file prefix for data')
+        parser.add_argument('--num-classes', type=int, default=-1,
+                            help='number of classes or regression targets')
+        parser.add_argument('--init-token', type=int, default=None,
+                            help='add token at the beginning of each batch item')
+        parser.add_argument('--separator-token', type=int, default=None,
+                            help='add separator token between inputs')
+        parser.add_argument('--regression-target', action='store_true', default=False)
+        parser.add_argument('--no-shuffle', action='store_true', default=False)
+        parser.add_argument('--shorten-method', default='none',
+                            choices=['none', 'truncate', 'random_crop'],
+                            help='if not none, shorten sequences that exceed --tokens-per-sample')
+        parser.add_argument('--shorten-data-split-list', default='',
+                            help='comma-separated list of dataset splits to apply shortening to, '
+                                 'e.g., "train,valid" (default: all dataset splits)')
+        parser.add_argument('--add-prev-output-tokens', action='store_true', default=False,
+                            help='add prev_output_tokens to sample, used for encoder-decoder arch')
+
+    def __init__(self, args, data_dictionary, label_dictionary):
+        super().__init__(args)
+        self.dictionary = data_dictionary
+        self._label_dictionary = label_dictionary
+        if not hasattr(args, 'max_positions'):
+            self._max_positions = (
+                args.max_source_positions,
+                args.max_target_positions,
+            )
+        else:
+            self._max_positions = args.max_positions
+        args.tokens_per_sample = self._max_positions
+
+    @classmethod
+    def load_dictionary(cls, args, filename, source=True):
+        """Load the dictionary from the filename
+
+        Args:
+            filename (str): the filename
+        """
+        dictionary = Dictionary.load(filename)
+        dictionary.add_symbol('<mask>')
+        return dictionary
+
+    @classmethod
+    def setup_task(cls, args, **kwargs):
+        assert args.num_classes > 0, 'Must set --num-classes'
+
+        # load data dictionary
+        data_dict = cls.load_dictionary(
+            args,
+            os.path.join(args.data, 'input0', 'dict.txt'),
+            source=True,
+        )
+        logger.info('[input] dictionary: {} types'.format(len(data_dict)))
+
+        label_dict = None
+        if not args.regression_target:
+            # load label dictionary
+            label_dict = cls.load_dictionary(
+                args,
+                os.path.join(args.data, 'label', 'dict.txt'),
+                source=False,
+            )
+            logger.info('[label] dictionary: {} types'.format(len(label_dict)))
+        else:
+            label_dict = data_dict
+        return cls(args, data_dict, label_dict)
+
+    def load_dataset(self, split, combine=False, **kwargs):
+        """Load a given dataset split (e.g., train, valid, test)."""
+        def get_path(key, split):
+            return os.path.join(self.args.data, key, split)
+
+        def make_dataset(key, dictionary):
+            split_path = get_path(key, split)
+
+            dataset = data_utils.load_indexed_dataset(
+                split_path,
+                dictionary,
+                self.args.dataset_impl,
+                combine=combine,
+            )
+            return dataset
+
+        input0 = make_dataset('input0', self.source_dictionary)
+        assert input0 is not None, 'could not find dataset: {}'.format(get_path('input0', split))
+        # the second segment is optional; src_tokens falls back to input0 alone when it is None
+        input1 = make_dataset('input1', self.source_dictionary)
+
+        if self.args.init_token is not None:
+            input0 = PrependTokenDataset(input0, self.args.init_token)
+
+        if input1 is None:
+            src_tokens = input0
+        else:
+            if self.args.separator_token is not None:
+                input1 = PrependTokenDataset(input1, self.args.separator_token)
+
+            src_tokens = ConcatSentencesDataset(input0, input1)
+
+        with data_utils.numpy_seed(self.args.seed):
+            shuffle = np.random.permutation(len(src_tokens))
+
+        src_tokens = maybe_shorten_dataset(
+            src_tokens,
+            split,
+            self.args.shorten_data_split_list,
+            self.args.shorten_method,
+            self.max_positions(),  # args.max_positions may be unset; __init__ already resolved the limit
+            self.args.seed,
+        )
+
+        dataset = {
+            'id': IdDataset(),
+            'net_input': {
+                'src_tokens': RightPadDataset(
+                    src_tokens,
+                    pad_idx=self.source_dictionary.pad(),
+                ),
+                'src_lengths': NumelDataset(src_tokens, reduce=False),
+            },
+            'nsentences': NumSamplesDataset(),
+            'ntokens': NumelDataset(src_tokens, reduce=True),
+        }
+
+        if self.args.add_prev_output_tokens:
+            prev_tokens_dataset = RightPadDataset(
+                RollDataset(src_tokens, 1),
+                pad_idx=self.dictionary.pad(),
+            )
+            dataset['net_input'].update(
+                prev_output_tokens=prev_tokens_dataset,
+            )
+
+        if not self.args.regression_target:
+            label_dataset = make_dataset('label', self.label_dictionary)
+            if label_dataset is not None:
+                dataset.update(
+                    target=OffsetTokensDataset(
+                        StripTokenDataset(
+                            label_dataset,
+                            id_to_strip=self.label_dictionary.eos(),
+                        ),
+                        offset=-self.label_dictionary.nspecial,
+                    )
+                )
+        else:
+            label_path = "{0}.label".format(get_path('label', split))
+            if os.path.exists(label_path):
+                def parse_regression_target(i, line):
+                    values = line.split()
+                    assert len(values) == self.args.num_classes, \
+                        f'expected num_classes={self.args.num_classes} regression target values on line {i}, found: "{line}"'
+                    return [float(x) for x in values]
+                with open(label_path) as h:  # close the label file once it has been parsed
+                    targets = [parse_regression_target(i, line.strip()) for i, line in enumerate(h)]
+                dataset.update(target=RawLabelDataset(targets))
+
+        nested_dataset = NestedDictionaryDataset(
+            dataset,
+            sizes=[src_tokens.sizes],
+        )
+
+        if self.args.no_shuffle:
+            dataset = nested_dataset
+        else:
+            dataset = SortDataset(
+                nested_dataset,
+                # shuffle
+                sort_order=[shuffle],
+            )
+
+        logger.info("Loaded {0} with #samples: {1}".format(split, len(dataset)))
+
+        self.datasets[split] = dataset
+        return self.datasets[split]
+
+    def build_model(self, args):
+        from fairseq import models
+        model = models.build_model(args, self)
+
+        model.register_classification_head(
+            getattr(args, 'classification_head_name', 'sentence_classification_head'),
+            num_classes=self.args.num_classes,
+        )
+
+        return model
+
+    def max_positions(self):
+        return self._max_positions
+
+    @property
+    def source_dictionary(self):
+        return self.dictionary
+
+    @property
+    def target_dictionary(self):
+        return self.dictionary
+
+    @property
+    def label_dictionary(self):
+        return self._label_dictionary
diff --git a/fairseq/tasks/sentence_ranking.py b/fairseq/tasks/sentence_ranking.py
new file mode 100644
index 0000000000000000000000000000000000000000..ea4b50a294a9cac05b28d960a0925a5dc6f30795
--- /dev/null
+++ b/fairseq/tasks/sentence_ranking.py
@@ -0,0 +1,208 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import os
+
+import numpy as np
+
+from fairseq import utils
+from fairseq.data import (
+    ConcatSentencesDataset,
+    data_utils,
+    Dictionary,
+    IdDataset,
+    NestedDictionaryDataset,
+    NumSamplesDataset,
+    NumelDataset,
+    PrependTokenDataset,
+    RawLabelDataset,
+    RightPadDataset,
+    SortDataset,
+    TruncateDataset
+)
+from fairseq.data.shorten_dataset import maybe_shorten_dataset
+from fairseq.tasks import FairseqTask, register_task
+
+
+logger = logging.getLogger(__name__)
+
+
+@register_task('sentence_ranking')
+class SentenceRankingTask(FairseqTask):
+    """
+    Ranking task on multiple sentences.
+
+    Args:
+        dictionary (Dictionary): the dictionary for the input of the task
+    """
+
+    @staticmethod
+    def add_args(parser):
+        """Add task-specific arguments to the parser."""
+        parser.add_argument('data', metavar='FILE',
+                            help='file prefix for data')
+        parser.add_argument('--num-classes', type=int,
+                            help='number of sentences to be ranked')
+        parser.add_argument('--init-token', type=int,
+                            help='add token at the beginning of each batch item')
+        parser.add_argument('--separator-token', type=int,
+                            help='add separator token between inputs')
+        parser.add_argument('--no-shuffle', action='store_true')
+        parser.add_argument('--shorten-method', default='none',
+                            choices=['none', 'truncate', 'random_crop'],
+                            help='if not none, shorten sequences that exceed --tokens-per-sample')
+        parser.add_argument('--shorten-data-split-list', default='',
+                            help='comma-separated list of dataset splits to apply shortening to, '
+                                 'e.g., "train,valid" (default: all dataset splits)')
+        parser.add_argument('--max-option-length', type=int,
+                            help='max length for each option')
+
+    def __init__(self, args, dictionary):
+        super().__init__(args)
+        self.dictionary = dictionary
+
+    @classmethod
+    def load_dictionary(cls, args, filename, source=True):
+        """Load the dictionary from the filename
+
+        Args:
+            filename (str): the filename
+        """
+        dictionary = Dictionary.load(filename)
+        dictionary.add_symbol('<mask>')
+        return dictionary
+
+    @classmethod
+    def setup_task(cls, args, **kwargs):
+        assert args.criterion == 'sentence_ranking', \
+            'Must set --criterion=sentence_ranking'
+
+        # load data dictionary
+        data_dict = cls.load_dictionary(
+            args,
+            os.path.join(args.data, 'input0', 'dict.txt'),
+            source=True,
+        )
+        logger.info('[input] dictionary: {} types'.format(len(data_dict)))
+        return cls(args, data_dict)
+
+    def load_dataset(self, split, combine=False, **kwargs):
+        """Load a given dataset split (e.g., train, valid, test)."""
+
+        def get_path(key, split):
+            return os.path.join(self.args.data, key, split)
+
+        def make_dataset(key, dictionary):
+            split_path = get_path(key, split)
+
+            dataset = data_utils.load_indexed_dataset(
+                split_path,
+                dictionary,
+                self.args.dataset_impl,
+                combine=combine,
+            )
+            return dataset
+
+        input0 = make_dataset('input0', self.source_dictionary)
+        if input0 is None:
+            raise FileNotFoundError('could not find dataset: {}'.format(get_path('input0', split)))
+        input_options = [
+            make_dataset(
+                'input{idx}'.format(idx=idx + 1),
+                self.source_dictionary
+            )
+            for idx in range(self.args.num_classes)
+        ]
+
+        if self.args.separator_token is not None:
+            input0 = PrependTokenDataset(input0, self.args.separator_token)
+
+        # every option is concatenated in front of the shared input0
+        src_tokens = []
+        for input_option in input_options:
+            if self.args.init_token is not None:
+                input_option = PrependTokenDataset(input_option, self.args.init_token)
+            if self.args.max_option_length is not None:
+                input_option = TruncateDataset(input_option, self.args.max_option_length)
+            src_token = ConcatSentencesDataset(input_option, input0)
+            src_token = maybe_shorten_dataset(
+                src_token,
+                split,
+                self.args.shorten_data_split_list,
+                self.args.shorten_method,
+                self.args.max_positions,
+                self.args.seed,
+            )
+            src_tokens.append(src_token)
+
+        with data_utils.numpy_seed(self.args.seed):
+            shuffle = np.random.permutation(len(src_tokens[0]))
+
+        dataset = {
+            'id': IdDataset(),
+            'nsentences': NumSamplesDataset(),
+            'ntokens': NumelDataset(src_tokens[0], reduce=True),
+        }
+
+        for idx, src_token in enumerate(src_tokens, start=1):
+            dataset['net_input{idx}'.format(idx=idx)] = {
+                'src_tokens': RightPadDataset(
+                    src_token,
+                    pad_idx=self.source_dictionary.pad(),
+                ),
+                'src_lengths': NumelDataset(src_token, reduce=False),
+            }
+
+        label_path = '{}.label'.format(get_path('label', split))
+        if os.path.exists(label_path):
+            with open(label_path) as h:
+                dataset.update(
+                    target=RawLabelDataset([
+                        int(x.strip()) for x in h.readlines()
+                    ])
+                )
+
+        # a sample's size is the element-wise maximum over its concatenated options
+        nested_dataset = NestedDictionaryDataset(
+            dataset,
+            sizes=[np.maximum.reduce([src_token.sizes for src_token in src_tokens])],
+        )
+
+        if self.args.no_shuffle:
+            dataset = nested_dataset
+        else:
+            dataset = SortDataset(
+                nested_dataset,
+                # shuffle
+                sort_order=[shuffle],
+            )
+
+        logger.info("Loaded {0} with #samples: {1}".format(split, len(dataset)))
+
+        self.datasets[split] = dataset
+        return self.datasets[split]
+
+    def build_model(self, args):
+        from fairseq import models
+        model = models.build_model(args, self)
+
+        model.register_classification_head(
+            getattr(args, 'ranking_head_name', 'sentence_classification_head'),
+            num_classes=1,
+        )
+
+        return model
+
+    def max_positions(self):
+        return self.args.max_positions
+
+    @property
+    def source_dictionary(self):
+        return self.dictionary
+
+    @property
+    def target_dictionary(self):
+        return self.dictionary
diff --git a/fairseq/tasks/translation.py b/fairseq/tasks/translation.py
new file mode 100644
index 0000000000000000000000000000000000000000..7077943c1ef8d51f2393026332f047af4e235c1f
--- /dev/null
+++ b/fairseq/tasks/translation.py
@@ -0,0 +1,389 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from argparse import Namespace
+import json
+import itertools
+import logging
+import os
+
+import numpy as np
+
+from fairseq import metrics, options, utils
+from fairseq.data import (
+    AppendTokenDataset,
+    ConcatDataset,
+    data_utils,
+    encoders,
+    indexed_dataset,
+    LanguagePairDataset,
+    PrependTokenDataset,
+    StripTokenDataset,
+    TruncateDataset,
+)
+
+from fairseq.tasks import FairseqTask, register_task
+
+EVAL_BLEU_ORDER = 4
+
+
+logger = logging.getLogger(__name__)
+
+
+def load_langpair_dataset(
+    data_path, split,
+    src, src_dict,
+    tgt, tgt_dict,
+    combine, dataset_impl, upsample_primary,
+    left_pad_source, left_pad_target, max_source_positions,
+    max_target_positions, prepend_bos=False, load_alignments=False,
+    truncate_source=False, append_source_id=False,
+    num_buckets=0,
+    shuffle=True,
+):
+    """Load the ``src``/``tgt`` sides of ``split`` from ``data_path`` and wrap them in a LanguagePairDataset."""
+    def split_exists(split, src, tgt, lang, data_path):
+        filename = os.path.join(data_path, '{}.{}-{}.{}'.format(split, src, tgt, lang))
+        return indexed_dataset.dataset_exists(filename, impl=dataset_impl)
+
+    src_datasets = []
+    tgt_datasets = []
+
+    for k in itertools.count():
+        split_k = split + (str(k) if k > 0 else '')  # shard 0 is `split`, later shards are `split1`, `split2`, ...
+
+        # infer langcode
+        if split_exists(split_k, src, tgt, src, data_path):
+            prefix = os.path.join(data_path, '{}.{}-{}.'.format(split_k, src, tgt))
+        elif split_exists(split_k, tgt, src, src, data_path):
+            prefix = os.path.join(data_path, '{}.{}-{}.'.format(split_k, tgt, src))
+        else:
+            if k > 0:
+                break  # no further shard: stop collecting
+            else:
+                raise FileNotFoundError('Dataset not found: {} ({})'.format(split, data_path))
+
+        src_dataset = data_utils.load_indexed_dataset(prefix + src, src_dict, dataset_impl)
+        if truncate_source:
+            src_dataset = AppendTokenDataset(
+                TruncateDataset(
+                    StripTokenDataset(src_dataset, src_dict.eos()),
+                    max_source_positions - 1,  # leave room for the eos appended below
+                ),
+                src_dict.eos(),
+            )
+        src_datasets.append(src_dataset)
+
+        tgt_dataset = data_utils.load_indexed_dataset(prefix + tgt, tgt_dict, dataset_impl)
+        if tgt_dataset is not None:
+            tgt_datasets.append(tgt_dataset)
+
+        logger.info('{} {} {}-{} {} examples'.format(
+            data_path, split_k, src, tgt, len(src_datasets[-1])
+        ))
+
+        if not combine:
+            break  # only the first shard is used unless combining
+
+    # the target side must be present for every shard or for none of them
+    assert len(src_datasets) == len(tgt_datasets) or len(tgt_datasets) == 0
+
+    if len(src_datasets) == 1:
+        src_dataset = src_datasets[0]
+        tgt_dataset = tgt_datasets[0] if len(tgt_datasets) > 0 else None
+    else:
+        sample_ratios = [1] * len(src_datasets)
+        sample_ratios[0] = upsample_primary  # only the first shard gets the upsampling ratio
+        src_dataset = ConcatDataset(src_datasets, sample_ratios)
+        if len(tgt_datasets) > 0:
+            tgt_dataset = ConcatDataset(tgt_datasets, sample_ratios)
+        else:
+            tgt_dataset = None
+
+    if prepend_bos:
+        assert hasattr(src_dict, "bos_index") and hasattr(tgt_dict, "bos_index")
+        src_dataset = PrependTokenDataset(src_dataset, src_dict.bos())
+        if tgt_dataset is not None:
+            tgt_dataset = PrependTokenDataset(tgt_dataset, tgt_dict.bos())
+
+    eos = None
+    if append_source_id:
+        # append the '[lang]' symbol to each side and pass the target one on as `eos`
+        src_dataset = AppendTokenDataset(src_dataset, src_dict.index('[{}]'.format(src)))
+        if tgt_dataset is not None:
+            tgt_dataset = AppendTokenDataset(tgt_dataset, tgt_dict.index('[{}]'.format(tgt)))
+        eos = tgt_dict.index('[{}]'.format(tgt))
+    align_dataset = None
+    if load_alignments:
+        align_path = os.path.join(data_path, '{}.align.{}-{}'.format(split, src, tgt))
+        if indexed_dataset.dataset_exists(align_path, impl=dataset_impl):  # stays None when the file is absent
+            align_dataset = data_utils.load_indexed_dataset(align_path, None, dataset_impl)
+
+    tgt_dataset_sizes = tgt_dataset.sizes if tgt_dataset is not None else None
+    return LanguagePairDataset(
+        src_dataset, src_dataset.sizes, src_dict,
+        tgt_dataset, tgt_dataset_sizes, tgt_dict,
+        left_pad_source=left_pad_source,
+        left_pad_target=left_pad_target,
+        align_dataset=align_dataset, eos=eos,
+        num_buckets=num_buckets,
+        shuffle=shuffle,
+    )
+
+
+@register_task('translation')
+class TranslationTask(FairseqTask):
+    """
+    Translate from one (source) language to another (target) language.
+
+    Args:
+        src_dict (~fairseq.data.Dictionary): dictionary for the source language
+        tgt_dict (~fairseq.data.Dictionary): dictionary for the target language
+
+    .. note::
+
+        The translation task is compatible with :mod:`fairseq-train`,
+        :mod:`fairseq-generate` and :mod:`fairseq-interactive`.
+
+    The translation task provides the following additional command-line
+    arguments:
+
+    .. argparse::
+        :ref: fairseq.tasks.translation_parser
+        :prog:
+    """
+
+    @staticmethod
+    def add_args(parser):
+        """Add task-specific arguments to the parser."""
+        # fmt: off
+        parser.add_argument('data', help='colon separated path to data directories list, '
+                            'will be iterated upon during epochs in round-robin manner')
+        parser.add_argument('-s', '--source-lang', default=None, metavar='SRC',
+                            help='source language')
+        parser.add_argument('-t', '--target-lang', default=None, metavar='TARGET',
+                            help='target language')
+        parser.add_argument('--load-alignments', action='store_true',
+                            help='load the binarized alignments')
+        parser.add_argument('--left-pad-source', default='True', type=str, metavar='BOOL',
+                            help='pad the source on the left')
+        parser.add_argument('--left-pad-target', default='False', type=str, metavar='BOOL',
+                            help='pad the target on the left')
+        parser.add_argument('--max-source-positions', default=1024, type=int, metavar='N',
+                            help='max number of tokens in the source sequence')
+        parser.add_argument('--max-target-positions', default=1024, type=int, metavar='N',
+                            help='max number of tokens in the target sequence')
+        parser.add_argument('--upsample-primary', default=1, type=int,
+                            help='amount to upsample primary dataset')
+        parser.add_argument('--truncate-source', action='store_true', default=False,
+                            help='truncate source to max-source-positions')
+        parser.add_argument('--num-batch-buckets', default=0, type=int, metavar='N',
+                            help='if >0, then bucket source and target lengths into N '
+                                 'buckets and pad accordingly; this is useful on TPUs '
+                                 'to minimize the number of compilations')
+
+        # options for reporting BLEU during validation
+        parser.add_argument('--eval-bleu', action='store_true',
+                            help='evaluation with BLEU scores')
+        parser.add_argument('--eval-bleu-detok', type=str, default="space",
+                            help='detokenize before computing BLEU (e.g., "moses"); '
+                                 'required if using --eval-bleu; use "space" to '
+                                 'disable detokenization; see fairseq.data.encoders '
+                                 'for other options')
+        parser.add_argument('--eval-bleu-detok-args', type=str, metavar='JSON',
+                            help='args for building the tokenizer, if needed')
+        parser.add_argument('--eval-tokenized-bleu', action='store_true', default=False,
+                            help='compute tokenized BLEU instead of sacrebleu')
+        parser.add_argument('--eval-bleu-remove-bpe', nargs='?', const='@@ ', default=None,
+                            help='remove BPE before computing BLEU')
+        parser.add_argument('--eval-bleu-args', type=str, metavar='JSON',
+                            help='generation args for BLEU scoring, '
+                                 'e.g., \'{"beam": 4, "lenpen": 0.6}\'')
+        parser.add_argument('--eval-bleu-print-samples', action='store_true',
+                            help='print sample generations during validation')
+        # fmt: on
+
+    def __init__(self, args, src_dict, tgt_dict):
+        """Initialize the task and keep the source and target dictionaries."""
+        super().__init__(args)
+        self.src_dict, self.tgt_dict = src_dict, tgt_dict
+
+    @classmethod
+    def setup_task(cls, args, **kwargs):
+        """Build the task: resolve the language pair and load both dictionaries.
+
+        Args:
+            args (argparse.Namespace): parsed command-line arguments; the left-pad
+                flags and any inferred languages are written back onto it
+        """
+        args.left_pad_source = options.eval_bool(args.left_pad_source)
+        args.left_pad_target = options.eval_bool(args.left_pad_target)
+
+        data_dirs = utils.split_paths(args.data)
+        assert len(data_dirs) > 0
+        root = data_dirs[0]
+        if args.source_lang is None or args.target_lang is None:
+            args.source_lang, args.target_lang = data_utils.infer_language_pair(root)
+        if args.source_lang is None or args.target_lang is None:
+            raise Exception('Could not infer language pair, please provide it explicitly')
+
+        src_dict = cls.load_dictionary(os.path.join(root, 'dict.{}.txt'.format(args.source_lang)))
+        tgt_dict = cls.load_dictionary(os.path.join(root, 'dict.{}.txt'.format(args.target_lang)))
+        # both dictionaries must agree on the special-symbol indices
+        assert src_dict.pad() == tgt_dict.pad()
+        assert src_dict.eos() == tgt_dict.eos()
+        assert src_dict.unk() == tgt_dict.unk()
+        for lang, dictionary in ((args.source_lang, src_dict), (args.target_lang, tgt_dict)):
+            logger.info('[{}] dictionary: {} types'.format(lang, len(dictionary)))
+        return cls(args, src_dict, tgt_dict)
+
+    def load_dataset(self, split, epoch=1, combine=False, **kwargs):
+        """Load one split of the language-pair data into ``self.datasets``.
+
+        The data directory is picked round-robin from ``args.data`` by epoch.
+
+        Args:
+            split (str): name of the split (e.g., train, valid, test)
+        """
+        args = self.args
+        data_dirs = utils.split_paths(args.data)
+        assert len(data_dirs) > 0
+        self.datasets[split] = load_langpair_dataset(
+            data_dirs[(epoch - 1) % len(data_dirs)], split,
+            args.source_lang, self.src_dict, args.target_lang, self.tgt_dict,
+            combine=combine,
+            dataset_impl=args.dataset_impl,
+            upsample_primary=args.upsample_primary,
+            left_pad_source=args.left_pad_source,
+            left_pad_target=args.left_pad_target,
+            max_source_positions=args.max_source_positions,
+            max_target_positions=args.max_target_positions,
+            load_alignments=args.load_alignments,
+            truncate_source=args.truncate_source,
+            num_buckets=args.num_batch_buckets,
+            shuffle=(split != 'test'),
+        )
+
+    def build_dataset_for_inference(self, src_tokens, src_lengths):  # source-only dataset: no target side is passed
+        return LanguagePairDataset(src_tokens, src_lengths, self.source_dictionary)
+
+    def build_model(self, args):
+        """Build the model; with ``--eval-bleu`` also prepare the detokenizer and generator."""
+        model = super().build_model(args)
+        if not getattr(args, 'eval_bleu', False):
+            return model
+
+        detok = getattr(args, 'eval_bleu_detok', None)
+        assert detok is not None, (
+            '--eval-bleu-detok is required if using --eval-bleu; '
+            'try --eval-bleu-detok=moses (or --eval-bleu-detok=space '
+            'to disable detokenization, e.g., when using sentencepiece)'
+        )
+        detok_kwargs = json.loads(getattr(args, 'eval_bleu_detok_args', '{}') or '{}')
+        self.tokenizer = encoders.build_tokenizer(Namespace(tokenizer=detok, **detok_kwargs))
+        gen_kwargs = json.loads(getattr(args, 'eval_bleu_args', '{}') or '{}')
+        self.sequence_generator = self.build_generator([model], Namespace(**gen_kwargs))
+        return model
+
+    def valid_step(self, sample, model, criterion):
+        """Run the parent validation step; with ``--eval-bleu`` also record BLEU statistics."""
+        loss, sample_size, logging_output = super().valid_step(sample, model, criterion)
+        if self.args.eval_bleu:
+            stats = self._inference_with_bleu(self.sequence_generator, sample, model)
+            logging_output['_bleu_sys_len'] = stats.sys_len
+            logging_output['_bleu_ref_len'] = stats.ref_len
+            # one flat entry per n-gram order, so workers can sum them via fast-stat-sync
+            assert len(stats.counts) == EVAL_BLEU_ORDER
+            for order in range(EVAL_BLEU_ORDER):
+                logging_output['_bleu_counts_' + str(order)] = stats.counts[order]
+                logging_output['_bleu_totals_' + str(order)] = stats.totals[order]
+        return loss, sample_size, logging_output
+
+    def reduce_metrics(self, logging_outputs, criterion):
+        """Aggregate logging outputs; with ``--eval-bleu`` also log a derived ``bleu`` metric."""
+        super().reduce_metrics(logging_outputs, criterion)
+        if not self.args.eval_bleu:
+            return
+
+        def sum_logs(key):
+            return sum(log.get(key, 0) for log in logging_outputs)
+
+        orders = range(EVAL_BLEU_ORDER)
+        counts = [sum_logs('_bleu_counts_' + str(i)) for i in orders]
+        totals = [sum_logs('_bleu_totals_' + str(i)) for i in orders]
+        if not max(totals) > 0:
+            return
+
+        # log counts as numpy arrays -- log_scalar will sum them correctly
+        metrics.log_scalar('_bleu_counts', np.array(counts))
+        metrics.log_scalar('_bleu_totals', np.array(totals))
+        metrics.log_scalar('_bleu_sys_len', sum_logs('_bleu_sys_len'))
+        metrics.log_scalar('_bleu_ref_len', sum_logs('_bleu_ref_len'))
+
+        def compute_bleu(meters):
+            import inspect
+            import sacrebleu
+            # the smoothing keyword is looked up in the installed sacrebleu's signature
+            accepted = inspect.getfullargspec(sacrebleu.compute_bleu)[0]
+            smooth_key = 'smooth_method' if 'smooth_method' in accepted else 'smooth'
+            bleu = sacrebleu.compute_bleu(
+                correct=meters['_bleu_counts'].sum,
+                total=meters['_bleu_totals'].sum,
+                sys_len=meters['_bleu_sys_len'].sum,
+                ref_len=meters['_bleu_ref_len'].sum,
+                **{smooth_key: 'exp'}
+            )
+            return round(bleu.score, 2)
+
+        metrics.log_derived('bleu', compute_bleu)
+
+    def max_positions(self):
+        """Return the ``(max source, max target)`` position limits read from the task args."""
+        return self.args.max_source_positions, self.args.max_target_positions
+
+    @property
+    def source_dictionary(self):
+        """Return the source :class:`~fairseq.data.Dictionary` (the ``src_dict`` attribute)."""
+        return self.src_dict
+
+    @property
+    def target_dictionary(self):
+        """Return the target :class:`~fairseq.data.Dictionary` (the ``tgt_dict`` attribute)."""
+        return self.tgt_dict
+
+    def _inference_with_bleu(self, generator, sample, model):
+        """Generate hypotheses for ``sample`` and score them against its targets with sacrebleu."""
+        import sacrebleu
+
+        def decode(toks, escape_unk=False):
+            # The default unknown string in fairseq is `<unk>`, but this is
+            # tokenized by sacrebleu as `< unk >`, inflating BLEU scores. Use
+            # verbose placeholders instead: unlikely to appear in a real
+            # reference, and not split into multiple tokens.
+            unk_string = "UNKNOWNTOKENINREF" if escape_unk else "UNKNOWNTOKENINHYP"
+            text = self.tgt_dict.string(
+                toks.int().cpu(),
+                self.args.eval_bleu_remove_bpe,
+                unk_string=unk_string,
+            )
+            if self.tokenizer:
+                text = self.tokenizer.decode(text)
+            return text
+
+        gen_out = self.inference_step(generator, [model], sample, None)
+        hyps, refs = [], []
+        for i, hypos in enumerate(gen_out):
+            hyps.append(decode(hypos[0]['tokens']))
+            # escape <unk> in the reference so it never counts as a match to the hypo
+            reference = utils.strip_pad(sample['target'][i], self.tgt_dict.pad())
+            refs.append(decode(reference, escape_unk=True))
+
+        if self.args.eval_bleu_print_samples:
+            logger.info('example hypothesis: ' + hyps[0])
+            logger.info('example reference: ' + refs[0])
+
+        # --eval-tokenized-bleu passes tokenize='none' to sacrebleu; otherwise
+        # corpus_bleu is called with its default tokenization
+        bleu_kwargs = {'tokenize': 'none'} if self.args.eval_tokenized_bleu else {}
+        return sacrebleu.corpus_bleu(hyps, [refs], **bleu_kwargs)
diff --git a/fairseq/tasks/translation_from_pretrained_bart.py b/fairseq/tasks/translation_from_pretrained_bart.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b7d589ceed8c4d5f33be97a2e05b73e6da24f5b
--- /dev/null
+++ b/fairseq/tasks/translation_from_pretrained_bart.py
@@ -0,0 +1,119 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+
+from fairseq.data import LanguagePairDataset
+from fairseq import utils
+
+from .translation import load_langpair_dataset, TranslationTask
+from . import register_task
+
+
+@register_task('translation_from_pretrained_bart')
+class TranslationFromPretrainedBARTTask(TranslationTask):
+    """
+    Translate from source language to target language with a model initialized with a multilingual pretrain.
+
+    Args:
+        src_dict (~fairseq.data.Dictionary): source dictionary; gains ``[lang]`` symbols and ``<mask>``
+        tgt_dict (~fairseq.data.Dictionary): target dictionary; gains the same symbols
+
+    .. note::
+
+        The translation task is compatible with :mod:`fairseq-train`,
+        :mod:`fairseq-generate` and :mod:`fairseq-interactive`.
+
+    The translation task provides the following additional command-line
+    arguments:
+
+    .. argparse::
+        :ref: fairseq.tasks.translation_parser
+        :prog:
+    """
+
+    @staticmethod
+    def add_args(parser):
+        """Add the base translation arguments plus ``--langs`` and ``--prepend-bos``."""
+        # fmt: off
+        TranslationTask.add_args(parser)
+        parser.add_argument('--langs', required=True, metavar='LANG', help=(
+            'comma-separated list of monolingual language, '
+            'for example, "en,de,fr". These should match the '
+            'langs from pretraining (and be in the same order). '
+            'You should always add all pretraining language idx during finetuning.'
+        ))
+        parser.add_argument(
+            '--prepend-bos', action='store_true',
+            help='prepend bos token to each sentence, which matches mBART pretraining')
+        # fmt: on
+
+    def __init__(self, args, src_dict, tgt_dict):
+        super().__init__(args, src_dict, tgt_dict)
+        self.langs = args.langs.split(',')
+        extra_symbols = ['[{}]'.format(lang) for lang in self.langs] + ['<mask>']  # language ids first, then <mask>
+        for dictionary in (src_dict, tgt_dict):
+            for symbol in extra_symbols:
+                dictionary.add_symbol(symbol)
+
+    def load_dataset(self, split, epoch=1, combine=False, **kwargs):
+        """Load a dataset split via ``load_langpair_dataset`` with ``append_source_id=True``.
+
+        The data directory is picked round-robin from ``args.data`` by epoch.
+
+        Args:
+            split (str): name of the split (e.g., train, valid, test)
+        """
+        args = self.args
+        data_dirs = utils.split_paths(args.data)
+        assert len(data_dirs) > 0
+        self.datasets[split] = load_langpair_dataset(
+            data_dirs[(epoch - 1) % len(data_dirs)], split,
+            args.source_lang, self.src_dict, args.target_lang, self.tgt_dict,
+            combine=combine,
+            dataset_impl=args.dataset_impl,
+            upsample_primary=args.upsample_primary,
+            left_pad_source=args.left_pad_source,
+            left_pad_target=args.left_pad_target,
+            max_source_positions=getattr(args, 'max_source_positions', 1024),
+            max_target_positions=getattr(args, 'max_target_positions', 1024),
+            load_alignments=args.load_alignments,
+            prepend_bos=getattr(args, 'prepend_bos', False),
+            append_source_id=True,
+        )
+
+    def build_generator(self, models, args):
+        """Build a reference scorer (``score_reference``) or a beam-search generator."""
+        # the ``[target_lang]`` symbol, not the dictionary's own eos(), is passed as ``eos``
+        eos = self.tgt_dict.index('[{}]'.format(self.args.target_lang))
+        if getattr(args, 'score_reference', False):
+            from fairseq.sequence_scorer import SequenceScorer
+            return SequenceScorer(self.target_dictionary, eos=eos)
+
+        from fairseq.sequence_generator import SequenceGenerator
+        return SequenceGenerator(
+            models,
+            self.target_dictionary,
+            beam_size=getattr(args, 'beam', 5),
+            max_len_a=getattr(args, 'max_len_a', 0),
+            max_len_b=getattr(args, 'max_len_b', 200),
+            min_len=getattr(args, 'min_len', 1),
+            normalize_scores=(not getattr(args, 'unnormalized', False)),
+            len_penalty=getattr(args, 'lenpen', 1),
+            unk_penalty=getattr(args, 'unkpen', 0),
+            temperature=getattr(args, 'temperature', 1.),
+            match_source_len=getattr(args, 'match_source_len', False),
+            no_repeat_ngram_size=getattr(args, 'no_repeat_ngram_size', 0),
+            eos=eos,
+        )
+
+    def build_dataset_for_inference(self, src_tokens, src_lengths):
+        """Wrap raw source tensors in a dataset, appending the ``[source_lang]`` index to each."""
+        src_lang_id = self.source_dictionary.index('[{}]'.format(self.args.source_lang))
+        # NOTE(review): src_lengths is forwarded unchanged although each sentence gains a token
+        tagged = [
+            torch.cat([tokens, tokens.new(1).fill_(src_lang_id)]) for tokens in src_tokens
+        ]
+        return LanguagePairDataset(tagged, src_lengths, self.source_dictionary)
diff --git a/fairseq/tasks/translation_from_pretrained_xlm.py b/fairseq/tasks/translation_from_pretrained_xlm.py
new file mode 100644
index 0000000000000000000000000000000000000000..347a6eccb7657e6d20d1f1304b76fe31bc731393
--- /dev/null
+++ b/fairseq/tasks/translation_from_pretrained_xlm.py
@@ -0,0 +1,31 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from fairseq.data.legacy.masked_lm_dictionary import MaskedLMDictionary
+from fairseq.tasks.translation import TranslationTask
+
+from . import register_task
+
+
+@register_task("translation_from_pretrained_xlm")
+class TranslationFromPretrainedXLMTask(TranslationTask):
+    """
+    Same as TranslationTask, except that dictionaries are loaded with the
+    MaskedLMDictionary class so that data binarized with it can be read.
+
+    This task should be used for the entire training pipeline when we want to
+    train an NMT model from a pretrained XLM checkpoint: binarizing NMT data,
+    training NMT with the pretrained XLM checkpoint, and subsequent evaluation
+    of that trained model.
+    """
+
+    @classmethod
+    def load_dictionary(cls, filename):
+        """Load a dictionary from ``filename`` and return the result of ``MaskedLMDictionary.load``.
+
+        Args:
+            filename (str): the dictionary file to load
+        """
+        return MaskedLMDictionary.load(filename)
diff --git a/fairseq/tasks/translation_lev.py b/fairseq/tasks/translation_lev.py
new file mode 100644
index 0000000000000000000000000000000000000000..845dd8164456fbd76c8477205cba496dad6f3332
--- /dev/null
+++ b/fairseq/tasks/translation_lev.py
@@ -0,0 +1,169 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import os
+
+import torch
+
+from fairseq.data import LanguagePairDataset
+
+from fairseq.utils import new_arange
+from fairseq.tasks import register_task
+from fairseq.tasks.translation import TranslationTask, load_langpair_dataset
+from fairseq import utils
+
+@register_task('translation_lev')
+class TranslationLevenshteinTask(TranslationTask):
+    """
+    Translation task for the Levenshtein Transformer; each step stores a noised target as ``prev_target``.
+    See `"Levenshtein Transformer" <https://arxiv.org/abs/1905.11006>`_.
+    """
+
+    @staticmethod
+    def add_args(parser):
+        """Add the base translation arguments plus ``--noise`` to the parser."""
+        # fmt: off
+        TranslationTask.add_args(parser)
+        parser.add_argument(
+            '--noise',  # selects which corruption inject_noise applies
+            default='random_delete',
+            choices=['random_delete', 'random_mask', 'no_noise', 'full_mask'])
+
+    def load_dataset(self, split, epoch=1, combine=False, **kwargs):
+        """Load a given dataset split (requested with ``prepend_bos=True``).
+
+        Args:
+            split (str): name of the split (e.g., train, valid, test)
+        """
+        paths = utils.split_paths(self.args.data)
+        assert len(paths) > 0
+        data_path = paths[(epoch - 1) % len(paths)]  # round-robin over the data directories by epoch
+
+        # language codes are taken as-is from the parsed args
+        src, tgt = self.args.source_lang, self.args.target_lang
+
+        self.datasets[split] = load_langpair_dataset(
+            data_path, split, src, self.src_dict, tgt, self.tgt_dict,
+            combine=combine, dataset_impl=self.args.dataset_impl,
+            upsample_primary=self.args.upsample_primary,
+            left_pad_source=self.args.left_pad_source,
+            left_pad_target=self.args.left_pad_target,
+            max_source_positions=self.args.max_source_positions,
+            max_target_positions=self.args.max_target_positions,
+            prepend_bos=True,
+        )
+
+    def inject_noise(self, target_tokens):  # returns a corrupted copy of target_tokens, chosen by args.noise
+        def _random_delete(target_tokens):  # drop a random subset of the non-special tokens
+            pad = self.tgt_dict.pad()
+            bos = self.tgt_dict.bos()
+            eos = self.tgt_dict.eos()
+            # random score per token; bos/eos are forced to 0 and pads to 1 before sorting
+            max_len = target_tokens.size(1)
+            target_mask = target_tokens.eq(pad)
+            target_score = target_tokens.clone().float().uniform_()
+            target_score.masked_fill_(
+                target_tokens.eq(bos) | target_tokens.eq(eos), 0.0)
+            target_score.masked_fill_(target_mask, 1)
+            target_score, target_rank = target_score.sort(1)
+            target_length = target_mask.size(1) - target_mask.float().sum(
+                1, keepdim=True)  # number of non-pad tokens in each row
+
+            # do not delete <bos> and <eos> (we assign 0 score for them): the random per-row cutoff is at least 2
+            target_cutoff = 2 + ((target_length - 2) * target_score.new_zeros(
+                target_score.size(0), 1).uniform_()).long()
+            target_cutoff = target_score.sort(1)[1] >= target_cutoff
+            # NOTE(review): appears to pad out the cut tokens and put survivors back in original order -- confirm
+            prev_target_tokens = target_tokens.gather(
+                1, target_rank).masked_fill_(target_cutoff, pad).gather(
+                    1,
+                    target_rank.masked_fill_(target_cutoff,
+                                             max_len).sort(1)[1])
+            prev_target_tokens = prev_target_tokens[:, :prev_target_tokens.
+                                                    ne(pad).sum(1).max()]  # keep as many columns as the largest non-pad count
+
+            return prev_target_tokens
+
+        def _random_mask(target_tokens):  # replace a random subset of the non-special tokens with unk
+            pad = self.tgt_dict.pad()
+            bos = self.tgt_dict.bos()
+            eos = self.tgt_dict.eos()
+            unk = self.tgt_dict.unk()
+            # True for tokens that are none of pad/bos/eos
+            target_masks = target_tokens.ne(pad) & \
+                           target_tokens.ne(bos) & \
+                           target_tokens.ne(eos)
+            target_score = target_tokens.clone().float().uniform_()
+            target_score.masked_fill_(~target_masks, 2.0)  # special tokens get the fixed score 2.0
+            target_length = target_masks.sum(1).float()
+            target_length = target_length * target_length.clone().uniform_()
+            target_length = target_length + 1  # make sure to mask at least one token.
+            # flag the first target_length ranks per row (new_arange presumably yields column indices -- confirm)
+            _, target_rank = target_score.sort(1)
+            target_cutoff = new_arange(target_rank) < target_length[:, None].long()
+            prev_target_tokens = target_tokens.masked_fill(
+                target_cutoff.scatter(1, target_rank, target_cutoff), unk)  # scatter the flags back to token positions
+            return prev_target_tokens
+
+        def _full_mask(target_tokens):  # replace every token except bos/eos/pad with unk
+            pad = self.tgt_dict.pad()
+            bos = self.tgt_dict.bos()
+            eos = self.tgt_dict.eos()
+            unk = self.tgt_dict.unk()
+            # bos/eos/pad positions are left untouched
+            target_mask = target_tokens.eq(bos) | target_tokens.eq(
+                eos) | target_tokens.eq(pad)
+            return target_tokens.masked_fill(~target_mask, unk)
+        # dispatch on the configured noise type
+        if self.args.noise == 'random_delete':
+            return _random_delete(target_tokens)
+        elif self.args.noise == 'random_mask':
+            return _random_mask(target_tokens)
+        elif self.args.noise == 'full_mask':
+            return _full_mask(target_tokens)
+        elif self.args.noise == 'no_noise':
+            return target_tokens
+        else:
+            raise NotImplementedError
+
+    def build_generator(self, models, args):
+        # `models` is accepted only to match the SequenceGenerator-style API; it is not used here
+        from fairseq.iterative_refinement_generator import IterativeRefinementGenerator
+        return IterativeRefinementGenerator(
+            self.target_dictionary,
+            eos_penalty=getattr(args, 'iter_decode_eos_penalty', 0.0),
+            max_iter=getattr(args, 'iter_decode_max_iter', 10),
+            beam_size=getattr(args, 'iter_decode_with_beam', 1),
+            reranking=getattr(args, 'iter_decode_with_external_reranker', False),
+            decoding_format=getattr(args, 'decoding_format', None),
+            adaptive=not getattr(args, 'iter_decode_force_max_iter', False),
+            retain_history=getattr(args, 'retain_iter_history', False))
+
+    def build_dataset_for_inference(self, src_tokens, src_lengths):
+        return LanguagePairDataset(
+            src_tokens, src_lengths, self.source_dictionary, append_bos=True
+        )
+
+    def train_step(self,
+                   sample,
+                   model,
+                   criterion,
+                   optimizer,
+                   update_num,  # not used by this implementation
+                   ignore_grad=False):
+        model.train()
+        sample['prev_target'] = self.inject_noise(sample['target'])  # store the noised target in the sample
+        loss, sample_size, logging_output = criterion(model, sample)
+        if ignore_grad:
+            loss *= 0  # scale the loss to zero before the backward pass
+        optimizer.backward(loss)
+        return loss, sample_size, logging_output
+
+    def valid_step(self, sample, model, criterion):
+        model.eval()
+        with torch.no_grad():  # noise is injected during validation as well
+            sample['prev_target'] = self.inject_noise(sample['target'])
+            loss, sample_size, logging_output = criterion(model, sample)
+        return loss, sample_size, logging_output
diff --git a/fairseq/tasks/translation_multi_simple_epoch.py b/fairseq/tasks/translation_multi_simple_epoch.py
new file mode 100644
index 0000000000000000000000000000000000000000..eba32f17599f195476857edc93d2dd9557497415
--- /dev/null
+++ b/fairseq/tasks/translation_multi_simple_epoch.py
@@ -0,0 +1,322 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import datetime
+import time
+
+import torch
+from fairseq.data import (
+    data_utils,
+    FairseqDataset,
+    iterators,
+    LanguagePairDataset,
+    ListDataset,
+)
+
+from fairseq.tasks import FairseqTask, register_task
+from fairseq.data.multilingual.sampling_method import SamplingMethod
+from fairseq.data.multilingual.multilingual_data_manager import MultilingualDatasetManager
+
+
+def get_time_gap(s, e):
+    """Return the time elapsed from epoch timestamp ``s`` to ``e``, formatted as a ``timedelta`` string."""
+    elapsed = datetime.datetime.fromtimestamp(e) - datetime.datetime.fromtimestamp(s)
+    return str(elapsed)
+
+
+logger = logging.getLogger(__name__)
+
+
+@register_task('translation_multi_simple_epoch')
+class TranslationMultiSimpleEpochTask(FairseqTask):
+    """
+    Translate from one (source) language to another (target) language.
+
+    Args:
+        langs (List[str]): a list of languages that are being supported
+        dicts (Dict[str, fairseq.data.Dictionary]): mapping from supported languages to their dictionaries
+        training (bool): whether the task should be configured for training or not
+
+    .. note::
+
+        The translation task is compatible with :mod:`fairseq-train`,
+        :mod:`fairseq-generate` and :mod:`fairseq-interactive`.
+
+    The translation task provides the following additional command-line
+    arguments:
+
+    .. argparse::
+        :ref: fairseq.tasks.translation_parser
+        :prog:
+    """
+
+    @staticmethod
+    def add_args(parser):
+        """Register the task's own options, then those of the sampler and the data manager."""
+        # fmt: off
+        add = parser.add_argument
+        add('-s', '--source-lang', default=None, metavar='SRC', help='inference source language')
+        add('-t', '--target-lang', default=None, metavar='TARGET', help='inference target language')
+        add('--lang-pairs', default=None, metavar='PAIRS',
+            help='comma-separated list of language pairs (in training order): en-de,en-fr,de-fr')
+        add('--keep-inference-langtok', action='store_true',
+            help='keep language tokens in inference output (e.g. for analysis or debugging)')
+
+        # sampling and data-manager options are contributed by their own classes
+        SamplingMethod.add_arguments(parser)
+        MultilingualDatasetManager.add_args(parser)
+        # fmt: on
+
+    def __init__(self, args, langs, dicts, training):
+        """Set up language pairs, the sampling method and the multilingual data manager.
+
+        In training mode the pairs come from ``args.lang_pairs``; otherwise the
+        single ``source_lang-target_lang`` pair is used.
+        """
+        super().__init__(args)
+        self.langs, self.dicts, self.training = langs, dicts, training
+        self.lang_pairs = (
+            args.lang_pairs if training
+            else ['{}-{}'.format(args.source_lang, args.target_lang)]
+        )
+        # eval_lang_pairs usually equals lang_pairs; it is a separate attribute
+        # so that subclasses (e.g. multitask setups, or ones optimizing for
+        # certain languages) can evaluate on a different subset.
+        self.eval_lang_pairs = self.lang_pairs
+        # model_lang_pairs is used to build the encoder-decoder model pairs in
+        # models.build_model(); multitask subclasses may use other pairs.
+        self.model_lang_pairs = self.lang_pairs
+        self.sampling_method = SamplingMethod.build_sampler(args, self)
+        self.data_manager = MultilingualDatasetManager.setup_data_manager(
+            args, self.lang_pairs, langs, dicts, self.sampling_method)
+
+    @classmethod
+    def setup_task(cls, args, **kwargs):
+        """Create the task from what ``MultilingualDatasetManager.prepare`` returns."""
+        prepared = MultilingualDatasetManager.prepare(cls.load_dictionary, args, **kwargs)
+        langs, dicts, training = prepared
+        return cls(args, langs, dicts, training)
+
+    def has_sharded_data(self, split):  # answered by the data manager for the given split
+        return self.data_manager.has_sharded_data(split)
+
+    def load_dataset(self, split, epoch=1, combine=False, **kwargs):
+        """Load a given dataset split, reloading a loaded split only when its next shard is due.
+
+        Args:
+            split (str): name of the split (e.g., train, valid, test)
+        """
+        shard_epoch = None
+        if split in self.datasets:
+            loaded = self.datasets[split]
+            if not (self.has_sharded_data(split) and loaded.load_next_shard):
+                # no next shard to load; returning here also avoids always
+                # loading from the beginning of the data
+                return
+            shard_epoch = loaded.shard_epoch
+        logger.info(f'loading data for {split} epoch={epoch}/{shard_epoch}')
+        self.datasets[split] = self.data_manager.load_sampled_multi_epoch_dataset(
+            split,
+            self.training,
+            epoch=epoch, combine=combine,
+            shard_epoch=shard_epoch,
+            **kwargs
+        )
+
+    def build_dataset_for_inference(self, src_tokens, src_lengths):
+        """Wrap raw source tensors in a dataset and apply the 'main' language-token specs."""
+        args, manager = self.args, self.data_manager
+        dataset = LanguagePairDataset(
+            ListDataset(src_tokens, src_lengths), src_lengths, self.source_dictionary
+        )
+        src_langtok_spec, tgt_langtok_spec = args.langtoks['main']
+        if not args.lang_tok_replacing_bos_eos:
+            # only the source side of the dataset is transformed in this mode
+            dataset.src = manager.src_dataset_tranform_func(
+                args.source_lang, args.target_lang, dataset=dataset.src, spec=src_langtok_spec,
+            )
+            return dataset
+        return manager.alter_dataset_langtok(
+            dataset,
+            src_eos=self.source_dictionary.eos(),
+            src_lang=args.source_lang,
+            tgt_eos=self.target_dictionary.eos(),
+            tgt_lang=args.target_lang,
+            src_langtok_spec=src_langtok_spec,
+            tgt_langtok_spec=tgt_langtok_spec,
+        )
+
+    def build_generator(
+        self, models, args,
+        seq_gen_cls=None, extra_gen_cls_kwargs=None,
+    ):
+        """Build the parent's generator, passing the target language token as ``symbols_to_strip_from_output``."""
+        if not getattr(args, 'keep_inference_langtok', False):
+            _, tgt_langtok_spec = self.args.langtoks['main']
+            if tgt_langtok_spec:
+                tgt_lang_tok = self.data_manager.get_decoder_langtok(self.args.target_lang, tgt_langtok_spec)
+                # build a new dict rather than mutating the one the caller
+                # passed in; the strip set still overrides any caller value
+                extra_gen_cls_kwargs = dict(extra_gen_cls_kwargs or {}, symbols_to_strip_from_output={tgt_lang_tok})
+
+        return super().build_generator(  # forward the caller's seq_gen_cls instead of always passing None
+            models, args, seq_gen_cls=seq_gen_cls, extra_gen_cls_kwargs=extra_gen_cls_kwargs,
+        )
+
+    def build_model(self, args):  # no task-specific changes: defers entirely to the parent class
+        return super().build_model(args)
+
+    def valid_step(self, sample, model, criterion):  # parent's validation step; its three results are returned as a tuple
+        loss, sample_size, logging_output = super().valid_step(sample, model, criterion)
+        return loss, sample_size, logging_output
+
+    def inference_step(self, generator, models, sample, prefix_tokens=None):
+        """Generate without gradients, supplying the target language token when a spec is configured."""
+        _, tgt_langtok_spec = self.args.langtoks['main']
+        target_lang = self.args.target_lang
+        with torch.no_grad():
+            if self.args.lang_tok_replacing_bos_eos:
+                # the language token (or eos when no spec is set) is passed as bos_token
+                if tgt_langtok_spec:
+                    bos_token = self.data_manager.get_decoder_langtok(target_lang, tgt_langtok_spec)
+                else:
+                    bos_token = self.target_dictionary.eos()
+                return generator.generate(
+                    models, sample, prefix_tokens=prefix_tokens, bos_token=bos_token,
+                )
+
+            if prefix_tokens is None and tgt_langtok_spec:
+                # use the language token as a one-token prefix for every sentence in the batch
+                tgt_lang_tok = self.data_manager.get_decoder_langtok(target_lang, tgt_langtok_spec)
+                src_tokens = sample['net_input']['src_tokens']
+                bsz = src_tokens.size(0)
+                prefix_tokens = torch.LongTensor([[tgt_lang_tok]]).expand(bsz, 1).to(src_tokens)
+            return generator.generate(
+                models, sample, prefix_tokens=prefix_tokens,
+            )
+
+    def reduce_metrics(self, logging_outputs, criterion):
+        # Pure pass-through: metric aggregation is delegated to the parent task.
+        super().reduce_metrics(logging_outputs, criterion)
+
+    def max_positions(self):
+        """Return the ``(source, target)`` length limits configured in ``args``."""
+        args = self.args
+        return (args.max_source_positions, args.max_target_positions)
+
+    @property
+    def source_dictionary(self):
+        """Dictionary used for the source side.
+
+        While ``self.training`` is true the first dictionary in ``self.dicts``
+        is returned; otherwise the one registered for ``args.source_lang``.
+        """
+        if not self.training:
+            return self.dicts[self.args.source_lang]
+        return next(iter(self.dicts.values()))
+
+    @property
+    def target_dictionary(self):
+        """Dictionary used for the target side.
+
+        While ``self.training`` is true the first dictionary in ``self.dicts``
+        is returned; otherwise the one registered for ``args.target_lang``.
+        """
+        if not self.training:
+            return self.dicts[self.args.target_lang]
+        return next(iter(self.dicts.values()))
+
+    def create_batch_sampler_func(
+        self, max_positions, ignore_invalid_inputs,
+        max_tokens, max_sentences
+    ):
+        """Return a ``(dataset, epoch)`` callable that builds a batch sampler.
+
+        The returned closure obtains the dataset's ordered indices, filters
+        them by ``max_positions`` (when it is not None), groups them into
+        batches bounded by ``max_tokens`` / ``max_sentences`` and logs the
+        time spent in each stage at debug level.
+        """
+        def construct_batch_sampler(
+            dataset, epoch
+        ):
+            # name of the split this dataset is registered under (log prefix only)
+            matching = [name for name in self.datasets if self.datasets[name] == dataset]
+            split = matching[0] if matching else None
+
+            if epoch is not None:
+                dataset.set_epoch(epoch)
+
+            setup_start = time.time()
+            # get the dataset's ordered indices
+            indices = dataset.ordered_indices()
+            logger.debug(f'[{split}] @batch_sampler order indices time: {get_time_gap(setup_start, time.time())}')
+
+            # drop (or reject) examples that exceed max_positions
+            if max_positions is not None:
+                stage_start = time.time()
+                indices = data_utils.filter_by_size(
+                    indices,
+                    dataset,
+                    max_positions,
+                    raise_exception=(not ignore_invalid_inputs),
+                )
+                logger.debug(f'[{split}] @batch_sampler filter_by_size time: {get_time_gap(stage_start, time.time())}')
+
+            # group the remaining indices into mini-batches under the size limits
+            stage_start = time.time()
+            batch_sampler = data_utils.batch_by_size(
+                indices,
+                dataset.num_tokens,
+                max_tokens=max_tokens,
+                max_sentences=max_sentences,
+            )
+            logger.debug(f'[{split}] @batch_sampler batch_by_size time: {get_time_gap(stage_start, time.time())}')
+            logger.debug(f'[{split}] per epoch batch_sampler set-up time: {get_time_gap(setup_start, time.time())}')
+            return batch_sampler
+
+        return construct_batch_sampler
+
+    # we need to override get_batch_iterator because we want to reset the epoch iterator each time
+    def get_batch_iterator(
+        self, dataset, max_tokens=None, max_sentences=None, max_positions=None,
+        ignore_invalid_inputs=False, required_batch_size_multiple=1,
+        seed=1, num_shards=1, shard_id=0, num_workers=0, epoch=1,
+    ):
+        """
+        Get an iterator that yields batches of data from the given dataset.
+
+        With ``args.sampling_method == 'RoundRobin'`` the parent
+        implementation builds the iterator, which is then cached per dataset.
+        For any other sampling method an ``EpochBatchIterator`` is created
+        with a batch-sampler factory instead of a fixed batch list; that
+        iterator is not cached here.
+
+        Args:
+            dataset (~fairseq.data.FairseqDataset): dataset to batch
+            max_tokens (int, optional): max number of tokens in each batch
+                (default: None).
+            max_sentences (int, optional): max number of sentences in each
+                batch (default: None).
+            max_positions (optional): max sentence length supported by the
+                model (default: None).
+            ignore_invalid_inputs (bool, optional): don't raise Exception for
+                sentences that are too long (default: False).
+            required_batch_size_multiple (int, optional): require batch size to
+                be a multiple of N (default: 1). Only forwarded on the
+                'RoundRobin' path.
+            seed (int, optional): seed for random number generator for
+                reproducibility (default: 1).
+            num_shards (int, optional): shard the data iterator into N
+                shards (default: 1).
+            shard_id (int, optional): which shard of the data iterator to
+                return (default: 0).
+            num_workers (int, optional): how many subprocesses to use for data
+                loading. 0 means the data will be loaded in the main process
+                (default: 0).
+            epoch (int, optional): the epoch to start the iterator from
+                (default: 1).
+        Returns:
+            ~fairseq.iterators.EpochBatchIterator: a batched iterator over the
+                given dataset split
+        """
+        assert isinstance(dataset, FairseqDataset)
+
+        # reuse a previously built iterator for this dataset, if any
+        iter_cache = self.dataset_to_epoch_iter
+        if dataset in iter_cache:
+            return iter_cache[dataset]
+
+        if self.args.sampling_method == 'RoundRobin':
+            round_robin_iter = super().get_batch_iterator(
+                dataset,
+                max_tokens=max_tokens,
+                max_sentences=max_sentences,
+                max_positions=max_positions,
+                ignore_invalid_inputs=ignore_invalid_inputs,
+                required_batch_size_multiple=required_batch_size_multiple,
+                seed=seed,
+                num_shards=num_shards,
+                shard_id=shard_id,
+                num_workers=num_workers,
+                epoch=epoch,
+            )
+            iter_cache[dataset] = round_robin_iter
+            return round_robin_iter
+
+        # hand the iterator a factory rather than precomputed batches
+        sampler_factory = self.create_batch_sampler_func(
+            max_positions, ignore_invalid_inputs,
+            max_tokens, max_sentences,
+        )
+        return iterators.EpochBatchIterator(
+            dataset=dataset,
+            collate_fn=dataset.collater,
+            batch_sampler=sampler_factory,
+            seed=seed,
+            num_shards=num_shards,
+            shard_id=shard_id,
+            num_workers=num_workers,
+            epoch=epoch,
+        )
diff --git a/fairseq/tokenizer.py b/fairseq/tokenizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..8c4d694aa08be0e73179290785c565419247618b
--- /dev/null
+++ b/fairseq/tokenizer.py
@@ -0,0 +1,14 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import re
+
+SPACE_NORMALIZER = re.compile(r"\s+")
+
+
+def tokenize_line(line):
+    """Split *line* into whitespace-separated tokens.
+
+    Runs of whitespace are collapsed to a single space and both ends are
+    trimmed before splitting.
+    """
+    normalized = SPACE_NORMALIZER.sub(" ", line).strip()
+    return normalized.split()
diff --git a/fairseq/trainer.py b/fairseq/trainer.py
new file mode 100644
index 0000000000000000000000000000000000000000..898edb6d6cfd8a8f4bd2e612b96009a1bf232e7d
--- /dev/null
+++ b/fairseq/trainer.py
@@ -0,0 +1,1006 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+Train a network across multiple GPUs.
+"""
+
+import contextlib
+from itertools import chain
+import logging
+import sys
+import time
+from typing import Any, Dict, List
+
+import torch
+
+from fairseq import checkpoint_utils, distributed_utils, models, optim, utils
+from fairseq.file_io import PathManager
+from fairseq.logging import meters, metrics
+from fairseq.nan_detector import NanDetector
+from fairseq.optim import lr_scheduler
+
+
+logger = logging.getLogger(__name__)
+
+
+class Trainer(object):
+    """Main class for data parallel training.
+
+    This class supports synchronous distributed data parallel training,
+    where multiple workers each have a full model replica and gradients
+    are accumulated across workers before each update. We use
+    :class:`~torch.nn.parallel.DistributedDataParallel` to handle
+    communication of the gradients across workers.
+    """
+
+    def __init__(self, args, task, model, criterion, quantizer=None):
+        """Set up trainer state and move the model and criterion to the device.
+
+        Args:
+            args: parsed training arguments; this method reads (among others)
+                ``tpu`` (optional), ``cpu``, ``fp16`` and ``bf16``.
+            task: task object, stored as ``self.task``.
+            model: model to train; cast to fp16/bf16 when requested and moved
+                to the selected device.
+            criterion: criterion; cast and moved the same way as the model.
+            quantizer (optional): if given, ``set_trainer(self)`` is called
+                on it (default: None).
+        """
+        self.args = args
+        self.task = task
+
+        # catalog shared parameters
+        shared_params = _catalog_shared_params(model)
+
+        # device choice: CUDA when available and not disabled by --cpu or TPU
+        # mode, otherwise the TPU device, otherwise the CPU
+        self.tpu = getattr(args, 'tpu', False)
+        self.cuda = torch.cuda.is_available() and not args.cpu and not self.tpu
+        if self.cuda:
+            self.device = torch.device('cuda')
+        elif self.tpu:
+            self.device = utils.get_tpu_device(args)
+        else:
+            self.device = torch.device('cpu')
+
+        # copy model and criterion to current device/dtype
+        self._criterion = criterion
+        self._model = model
+        if self.tpu:
+            import torch_xla.core.xla_model as xm
+            self._model = xm.send_cpu_data_to_device(self._model, self.device)
+        if args.fp16:
+            self._criterion = self._criterion.half()
+            self._model = self._model.half()
+        elif args.bf16:
+            self._criterion = self._criterion.to(dtype=torch.bfloat16)
+            self._model = self._model.to(dtype=torch.bfloat16)
+        self._criterion = self._criterion.to(device=self.device)
+        self._model = self._model.to(device=self.device)
+
+        # check that shared parameters are preserved after device transfer
+        # (every path after the first is re-pointed at the first path's object)
+        for shared_param in shared_params:
+            ref = _get_module_by_path(self._model, shared_param[0])
+            for path in shared_param[1:]:
+                logger.info(
+                    'detected shared parameter: {} <- {}'.format(shared_param[0], path)
+                )
+                _set_module_by_path(self._model, path, ref)
+
+        self._dummy_batch = "DUMMY"  # indicates we don't have a dummy batch at first
+        # optimizer, LR scheduler and wrapped modules start as None and are
+        # built on first access through the corresponding properties
+        self._lr_scheduler = None
+        self._num_updates = 0
+        self._num_xla_compiles = 0  # for TPUs
+        self._optim_history = None
+        self._optimizer = None
+        self._warn_once = set()
+        self._wrapped_criterion = None
+        self._wrapped_model = None
+
+        # TODO(myleott): support tpu
+        # one buffer slot per data-parallel worker; only for multi-worker CUDA runs
+        if self.cuda and self.data_parallel_world_size > 1:
+            self._grad_norm_buf = torch.cuda.DoubleTensor(self.data_parallel_world_size)
+        else:
+            self._grad_norm_buf = None
+
+        self.quantizer = quantizer
+        if self.quantizer is not None:
+            self.quantizer.set_trainer(self)
+
+        # get detailed cuda environment
+        if self.cuda:
+            self.cuda_env = utils.CudaEnvironment()
+            if self.data_parallel_world_size > 1:
+                self.cuda_env_arr = distributed_utils.all_gather_list(self.cuda_env)
+            else:
+                self.cuda_env_arr = [self.cuda_env]
+            # only rank 0 prints the gathered environment list
+            if self.data_parallel_rank == 0:
+                utils.CudaEnvironment.pretty_print_cuda_env_list(self.cuda_env_arr)
+        else:
+            self.cuda_env = None
+            self.cuda_env_arr = None
+
+        metrics.log_start_time("wall", priority=790, round=0)
+
+        # bookkeeping for training time; _previous_training_time is overwritten
+        # by load_checkpoint when the checkpoint carries that value
+        self._start_time = time.time()
+        self._previous_training_time = 0
+        self._cumulative_training_time = None
+
+    def reinitialize(self):
+        """Discard the cached optimizer, LR scheduler and wrapped modules.
+
+        Each of them is rebuilt on its next property access; typically
+        called after the model parameters change.
+        """
+        self._wrapped_model = None
+        self._wrapped_criterion = None
+        self._optimizer = None
+        self._lr_scheduler = None
+
+    @property
+    def data_parallel_world_size(self):
+        # Number of data-parallel workers, taken from args.distributed_world_size.
+        return self.args.distributed_world_size
+
+    @property
+    def data_parallel_process_group(self):
+        """Process group for data parallelism: ``('tpu', None)`` on TPU, else None."""
+        return ('tpu', None) if self.tpu else None
+
+    @property
+    def data_parallel_rank(self):
+        # Rank of this worker, taken from args.distributed_rank.
+        return self.args.distributed_rank
+
+    @property
+    def is_data_parallel_master(self):
+        # Delegates to distributed_utils.is_master; save_checkpoint uses this
+        # so that only one worker writes the checkpoint.
+        return distributed_utils.is_master(self.args)
+
+    @property
+    def criterion(self):
+        """Criterion to call during training, built lazily and cached.
+
+        It is wrapped in ``DistributedFairseqModel`` only when it has
+        parameters, more than one data-parallel worker is used, and neither
+        BMUF nor TPU mode is active; otherwise the raw criterion is used.
+        """
+        if self._wrapped_criterion is not None:
+            return self._wrapped_criterion
+
+        needs_wrapper = (
+            utils.has_parameters(self._criterion)
+            and self.data_parallel_world_size > 1
+            and not self.args.use_bmuf
+            and not self.tpu
+        )
+        if needs_wrapper:
+            self._wrapped_criterion = models.DistributedFairseqModel(
+                self.args, self._criterion,
+                process_group=self.data_parallel_process_group
+            )
+        else:
+            self._wrapped_criterion = self._criterion
+        return self._wrapped_criterion
+
+    @property
+    def model(self):
+        """Model to call during training, built lazily and cached.
+
+        It is wrapped in ``DistributedFairseqModel`` only when more than one
+        data-parallel worker is used and neither BMUF nor TPU mode is active;
+        otherwise the raw model is used.
+        """
+        if self._wrapped_model is not None:
+            return self._wrapped_model
+
+        needs_wrapper = (
+            self.data_parallel_world_size > 1
+            and not self.args.use_bmuf
+            and not self.tpu
+        )
+        if needs_wrapper:
+            self._wrapped_model = models.DistributedFairseqModel(
+                self.args, self._model,
+                process_group=self.data_parallel_process_group
+            )
+        else:
+            self._wrapped_model = self._model
+        return self._wrapped_model
+
+    @property
+    def optimizer(self):
+        """Optimizer, built on first access via ``_build_optimizer``."""
+        if self._optimizer is not None:
+            return self._optimizer
+        self._build_optimizer()
+        return self._optimizer
+
+    @property
+    def lr_scheduler(self):
+        """LR scheduler, built on first access together with the optimizer."""
+        if self._lr_scheduler is not None:
+            return self._lr_scheduler
+        # building the optimizer also initializes self._lr_scheduler
+        self._build_optimizer()
+        return self._lr_scheduler
+
+    def _build_optimizer(self):
+        """Create the optimizer and LR scheduler for all trainable parameters.
+
+        Parameters of both the model and the criterion that require gradients
+        are collected. With ``--fp16``/``--bf16`` an FP16 optimizer (memory
+        efficient variant if requested) is built, otherwise the regular one;
+        with ``--use-bmuf`` the result is wrapped in ``FairseqBMUF``.
+        """
+        params = [
+            p
+            for p in chain(self.model.parameters(), self.criterion.parameters())
+            if p.requires_grad
+        ]
+
+        half_precision = self.args.fp16 or self.args.bf16
+        if half_precision:
+            if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
+                logger.info(
+                    "NOTE: your device does NOT support faster training with --fp16, "
+                    "please switch to FP32 which is likely to be faster"
+                )
+            if self.args.memory_efficient_fp16 or self.args.memory_efficient_bf16:
+                optimizer_cls = optim.MemoryEfficientFP16Optimizer
+            else:
+                optimizer_cls = optim.FP16Optimizer
+            self._optimizer = optimizer_cls.build_optimizer(self.args, params)
+        else:
+            if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
+                logger.info("NOTE: your device may support faster training with --fp16")
+            self._optimizer = optim.build_optimizer(self.args, params)
+
+        if self.args.use_bmuf:
+            self._optimizer = optim.FairseqBMUF(self.args, self._optimizer)
+
+        # Build the scheduler right after the optimizer and step it once at
+        # update 0 so that the initial learning rate is set.
+        scheduler = lr_scheduler.build_lr_scheduler(self.args, self.optimizer)
+        self._lr_scheduler = scheduler
+        self._lr_scheduler.step_update(0)
+
+    def save_checkpoint(self, filename, extra_state):
+        """Write the full training state to ``filename`` (master worker only).
+
+        ``extra_state`` is updated in place with the current metrics state
+        and the cumulative training time before being saved.
+        """
+        if not self.is_data_parallel_master:
+            return  # only save one checkpoint
+
+        extra_state["metrics"] = metrics.state_dict()
+        extra_state["previous_training_time"] = self.cumulative_training_time()
+        checkpoint_utils.save_state(
+            filename,
+            self.args,
+            self.get_model().state_dict(),
+            self.get_criterion(),
+            self.optimizer,
+            self.lr_scheduler,
+            self.get_num_updates(),
+            self._optim_history,
+            extra_state,
+        )
+
+    def load_checkpoint(
+        self,
+        filename,
+        reset_optimizer=False,
+        reset_lr_scheduler=False,
+        optimizer_overrides=None,
+        reset_meters=False,
+    ):
+        """Load all training state from a checkpoint file.
+
+        Args:
+            filename: path of the checkpoint; if it is not an existing file
+                nothing is loaded.
+            reset_optimizer (bool, optional): skip restoring the optimizer
+                state, LR scheduler state and update count (default: False).
+            reset_lr_scheduler (bool, optional): skip restoring the LR
+                scheduler state (default: False).
+            optimizer_overrides (optional): forwarded to the optimizer's
+                ``load_state_dict`` (default: None).
+            reset_meters (bool, optional): skip restoring the saved metrics
+                (default: False).
+
+        Returns:
+            the checkpoint's ``extra_state``, or None if nothing was loaded.
+
+        Raises:
+            Exception: if the model or criterion parameters cannot be loaded.
+        """
+        extra_state, self._optim_history, last_optim_state = None, [], None
+
+        bexists = PathManager.isfile(filename)
+        if bexists:
+            state = checkpoint_utils.load_checkpoint_to_cpu(filename)
+
+            # load model parameters
+            try:
+                self.get_model().load_state_dict(
+                    state["model"], strict=True, args=self.args
+                )
+                if utils.has_parameters(self.get_criterion()):
+                    self.get_criterion().load_state_dict(
+                        state["criterion"], strict=True
+                    )
+            except Exception:
+                # any failure here is reported as an architecture mismatch
+                raise Exception(
+                    "Cannot load model parameters from checkpoint {}; "
+                    "please ensure that the architectures match.".format(filename)
+                )
+
+            extra_state = state["extra_state"]
+            self._optim_history = state["optimizer_history"]
+            last_optim_state = state.get("last_optimizer_state", None)
+
+        if last_optim_state is not None and not reset_optimizer:
+            # rebuild optimizer after loading model, since params may have changed
+            self._build_optimizer()
+
+            # only reload optimizer and lr_scheduler if they match
+            last_optim = self._optim_history[-1]
+            assert (
+                last_optim["criterion_name"] == self.get_criterion().__class__.__name__
+            ), "Criterion does not match; please reset the optimizer (--reset-optimizer)."
+            assert (
+                last_optim["optimizer_name"] == self.optimizer.__class__.__name__
+            ), "Optimizer does not match; please reset the optimizer (--reset-optimizer)."
+
+            if not reset_lr_scheduler:
+                self.lr_scheduler.load_state_dict(last_optim["lr_scheduler_state"])
+            self.optimizer.load_state_dict(last_optim_state, optimizer_overrides)
+
+            self.set_num_updates(last_optim["num_updates"])
+
+        if extra_state is not None:
+            epoch = extra_state["train_iterator"]["epoch"]
+            logger.info(
+                "loaded checkpoint {} (epoch {} @ {} updates)".format(
+                    filename, epoch, self.get_num_updates()
+                )
+            )
+
+            # resume the training-time accounting from the saved total
+            if "previous_training_time" in extra_state:
+                self._previous_training_time = extra_state["previous_training_time"]
+                self._start_time = time.time()
+
+            self.lr_step(epoch)
+
+            if "metrics" in extra_state and not reset_meters:
+                metrics.load_state_dict(extra_state["metrics"])
+
+                # reset TimeMeters, since their start times don't make sense anymore
+                for meter in metrics.get_meters("default"):
+                    if isinstance(meter, meters.TimeMeter):
+                        meter.reset()
+        else:
+            # NOTE(review): this branch is taken whenever extra_state is None,
+            # which also covers an existing checkpoint whose "extra_state"
+            # entry is None -- confirm the message is acceptable in that case
+            logger.info("no existing checkpoint found {}".format(filename))
+
+        return extra_state
+
+    def get_train_iterator(
+        self,
+        epoch,
+        combine=True,
+        load_dataset=True,
+        data_selector=None,
+        shard_batch_itr=True,
+    ):
+        """Return an EpochBatchIterator over the training set for a given epoch.
+
+        When ``load_dataset`` is true the training subset is (re)loaded via
+        the task first. When ``shard_batch_itr`` is false the iterator is
+        requested as a single shard instead of one shard per worker.
+        """
+        train_subset = self.args.train_subset
+        if load_dataset:
+            logger.info("loading train data for epoch {}".format(epoch))
+            self.task.load_dataset(
+                train_subset,
+                epoch=epoch,
+                combine=combine,
+                data_selector=data_selector,
+            )
+
+        train_set = self.task.dataset(train_subset)
+        max_positions = utils.resolve_max_positions(
+            self.task.max_positions(),
+            self.model.max_positions(),
+            self.args.max_tokens,
+        )
+        if shard_batch_itr:
+            num_shards = self.data_parallel_world_size
+            shard_id = self.data_parallel_rank
+        else:
+            num_shards, shard_id = 1, 0
+
+        return self.task.get_batch_iterator(
+            dataset=train_set,
+            max_tokens=self.args.max_tokens,
+            max_sentences=self.args.max_sentences,
+            max_positions=max_positions,
+            ignore_invalid_inputs=True,
+            required_batch_size_multiple=self.args.required_batch_size_multiple,
+            seed=self.args.seed,
+            num_shards=num_shards,
+            shard_id=shard_id,
+            num_workers=self.args.num_workers,
+            epoch=epoch
+        )
+
+    def get_valid_iterator(
+        self,
+        subset,
+    ):
+        """Return an EpochBatchIterator over the given validation subset.
+
+        The iterator is always sharded across the data-parallel workers and
+        uses the ``*_valid`` size limits from ``args``.
+        """
+        valid_set = self.task.dataset(subset)
+        max_positions = utils.resolve_max_positions(
+            self.task.max_positions(),
+            self.model.max_positions(),
+        )
+        return self.task.get_batch_iterator(
+            dataset=valid_set,
+            max_tokens=self.args.max_tokens_valid,
+            max_sentences=self.args.max_sentences_valid,
+            max_positions=max_positions,
+            ignore_invalid_inputs=self.args.skip_invalid_size_inputs_valid_test,
+            required_batch_size_multiple=self.args.required_batch_size_multiple,
+            seed=self.args.seed,
+            num_shards=self.data_parallel_world_size,
+            shard_id=self.data_parallel_rank,
+            num_workers=self.args.num_workers
+        )
+
+    def begin_epoch(self, epoch):
+        """Notify the quantizer (if any) and the task that ``epoch`` starts."""
+        quantizer = self.quantizer
+        if quantizer is not None:
+            quantizer.begin_epoch(epoch)
+
+        # the task receives the unwrapped model for its per-epoch setup
+        self.task.begin_epoch(epoch, self.get_model())
+
+    @metrics.aggregate("train")
+    def train_step(self, samples, raise_oom=False):
+        """Do forward, backward and parameter update.
+
+        Args:
+            samples (list): mini-batches to accumulate gradients over before
+                the single optimizer step; a sample that prepares to None is
+                replaced by the stored dummy batch and its gradients ignored.
+            raise_oom (bool, optional): re-raise out-of-memory errors from the
+                forward/backward pass instead of trying to recover
+                (default: False).
+
+        Returns:
+            the reduced logging output of this update, or None when the step
+            was abandoned (OOM with a single worker) or skipped because of a
+            gradient overflow. On TPU an empty dict is returned for updates
+            that fall between logging intervals.
+        """
+        if self._dummy_batch == "DUMMY":
+            self._dummy_batch = samples[0]
+
+        self._set_seed()
+        self.model.train()
+        self.criterion.train()
+        self.zero_grad()
+
+        metrics.log_start_time("train_wall", priority=800, round=0)
+
+        # forward and backward pass
+        logging_outputs, sample_size, ooms = [], 0, 0
+        for i, sample in enumerate(samples):
+            sample = self._prepare_sample(sample)
+            if sample is None:
+                # when sample is None, run forward/backward on a dummy batch
+                # and ignore the resulting gradients
+                sample = self._prepare_sample(self._dummy_batch)
+                is_dummy_batch = True
+            else:
+                is_dummy_batch = False
+
+            def maybe_no_sync():
+                """
+                Whenever *samples* contains more than one mini-batch, we
+                want to accumulate gradients locally and only call
+                all-reduce in the last backwards pass.
+                """
+                if (
+                    self.data_parallel_world_size > 1
+                    and hasattr(self.model, "no_sync")
+                    and i < len(samples) - 1
+                ):
+                    return self.model.no_sync()
+                else:
+                    return contextlib.ExitStack()  # dummy contextmanager
+
+            try:
+                with maybe_no_sync():
+                    # forward and backward
+                    loss, sample_size_i, logging_output = self.task.train_step(
+                        sample=sample,
+                        model=self.model,
+                        criterion=self.criterion,
+                        optimizer=self.optimizer,
+                        update_num=self.get_num_updates(),
+                        ignore_grad=is_dummy_batch,
+                    )
+                    del loss
+
+                logging_outputs.append(logging_output)
+                sample_size += sample_size_i
+
+                # emptying the CUDA cache after the first step can
+                # reduce the chance of OOM
+                if self.cuda and self.get_num_updates() == 0:
+                    torch.cuda.empty_cache()
+            except RuntimeError as e:
+                if "out of memory" in str(e):
+                    self._log_oom(e)
+                    if raise_oom:
+                        raise e
+                    logger.warning(
+                        "attempting to recover from OOM in forward/backward pass"
+                    )
+                    ooms += 1
+                    self.zero_grad()
+                    if self.cuda:
+                        torch.cuda.empty_cache()
+                    if self.args.distributed_world_size == 1:
+                        return None
+                else:
+                    raise e
+
+            if self.tpu and i < len(samples) - 1:
+                # tpu-comment: every XLA operation before marking step is
+                # appended to the IR graph, and processing too many batches
+                # before marking step can lead to OOM errors.
+                # To handle gradient accumulation use case, we explicitly
+                # mark step here for every forward pass without a backward pass
+                import torch_xla.core.xla_model as xm
+                xm.mark_step()
+
+        # a trailing dummy batch must not contribute to the sample size
+        if is_dummy_batch:
+            if torch.is_tensor(sample_size):
+                sample_size.zero_()
+            else:
+                sample_size *= 0.
+
+        if torch.is_tensor(sample_size):
+            sample_size = sample_size.float()
+        else:
+            sample_size = float(sample_size)
+
+        # gather logging outputs from all replicas
+        if self._sync_stats():
+            train_time = self._local_cumulative_training_time()
+            logging_outputs, (sample_size, ooms, total_train_time) = self._aggregate_logging_outputs(
+                logging_outputs, sample_size, ooms, train_time, ignore=is_dummy_batch,
+            )
+            self._cumulative_training_time = total_train_time / self.data_parallel_world_size
+
+        if hasattr(self.model, 'all_reduce'):
+            self.model.all_reduce()
+
+        overflow = False
+        try:
+            if self.tpu and self.data_parallel_world_size > 1:
+                import torch_xla.core.xla_model as xm
+                gradients = xm._fetch_gradients(self.optimizer.optimizer)
+                xm.all_reduce('sum', gradients, scale=1.0 / self.data_parallel_world_size)
+
+            with torch.autograd.profiler.record_function("multiply-grads"):
+                # multiply gradients by (# GPUs / sample_size) since DDP
+                # already normalizes by the number of GPUs. Thus we get
+                # (sum_of_gradients / sample_size).
+                if not self.args.use_bmuf:
+                    self.optimizer.multiply_grads(self.data_parallel_world_size / sample_size)
+                elif sample_size > 0:  # BMUF needs to check sample size
+                    num = self.data_parallel_world_size if self._sync_stats() else 1
+                    self.optimizer.multiply_grads(num / sample_size)
+
+            with torch.autograd.profiler.record_function("clip-grads"):
+                # clip grads
+                grad_norm = self.clip_grad_norm(self.args.clip_norm)
+
+            # check that grad norms are consistent across workers
+            if (
+                not self.args.use_bmuf
+                and self.args.distributed_wrapper != 'SlowMo'
+                and not self.tpu
+            ):
+                self._check_grad_norms(grad_norm)
+
+            with torch.autograd.profiler.record_function("optimizer"):
+                # take an optimization step
+                self.optimizer.step()
+        except FloatingPointError:
+            # re-run the forward and backward pass with hooks attached to print
+            # out where it fails
+            with NanDetector(self.model):
+                self.task.train_step(
+                    sample, self.model, self.criterion, self.optimizer, self.get_num_updates(),
+                    ignore_grad=False
+                )
+            raise
+        except OverflowError as e:
+            overflow = True
+            logger.info("NOTE: overflow detected, " + str(e))
+            # create the placeholder on the trainer's device so this handler
+            # also works for CPU and TPU runs
+            grad_norm = torch.tensor(0., device=self.device)
+            self.zero_grad()
+        except RuntimeError as e:
+            if "out of memory" in str(e):
+                self._log_oom(e)
+                logger.error("OOM during optimization, irrecoverable")
+            raise e
+
+        # Some distributed wrappers (e.g., SlowMo) need access to the optimizer after the step
+        if hasattr(self.model, 'perform_additional_optimizer_actions'):
+            if hasattr(self.optimizer, 'fp32_params'):
+                self.model.perform_additional_optimizer_actions(self.optimizer.optimizer, self.optimizer.fp32_params)
+            else:
+                self.model.perform_additional_optimizer_actions(self.optimizer.optimizer)
+
+        # stays None when the update is skipped because of an overflow, so
+        # the caller never receives a stale per-batch logging dict
+        logging_output = None
+        if not overflow or self.args.distributed_wrapper == 'SlowMo':
+            self.set_num_updates(self.get_num_updates() + 1)
+
+            if self.tpu:
+                # mark step on TPUs
+                import torch_xla.core.xla_model as xm
+                xm.mark_step()
+
+                # only log stats every log_interval steps
+                # this causes wps to be misreported when log_interval > 1
+                logging_output = {}
+                if self.get_num_updates() % self.args.log_interval == 0:
+                    logging_output = self._reduce_and_log_stats(
+                        logging_outputs, sample_size, grad_norm,
+                    )
+
+                # log whenever there's an XLA compilation, since these
+                # slow down training and may indicate opportunities for
+                # optimization
+                self._check_xla_compilation()
+            else:
+                # log stats
+                logging_output = self._reduce_and_log_stats(
+                    logging_outputs, sample_size, grad_norm,
+                )
+
+                # clear CUDA cache to reduce memory fragmentation
+                if (
+                    self.cuda
+                    and self.args.empty_cache_freq > 0
+                    and (
+                        (self.get_num_updates() + self.args.empty_cache_freq - 1)
+                        % self.args.empty_cache_freq
+                    ) == 0
+                ):
+                    torch.cuda.empty_cache()
+
+        if self.args.fp16:
+            metrics.log_scalar("loss_scale", self.optimizer.scaler.loss_scale, priority=700, round=0)
+
+        metrics.log_stop_time("train_wall")
+
+        return logging_output
+
+    @metrics.aggregate("valid")
+    def valid_step(self, sample, raise_oom=False):
+        """Do forward pass in evaluation mode.
+
+        Args:
+            sample: validation batch; if it prepares to None the stored dummy
+                batch is evaluated instead and its sample size zeroed.
+            raise_oom (bool, optional): re-raise an out-of-memory error
+                instead of retrying the batch once (default: False).
+
+        Returns:
+            the reduced logging output for this batch.
+        """
+        if self._dummy_batch == "DUMMY":
+            self._dummy_batch = sample
+        if self.tpu:
+            import torch_xla.core.xla_model as xm
+            xm.rendezvous('valid_step')  # wait for all workers
+            xm.mark_step()
+
+        with torch.no_grad():
+            self.model.eval()
+            self.criterion.eval()
+
+            sample = self._prepare_sample(sample)
+            if sample is None:
+                sample = self._prepare_sample(self._dummy_batch)
+                is_dummy_batch = True
+            else:
+                is_dummy_batch = False
+
+            try:
+                _loss, sample_size, logging_output = self.task.valid_step(
+                    sample, self.model, self.criterion
+                )
+            except RuntimeError as e:
+                if "out of memory" in str(e):
+                    self._log_oom(e)
+                    if not raise_oom:
+                        logger.warning(
+                            "ran out of memory in validation step, retrying batch"
+                        )
+                        for p in self.model.parameters():
+                            if p.grad is not None:
+                                p.grad = None  # free some memory
+                        if self.cuda:
+                            torch.cuda.empty_cache()
+                        # retry once; a second OOM is raised to the caller
+                        # NOTE(review): the retry receives the already
+                        # prepared sample (possibly the dummy batch), so
+                        # is_dummy_batch is recomputed from it -- confirm
+                        return self.valid_step(sample, raise_oom=True)
+                raise e
+
+            logging_outputs = [logging_output]
+            # a dummy batch must not contribute to the sample size
+            if is_dummy_batch:
+                if torch.is_tensor(sample_size):
+                    sample_size.zero_()
+                else:
+                    sample_size *= 0.
+
+        # gather logging outputs from all replicas
+        if self.data_parallel_world_size > 1:
+            logging_outputs, (sample_size, ) = self._aggregate_logging_outputs(
+                logging_outputs, sample_size, ignore=is_dummy_batch,
+            )
+
+        # log validation stats
+        logging_output = self._reduce_and_log_stats(logging_outputs, sample_size)
+
+        return logging_output
+
+    def zero_grad(self):
+        self.optimizer.zero_grad()  # clear gradients via the trainer's optimizer
+
+    def lr_step(self, epoch, val_loss=None):
+        """Step the LR scheduler at the end of *epoch*; return the resulting LR."""
+        self.lr_scheduler.step(epoch, val_loss)
+        # prefer updating the LR based on the number of steps
+        return self.lr_step_update()
+
+    def lr_step_update(self):
+        """Update the learning rate after each update, log it as "lr" and return it."""
+        new_lr = self.lr_scheduler.step_update(self.get_num_updates())
+        metrics.log_scalar("lr", new_lr, weight=0, priority=300)
+        return new_lr
+
+    def get_lr(self):
+        """Get the current learning rate, as reported by the optimizer."""
+        return self.optimizer.get_lr()
+
+    def get_model(self):
+        """Get the (non-wrapped) model instance, i.e. ``self._model``."""
+        return self._model
+
+    def get_criterion(self):
+        """Get the (non-wrapped) criterion instance, i.e. ``self._criterion``."""
+        return self._criterion
+
+    def get_meter(self, name):
+        """[deprecated] Get a specific meter by name; returns None if unknown."""
+        from fairseq import meters
+
+        if 'get_meter' not in self._warn_once:
+            self._warn_once.add('get_meter')
+            utils.deprecation_warning(
+                'Trainer.get_meter is deprecated. Please use fairseq.metrics instead.'
+            )
+
+        train_meters = metrics.get_meters("train")
+        if train_meters is None:
+            train_meters = {}
+
+        if name == "train_loss" and "loss" in train_meters:
+            return train_meters["loss"]
+        if name == "train_nll_loss":
+            # support for legacy train.py, which assumed this meter is
+            # always initialized
+            meter = train_meters.get("nll_loss", None)
+            return meter or meters.AverageMeter()
+        if name == "wall":
+            # support for legacy train.py, which assumed this meter is
+            # always initialized
+            meter = metrics.get_meter("default", "wall")
+            return meter or meters.TimeMeter()
+        if name == "wps":
+            meter = metrics.get_meter("train", "wps")
+            return meter or meters.TimeMeter()
+        if name in {"valid_loss", "valid_nll_loss"}:
+            # support for legacy train.py, which assumed these meters
+            # are always initialized
+            key = name[len("valid_"):]
+            meter = metrics.get_meter("valid", key)
+            return meter or meters.AverageMeter()
+        if name == "oom":
+            return meters.AverageMeter()
+        if name in train_meters:
+            return train_meters[name]
+        return None
+
+    def get_num_updates(self):
+        """Get the number of parameter updates."""
+        return self._num_updates
+
+    def set_num_updates(self, num_updates):
+        """Set the number of parameter updates, then refresh the LR, quantizer and log."""
+        self._num_updates = num_updates
+        self.lr_step_update()
+        if self.quantizer:
+            self.quantizer.step_update(self._num_updates)
+        metrics.log_scalar("num_updates", self._num_updates, weight=0, priority=200)
+
+    def clip_grad_norm(self, clip_norm):
+        return self.optimizer.clip_grad_norm(clip_norm, aggregate_norm_fn=None)  # forwards to the optimizer
+
+    def cumulative_training_time(self):
+        """Return ``_cumulative_training_time`` when set, else the locally measured time."""
+        if self._cumulative_training_time is not None:
+            return self._cumulative_training_time
+        # single GPU
+        return self._local_cumulative_training_time()
+
+    def _local_cumulative_training_time(self):
+        """Seconds elapsed since ``_start_time`` plus ``_previous_training_time``."""
+        return time.time() - self._start_time + self._previous_training_time
+
+    def _prepare_sample(self, sample):
+        """Move *sample* to CUDA when enabled and cast float32 tensors for fp16/bf16.
+
+        Returns None for a None or empty sample and raises when given the
+        uninitialized "DUMMY" placeholder.
+        """
+        if sample == "DUMMY":
+            raise Exception(
+                "Trying to use an uninitialized 'dummy' batch. This usually indicates "
+                "that the total number of batches is smaller than the number of "
+                "participating GPUs. Try reducing the batch size or using fewer GPUs."
+            )
+
+        if sample is None or len(sample) == 0:
+            return None
+
+        if self.cuda:
+            sample = utils.move_to_cuda(sample)
+
+        def cast_float32_to(dtype):
+            def _cast(t):
+                return t.to(dtype=dtype) if t.dtype is torch.float32 else t
+            return _cast
+
+        if self.args.fp16:
+            sample = utils.apply_to_sample(cast_float32_to(torch.float16), sample)
+
+        if self.args.bf16:
+            sample = utils.apply_to_sample(cast_float32_to(torch.bfloat16), sample)
+
+        return sample
+
+    def _set_seed(self):
+        """Seed torch from args.seed plus the current update number."""
+        # Offsetting by the update number gives reproducible results when
+        # resuming from checkpoints
+        utils.set_torch_seed(self.args.seed + self.get_num_updates())
+
+    def _sync_stats(self):
+        """Return True if it's using multiple GPUs and DDP, or multiple GPUs with
+        BMUF and it's a bmuf sync with warmup iterations completed before."""
+        if self.data_parallel_world_size == 1:
+            return False
+        if not self.args.use_bmuf:
+            return True
+        next_update = self.get_num_updates() + 1
+        return (
+            next_update % self.args.global_sync_iter == 0
+            and next_update > self.args.warmup_iterations
+        )
+
+    def _log_oom(self, exc):
+        """Log an OOM warning, plus a per-device CUDA memory summary when available."""
+        logger.warning("OOM: Ran out of memory with exception: {}".format(exc))
+        if torch.cuda.is_available() and hasattr(torch.cuda, "memory_summary"):
+            for dev in range(torch.cuda.device_count()):
+                logger.warning(torch.cuda.memory_summary(device=dev))
+        sys.stderr.flush()
+
+    def _aggregate_logging_outputs(
+        self,
+        logging_outputs: List[Dict[str, Any]],
+        *extra_stats_to_sum,
+        ignore=False,
+    ):
+        """Sync logging outputs across workers.
+
+        Uses the fast summing path when the task reports that the criterion's
+        logging outputs can be summed, and the all_gather_list path otherwise.
+        """
+        can_sum = self.task.__class__.logging_outputs_can_be_summed(self.get_criterion())
+        sync_fn = self._fast_stat_sync_sum if can_sum else self._all_gather_list_sync
+        return sync_fn(logging_outputs, *extra_stats_to_sum, ignore=ignore)
+
+    def _all_gather_list_sync(
+        self,
+        logging_outputs: List[Dict[str, Any]],
+        *extra_stats_to_sum,
+        ignore=False,
+    ):
+        """
+        Sync logging outputs across workers. all_gather_list_sync is
+        suitable when logging outputs are complex types.
+
+        Returns the concatenated logging outputs of all workers and the
+        per-stat sums of *extra_stats_to_sum*.
+        """
+        if self.tpu:
+            raise NotImplementedError
+        payload = [[] if ignore else logging_outputs] + list(extra_stats_to_sum)
+        gathered = distributed_utils.all_gather_list(
+            payload,
+            max_size=getattr(self.args, 'all_gather_list_size', 16384),
+            group=self.data_parallel_process_group,
+        )
+        columns = list(zip(*gathered))
+        logging_outputs = list(chain.from_iterable(columns[0]))
+        extra_stats_to_sum = [sum(s) for s in columns[1:]]
+        return logging_outputs, extra_stats_to_sum
+
+    def _fast_stat_sync_sum(
+        self,
+        logging_outputs: List[Dict[str, Any]],
+        *extra_stats_to_sum,
+        ignore=False,
+    ):
+        """
+        Sync logging outputs across workers. fast_stat_sync_sum is
+        faster than all_gather_list_sync, but is only suitable when
+        logging outputs are scalars and can be summed. Note that
+        *logging_outputs* cannot contain any nested dicts/lists.
+        """
+        # extra stats are inserted first, followed by the logging output keys
+        data = {
+            'extra_stats_' + str(i): stat for i, stat in enumerate(extra_stats_to_sum)
+        }
+        log_keys = list(logging_outputs[0].keys()) if len(logging_outputs) > 0 else None
+        for k in log_keys or []:
+            if ignore:
+                # an ignored batch contributes a zero of the matching kind
+                v = logging_outputs[0][k]
+                v = torch.zeros_like(v) if torch.is_tensor(v) else 0
+            else:
+                v = sum(log[k] for log in logging_outputs if k in log)
+            data['logging_outputs_' + k] = v
+
+        data = distributed_utils.all_reduce_dict(
+            data,
+            device=self.device,
+            group=self.data_parallel_process_group
+        )
+
+        # read the reduced values back under the same keys
+        extra_stats_to_sum = [
+            data['extra_stats_' + str(i)] for i in range(len(extra_stats_to_sum))
+        ]
+        if log_keys is None:
+            logging_outputs = []
+        else:
+            logging_outputs = [{k: data['logging_outputs_' + k] for k in log_keys}]
+        return logging_outputs, extra_stats_to_sum
+
+    def _check_grad_norms(self, grad_norm):
+        """Check that grad norms are consistent across workers."""
+        buf = self._grad_norm_buf
+        if buf is None:
+            return
+        # only this worker's slot is non-zero going into the all-reduce
+        buf.zero_()
+        buf[self.data_parallel_rank] = grad_norm
+        distributed_utils.all_reduce(buf, group=self.data_parallel_process_group)
+
+        # consistent when the largest deviation from slot 0 is below 1e-6, relatively
+        max_abs_diff = torch.max(torch.abs(buf - buf[0]))
+        if (max_abs_diff / (buf[0] + 1e-6) < 1e-6).all():
+            return
+
+        pretty_detail = "\n".join(
+            "rank {:3d} = {:.8f}".format(r, n)
+            for r, n in enumerate(buf.tolist())
+        )
+        error_detail = "grad_norm across the workers:\n{}\n".format(pretty_detail)
+        raise RuntimeError(
+            "Fatal error: gradients are inconsistent between workers. "
+            "Try --ddp-backend=no_c10d. "
+            "Or are you mixing up different generation of GPUs in training?"
+            + "\n"
+            + "-" * 80
+            + "\n{}\n".format(error_detail)
+            + "-" * 80
+        )
+
+    def _reduce_and_log_stats(self, logging_outputs, sample_size, grad_norm=None):
+        """Log update-level stats and reduce *logging_outputs* into one dict.
+
+        Returns the aggregator's smoothed values (an empty dict on TPU).
+        """
+        if grad_norm is not None:
+            metrics.log_speed("ups", 1., priority=100, round=2)
+            metrics.log_scalar("gnorm", grad_norm, priority=400, round=3)
+            if self.args.clip_norm > 0:
+                # 100 when the norm exceeded clip_norm on this update, else 0
+                exceeded = grad_norm > self.args.clip_norm
+                clip_value = torch.where(
+                    exceeded, grad_norm.new_tensor(100), grad_norm.new_tensor(0)
+                )
+                metrics.log_scalar("clip", clip_value, priority=500, round=1)
+
+        with metrics.aggregate() as agg:
+            if logging_outputs is not None:
+                self.task.reduce_metrics(logging_outputs, self.get_criterion())
+                del logging_outputs
+
+            # extra warning for criterions that don't properly log a loss value
+            if "loss" not in agg:
+                if "loss" not in self._warn_once:
+                    self._warn_once.add("loss")
+                    logger.warning(
+                        "Criterion.reduce_metrics did not log a 'loss' value, "
+                        "which may break some functionality"
+                    )
+                metrics.log_scalar("loss", -1)
+
+            # support legacy interface
+            if self.tpu:
+                return {}
+            logging_output = agg.get_smoothed_values()
+            logging_output["sample_size"] = sample_size
+            # these keys are dropped from the returned dict
+            for key_to_delete in ["ppl", "wps", "wpb", "bsz"]:
+                if key_to_delete in logging_output:
+                    del logging_output[key_to_delete]
+            return logging_output
+
+    def _check_xla_compilation(self, message=None):
+        """Log a note when the first "CompileTime" metric entry exceeds the last seen value."""
+        import torch_xla.debug.metrics as met
+        compile_stats = met.metric_data("CompileTime")
+        if compile_stats is None:
+            return
+        num_xla_compiles = compile_stats[0]
+        if num_xla_compiles > self._num_xla_compiles:
+            if message is None:
+                message = ("too many of these can lead to slow training, "
+                           "but we expect a few in the beginning")
+            # use the module-level logger, like the other log calls in this class
+            logger.info("NOTE: XLA compilation detected; {}".format(message))
+        self._num_xla_compiles = num_xla_compiles
+
+
+def _catalog_shared_params(module, memo=None, prefix=''):
+    """Return groups of dotted parameter paths that refer to the same parameter.
+
+    Recursive calls (with *memo* given) only populate *memo* and return None.
+    """
+    first_call = memo is None
+    if first_call:
+        memo = {}
+    dotted = prefix + '.' if prefix else ''
+
+    # a parameter object reached under several paths is a shared parameter
+    for name, param in module._parameters.items():
+        memo.setdefault(param, []).append(dotted + name)
+    for name, m in module._modules.items():
+        if m is not None:
+            _catalog_shared_params(m, memo, dotted + name)
+    if first_call:
+        return [x for x in memo.values() if len(x) > 1]
+
+
+def _get_module_by_path(module, path):
+    """Resolve dotted *path* by chained getattr starting from *module*."""
+    for attr in path.split('.'):
+        module = getattr(module, attr)
+    return module
+
+
+def _set_module_by_path(module, path, value):
+    *parents, leaf = path.split('.')  # walk to the owner, then set the final attribute
+    for attr in parents:
+        module = getattr(module, attr)
+    setattr(module, leaf, value)
diff --git a/fairseq/utils.py b/fairseq/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..f68860330c71ba9e32735afb18f8888c66d41004
--- /dev/null
+++ b/fairseq/utils.py
@@ -0,0 +1,589 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import contextlib
+import copy
+import importlib.util
+import logging
+import math
+import os
+import sys
+import warnings
+from collections import defaultdict
+from itertools import accumulate
+from typing import Callable, Dict, List, Optional
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from fairseq.logging.meters import safe_round
+from fairseq.modules import gelu, gelu_accurate
+from fairseq.modules.multihead_attention import MultiheadAttention
+from torch import Tensor
+
+try:
+    from amp_C import multi_tensor_l2norm
+    multi_tensor_l2norm_available = True
+except ImportError:
+    multi_tensor_l2norm_available = False
+
+
+logger = logging.getLogger(__name__)
+
+
+MANIFOLD_PATH_SEP = "|"
+
+
+def split_paths(paths: str) -> List[str]:
+    return paths.split(MANIFOLD_PATH_SEP if "://" in paths else os.pathsep)  # "://" paths are "|"-separated
+
+
+def load_ensemble_for_inference(filenames, task, model_arg_overrides=None):
+    """[deprecated] Forward to checkpoint_utils.load_model_ensemble."""
+    from fairseq import checkpoint_utils
+
+    deprecation_warning(
+        "utils.load_ensemble_for_inference is deprecated. "
+        "Please use checkpoint_utils.load_model_ensemble instead."
+    )
+    overrides = dict(arg_overrides=model_arg_overrides, task=task)
+    return checkpoint_utils.load_model_ensemble(filenames, **overrides)
+
+
+def apply_to_sample(f, sample):
+    """Apply *f* to every tensor nested inside *sample* (dicts, lists, tuples, sets)."""
+    if hasattr(sample, '__len__') and len(sample) == 0:
+        return {}
+
+    def _apply(x):
+        if torch.is_tensor(x):
+            return f(x)
+        if isinstance(x, dict):
+            return {key: _apply(value) for key, value in x.items()}
+        if isinstance(x, list):
+            return [_apply(elem) for elem in x]
+        if isinstance(x, tuple):
+            return tuple(_apply(elem) for elem in x)
+        if isinstance(x, set):
+            return {_apply(elem) for elem in x}
+        return x
+
+    return _apply(sample)
+
+
+def move_to_cuda(sample):
+    """Return *sample* with ``.cuda()`` applied to every nested tensor."""
+
+    # non-tensor leaves are passed through unchanged by apply_to_sample
+    return apply_to_sample(lambda tensor: tensor.cuda(), sample)
+
+
+def move_to_cpu(sample):
+    """Return *sample* with every nested tensor on the CPU; half types become float32."""
+    def _move_to_cpu(tensor):
+        # PyTorch has poor support for half tensors (float16) on CPU,
+        # so any such tensors are converted to float32 first.
+        is_half = tensor.dtype in {torch.bfloat16, torch.float16}
+        return (tensor.to(dtype=torch.float32) if is_half else tensor).cpu()
+
+    return apply_to_sample(_move_to_cpu, sample)
+
+
+def get_incremental_state(
+    module: MultiheadAttention,
+    incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]],
+    key: str,
+) -> Optional[Dict[str, Optional[Tensor]]]:
+    """Helper for getting incremental state for an nn.Module (delegates to *module*)."""
+    return module.get_incremental_state(incremental_state, key)
+
+
+def set_incremental_state(
+    module: MultiheadAttention,
+    incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]],
+    key: str,
+    value: Dict[str, Optional[Tensor]],
+) -> Optional[Dict[str, Dict[str, Optional[Tensor]]]]:
+    """Store *value* under *key* via *module* and return the resulting state."""
+    if incremental_state is None:
+        return None
+    # the module may hand back a replacement state; otherwise keep the input one
+    result = module.set_incremental_state(incremental_state, key, value)
+    return incremental_state if result is None else result
+
+
+def load_align_dict(replace_unk):
+    """Load the alignment dictionary used for unknown word replacement.
+
+    Returns None when *replace_unk* is None, the mapping parsed from the file
+    at *replace_unk* when it is a non-empty path (blank lines are skipped), and
+    an empty dict otherwise (unknown words are then copied from the source).
+    """
+    if replace_unk is None:
+        return None
+    align_dict = {}
+    if isinstance(replace_unk, str) and len(replace_unk) > 0:
+        with open(replace_unk, "r") as f:
+            for cols in filter(None, map(str.split, f)):
+                align_dict[cols[0]] = cols[1]
+    return align_dict
+
+
+def print_embed_overlap(embed_dict, vocab_dict):
+    """Log how many vocabulary symbols have an entry in *embed_dict*."""
+    shared = set(embed_dict.keys()) & set(vocab_dict.symbols)
+    summary = "found {}/{} types in embedding file".format(len(shared), len(vocab_dict))
+    logger.info(summary)
+
+
+def parse_embedding(embed_path):
+    """Parse embedding text file into a dictionary of word and embedding tensors.
+
+    The first line (vocabulary size and dimension) is always skipped. The following
+    lines should contain word and embedding separated by whitespace; blank lines
+    are ignored.
+
+    Example:
+        2 5
+        the -0.0230 -0.0264  0.0287  0.0171  0.1403
+        at -0.0395 -0.1286  0.0275  0.0254 -0.0932
+    """
+    embed_dict = {}
+    with open(embed_path) as f_embed:
+        next(f_embed, None)  # skip header; tolerate an empty file
+        for line in f_embed:
+            pieces = line.split()  # any whitespace run, so aligned columns parse
+            if pieces:
+                embed_dict[pieces[0]] = torch.Tensor([float(w) for w in pieces[1:]])
+    return embed_dict
+
+
+def load_embedding(embed_dict, vocab, embedding):
+    """Copy vectors from *embed_dict* into the rows of *embedding* for known tokens."""
+    for idx in range(len(vocab)):
+        if vocab[idx] in embed_dict:
+            embedding.weight.data[idx] = embed_dict[vocab[idx]]
+    return embedding
+
+
+def replace_unk(hypo_str, src_str, alignment, align_dict, unk):
+    """Replace each *unk* token in *hypo_str* using its aligned source token."""
+    from fairseq import tokenizer
+
+    # Tokens are strings here
+    hypo_tokens = tokenizer.tokenize_line(hypo_str)
+    # TODO: Very rare cases where the replacement is '<eos>' should be handled gracefully
+    src_tokens = tokenizer.tokenize_line(src_str) + ["<eos>"]
+    for i in (pos for pos, ht in enumerate(hypo_tokens) if ht == unk):
+        # Either take the corresponding value in the aligned dictionary or just copy the original value.
+        src_token = src_tokens[alignment[i]]
+        hypo_tokens[i] = align_dict.get(src_token, src_token)
+    return " ".join(hypo_tokens)
+
+
+def post_process_prediction(
+    hypo_tokens, src_str, alignment, align_dict, tgt_dict, remove_bpe=None, extra_symbols_to_ignore=None
+):
+    """Render *hypo_tokens* as text, optionally replacing unks and re-encoding the result."""
+    hypo_str = tgt_dict.string(hypo_tokens, remove_bpe, extra_symbols_to_ignore=extra_symbols_to_ignore)
+    use_align = align_dict is not None
+    if use_align:
+        hypo_str = replace_unk(hypo_str, src_str, alignment, align_dict, tgt_dict.unk_string())
+    if use_align or remove_bpe is not None:
+        # Convert back to tokens for evaluating with unk replacement or without BPE
+        # Note that the dictionary can be modified inside the method.
+        hypo_tokens = tgt_dict.encode_line(hypo_str, add_if_not_exist=True)
+    return hypo_tokens, hypo_str, alignment
+
+
+def make_positions(tensor, padding_idx: int, onnx_trace: bool = False):
+    """Replace non-padding symbols with their position numbers.
+
+    Position numbers begin at padding_idx+1. Padding symbols are ignored.
+    """
+    # The casts below are balanced to work with both ONNX export and XLA: XLA
+    # prefers ints, cumsum defaults to output longs, and ONNX doesn't know how
+    # to handle the dtype kwarg in cumsum.
+    non_pad = tensor.ne(padding_idx).int()
+    running_count = torch.cumsum(non_pad, dim=1).type_as(non_pad)
+    return (running_count * non_pad).long() + padding_idx
+
+
+def strip_pad(tensor, pad):
+    return tensor[tensor.ne(pad)]  # boolean-mask indexing: keeps the entries != pad
+
+
+def buffered_arange(max):
+    if not hasattr(buffered_arange, "buf"):
+        buffered_arange.buf = torch.LongTensor()  # cache lives on the function object
+    if max > buffered_arange.buf.numel():  # only ever grows the cached range
+        buffered_arange.buf.resize_(max)
+        torch.arange(max, out=buffered_arange.buf)
+    return buffered_arange.buf[:max]  # a slice of the shared buffer
+
+
+def convert_padding_direction(
+    src_tokens, padding_idx, right_to_left: bool = False, left_to_right: bool = False
+):
+    """Shift the padding of each row of *src_tokens* to the opposite side.
+
+    Exactly one of *right_to_left* / *left_to_right* must be set. The input is
+    returned unchanged when it has no padding or is already padded on the
+    requested side.
+    """
+    assert right_to_left ^ left_to_right
+    pad_mask = src_tokens.eq(padding_idx)
+    if not pad_mask.any():
+        # no padding, return early
+        return src_tokens
+    if left_to_right and not pad_mask[:, 0].any():
+        # already right padded
+        return src_tokens
+    if right_to_left and not pad_mask[:, -1].any():
+        # already left padded
+        return src_tokens
+    max_len = src_tokens.size(1)
+    positions = torch.arange(max_len).type_as(src_tokens).expand_as(src_tokens)
+    num_pads = pad_mask.long().sum(dim=1, keepdim=True)
+    shift = -num_pads if right_to_left else num_pads
+    return src_tokens.gather(1, torch.remainder(positions + shift, max_len))
+
+
+def item(tensor):
+    """Return ``tensor.item()`` if available, else ``tensor[0]``, else *tensor* itself."""
+    if hasattr(tensor, "item"):
+        return tensor.item()
+    # indexable containers yield their first element; anything else passes through
+    return tensor[0] if hasattr(tensor, "__getitem__") else tensor
+
+
+def multi_tensor_total_norm(grads, chunk_size=2048*32) -> torch.Tensor:
+    """Return the L2 norm over all of *grads*.
+
+    Gradients are grouped per device; CUDA groups go through the fused
+    ``multi_tensor_l2norm`` kernel, other devices use ``torch.norm`` per tensor.
+    """
+    per_device_grads = defaultdict(list)
+    for grad in grads:
+        per_device_grads[grad.device].append(grad)
+
+    norms = []
+    for device, cur_device_grads in per_device_grads.items():
+        if device.type == "cuda":
+            # TODO(msb) return has_inf
+            has_inf = torch.zeros((1, 1), dtype=torch.int, device=device)
+            with torch.cuda.device(device):
+                norm = multi_tensor_l2norm(chunk_size, has_inf, [cur_device_grads], False)
+                norms.append(norm[0])
+        else:
+            norms += [torch.norm(g, p=2, dtype=torch.float32) for g in cur_device_grads]
+    total_norm = torch.norm(torch.stack(norms))
+    return total_norm
+
+
+def clip_grad_norm_(params, max_norm, aggregate_norm_fn=None) -> torch.Tensor:
+    """Scale the gradients of *params* in place so their total L2 norm is at most *max_norm*.
+
+    Returns the total norm measured before clipping (after *aggregate_norm_fn*, if
+    given). No scaling is applied when *max_norm* is not positive.
+    """
+    if isinstance(params, torch.Tensor):
+        params = [params]
+    params = list(params)
+    grads = [p.grad.detach() for p in params if p.grad is not None]
+    if not grads:
+        return params[0].new_tensor(0.) if params else torch.tensor(0.)
+
+    if len(grads) == 1:
+        total_norm = torch.norm(grads[0], p=2, dtype=torch.float32)
+    elif multi_tensor_l2norm_available:
+        total_norm = multi_tensor_total_norm(grads)
+    else:
+        if torch.cuda.is_available():
+            warnings.warn(
+                "amp_C fused kernels unavailable, disabling multi_tensor_l2norm; "
+                "you may get better performance by installing NVIDIA's apex library"
+            )
+        total_norm = torch.norm(
+            torch.stack([torch.norm(g, p=2, dtype=torch.float32) for g in grads])
+        )
+
+    if aggregate_norm_fn is not None:
+        total_norm = aggregate_norm_fn(total_norm)
+
+    if max_norm > 0:
+        clip_coef = (float(max_norm) / (total_norm + 1e-6)).clamp_(max=1)
+        for g in grads:
+            g.mul_(clip_coef)
+    return total_norm
+
+
+def fill_with_neg_inf(t):
+    """FP16-compatible -inf fill: fills a float32 view/copy of *t*, then casts back to t's type."""
+    return t.float().fill_(float("-inf")).type_as(t)
+
+
+def _match_types(arg1, arg2):
+    """Convert the numerical argument to the same type as the other argument"""
+
+    def upgrade(arg_number, arg_structure):
+        # replicate the number across the tuple / dict layout of the other argument
+        if isinstance(arg_structure, tuple):
+            return (arg_number,) * len(arg_structure)
+        if isinstance(arg_structure, dict):
+            arg = copy.deepcopy(arg_structure)
+            for k in arg:
+                arg[k] = upgrade(arg_number, arg_structure[k])
+            return arg
+        return arg_number
+
+    if isinstance(arg1, (float, int)):
+        return upgrade(arg1, arg2), arg2
+    if isinstance(arg2, (float, int)):
+        return arg1, upgrade(arg2, arg1)
+
+    return arg1, arg2
+
+
+def resolve_max_positions(*args):
+    """Resolve max position constraints from multiple sources.
+
+    None arguments are ignored. Numbers are combined with min(), dicts key by
+    key, and tuples element-wise (treating None entries as unconstrained).
+    """
+
+    def map_value_update(d1, d2):
+        updated_value = copy.deepcopy(d1)
+        # keys present on both sides keep the smaller value
+        for key, value in d2.items():
+            updated_value[key] = min(d1[key], value) if key in updated_value else value
+        return updated_value
+
+    def nullsafe_min(values):
+        present = [v for v in values if v is not None]
+        return min(present) if present else None
+
+    max_positions = None
+    for arg in args:
+        if arg is None:
+            continue
+        if max_positions is None:
+            max_positions = arg
+            continue
+        # a bare number is first expanded to the layout of the other operand
+        max_positions, arg = _match_types(max_positions, arg)
+        if isinstance(arg, (float, int)):
+            max_positions = min(max_positions, arg)
+        elif isinstance(arg, dict):
+            max_positions = map_value_update(max_positions, arg)
+        else:
+            max_positions = tuple(map(nullsafe_min, zip(max_positions, arg)))
+
+    return max_positions
+
+
+def import_user_module(args):
+    """Import the module found at ``args.user_dir``, if that option is set."""
+    user_dir = getattr(args, "user_dir", None)
+    if user_dir is None:
+        return
+    module_path = os.path.abspath(user_dir)
+    if not os.path.exists(module_path):
+        fairseq_rel_path = os.path.join(os.path.dirname(__file__), "..", user_dir)
+        if os.path.exists(fairseq_rel_path):
+            module_path = fairseq_rel_path
+    module_parent, module_name = os.path.split(module_path)
+
+    if module_name not in sys.modules:
+        sys.path.insert(0, module_parent)
+        importlib.import_module(module_name)
+
+
+def softmax(x, dim: int, onnx_trace: bool = False):
+    """Softmax over *dim* computed in float32 (explicit cast when *onnx_trace* is set)."""
+    if onnx_trace:
+        return F.softmax(x.float(), dim=dim)
+    return F.softmax(x, dim=dim, dtype=torch.float32)
+
+
+def log_softmax(x, dim: int, onnx_trace: bool = False):
+    """Log-softmax over *dim* computed in float32 (explicit cast when *onnx_trace* is set)."""
+    if onnx_trace:
+        return F.log_softmax(x.float(), dim=dim)
+    return F.log_softmax(x, dim=dim, dtype=torch.float32)
+
+
+def get_perplexity(loss, round=2, base=2):
+    if loss is None:
+        return 0.  # a missing loss is reported as 0
+    try:
+        return safe_round(base ** loss, round)  # perplexity = base ** loss
+    except OverflowError:
+        return float('inf')  # the power overflowed
+
+
+def deprecation_warning(message, stacklevel=3):
+    """Warn with *message*; not a DeprecationWarning, since that is ignored by default."""
+    warnings.warn(message, stacklevel=stacklevel)
+
+
+def get_activation_fn(activation: str) -> Callable:
+    """Return the activation function corresponding to `activation`.
+
+    Raises RuntimeError when the name is not supported.
+    """
+    if activation == "gelu_fast":
+        deprecation_warning(
+            "--activation-fn=gelu_fast has been renamed to gelu_accurate"
+        )
+        return gelu_accurate
+    # checked in order; "linear" is the identity
+    known = (
+        ("relu", F.relu), ("gelu", gelu), ("gelu_accurate", gelu_accurate),
+        ("tanh", torch.tanh), ("linear", lambda x: x),
+    )
+    for known_name, fn in known:
+        if activation == known_name:
+            return fn
+    raise RuntimeError("--activation-fn {} not supported".format(activation))
+
+
+def get_available_activation_fns() -> List:
+    return [
+        "relu",
+        "gelu",
+        "gelu_fast",  # deprecated; get_activation_fn maps it to gelu_accurate
+        "gelu_accurate",
+        "tanh",
+        "linear",
+    ]
+
+
+@contextlib.contextmanager
+def eval(model):
+    """Temporarily put *model* in eval mode, restoring its previous mode on exit."""
+    is_training = model.training
+    model.eval()
+    try:
+        yield
+    finally:
+        model.train(is_training)
+
+
+def has_parameters(module):
+    """Return True if *module* yields at least one parameter."""
+    return any(True for _ in module.parameters())
+
+
+def set_torch_seed(seed):
+    """Seed the CPU and CUDA torch RNGs with the integer *seed*."""
+    # e.g. the trainer's _set_seed derives it from args.seed and the update number
+    assert isinstance(seed, int)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed(seed)
+
+
+@contextlib.contextmanager
+def with_torch_seed(seed):
+    """Seed torch inside the block and restore the previous RNG state afterwards."""
+    assert isinstance(seed, int)
+    # fork_rng restores the saved state(s) on exit, including when the body raises
+    devices = [torch.cuda.current_device()] if torch.cuda.is_available() else []
+    with torch.random.fork_rng(devices=devices):
+        set_torch_seed(seed)
+        yield
+
+
+def parse_alignment(line):
+    """
+    Parses a single line from the alignment file.
+
+    Args:
+        line (str): String containing the alignment of the format:
+            <src_idx_1>-<tgt_idx_1> <src_idx_2>-<tgt_idx_2> ..
+            <src_idx_m>-<tgt_idx_m>. All indices are 0 indexed.
+
+    Returns:
+        torch.IntTensor: packed alignments of shape (2 * m).
+
+    Raises:
+        ValueError: if an entry is not of the form <int>-<int>.
+    """
+    # flatten the (src, tgt) pairs in order: [src_1, tgt_1, src_2, tgt_2, ...]
+    pairs = (pair.split("-") for pair in line.strip().split())
+    flat = [int(idx) for src_idx, tgt_idx in pairs for idx in (src_idx, tgt_idx)]
+    return torch.IntTensor(flat)
+
+
+def get_token_to_word_mapping(tokens, exclude_list):
+    """Map each token position to a running count of non-excluded tokens.
+
+    Position ``i`` maps to the number of tokens in ``tokens[: i + 1]`` that
+    are not in ``exclude_list``, so an excluded token shares the value of
+    the position before it (or 0 when nothing precedes it).
+
+    Args:
+        tokens: sequence of tokens.
+        exclude_list: tokens that do not advance the counter.
+
+    Returns:
+        dict: position -> running count.
+    """
+    counted = (int(tok not in exclude_list) for tok in tokens)
+    return dict(enumerate(accumulate(counted)))
+
+
+def extract_hard_alignment(attn, src_sent, tgt_sent, pad, eos):
+    """Pick, for every real target token, the source position with max attention.
+
+    Target positions equal to ``pad`` or ``eos`` are dropped, and source
+    positions equal to ``pad`` or ``eos`` are masked to ``-inf`` before the
+    argmax. ``attn`` is indexed by target position on dim 0 and by source
+    position on dim 1.
+
+    Returns:
+        list of ``(src, tgt)`` tuples, each taken from
+        ``get_token_to_word_mapping`` minus one; empty when there is no valid
+        target token or every source token is ``pad``/``eos``.
+    """
+    tgt_is_real = (tgt_sent != pad) & (tgt_sent != eos)
+    src_is_special = (src_sent == pad) | (src_sent == eos)
+    tgt_valid = tgt_is_real.nonzero().squeeze(dim=-1)
+    src_invalid = src_is_special.nonzero().squeeze(dim=-1)
+    src_token_to_word = get_token_to_word_mapping(src_sent, [eos, pad])
+    tgt_token_to_word = get_token_to_word_mapping(tgt_sent, [eos, pad])
+
+    # Nothing to align when no target token is valid or no source token is.
+    if len(tgt_valid) == 0 or len(src_invalid) >= len(src_sent):
+        return []
+
+    # Indexing with a tensor yields a copy, so masking does not touch ``attn``.
+    attn_valid = attn[tgt_valid]
+    attn_valid[:, src_invalid] = float("-inf")
+    _, src_indices = attn_valid.max(dim=1)
+    return [
+        (
+            src_token_to_word[src_pos.item()] - 1,
+            tgt_token_to_word[tgt_pos.item()] - 1,
+        )
+        for tgt_pos, src_pos in zip(tgt_valid, src_indices)
+    ]
+
+
+def new_arange(x, *size):
+    """
+    Build a contiguous tensor of shape `size` on the device of x in which
+    every slice along the last dimension is 0, 1, ..., size[-1] - 1.
+    When no size is given, the shape of x is used.
+    """
+    shape = size if size else x.size()
+    last_dim_range = torch.arange(shape[-1], device=x.device)
+    return last_dim_range.expand(*shape).contiguous()
+
+
+def get_tpu_device(args):
+    """Return the device reported by ``torch_xla``'s ``xla_device()``.
+
+    ``args`` is accepted but not used. ``torch_xla`` is imported inside the
+    function, so it is only required when this function is called.
+    """
+    from torch_xla.core import xla_model
+
+    return xla_model.xla_device()
+
+
+class CudaEnvironment(object):
+    """Snapshot of the current CUDA device's name, capability and memory."""
+
+    def __init__(self):
+        """Read the properties of the device from ``torch.cuda.current_device()``."""
+        device_str = "cuda:{}".format(torch.cuda.current_device())
+        props = torch.cuda.get_device_properties(device_str)
+        self.name = props.name
+        # Compute capability, split into its major and minor parts.
+        self.major = props.major
+        self.minor = props.minor
+        # total_memory is divided by 1024 three times to express it in GB.
+        self.total_memory_in_GB = props.total_memory / 1024 / 1024 / 1024
+
+    @staticmethod
+    def pretty_print_cuda_env_list(cuda_env_list):
+        """
+        Given a list of CudaEnvironments, log one line per entry between two
+        identical banner lines.
+        """
+        title = "CUDA enviroments for all {} workers".format(len(cuda_env_list))
+        # Pad both sides with the same number of stars around the title.
+        stars = "*" * (40 - len(title) // 2)
+        banner = stars + title + stars
+        logger.info(banner)
+        for rank, env in enumerate(cuda_env_list):
+            pieces = [
+                "rank {:3d}: ".format(rank),
+                "capabilities = {:2d}.{:<2d} ; ".format(env.major, env.minor),
+                "total memory = {:.3f} GB ; ".format(env.total_memory_in_GB),
+                "name = {:40s}".format(env.name),
+            ]
+            logger.info("".join(pieces))
+        logger.info(banner)